Python joblib.Parallel() Examples
The following are 30 code examples of joblib.Parallel(), drawn from open-source projects and ordered by votes. You can go to the original project or source file by following the link above each example. You may also want to check out the other available functions and classes of the joblib module.
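All of the examples below build on the same basic pattern: wrap the target function with delayed() and pass a generator of delayed calls to a Parallel instance. A minimal, self-contained sketch of that pattern (the square function and the inputs are placeholders, not taken from any of the projects below):

from joblib import Parallel, delayed

def square(x):
    # placeholder work; any picklable function works here
    return x * x

# run the calls across 2 worker processes and collect the results in order
results = Parallel(n_jobs=2)(delayed(square)(i) for i in range(10))
print(results)  # [0, 1, 4, 9, ...]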
Example #1
Source File: utils.py From nmp_qc with MIT License | 8 votes |
def get_graph_stats(graph_obj_handle, prop='degrees'):
    # if prop == 'degrees':
    num_cores = multiprocessing.cpu_count()
    inputs = [int(i*len(graph_obj_handle)/num_cores) for i in range(num_cores)] + [len(graph_obj_handle)]
    res = Parallel(n_jobs=num_cores)(delayed(get_values)(graph_obj_handle, inputs[i], inputs[i+1], prop) for i in range(num_cores))

    stat_dict = {}

    if 'degrees' in prop:
        stat_dict['degrees'] = list(set([d for core_res in res for file_res in core_res for d in file_res['degrees']]))
    if 'edge_labels' in prop:
        stat_dict['edge_labels'] = list(set([d for core_res in res for file_res in core_res for d in file_res['edge_labels']]))
    if 'target_mean' in prop or 'target_std' in prop:
        param = np.array([file_res['params'] for core_res in res for file_res in core_res])
    if 'target_mean' in prop:
        stat_dict['target_mean'] = np.mean(param, axis=0)
    if 'target_std' in prop:
        stat_dict['target_std'] = np.std(param, axis=0)

    return stat_dict
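The example above splits an index range into one contiguous chunk per CPU core and hands each chunk to a worker. A stripped-down sketch of that chunking idiom, with process_range and the data list standing in as placeholders:

import multiprocessing
from joblib import Parallel, delayed

def process_range(data, start, end):
    # placeholder per-chunk work
    return sum(data[start:end])

data = list(range(1000))
num_cores = multiprocessing.cpu_count()
# chunk boundaries: one (start, end) pair per core, last boundary is len(data)
bounds = [int(i * len(data) / num_cores) for i in range(num_cores)] + [len(data)]
partials = Parallel(n_jobs=num_cores)(
    delayed(process_range)(data, bounds[i], bounds[i + 1]) for i in range(num_cores))
print(sum(partials))  # same as sum(data)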
Example #2
Source File: atlas2.py From ssbio with MIT License | 6 votes |
def build_strain_specific_models(self, joblib=False, cores=1, force_rerun=False):
    """Wrapper function for _build_strain_specific_model"""
    if len(self.df_orthology_matrix) == 0:
        raise RuntimeError('Empty orthology matrix, please calculate first!')
    ref_functional_genes = [g.id for g in self.reference_gempro.functional_genes]
    log.info('Building strain specific models...')
    if joblib:
        result = DictList(Parallel(n_jobs=cores)(delayed(self._build_strain_specific_model)(s, ref_functional_genes, self.df_orthology_matrix, force_rerun=force_rerun) for s in self.strain_ids))
    # if sc:
    #     strains_rdd = sc.parallelize(self.strain_ids)
    #     result = strains_rdd.map(self._build_strain_specific_model).collect()
    else:
        result = []
        for s in tqdm(self.strain_ids):
            result.append(self._build_strain_specific_model(s, ref_functional_genes, self.df_orthology_matrix, force_rerun=force_rerun))

    for strain_id, gp_noseqs_path in result:
        self.strain_infodict[strain_id]['gp_noseqs_path'] = gp_noseqs_path
Example #3
Source File: librispeech.py From End-to-end-ASR-Pytorch with MIT License | 6 votes |
def __init__(self, path, split, tokenizer, bucket_size, ascending=False):
    # Setup
    self.path = path
    self.bucket_size = bucket_size

    # List all wave files
    file_list = []
    for s in split:
        split_list = list(Path(join(path, s)).rglob("*.flac"))
        assert len(split_list) > 0, "No data found @ {}".format(join(path, s))
        file_list += split_list
    # Read text
    text = Parallel(n_jobs=READ_FILE_THREADS)(
        delayed(read_text)(str(f)) for f in file_list)
    #text = Parallel(n_jobs=-1)(delayed(tokenizer.encode)(txt) for txt in text)
    text = [tokenizer.encode(txt) for txt in text]

    # Sort dataset by text length
    #file_len = Parallel(n_jobs=READ_FILE_THREADS)(delayed(getsize)(f) for f in file_list)
    self.file_list, self.text = zip(*[(f_name, txt)
                                      for f_name, txt in sorted(zip(file_list, text), reverse=not ascending, key=lambda x: len(x[1]))])
Example #4
Source File: _glm_reporter_visual_inspection_suite_.py From nistats with BSD 3-Clause "New" or "Revised" License | 6 votes |
def prefer_parallel_execution(functions_to_be_called):  # pragma: no cover
    try:
        import joblib
        import multiprocessing
    except ImportError:
        print('Joblib not installed, switching to serial execution')
        [run_function(fn) for fn in functions_to_be_called]
    else:
        try:
            import tqdm
        except ImportError:
            inputs = functions_to_be_called
        else:
            inputs = tqdm.tqdm(functions_to_be_called)
        n_jobs = multiprocessing.cpu_count()
        print('Parallelizing execution using Joblib')
        joblib.Parallel(n_jobs=n_jobs)(
            joblib.delayed(run_function)(fn) for fn in inputs)
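The pattern above degrades gracefully: if joblib cannot be imported, the same work runs serially. A minimal sketch of that fallback, with run_task standing in as a placeholder for the real work:

def run_task(n):
    return n * n  # placeholder work

tasks = range(8)
try:
    from joblib import Parallel, delayed
except ImportError:
    results = [run_task(n) for n in tasks]  # serial fallback
else:
    results = Parallel(n_jobs=2)(delayed(run_task)(n) for n in tasks)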
Example #5
Source File: utils.py From DeepLab_v3 with MIT License | 6 votes |
def next_minibatch(self):
    image_filenames_minibatch = self.image_filenames[self.current_index: self.current_index + self.minibatch_size]
    label_filenames_minibatch = self.label_filenames[self.current_index: self.current_index + self.minibatch_size]
    self.current_index += self.minibatch_size
    if self.current_index >= self.dataset_size:
        self.current_index = 0

    # Multithread image processing
    # Reference: https://www.kaggle.com/inoryy/fast-image-pre-process-in-parallel
    results = Parallel(n_jobs=self.num_jobs)(delayed(self.process_func)(image_filename, label_filename) for image_filename, label_filename in zip(image_filenames_minibatch, label_filenames_minibatch))
    images, labels = zip(*results)

    images = np.asarray(images)
    labels = np.asarray(labels)

    return images, labels
Example #6
Source File: action_detector_diagnosis.py From DETAD with MIT License | 6 votes |
def wrapper_compute_average_precision(self):
    """Computes average precision for each class in the subset.
    """
    ap = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))
    recall = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))
    precision = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))
    matched_gt_id = np.zeros((len(self.tiou_thresholds), len(self.prediction)))

    results = Parallel(n_jobs=len(self.activity_index))(
        delayed(compute_average_precision_detection)(
            ground_truth=self.ground_truth.loc[self.ground_truth['label'] == cidx].reset_index(drop=True),
            prediction=self.prediction.loc[self.prediction['label'] == cidx].reset_index(drop=True),
            tiou_thresholds=self.tiou_thresholds,
            normalize_ap=self.normalize_ap,
            average_num_instance_per_class=self.average_num_instance_per_class,
            minimum_normalized_precision_threshold_for_detection=self.minimum_normalized_precision_threshold_for_detection,
        ) for cidx in self.activity_index.values())

    for i, cidx in enumerate(self.activity_index.values()):
        ap[:, cidx], matched_this_cls_gt_id, this_cls_prediction_ids, recall[:, cidx], precision[:, cidx] = results[i]
        matched_gt_id[:, this_cls_prediction_ids] = matched_this_cls_gt_id

    return ap, matched_gt_id, recall, precision
Example #7
Source File: __init__.py From s3tk with MIT License | 6 votes |
def parallelize(bucket, only, _except, fn, args=(), versions=False):
    bucket = s3().Bucket(bucket)

    # use prefix for performance
    prefix = None
    if only:
        # get the first prefix before wildcard
        prefix = '/'.join(only.split('*')[0].split('/')[:-1])
        if prefix:
            prefix = prefix + '/'

    if versions:
        object_versions = bucket.object_versions.filter(Prefix=prefix) if prefix else bucket.object_versions.all()
        # delete markers have no size
        return Parallel(n_jobs=24)(delayed(fn)(bucket.name, ov.object_key, ov.id, *args) for ov in object_versions if object_matches(ov.object_key, only, _except) and not ov.is_latest and ov.size is not None)
    else:
        objects = bucket.objects.filter(Prefix=prefix) if prefix else bucket.objects.all()

        if only and not '*' in only:
            objects = [s3().Object(bucket, only)]

        return Parallel(n_jobs=24)(delayed(fn)(bucket.name, os.key, *args) for os in objects if object_matches(os.key, only, _except))
Example #8
Source File: action_detector_diagnosis.py From DETAD with MIT License | 6 votes |
def wrapper_analyze_fp_error_types(self):
    self.fp_error_types_legned = {'True Positive': 0,
                                  'Double Detection Err': 1,
                                  'Wrong Label Err': 2,
                                  'Localization Err': 3,
                                  'Confusion Err': 4,
                                  'Background Err': 5}

    self.fp_error_types_inverse_legned = dict([(v, k) for k, v in self.fp_error_types_legned.iteritems()])

    fp_error_types = Parallel(n_jobs=len(self.tiou_thresholds))(
        delayed(analyze_fp_error_types)(
            prediction=self.prediction,
            ground_truth=self.ground_truth,
            tiou_thr=tiou_thr,
            matched_gt_id_col_name=matched_gt_id_col_name,
            min_tiou_thr=self.min_tiou_thr,
            fp_error_types_legned=self.fp_error_types_legned,
        ) for tiou_thr, matched_gt_id_col_name in zip(self.tiou_thresholds, self.matched_gt_id_cols))

    return fp_error_types
Example #9
Source File: docker_cache.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def build_save_containers(platforms, registry, load_cache) -> int:
    """
    Entry point to build and upload all built dockerimages in parallel
    :param platforms: List of platforms
    :param registry: Docker registry name
    :param load_cache: Load cache before building
    :return: 1 if error occurred, 0 otherwise
    """
    from joblib import Parallel, delayed
    if len(platforms) == 0:
        return 0

    platform_results = Parallel(n_jobs=len(platforms), backend="multiprocessing")(
        delayed(_build_save_container)(platform, registry, load_cache)
        for platform in platforms)

    is_error = False
    for platform_result in platform_results:
        if platform_result is not None:
            logging.error('Failed to generate %s', platform_result)
            is_error = True

    return 1 if is_error else 0
Example #10
Source File: graph.py From AutoSmart with GNU General Public License v3.0 | 6 votes |
def recognize_binary_col(self, data, cat_cols):
    def func(ss):
        ss = ss.unique()
        if len(ss) == 3:
            if pd.isna(ss).sum() == 1:
                return True
        if len(ss) == 2:
            return True
        return False

    binary_cols = []

    res = Parallel(n_jobs=CONSTANT.JOBS, require='sharedmem')(delayed(func)(data[col]) for col in cat_cols)

    for col, is_binary in zip(cat_cols, res):
        if is_binary:
            binary_cols.append(col)

    return binary_cols
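Passing require='sharedmem', as the AutoSmart examples do throughout, forces a thread-based backend so the workers share the parent process's memory instead of pickling the DataFrame into subprocesses; this helps when each task only reads slices of one large shared object. A small sketch under that assumption, with n_unique and the toy DataFrame as placeholders:

import pandas as pd
from joblib import Parallel, delayed

df = pd.DataFrame({'a': [1, 1, 2], 'b': ['x', 'y', 'x'], 'c': [0, 0, 0]})

def n_unique(series):
    # reads a column of the shared DataFrame; no pickling needed with sharedmem
    return series.nunique()

counts = Parallel(n_jobs=2, require='sharedmem')(
    delayed(n_unique)(df[col]) for col in df.columns)
print(dict(zip(df.columns, counts)))  # {'a': 2, 'b': 2, 'c': 1}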
Example #11
Source File: batched_inv_joblib.py From content_wmf with MIT License | 6 votes |
def recompute_factors_batched(Y, S, lambda_reg, W=None, X=None,
                              dtype='float32', batch_size=10000, n_jobs=4):
    m = S.shape[0]  # m = number of users
    f = Y.shape[1]  # f = number of factors

    YTY = np.dot(Y.T, Y)  # precompute this
    YTYpR = YTY + lambda_reg * np.eye(f)
    if W is not None:
        WX = lambda_reg * (X.dot(W)).T
    else:
        WX = None
    X_new = np.zeros((m, f), dtype=dtype)

    num_batches = int(np.ceil(m / float(batch_size)))
    res = Parallel(n_jobs=n_jobs)(delayed(solve_batch)(b, S, Y, WX, YTYpR,
                                                       batch_size, m, f, dtype)
                                  for b in xrange(num_batches))
    X_new = np.concatenate(res, axis=0)

    return X_new
Example #12
Source File: decomposition.py From tridesclous with MIT License | 6 votes |
def transform(self, waveforms):
    #~ print('ici', waveforms.shape, self.ind_peak)
    features = waveforms[:, self.ind_peak, :].copy()
    return features

#~ Parallel(n_jobs=n_jobs)(delayed(count_match_spikes)(sorting1.get_unit_spike_train(u1),
#~                                                     s2_spiketrains, delta_frames) for
#~                                                     i1, u1 in enumerate(unit1_ids))

#~ def get_pca_one_channel(wf_chan, chan, thresh, n_left, n_components_by_channel, params):
    #~ print(chan)
    #~ pca = sklearn.decomposition.IncrementalPCA(n_components=n_components_by_channel, **params)
    #~ wf_chan = waveforms[:,:,chan]
    #~ print(wf_chan.shape)
    #~ print(wf_chan[:, -n_left].shape)
    #~ keep = np.any((wf_chan>thresh) | (wf_chan<-thresh))
    #~ keep = (wf_chan[:, -n_left]>thresh) | (wf_chan[:, -n_left]<-thresh)

    #~ if keep.sum() >= n_components_by_channel:
        #~ pca.fit(wf_chan[keep, :])
        #~ return pca
    #~ else:
        #~ return None
Example #13
Source File: weights.py From pyhawkes with MIT License | 6 votes |
def _joblib_resample_A_given_W(self, data):
    """
    Resample A given W. This must be immediately followed by an
    update of z | A, W. This version uses joblib to parallelize
    over columns of A.
    :return:
    """
    # Use the module trick to avoid copying globals
    import pyhawkes.internals.parallel_adjacency_resampling as par
    par.model = self.model
    par.data = data
    par.lambda_irs = [par._compute_weighted_impulses_at_events(d) for d in data]

    if len(data) == 0:
        self.A = np.random.rand(self.K, self.K) < self.network.P
        return

    # We can naively parallelize over receiving neurons, k2
    # To avoid serializing and copying the data object, we
    # manually extract the required arrays Sk, Fk, etc.
    A_cols = Parallel(n_jobs=-1, backend="multiprocessing")(
        delayed(par._ct_resample_column_of_A)(k2) for k2 in range(self.K))

    self.A = np.array(A_cols).T
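n_jobs=-1 asks joblib to use every available core, and backend="multiprocessing" forces process-based workers. A minimal sketch of that configuration, where resample_column is only a placeholder for the per-column work done by the pyhawkes helpers:

import numpy as np
from joblib import Parallel, delayed

K = 4

def resample_column(k, base_seed=0):
    # placeholder: draw one random boolean column of length K
    rng = np.random.RandomState(base_seed + k)
    return rng.rand(K) < 0.5

cols = Parallel(n_jobs=-1, backend="multiprocessing")(
    delayed(resample_column)(k) for k in range(K))
A = np.array(cols).T  # stack the returned columns, as the example does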
Example #14
Source File: weights.py From pyhawkes with MIT License | 6 votes |
def _joblib_resample_A_given_W(self, data):
    """
    Resample A given W. This must be immediately followed by an
    update of z | A, W. This version uses joblib to parallelize
    over columns of A.
    :return:
    """
    # Use the module trick to avoid copying globals
    import pyhawkes.internals.parallel_adjacency_resampling as par
    par.model = self.model
    par.data = data
    par.K = self.model.K

    if len(data) == 0:
        self.A = np.random.rand(self.K, self.K) < self.network.P
        return

    # We can naively parallelize over receiving neurons, k2
    # To avoid serializing and copying the data object, we
    # manually extract the required arrays Sk, Fk, etc.
    A_cols = Parallel(n_jobs=-1, backend="multiprocessing")(
        delayed(par._resample_column_of_A)(k2) for k2 in range(self.K))

    self.A = np.array(A_cols).T
Example #15
Source File: base_mab.py From mabwiser with Apache License 2.0 | 6 votes |
def _parallel_predict(self, contexts: np.ndarray, is_predict: bool):

    # Total number of contexts to predict
    n_contexts = len(contexts)

    # Partition contexts by job
    n_jobs, n_contexts, starts = self._partition_contexts(n_contexts)
    total_contexts = sum(n_contexts)

    # Get seed value for each context
    seeds = self.rng.randint(np.iinfo(np.int32).max, size=total_contexts)

    # Perform parallel predictions
    predictions = Parallel(n_jobs=n_jobs, backend=self.backend)(
        delayed(self._predict_contexts)(
            contexts[starts[i]:starts[i + 1]],
            is_predict,
            seeds[starts[i]:starts[i + 1]],
            starts[i])
        for i in range(n_jobs))

    # Reduce
    predictions = list(chain.from_iterable(t for t in predictions))

    return predictions if len(predictions) > 1 else predictions[0]
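Note how the example draws one seed per context up front and ships each job its own slice of seeds; with process-based backends this is a common way to keep parallel randomness reproducible regardless of scheduling. A reduced sketch of that idea, with draw_value as a placeholder task:

import numpy as np
from joblib import Parallel, delayed

def draw_value(seed):
    # each task gets its own pre-drawn seed, so results do not depend on worker order
    return np.random.RandomState(seed).rand()

rng = np.random.RandomState(42)
seeds = rng.randint(np.iinfo(np.int32).max, size=8)
values = Parallel(n_jobs=2)(delayed(draw_value)(int(s)) for s in seeds)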
Example #16
Source File: preprocessor.py From AutoSmart with GNU General Public License v3.0 | 6 votes |
def fit(self, X):
    def func(ss):
        length = len(ss.unique())
        if length <= 1:
            return True
        else:
            return False

    df = X.data
    todo_cols = X.cat_cols + X.multi_cat_cols + X.num_cols + X.time_cols + X.binary_cols
    res = Parallel(n_jobs=CONSTANT.JOBS, require='sharedmem')(delayed(func)(df[col]) for col in todo_cols)

    drop_cols = []
    for col, unique in zip(todo_cols, res):
        if unique:
            drop_cols.append(col)

    self.drop_cols = drop_cols
Example #17
Source File: preprocessor.py From AutoSmart with GNU General Public License v3.0 | 6 votes |
def fit(self, X):
    def func(ss):
        length = len(ss.unique())
        if length >= len(ss) - 10:
            return True
        else:
            return False

    df = X.data
    todo_cols = X.cat_cols
    res = Parallel(n_jobs=CONSTANT.JOBS, require='sharedmem')(delayed(func)(df[col]) for col in todo_cols)

    drop_cols = []
    for col, all_diff in zip(todo_cols, res):
        if all_diff:
            drop_cols.append(col)

    self.drop_cols = drop_cols
Example #18
Source File: walker.py From GraphEmbedding with MIT License | 6 votes |
def simulate_walks(self, num_walks, walk_length, stay_prob=0.3, workers=1, verbose=0):

    layers_adj = pd.read_pickle(self.temp_path + 'layers_adj.pkl')
    layers_alias = pd.read_pickle(self.temp_path + 'layers_alias.pkl')
    layers_accept = pd.read_pickle(self.temp_path + 'layers_accept.pkl')
    gamma = pd.read_pickle(self.temp_path + 'gamma.pkl')
    walks = []
    initialLayer = 0

    nodes = self.idx  # list(self.g.nodes())

    results = Parallel(n_jobs=workers, verbose=verbose, )(
        delayed(self._simulate_walks)(nodes, num, walk_length, stay_prob, layers_adj, layers_accept, layers_alias, gamma)
        for num in partition_num(num_walks, workers))

    walks = list(itertools.chain(*results))

    return walks
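partition_num() is a GraphEmbedding helper that splits num_walks into per-worker counts; each job then simulates its share of walks and the per-job lists are chained back together. A hedged sketch of the same shape, with a stand-in partitioner and a dummy walk function (both are illustrative, not the project's code):

import itertools
from joblib import Parallel, delayed

def partition_num(num, workers):
    # stand-in for the project helper: split `num` into `workers` near-equal counts
    base, extra = divmod(num, workers)
    return [base + (1 if i < extra else 0) for i in range(workers)]

def simulate_some_walks(n_walks, walk_length):
    # placeholder: each "walk" is just a list of zeros
    return [[0] * walk_length for _ in range(n_walks)]

results = Parallel(n_jobs=2)(
    delayed(simulate_some_walks)(num, 5) for num in partition_num(10, 2))
walks = list(itertools.chain(*results))  # flatten the per-worker lists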
Example #19
Source File: default_feat.py From AutoSmart with GNU General Public License v3.0 | 5 votes |
def transform(self, X):
    df = X.data
    col2type = {}
    col2groupby = {}

    todo_cols = X.cat_cols
    todo_cols = todo_cols[:300]

    if not todo_cols:
        return

    new_cols = []
    for col in todo_cols:
        new_col = col + '_CatCount'
        new_col = FeatContext.gen_feat_name(namespace, self.__class__.__name__, new_col, CONSTANT.NUMERICAL_TYPE)
        col2type[new_col] = CONSTANT.NUMERICAL_TYPE
        new_cols.append(new_col)
        col2groupby[new_col] = col

    def func(series):
        col = series.name
        col_count = series.value_counts()
        new_col = col + '_CatCount'
        new_col = FeatContext.gen_feat_name(namespace, self.__class__.__name__, new_col, CONSTANT.NUMERICAL_TYPE)
        ss = downcast(series.map(col_count))
        return ss

    res = Parallel(n_jobs=CONSTANT.JOBS, require='sharedmem')(delayed(func)(df[col]) for col in todo_cols)

    if res:
        tmp = pd.concat(res, axis=1)
        tmp.columns = new_cols
        if df.shape[0] <= 2000000:
            df = pd.concat([df, tmp], axis=1)
        else:
            for col in tmp.columns:
                df[col] = tmp[col]

    X.update_data(df, col2type, col2groupby, col2source_cat=col2groupby)
Example #20
Source File: data_manager.py From sepconv with MIT License | 5 votes |
def _extract_patches(tuples, max_per_frame=1, trials_per_tuple=100, flow_threshold=25.0,
                     jumpcut_threshold=np.inf, workers=0):
    """
    Spawns the specified number of workers running _extract_patches_worker().
    Call this with workers=0 to run on the current thread.
    """
    tick_t = timer()
    print('===> Extracting patches...')

    if workers != 0:
        parallel = Parallel(n_jobs=workers, backend='threading', verbose=5)
        tuples_per_job = len(tuples) // workers + 1
        result = parallel(
            delayed(_extract_patches_worker)(tuples[i:i + tuples_per_job], max_per_frame, trials_per_tuple,
                                             flow_threshold, jumpcut_threshold)
            for i in range(0, len(tuples), tuples_per_job))
        patches = sum(result, [])
    else:
        patches = _extract_patches_worker(tuples, max_per_frame, trials_per_tuple, flow_threshold,
                                          jumpcut_threshold)

    tock_t = timer()
    print("Done. Took ~{}s".format(round(tock_t - tick_t)))

    return patches


############################################### CACHE ###############################################
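Here a single Parallel(n_jobs=workers, backend='threading', verbose=5) object is created once and then called with a generator that hands each worker a contiguous slice of the input list. A condensed sketch of that slice-per-job idiom, with process_chunk as a placeholder:

from joblib import Parallel, delayed

def process_chunk(chunk):
    # placeholder per-chunk work; returns a list so the results can be concatenated
    return [x * 2 for x in chunk]

items = list(range(100))
workers = 4
per_job = len(items) // workers + 1

parallel = Parallel(n_jobs=workers, backend='threading', verbose=5)
result = parallel(delayed(process_chunk)(items[i:i + per_job])
                  for i in range(0, len(items), per_job))
flat = sum(result, [])  # concatenate the per-chunk lists, as the example does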
Example #21
Source File: mono_3d_tracking.py From 3d-vehicle-tracking with BSD 3-Clause "New" or "Revised" License | 5 votes |
def run_app(self):
    """ Entry function of calling parallel tracker on sequences
    """
    self.seq_gt_name = os.path.join(os.path.dirname(self.args.path), 'gt.json')
    self.seq_pd_name = self.args.out_path + '_pd.json'

    if isinstance(self.label_paths, str):
        label_paths = pickle.load(open(self.label_paths, 'rb'))
    else:
        label_paths = self.label_paths

    n_seq = len(label_paths)
    print('* Number of sequence: {}'.format(n_seq))
    assert n_seq > 0, "Number of sequence is 0!"

    print('=> Building gt & hypo...')
    result = Parallel(n_jobs=self.args.n_jobs)(
        delayed(self.run_parallel)(seq_path, i_s)
        for i_s, seq_path in enumerate(tqdm(
            label_paths, disable=not self.args.verbose))
    )
    self.seq_gt_list = [n[0] for n in result]
    self.seq_hypo_list = [n[1] for n in result]

    if not os.path.isfile(self.seq_gt_name):
        with open(self.seq_gt_name, 'w') as f:
            print("Writing to {}".format(self.seq_gt_name))
            json.dump(self.seq_gt_list, f)

    with open(self.seq_pd_name, 'w') as f:
        print("Writing to {}".format(self.seq_pd_name))
        json.dump(self.seq_hypo_list, f)
Example #22
Source File: data_manager.py From sepconv with MIT License | 5 votes |
def _cache_patches(cache_dir, patches, workers=0):
    """
    Spawns the specified number of workers running _cache_patches_worker().
    Call this with workers=0 to run on the current thread.
    """
    if exists(cache_dir):
        rmdir(cache_dir)
    makedirs(cache_dir)

    tick_t = timer()
    print('===> Caching patches...')

    if workers != 0:
        parallel = Parallel(n_jobs=workers, backend='threading', verbose=5)
        patches_per_job = len(patches) // workers + 1
        parallel(delayed(_cache_patches_worker)(cache_dir, patches[i:i + patches_per_job])
                 for i in range(0, len(patches), patches_per_job))
    else:
        _cache_patches_worker(cache_dir, patches)

    tock_t = timer()
    print("Done. Took ~{}s".format(round(tock_t - tick_t)))


################################################ MAIN ###############################################
Example #23
Source File: minhash_encoder.py From dirty_cat with BSD 3-Clause "New" or "Revised" License | 5 votes |
def transform(self, X):
    """ Transform X using specified encoding scheme.

    Parameters
    ----------
    X : array-like, shape (n_samples, ) or (n_samples, 1)
        The string data to encode.

    Returns
    -------
    array, shape (n_samples, n_components)
        Transformed input.
    """
    X = np.asarray(X)
    assert X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1), f"ERROR:\
    shape {X.shape} of input array is not supported."
    if X.ndim == 2:
        X = X[:, 0]
    # Check if first item has str or np.str_ type
    assert isinstance(X[0], str), "ERROR: Input data is not string."
    X_out = np.zeros((len(X), self.n_components))

    # TODO Parallel run here
    for i, x in enumerate(X):
        if x not in self.hash_dict:
            self.hash_dict[x] = self.get_hash(x)

    for i, x in enumerate(X):
        X_out[i, :] = self.hash_dict[x]

    return X_out
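The "# TODO Parallel run here" comment marks the hashing loop as a candidate for joblib. One way such a loop could be parallelized, purely as an illustration: the get_hash below is a self-contained placeholder for the encoder's real hash function, not dirty_cat's implementation.

import hashlib
import numpy as np
from joblib import Parallel, delayed

def get_hash(x, n_components=8):
    # placeholder: derive n_components numbers deterministically from a string
    digest = hashlib.md5(x.encode('utf-8')).digest()
    return np.frombuffer(digest, dtype=np.uint8)[:n_components].astype(float)

X = np.array(['apple', 'banana', 'apple', 'cherry'])
unique_x = list(dict.fromkeys(X))  # hash each distinct string only once
hashes = Parallel(n_jobs=2)(delayed(get_hash)(x) for x in unique_x)
hash_dict = dict(zip(unique_x, hashes))
X_out = np.stack([hash_dict[x] for x in X])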
Example #24
Source File: Preprocessing.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 5 votes |
def write_data_csv(fname, frames, preproc):
    """Write data to csv file"""
    fdata = open(fname, "w")
    dr = Parallel()(delayed(get_data)(lst, preproc) for lst in frames)
    data, result = zip(*dr)
    for entry in data:
        fdata.write(','.join(entry) + '\r\n')
    print("All finished, %d slices in total" % len(data))
    fdata.close()
    result = np.ravel(result)
    return result
Example #25
Source File: preprocessor.py From AutoSmart with GNU General Public License v3.0 | 5 votes |
def fit(self, X):
    def func(ss):
        cats = pd.Categorical(ss).categories
        return cats

    df = X.data
    todo_cols = X.binary_cols

    res = Parallel(n_jobs=CONSTANT.JOBS, require='sharedmem')(delayed(func)(df[col]) for col in todo_cols)

    for col, cats in zip(todo_cols, res):
        self.col2cats[col] = cats
Example #26
Source File: walker.py From GraphEmbedding with MIT License | 5 votes |
def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0):

    G = self.G

    nodes = list(G.nodes())

    results = Parallel(n_jobs=workers, verbose=verbose, )(
        delayed(self._simulate_walks)(nodes, num, walk_length)
        for num in partition_num(num_walks, workers))

    walks = list(itertools.chain(*results))

    return walks
Example #27
Source File: audfprint.py From audfprint with MIT License | 5 votes |
def do_cmd_multiproc(cmd, analyzer, hash_tab, filename_iter, matcher,
                     outdir, type, report, skip_existing=False,
                     strip_prefix=None, ncores=1):
    """ Run the actual command, using multiple processors """
    if cmd == 'precompute':
        # precompute fingerprints with joblib
        msgslist = joblib.Parallel(n_jobs=ncores)(
            joblib.delayed(file_precompute)(analyzer, file, outdir, type,
                                            skip_existing,
                                            strip_prefix=strip_prefix)
            for file in filename_iter
        )
        # Collapse into a single list of messages
        for msgs in msgslist:
            report(msgs)

    elif cmd == 'match':
        # Running queries in parallel
        msgslist = joblib.Parallel(n_jobs=ncores)(
            # Would use matcher.file_match_to_msgs(), but you
            # can't use joblib on an instance method
            joblib.delayed(matcher_file_match_to_msgs)(matcher, analyzer,
                                                       hash_tab, filename)
            for filename in filename_iter
        )
        for msgs in msgslist:
            report(msgs)

    elif cmd == 'new' or cmd == 'add':
        # We add by forking multiple parallel threads each running
        # analyzers over different subsets of the file list
        multiproc_add(analyzer, hash_tab, filename_iter, report, ncores)

    else:
        # This is not a multiproc command
        raise ValueError("unrecognized multiproc command: " + cmd)


# Command to separate out setting of analyzer parameters
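The comment in the 'match' branch points at a real constraint of older multiprocessing-based pickling: bound methods did not serialize, so the call is routed through a module-level wrapper that receives the object as an argument. A minimal sketch of that workaround; Matcher, its method, and the file names below are placeholders, not audfprint's API:

from joblib import Parallel, delayed

class Matcher:
    def file_match_to_msgs(self, filename):
        # placeholder for the real per-file matching work
        return "matched {}".format(filename)

def matcher_file_match_to_msgs(matcher, filename):
    # module-level wrapper: a plain function pickles cleanly,
    # and the matcher instance is shipped as an ordinary argument
    return matcher.file_match_to_msgs(filename)

matcher = Matcher()
filenames = ["a.wav", "b.wav"]
msgs = Parallel(n_jobs=2)(
    delayed(matcher_file_match_to_msgs)(matcher, f) for f in filenames)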
Example #28
Source File: main.py From lighter with MIT License | 5 votes |
def parse_services(filenames, canaryGroup=None, profiles=[]):
    # return [parse_service(filename) for filename in filenames]
    return Parallel(n_jobs=8, backend="threading")(delayed(parse_service)(filename, canaryGroup, profiles) for filename in filenames) if filenames else []
Example #29
Source File: dataset.py From stable-baselines with MIT License | 5 votes |
def _run(self):
    start = True
    with Parallel(n_jobs=self.n_workers, batch_size="auto", backend=self.backend) as parallel:
        while start or self.infinite_loop:
            start = False

            if self.shuffle:
                np.random.shuffle(self.indices)

            for minibatch_idx in range(self.n_minibatches):

                self.start_idx = minibatch_idx * self.batch_size

                obs = self.observations[self._minibatch_indices]
                if self.load_images:
                    if self.n_workers <= 1:
                        obs = [self._make_batch_element(image_path)
                               for image_path in obs]
                    else:
                        obs = parallel(delayed(self._make_batch_element)(image_path)
                                       for image_path in obs)

                    obs = np.concatenate(obs, axis=0)

                actions = self.actions[self._minibatch_indices]

                self.queue.put((obs, actions))

                # Free memory
                del obs

            self.queue.put(None)
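Using Parallel as a context manager, as this data loader does, keeps the same worker pool alive across many calls instead of spinning workers up and tearing them down for every minibatch. A small sketch of that reuse pattern, with load_image and the batch list as placeholders:

import numpy as np
from joblib import Parallel, delayed

def load_image(path):
    # placeholder loader: pretend every image is a 2x2 array
    return np.zeros((1, 2, 2))

batches = [["img_%d.png" % i for i in range(4)] for _ in range(3)]

# the worker pool is created once and reused for every batch inside the block
with Parallel(n_jobs=2, batch_size="auto") as parallel:
    for paths in batches:
        obs = parallel(delayed(load_image)(p) for p in paths)
        obs = np.concatenate(obs, axis=0)  # shape (4, 2, 2)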
Example #30
Source File: aligning-docs-by-interlinks-demo2.py From comparable-text-miner with Apache License 2.0 | 5 votes |
def main(argv):
    source_corpus_file = sys.argv[1]
    target_corpus_file = sys.argv[2]
    source_language = sys.argv[3]
    target_language = sys.argv[4]
    output_path = sys.argv[5]
    if not output_path.endswith('/'): output_path = output_path + '/'
    tp.check_dir(output_path)  # if directory does not exist, then create

    logging.info('aligning %s and %s wikipeida documents using interlanguage links', source_language, target_language)

    source_docs = tp.split_wikipedia_docs_into_array(source_corpus_file)
    logging.info('source corpus is loaded')
    target_docs = tp.split_wikipedia_docs_into_array(target_corpus_file)
    logging.info('target corpus is loaded ... start aligning ...')

    aligned_corpus = Parallel(n_jobs=3, verbose=100)(delayed(tp.aligning_doc_by_interlanguage_links)(d, target_docs, source_language, target_language, output_path) for d in source_docs)

    source_out = open(output_path + source_language + '.wiki.txt', 'w')
    target_out = open(output_path + target_language + '.wiki.txt', 'w')

    for doc_pair in aligned_corpus:
        if doc_pair[0]:  # if not None
            text_out = doc_pair[0]
            print>>source_out, text_out.encode('utf-8')
            text_out = doc_pair[1]
            print>>target_out, text_out.encode('utf-8')


##################################################################