Python joblib.Parallel() Examples

The following are 30 code examples of joblib.Parallel(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module joblib, or try the search function.
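All of the examples below follow the same basic recipe: wrap the function you want to run with joblib.delayed(), build a generator of those delayed calls, and pass it to a Parallel object configured with n_jobs. Here is a minimal, self-contained sketch of that recipe (not taken from any of the projects below):

from math import sqrt
from joblib import Parallel, delayed

# Evaluate sqrt over the inputs with two worker processes; Parallel
# returns the results in the same order as the inputs.
results = Parallel(n_jobs=2)(delayed(sqrt)(i) for i in range(10))
print(results)  # [0.0, 1.0, 1.414..., ..., 3.0]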
Example #1
Source File: utils.py    From nmp_qc with MIT License    8 votes
def get_graph_stats(graph_obj_handle, prop='degrees'):
    # if prop == 'degrees':
    num_cores = multiprocessing.cpu_count()
    inputs = [int(i*len(graph_obj_handle)/num_cores) for i in range(num_cores)] + [len(graph_obj_handle)]
    res = Parallel(n_jobs=num_cores)(delayed(get_values)(graph_obj_handle, inputs[i], inputs[i+1], prop) for i in range(num_cores))

    stat_dict = {}

    if 'degrees' in prop:
        stat_dict['degrees'] = list(set([d for core_res in res for file_res in core_res for d in file_res['degrees']]))
    if 'edge_labels' in prop:
        stat_dict['edge_labels'] = list(set([d for core_res in res for file_res in core_res for d in file_res['edge_labels']]))
    if 'target_mean' in prop or 'target_std' in prop:
        param = np.array([file_res['params'] for core_res in res for file_res in core_res])
    if 'target_mean' in prop:
        stat_dict['target_mean'] = np.mean(param, axis=0)
    if 'target_std' in prop:
        stat_dict['target_std'] = np.std(param, axis=0)

    return stat_dict 
Example #2
Source File: atlas2.py    From ssbio with MIT License    6 votes
def build_strain_specific_models(self, joblib=False, cores=1, force_rerun=False):
        """Wrapper function for _build_strain_specific_model"""
        if len(self.df_orthology_matrix) == 0:
            raise RuntimeError('Empty orthology matrix, please calculate first!')
        ref_functional_genes = [g.id for g in self.reference_gempro.functional_genes]
        log.info('Building strain specific models...')
        if joblib:
            result = DictList(Parallel(n_jobs=cores)(delayed(self._build_strain_specific_model)(s, ref_functional_genes, self.df_orthology_matrix, force_rerun=force_rerun) for s in self.strain_ids))
        # if sc:
        #     strains_rdd = sc.parallelize(self.strain_ids)
        #     result = strains_rdd.map(self._build_strain_specific_model).collect()
        else:
            result = []
            for s in tqdm(self.strain_ids):
                result.append(self._build_strain_specific_model(s, ref_functional_genes, self.df_orthology_matrix, force_rerun=force_rerun))

        for strain_id, gp_noseqs_path in result:
            self.strain_infodict[strain_id]['gp_noseqs_path'] = gp_noseqs_path 
Example #3
Source File: librispeech.py    From End-to-end-ASR-Pytorch with MIT License    6 votes
def __init__(self, path, split, tokenizer, bucket_size, ascending=False):
        # Setup
        self.path = path
        self.bucket_size = bucket_size

        # List all wave files
        file_list = []
        for s in split:
            split_list = list(Path(join(path, s)).rglob("*.flac"))
            assert len(split_list) > 0, "No data found @ {}".format(join(path,s))
            file_list += split_list
        # Read text
        text = Parallel(n_jobs=READ_FILE_THREADS)(
            delayed(read_text)(str(f)) for f in file_list)
        #text = Parallel(n_jobs=-1)(delayed(tokenizer.encode)(txt) for txt in text)
        text = [tokenizer.encode(txt) for txt in text]

        # Sort dataset by text length
        #file_len = Parallel(n_jobs=READ_FILE_THREADS)(delayed(getsize)(f) for f in file_list)
        self.file_list, self.text = zip(*[(f_name, txt)
                                          for f_name, txt in sorted(zip(file_list, text), reverse=not ascending, key=lambda x:len(x[1]))]) 
Example #4
Source File: _glm_reporter_visual_inspection_suite_.py    From nistats with BSD 3-Clause "New" or "Revised" License    6 votes
def prefer_parallel_execution(functions_to_be_called):  # pragma: no cover
    try:
        import joblib
        import multiprocessing
    except ImportError:
        print('Joblib not installed, switching to serial execution')
        [run_function(fn) for fn in functions_to_be_called]
    else:
        try:
            import tqdm
        except ImportError:
            inputs = functions_to_be_called
        else:
            inputs = tqdm.tqdm(functions_to_be_called)
        n_jobs = multiprocessing.cpu_count()
        print('Parallelizing execution using Joblib')
        joblib.Parallel(n_jobs=n_jobs)(
                joblib.delayed(run_function)(fn) for fn in inputs) 
Example #5
Source File: utils.py    From DeepLab_v3 with MIT License    6 votes
def next_minibatch(self):

        image_filenames_minibatch = self.image_filenames[self.current_index: self.current_index + self.minibatch_size]
        label_filenames_minibatch = self.label_filenames[self.current_index: self.current_index + self.minibatch_size]
        self.current_index += self.minibatch_size
        if self.current_index >= self.dataset_size:
            self.current_index = 0

        # Multithread image processing
        # Reference: https://www.kaggle.com/inoryy/fast-image-pre-process-in-parallel

        results = Parallel(n_jobs=self.num_jobs)(delayed(self.process_func)(image_filename, label_filename) for image_filename, label_filename in zip(image_filenames_minibatch, label_filenames_minibatch))
        images, labels = zip(*results)

        images = np.asarray(images)
        labels = np.asarray(labels)

        return images, labels 
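Several examples on this page use the same trick as the snippet above: each delayed call returns a tuple, and zip(*results) on the caller side splits the list of tuples back into separate sequences. A small sketch of that pattern, independent of the project above:

from joblib import Parallel, delayed

def load_pair(i):
    # Stand-in for real image/label loading.
    return [i, i + 1], i % 2

results = Parallel(n_jobs=2)(delayed(load_pair)(i) for i in range(4))
# Each result is an (image, label) tuple; unzip them into two sequences.
images, labels = zip(*results)
print(images)  # ([0, 1], [1, 2], [2, 3], [3, 4])
print(labels)  # (0, 1, 0, 1)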
Example #6
Source File: action_detector_diagnosis.py    From DETAD with MIT License    6 votes
def wrapper_compute_average_precision(self):
        """Computes average precision for each class in the subset.
        """
        ap = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))
        recall = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))
        precision = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))
        matched_gt_id = np.zeros((len(self.tiou_thresholds), len(self.prediction)))

        results = Parallel(n_jobs=len(self.activity_index))(
                    delayed(compute_average_precision_detection)(
                        ground_truth=self.ground_truth.loc[self.ground_truth['label'] == cidx].reset_index(drop=True),
                        prediction=self.prediction.loc[self.prediction['label'] == cidx].reset_index(drop=True),
                        tiou_thresholds=self.tiou_thresholds,
                        normalize_ap=self.normalize_ap, 
                        average_num_instance_per_class=self.average_num_instance_per_class,
                        minimum_normalized_precision_threshold_for_detection=self.minimum_normalized_precision_threshold_for_detection,
                    ) for cidx in self.activity_index.values())
        
        for i, cidx in enumerate(self.activity_index.values()):
            ap[:,cidx], matched_this_cls_gt_id, this_cls_prediction_ids, recall[:,cidx], precision[:,cidx] = results[i]
            matched_gt_id[:,this_cls_prediction_ids] = matched_this_cls_gt_id

        return ap, matched_gt_id, recall, precision 
Example #7
Source File: __init__.py    From s3tk with MIT License    6 votes
def parallelize(bucket, only, _except, fn, args=(), versions=False):
    bucket = s3().Bucket(bucket)

    # use prefix for performance
    prefix = None
    if only:
        # get the first prefix before wildcard
        prefix = '/'.join(only.split('*')[0].split('/')[:-1])
        if prefix:
            prefix = prefix + '/'

    if versions:
        object_versions = bucket.object_versions.filter(Prefix=prefix) if prefix else bucket.object_versions.all()
        # delete markers have no size
        return Parallel(n_jobs=24)(delayed(fn)(bucket.name, ov.object_key, ov.id, *args) for ov in object_versions if object_matches(ov.object_key, only, _except) and not ov.is_latest and ov.size is not None)
    else:
        objects = bucket.objects.filter(Prefix=prefix) if prefix else bucket.objects.all()

        if only and not '*' in only:
            objects = [s3().Object(bucket, only)]

        return Parallel(n_jobs=24)(delayed(fn)(bucket.name, os.key, *args) for os in objects if object_matches(os.key, only, _except)) 
Example #8
Source File: action_detector_diagnosis.py    From DETAD with MIT License    6 votes
def wrapper_analyze_fp_error_types(self):
        self.fp_error_types_legned = {'True Positive': 0,
                                      'Double Detection Err': 1,
                                      'Wrong Label Err': 2,
                                      'Localization Err': 3,
                                      'Confusion Err': 4,
                                      'Background Err': 5}

        self.fp_error_types_inverse_legned = dict([(v, k) for k, v in self.fp_error_types_legned.items()])

        fp_error_types = Parallel(n_jobs=len(self.tiou_thresholds))(
                            delayed(analyze_fp_error_types)(
                                prediction=self.prediction,
                                ground_truth=self.ground_truth,
                                tiou_thr=tiou_thr,
                                matched_gt_id_col_name=matched_gt_id_col_name,
                                min_tiou_thr=self.min_tiou_thr,
                                fp_error_types_legned=self.fp_error_types_legned,
                            ) for tiou_thr, matched_gt_id_col_name in zip(self.tiou_thresholds, self.matched_gt_id_cols))
        
        return fp_error_types 
Example #9
Source File: docker_cache.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0    6 votes
def build_save_containers(platforms, registry, load_cache) -> int:
    """
    Entry point to build and upload all built dockerimages in parallel
    :param platforms: List of platforms
    :param registry: Docker registry name
    :param load_cache: Load cache before building
    :return: 1 if error occurred, 0 otherwise
    """
    from joblib import Parallel, delayed
    if len(platforms) == 0:
        return 0

    platform_results = Parallel(n_jobs=len(platforms), backend="multiprocessing")(
        delayed(_build_save_container)(platform, registry, load_cache)
        for platform in platforms)

    is_error = False
    for platform_result in platform_results:
        if platform_result is not None:
            logging.error('Failed to generate %s', platform_result)
            is_error = True

    return 1 if is_error else 0 
Example #10
Source File: graph.py    From AutoSmart with GNU General Public License v3.0    6 votes
def recognize_binary_col(self,data,cat_cols):
        def func(ss):
            ss = ss.unique()
            if len(ss) == 3:
                if pd.isna(ss).sum() == 1:
                    return True
            if len(ss) == 2:
                return True
            return False
        
        binary_cols = []
        
        res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(data[col]) for col in cat_cols)
        
        for col,is_binary in zip(cat_cols,res):
            if is_binary:
                binary_cols.append(col)
        
        return binary_cols 
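The AutoSmart snippets on this page pass require='sharedmem', which asks joblib for a backend whose workers share the parent process's memory (in practice, threads), so the pandas columns are not pickled and copied to each worker and the workers can write into shared objects. A minimal sketch of that option on toy data; the names here are illustrative, not from the project:

from joblib import Parallel, delayed

seen = []  # shared with the workers and mutated in place

def record(value):
    # Appending works only because require='sharedmem' forces a
    # thread-based backend, so every worker sees this same list object.
    seen.append(value * 2)

Parallel(n_jobs=4, require='sharedmem')(delayed(record)(v) for v in range(5))
print(sorted(seen))  # [0, 2, 4, 6, 8]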
Example #11
Source File: batched_inv_joblib.py    From content_wmf with MIT License    6 votes
def recompute_factors_batched(Y, S, lambda_reg, W=None, X=None,
                              dtype='float32', batch_size=10000, n_jobs=4):
    m = S.shape[0]  # m = number of users
    f = Y.shape[1]  # f = number of factors

    YTY = np.dot(Y.T, Y)  # precompute this
    YTYpR = YTY + lambda_reg * np.eye(f)
    if W is not None:
        WX = lambda_reg * (X.dot(W)).T
    else:
        WX = None
    X_new = np.zeros((m, f), dtype=dtype)

    num_batches = int(np.ceil(m / float(batch_size)))

    res = Parallel(n_jobs=n_jobs)(delayed(solve_batch)(b, S, Y, WX, YTYpR,
                                                       batch_size, m, f, dtype)
                                  for b in range(num_batches))
    X_new = np.concatenate(res, axis=0)

    return X_new 
Example #12
Source File: decomposition.py    From tridesclous with MIT License    6 votes
def transform(self, waveforms):
        #~ print('ici', waveforms.shape, self.ind_peak)
        features = waveforms[:, self.ind_peak, : ].copy()
        return features



#~ Parallel(n_jobs=n_jobs)(delayed(count_match_spikes)(sorting1.get_unit_spike_train(u1),
                                                                                  #~ s2_spiketrains, delta_frames) for
                                                      #~ i1, u1 in enumerate(unit1_ids))

#~ def get_pca_one_channel(wf_chan, chan, thresh, n_left, n_components_by_channel, params):
    #~ print(chan)
    #~ pca = sklearn.decomposition.IncrementalPCA(n_components=n_components_by_channel, **params)
    #~ wf_chan = waveforms[:,:,chan]
    #~ print(wf_chan.shape)
    #~ print(wf_chan[:, -n_left].shape)
    #~ keep = np.any((wf_chan>thresh) | (wf_chan<-thresh))
    #~ keep = (wf_chan[:, -n_left]>thresh) | (wf_chan[:, -n_left]<-thresh)

    #~ if keep.sum() >=n_components_by_channel:
        #~ pca.fit(wf_chan[keep, :])
        #~ return pca
    #~ else:
        #~ return None 
Example #13
Source File: weights.py    From pyhawkes with MIT License    6 votes
def _joblib_resample_A_given_W(self, data):
        """
        Resample A given W. This must be immediately followed by an
        update of z | A, W. This  version uses joblib to parallelize
        over columns of A.
        :return:
        """
        # Use the module trick to avoid copying globals
        import pyhawkes.internals.parallel_adjacency_resampling as par
        par.model = self.model
        par.data = data
        par.lambda_irs = [par._compute_weighted_impulses_at_events(d) for d in data]

        if len(data) == 0:
            self.A = np.random.rand(self.K, self.K) < self.network.P
            return

        # We can naively parallelize over receiving neurons, k2
        # To avoid serializing and copying the data object, we
        # manually extract the required arrays Sk, Fk, etc.
        A_cols = Parallel(n_jobs=-1, backend="multiprocessing")(
            delayed(par._ct_resample_column_of_A)(k2) for k2 in range(self.K))
        self.A = np.array(A_cols).T 
Example #14
Source File: weights.py    From pyhawkes with MIT License    6 votes
def _joblib_resample_A_given_W(self, data):
        """
        Resample A given W. This must be immediately followed by an
        update of z | A, W. This  version uses joblib to parallelize
        over columns of A.
        :return:
        """
        # Use the module trick to avoid copying globals
        import pyhawkes.internals.parallel_adjacency_resampling as par
        par.model = self.model
        par.data = data
        par.K = self.model.K

        if len(data) == 0:
            self.A = np.random.rand(self.K, self.K) < self.network.P
            return

        # We can naively parallelize over receiving neurons, k2
        # To avoid serializing and copying the data object, we
        # manually extract the required arrays Sk, Fk, etc.
        A_cols = Parallel(n_jobs=-1, backend="multiprocessing")(
            delayed(par._resample_column_of_A)(k2) for k2 in range(self.K))
        self.A = np.array(A_cols).T 
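The two pyhawkes snippets pin backend="multiprocessing" explicitly. joblib also provides the process-based 'loky' backend (the default) and a 'threading' backend; which one fits depends on whether the work is CPU-bound pure Python or releases the GIL. A small sketch of the backend argument, not tied to the project above:

import numpy as np
from joblib import Parallel, delayed

# A thread-based backend is cheap to start and works here because NumPy
# releases the GIL while sorting; prefer a process backend ('loky' or
# 'multiprocessing') for CPU-bound pure-Python work.
rows = Parallel(n_jobs=-1, backend="threading")(
    delayed(np.sort)(np.random.rand(5)) for _ in range(4))
print(np.vstack(rows).shape)  # (4, 5)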
Example #15
Source File: base_mab.py    From mabwiser with Apache License 2.0    6 votes
def _parallel_predict(self, contexts: np.ndarray, is_predict: bool):

        # Total number of contexts to predict
        n_contexts = len(contexts)

        # Partition contexts by job
        n_jobs, n_contexts, starts = self._partition_contexts(n_contexts)
        total_contexts = sum(n_contexts)

        # Get seed value for each context
        seeds = self.rng.randint(np.iinfo(np.int32).max, size=total_contexts)

        # Perform parallel predictions
        predictions = Parallel(n_jobs=n_jobs, backend=self.backend)(
                          delayed(self._predict_contexts)(
                              contexts[starts[i]:starts[i + 1]],
                              is_predict,
                              seeds[starts[i]:starts[i + 1]],
                              starts[i])
                          for i in range(n_jobs))

        # Reduce
        predictions = list(chain.from_iterable(t for t in predictions))

        return predictions if len(predictions) > 1 else predictions[0] 
Example #16
Source File: preprocessor.py    From AutoSmart with GNU General Public License v3.0    6 votes
def fit(self,X):
        def func(ss):
            length = len(ss.unique())
            if length <= 1:
                return True
            else:
                return False
            
        df = X.data
        todo_cols = X.cat_cols + X.multi_cat_cols + X.num_cols + X.time_cols + X.binary_cols
        res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols)
        
        drop_cols = []
        for col,unique in zip(todo_cols,res):
            if unique:
                drop_cols.append(col)
        
        self.drop_cols = drop_cols 
Example #17
Source File: preprocessor.py    From AutoSmart with GNU General Public License v3.0    6 votes
def fit(self,X):
        def func(ss):
            length = len(ss.unique())
            if length >= len(ss)-10:
                return True
            else:  
                return False
        
        df = X.data
        todo_cols = X.cat_cols
        res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols)
        
        drop_cols = []
        for col,all_diff in zip(todo_cols,res):
            if all_diff:
                drop_cols.append(col)
        
        self.drop_cols = drop_cols 
Example #18
Source File: walker.py    From GraphEmbedding with MIT License    6 votes
def simulate_walks(self, num_walks, walk_length, stay_prob=0.3, workers=1, verbose=0):

        layers_adj = pd.read_pickle(self.temp_path+'layers_adj.pkl')
        layers_alias = pd.read_pickle(self.temp_path+'layers_alias.pkl')
        layers_accept = pd.read_pickle(self.temp_path+'layers_accept.pkl')
        gamma = pd.read_pickle(self.temp_path+'gamma.pkl')
        walks = []
        initialLayer = 0

        nodes = self.idx  # list(self.g.nodes())

        results = Parallel(n_jobs=workers, verbose=verbose, )(
            delayed(self._simulate_walks)(nodes, num, walk_length, stay_prob, layers_adj, layers_accept, layers_alias, gamma) for num in
            partition_num(num_walks, workers))

        walks = list(itertools.chain(*results))
        return walks 
Example #19
Source File: default_feat.py    From AutoSmart with GNU General Public License v3.0    5 votes
def transform(self,X):
        df = X.data
        col2type = {}
        col2groupby = {}
        todo_cols = X.cat_cols
        todo_cols = todo_cols[:300]
        
        if not todo_cols:
            return

        new_cols = []
        for col in todo_cols:
            new_col =  col+'_CatCount'
            new_col = FeatContext.gen_feat_name(namespace,self.__class__.__name__,new_col,CONSTANT.NUMERICAL_TYPE)
            col2type[new_col] = CONSTANT.NUMERICAL_TYPE
            new_cols.append(new_col)
            col2groupby[new_col] = col

        def func(series):
            col = series.name
            col_count = series.value_counts()
            new_col = col+'_CatCount'
            new_col = FeatContext.gen_feat_name(namespace,self.__class__.__name__,new_col,CONSTANT.NUMERICAL_TYPE)
            ss = downcast(series.map(col_count))
            return ss

        res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols)
        if res:
            tmp = pd.concat(res,axis=1)
            tmp.columns = new_cols
            
            if df.shape[0] <= 2000000:
                df = pd.concat([df,tmp],axis=1)
            else:
                for col in tmp.columns:
                    df[col] = tmp[col]
                    
            X.update_data(df,col2type,col2groupby,col2source_cat=col2groupby) 
Example #20
Source File: data_manager.py    From sepconv with MIT License    5 votes
def _extract_patches(tuples, max_per_frame=1, trials_per_tuple=100, flow_threshold=25.0, jumpcut_threshold=np.inf,
                     workers=0):
    """
    Spawns the specified number of workers running _extract_patches_worker().
    Call this with workers=0 to run on the current thread.
    """

    tick_t = timer()
    print('===> Extracting patches...')

    if workers != 0:
        parallel = Parallel(n_jobs=workers, backend='threading', verbose=5)
        tuples_per_job = len(tuples) // workers + 1
        result = parallel(
            delayed(_extract_patches_worker)(tuples[i:i + tuples_per_job], max_per_frame, trials_per_tuple,
                                             flow_threshold, jumpcut_threshold) for i in
            range(0, len(tuples), tuples_per_job))
        patches = sum(result, [])
    else:
        patches = _extract_patches_worker(tuples, max_per_frame, trials_per_tuple, flow_threshold, jumpcut_threshold)

    tock_t = timer()
    print("Done. Took ~{}s".format(round(tock_t - tick_t)))

    return patches


############################################### CACHE ############################################### 
Example #21
Source File: mono_3d_tracking.py    From 3d-vehicle-tracking with BSD 3-Clause "New" or "Revised" License    5 votes
def run_app(self):
        """
        Entry function of calling parallel tracker on sequences
        """
        self.seq_gt_name = os.path.join(os.path.dirname(self.args.path),
                                         'gt.json')
        self.seq_pd_name = self.args.out_path + '_pd.json'

        if isinstance(self.label_paths, str):
            label_paths = pickle.load(open(self.label_paths, 'rb'))
        else:
            label_paths = self.label_paths

        n_seq = len(label_paths)
        print('* Number of sequence: {}'.format(n_seq))
        assert n_seq > 0, "Number of sequence is 0!"

        print('=> Building gt & hypo...')
        result = Parallel(n_jobs=self.args.n_jobs)(
            delayed(self.run_parallel)(seq_path, i_s)
            for i_s, seq_path in enumerate(tqdm(
                label_paths,
                disable=not self.args.verbose))
        )

        self.seq_gt_list = [n[0] for n in result]
        self.seq_hypo_list = [n[1] for n in result]

        if not os.path.isfile(self.seq_gt_name):
            with open(self.seq_gt_name, 'w') as f:
                print("Writing to {}".format(self.seq_gt_name))
                json.dump(self.seq_gt_list, f)
        with open(self.seq_pd_name, 'w') as f:
            print("Writing to {}".format(self.seq_pd_name))
            json.dump(self.seq_hypo_list, f) 
Example #22
Source File: data_manager.py    From sepconv with MIT License    5 votes
def _cache_patches(cache_dir, patches, workers=0):
    """
    Spawns the specified number of workers running _cache_patches_worker().
    Call this with workers=0 to run on the current thread.
    """

    if exists(cache_dir):
        rmdir(cache_dir)

    makedirs(cache_dir)

    tick_t = timer()
    print('===> Caching patches...')

    if workers != 0:
        parallel = Parallel(n_jobs=workers, backend='threading', verbose=5)
        patches_per_job = len(patches) // workers + 1
        parallel(delayed(_cache_patches_worker)(cache_dir, patches[i:i + patches_per_job]) for i in
                 range(0, len(patches), patches_per_job))
    else:
        _cache_patches_worker(cache_dir, patches)

    tock_t = timer()
    print("Done. Took ~{}s".format(round(tock_t - tick_t)))


################################################ MAIN ############################################### 
Example #23
Source File: minhash_encoder.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License    5 votes
def transform(self, X):
        """ Transform X using specified encoding scheme.
        Parameters
        ----------
        X : array-like, shape (n_samples, ) or (n_samples, 1)
            The string data to encode.
        Returns
        -------
        array, shape (n_samples, n_components)
            Transformed input.
        """
        X = np.asarray(X)
        assert X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1), f"ERROR:\
        shape {X.shape} of input array is not supported."
        if X.ndim == 2:
            X = X[:, 0]
        # Check if first item has str or np.str_ type
        assert isinstance(X[0], str), "ERROR: Input data is not string."
        X_out = np.zeros((len(X), self.n_components))

        # TODO Parallel run here
        for i, x in enumerate(X):
            if x not in self.hash_dict:
                self.hash_dict[x] = self.get_hash(x)

        for i, x in enumerate(X):
            X_out[i, :] = self.hash_dict[x]
            
        return X_out 
Example #24
Source File: Preprocessing.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0    5 votes
def write_data_csv(fname, frames, preproc):
   """Write data to csv file"""
   fdata = open(fname, "w")
   dr = Parallel()(delayed(get_data)(lst,preproc) for lst in frames)
   data,result = zip(*dr)
   for entry in data:
      fdata.write(','.join(entry)+'\r\n')
   print("All finished, %d slices in total" % len(data))
   fdata.close()
   result = np.ravel(result)
   return result 
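Note that this example calls Parallel() with no arguments at all: with the default n_jobs=None, joblib runs the delayed calls with a single worker unless an enclosing joblib.parallel_backend context supplies another value, so pass n_jobs explicitly when you actually want parallelism.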
Example #25
Source File: preprocessor.py    From AutoSmart with GNU General Public License v3.0    5 votes
def fit(self,X):
        def func(ss):
            cats = pd.Categorical(ss).categories 
            return cats
        
        df = X.data
        todo_cols = X.binary_cols
        
        res = Parallel(n_jobs=CONSTANT.JOBS,require='sharedmem')(delayed(func)(df[col]) for col in todo_cols)
        for col,cats in zip(todo_cols,res):
            self.col2cats[col] = cats 
Example #26
Source File: walker.py    From GraphEmbedding with MIT License    5 votes
def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0):

        G = self.G

        nodes = list(G.nodes())

        results = Parallel(n_jobs=workers, verbose=verbose, )(
            delayed(self._simulate_walks)(nodes, num, walk_length) for num in
            partition_num(num_walks, workers))

        walks = list(itertools.chain(*results))

        return walks 
Example #27
Source File: audfprint.py    From audfprint with MIT License    5 votes
def do_cmd_multiproc(cmd, analyzer, hash_tab, filename_iter, matcher,
                     outdir, type, report, skip_existing=False,
                     strip_prefix=None, ncores=1):
    """ Run the actual command, using multiple processors """
    if cmd == 'precompute':
        # precompute fingerprints with joblib
        msgslist = joblib.Parallel(n_jobs=ncores)(
                joblib.delayed(file_precompute)(analyzer, file, outdir, type, skip_existing, strip_prefix=strip_prefix)
                for file in filename_iter
        )
        # Collapse into a single list of messages
        for msgs in msgslist:
            report(msgs)

    elif cmd == 'match':
        # Running queries in parallel
        msgslist = joblib.Parallel(n_jobs=ncores)(
                # Would use matcher.file_match_to_msgs(), but you
                # can't use joblib on an instance method
                joblib.delayed(matcher_file_match_to_msgs)(matcher, analyzer,
                                                           hash_tab, filename)
                for filename in filename_iter
        )
        for msgs in msgslist:
            report(msgs)

    elif cmd == 'new' or cmd == 'add':
        # We add by forking multiple parallel threads each running
        # analyzers over different subsets of the file list
        multiproc_add(analyzer, hash_tab, filename_iter, report, ncores)

    else:
        # This is not a multiproc command
        raise ValueError("unrecognized multiproc command: " + cmd)


# Command to separate out setting of analyzer parameters 
Example #28
Source File: main.py    From lighter with MIT License    5 votes
def parse_services(filenames, canaryGroup=None, profiles=[]):
    # return [parse_service(filename) for filename in filenames]
    return Parallel(n_jobs=8, backend="threading")(delayed(parse_service)(filename, canaryGroup, profiles) for filename in filenames) if filenames else [] 
Example #29
Source File: dataset.py    From stable-baselines with MIT License    5 votes
def _run(self):
        start = True
        with Parallel(n_jobs=self.n_workers, batch_size="auto", backend=self.backend) as parallel:
            while start or self.infinite_loop:
                start = False

                if self.shuffle:
                    np.random.shuffle(self.indices)

                for minibatch_idx in range(self.n_minibatches):

                    self.start_idx = minibatch_idx * self.batch_size

                    obs = self.observations[self._minibatch_indices]
                    if self.load_images:
                        if self.n_workers <= 1:
                            obs = [self._make_batch_element(image_path)
                                   for image_path in obs]

                        else:
                            obs = parallel(delayed(self._make_batch_element)(image_path)
                                           for image_path in obs)

                        obs = np.concatenate(obs, axis=0)

                    actions = self.actions[self._minibatch_indices]

                    self.queue.put((obs, actions))

                    # Free memory
                    del obs

                self.queue.put(None) 
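The stable-baselines example uses Parallel as a context manager: keeping one Parallel instance alive across the loop reuses the worker pool instead of re-creating it for every minibatch. A minimal sketch of that reuse pattern:

from joblib import Parallel, delayed

# The worker pool created by `with Parallel(...)` stays alive for the
# whole block, so repeated calls avoid per-call startup overhead.
with Parallel(n_jobs=2) as parallel:
    for batch in ([1, 2, 3], [4, 5, 6]):
        print(parallel(delayed(pow)(x, 2) for x in batch))
# [1, 4, 9] then [16, 25, 36]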
Example #30
Source File: aligning-docs-by-interlinks-demo2.py    From comparable-text-miner with Apache License 2.0    5 votes
def main(argv):
	source_corpus_file = sys.argv[1]
	target_corpus_file = sys.argv[2]
	source_language = sys.argv[3]
	target_language = sys.argv[4]
	output_path = sys.argv[5]
	
	if not output_path.endswith('/'): output_path = output_path + '/'
	tp.check_dir(output_path) # if directory does not exist, then create
	
	logging.info('aligning %s and %s wikipedia documents using interlanguage links', source_language, target_language)
	source_docs = tp.split_wikipedia_docs_into_array(source_corpus_file)
	logging.info( 'source corpus is loaded')
	target_docs = tp.split_wikipedia_docs_into_array(target_corpus_file)
	logging.info( 'target corpus is loaded ... start aligning ...')
	
	aligned_corpus = Parallel(n_jobs=3,verbose=100)(delayed(tp.aligning_doc_by_interlanguage_links)(d, target_docs, source_language, target_language, output_path) for d in source_docs)
	
	
	source_out = open(output_path + source_language + '.wiki.txt', 'w', encoding='utf-8')
	target_out = open(output_path + target_language + '.wiki.txt', 'w', encoding='utf-8')
	
	for doc_pair in aligned_corpus:
		if doc_pair[0]:  # skip pairs with no aligned document
			print(doc_pair[0], file=source_out)
			print(doc_pair[1], file=target_out)
	
	

##################################################################