Python multiprocess.Pool() Examples
The following are 12 code examples of multiprocess.Pool(), collected from open-source projects. Each example notes its source file, originating project, and license.
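Before the project examples, here is a minimal, hypothetical sketch (not taken from any of the projects below) of the basic workflow. multiprocess is a fork of the standard library's multiprocessing that mirrors its API, so its Pool supports map, imap, imap_unordered, close/join, and use as a context manager.

# Minimal sketch (hypothetical): create a pool, map a function over some
# inputs, and let the context manager clean up the worker processes.
from multiprocess import Pool

def square(x):
    return x * x

if __name__ == '__main__':
    with Pool(4) as pool:                      # four worker processes
        results = pool.map(square, range(10))  # blocks until all results are in
    print(results)                             # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]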
Example #1
Source File: data_builder.py From PreSumm with MIT License
def format_to_bert(args):
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train', 'valid', 'test']
    for corpus_type in datasets:
        a_lst = []
        for json_f in glob.glob(pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
            real_name = json_f.split('/')[-1]
            a_lst.append((corpus_type, json_f, args,
                          pjoin(args.save_path, real_name.replace('json', 'bert.pt'))))
        print(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass

        pool.close()
        pool.join()
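Example #1 packs each job into a tuple, hands the list to pool.imap with a single-argument worker (_format_to_bert), and drains the iterator purely for its side effects before closing and joining the pool. A minimal sketch of that pattern, with a hypothetical worker standing in for _format_to_bert:

from multiprocess import Pool

def _process(params):            # hypothetical stand-in for _format_to_bert
    corpus_type, value = params  # imap delivers each tuple as a single argument
    return '{}-{}'.format(corpus_type, value)

if __name__ == '__main__':
    a_lst = [('train', 1), ('valid', 2), ('test', 3)]
    pool = Pool(2)
    for d in pool.imap(_process, a_lst):  # lazy iterator, results in input order
        pass                              # consumed only for side effects, as above
    pool.close()                          # no further tasks will be submitted
    pool.join()                           # wait for the workers to finish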
Example #2
Source File: data_builder.py From BertSum with Apache License 2.0
def format_to_bert(args):
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train', 'valid', 'test']
    for corpus_type in datasets:
        a_lst = []
        for json_f in glob.glob(pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
            real_name = json_f.split('/')[-1]
            a_lst.append((json_f, args,
                          pjoin(args.save_path, real_name.replace('json', 'bert.pt'))))
        print(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass

        pool.close()
        pool.join()
Example #3
Source File: data_builder.py From PreSumm with MIT License
def format_xsum_to_lines(args):
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train', 'test', 'valid']

    corpus_mapping = json.load(open(pjoin(args.raw_path, 'XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json')))

    for corpus_type in datasets:
        mapped_fnames = corpus_mapping[corpus_type]
        root_src = pjoin(args.raw_path, 'restbody')
        root_tgt = pjoin(args.raw_path, 'firstsentence')
        # realnames = [fname.split('.')[0] for fname in os.listdir(root_src)]
        realnames = mapped_fnames

        a_lst = [(root_src, root_tgt, n) for n in realnames]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_xsum_to_lines, a_lst):
            if (d is None):
                continue
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []

        pool.close()
        pool.join()

        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []
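Example #3 switches to pool.imap_unordered, which yields results in completion order rather than submission order, and buffers them into shards that are written out whenever args.shard_size is exceeded. A stripped-down sketch of that buffering pattern (the worker and shard size below are hypothetical):

from multiprocess import Pool

def _work(n):          # hypothetical worker
    return n * n

if __name__ == '__main__':
    shard_size = 4     # hypothetical stand-in for args.shard_size
    pool = Pool(2)
    dataset, p_ct = [], 0
    for d in pool.imap_unordered(_work, range(10)):  # completion order, not input order
        dataset.append(d)
        if len(dataset) > shard_size:
            print('shard', p_ct, dataset)            # Example #3 writes a JSON shard here
            p_ct += 1
            dataset = []
    pool.close()
    pool.join()
    if dataset:                                      # flush the final partial shard
        print('shard', p_ct, dataset)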
Example #4
Source File: catalog.py From GASpy with GNU Lesser General Public License v3.0
def update_catalog_collection(elements, max_miller, n_processes=1, mp_query=None):
    '''
    This function will enumerate and add adsorption sites to our `catalog`
    Mongo collection.

    Args:
        elements        A list of strings indicating the elements you are
                        looking for, e.g., ['Cu', 'Al']
        max_miller      An integer indicating the maximum Miller index to be
                        enumerated
        n_processes     An integer indicating how many threads you want to use
                        when running the tasks. If you do not expect many
                        updates, stick to the default of 1, or go up to 4. If
                        you are re-creating your collection from scratch, you
                        may want to increase this argument as high as you can.
        mp_query        We get our bulks from The Materials Project. This
                        dictionary argument is used as a Mongo query to The
                        Materials Project Database. If you do not supply this
                        argument, then it will automatically filter out bulks
                        whose energies above the hull are greater than 0.1 eV
                        and whose formation energy per atom is above 0 eV.
    '''
    # Python doesn't like mutable arguments
    if mp_query is None:
        mp_query = {}

    # Figure out the MPIDs we need to enumerate
    get_mpid_task = _GetMpids(elements=elements, mp_query=mp_query)
    schedule_tasks([get_mpid_task])
    mpids = get_task_output(get_mpid_task)

    # For each MPID, enumerate all the sites and then add them to our `catalog`
    # Mongo collection. Do this in parallel because it can be.
    if n_processes > 1:
        with multiprocess.Pool(n_processes) as pool:
            list(pool.imap(func=lambda mpid: __run_insert_to_catalog_task(mpid, max_miller),
                           iterable=mpids, chunksize=20))
    else:
        for mpid in mpids:
            __run_insert_to_catalog_task(mpid, max_miller)
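Example #4 passes a lambda (which closes over max_miller) straight to pool.imap. That works here because multiprocess serializes callables with dill; the standard library's multiprocessing.Pool would typically refuse to pickle a lambda. A minimal, illustrative sketch of the same idea with hypothetical names:

import multiprocess

def insert_item(item, setting):   # hypothetical stand-in for __run_insert_to_catalog_task
    return (item, setting)

if __name__ == '__main__':
    items = ['id-1', 'id-2', 'id-3']   # hypothetical work items
    setting = 2
    with multiprocess.Pool(2) as pool:
        # The lambda closes over `setting`; dill lets multiprocess ship it to the workers.
        results = list(pool.imap(lambda it: insert_item(it, setting), items, chunksize=1))
    print(results)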
Example #5
Source File: parallel.py From quantum-honeycomp with GNU General Public License v3.0
def Pool(n=1): # workaround
    class mpool():
        def map(self, f, xs):
            return [f(x) for x in xs]
        def terminate(self):
            return None # dummy function
    return mpool()
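This stub implements only the two Pool methods the rest of the module relies on (map and terminate), so callers can use it interchangeably with a real pool. A hypothetical usage sketch, assuming the Pool stub above is in scope:

pool = Pool()                                  # the serial stand-in defined above
print(pool.map(lambda x: x + 1, [1, 2, 3]))    # [2, 3, 4], computed in the current process
pool.terminate()                               # no-op, but matches the real Pool API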
Example #6
Source File: parallel.py From quantum-honeycomp with GNU General Public License v3.0
def set_cores(n=1):
    global cores
    cores = n

#mainpool = None

#def initialize():
#    global mainpool
#    if cores>1:
#        mainpool = Pool(cores) # create pool
#    return mainpool

#def finish(): mainpool=None # delete pool
Example #7
Source File: parallel.py From quantum-honeycomp with GNU General Public License v3.0
def pcall_mp(fun, args, cores=cores):
    """Calls a function for every input in args"""
    mainpool = Pool(cores) # create pool
#    print("Using",cores,"cores")
    out = mainpool.map(fun, args) # return list
    mainpool.terminate() # clear the pool
    del mainpool # delete pool
    return out

#except:
#    print("Multiprocessing not found, running in a single core")
#    def pcall_mp(fun,args,cores=1): return pcall_serial(fun,args)
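Examples #5 through #7 together form a fallback scheme: a module-level `cores` setting, a real multiprocess pool when parallelism is wanted, and a serial stand-in otherwise, all behind the same map/terminate interface. A self-contained sketch of that dispatch idea (pcall and pcall_serial below are illustrative, not the project's exact code):

from multiprocess import Pool

def pcall_serial(fun, args):            # serial fallback, single core
    return [fun(x) for x in args]

def pcall(fun, args, cores=1):
    """Run fun over args, in parallel only when more than one core is requested."""
    if cores == 1:
        return pcall_serial(fun, args)
    pool = Pool(cores)                  # create pool
    try:
        return pool.map(fun, args)
    finally:
        pool.terminate()                # clear the pool

if __name__ == '__main__':
    print(pcall(lambda x: x + 1, range(5), cores=2))   # [1, 2, 3, 4, 5]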
Example #8
Source File: data_builder.py From PreSumm with MIT License
def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
        # else:
        #     train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []

        pool.close()
        pool.join()

        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []
Example #9
Source File: data_builder.py From BertSum with Apache License 2.0
def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []

        pool.close()
        pool.join()

        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []
Example #10
Source File: cload.py From cooler with BSD 3-Clause "New" or "Revised" License
def tabix(bins, pairs_path, cool_path, metadata, assembly, nproc, zero_based,
          max_split, **kwargs):
    """
    Bin a tabix-indexed contact list file.

    {}

    See also: 'cooler csort' to sort and index a contact list file

    Tabix manpage: <http://www.htslib.org/doc/tabix.html>.

    """
    logger = get_logger(__name__)
    chromsizes, bins = parse_bins(bins)

    if metadata is not None:
        with open(metadata, 'r') as f:
            metadata = json.load(f)

    try:
        if nproc > 1:
            pool = Pool(nproc)
            logger.info("Using {} cores".format(nproc))
            map = pool.imap
        else:
            map = six.moves.map

        opts = {}
        if 'chrom2' in kwargs:
            opts['C2'] = kwargs['chrom2'] - 1
        if 'pos2' in kwargs:
            opts['P2'] = kwargs['pos2'] - 1

        iterator = TabixAggregator(
            pairs_path,
            chromsizes,
            bins,
            map=map,
            is_one_based=(not zero_based),
            n_chunks=max_split,
            **opts
        )

        create_cooler(
            cool_path,
            bins,
            iterator,
            metadata=metadata,
            assembly=assembly,
            ordered=True)

    finally:
        if nproc > 1:
            pool.close()
Example #11
Source File: cload.py From cooler with BSD 3-Clause "New" or "Revised" License
def pairix(bins, pairs_path, cool_path, metadata, assembly, nproc, zero_based,
           max_split):
    """
    Bin a pairix-indexed contact list file.

    {}

    See also: 'cooler csort' to sort and index a contact list file

    Pairix on GitHub: <https://github.com/4dn-dcic/pairix>.

    """
    logger = get_logger(__name__)
    chromsizes, bins = parse_bins(bins)

    if metadata is not None:
        with open(metadata, 'r') as f:
            metadata = json.load(f)

    try:
        if nproc > 1:
            pool = Pool(nproc)
            logger.info("Using {} cores".format(nproc))
            map = pool.imap
        else:
            map = six.moves.map

        iterator = PairixAggregator(
            pairs_path,
            chromsizes,
            bins,
            map=map,
            is_one_based=(not zero_based),
            n_chunks=max_split)

        create_cooler(
            cool_path,
            bins,
            iterator,
            metadata=metadata,
            assembly=assembly,
            ordered=True)

    finally:
        if nproc > 1:
            pool.close()
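Examples #10 and #11 share one structure: choose the mapping function up front (pool.imap when nproc > 1, a plain serial map otherwise), pass it into the aggregator, and release the pool in a finally block. A simplified sketch of that structure with a hypothetical worker:

from multiprocess import Pool

def _aggregate(chunk):       # hypothetical per-chunk worker
    return sum(chunk)

def run(chunks, nproc=1):
    pool = None
    try:
        if nproc > 1:
            pool = Pool(nproc)
            map_ = pool.imap     # lazy parallel map
        else:
            map_ = map           # built-in serial map (what six.moves.map resolves to on Python 3)
        return list(map_(_aggregate, chunks))
    finally:
        if pool is not None:
            pool.close()

if __name__ == '__main__':
    print(run([[1, 2], [3, 4], [5, 6]], nproc=2))   # [3, 7, 11]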
Example #12
Source File: utils.py From GASpy with GNU Lesser General Public License v3.0
def multimap(function, inputs, chunked=False, processes=32, maxtasksperchild=1,
             chunksize=1, n_calcs=None):
    '''
    This function is a wrapper to parallelize a function.

    Args:
        function            The function you want to execute
        inputs              An iterable that yields proper arguments to the
                            function
        chunked             A Boolean indicating whether your function expects
                            single arguments or "chunked" iterables, e.g.,
                            lists.
        processes           The number of threads/processes you want to be
                            using
        maxtasksperchild    The maximum number of tasks that a child process
                            may do before terminating (and therefore clearing
                            its memory cache to avoid memory overload).
        chunksize           How many calculations you want to have each single
                            processor do per task. Smaller chunks means more
                            memory shuffling. Bigger chunks means more RAM
                            requirements.
        n_calcs             How many calculations you have. Only necessary for
                            adding a percentage timer to the progress bar.
    Returns:
        outputs     A list of the inputs mapped through the function
    '''
    # Collect garbage before we begin multiprocessing to make sure we don't
    # pass things we don't need to
    gc.collect()

    # If we have one thread, there's no use multiprocessing
    if processes == 1:
        output = [function(input_) for input_ in tqdm(inputs, total=n_calcs)]
        return output

    with Pool(processes=processes, maxtasksperchild=maxtasksperchild) as pool:
        # Use multiprocessing to perform the calculations. We use imap instead
        # of map so that we get an iterator, which we need for tqdm (the
        # progress bar) to work. imap also requires less disk memory, which
        # can be an issue for some of our large systems.
        if not chunked:
            iterator = pool.imap(function, inputs, chunksize=chunksize)
            total = n_calcs
            outputs = list(tqdm(iterator, total=total))

        # If our function expects chunks, then we have to unpack our inputs
        # appropriately
        else:
            iterator = pool.imap(function, _chunk(inputs, n=chunksize))
            total = n_calcs / chunksize
            outputs = list(np.concatenate(list(tqdm(iterator, total=total))))

    return outputs
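Example #12 combines the pool context manager with maxtasksperchild (recycle a worker process after a fixed number of tasks to cap its memory footprint) and chunksize (batch several inputs per dispatched task to cut inter-process overhead). A small, hypothetical sketch of those two knobs, without the tqdm progress bar:

from multiprocess import Pool

def transform(x):        # hypothetical CPU-bound task
    return x ** 2

if __name__ == '__main__':
    inputs = range(100)
    # maxtasksperchild=1 replaces each worker after one task (here, one chunk of
    # 10 inputs), bounding per-worker memory growth at the cost of process churn.
    with Pool(processes=4, maxtasksperchild=1) as pool:
        outputs = list(pool.imap(transform, inputs, chunksize=10))
    print(len(outputs))  # 100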