Python toolz.partition_all() Examples
The following are 11 code examples of toolz.partition_all().
Each example notes the original project and source file it was taken from.
You may also want to check out the other functions and classes available in the toolz module.
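As a quick orientation before the project examples, here is a minimal sketch of what partition_all itself does: it breaks an iterable into tuples of at most n items, with the final tuple shorter when the input does not divide evenly. (This is a small illustrative snippet, not taken from any of the projects below.)

from toolz import partition_all

# Break an iterable into tuples of at most 3 items; the last tuple
# may be shorter when the input length is not a multiple of 3.
chunks = list(partition_all(3, range(8)))
print(chunks)  # [(0, 1, 2), (3, 4, 5), (6, 7)]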
Example #1
Source File: umis.py From umis with MIT License
def sb_filter(fastq, bc, cores, nedit):
    ''' Filters reads with non-matching sample barcodes
    Expects formatted fastq files.
    '''
    barcodes = set(sb.strip() for sb in bc)
    if nedit == 0:
        filter_sb = partial(exact_sample_filter2, barcodes=barcodes)
    else:
        barcodehash = MutationHash(barcodes, nedit)
        filter_sb = partial(correcting_sample_filter2, barcodehash=barcodehash)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_sb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #2
Source File: accounts.py From hivemind with MIT License
def _cache_accounts(cls, accounts, steem, trx=True):
    """Fetch all `accounts` and write to db."""
    timer = Timer(len(accounts), 'account', ['rps', 'wps'])
    for name_batch in partition_all(1000, accounts):
        cached_at = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        timer.batch_start()
        batch = steem.get_accounts(name_batch)
        timer.batch_lap()
        sqls = [cls._sql(acct, cached_at) for acct in batch]
        DB.batch_queries(sqls, trx)
        timer.batch_finish(len(batch))

        if trx or len(accounts) > 1000:
            log.info(timer.batch_status())
Example #3
Source File: scraper.py From steemdata-mongo with MIT License
def scrape_blockchain(mongo):
    s = Steem()
    # see how far behind we are
    missing = list(range(last_block_num(mongo), s.last_irreversible_block_num))

    # if we are far behind blockchain head
    # split work in chunks of 100
    if len(missing) > 100:
        for batch in partition_all(100, missing):
            results = s.get_blocks(batch)
            insert_blocks(mongo, results)

    # otherwise continue as normal
    blockchain = Blockchain(mode="irreversible")
    hist = blockchain.stream_from(start_block=last_block_num(mongo), full_blocks=True)
    insert_blocks(mongo, hist)
Example #4
Source File: umis.py From umis with MIT License
def cb_filter(fastq, bc1, bc2, bc3, cores, nedit):
    ''' Filters reads with non-matching barcodes
    Expects formatted fastq files.
    '''
    with open_gzipsafe(bc1) as bc1_fh:
        bc1 = set(cb.strip() for cb in bc1_fh)
    if bc2:
        with open_gzipsafe(bc2) as bc2_fh:
            bc2 = set(cb.strip() for cb in bc2_fh)
    if bc3:
        with open_gzipsafe(bc3) as bc3_fh:
            bc3 = set(cb.strip() for cb in bc3_fh)

    annotations = detect_fastq_annotations(fastq)
    re_string = construct_transformed_regex(annotations)

    if nedit == 0:
        filter_cb = partial(exact_barcode_filter, bc1=bc1, bc2=bc2, bc3=bc3,
                            re_string=re_string)
    else:
        bc1hash = MutationHash(bc1, nedit)
        bc2hash = None
        bc3hash = None
        if bc2:
            bc2hash = MutationHash(bc2, nedit)
        if bc3:
            bc3hash = MutationHash(bc3, nedit)
        filter_cb = partial(correcting_barcode_filter, bc1hash=bc1hash,
                            bc2hash=bc2hash, bc3hash=bc3hash, re_string=re_string)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_cb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #5
Source File: umis.py From umis with MIT License
def mb_filter(fastq, cores):
    ''' Filters umis with non-ACGT bases
    Expects formatted fastq files.
    '''
    filter_mb = partial(umi_filter)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_mb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #6
Source File: umis.py From umis with MIT License
def add_uid(fastq, cores):
    ''' Adds UID:[samplebc cellbc umi] to readname for umi-tools deduplication
    Expects formatted fastq files with correct sample and cell barcodes.
    '''
    uids = partial(append_uids)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(uids, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #7
Source File: discovery.py From pyquarkchain with MIT License
def send_topic_nodes(
        self, node: kademlia.Node, echo: Hash32, nodes: Tuple[kademlia.Node, ...]
) -> None:
    encoded_nodes = tuple(
        n.address.to_endpoint() + [n.pubkey.to_bytes()] for n in nodes
    )
    max_neighbours = self._get_max_neighbours_per_packet()
    for batch in toolz.partition_all(max_neighbours, encoded_nodes):
        message = _pack_v5(CMD_TOPIC_NODES.id, (echo, batch), self.privkey)
        self.logger.trace(">>> topic_nodes to %s: %s", node, batch)
        self.send_v5(node, message)
Example #8
Source File: prepare_penobscot.py From seismic-deeplearning with MIT License
def split_inline(data_dir, val_ratio, test_ratio, overwrite=False, exclude_files=None):
    """Splits the inline data into train, val and test.

    Args:
        data_dir (str): path to directory that holds the data
        val_ratio (float): the ratio of the partition that will be used for validation
        test_ratio (float): the ratio of the partition that will be used for testing
        exclude_files (list[str]): filenames to exclude from dataset, such as ones
            that contain artifacts. Example: ['image1.tiff']
    """
    num_partitions = 5
    image_dir = os.path.join(data_dir, "inlines")
    dir_paths = (os.path.join(image_dir, ddir) for ddir in ("train", "val", "test"))
    locations_list = [_create_directory(d, overwrite=overwrite) for d in dir_paths]  # train, val, test

    images_iter = glob.iglob(os.path.join(image_dir, "*.tiff"))
    if exclude_files is not None:
        images_list = list(itertools.filterfalse(lambda x: x in exclude_files, images_iter))
    else:
        images_list = list(images_iter)

    num_elements = math.ceil(len(images_list) / num_partitions)
    for partition in partition_all(num_elements, images_list):  # Partition files into N partitions
        for files_list, dest_dir in zip(_split_train_val_test(partition, val_ratio, test_ratio), locations_list):
            _copy_files(files_list, dest_dir)
Example #9
Source File: sync.py From hivemind with MIT License
def from_checkpoints(self, chunk_size=1000):
    """Initial sync strategy: read from blocks on disk.

    This method scans for files matching ./checkpoints/*.json.lst
    and uses them for hive's initial sync. Each line must contain
    exactly one block in JSON format.
    """
    # pylint: disable=no-self-use
    last_block = Blocks.head_num()

    tuplize = lambda path: [int(path.split('/')[-1].split('.')[0]), path]
    basedir = os.path.dirname(os.path.realpath(__file__ + "/../.."))
    files = glob.glob(basedir + "/checkpoints/*.json.lst")
    tuples = sorted(map(tuplize, files), key=lambda f: f[0])

    last_read = 0
    for (num, path) in tuples:
        if last_block < num:
            log.info("[SYNC] Load %s. Last block: %d", path, last_block)
            with open(path) as f:
                # each line in file represents one block
                # we can skip the blocks we already have
                skip_lines = last_block - last_read
                remaining = drop(skip_lines, f)
                for lines in partition_all(chunk_size, remaining):
                    Blocks.process_multi(map(json.loads, lines), True)
            last_block = num
        last_read = num
Example #10
Source File: date_utils.py From catalyst with Apache License 2.0
def compute_date_range_chunks(sessions, start_date, end_date, chunksize):
    """Compute the start and end dates to run a pipeline for.

    Parameters
    ----------
    sessions : DatetimeIndex
        The available dates.
    start_date : pd.Timestamp
        The first date in the pipeline.
    end_date : pd.Timestamp
        The last date in the pipeline.
    chunksize : int or None
        The size of the chunks to run. Setting this to None returns one chunk.

    Returns
    -------
    ranges : iterable[(np.datetime64, np.datetime64)]
        A sequence of start and end dates to run the pipeline for.
    """
    if start_date not in sessions:
        raise KeyError("Start date %s is not found in calendar." %
                       (start_date.strftime("%Y-%m-%d"),))
    if end_date not in sessions:
        raise KeyError("End date %s is not found in calendar." %
                       (end_date.strftime("%Y-%m-%d"),))
    if end_date < start_date:
        raise ValueError("End date %s cannot precede start date %s." %
                         (end_date.strftime("%Y-%m-%d"),
                          start_date.strftime("%Y-%m-%d")))

    if chunksize is None:
        return [(start_date, end_date)]

    start_ix, end_ix = sessions.slice_locs(start_date, end_date)
    return (
        (r[0], r[-1]) for r in partition_all(
            chunksize, sessions[start_ix:end_ix]
        )
    )
Example #11
Source File: CML.py From CollMetric with GNU General Public License v3.0
def optimize(model, sampler, train, valid):
    """
    Optimize the model. TODO: implement early-stopping
    :param model: model to optimize
    :param sampler: mini-batch sampler
    :param train: train user-item matrix
    :param valid: validation user-item matrix
    :return: None
    """
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    if model.feature_projection is not None:
        # initialize item embedding with feature projection
        sess.run(tf.assign(model.item_embeddings, model.feature_projection))

    # sample some users to calculate recall validation
    valid_users = numpy.random.choice(list(set(valid.nonzero()[0])),
                                      size=1000, replace=False)

    while True:
        # create evaluator on validation set
        validation_recall = RecallEvaluator(model, train, valid)
        # compute recall on validate set
        valid_recalls = []
        # compute recall in chunks to utilize speedup provided by Tensorflow
        for user_chunk in toolz.partition_all(100, valid_users):
            valid_recalls.extend([validation_recall.eval(sess, user_chunk)])
        print("\nRecall on (sampled) validation set: {}".format(numpy.mean(valid_recalls)))
        # TODO: early stopping based on validation recall

        # train model
        losses = []
        # run n mini-batches
        for _ in tqdm(range(EVALUATION_EVERY_N_BATCHES), desc="Optimizing..."):
            user_pos, neg = sampler.next_batch()
            _, loss = sess.run((model.optimize, model.loss),
                               {model.user_positive_items_pairs: user_pos,
                                model.negative_samples: neg})
            losses.append(loss)
        print("\nTraining loss {}".format(numpy.mean(losses)))