Python toolz.partition_all() Examples
The following are 11 code examples of toolz.partition_all().
Each example notes the original project and source file it was taken from.
You may also want to check out the other functions and classes available in the toolz module.
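As a quick orientation before the project examples, here is a minimal sketch of what partition_all itself does: it breaks an iterable into tuples of at most n items, with the final tuple shorter when the input does not divide evenly. (This is a small illustrative snippet, not taken from any of the projects below.)

from toolz import partition_all

# Break an iterable into tuples of at most 3 items; the last tuple
# may be shorter when the input length is not a multiple of 3.
chunks = list(partition_all(3, range(8)))
print(chunks)  # [(0, 1, 2), (3, 4, 5), (6, 7)]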
Example #1
Source File: umis.py From umis with MIT License
def sb_filter(fastq, bc, cores, nedit):
    ''' Filters reads with non-matching sample barcodes
    Expects formatted fastq files.
    '''
    barcodes = set(sb.strip() for sb in bc)
    if nedit == 0:
        filter_sb = partial(exact_sample_filter2, barcodes=barcodes)
    else:
        barcodehash = MutationHash(barcodes, nedit)
        filter_sb = partial(correcting_sample_filter2, barcodehash=barcodehash)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_sb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #2
Source File: accounts.py From hivemind with MIT License
def _cache_accounts(cls, accounts, steem, trx=True):
    """Fetch all `accounts` and write to db."""
    timer = Timer(len(accounts), 'account', ['rps', 'wps'])
    for name_batch in partition_all(1000, accounts):
        cached_at = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        timer.batch_start()
        batch = steem.get_accounts(name_batch)
        timer.batch_lap()
        sqls = [cls._sql(acct, cached_at) for acct in batch]
        DB.batch_queries(sqls, trx)
        timer.batch_finish(len(batch))

        if trx or len(accounts) > 1000:
            log.info(timer.batch_status())
Example #3
Source File: scraper.py From steemdata-mongo with MIT License
def scrape_blockchain(mongo):
    s = Steem()
    # see how far behind we are
    missing = list(range(last_block_num(mongo), s.last_irreversible_block_num))

    # if we are far behind blockchain head
    # split work in chunks of 100
    if len(missing) > 100:
        for batch in partition_all(100, missing):
            results = s.get_blocks(batch)
            insert_blocks(mongo, results)

    # otherwise continue as normal
    blockchain = Blockchain(mode="irreversible")
    hist = blockchain.stream_from(start_block=last_block_num(mongo), full_blocks=True)
    insert_blocks(mongo, hist)
Example #4
Source File: umis.py From umis with MIT License
def cb_filter(fastq, bc1, bc2, bc3, cores, nedit):
    ''' Filters reads with non-matching barcodes
    Expects formatted fastq files.
    '''
    with open_gzipsafe(bc1) as bc1_fh:
        bc1 = set(cb.strip() for cb in bc1_fh)
    if bc2:
        with open_gzipsafe(bc2) as bc2_fh:
            bc2 = set(cb.strip() for cb in bc2_fh)
    if bc3:
        with open_gzipsafe(bc3) as bc3_fh:
            bc3 = set(cb.strip() for cb in bc3_fh)

    annotations = detect_fastq_annotations(fastq)
    re_string = construct_transformed_regex(annotations)

    if nedit == 0:
        filter_cb = partial(exact_barcode_filter, bc1=bc1, bc2=bc2, bc3=bc3,
                            re_string=re_string)
    else:
        bc1hash = MutationHash(bc1, nedit)
        bc2hash = None
        bc3hash = None
        if bc2:
            bc2hash = MutationHash(bc2, nedit)
        if bc3:
            bc3hash = MutationHash(bc3, nedit)
        filter_cb = partial(correcting_barcode_filter, bc1hash=bc1hash,
                            bc2hash=bc2hash, bc3hash=bc3hash, re_string=re_string)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_cb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #5
Source File: umis.py From umis with MIT License
def mb_filter(fastq, cores):
    ''' Filters umis with non-ACGT bases
    Expects formatted fastq files.
    '''
    filter_mb = partial(umi_filter)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_mb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #6
Source File: umis.py From umis with MIT License
def add_uid(fastq, cores):
    ''' Adds UID:[samplebc cellbc umi] to readname for umi-tools deduplication
    Expects formatted fastq files with correct sample and cell barcodes.
    '''
    uids = partial(append_uids)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(uids, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #7
Source File: discovery.py From pyquarkchain with MIT License
def send_topic_nodes(
        self, node: kademlia.Node, echo: Hash32, nodes: Tuple[kademlia.Node, ...]
) -> None:
    encoded_nodes = tuple(
        n.address.to_endpoint() + [n.pubkey.to_bytes()] for n in nodes
    )
    max_neighbours = self._get_max_neighbours_per_packet()
    for batch in toolz.partition_all(max_neighbours, encoded_nodes):
        message = _pack_v5(CMD_TOPIC_NODES.id, (echo, batch), self.privkey)
        self.logger.trace(">>> topic_nodes to %s: %s", node, batch)
        self.send_v5(node, message)
Example #8
Source File: prepare_penobscot.py From seismic-deeplearning with MIT License
def split_inline(data_dir, val_ratio, test_ratio, overwrite=False, exclude_files=None):
    """Splits the inline data into train, val and test.

    Args:
        data_dir (str): path to directory that holds the data
        val_ratio (float): the ratio of the partition that will be used for validation
        test_ratio (float): the ratio of the partition that will be used for testing
        exclude_files (list[str]): filenames to exclude from dataset, such as ones
            that contain artifacts. Example: ['image1.tiff']
    """
    num_partitions = 5
    image_dir = os.path.join(data_dir, "inlines")
    dir_paths = (os.path.join(image_dir, ddir) for ddir in ("train", "val", "test"))
    locations_list = [_create_directory(d, overwrite=overwrite) for d in dir_paths]  # train, val, test

    images_iter = glob.iglob(os.path.join(image_dir, "*.tiff"))
    if exclude_files is not None:
        images_list = list(itertools.filterfalse(lambda x: x in exclude_files, images_iter))
    else:
        images_list = list(images_iter)

    num_elements = math.ceil(len(images_list) / num_partitions)
    for partition in partition_all(num_elements, images_list):  # Partition files into N partitions
        for files_list, dest_dir in zip(_split_train_val_test(partition, val_ratio, test_ratio), locations_list):
            _copy_files(files_list, dest_dir)
Example #9
Source File: sync.py From hivemind with MIT License
def from_checkpoints(self, chunk_size=1000):
    """Initial sync strategy: read from blocks on disk.

    This method scans for files matching ./checkpoints/*.json.lst
    and uses them for hive's initial sync. Each line must contain
    exactly one block in JSON format.
    """
    # pylint: disable=no-self-use
    last_block = Blocks.head_num()

    tuplize = lambda path: [int(path.split('/')[-1].split('.')[0]), path]
    basedir = os.path.dirname(os.path.realpath(__file__ + "/../.."))
    files = glob.glob(basedir + "/checkpoints/*.json.lst")
    tuples = sorted(map(tuplize, files), key=lambda f: f[0])

    last_read = 0
    for (num, path) in tuples:
        if last_block < num:
            log.info("[SYNC] Load %s. Last block: %d", path, last_block)
            with open(path) as f:
                # each line in file represents one block
                # we can skip the blocks we already have
                skip_lines = last_block - last_read
                remaining = drop(skip_lines, f)
                for lines in partition_all(chunk_size, remaining):
                    Blocks.process_multi(map(json.loads, lines), True)
            last_block = num
        last_read = num
Example #10
Source File: date_utils.py From catalyst with Apache License 2.0
def compute_date_range_chunks(sessions, start_date, end_date, chunksize):
    """Compute the start and end dates to run a pipeline for.

    Parameters
    ----------
    sessions : DatetimeIndex
        The available dates.
    start_date : pd.Timestamp
        The first date in the pipeline.
    end_date : pd.Timestamp
        The last date in the pipeline.
    chunksize : int or None
        The size of the chunks to run. Setting this to None returns one chunk.

    Returns
    -------
    ranges : iterable[(np.datetime64, np.datetime64)]
        A sequence of start and end dates to run the pipeline for.
    """
    if start_date not in sessions:
        raise KeyError("Start date %s is not found in calendar." %
                       (start_date.strftime("%Y-%m-%d"),))
    if end_date not in sessions:
        raise KeyError("End date %s is not found in calendar." %
                       (end_date.strftime("%Y-%m-%d"),))
    if end_date < start_date:
        raise ValueError("End date %s cannot precede start date %s." %
                         (end_date.strftime("%Y-%m-%d"),
                          start_date.strftime("%Y-%m-%d")))

    if chunksize is None:
        return [(start_date, end_date)]

    start_ix, end_ix = sessions.slice_locs(start_date, end_date)
    return (
        (r[0], r[-1]) for r in partition_all(
            chunksize, sessions[start_ix:end_ix]
        )
    )
Example #11
Source File: CML.py From CollMetric with GNU General Public License v3.0
def optimize(model, sampler, train, valid):
    """
    Optimize the model. TODO: implement early-stopping
    :param model: model to optimize
    :param sampler: mini-batch sampler
    :param train: train user-item matrix
    :param valid: validation user-item matrix
    :return: None
    """
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    if model.feature_projection is not None:
        # initialize item embedding with feature projection
        sess.run(tf.assign(model.item_embeddings, model.feature_projection))

    # sample some users to calculate recall validation
    valid_users = numpy.random.choice(list(set(valid.nonzero()[0])),
                                      size=1000, replace=False)

    while True:
        # create evaluator on validation set
        validation_recall = RecallEvaluator(model, train, valid)
        # compute recall on validate set
        valid_recalls = []
        # compute recall in chunks to utilize speedup provided by Tensorflow
        for user_chunk in toolz.partition_all(100, valid_users):
            valid_recalls.extend([validation_recall.eval(sess, user_chunk)])
        print("\nRecall on (sampled) validation set: {}".format(numpy.mean(valid_recalls)))
        # TODO: early stopping based on validation recall

        # train model
        losses = []
        # run n mini-batches
        for _ in tqdm(range(EVALUATION_EVERY_N_BATCHES), desc="Optimizing..."):
            user_pos, neg = sampler.next_batch()
            _, loss = sess.run((model.optimize, model.loss),
                               {model.user_positive_items_pairs: user_pos,
                                model.negative_samples: neg})
            losses.append(loss)
        print("\nTraining loss {}".format(numpy.mean(losses)))