Python more_itertools.chunked() Examples
The following are 23 code examples of more_itertools.chunked(), drawn from open-source projects; the original project and source file for each example are noted above it. You may also want to check out the other available functions and classes of the more_itertools module.
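For reference, here is a minimal sketch of the basic behaviour before diving into the project examples: chunked(iterable, n) yields lists of at most n items, with the final list shorter when the iterable does not divide evenly.

from more_itertools import chunked

# Break an iterable into lists of length 3; the last chunk holds the leftovers.
print(list(chunked(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]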
Example #1
Source File: filter.py From pygreynoise with MIT License | 6 votes |
def filter(self, text, noise_only):
    """Filter lines that contain IP addresses from a given text.

    :param text: Text input
    :type text: file-like | str
    :param noise_only:
        If set, return only lines that contain IP addresses classified as noise,
        otherwise, return lines that contain IP addresses not classified as noise.
    :type noise_only: bool
    :return: Iterator that yields lines in chunks
    :rtype: iterable
    """
    if isinstance(text, str):
        text = text.splitlines(True)
    chunks = more_itertools.chunked(text, self.FILTER_TEXT_CHUNK_SIZE)
    for chunk in chunks:
        yield self._filter_chunk(chunk, noise_only)
Example #2
Source File: utils.py From vidreid_cosegmentation with Apache License 2.0 | 6 votes |
def get_spatial_features(model, imgs, test_num_tracks):
    """to handle higher seq length videos due to OOM error
    specifically used during test

    Arguments:
        model -- model under test
        imgs -- imgs to get features for

    Returns:
        features
    """
    # handle chunked data
    all_features, all_spatial_features = [], []
    for test_imgs in mit.chunked(imgs, test_num_tracks):
        current_test_imgs = torch.stack(test_imgs)
        num_current_test_imgs = current_test_imgs.shape[0]
        features, spatial_feats = model(current_test_imgs)
        features = features.view(num_current_test_imgs, -1)
        all_spatial_features.append(spatial_feats)
        all_features.append(features)

    return torch.cat(all_features), torch.cat(all_spatial_features)
Example #3
Source File: utils.py From vidreid_cosegmentation with Apache License 2.0 | 6 votes |
def get_features(model, imgs, test_num_tracks):
    """to handle higher seq length videos due to OOM error
    specifically used during test

    Arguments:
        model -- model under test
        imgs -- imgs to get features for

    Returns:
        features
    """
    # handle chunked data
    all_features = []
    for test_imgs in mit.chunked(imgs, test_num_tracks):
        current_test_imgs = torch.stack(test_imgs)
        num_current_test_imgs = current_test_imgs.shape[0]
        # print(current_test_imgs.shape)
        features = model(current_test_imgs)
        features = features.view(num_current_test_imgs, -1)
        all_features.append(features)

    return torch.cat(all_features)
Example #4
Source File: displacements.py From langchangetrack with BSD 3-Clause "New" or "Revised" License | 6 votes |
def calculate_words_displacement(self, column_names, n_jobs=1):
    """ Calculate word displacements for each word in the Pandas data frame. """
    words = self.get_word_list()

    # Create chunks of the words to be processed.
    # chunked() requires an integer chunk size, so round up and cast.
    chunk_sz = int(np.ceil(len(words) / float(n_jobs)))
    chunks = list(more_itertools.chunked(words, chunk_sz))

    # Calculate the displacements
    chunksL = Parallel(n_jobs=n_jobs, verbose=20)(delayed(process_chunk)(chunk, process_word_source, self) for chunk in chunks)
    chunksH = Parallel(n_jobs=n_jobs, verbose=20)(delayed(process_chunk)(chunk, process_word_dest, self) for chunk in chunks)
    L = more_itertools.flatten(chunksL)
    H = more_itertools.flatten(chunksH)
    flattendL = [x for sublist in L for x in sublist]
    flattendH = [x for sublist in H for x in sublist]

    # Store the results in a nice pandas data frame
    dfo, dfn = self.create_data_frames(flattendL, flattendH, column_names)
    return flattendL, flattendH, dfo, dfn
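The example above splits a word list into roughly n_jobs equal chunks for parallel processing with joblib. Below is a standalone sketch of the same chunk-size calculation; the word list and n_jobs value are made up for illustration.

import numpy as np
from more_itertools import chunked

words = ["word%d" % i for i in range(10)]
n_jobs = 3
# Round up so that chunked() produces at most n_jobs chunks.
chunk_sz = int(np.ceil(len(words) / float(n_jobs)))
chunks = list(chunked(words, chunk_sz))
print(len(chunks), [len(c) for c in chunks])  # 3 [4, 4, 2]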
Example #5
Source File: dataset.py From machine-learning-for-programming-samples with MIT License | 6 votes |
def get_minibatch_iterator(
    token_seqs: np.ndarray,
    batch_size: int,
    is_training: bool,
    drop_remainder: bool = True,
) -> Iterator[np.ndarray]:
    indices = np.arange(token_seqs.shape[0])
    if is_training:
        np.random.shuffle(indices)

    for minibatch_indices in chunked(indices, batch_size):
        if len(minibatch_indices) < batch_size and drop_remainder:
            break  # Drop last, smaller batch
        minibatch_seqs = token_seqs[minibatch_indices]
        yield minibatch_seqs
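The drop_remainder behaviour above, reduced to a minimal standalone sketch with invented data: keep only the chunks that form a full batch.

from more_itertools import chunked

samples = list(range(10))
batch_size = 4
# Keep only full batches, mirroring drop_remainder=True above.
full_batches = [b for b in chunked(samples, batch_size) if len(b) == batch_size]
print(full_batches)  # [[0, 1, 2, 3], [4, 5, 6, 7]]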
Example #6
Source File: test_more.py From python-netsurv with MIT License | 5 votes |
def test_even(self):
    """Test when ``n`` divides evenly into the length of the iterable."""
    self.assertEqual(
        list(mi.chunked('ABCDEF', 3)), [['A', 'B', 'C'], ['D', 'E', 'F']]
    )
Example #7
Source File: graph_sequence.py From article-0 with MIT License | 5 votes |
def __init__(self, args, batch_size=32, test=False):
    self.batch_size = batch_size

    self.query = """
        MATCH p=
            (person:PERSON)
                -[:WROTE]->
            (review:REVIEW {dataset_name:{dataset_name}, test:{test}})
                -[:OF]->
            (product:PRODUCT)
        RETURN person.style_preference + product.style as x, review.score as y
    """

    self.query_params = {
        "dataset_name": "article_0",
        "test": test
    }

    with open('./settings.json') as f:
        self.settings = json.load(f)[args.database]

    driver = GraphDatabase.driver(
        self.settings["neo4j_url"],
        auth=(self.settings["neo4j_user"], self.settings["neo4j_password"]))

    with driver.session() as session:
        data = session.run(self.query, **self.query_params).data()
        data = [(np.array(i["x"]), i["y"]) for i in data]

        # Split the data up into "batches"
        data = more_itertools.chunked(data, self.batch_size)

        # Format our batches in the way Keras expects them:
        # An array of tuples (x_batch, y_batch)
        # An x_batch is a numpy array of shape (batch_size, 12),
        # containing the concatenated style and style_preference vectors.
        # A y_batch is a numpy array of shape (batch_size, 1) containing the review scores.
        self.data = [
            (np.array([j[0] for j in i]), np.array([j[1] for j in i]))
            for i in data
        ]
Example #8
Source File: biodati_client.py From pybel with MIT License | 5 votes |
def post_graph_chunked(
    self,
    graph: BELGraph,
    chunksize: int,
    *,
    use_tqdm: bool = True,
    collections: Optional[Iterable[str]] = None,
    overwrite: bool = False,
    validate: bool = True,
    email: Union[bool, str] = False
) -> requests.Response:
    """Post the graph to BioDati in chunks, when the graph is too big for a normal upload.

    :param graph: A BEL graph
    :param chunksize: The size of the chunks of nanopubs to upload
    :param use_tqdm: Should tqdm be used when iterating?
    :param collections: Tags to add to the nanopubs for lookup on BioDati
    :param overwrite: Set the BioDati upload "overwrite" setting
    :param validate: Set the BioDati upload "validate" setting
    :param email: Who should get emailed with results about the upload?
     If true, emails to user used for login. If string, emails to that user.
     If false, no email.
    :return: Last response from upload
    """
    metadata_extras = dict()
    if collections is not None:
        metadata_extras.update(collections=list(collections))
    iterable = _iter_graphdati(graph, use_tqdm=use_tqdm, metadata_extras=metadata_extras)
    res = None
    for chunk in chunked(iterable, chunksize):
        res = self.post_graph_json(
            chunk,
            overwrite=overwrite,
            validate=validate,
            email=email,
        )
    return res
Example #9
Source File: test_more.py From pipenv with MIT License | 5 votes |
def test_even(self):
    """Test when ``n`` divides evenly into the length of the iterable."""
    self.assertEqual(
        list(mi.chunked('ABCDEF', 3)), [['A', 'B', 'C'], ['D', 'E', 'F']]
    )
Example #10
Source File: test_more.py From pipenv with MIT License | 5 votes |
def test_odd(self):
    """Test when ``n`` does not divide evenly into the length of the
    iterable.
    """
    self.assertEqual(
        list(mi.chunked('ABCDE', 3)), [['A', 'B', 'C'], ['D', 'E']]
    )
Example #11
Source File: abuse_ch.py From n6 with GNU Affero General Public License v3.0 | 5 votes |
def publish_iteratively(self):
    for chunk in more_itertools.chunked(self._selected_data, 20000):
        rk, body, prop_kwargs = self.get_output_components(selected_data="\n".join(chunk))
        self.publish_output(rk, body, prop_kwargs)
        yield
    yield self.FLUSH_OUT
    self.save_state(self._state)
Example #12
Source File: varmisuse_data_splitter.py From tf-gnn-samples with MIT License | 5 votes |
def _write_data(out_dir: RichPath, window_idx: int, chunk_size: int, data_window: List[Any]):
    np.random.shuffle(data_window)
    for chunk_idx, data_chunk in enumerate(chunked(data_window, chunk_size)):
        out_file = out_dir.join('chunk_%i-%i.jsonl.gz' % (window_idx, chunk_idx))
        out_file.save_as_compressed_file(data_chunk)
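A simplified sketch of the same write-each-chunk-to-its-own-file idea, using plain JSON-lines files instead of RichPath; the file names and records here are made up.

import json
from more_itertools import chunked

records = [{"id": i} for i in range(10)]
for chunk_idx, data_chunk in enumerate(chunked(records, 4)):
    # Each chunk goes to its own file, e.g. chunk_0.jsonl, chunk_1.jsonl, ...
    with open('chunk_%i.jsonl' % chunk_idx, 'w') as f:
        for record in data_chunk:
            f.write(json.dumps(record) + '\n')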
Example #13
Source File: score_processor.py From revscoring with MIT License | 5 votes |
def score(self, rev_ids, caches=None, cache=None):
    if isinstance(rev_ids, int):
        rev_ids = [rev_ids]

    batches = batch_rev_caches(chunked(rev_ids, self.batch_size), caches, cache)

    for batch_scores in self.scores_ex.map(self._score_batch, batches):
        for score in batch_scores:
            yield score
Example #14
Source File: test_more.py From python-netsurv with MIT License | 5 votes |
def test_odd(self):
    """Test when ``n`` does not divide evenly into the length of the
    iterable.
    """
    self.assertEqual(
        list(mi.chunked('ABCDE', 3)), [['A', 'B', 'C'], ['D', 'E']]
    )
Example #15
Source File: test_more.py From python-netsurv with MIT License | 5 votes |
def test_even(self):
    """Test when ``n`` divides evenly into the length of the iterable."""
    self.assertEqual(
        list(mi.chunked('ABCDEF', 3)), [['A', 'B', 'C'], ['D', 'E', 'F']]
    )
Example #16
Source File: test_more.py From python-netsurv with MIT License | 5 votes |
def test_odd(self):
    """Test when ``n`` does not divide evenly into the length of the
    iterable.
    """
    self.assertEqual(
        list(mi.chunked('ABCDE', 3)), [['A', 'B', 'C'], ['D', 'E']]
    )
Example #17
Source File: pydataapi.py From py-data-api with MIT License | 5 votes |
def batch_execute(
    self,
    query: Union[Query, Insert, Update, Delete, Select, str],
    parameter_sets: Optional[List[Dict[str, Any]]],
    transaction_id: Optional[str] = None,
    database: Optional[str] = None,
) -> UpdateResults:
    if self.transaction_id:
        start_transaction: bool = False
    else:
        self.begin(database=database)
        start_transaction = True
    try:
        results_sets = list(
            flatten(
                self.client.batch_execute_statement(
                    **Options(
                        resourceArn=self.resource_arn,
                        secretArn=self.secret_arn,
                        database=database or self.database,
                        transactionId=transaction_id or self.transaction_id,
                        parameterSets=chunked_parameter_sets,  # type: ignore
                        sql=query,
                    ).build()
                )["updateResults"]
                for chunked_parameter_sets in chunked(
                    parameter_sets or [], MAX_RECORDS
                )
            )
        )
    except:
        if start_transaction:
            self.rollback()
        raise
    if start_transaction:
        self.commit()

    return UpdateResults(results_sets)
Example #18
Source File: dump.py From promnesia with MIT License | 5 votes |
def dump_histories(all_histories: List[Tuple[str, List[DbVisit]]]) -> None:
    logger = get_logger()
    output_dir = Path(config.get().output_dir)
    db_path = output_dir / 'promnesia.sqlite'

    def iter_visits():
        for e, h in all_histories:
            # TODO sort them somehow for determinism?
            # TODO what do we do with errors?
            # TODO maybe conform them to schema and dump too?
            # TODO or, dump to a separate table?
            yield from h

    tpath = Path(get_tmpdir().name) / 'promnesia.tmp.sqlite'
    engine = create_engine(f'sqlite:///{tpath}')
    binder = NTBinder.make(DbVisit)
    meta = MetaData(engine)
    table = Table('visits', meta, *binder.columns)
    meta.create_all()

    with engine.begin() as conn:
        for chunk in chunked(iter_visits(), n=_CHUNK_BY):
            bound = [binder.to_row(x) for x in chunk]
            # pylint: disable=no-value-for-parameter
            conn.execute(table.insert().values(bound))

    shutil.move(str(tpath), str(db_path))

    logger.info('saved database to %s', db_path)
    # TODO log error count
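The same chunked-insert pattern can be sketched with the standard-library sqlite3 module instead of SQLAlchemy; the table and data below are invented for illustration.

import sqlite3
from more_itertools import chunked

visits = [("url%d" % i, i) for i in range(1000)]

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE visits (url TEXT, ts INTEGER)")
# Insert in fixed-size chunks rather than one row at a time or all at once.
for chunk in chunked(visits, 256):
    conn.executemany("INSERT INTO visits VALUES (?, ?)", chunk)
conn.commit()
print(conn.execute("SELECT COUNT(*) FROM visits").fetchone())  # (1000,)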
Example #19
Source File: test_more.py From Tautulli with GNU General Public License v3.0 | 5 votes |
def test_even(self):
    """Test when ``n`` divides evenly into the length of the iterable."""
    self.assertEqual(
        list(mi.chunked('ABCDEF', 3)), [['A', 'B', 'C'], ['D', 'E', 'F']]
    )
Example #20
Source File: test_more.py From Tautulli with GNU General Public License v3.0 | 5 votes |
def test_odd(self):
    """Test when ``n`` does not divide evenly into the length of the
    iterable.
    """
    self.assertEqual(
        list(mi.chunked('ABCDE', 3)), [['A', 'B', 'C'], ['D', 'E']]
    )
Example #21
Source File: analyzer.py From pygreynoise with MIT License | 4 votes |
def analyze(self, text):
    """Aggregate stats related to IP addresses from a given text.

    :param text: Text input
    :type text: file-like | str
    :return: Aggregated stats for all the IP addresses found.
    :rtype: dict
    """
    if isinstance(text, str):
        text = text.splitlines(True)
    chunks = more_itertools.chunked(text, self.ANALYZE_TEXT_CHUNK_SIZE)

    text_stats = {
        "query": [],
        "count": 0,
        "stats": {},
    }
    text_ip_addresses = set()
    chunks_stats = [
        self._analyze_chunk(chunk, text_ip_addresses) for chunk in chunks
    ]
    functools.reduce(self._aggregate_stats, chunks_stats, text_stats)

    # This maps section dictionaries to list of dictionaries
    # (undoing mapping done previously to keep track of count values)
    for section_key, section_value in text_stats["stats"].items():
        section_element_key = self.SECTION_KEY_TO_ELEMENT_KEY[section_key]
        text_stats["stats"][section_key] = sorted(
            [
                {section_element_key: element_key, "count": element_count}
                for element_key, element_count in section_value.items()
            ],
            key=lambda element: (-element["count"], element[section_element_key]),
        )

    if text_ip_addresses:
        noise_ip_addresses = {
            result["ip"]
            for result in self.api.quick(text_ip_addresses)
            if result["noise"]
        }
    else:
        noise_ip_addresses = set()

    ip_count = len(text_ip_addresses)
    noise_ip_count = len(noise_ip_addresses)
    not_noise_ip_count = ip_count - noise_ip_count
    if ip_count > 0:
        noise_ip_ratio = float(noise_ip_count) / ip_count
    else:
        noise_ip_ratio = 0

    text_stats["summary"] = {
        "ip_count": ip_count,
        "noise_ip_count": noise_ip_count,
        "not_noise_ip_count": not_noise_ip_count,
        "noise_ip_ratio": noise_ip_ratio,
    }
    return text_stats
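The chunk-then-aggregate shape used above, reduced to a standalone sketch: process each chunk independently, then merge the per-chunk results. The input lines and the Counter-based aggregation here are illustrative, not part of pygreynoise.

from collections import Counter
from functools import reduce
from more_itertools import chunked

lines = ["1.2.3.4 ok", "5.6.7.8 ok", "1.2.3.4 again"]
# One Counter per chunk of lines, then merged into a single total.
chunk_counts = (Counter(word for line in chunk for word in line.split())
                for chunk in chunked(lines, 2))
totals = reduce(lambda acc, c: acc + c, chunk_counts, Counter())
print(totals["1.2.3.4"])  # 2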
Example #22
Source File: dump_timeseries.py From langchangetrack with BSD 3-Clause "New" or "Revised" License | 4 votes |
def main(args):
    # get the arguments
    method = args.method
    win_size = args.win_size
    step = args.step
    metric_name = args.metric_name
    n_jobs = args.workers

    # Load the data.
    L, H, olddf, newdf = pickle.load(open(args.filename, 'rb'))
    words = pd.Series(olddf.word.values.ravel()).unique()

    oldrows = []
    newrows = []

    sourcexrange = np.arange(args.mint, args.maxt, step)
    destxrange = np.arange(args.mint, args.maxt, step)
    if method == 'win':
        sourcexrange = sourcexrange[win_size:]
        destxrange = destxrange[:-win_size]

    if args.interpolate:
        sourcexinter = np.arange(sourcexrange[0], sourcexrange[-1] + 1, 1)
        destxinter = np.arange(destxrange[0], destxrange[-1] + 1, 1)
    else:
        sourcexinter = sourcexrange
        destxinter = destxrange

    # Construct the series
    assert(len(sourcexinter) == len(destxinter))
    # chunked() requires an integer chunk size, so round up and cast.
    chunk_sz = int(np.ceil(len(words) / float(n_jobs)))
    words_chunks = more_itertools.chunked(words, chunk_sz)
    timeseries_chunks = Parallel(n_jobs=n_jobs, verbose=20)(
        delayed(process_chunk)(chunk, create_word_time_series, olddf, newdf,
                               sourcexinter, destxinter,
                               metric_name=metric_name,
                               interpolate=args.interpolate)
        for chunk in words_chunks)

    timeseries = list(more_itertools.flatten(timeseries_chunks))

    # Dump the data frame
    for orow, newrow in timeseries:
        if orow and newrow:
            oldrows.append(orow)
            newrows.append(newrow)

    oldtimeseries = pd.DataFrame()
    newtimeseries = pd.DataFrame()
    header = ['word']
    header.extend(sourcexinter)
    newheader = ['word']
    newheader.extend(destxinter)
    oldtimeseries = oldtimeseries.from_records(oldrows, columns=header)
    oldtimeseries = oldtimeseries.fillna(method='backfill', axis=1)
    newtimeseries = newtimeseries.from_records(newrows, columns=newheader)
    newtimeseries = newtimeseries.fillna(method='backfill', axis=1)
    oldtimeseries.to_csv(args.sourcetimef, encoding='utf-8')
    newtimeseries.to_csv(args.endtimef, encoding='utf-8')
Example #23
Source File: image_viewer_widget.py From CvStudio with MIT License | 4 votes |
def load_image(self):
    @work_exception
    def do_work():
        return dask.compute(*[
            self.load_image_label(),
            self.load_image_annotations()
        ]), None

    @gui_exception
    def done_work(args):
        result, error = args
        if result:
            label, annotations = result
            if label:
                self._class_label.setVisible(True)
                self._class_label.setText(label)
            else:
                self._class_label.setVisible(False)
                self._class_label.setText("")
            if annotations:
                img_bbox: QRectF = self.image_viewer.pixmap.sceneBoundingRect()
                offset = QPointF(img_bbox.width() / 2, img_bbox.height() / 2)
                for entry in annotations:
                    try:
                        vo: AnnotaVO = entry
                        points = map(float, vo.points.split(","))
                        points = list(more_itertools.chunked(points, 2))
                        if vo.kind == "box" or vo.kind == "ellipse":
                            x = points[0][0] - offset.x()
                            y = points[0][1] - offset.y()
                            w = math.fabs(points[0][0] - points[1][0])
                            h = math.fabs(points[0][1] - points[1][1])
                            roi: QRectF = QRectF(x, y, w, h)
                            if vo.kind == "box":
                                item = EditableBox(roi)
                            else:
                                item = EditableEllipse()
                            item.tag = self.tag.dataset
                            item.setRect(roi)
                            item.label = vo.label
                            self.image_viewer.scene().addItem(item)
                        elif vo.kind == "polygon":
                            item = EditablePolygon()
                            item.label = vo.label
                            item.tag = self.tag.dataset
                            self.image_viewer.scene().addItem(item)
                            for p in points:
                                item.addPoint(QPoint(p[0] - offset.x(), p[1] - offset.y()))
                    except Exception as ex:
                        GUIUtilities.show_error_message(
                            "Error loading the annotations: {}".format(ex), "Error")

    self.image_viewer.remove_annotations()
    worker = Worker(do_work)
    worker.signals.result.connect(done_work)
    self._thread_pool.start(worker)
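The chunked(points, 2) call above is a handy way to turn a flat list of coordinates into (x, y) pairs; a minimal sketch with made-up coordinates:

from more_itertools import chunked

flat = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0]
# Pair up consecutive values into coordinate pairs.
points = list(chunked(flat, 2))
print(points)  # [[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]]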