Python dask.dataframe.read_parquet() Examples

The following are 11 code examples of dask.dataframe.read_parquet(), drawn from open-source projects. Each example notes its source file, the project it comes from, and that project's license. You may also want to check out the other available functions and classes of the dask.dataframe module.
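As a quick orientation before the project examples, here is a minimal sketch of typical dd.read_parquet() usage. The file path and column names are hypothetical placeholders, not taken from any of the examples below.

import dask.dataframe as dd

# Lazily open a Parquet dataset (a single file or a directory of part files).
# "data/events.parquet" and the column names are hypothetical placeholders.
df = dd.read_parquet(
    "data/events.parquet",
    columns=["user_id", "value"],   # read only the columns that are needed
    filters=[("value", ">", 0)],    # push the predicate down to row groups
)

# Nothing is read from disk until a computation is triggered.
result = df.groupby("user_id")["value"].mean().compute()
print(result)

Most of the examples below pair dd.read_parquet() with repartition(), persist(), or to_parquet() when preparing data for distributed processing.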
Example #1
Source File: drybell_dask.py    From snorkel-tutorials with Apache License 2.0
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path)
    data = data.repartition(npartitions=2)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = DaskLFApplier(lfs)
    L = applier.apply(data)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    data = data.reset_index().set_index("index")
    data_labeled = data.assign(y_prob=dd.from_array(y_prob))
    dd.to_parquet(data_labeled, output_path)
    logging.info(f"Labels saved to {output_path}") 
Example #2
Source File: main.py    From d6tflow with MIT License
def test_formats(cleanup):
    def helper(data, TaskClass, format=None):
        class TestTask(TaskClass):
            def run(self):
                self.save(data)

        TestTask().run()
        if format == 'pd':
            assert TestTask().output().load().equals(data)
        else:
            assert TestTask().output().load() == data

    helper(df, d6tflow.tasks.TaskCachePandas, 'pd')
    helper({'test': 1}, d6tflow.tasks.TaskJson)
    helper({'test': 1}, d6tflow.tasks.TaskPickle)

    from d6tflow.tasks.h5 import TaskH5Pandas
    helper(df, TaskH5Pandas, 'pd')

    try:
        from d6tflow.tasks.dt import TaskDatatable
        import datatable
        dt_frame = datatable.Frame(df)  # avoid shadowing the datatable module
        helper(dt_frame, TaskDatatable)
    except Exception:
        warnings.warn('datatable failed')

    if 0 == 1:  # todo: dask parquet round-trip not enabled yet
        import dask.dataframe as dd
        t1 = Task1()
        t1.run()
        ddf = dd.read_parquet(t1.output().path)
        from d6tflow.tasks.dask import TaskPqDask
        helper(ddf, TaskPqDask, 'pd')
        t1.invalidate(confirm=False)
Example #3
Source File: item.py    From pystore with Apache License 2.0
def __init__(self, item, datastore, collection,
                 snapshot=None, filters=None, columns=None,
                 engine="fastparquet"):
        self.engine = engine
        self.datastore = datastore
        self.collection = collection
        self.snapshot = snapshot
        self.item = item

        self._path = utils.make_path(datastore, collection, item)
        if not self._path.exists():
            raise ValueError(
                "Item `%s` doesn't exist. "
                "Create it using collection.write(`%s`, data, ...)" % (
                    item, item))
        if snapshot:
            snap_path = utils.make_path(
                datastore, collection, "_snapshots", snapshot)

            self._path = utils.make_path(snap_path, item)

            if not utils.path_exists(snap_path):
                raise ValueError("Snapshot `%s` doesn't exist" % snapshot)

            if not utils.path_exists(self._path):
                raise ValueError(
                    "Item `%s` doesn't exist in this snapshot" % item)

        self.metadata = utils.read_metadata(self._path)
        self.data = dd.read_parquet(
            self._path, engine=self.engine, filters=filters, columns=columns) 
Example #4
Source File: collection.py    From pystore with Apache License 2.0
def index(self, item, last=False):
        data = dd.read_parquet(self._item_path(item, as_string=True),
                               columns="index", engine=self.engine)
        if not last:
            return data.index.compute()

        # Extract just the last index value by parsing the string
        # representation of the lazy Dask index.
        return float(str(data.index).split(
                     "\nName")[0].split("\n")[-1].split(" ")[0])
Example #5
Source File: test_copy.py    From bionic with Apache License 2.0
def test_copy_dask_to_dir(tmp_path, expected_dask_df, dask_flow):
    destination = tmp_path / "output"
    destination.mkdir()
    expected_dir_path = destination / "dask_df.pq.dask"

    dask_flow.get("dask_df", mode="FileCopier").copy(destination=destination)

    actual = dd.read_parquet(expected_dir_path)
    assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute()) 
Example #6
Source File: test_copy.py    From bionic with Apache License 2.0
def test_copy_dask_to_gcs_dir(
    tmp_path, tmp_gcs_url_prefix, expected_dask_df, dask_flow
):
    cloud_url = tmp_gcs_url_prefix + "output"

    dask_flow.get("dask_df", mode="FileCopier").copy(destination=cloud_url)

    check_call(f"gsutil -m cp -r {cloud_url} {tmp_path}", shell=True)
    actual = dd.read_parquet(tmp_path / "output")
    assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute()) 
Example #7
Source File: ga_chp_bq_advanced_preprocessor.py    From MorphL-Community-Edition with Apache License 2.0
def process_dataframe(client, hdfs_dir_input, hdfs_dir_output):
    dask_df = client.persist(dd.read_parquet(hdfs_dir_input))
    st = ScalerTransformer(dask_df)
    scaled_features = st.get_transformed_data()
    scaled_features.repartition(npartitions=32).to_parquet(hdfs_dir_output) 
Example #8
Source File: ga_chp_bq_model_generator.py    From MorphL-Community-Edition with Apache License 2.0
def main():
    client = Client()
    dask_df = client.persist(dd.read_parquet(HDFS_DIR_INPUT))
    ModelGenerator(dask_df).generate_and_save_model() 
Example #9
Source File: ga_chp_advanced_preprocessor.py    From MorphL-Community-Edition with Apache License 2.0
def process_dataframe(client, hdfs_dir_input, hdfs_dir_output):
    dask_df = client.persist(dd.read_parquet(hdfs_dir_input))
    st = ScalerTransformer(dask_df)
    scaled_features = st.get_transformed_data()
    scaled_features.repartition(npartitions=32).to_parquet(hdfs_dir_output) 
Example #10
Source File: ga_chp_model_generator.py    From MorphL-Community-Edition with Apache License 2.0
def main():
    client = Client()
    dask_df = client.persist(dd.read_parquet(HDFS_DIR_INPUT))
    ModelGenerator(dask_df).generate_and_save_model() 
Example #11
Source File: collection.py    From pystore with Apache License 2.0
def append(self, item, data, npartitions=None, epochdate=False,
               threaded=False, reload_items=False, **kwargs):

        if not utils.path_exists(self._item_path(item)):
            raise ValueError(
                """Item do not exists. Use `<collection>.write(...)`""")

        # work on copy
        data = data.copy()

        try:
            if epochdate or ("datetime" in str(data.index.dtype) and
                             any(data.index.nanosecond > 0)):
                data = utils.datetime_to_int64(data)
            # Read only the index of the stored item (columns=[]) and
            # drop incoming rows whose index values already exist.
            old_index = dd.read_parquet(self._item_path(item, as_string=True),
                                        columns=[], engine=self.engine
                                        ).index.compute()
            data = data[~data.index.isin(old_index)]
        except Exception:
            return

        if data.empty:
            return

        if data.index.name == "":
            data.index.name = "index"

        # combine old dataframe with new
        current = self.item(item)
        new = dd.from_pandas(data, npartitions=1)
        combined = dd.concat([current.data, new]).drop_duplicates(keep="last")

        if npartitions is None:
            memusage = combined.memory_usage(deep=True).sum()
            if isinstance(combined, dd.DataFrame):
                memusage = memusage.compute()
            npartitions = int(1 + memusage // DEFAULT_PARTITION_SIZE)

        # write data
        write = self.write_threaded if threaded else self.write
        write(item, combined, npartitions=npartitions, chunksize=None,
              metadata=current.metadata, overwrite=True,
              epochdate=epochdate, reload_items=reload_items, **kwargs)