Python dask.dataframe.read_parquet() Examples
The following are 11 code examples of dask.dataframe.read_parquet(), drawn from open-source projects. The attribution line above each example names the original project, source file, and license. You may also want to check out the other available functions and classes of the dask.dataframe module.
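Before the project examples, here is a minimal, self-contained sketch of the basic call pattern. The file path and column names are hypothetical placeholders, not taken from any of the projects below.

import dask.dataframe as dd

# Lazily read a Parquet dataset (a single file or a directory of part files).
# The path is a hypothetical placeholder.
ddf = dd.read_parquet("data/events.parquet")

# Optionally restrict which columns are loaded (column names are placeholders).
ddf = dd.read_parquet("data/events.parquet", columns=["user_id", "value"])

# Nothing is read from disk until a computation is triggered.
print(ddf.head())                      # reads only what is needed for a preview
total = ddf["value"].sum().compute()   # materializes the full result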
Example #1
Source File: drybell_dask.py From snorkel-tutorials with Apache License 2.0
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path)
    data = data.repartition(npartitions=2)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = DaskLFApplier(lfs)
    L = applier.apply(data)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    data = data.reset_index().set_index("index")
    data_labeled = data.assign(y_prob=dd.from_array(y_prob))
    dd.to_parquet(data_labeled, output_path)
    logging.info(f"Labels saved to {output_path}")
Example #2
Source File: main.py From d6tflow with MIT License
def test_formats(cleanup):
    def helper(data, TaskClass, format=None):
        class TestTask(TaskClass):
            def run(self):
                self.save(data)

        TestTask().run()
        if format == 'pd':
            assert TestTask().output().load().equals(data)
        else:
            assert TestTask().output().load() == data

    helper(df, d6tflow.tasks.TaskCachePandas, 'pd')
    helper({'test': 1}, d6tflow.tasks.TaskJson)
    helper({'test': 1}, d6tflow.tasks.TaskPickle)

    from d6tflow.tasks.h5 import TaskH5Pandas
    helper(df, TaskH5Pandas, 'pd')

    try:
        from d6tflow.tasks.dt import TaskDatatable
        import datatable as dt
        dt = dt.Frame(df)
        helper(dt, TaskH5Pandas)
    except:
        warnings.warn('datatable failed')

    if 0 == 1:  # todo:
        import dask.dataframe as dd
        t1 = Task1()
        t1.run()
        ddf = dd.read_parquet(t1.output().path)
        from d6tflow.tasks.dask import TaskPqDask
        helper(ddf, TaskPqDask, 'pd')
        t1.invalidate(confirm=False)
Example #3
Source File: item.py From pystore with Apache License 2.0
def __init__(self, item, datastore, collection,
             snapshot=None, filters=None, columns=None,
             engine="fastparquet"):
    self.engine = engine
    self.datastore = datastore
    self.collection = collection
    self.snapshot = snapshot
    self.item = item

    self._path = utils.make_path(datastore, collection, item)
    if not self._path.exists():
        raise ValueError(
            "Item `%s` doesn't exist. "
            "Create it using collection.write(`%s`, data, ...)" % (
                item, item))

    if snapshot:
        snap_path = utils.make_path(
            datastore, collection, "_snapshots", snapshot)

        self._path = utils.make_path(snap_path, item)

        if not utils.path_exists(snap_path):
            raise ValueError("Snapshot `%s` doesn't exist" % snapshot)

        if not utils.path_exists(self._path):
            raise ValueError(
                "Item `%s` doesn't exist in this snapshot" % item)

    self.metadata = utils.read_metadata(self._path)
    self.data = dd.read_parquet(
        self._path, engine=self.engine, filters=filters, columns=columns)
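Example #3 passes the filters and columns keyword arguments straight through to dd.read_parquet. As a rough, hypothetical sketch of what those arguments do (the path and column names are placeholders, not pystore's actual layout):

import dask.dataframe as dd

# Load only two columns, and let the Parquet engine skip row groups whose
# statistics show no rows matching the predicate. Path and names are
# hypothetical placeholders.
ddf = dd.read_parquet(
    "store/collection/item",
    engine="fastparquet",
    columns=["symbol", "price"],
    filters=[("price", ">", 100)],
)
print(ddf.compute())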
Example #4
Source File: collection.py From pystore with Apache License 2.0
def index(self, item, last=False):
    data = dd.read_parquet(self._item_path(item, as_string=True),
                           columns="index",
                           engine=self.engine)
    if not last:
        return data.index.compute()

    return float(str(data.index).split(
        "\nName")[0].split("\n")[-1].split(" ")[0])
Example #5
Source File: test_copy.py From bionic with Apache License 2.0
def test_copy_dask_to_dir(tmp_path, expected_dask_df, dask_flow):
    destination = tmp_path / "output"
    destination.mkdir()
    expected_dir_path = destination / "dask_df.pq.dask"

    dask_flow.get("dask_df", mode="FileCopier").copy(destination=destination)

    actual = dd.read_parquet(expected_dir_path)
    assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute())
Example #6
Source File: test_copy.py From bionic with Apache License 2.0
def test_copy_dask_to_gcs_dir(
    tmp_path, tmp_gcs_url_prefix, expected_dask_df, dask_flow
):
    cloud_url = tmp_gcs_url_prefix + "output"

    dask_flow.get("dask_df", mode="FileCopier").copy(destination=cloud_url)

    check_call(f"gsutil -m cp -r {cloud_url} {tmp_path}", shell=True)
    actual = dd.read_parquet(tmp_path / "output")
    assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute())
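Example #6 copies the Parquet directory out of GCS with gsutil before reading it locally. dask can also read directly from a gs:// URL when the gcsfs package is installed; a minimal sketch, with a hypothetical bucket and prefix:

import dask.dataframe as dd

# Requires gcsfs so dask can talk to Google Cloud Storage.
# Bucket and prefix are hypothetical placeholders; credentials can be
# supplied via storage_options if the bucket is not public.
ddf = dd.read_parquet("gs://my-bucket/output")
print(ddf.compute())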
Example #7
Source File: ga_chp_bq_advanced_preprocessor.py From MorphL-Community-Edition with Apache License 2.0
def process_dataframe(client, hdfs_dir_input, hdfs_dir_output):
    dask_df = client.persist(dd.read_parquet(hdfs_dir_input))
    st = ScalerTransformer(dask_df)
    scaled_features = st.get_transformed_data()
    scaled_features.repartition(npartitions=32).to_parquet(hdfs_dir_output)
Example #8
Source File: ga_chp_bq_model_generator.py From MorphL-Community-Edition with Apache License 2.0
def main():
    client = Client()
    dask_df = client.persist(dd.read_parquet(HDFS_DIR_INPUT))
    ModelGenerator(dask_df).generate_and_save_model()
Example #9
Source File: ga_chp_advanced_preprocessor.py From MorphL-Community-Edition with Apache License 2.0
def process_dataframe(client, hdfs_dir_input, hdfs_dir_output):
    dask_df = client.persist(dd.read_parquet(hdfs_dir_input))
    st = ScalerTransformer(dask_df)
    scaled_features = st.get_transformed_data()
    scaled_features.repartition(npartitions=32).to_parquet(hdfs_dir_output)
Example #10
Source File: ga_chp_model_generator.py From MorphL-Community-Edition with Apache License 2.0
def main():
    client = Client()
    dask_df = client.persist(dd.read_parquet(HDFS_DIR_INPUT))
    ModelGenerator(dask_df).generate_and_save_model()
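Examples #7 through #10 all follow the same pattern: create a distributed Client, define the Parquet read lazily, and persist it so the partitions are loaded once and kept in cluster memory for the later transformation or training steps. A minimal sketch of that pattern, with a hypothetical input path:

import dask.dataframe as dd
from dask.distributed import Client

client = Client()  # connects to (or starts) a local cluster

# The path is a hypothetical placeholder; reading from HDFS additionally
# requires an HDFS-capable filesystem backend (e.g. pyarrow).
ddf = client.persist(dd.read_parquet("hdfs:///data/sessions"))

print(ddf.npartitions)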
Example #11
Source File: collection.py From pystore with Apache License 2.0
def append(self, item, data, npartitions=None, epochdate=False,
           threaded=False, reload_items=False, **kwargs):
    if not utils.path_exists(self._item_path(item)):
        raise ValueError(
            """Item do not exists. Use `<collection>.write(...)`""")

    # work on copy
    data = data.copy()

    try:
        if epochdate or ("datetime" in str(data.index.dtype) and
                         any(data.index.nanosecond) > 0):
            data = utils.datetime_to_int64(data)
        old_index = dd.read_parquet(self._item_path(item, as_string=True),
                                    columns=[],
                                    engine=self.engine
                                    ).index.compute()
        data = data[~data.index.isin(old_index)]
    except Exception:
        return

    if data.empty:
        return

    if data.index.name == "":
        data.index.name = "index"

    # combine old dataframe with new
    current = self.item(item)
    new = dd.from_pandas(data, npartitions=1)
    combined = dd.concat([current.data, new]).drop_duplicates(keep="last")

    if npartitions is None:
        memusage = combined.memory_usage(deep=True).sum()
        if isinstance(combined, dd.DataFrame):
            memusage = memusage.compute()
        npartitions = int(1 + memusage // DEFAULT_PARTITION_SIZE)

    # write data
    write = self.write_threaded if threaded else self.write
    write(item, combined, npartitions=npartitions,
          chunksize=None, metadata=current.metadata,
          overwrite=True, epochdate=epochdate,
          reload_items=reload_items, **kwargs)
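The append method above reads only the index of the stored item by passing columns=[] to dd.read_parquet, then drops incoming rows whose index already exists. A stripped-down sketch of that de-duplication step, with a hypothetical path and data:

import pandas as pd
import dask.dataframe as dd

# Hypothetical new rows to append, indexed by timestamp.
new_data = pd.DataFrame(
    {"value": [1.0, 2.0]},
    index=pd.to_datetime(["2024-01-01", "2024-01-02"]),
)

# columns=[] loads no data columns, so only the index is materialized.
old_index = dd.read_parquet("store/collection/item", columns=[]).index.compute()

# Keep only rows whose index is not already stored.
new_data = new_data[~new_data.index.isin(old_index)]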