Python dask.dataframe.from_pandas() Examples
The following are 30 code examples of dask.dataframe.from_pandas(), taken from open-source projects. You can go to the original project or source file by following the link above each example. You may also want to check out the other available functions and classes of the dask.dataframe module.
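Before the project examples, a minimal usage sketch may help orient readers. The frame contents, column names, and partition count below are illustrative assumptions, not taken from any of the projects listed.

import pandas as pd
import dask.dataframe as dd

# A small pandas DataFrame to convert (illustrative data).
pdf = pd.DataFrame({"x": range(10), "y": [0, 1] * 5})

# Split the in-memory frame into 2 partitions. from_pandas also accepts
# chunksize= (rows per partition) instead of npartitions=; pass one of the two.
ddf = dd.from_pandas(pdf, npartitions=2)

# Operations are lazy until .compute() materialises a pandas object again.
result = ddf.y.sum().compute()
print(result)  # 5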
Example #1
Source File: test_block_transformer.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_validate(self, mocker, daskify, validate):
    X = np.arange(100).reshape((25, 4))
    df = pd.DataFrame(X).rename(columns=str)

    if daskify:
        X = da.from_array(X, chunks=(5, 4))
        df = dd.from_pandas(df, npartitions=2)

    m = mocker.patch("dask_ml.preprocessing._block_transformer.check_array")
    bt = BlockTransformer(lambda x: x, validate=validate)

    if validate:
        _ = bt.transform(X)
        m.assert_called_once()
        m.reset_mock()
        _ = bt.transform(df)
        m.assert_called_once()
    else:
        _ = bt.transform(X)
        m.assert_not_called()
        _ = bt.transform(df)
        m.assert_not_called()
Example #2
Source File: swifter.py From swifter with MIT License | 6 votes |
def _dask_apply(self, func, *args, **kwds):
    try:
        # check that the dask rolling apply matches the pandas apply
        with suppress_stdout_stderr():
            tmp_df = (
                dd.from_pandas(self._comparison_pd, npartitions=self._npartitions)
                .rolling(**{k: v for k, v in self._rolling_kwds.items() if k not in ["on", "closed"]})
                .apply(func, *args, **kwds)
                .compute(scheduler=self._scheduler)
            )
        self._validate_apply(
            tmp_df.equals(self._comparison_pd.rolling(**self._rolling_kwds).apply(func, *args, **kwds)),
            error_message="Dask rolling apply sample does not match pandas rolling apply sample.",
        )

        if self._progress_bar:
            with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"):
                return self._obj_dd.apply(func, *args, **kwds).compute(scheduler=self._scheduler)
        else:
            return self._obj_dd.apply(func, *args, **kwds).compute(scheduler=self._scheduler)
    except ERRORS_TO_HANDLE:
        if self._progress_bar:
            tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
            return self._obj_pd.progress_apply(func, *args, **kwds)
        else:
            return self._obj_pd.apply(func, *args, **kwds)
Example #3
Source File: test_label.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_categorical(self, categories, transformed, daskify, ordered):
    cat = pd.Series(
        ["a", "b", "a"],
        dtype=pd.api.types.CategoricalDtype(categories=categories, ordered=ordered),
    )
    if daskify:
        cat = dd.from_pandas(cat, npartitions=2)
        transformed = da.from_array(transformed, chunks=(2, 1))
        if daskify == "unknown":
            cat = cat.cat.as_unknown()
    a = dpp.LabelEncoder().fit(cat)

    if daskify != "unknown":
        assert a.dtype_ == cat.dtype

    np.testing.assert_array_equal(a.classes_, categories)

    result = a.transform(cat)
    da.utils.assert_eq(result, transformed)

    inv_transformed = a.inverse_transform(result)
    if daskify:
        # manually set the divisions for the test
        inv_transformed.divisions = (0, 2)
    dd.utils.assert_eq(inv_transformed, cat)
Example #4
Source File: swifter.py From swifter with MIT License | 6 votes |
def __init__(
    self,
    pandas_obj,
    npartitions=None,
    dask_threshold=1,
    scheduler="processes",
    progress_bar=True,
    progress_bar_desc=None,
    allow_dask_on_strings=False,
):
    super(Transformation, self).__init__(
        pandas_obj, npartitions, dask_threshold, scheduler, progress_bar, progress_bar_desc, allow_dask_on_strings
    )
    self._sample_pd = pandas_obj.iloc[: self._SAMPLE_SIZE]
    self._obj_pd = pandas_obj
    self._obj_dd = dd.from_pandas(pandas_obj, npartitions=npartitions)
    self._nrows = pandas_obj.shape[0]
Example #5
Source File: test_data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_inverse_transform(self):
    enc = dpp.OrdinalEncoder()
    df = dd.from_pandas(
        pd.DataFrame(
            {"A": np.arange(10), "B": pd.Categorical(["a"] * 4 + ["b"] * 6)}
        ),
        npartitions=2,
    )
    enc.fit(df)

    assert dask.is_dask_collection(enc.inverse_transform(enc.transform(df).values))
    assert dask.is_dask_collection(enc.inverse_transform(enc.transform(df)))

    assert_eq_df(df, enc.inverse_transform(enc.transform(df)))
    assert_eq_df(df, enc.inverse_transform(enc.transform(df)))
    assert_eq_df(df, enc.inverse_transform(enc.transform(df).values))
    assert_eq_df(df, enc.inverse_transform(enc.transform(df).values))
Example #6
Source File: test_label.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_use_categorical(self, daskify):
    data = pd.Series(
        ["b", "c"], dtype=pd.api.types.CategoricalDtype(["c", "a", "b"])
    )
    if daskify:
        data = dd.from_pandas(data, npartitions=2)

    a = dpp.LabelEncoder(use_categorical=False).fit(data)
    b = spp.LabelEncoder().fit(data)
    assert_estimator_equal(a, b, exclude={"dtype_"})
    assert a.dtype_ is None

    da.utils.assert_eq(a.transform(data), b.transform(data))
    a_trn = a.transform(data)
    b_trn = b.transform(data)
    da.utils.assert_eq(a_trn, b_trn)
    da.utils.assert_eq(a.inverse_transform(a_trn), b.inverse_transform(b_trn))
Example #7
Source File: test_lf_applier.py From snorkel with Apache License 2.0 | 6 votes |
def test_lf_applier_pandas_spacy_preprocessor_memoized(self) -> None:
    spacy = SpacyPreprocessor(text_field="text", doc_field="doc")
    spacy.memoize = True

    @labeling_function(pre=[spacy])
    def first_is_name(x: DataPoint) -> int:
        return 0 if x.doc[0].pos_ == "PROPN" else -1

    @labeling_function(pre=[spacy])
    def has_verb(x: DataPoint) -> int:
        return 0 if sum(t.pos_ == "VERB" for t in x.doc) > 0 else -1

    df = pd.DataFrame(dict(text=TEXT_DATA))
    df = dd.from_pandas(df, npartitions=2)
    applier = DaskLFApplier([first_is_name, has_verb])
    L = applier.apply(df)
    np.testing.assert_equal(L, L_TEXT_EXPECTED)
Example #8
Source File: test_block_transformer.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_block_transform_multiply(self, daskify, validation, factor):
    X = np.arange(100).reshape((25, 4))
    df = pd.DataFrame(X).rename(columns=str)

    if daskify:
        X = da.from_array(X, chunks=(5, 4))
        df = dd.from_pandas(df, npartitions=2)

    if factor:
        bt = BlockTransformer(multiply, validate=validation, factor=factor)
    else:
        bt = BlockTransformer(multiply, validate=validation)

    if daskify:
        assert dask.is_dask_collection(bt.transform(X))
        assert dask.is_dask_collection(bt.transform(df))

    if factor:
        da.utils.assert_eq(bt.transform(X), multiply(X, factor=factor))
        dd.utils.assert_eq(bt.transform(df), multiply(df, factor=factor))
    else:
        da.utils.assert_eq(bt.transform(X), multiply(X))
        dd.utils.assert_eq(bt.transform(df), multiply(df))
Example #9
Source File: testdatasetproperty.py From holoviews with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_to_holomap_dask(self):
    if dd is None:
        raise SkipTest("Dask required to test .to with dask dataframe.")
    ddf = dd.from_pandas(self.df, npartitions=2)
    dds = Dataset(
        ddf,
        kdims=[
            Dimension('a', label="The a Column"),
            Dimension('b', label="The b Column"),
            Dimension('c', label="The c Column"),
            Dimension('d', label="The d Column"),
        ]
    )

    curve_hmap = dds.to(Curve, 'a', 'b', groupby=['c'])

    # Check HoloMap element datasets
    for v in self.df.c.drop_duplicates():
        curve = curve_hmap.data[(v,)]
        self.assertEqual(
            curve.dataset, self.ds
        )

        # Execute pipeline
        self.assertEqual(curve.pipeline(curve.dataset), curve)
Example #10
Source File: test_impute.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_frame_strategies(daskify, strategy):
    df = pd.DataFrame({"A": [1, 1, np.nan, np.nan, 2, 2]})
    if daskify:
        df = dd.from_pandas(df, 2)

    if strategy == "constant":
        fill_value = 2
    else:
        fill_value = None

    b = dask_ml.impute.SimpleImputer(strategy=strategy, fill_value=fill_value)
    b.fit(df)

    if not daskify and strategy == "median":
        expected = pd.Series([1.5], index=["A"])
    else:
        expected = pd.Series([2], index=["A"])
    tm.assert_series_equal(b.statistics_, expected, check_dtype=False)
Example #11
Source File: mock.py From timeserio with MIT License | 6 votes |
def mock_dask_fit_data(
    periods=DEF_N,
    start_date=None,
    ids=[0],
    embedding_dim=DEF_EMB_DIM,
    seq_length=DEF_SEQ_LENGTH
):
    """Create example fit data as a dask DataFrame.

    DataFrame is partitioned by ID.
    """
    df = mock_fit_data(
        periods=periods,
        start_date=start_date,
        ids=ids,
        embedding_dim=embedding_dim,
        seq_length=seq_length
    )
    ddf = dd.from_pandas(df, chunksize=periods)
    return ddf
Example #12
Source File: mock.py From timeserio with MIT License | 6 votes |
def mock_dask_raw_data(
    periods=DEF_N, start_date=None, ids=[0]
):
    """Create example fit data as a dask DataFrame.

    DataFrame is partitioned by ID.
    """
    df = mock_raw_data(
        periods=periods,
        start_date=start_date,
        ids=ids,
    )
    ddf = dd.from_pandas(df, chunksize=periods)
    return ddf
Example #13
Source File: test_model_selection.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_grid_search_dask_dataframe():
    iris = load_iris()
    X = iris.data
    y = iris.target

    df = pd.DataFrame(X)
    ddf = dd.from_pandas(df, 2)

    dy = pd.Series(y)
    ddy = dd.from_pandas(dy, 2)

    clf = LogisticRegression(multi_class="auto", solver="lbfgs", max_iter=200)
    param_grid = {"C": [0.1, 1, 10]}

    gs = GridSearchCV(clf, param_grid, cv=5)
    dgs = dcv.GridSearchCV(clf, param_grid, cv=5)

    gs.fit(df, dy)
    dgs.fit(ddf, ddy)

    assert gs.best_params_ == dgs.best_params_
Example #14
Source File: test_incremental.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_incremental_text_pipeline(container):
    X = pd.Series(["a list", "of words", "for classification"] * 100)
    X = dd.from_pandas(X, npartitions=3)

    if container == "bag":
        X = X.to_bag()

    y = da.from_array(np.array([0, 0, 1] * 100), chunks=(100,) * 3)
    assert tuple(X.map_partitions(len).compute()) == y.chunks[0]

    sgd = SGDClassifier(max_iter=5, tol=1e-3)
    clf = Incremental(sgd, scoring="accuracy", assume_equal_chunks=True)
    vect = dask_ml.feature_extraction.text.HashingVectorizer()
    pipe = make_pipeline(vect, clf)

    pipe.fit(X, y, incremental__classes=[0, 1])
    X2 = pipe.steps[0][1].transform(X)
    assert hasattr(clf, "coef_")
    X2.compute_chunk_sizes()
    assert X2.shape == (300, vect.n_features)
Example #15
Source File: test_protocols.py From bionic with Apache License 2.0 | 6 votes |
def test_typed_dask_dataframe(builder):
    df_value = pd.DataFrame()
    df_value["int"] = [1, 2, 3]
    df_value["float"] = [1.0, 1.5, float("nan")]
    df_value["str"] = ["red", "blue", None]
    df_value["time"] = pd.to_datetime(["2011-02-07", "2011-03-17", "2011-04-27"])
    dask_df = dd.from_pandas(df_value, npartitions=1)

    @builder
    @bn.protocol.dask
    def df():
        return dask_df

    assert equal_frame_and_index_content(
        builder.build().get("df").compute(), dask_df.compute()
    )
    assert (
        builder.build().get("df").compute().dtypes.to_dict()
        == dask_df.compute().dtypes.to_dict()
    )
Example #16
Source File: test_data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_da(self):
    a = dd.from_pandas(dummy, npartitions=2)
    de = dpp.DummyEncoder()
    result = de.fit_transform(a)
    assert isinstance(result, dd.DataFrame)
Example #17
Source File: test_data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_drop_first(self, daskify):
    if daskify:
        df = dd.from_pandas(dummy, 2)
    else:
        df = dummy

    de = dpp.DummyEncoder(drop_first=True)
    trn = de.fit_transform(df)
    assert len(trn.columns) == 8

    result = de.inverse_transform(trn)

    if daskify:
        result, df = compute(result, df)
    tm.assert_frame_equal(result, dummy)
Example #18
Source File: swifter.py From swifter with MIT License | 5 votes |
def _dask_apply(self, func, axis=0, raw=None, result_type=None, *args, **kwds):
    sample = self._obj.iloc[: self._npartitions * 2, :]
    with suppress_stdout_stderr():
        meta = sample.apply(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds)
    try:
        with suppress_stdout_stderr():
            # check that the dask apply matches the pandas apply
            tmp_df = (
                dd.from_pandas(sample, npartitions=self._npartitions)
                .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                .compute(scheduler=self._scheduler)
            )
        self._validate_apply(
            tmp_df.equals(meta), error_message="Dask apply sample does not match pandas apply sample."
        )

        if self._progress_bar:
            with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"):
                return (
                    dd.from_pandas(self._obj, npartitions=self._npartitions)
                    .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                    .compute(scheduler=self._scheduler)
                )
        else:
            return (
                dd.from_pandas(self._obj, npartitions=self._npartitions)
                .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                .compute(scheduler=self._scheduler)
            )
    except ERRORS_TO_HANDLE:
        # if dask apply doesn't match pandas apply, fallback to pandas
        if self._progress_bar:
            tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
            apply_func = self._obj.progress_apply
        else:
            apply_func = self._obj.apply
        return apply_func(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds)
Example #19
Source File: swifter.py From swifter with MIT License | 5 votes |
def _dask_applymap(self, func):
    sample = self._obj.iloc[: self._npartitions * 2, :]
    with suppress_stdout_stderr():
        meta = sample.applymap(func)
    try:
        with suppress_stdout_stderr():
            # check that the dask apply matches the pandas apply
            tmp_df = (
                dd.from_pandas(sample, npartitions=self._npartitions)
                .applymap(func, meta=meta)
                .compute(scheduler=self._scheduler)
            )
        self._validate_apply(
            tmp_df.equals(meta), error_message="Dask applymap sample does not match pandas applymap sample."
        )

        if self._progress_bar:
            with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Applymap"):
                return (
                    dd.from_pandas(self._obj, npartitions=self._npartitions)
                    .applymap(func, meta=meta)
                    .compute(scheduler=self._scheduler)
                )
        else:
            return (
                dd.from_pandas(self._obj, npartitions=self._npartitions)
                .applymap(func, meta=meta)
                .compute(scheduler=self._scheduler)
            )
    except ERRORS_TO_HANDLE:
        # if dask apply doesn't match pandas apply, fallback to pandas
        if self._progress_bar:
            tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
            applymap_func = self._obj.progress_applymap
        else:
            applymap_func = self._obj.applymap
        return applymap_func(func)
Example #20
Source File: test_column_transformer.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_column_transformer_unk_chunksize():
    names = ["a", "b", "c"]
    x = dd.from_pandas(pd.DataFrame(np.arange(12).reshape(4, 3), columns=names), 2)
    features = sklearn.pipeline.Pipeline(
        [
            (
                "features",
                sklearn.pipeline.FeatureUnion(
                    [
                        (
                            "ratios",
                            dask_ml.compose.ColumnTransformer(
                                [
                                    ("a_b", SumTransformer(one_d=False), ["a", "b"]),
                                    ("b_c", SumTransformer(one_d=False), ["b", "c"]),
                                ]
                            ),
                        )
                    ]
                ),
            )
        ]
    )

    out = features.fit_transform(x)
    exp = np.array([[1, 3], [7, 9], [13, 15], [19, 21]])
    np.testing.assert_array_equal(out, exp)
Example #21
Source File: test_partial.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_dataframes():
    df = pd.DataFrame({"x": range(10), "y": [0, 1] * 5})
    ddf = dd.from_pandas(df, npartitions=2)

    with dask.config.set(scheduler="single-threaded"):
        sgd = SGDClassifier(max_iter=5, tol=1e-3)
        sgd = fit(sgd, ddf[["x"]], ddf.y, classes=[0, 1])

        sol = sgd.predict(df[["x"]])
        result = predict(sgd, ddf[["x"]])

        da.utils.assert_eq(sol, result)
Example #22
Source File: simpletable.py From pyphot with MIT License | 5 votes |
def to_dask(self, **kwargs):
    """ Construct a Dask DataFrame

    This splits an in-memory Pandas dataframe into several parts and constructs
    a dask.dataframe from those parts on which Dask.dataframe can operate in
    parallel.

    Note that, despite parallelism, Dask.dataframe may not always be faster
    than Pandas. We recommend that you stay with Pandas for as long as
    possible before switching to Dask.dataframe.

    Parameters
    ----------
    keys: sequence, optional
        ordered subset of columns to export

    npartitions : int, optional
        The number of partitions of the index to create. Note that depending
        on the size and index of the dataframe, the output may have fewer
        partitions than requested.

    chunksize : int, optional
        The size of the partitions of the index.

    sort: bool
        Sort input first to obtain cleanly divided partitions or don't sort
        and don't get cleanly divided partitions

    name: string, optional
        An optional keyname for the dataframe. Defaults to hashing the input

    Returns
    -------
    dask.DataFrame or dask.Series
        A dask DataFrame/Series partitioned along the index
    """
    try:
        from dask import dataframe
        keys = kwargs.pop('keys', None)
        return dataframe.from_pandas(self.to_pandas(keys=keys), **kwargs)
    except ImportError as error:
        print("Dask import error")
        raise error
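The to_dask() wrapper above forwards its keyword arguments straight to dask.dataframe.from_pandas, so the npartitions, chunksize, and sort options described in its docstring behave just as they do on a plain pandas object. A minimal sketch of those options on made-up data follows; the DataFrame contents and partition sizes are assumptions for illustration only.

import pandas as pd
from dask import dataframe

# Illustrative data standing in for SimpleTable.to_pandas() output.
pdf = pd.DataFrame({"flux": [1.2, 3.4, 5.6, 7.8]})

# Either fix the number of partitions...
ddf_a = dataframe.from_pandas(pdf, npartitions=2)

# ...or fix the rows per partition; sort=True (the default) keeps the index
# ordered so the partition divisions are clean.
ddf_b = dataframe.from_pandas(pdf, chunksize=2, sort=True)

print(ddf_a.npartitions, ddf_b.npartitions)  # 2 2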
Example #23
Source File: test_text.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_correct_meta():
    vect = dask_ml.feature_extraction.text.HashingVectorizer()
    X = dd.from_pandas(pd.Series(["some text", "to classifiy"]), 2)
    result = vect.fit_transform(X)
    assert scipy.sparse.issparse(result._meta)
    assert result._meta.dtype == "float64"
    assert result._meta.shape == (0, 0)
Example #24
Source File: test_downstream.py From elasticintel with GNU General Public License v3.0 | 5 votes |
def test_pyarrow(df):
    pyarrow = import_module('pyarrow')  # noqa
    table = pyarrow.Table.from_pandas(df)
    result = table.to_pandas()
    tm.assert_frame_equal(result, df)
Example #25
Source File: test_downstream.py From elasticintel with GNU General Public License v3.0 | 5 votes |
def test_dask(df):
    toolz = import_module('toolz')  # noqa
    dask = import_module('dask')  # noqa

    import dask.dataframe as dd
    ddf = dd.from_pandas(df, npartitions=3)
    assert ddf.A is not None
    assert ddf.compute() is not None
Example #26
Source File: test_viirs_edr_active_fires.py From satpy with GNU General Public License v3.0 | 5 votes |
def get_test_content(self):
    """Create fake test file content."""
    fake_file = io.StringIO(u'''\n\n\n\n\n\n\n\n\n\n\n\n\n\n
  24.64015007, -107.57017517, 317.38290405, 0.75, 0.75, 40, 4.28618050
  25.90660477, -100.06127167, 331.17962646, 0.75, 0.75, 81, 20.61096764''')

    platform_key = {"NPP": "Suomi-NPP", "J01": "NOAA-20", "J02": "NOAA-21"}
    self.platform_name = platform_key.get(self.filename_info['satellite_name'].upper(), "unknown")

    return dd.from_pandas(pd.read_csv(fake_file, skiprows=15, header=None,
                                      names=["latitude", "longitude",
                                             "T4", "Along-scan", "Along-track",
                                             "confidence_cat", "power"]),
                          chunksize=1)
Example #27
Source File: test_viirs_edr_active_fires.py From satpy with GNU General Public License v3.0 | 5 votes |
def get_test_content(self):
    """Create fake test file content."""
    fake_file = io.StringIO(u'''\n\n\n\n\n\n\n\n\n\n\n\n\n\n
  24.64015007, -107.57017517, 317.38290405, 0.75, 0.75, 40, 4.28618050
  25.90660477, -100.06127167, 331.17962646, 0.75, 0.75, 81, 20.61096764''')

    return dd.from_pandas(pd.read_csv(fake_file, skiprows=15, header=None,
                                      names=["latitude", "longitude",
                                             "T13", "Along-scan", "Along-track",
                                             "confidence_pct", "power"]),
                          chunksize=1)
Example #28
Source File: test_persistence_gcs.py From bionic with Apache License 2.0 | 5 votes |
def test_multifile_serialization(gcs_builder, make_counter):
    call_counter = make_counter()
    builder = gcs_builder

    dask_df = dd.from_pandas(
        df_from_csv_str(
            """
            color,number
            red,1
            blue,2
            green,3
            """
        ),
        npartitions=1,
    )

    @builder
    @bn.protocol.dask
    @count_calls(call_counter)
    def df():
        return dask_df

    flow = builder.build()
    local_cache_path_str = flow.get("core__persistent_cache__flow_dir")

    assert equal_frame_and_index_content(flow.get("df").compute(), dask_df.compute())
    assert equal_frame_and_index_content(flow.get("df").compute(), dask_df.compute())
    assert call_counter.times_called() == 1

    local_wipe_path(local_cache_path_str)
    flow = builder.build()

    assert equal_frame_and_index_content(flow.get("df").compute(), dask_df.compute())
    assert call_counter.times_called() == 0
Example #29
Source File: simpletable.py From pyphot with MIT License | 5 votes |
def to_dask(self, **kwargs):
    """ Construct a Dask DataFrame

    This splits an in-memory Pandas dataframe into several parts and constructs
    a dask.dataframe from those parts on which Dask.dataframe can operate in
    parallel.

    Note that, despite parallelism, Dask.dataframe may not always be faster
    than Pandas. We recommend that you stay with Pandas for as long as
    possible before switching to Dask.dataframe.

    Parameters
    ----------
    keys: sequence, optional
        ordered subset of columns to export

    npartitions : int, optional
        The number of partitions of the index to create. Note that depending
        on the size and index of the dataframe, the output may have fewer
        partitions than requested.

    chunksize : int, optional
        The size of the partitions of the index.

    sort: bool
        Sort input first to obtain cleanly divided partitions or don't sort
        and don't get cleanly divided partitions

    name: string, optional
        An optional keyname for the dataframe. Defaults to hashing the input

    Returns
    -------
    dask.DataFrame or dask.Series
        A dask DataFrame/Series partitioned along the index
    """
    try:
        from dask import dataframe
        keys = kwargs.pop('keys', None)
        return dataframe.from_pandas(self.to_pandas(keys=keys), **kwargs)
    except ImportError as error:
        print("Dask import error")
        raise error
Example #30
Source File: test_impute.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_impute_most_frequent():
    # https://github.com/dask/dask-ml/issues/385
    data = dd.from_pandas(pd.DataFrame([1, 1, 1, 1, np.nan, np.nan]), 2)
    model = dask_ml.impute.SimpleImputer(strategy="most_frequent")
    result = model.fit_transform(data)
    expected = dd.from_pandas(pd.DataFrame({0: [1.0] * 6}), 2)
    dd.utils.assert_eq(result, expected)
    assert model.statistics_[0] == 1.0