Python dask.dataframe.from_pandas() Examples

The following are 29 code examples of dask.dataframe.from_pandas(), collected from open-source projects. The original project and source file are noted above each example, and you may also want to check out the other available functions and classes of the dask.dataframe module.
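For orientation, here is a minimal, self-contained sketch of the pattern most of these examples share: build a pandas object in memory, wrap it with dask.dataframe.from_pandas() (specifying either npartitions or chunksize), operate on it lazily, and call .compute() to materialize a pandas result again. The variable names below are illustrative and not taken from any particular example.

import pandas as pd
import dask.dataframe as dd

# An ordinary in-memory pandas DataFrame.
pdf = pd.DataFrame({"x": range(10), "y": [0, 1] * 5})

# Wrap it as a dask DataFrame split into 2 partitions along the index.
ddf = dd.from_pandas(pdf, npartitions=2)

# Operations are lazy; .compute() turns the result back into a pandas object.
result = ddf.x.mean().compute()
print(result)  # 4.5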
Example #1
Source File: test_block_transformer.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_validate(self, mocker, daskify, validate):
        X = np.arange(100).reshape((25, 4))
        df = pd.DataFrame(X).rename(columns=str)
        if daskify:
            X = da.from_array(X, chunks=(5, 4))
            df = dd.from_pandas(df, npartitions=2)
        m = mocker.patch("dask_ml.preprocessing._block_transformer.check_array")
        bt = BlockTransformer(lambda x: x, validate=validate)
        if validate:
            _ = bt.transform(X)
            m.assert_called_once()
            m.reset_mock()
            _ = bt.transform(df)
            m.assert_called_once()
        else:
            _ = bt.transform(X)
            m.assert_not_called()
            _ = bt.transform(df)
            m.assert_not_called() 
Example #2
Source File: swifter.py    From swifter with MIT License
def _dask_apply(self, func, *args, **kwds):
        try:
            # check that the dask rolling apply matches the pandas apply
            with suppress_stdout_stderr():
                tmp_df = (
                    dd.from_pandas(self._comparison_pd, npartitions=self._npartitions)
                    .rolling(**{k: v for k, v in self._rolling_kwds.items() if k not in ["on", "closed"]})
                    .apply(func, *args, **kwds)
                    .compute(scheduler=self._scheduler)
                )
                self._validate_apply(
                    tmp_df.equals(self._comparison_pd.rolling(**self._rolling_kwds).apply(func, *args, **kwds)),
                    error_message="Dask rolling apply sample does not match pandas rolling apply sample.",
                )
            if self._progress_bar:
                with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"):
                    return self._obj_dd.apply(func, *args, **kwds).compute(scheduler=self._scheduler)
            else:
                return self._obj_dd.apply(func, *args, **kwds).compute(scheduler=self._scheduler)
        except ERRORS_TO_HANDLE:
            if self._progress_bar:
                tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
                return self._obj_pd.progress_apply(func, *args, **kwds)
            else:
                return self._obj_pd.apply(func, *args, **kwds) 
Example #3
Source File: test_label.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_categorical(self, categories, transformed, daskify, ordered):
        cat = pd.Series(
            ["a", "b", "a"],
            dtype=pd.api.types.CategoricalDtype(categories=categories, ordered=ordered),
        )
        if daskify:
            cat = dd.from_pandas(cat, npartitions=2)
            transformed = da.from_array(transformed, chunks=(2, 1))
            if daskify == "unknown":
                cat = cat.cat.as_unknown()

        a = dpp.LabelEncoder().fit(cat)

        if daskify != "unknown":
            assert a.dtype_ == cat.dtype
        np.testing.assert_array_equal(a.classes_, categories)
        result = a.transform(cat)
        da.utils.assert_eq(result, transformed)

        inv_transformed = a.inverse_transform(result)
        if daskify:
            # manually set the divisions for the test
            inv_transformed.divisions = (0, 2)
        dd.utils.assert_eq(inv_transformed, cat) 
Example #4
Source File: swifter.py    From swifter with MIT License
def __init__(
        self,
        pandas_obj,
        npartitions=None,
        dask_threshold=1,
        scheduler="processes",
        progress_bar=True,
        progress_bar_desc=None,
        allow_dask_on_strings=False,
    ):
        super(Transformation, self).__init__(
            pandas_obj, npartitions, dask_threshold, scheduler, progress_bar, progress_bar_desc, allow_dask_on_strings
        )
        self._sample_pd = pandas_obj.iloc[: self._SAMPLE_SIZE]
        self._obj_pd = pandas_obj
        self._obj_dd = dd.from_pandas(pandas_obj, npartitions=npartitions)
        self._nrows = pandas_obj.shape[0] 
Example #5
Source File: test_data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_inverse_transform(self):
        enc = dpp.OrdinalEncoder()
        df = dd.from_pandas(
            pd.DataFrame(
                {"A": np.arange(10), "B": pd.Categorical(["a"] * 4 + ["b"] * 6)}
            ),
            npartitions=2,
        )
        enc.fit(df)

        assert dask.is_dask_collection(enc.inverse_transform(enc.transform(df).values))
        assert dask.is_dask_collection(enc.inverse_transform(enc.transform(df)))

        assert_eq_df(df, enc.inverse_transform(enc.transform(df)))
        assert_eq_df(df, enc.inverse_transform(enc.transform(df).values))
Example #6
Source File: test_label.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_use_categorical(self, daskify):
        data = pd.Series(
            ["b", "c"], dtype=pd.api.types.CategoricalDtype(["c", "a", "b"])
        )
        if daskify:
            data = dd.from_pandas(data, npartitions=2)
        a = dpp.LabelEncoder(use_categorical=False).fit(data)
        b = spp.LabelEncoder().fit(data)
        assert_estimator_equal(a, b, exclude={"dtype_"})
        assert a.dtype_ is None

        da.utils.assert_eq(a.transform(data), b.transform(data))
        a_trn = a.transform(data)
        b_trn = b.transform(data)
        da.utils.assert_eq(a_trn, b_trn)
        da.utils.assert_eq(a.inverse_transform(a_trn), b.inverse_transform(b_trn)) 
Example #7
Source File: test_lf_applier.py    From snorkel with Apache License 2.0
def test_lf_applier_pandas_spacy_preprocessor_memoized(self) -> None:
        spacy = SpacyPreprocessor(text_field="text", doc_field="doc")
        spacy.memoize = True

        @labeling_function(pre=[spacy])
        def first_is_name(x: DataPoint) -> int:
            return 0 if x.doc[0].pos_ == "PROPN" else -1

        @labeling_function(pre=[spacy])
        def has_verb(x: DataPoint) -> int:
            return 0 if sum(t.pos_ == "VERB" for t in x.doc) > 0 else -1

        df = pd.DataFrame(dict(text=TEXT_DATA))
        df = dd.from_pandas(df, npartitions=2)
        applier = DaskLFApplier([first_is_name, has_verb])
        L = applier.apply(df)
        np.testing.assert_equal(L, L_TEXT_EXPECTED) 
Example #8
Source File: test_block_transformer.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_block_transform_multiply(self, daskify, validation, factor):
        X = np.arange(100).reshape((25, 4))
        df = pd.DataFrame(X).rename(columns=str)
        if daskify:
            X = da.from_array(X, chunks=(5, 4))
            df = dd.from_pandas(df, npartitions=2)
        if factor:
            bt = BlockTransformer(multiply, validate=validation, factor=factor)
        else:
            bt = BlockTransformer(multiply, validate=validation)
        if daskify:
            assert dask.is_dask_collection(bt.transform(X))
            assert dask.is_dask_collection(bt.transform(df))
        if factor:
            da.utils.assert_eq(bt.transform(X), multiply(X, factor=factor))
            dd.utils.assert_eq(bt.transform(df), multiply(df, factor=factor))
        else:
            da.utils.assert_eq(bt.transform(X), multiply(X))
            dd.utils.assert_eq(bt.transform(df), multiply(df)) 
Example #9
Source File: testdatasetproperty.py    From holoviews with BSD 3-Clause "New" or "Revised" License
def test_to_holomap_dask(self):
        if dd is None:
            raise SkipTest("Dask required to test .to with dask dataframe.")
        ddf = dd.from_pandas(self.df, npartitions=2)
        dds = Dataset(
            ddf,
            kdims=[
                Dimension('a', label="The a Column"),
                Dimension('b', label="The b Column"),
                Dimension('c', label="The c Column"),
                Dimension('d', label="The d Column"),
            ]
        )

        curve_hmap = dds.to(Curve, 'a', 'b', groupby=['c'])

        # Check HoloMap element datasets
        for v in self.df.c.drop_duplicates():
            curve = curve_hmap.data[(v,)]
            self.assertEqual(
                curve.dataset, self.ds
            )

            # Execute pipeline
            self.assertEqual(curve.pipeline(curve.dataset), curve) 
Example #10
Source File: test_impute.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_frame_strategies(daskify, strategy):
    df = pd.DataFrame({"A": [1, 1, np.nan, np.nan, 2, 2]})
    if daskify:
        df = dd.from_pandas(df, 2)

    if strategy == "constant":
        fill_value = 2
    else:
        fill_value = None

    b = dask_ml.impute.SimpleImputer(strategy=strategy, fill_value=fill_value)
    b.fit(df)
    if not daskify and strategy == "median":
        expected = pd.Series([1.5], index=["A"])
    else:
        expected = pd.Series([2], index=["A"])
    tm.assert_series_equal(b.statistics_, expected, check_dtype=False) 
Example #11
Source File: mock.py    From timeserio with MIT License
def mock_dask_fit_data(
    periods=DEF_N,
    start_date=None,
    ids=[0],
    embedding_dim=DEF_EMB_DIM,
    seq_length=DEF_SEQ_LENGTH
):
    """Create example fit data as a dask DataFrame.

    DataFrame is partitioned by ID.
    """
    df = mock_fit_data(
        periods=periods,
        start_date=start_date,
        ids=ids,
        embedding_dim=embedding_dim,
        seq_length=seq_length
    )
    ddf = dd.from_pandas(df, chunksize=periods)
    return ddf 
Example #12
Source File: mock.py    From timeserio with MIT License
def mock_dask_raw_data(
    periods=DEF_N,
    start_date=None,
    ids=[0]
):
    """Create example fit data as a dask DataFrame.

    DataFrame is partitioned by ID.
    """
    df = mock_raw_data(
        periods=periods,
        start_date=start_date,
        ids=ids,
    )
    ddf = dd.from_pandas(df, chunksize=periods)
    return ddf 
Example #13
Source File: test_model_selection.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_grid_search_dask_dataframe():
    iris = load_iris()
    X = iris.data
    y = iris.target

    df = pd.DataFrame(X)
    ddf = dd.from_pandas(df, 2)

    dy = pd.Series(y)
    ddy = dd.from_pandas(dy, 2)

    clf = LogisticRegression(multi_class="auto", solver="lbfgs", max_iter=200)

    param_grid = {"C": [0.1, 1, 10]}
    gs = GridSearchCV(clf, param_grid, cv=5)
    dgs = dcv.GridSearchCV(clf, param_grid, cv=5)
    gs.fit(df, dy)
    dgs.fit(ddf, ddy)

    assert gs.best_params_ == dgs.best_params_ 
Example #14
Source File: test_incremental.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_incremental_text_pipeline(container):
    X = pd.Series(["a list", "of words", "for classification"] * 100)
    X = dd.from_pandas(X, npartitions=3)

    if container == "bag":
        X = X.to_bag()

    y = da.from_array(np.array([0, 0, 1] * 100), chunks=(100,) * 3)

    assert tuple(X.map_partitions(len).compute()) == y.chunks[0]

    sgd = SGDClassifier(max_iter=5, tol=1e-3)
    clf = Incremental(sgd, scoring="accuracy", assume_equal_chunks=True)
    vect = dask_ml.feature_extraction.text.HashingVectorizer()
    pipe = make_pipeline(vect, clf)

    pipe.fit(X, y, incremental__classes=[0, 1])
    X2 = pipe.steps[0][1].transform(X)
    assert hasattr(clf, "coef_")

    X2.compute_chunk_sizes()
    assert X2.shape == (300, vect.n_features) 
Example #15
Source File: test_protocols.py    From bionic with Apache License 2.0
def test_typed_dask_dataframe(builder):
    df_value = pd.DataFrame()
    df_value["int"] = [1, 2, 3]
    df_value["float"] = [1.0, 1.5, float("nan")]
    df_value["str"] = ["red", "blue", None]
    df_value["time"] = pd.to_datetime(["2011-02-07", "2011-03-17", "2011-04-27"])
    dask_df = dd.from_pandas(df_value, npartitions=1)

    @builder
    @bn.protocol.dask
    def df():
        return dask_df

    assert equal_frame_and_index_content(
        builder.build().get("df").compute(), dask_df.compute()
    )
    assert (
        builder.build().get("df").compute().dtypes.to_dict()
        == dask_df.compute().dtypes.to_dict()
    ) 
Example #16
Source File: test_data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_da(self):
        a = dd.from_pandas(dummy, npartitions=2)
        de = dpp.DummyEncoder()
        result = de.fit_transform(a)
        assert isinstance(result, dd.DataFrame) 
Example #17
Source File: test_data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_drop_first(self, daskify):
        if daskify:
            df = dd.from_pandas(dummy, 2)
        else:
            df = dummy
        de = dpp.DummyEncoder(drop_first=True)
        trn = de.fit_transform(df)
        assert len(trn.columns) == 8

        result = de.inverse_transform(trn)
        if daskify:
            result, df = compute(result, df)
        tm.assert_frame_equal(result, dummy) 
Example #18
Source File: swifter.py    From swifter with MIT License
def _dask_apply(self, func, axis=0, raw=None, result_type=None, *args, **kwds):
        sample = self._obj.iloc[: self._npartitions * 2, :]
        with suppress_stdout_stderr():
            meta = sample.apply(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds)
        try:
            with suppress_stdout_stderr():
                # check that the dask apply matches the pandas apply
                tmp_df = (
                    dd.from_pandas(sample, npartitions=self._npartitions)
                    .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                    .compute(scheduler=self._scheduler)
                )
                self._validate_apply(
                    tmp_df.equals(meta), error_message="Dask apply sample does not match pandas apply sample."
                )
            if self._progress_bar:
                with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"):
                    return (
                        dd.from_pandas(self._obj, npartitions=self._npartitions)
                        .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                        .compute(scheduler=self._scheduler)
                    )
            else:
                return (
                    dd.from_pandas(self._obj, npartitions=self._npartitions)
                    .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                    .compute(scheduler=self._scheduler)
                )
        except ERRORS_TO_HANDLE:
            # if dask apply doesn't match pandas apply, fallback to pandas
            if self._progress_bar:
                tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
                apply_func = self._obj.progress_apply
            else:
                apply_func = self._obj.apply

            return apply_func(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds) 
Example #19
Source File: swifter.py    From swifter with MIT License
def _dask_applymap(self, func):
        sample = self._obj.iloc[: self._npartitions * 2, :]
        with suppress_stdout_stderr():
            meta = sample.applymap(func)
        try:
            with suppress_stdout_stderr():
                # check that the dask apply matches the pandas apply
                tmp_df = (
                    dd.from_pandas(sample, npartitions=self._npartitions)
                    .applymap(func, meta=meta)
                    .compute(scheduler=self._scheduler)
                )
                self._validate_apply(
                    tmp_df.equals(meta), error_message="Dask applymap sample does not match pandas applymap sample."
                )
            if self._progress_bar:
                with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Applymap"):
                    return (
                        dd.from_pandas(self._obj, npartitions=self._npartitions)
                        .applymap(func, meta=meta)
                        .compute(scheduler=self._scheduler)
                    )
            else:
                return (
                    dd.from_pandas(self._obj, npartitions=self._npartitions)
                    .applymap(func, meta=meta)
                    .compute(scheduler=self._scheduler)
                )
        except ERRORS_TO_HANDLE:
            # if dask apply doesn't match pandas apply, fallback to pandas
            if self._progress_bar:
                tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
                applymap_func = self._obj.progress_applymap
            else:
                applymap_func = self._obj.applymap

            return applymap_func(func) 
Example #20
Source File: test_column_transformer.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_column_transformer_unk_chunksize():
    names = ["a", "b", "c"]
    x = dd.from_pandas(pd.DataFrame(np.arange(12).reshape(4, 3), columns=names), 2)
    features = sklearn.pipeline.Pipeline(
        [
            (
                "features",
                sklearn.pipeline.FeatureUnion(
                    [
                        (
                            "ratios",
                            dask_ml.compose.ColumnTransformer(
                                [
                                    ("a_b", SumTransformer(one_d=False), ["a", "b"]),
                                    ("b_c", SumTransformer(one_d=False), ["b", "c"]),
                                ]
                            ),
                        )
                    ]
                ),
            )
        ]
    )
    out = features.fit_transform(x)

    exp = np.array([[1, 3], [7, 9], [13, 15], [19, 21]])
    np.testing.assert_array_equal(out, exp) 
Example #21
Source File: test_partial.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_dataframes():
    df = pd.DataFrame({"x": range(10), "y": [0, 1] * 5})
    ddf = dd.from_pandas(df, npartitions=2)

    with dask.config.set(scheduler="single-threaded"):
        sgd = SGDClassifier(max_iter=5, tol=1e-3)

        sgd = fit(sgd, ddf[["x"]], ddf.y, classes=[0, 1])

        sol = sgd.predict(df[["x"]])
        result = predict(sgd, ddf[["x"]])

        da.utils.assert_eq(sol, result) 
Example #22
Source File: simpletable.py    From pyphot with MIT License
def to_dask(self, **kwargs):
        """ Construct a Dask DataFrame

        This splits an in-memory Pandas dataframe into several parts and constructs
        a dask.dataframe from those parts on which Dask.dataframe can operate in
        parallel.

        Note that, despite parallelism, Dask.dataframe may not always be faster
        than Pandas.  We recommend that you stay with Pandas for as long as
        possible before switching to Dask.dataframe.

        Parameters
        ----------
        keys: sequence, optional
            ordered subset of columns to export
        npartitions : int, optional
            The number of partitions of the index to create. Note that depending on
            the size and index of the dataframe, the output may have fewer
            partitions than requested.
        chunksize : int, optional
            The size of the partitions of the index.
        sort: bool
            Sort the input first to obtain cleanly divided partitions; otherwise
            the partitions will not be cleanly divided.
        name: string, optional
            An optional keyname for the dataframe. Defaults to hashing the input.

        Returns
        -------
        dask.DataFrame or dask.Series
            A dask DataFrame/Series partitioned along the index
        """
        try:
            from dask import dataframe
            keys = kwargs.pop('keys', None)
            return dataframe.from_pandas(self.to_pandas(keys=keys), **kwargs)
        except ImportError as error:
            print("Dask import error")
            raise error 
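As the docstring above notes, the split can be controlled either by npartitions or by chunksize; in dask.dataframe.from_pandas exactly one of the two may be given. A short illustrative sketch of the difference (values are hypothetical, not from pyphot):

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"a": range(100)})

# Request a fixed number of partitions...
by_parts = dd.from_pandas(pdf, npartitions=4)  # ~25 rows per partition

# ...or a fixed number of rows per partition.
by_rows = dd.from_pandas(pdf, chunksize=25)    # yields 4 partitions

assert by_parts.npartitions == by_rows.npartitions == 4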
Example #23
Source File: test_text.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_correct_meta():
    vect = dask_ml.feature_extraction.text.HashingVectorizer()
        X = dd.from_pandas(pd.Series(["some text", "to classify"]), 2)
    result = vect.fit_transform(X)
    assert scipy.sparse.issparse(result._meta)
    assert result._meta.dtype == "float64"
    assert result._meta.shape == (0, 0) 
Example #24
Source File: test_downstream.py    From elasticintel with GNU General Public License v3.0
def test_pyarrow(df):

    pyarrow = import_module('pyarrow')  # noqa
    table = pyarrow.Table.from_pandas(df)
    result = table.to_pandas()
    tm.assert_frame_equal(result, df) 
Example #25
Source File: test_downstream.py    From elasticintel with GNU General Public License v3.0
def test_dask(df):

    toolz = import_module('toolz')  # noqa
    dask = import_module('dask')  # noqa

    import dask.dataframe as dd

    ddf = dd.from_pandas(df, npartitions=3)
    assert ddf.A is not None
    assert ddf.compute() is not None 
Example #26
Source File: test_viirs_edr_active_fires.py    From satpy with GNU General Public License v3.0
def get_test_content(self):
        """Create fake test file content."""
        fake_file = io.StringIO(u'''\n\n\n\n\n\n\n\n\n\n\n\n\n\n
        24.64015007, -107.57017517,  317.38290405,   0.75,   0.75,   40,    4.28618050
        25.90660477, -100.06127167,  331.17962646,   0.75,   0.75,   81,   20.61096764''')

        platform_key = {"NPP": "Suomi-NPP", "J01": "NOAA-20", "J02": "NOAA-21"}

        self.platform_name = platform_key.get(self.filename_info['satellite_name'].upper(), "unknown")

        return dd.from_pandas(pd.read_csv(fake_file, skiprows=15, header=None,
                                          names=["latitude", "longitude",
                                                 "T4", "Along-scan", "Along-track",
                                                 "confidence_cat",
                                                 "power"]), chunksize=1) 
Example #27
Source File: test_viirs_edr_active_fires.py    From satpy with GNU General Public License v3.0
def get_test_content(self):
        """Create fake test file content."""
        fake_file = io.StringIO(u'''\n\n\n\n\n\n\n\n\n\n\n\n\n\n
        24.64015007, -107.57017517,  317.38290405,   0.75,   0.75,   40,    4.28618050
        25.90660477, -100.06127167,  331.17962646,   0.75,   0.75,   81,   20.61096764''')

        return dd.from_pandas(pd.read_csv(fake_file, skiprows=15, header=None,
                                          names=["latitude", "longitude",
                                                 "T13", "Along-scan", "Along-track",
                                                 "confidence_pct",
                                                 "power"]), chunksize=1) 
Example #28
Source File: test_persistence_gcs.py    From bionic with Apache License 2.0
def test_multifile_serialization(gcs_builder, make_counter):
    call_counter = make_counter()
    builder = gcs_builder

    dask_df = dd.from_pandas(
        df_from_csv_str(
            """
            color,number
            red,1
            blue,2
            green,3
            """
        ),
        npartitions=1,
    )

    @builder
    @bn.protocol.dask
    @count_calls(call_counter)
    def df():
        return dask_df

    flow = builder.build()
    local_cache_path_str = flow.get("core__persistent_cache__flow_dir")

    assert equal_frame_and_index_content(flow.get("df").compute(), dask_df.compute())
    assert equal_frame_and_index_content(flow.get("df").compute(), dask_df.compute())
    assert call_counter.times_called() == 1

    local_wipe_path(local_cache_path_str)
    flow = builder.build()

    assert equal_frame_and_index_content(flow.get("df").compute(), dask_df.compute())
    assert call_counter.times_called() == 0 
Example #29
Source File: test_impute.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_impute_most_frequent():
    # https://github.com/dask/dask-ml/issues/385
    data = dd.from_pandas(pd.DataFrame([1, 1, 1, 1, np.nan, np.nan]), 2)
    model = dask_ml.impute.SimpleImputer(strategy="most_frequent")
    result = model.fit_transform(data)
    expected = dd.from_pandas(pd.DataFrame({0: [1.0] * 6}), 2)
    dd.utils.assert_eq(result, expected)
    assert model.statistics_[0] == 1.0