Python dask.dataframe() Examples

The following are 30 code examples of dask.dataframe. You can go to the original project or source file by following the links above each example. You may also want to check out the other available functions and classes of the dask module.
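Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of how a dask dataframe is typically created and evaluated; the column names are made up for illustration.

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": range(10), "y": list("aabbccddee")})
ddf = dd.from_pandas(pdf, npartitions=2)   # lazy, partitioned dataframe
result = ddf.groupby("y")["x"].mean()      # still lazy
print(result.compute())                    # .compute() returns a concrete pandas Series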
Example #1
Source File: load.py    From predictatops with MIT License 7 votes
def turn_dict_of_well_dfs_to_single_df(dictOfWellDf):
    """
    Takes a dict of dataframes, where each dataframe holds a single well created by LASIO
    (likely produced by the load_all_wells_in function as the first item in its returned list),
    and returns a single dataframe of all wells with a "UWI" column added.
    """
    # start by creating empty dataframe and list
    data_df = pd.DataFrame()
    list_of_df = []
    keys = list(dictOfWellDf.keys())
    # get dict of well data frames into values format
    values = dictOfWellDf.values()
    # go through each item in values and add to a list
    count = 0
    for each in values:
        each["UWI"] = keys[count]
        count += 1
        list_of_df.append(each)
    # concat the list into a single dataframe
    data_df = pd.concat(list_of_df)
    return data_df 
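A short usage sketch for the function above; the well names and curve values are hypothetical stand-ins for LASIO output.

import pandas as pd

wells = {
    "well_A": pd.DataFrame({"DEPT": [100.0, 100.5], "GR": [55.1, 60.3]}),
    "well_B": pd.DataFrame({"DEPT": [200.0, 200.5], "GR": [70.2, 68.9]}),
}
combined = turn_dict_of_well_dfs_to_single_df(wells)
print(combined[["UWI", "DEPT", "GR"]])  # one row per depth step, tagged with its well key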
Example #2
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0 6 votes
def normalize(cls, df, target_var, mean_list, stddev_list):
        """Normalizes the numerical columns in a dataframe.

        Arguments:
                df : dask dataframe, The dataframe to normalize
                target_var : string, Dependent variable for the analysis
                mean_list : dask series, Series with all the mean values
                stddev_list : dask series, Series with all the standard deviation values

        Returns:
                df : Dataframe with mean normalized numerical columns
        """
        continuous_cols = [
            col for col in df.columns if df[col].dtype != 'object' and col != target_var]
        for col in continuous_cols:
            df[col] = df[col].sub(mean_list[col]).div(stddev_list[col])

        return df 
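A usage sketch under stated assumptions: InputReader is a hypothetical stand-in for the class in input_pipeline_dask.py that defines this classmethod, and the column values are invented.

import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(
    pd.DataFrame({"feature": [1.0, 2.0, 3.0, 4.0], "label": [0, 1, 0, 1]}),
    npartitions=2,
)
mean, stddev = dask.compute(ddf.mean(), ddf.std())
# InputReader stands in for the (unnamed here) class defining normalize().
normalized = InputReader.normalize(ddf, target_var="label",
                                   mean_list=mean, stddev_list=stddev)
print(normalized.compute())  # "feature" is mean/stddev normalized, "label" is untouched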
Example #3
Source File: test_data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 6 votes
def test_transformed_shape(self):
        # checks if the transformed objects have the correct columns
        a = dpp.PolynomialFeatures()
        a.fit(X)
        n_cols = len(a.get_feature_names())
        # dask array
        assert a.transform(X).shape[1] == n_cols
        # numpy array
        assert a.transform(X.compute()).shape[1] == n_cols
        # dask dataframe
        assert a.transform(df).shape[1] == n_cols
        # pandas dataframe
        assert a.transform(df.compute()).shape[1] == n_cols
        X_nan_rows = df.values
        df_none_divisions = X_nan_rows.to_dask_dataframe(columns=df.columns)
        # dask array with nan rows
        assert a.transform(X_nan_rows).shape[1] == n_cols
        # dask data frame with nan rows
        assert a.transform(df_none_divisions).shape[1] == n_cols 
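A hedged, standalone sketch of the objects this test works with (dpp is dask_ml.preprocessing); note that on recent scikit-learn releases get_feature_names has been superseded by get_feature_names_out.

import dask.array as da
import dask.dataframe as dd
import dask_ml.preprocessing as dpp

X = da.random.uniform(size=(100, 3), chunks=(50, 3))   # dask array
df = dd.from_dask_array(X, columns=["a", "b", "c"])    # dask dataframe

pf = dpp.PolynomialFeatures(degree=2)
pf.fit(X)
print(pf.transform(X).shape[1])    # number of generated polynomial columns
print(pf.transform(df).shape[1])   # same width when transforming the dask dataframe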
Example #4
Source File: test_parallel_post_fit.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 6 votes
def test_transform(kind):
    X, y = make_classification(chunks=100)

    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = PCA(random_state=0)
    wrap = ParallelPostFit(PCA(random_state=0))

    base.fit(X, y)
    wrap.fit(X, y)

    assert_estimator_equal(wrap.estimator, base)

    result = base.transform(X)
    expected = wrap.transform(X)
    assert_eq_ar(result, expected) 
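A hedged, self-contained sketch of the wrapper being tested; make_classification here is dask_ml.datasets.make_classification, which returns chunked dask arrays.

from sklearn.decomposition import PCA
from dask_ml.datasets import make_classification
from dask_ml.wrappers import ParallelPostFit

X, y = make_classification(chunks=100)        # dask arrays
wrap = ParallelPostFit(PCA(random_state=0))
wrap.fit(X, y)                                # fitting itself is not parallelised
Xt = wrap.transform(X)                        # lazy, blockwise transform
print(Xt.shape)
print(Xt[:2].compute())                       # materialise a small slice to inspect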
Example #5
Source File: test_update.py    From kartothek with MIT License 6 votes
def test_hash_bucket(col, num_buckets=5):
    df = pd.DataFrame(
        {
            "range": np.arange(10),
            "range_duplicated": np.repeat(np.arange(2), 5),
            "random": np.random.randint(0, 100, 10),
        }
    )
    hashed = _hash_bucket(df, [col], num_buckets)
    assert (hashed.groupby(col).agg({_KTK_HASH_BUCKET: "nunique"}) == 1).all().all()

    # Check that hashing is consistent for small dataframe sizes (where df.col.nunique() < num_buckets)
    df_sample = df.iloc[[0, 7]]
    hashed_sample = _hash_bucket(df_sample, [col], num_buckets)
    expected = hashed.loc[df_sample.index]
    pdt.assert_frame_equal(expected, hashed_sample) 
Example #6
Source File: dataframe.py    From kartothek with MIT License 6 votes
def _get_dask_meta_for_dataset(
    ds_factory, table, columns, categoricals, dates_as_object
):
    """
    Calculate a schema suitable for the dask dataframe meta from the dataset.
    """
    table_schema = ds_factory.table_meta[table]
    meta = empty_dataframe_from_schema(
        table_schema, columns=columns, date_as_object=dates_as_object
    )

    if categoricals:
        meta = meta.astype({col: "category" for col in categoricals})
        meta = dd.utils.clear_known_categories(meta, categoricals)

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, {table: categoricals}
    )
    if categoricals_from_index:
        meta = meta.astype(categoricals_from_index[table])
    return meta 
Example #7
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0 6 votes
def calculate_stats(cls, df, target_var):
        """Calculates descriptive stats of the dataframe required for cleaning.

        Arguments:
                df : dask dataframe, The dataframe at hand
                target_var : string, Dependent variable for the analysis

        Returns:
                mean : dask series, mean of each column
                median : dask series, median of each column
                dict(zip(categorical_cols, mode)) : dict, Dictionary containing
                        categorical column as keys and their modes as values
                std : dask series, standard deviation of each column
        """
        categorical_columns = [
            col for col in df.columns if col != target_var and df[col].dtype == 'object']
        mean_op = df.mean()
        std_op = df.std()
        median_op = df.quantile(0.5)
        mode_op = [df[col].value_counts().idxmax()
                   for col in categorical_columns]
        mean, median, mode, std = dask.compute(
            mean_op, median_op, mode_op, std_op)
        return mean, median, dict(zip(categorical_columns, mode)), std 
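The method above relies on batching several lazy reductions into a single dask.compute call so the underlying data is only walked once; a minimal illustration with made-up columns:

import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(
    pd.DataFrame({"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]}), npartitions=2
)
# Three lazy reductions evaluated together in one pass.
mean, median, mode = dask.compute(
    ddf["a"].mean(), ddf["a"].quantile(0.5), ddf["b"].value_counts().idxmax()
)
print(mean, median, mode)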
Example #8
Source File: test_base.py    From intake with BSD 2-Clause "Simplified" License 6 votes
def test_datasource_discover(source_dataframe):
    r = source_dataframe.discover()

    assert source_dataframe.container == 'dataframe'

    row_dtype = np.dtype([('x', np.int64), ('y', np.int64)])
    assert r == {
        'datashape': 'datashape',
        'dtype': row_dtype,
        'shape': (6,),
        'npartitions': 2,
        'metadata': dict(a=1, b=2, c=3, d=4),
    }

    # check attributes have been set
    assert source_dataframe.datashape == 'datashape'
    assert source_dataframe.dtype == row_dtype
    assert source_dataframe.shape == (6,)
    assert source_dataframe.npartitions == 2
    assert source_dataframe.metadata == dict(a=1, b=2, c=3, d=4)

    # check that _get_schema is only called once
    assert source_dataframe.call_count['_get_schema'] == 1
    source_dataframe.discover()
    assert source_dataframe.call_count['_get_schema'] == 1 
Example #9
Source File: test_dask.py    From eliot with Apache License 2.0 5 votes
def test_persist_pandas(self):
        """persist_with_trace() with a Pandas dataframe.

        This ensures we don't blow up, which used to be the case.
        """
        df = pd.DataFrame()
        df = dd.from_pandas(df, npartitions=1)
        persist_with_trace(df) 
Example #10
Source File: csv.py    From intake with BSD 2-Clause "Simplified" License 5 votes
def _open_dataset(self, urlpath):
        """Open dataset using dask and use pattern fields to set new columns
        """
        import dask.dataframe

        if self.pattern is None:
            self._dataframe = dask.dataframe.read_csv(
                urlpath, storage_options=self._storage_options,
                **self._csv_kwargs)
            return

        if not (DASK_VERSION >= '0.19.0'):
            raise ValueError("Your version of dask is '{}'. "
                "The ability to include filenames in read_csv output "
                "(``include_path_column``) was added in 0.19.0, so "
                "pattern urlpaths are not supported.".format(DASK_VERSION))

        drop_path_column = 'include_path_column' not in self._csv_kwargs
        path_column = self._path_column()

        self._dataframe = dask.dataframe.read_csv(
            urlpath, storage_options=self._storage_options, **self._csv_kwargs)

        # add the new columns to the dataframe
        self._set_pattern_columns(path_column)

        if drop_path_column:
            self._dataframe = self._dataframe.drop([path_column], axis=1) 
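For reference, the dask call this driver builds on looks roughly like the sketch below (the glob path is hypothetical); include_path_column, added in dask 0.19.0, is what makes the filename available for pattern fields.

import dask.dataframe as dd

# "data/part-*.csv" is a made-up path for illustration.
ddf = dd.read_csv("data/part-*.csv", include_path_column="path")
print(ddf.columns)   # the original CSV columns plus the added "path" column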
Example #11
Source File: utils.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 5 votes
def _pandas_indexing(X, key, key_dtype, axis):
    """Index a pandas dataframe or a series."""
    if hasattr(key, "shape"):
        # Work-around for indexing with read-only key in pandas
        # FIXME: solved in pandas 0.25
        key = np.asarray(key)
        key = key if key.flags.writeable else key.copy()
    # check whether we should index with loc or iloc
    indexer = X.iloc if key_dtype == "int" else X.loc
    return indexer[:, key] if axis else indexer[key] 
Example #12
Source File: utils.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 5 votes
def _num_samples(X):
    result = sk_validation._num_samples(X)
    if dask.is_dask_collection(result):
        # dask dataframe
        result = result.compute()
    return result 
Example #13
Source File: text.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 5 votes
def transform(self, raw_X):
        """Transform a sequence of documents to a document-term matrix.

        Transformation is done in parallel, and correctly handles dask
        collections.

        Parameters
        ----------
        raw_X : dask.bag.Bag or dask.dataframe.Series, length = n_samples
            Each sample must be a text document (either bytes or
            unicode strings, file name or file object depending on the
            constructor argument) which will be tokenized and hashed.

        Returns
        -------
        X : dask.array.Array, shape = (n_samples, self.n_features)
            Document-term matrix. Each block of the array is a scipy sparse
            matrix.

        Notes
        -----
        The returned dask Array is composed of scipy sparse matrices. If you need
        to compute on the result immediately, you may need to convert the individual
        blocks to ndarrays or pydata/sparse matrices.

        >>> import sparse
        >>> X.map_blocks(sparse.COO.from_scipy_sparse, dtype=X.dtype)  # doctest: +SKIP

        See the :doc:`examples/text-vectorization` for more.
        """
        return super().transform(raw_X) 
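A minimal sketch of calling this transform on a dask collection, assuming (given the hashing note in the docstring) that the enclosing class is dask_ml.feature_extraction.text.HashingVectorizer:

import dask.bag as db
from dask_ml.feature_extraction.text import HashingVectorizer

docs = db.from_sequence(
    ["the quick brown fox", "jumped over the lazy dog"], npartitions=2
)
X = HashingVectorizer().transform(docs)   # dask Array whose blocks are scipy sparse matrices
print(X.dtype, X.numblocks)               # lazy; nothing is hashed until computed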
Example #14
Source File: _blockwise.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 5 votes
def _collect_probas(self, X):
        if isinstance(X, da.Array):
            chunks = (len(self.estimators_), X.chunks[0], len(self.classes_))
            meta = np.array([], dtype="float64")
            # (n_estimators, len(X), n_classes)
            combined = X.map_blocks(
                _predict_proba_stack,
                estimators=self.estimators_,
                chunks=chunks,
                meta=meta,
            )
        elif isinstance(X, dd._Frame):
            # TODO: replace with a _predict_proba_stack version.
            # This currently raises; dask.dataframe doesn't like map_partitions that
            # return new axes.
            # meta = np.empty((len(self.estimators_), 0, len(self.classes_)),
            #                 dtype="float64")
            # combined = X.map_partitions(_predict_proba_stack, meta=meta,
            #                             estimators=self.estimators_)
            # combined._chunks = ((len(self.estimators_),),
            #                     (np.nan,) * X.npartitions,
            #                     (len(X.columns),))
            meta = np.empty((0, len(self.classes_)), dtype="float64")
            probas = [
                X.map_partitions(_predict_proba, meta=meta, estimator=estimator)
                for estimator in self.estimators_
            ]
            # TODO(https://github.com/dask/dask/issues/6177): replace with da.stack
            chunks = probas[0]._chunks
            for proba in probas:
                proba._chunks = ((1,) * len(chunks[0]), chunks[1])

            combined = da.stack(probas)
            combined._chunks = ((1,) * len(self.estimators_),) + chunks
        else:
            # ndarray, etc.
            combined = np.stack(
                [estimator.predict_proba(X) for estimator in self.estimators_]
            )

        return combined 
Example #15
Source File: test_read.py    From kartothek with MIT License 5 votes
def test_reconstruct_dask_index_sorting(store_factory, monkeypatch):

    # Make sure we're not shuffling anything
    monkeypatch.delattr(
        dask.dataframe.shuffle, dask.dataframe.shuffle.set_index.__name__
    )
    dataset_uuid = "dataset_uuid"
    colA = "ColumnA"
    colB = "ColumnB"

    df = pd.DataFrame(
        {colA: np.random.randint(high=100000, low=-100000, size=(50,)), colB: 0}
    )
    store_dataframes_as_dataset(
        store=store_factory, dataset_uuid=dataset_uuid, dfs=[df], partition_on=colA
    )
    ddf = read_dataset_as_ddf(
        dataset_uuid=dataset_uuid,
        store=store_factory,
        table="table",
        dask_index_on=colA,
    )

    assert all(
        ddf.map_partitions(lambda df: df.index.min()).compute().values
        == ddf.divisions[:-1]
    ) 
Example #16
Source File: test_update.py    From kartothek with MIT License 5 votes
def test_pack_payload_pandas_empty(df_all_types):
    # For a single row dataframe the packing actually has a few more bytes
    df_empty = df_all_types.iloc[:0]

    group_key = [df_all_types.columns[-1]]
    pdt.assert_frame_equal(
        df_empty,
        unpack_payload_pandas(
            pack_payload_pandas(df_empty, group_key=group_key), unpack_meta=df_empty
        ),
    ) 
Example #17
Source File: test_update.py    From kartothek with MIT License 5 votes
def test_pack_payload_pandas(df_all_types):
    # For a single row dataframe the packing actually has a few more bytes
    df = pd.concat([df_all_types] * 10, ignore_index=True)
    size_before = df.memory_usage(deep=True).sum()

    packed_df = pack_payload_pandas(df, group_key=list(df.columns[-2:]))

    size_after = packed_df.memory_usage(deep=True).sum()

    assert size_after < size_before 
Example #18
Source File: test_update.py    From kartothek with MIT License 5 votes
def test_pack_payload(df_all_types):
    # For a single row dataframe the packing actually has a few more bytes
    df = dd.from_pandas(
        pd.concat([df_all_types] * 10, ignore_index=True), npartitions=3
    )
    size_before = df.memory_usage(deep=True).sum()

    packed_df = pack_payload(df, group_key=list(df.columns[-2:]))

    size_after = packed_df.memory_usage(deep=True).sum()

    assert (size_after < size_before).compute() 
Example #19
Source File: test_update.py    From kartothek with MIT License 5 votes
def test_update_dataset_from_ddf_empty(store_factory, shuffle):
    with pytest.raises(ValueError, match="Cannot store empty datasets"):
        update_dataset_from_ddf(
            dask.dataframe.from_delayed([], meta=(("a", int),)),
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=shuffle,
            partition_on=["a"],
        ).compute() 
Example #20
Source File: dask.py    From PyMove with MIT License 5 votes
def to_data_frame(self):
        """
        Converts trajectory data to DataFrame format.

        Returns
        -------
        dask.dataframe.DataFrame
            Represents the trajectory in DataFrame format.

        """

        return self._data 
Example #21
Source File: dask.py    From PyMove with MIT License 5 votes
def generate_weekend_features(self):
        """Create or update the feature weekend to the dataframe."""
        raise NotImplementedError('To be implemented') 
Example #22
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0 5 votes
def find_vocab(self, df):
        """Finds the number of levels in each categorical column.
        Helps for creation of feature columns for use in tf.data API

        Arguments:
          df : dask dataframe, Dataframe to extract the levels from

        Returns:
                A dictionary of column names and the levels in each variables
                        [ 0 for numerical columns and number of levels for categorical columns]
        """
        self.is_not_used()
        cat_columns = [
            col for col in df.columns if df[col].dtype == 'object']
        continuous_cols = [
            col for col in df.columns if df[col].dtype != 'object']
        temp = dask.compute([df[col].drop_duplicates() for col in cat_columns])

        column_mapping = dict()

        for col in continuous_cols:
            column_mapping[col] = 0

        for index, col in enumerate(cat_columns):
            column_mapping[col] = np.array(temp[0][index])

        return column_mapping 
Example #23
Source File: dask.py    From PyMove with MIT License 5 votes
def show_trajectories_info(self):
        """Show dataset information from dataframe."""
        raise NotImplementedError('To be implemented') 
Example #24
Source File: dask.py    From PyMove with MIT License 5 votes
def max(self, axis=None, skipna=True, split_every=False, out=None):
        """
        Return the maximum of the values for the requested axis.

        Parameters
        ----------
        axis: int, optional, default None, {index (0), columns (1)}.
            Axis for the function to be applied on.
        skipna: bool, optional, default True.
            Exclude NA/null values when computing the result.
        split_every:
            ?
        out:
            ?

        Returns
        -------
        max: Series or DataFrame (if level specified)
            The maximum values for the requested axis.

        References
        ----------
        https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.max

        """

        return self._data.max(axis, skipna, split_every, out) 
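The wrapper simply forwards to dask; as a reminder of what the underlying call returns (the sample coordinates are invented):

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(
    pd.DataFrame({"lat": [39.98, 39.99], "lon": [116.31, 116.32]}), npartitions=1
)
print(ddf.max())            # lazy dask Series
print(ddf.max().compute())  # concrete pandas Series with per-column maxima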
Example #25
Source File: load.py    From predictatops with MIT License 5 votes
def makeDF(well_list):
    """
    Changes format of well list into a pandas dataframe with one column called "UWI_file".
    """
    formatted_well_list = []
    for eachW in well_list:
        formatted_well_list.append({"UWI_file": eachW})
    wells_df = pd.DataFrame(formatted_well_list)
    return wells_df 
Example #26
Source File: dask.py    From PyMove with MIT License 5 votes
def min(self, axis=None, skipna=True, split_every=False, out=None):
        """
        Return the minimum of the values for the requested axis.

        Parameters
        ----------
        axis: int, optional, default None, {index (0), columns (1)}.
            Axis for the function to be applied on.
        skipna: bool, optional, default True.
            Exclude NA/null values when computing the result.
        split_every:
            ?
        out:
            ?

        Returns
        -------
        min: Series or DataFrame (if level specified)
            The minimum values for the requested axis.

        References
        ----------
        https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.min

        """

        return self._data.min(axis, skipna, split_every, out) 
Example #27
Source File: predictionclasses.py    From predictatops with MIT License 5 votes
def predict_from_model(self, model, df_X_toPredict):
        """
        The predict_from_model function takes as arguments a model that is already trained on training data
        (in the demo case a scikit-learn-style XGBoost model) and a dataframe of the columns to predict from.
        It fills in the self.result_df_from_prediction attribute and returns nothing.
    
        """
        self.result_df_from_prediction = model.predict(df_X_toPredict) 
Example #28
Source File: predictionclasses.py    From predictatops with MIT License 5 votes
def __init__(self, ML, vs, distClassDF_wRollingCols_training):
        # self.knn_dir = ML.knn_dir
        # self.load_dir = ML.load_dir
        # self.features_dir = ML.features_dir
        # self.machine_learning_dir = ML.machine_learning_dir
        # self.h5_to_load = ML.h5_to_load
        self.train_X = ML.train_X
        self.train_y = ML.train_y
        self.test_X = ML.test_X
        self.test_y = ML.test_y
        self.train_index = ML.train_index
        self.test_index = ML.test_index
        self.preSplitpreBal = ML.preSplitpreBal
        self.result_df_from_prediction = None  # df
        ####
        ####
        self.vs = vs  # object instance from variables class
        self.depth_str = vs["depth_str"]
        self.pick_class_str = vs["pick_class_str"]
        self.UWI_str = vs["UWI_str"]
        self.rollingWindows = vs["rollingWindows"]
        self.distClassIntegersArray = vs["distClassIntegersArray"]
        ####
        self.calc_pred = distClassDF_wRollingCols_training
        self.excludeWellsThatOnlyHaveTheseClasses = (
            []
        )  ### aka dropIfOnlyClasses in optionallyExcludeWellsWithoutStrongPredictions()
        self.NoGoodWellsToExclude = (
            []
        )  #### UWIs of wells that only had zeros in the predicted distance class so these wells were excluded from accuracy prediction
        ####
        self.calc_pred_TopMcMr_Pick_pred_DEPT_pred = None  # df
        self.calc_pred_TopTarget_DEPTH = None  # df
        self.fullUWIsSet = []  ### set of UWIs in the dataframe
        self.precentWellsKept = 1
        self.UWIsSetSubsetKept = (
            []
        )  #### subset of the wells that have predictions that aren't just zero or something else not wanted

    ## if there are zeros, calc_pred is changed to a version without zeros and the zerosExcluded array is populated 
Example #29
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0 5 votes
def drop_cols(cls, df, col_names):
        """Drops any columns which are not required by the user.

        Arguments:
                df : dask dataframe, Dataframe of input data
                col_names : list, Columns in the data to be dropped

        Returns:
                dask dataframe, Updated dataframe with columns dropped
        """
        return df.drop(col_names, axis=1) 
Example #30
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0 5 votes
def dropping_zero_var_cols(cls, df, target_var, stddev_list):
        """Check columns which have zero variance and removes the from the dataframe.
            As the zero variance columns or contant columns can't be considered as output column

        Arguments:
                df : dask dataframe, The dataframe to validate
                stddev_list : dask series, Series containing the standard deviation values for the columns
                target_var : string, Dependent variable for the analysis

        Returns:
                df : dask dataframe, Dataframe with redundant columns removed

        Raises:
                AssertionError : If the target column has zero deviation
        """
        continuous_cols = [
            col for col in df.columns if df[col].dtype != 'object']
        for col in continuous_cols:
            if stddev_list[col] == 0.0:
                df = df.drop(col, axis=1)
                if col == target_var:
                    err_msg = 'Target variable has zero standard deviation or is a constant column. ' \
                              'Please check the data'
                    tf.logging.error(err_msg)
                    raise AssertionError(err_msg)
        return df
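A usage sketch under stated assumptions: InputReader is a hypothetical stand-in for the class in input_pipeline_dask.py that defines this classmethod, and the column values are invented.

import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(
    pd.DataFrame(
        {"constant": [1.0, 1.0, 1.0], "feature": [1.0, 2.0, 3.0], "label": [0, 1, 0]}
    ),
    npartitions=1,
)
(stddev,) = dask.compute(ddf.std())
# "constant" has zero standard deviation, so it gets dropped; "feature" and "label" are kept.
cleaned = InputReader.dropping_zero_var_cols(ddf, target_var="label", stddev_list=stddev)
print(list(cleaned.columns))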