Python dask.dataframe() Examples

The following are 30 code examples of dask.dataframe. You can go to the original project or source file by following the links above each example. You may also want to check out the other available functions and classes of the dask module.
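Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of how a dask dataframe is typically created and evaluated; the column names are made up for illustration.

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": range(10), "y": list("aabbccddee")})
ddf = dd.from_pandas(pdf, npartitions=2)   # lazy, partitioned dataframe
result = ddf.groupby("y")["x"].mean()      # still lazy
print(result.compute())                    # .compute() returns a concrete pandas Series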
Example #1
Source File: load.py    From predictatops with MIT License 7 votes
def turn_dict_of_well_dfs_to_single_df(dictOfWellDf):
    """
    Takes a dict of dataframes, where each dataframe holds a single well created by LASIO
    (likely produced by the load_all_wells_in function as the first item in its returned list),
    and returns a single dataframe of all wells with a "UWI" column added.
    """
    # start by creating empty dataframe and list
    data_df = pd.DataFrame()
    list_of_df = []
    keys = list(dictOfWellDf.keys())
    # get dict of well data frames into values format
    values = dictOfWellDf.values()
    # go through each item in values and add to a list
    count = 0
    for each in values:
        each["UWI"] = keys[count]
        count += 1
        list_of_df.append(each)
    # concat the list into a single dataframe
    data_df = pd.concat(list_of_df)
    return data_df 
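A short usage sketch for the function above; the well names and curve values are hypothetical stand-ins for LASIO output.

import pandas as pd

wells = {
    "well_A": pd.DataFrame({"DEPT": [100.0, 100.5], "GR": [55.1, 60.3]}),
    "well_B": pd.DataFrame({"DEPT": [200.0, 200.5], "GR": [70.2, 68.9]}),
}
combined = turn_dict_of_well_dfs_to_single_df(wells)
print(combined[["UWI", "DEPT", "GR"]])  # one row per depth step, tagged with its well key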
Example #2
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0 6 votes
def normalize(cls, df, target_var, mean_list, stddev_list):
        """Normalizes the numerical columns in a dataframe.

        Arguments:
                df : dask dataframe, The dataframe to normalize
                target_var : string, Dependent variable for the analysis
                mean_list : dask series, Series with all the mean values
                stddev_list : dask series, Series with all the standard deviation values

        Returns:
                df : Dataframe with mean normalized numerical columns
        """
        continuous_cols = [
            col for col in df.columns if df[col].dtype != 'object' and col != target_var]
        for col in continuous_cols:
            df[col] = df[col].sub(mean_list[col]).div(stddev_list[col])

        return df 
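A usage sketch under stated assumptions: InputReader is a hypothetical stand-in for the class in input_pipeline_dask.py that defines this classmethod, and the column values are invented.

import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(
    pd.DataFrame({"feature": [1.0, 2.0, 3.0, 4.0], "label": [0, 1, 0, 1]}),
    npartitions=2,
)
mean, stddev = dask.compute(ddf.mean(), ddf.std())
# InputReader stands in for the (unnamed here) class defining normalize().
normalized = InputReader.normalize(ddf, target_var="label",
                                   mean_list=mean, stddev_list=stddev)
print(normalized.compute())  # "feature" is mean/stddev normalized, "label" is untouched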
Example #3
Source File: test_data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 6 votes
def test_transformed_shape(self):
        # checks if the transformed objects have the correct columns
        a = dpp.PolynomialFeatures()
        a.fit(X)
        n_cols = len(a.get_feature_names())
        # dask array
        assert a.transform(X).shape[1] == n_cols
        # numpy array
        assert a.transform(X.compute()).shape[1] == n_cols
        # dask dataframe
        assert a.transform(df).shape[1] == n_cols
        # pandas dataframe
        assert a.transform(df.compute()).shape[1] == n_cols
        X_nan_rows = df.values
        df_none_divisions = X_nan_rows.to_dask_dataframe(columns=df.columns)
        # dask array with nan rows
        assert a.transform(X_nan_rows).shape[1] == n_cols
        # dask data frame with nan rows
        assert a.transform(df_none_divisions).shape[1] == n_cols 
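A hedged, standalone sketch of the objects this test works with (dpp is dask_ml.preprocessing); note that on recent scikit-learn releases get_feature_names has been superseded by get_feature_names_out.

import dask.array as da
import dask.dataframe as dd
import dask_ml.preprocessing as dpp

X = da.random.uniform(size=(100, 3), chunks=(50, 3))   # dask array
df = dd.from_dask_array(X, columns=["a", "b", "c"])    # dask dataframe

pf = dpp.PolynomialFeatures(degree=2)
pf.fit(X)
print(pf.transform(X).shape[1])    # number of generated polynomial columns
print(pf.transform(df).shape[1])   # same width when transforming the dask dataframe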
Example #4
Source File: test_parallel_post_fit.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 6 votes
def test_transform(kind):
    X, y = make_classification(chunks=100)

    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = PCA(random_state=0)
    wrap = ParallelPostFit(PCA(random_state=0))

    base.fit(X, y)
    wrap.fit(X, y)

    assert_estimator_equal(wrap.estimator, base)

    result = base.transform(X)
    expected = wrap.transform(X)
    assert_eq_ar(result, expected) 
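A hedged, self-contained sketch of the wrapper being tested; make_classification here is dask_ml.datasets.make_classification, which returns chunked dask arrays.

from sklearn.decomposition import PCA
from dask_ml.datasets import make_classification
from dask_ml.wrappers import ParallelPostFit

X, y = make_classification(chunks=100)        # dask arrays
wrap = ParallelPostFit(PCA(random_state=0))
wrap.fit(X, y)                                # fitting itself is not parallelised
Xt = wrap.transform(X)                        # lazy, blockwise transform
print(Xt.shape)
print(Xt[:2].compute())                       # materialise a small slice to inspect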
Example #5
Source File: test_update.py    From kartothek with MIT License 6 votes
def test_hash_bucket(col, num_buckets=5):
    df = pd.DataFrame(
        {
            "range": np.arange(10),
            "range_duplicated": np.repeat(np.arange(2), 5),
            "random": np.random.randint(0, 100, 10),
        }
    )
    hashed = _hash_bucket(df, [col], num_buckets)
    assert (hashed.groupby(col).agg({_KTK_HASH_BUCKET: "nunique"}) == 1).all().all()

    # Check that hashing is consistent for small dataframe sizes (where df.col.nunique() < num_buckets)
    df_sample = df.iloc[[0, 7]]
    hashed_sample = _hash_bucket(df_sample, [col], num_buckets)
    expected = hashed.loc[df_sample.index]
    pdt.assert_frame_equal(expected, hashed_sample) 
Example #6
Source File: dataframe.py    From kartothek with MIT License 6 votes
def _get_dask_meta_for_dataset(
    ds_factory, table, columns, categoricals, dates_as_object
):
    """
    Calculate a schema suitable for the dask dataframe meta from the dataset.
    """
    table_schema = ds_factory.table_meta[table]
    meta = empty_dataframe_from_schema(
        table_schema, columns=columns, date_as_object=dates_as_object
    )

    if categoricals:
        meta = meta.astype({col: "category" for col in categoricals})
        meta = dd.utils.clear_known_categories(meta, categoricals)

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, {table: categoricals}
    )
    if categoricals_from_index:
        meta = meta.astype(categoricals_from_index[table])
    return meta 
Example #7
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0 6 votes
def calculate_stats(cls, df, target_var):
        """Calculates descriptive stats of the dataframe required for cleaning.

        Arguments:
                df : dask dataframe, The dataframe at hand
                target_var : string, Dependent variable for the analysis

        Returns:
                mean : dask series, mean of each column
                median : dask series, median of each column
                dict(zip(categorical_cols, mode)) : dict, Dictionary containing
                        categorical column as keys and their modes as values
                std : dask series, standard deviation of each column
        """
        categorical_columns = [
            col for col in df.columns if col != target_var and df[col].dtype == 'object']
        mean_op = df.mean()
        std_op = df.std()
        median_op = df.quantile(0.5)
        mode_op = [df[col].value_counts().idxmax()
                   for col in categorical_columns]
        mean, median, mode, std = dask.compute(
            mean_op, median_op, mode_op, std_op)
        return mean, median, dict(zip(categorical_columns, mode)), std 
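The method above relies on batching several lazy reductions into a single dask.compute call so the underlying data is only walked once; a minimal illustration with made-up columns:

import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(
    pd.DataFrame({"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]}), npartitions=2
)
# Three lazy reductions evaluated together in one pass.
mean, median, mode = dask.compute(
    ddf["a"].mean(), ddf["a"].quantile(0.5), ddf["b"].value_counts().idxmax()
)
print(mean, median, mode)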
Example #8
Source File: test_base.py    From intake with BSD 2-Clause "Simplified" License 6 votes
def test_datasource_discover(source_dataframe):
    r = source_dataframe.discover()

    assert source_dataframe.container == 'dataframe'

    row_dtype = np.dtype([('x', np.int64), ('y', np.int64)])
    assert r == {
        'datashape': 'datashape',
        'dtype': row_dtype,
        'shape': (6,),
        'npartitions': 2,
        'metadata': dict(a=1, b=2, c=3, d=4),
    }

    # check attributes have been set
    assert source_dataframe.datashape == 'datashape'
    assert source_dataframe.dtype == row_dtype
    assert source_dataframe.shape == (6,)
    assert source_dataframe.npartitions == 2
    assert source_dataframe.metadata == dict(a=1, b=2, c=3, d=4)

    # check that _get_schema is only called once
    assert source_dataframe.call_count['_get_schema'] == 1
    source_dataframe.discover()
    assert source_dataframe.call_count['_get_schema'] == 1 
Example #9
Source File: test_dask.py    From eliot with Apache License 2.0 5 votes
def test_persist_pandas(self):
        """persist_with_trace() with a Pandas dataframe.

        This ensures we don't blow up, which used to be the case.
        """
        df = pd.DataFrame()
        df = dd.from_pandas(df, npartitions=1)
        persist_with_trace(df) 
Example #10
Source File: csv.py    From intake with BSD 2-Clause "Simplified" License 5 votes
def _open_dataset(self, urlpath):
        """Open dataset using dask and use pattern fields to set new columns
        """
        import dask.dataframe

        if self.pattern is None:
            self._dataframe = dask.dataframe.read_csv(
                urlpath, storage_options=self._storage_options,
                **self._csv_kwargs)
            return

        if not (DASK_VERSION >= '0.19.0'):
            raise ValueError("Your version of dask is '{}'. "
                "The ability to include filenames in read_csv output "
                "(``include_path_column``) was added in 0.19.0, so "
                "pattern urlpaths are not supported.".format(DASK_VERSION))

        drop_path_column = 'include_path_column' not in self._csv_kwargs
        path_column = self._path_column()

        self._dataframe = dask.dataframe.read_csv(
            urlpath, storage_options=self._storage_options, **self._csv_kwargs)

        # add the new columns to the dataframe
        self._set_pattern_columns(path_column)

        if drop_path_column:
            self._dataframe = self._dataframe.drop([path_column], axis=1) 
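For reference, the dask call this driver builds on looks roughly like the sketch below (the glob path is hypothetical); include_path_column, added in dask 0.19.0, is what makes the filename available for pattern fields.

import dask.dataframe as dd

# "data/part-*.csv" is a made-up path for illustration.
ddf = dd.read_csv("data/part-*.csv", include_path_column="path")
print(ddf.columns)   # the original CSV columns plus the added "path" column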
Example #11
Source File: utils.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 5 votes
def _pandas_indexing(X, key, key_dtype, axis):
    """Index a pandas dataframe or a series."""
    if hasattr(key, "shape"):
        # Work-around for indexing with read-only key in pandas
        # FIXME: solved in pandas 0.25
        key = np.asarray(key)
        key = key if key.flags.writeable else key.copy()
    # check whether we should index with loc or iloc
    indexer = X.iloc if key_dtype == "int" else X.loc
    return indexer[:, key] if axis else indexer[key] 
Example #12
Source File: utils.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 5 votes
def _num_samples(X):
    result = sk_validation._num_samples(X)
    if dask.is_dask_collection(result):
        # dask dataframe
        result = result.compute()
    return result 
Example #13
Source File: text.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 5 votes
def transform(self, raw_X):
        """Transform a sequence of documents to a document-term matrix.

        Transformation is done in parallel, and correctly handles dask
        collections.

        Parameters
        ----------
        raw_X : dask.bag.Bag or dask.dataframe.Series, length = n_samples
            Each sample must be a text document (either bytes or
            unicode strings, file name or file object depending on the
            constructor argument) which will be tokenized and hashed.

        Returns
        -------
        X : dask.array.Array, shape = (n_samples, self.n_features)
            Document-term matrix. Each block of the array is a scipy sparse
            matrix.

        Notes
        -----
        The returned dask Array is composed of scipy sparse matrices. If you need
        to compute on the result immediately, you may need to convert the individual
        blocks to ndarrays or pydata/sparse matrices.

        >>> import sparse
        >>> X.map_blocks(sparse.COO.from_scipy_sparse, dtype=X.dtype)  # doctest: +SKIP

        See the :doc:`examples/text-vectorization` for more.
        """
        return super().transform(raw_X) 
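A minimal sketch of calling this transform on a dask collection, assuming (given the hashing note in the docstring) that the enclosing class is dask_ml.feature_extraction.text.HashingVectorizer:

import dask.bag as db
from dask_ml.feature_extraction.text import HashingVectorizer

docs = db.from_sequence(
    ["the quick brown fox", "jumped over the lazy dog"], npartitions=2
)
X = HashingVectorizer().transform(docs)   # dask Array whose blocks are scipy sparse matrices
print(X.dtype, X.numblocks)               # lazy; nothing is hashed until computed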
Example #14
Source File: _blockwise.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 5 votes
def _collect_probas(self, X):
        if isinstance(X, da.Array):
            chunks = (len(self.estimators_), X.chunks[0], len(self.classes_))
            meta = np.array([], dtype="float64")
            # (n_estimators, len(X), n_classes)
            combined = X.map_blocks(
                _predict_proba_stack,
                estimators=self.estimators_,
                chunks=chunks,
                meta=meta,
            )
        elif isinstance(X, dd._Frame):
            # TODO: replace with a _predict_proba_stack version.
            # This currently raises; dask.dataframe doesn't like map_partitions that
            # return new axes.
            # meta = np.empty((len(self.estimators_), 0, len(self.classes_)),
            #                 dtype="float64")
            # combined = X.map_partitions(_predict_proba_stack, meta=meta,
            #                             estimators=self.estimators_)
            # combined._chunks = ((len(self.estimators_),),
            #                     (np.nan,) * X.npartitions,
            #                     (len(X.columns),))
            meta = np.empty((0, len(self.classes_)), dtype="float64")
            probas = [
                X.map_partitions(_predict_proba, meta=meta, estimator=estimator)
                for estimator in self.estimators_
            ]
            # TODO(https://github.com/dask/dask/issues/6177): replace with da.stack
            chunks = probas[0]._chunks
            for proba in probas:
                proba._chunks = ((1,) * len(chunks[0]), chunks[1])

            combined = da.stack(probas)
            combined._chunks = ((1,) * len(self.estimators_),) + chunks
        else:
            # ndarray, etc.
            combined = np.stack(
                [estimator.predict_proba(X) for estimator in self.estimators_]
            )

        return combined 
Example #15
Source File: test_read.py    From kartothek with MIT License 5 votes
def test_reconstruct_dask_index_sorting(store_factory, monkeypatch):

    # Make sure we're not shuffling anything
    monkeypatch.delattr(
        dask.dataframe.shuffle, dask.dataframe.shuffle.set_index.__name__
    )
    dataset_uuid = "dataset_uuid"
    colA = "ColumnA"
    colB = "ColumnB"

    df = pd.DataFrame(
        {colA: np.random.randint(high=100000, low=-100000, size=(50,)), colB: 0}
    )
    store_dataframes_as_dataset(
        store=store_factory, dataset_uuid=dataset_uuid, dfs=[df], partition_on=colA
    )
    ddf = read_dataset_as_ddf(
        dataset_uuid=dataset_uuid,
        store=store_factory,
        table="table",
        dask_index_on=colA,
    )

    assert all(
        ddf.map_partitions(lambda df: df.index.min()).compute().values
        == ddf.divisions[:-1]
    ) 
Example #16
Source File: test_update.py    From kartothek with MIT License 5 votes
def test_pack_payload_pandas_empty(df_all_types):
    # For a single row dataframe the packing actually has a few more bytes
    df_empty = df_all_types.iloc[:0]

    group_key = [df_all_types.columns[-1]]
    pdt.assert_frame_equal(
        df_empty,
        unpack_payload_pandas(
            pack_payload_pandas(df_empty, group_key=group_key), unpack_meta=df_empty
        ),
    ) 
Example #17
Source File: test_update.py    From kartothek with MIT License 5 votes
def test_pack_payload_pandas(df_all_types):
    # For a single row dataframe the packing actually has a few more bytes
    df = pd.concat([df_all_types] * 10, ignore_index=True)
    size_before = df.memory_usage(deep=True).sum()

    packed_df = pack_payload_pandas(df, group_key=list(df.columns[-2:]))

    size_after = packed_df.memory_usage(deep=True).sum()

    assert size_after < size_before 
Example #18
Source File: test_update.py    From kartothek with MIT License 5 votes
def test_pack_payload(df_all_types):
    # For a single row dataframe the packing actually has a few more bytes
    df = dd.from_pandas(
        pd.concat([df_all_types] * 10, ignore_index=True), npartitions=3
    )
    size_before = df.memory_usage(deep=True).sum()

    packed_df = pack_payload(df, group_key=list(df.columns[-2:]))

    size_after = packed_df.memory_usage(deep=True).sum()

    assert (size_after < size_before).compute() 
Example #19
Source File: test_update.py    From kartothek with MIT License 5 votes
def test_update_dataset_from_ddf_empty(store_factory, shuffle):
    with pytest.raises(ValueError, match="Cannot store empty datasets"):
        update_dataset_from_ddf(
            dask.dataframe.from_delayed([], meta=(("a", int),)),
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=shuffle,
            partition_on=["a"],
        ).compute() 
Example #20
Source File: dask.py    From PyMove with MIT License 5 votes
def to_data_frame(self):
        """
        Converts trajectory data to DataFrame format.

        Returns
        -------
        dask.dataframe.DataFrame
            Represents the trajectory in DataFrame format.

        """

        return self._data 
Example #21
Source File: dask.py    From PyMove with MIT License 5 votes
def generate_weekend_features(self):
        """Create or update the feature weekend to the dataframe."""
        raise NotImplementedError('To be implemented') 
Example #22
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0 5 votes
def find_vocab(self, df):
        """Finds the number of levels in each categorical column.
        Helps for creation of feature columns for use in tf.data API

        Arguments:
          df : dask dataframe, Dataframe to extract the levels from

        Returns:
                A dictionary of column names and the levels in each variables
                        [ 0 for numerical columns and number of levels for categorical columns]
        """
        self.is_not_used()
        cat_columns = [
            col for col in df.columns if df[col].dtype == 'object']
        continuous_cols = [
            col for col in df.columns if df[col].dtype != 'object']
        temp = dask.compute([df[col].drop_duplicates() for col in cat_columns])

        column_mapping = dict()

        for col in continuous_cols:
            column_mapping[col] = 0

        for index, col in enumerate(cat_columns):
            column_mapping[col] = np.array(temp[0][index])

        return column_mapping 
Example #23
Source File: dask.py    From PyMove with MIT License 5 votes
def show_trajectories_info(self):
        """Show dataset information from dataframe."""
        raise NotImplementedError('To be implemented') 
Example #24
Source File: dask.py    From PyMove with MIT License 5 votes
def max(self, axis=None, skipna=True, split_every=False, out=None):
        """
        Return the maximum of the values for the requested axis.

        Parameters
        ----------
        axis: int, optional, default None, {index (0), columns (1)}.
            Axis for the function to be applied on.
        skipna: bool, optional, default True.
            Exclude NA/null values when computing the result.
        split_every:
            ?
        out:
            ?

        Returns
        -------
        max: Series or DataFrame (if level specified)
            The maximum values for the requested axis.

        References
        ----------
        https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.max

        """

        return self._data.max(axis, skipna, split_every, out) 
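The wrapper simply forwards to dask; as a reminder of what the underlying call returns (the sample coordinates are invented):

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(
    pd.DataFrame({"lat": [39.98, 39.99], "lon": [116.31, 116.32]}), npartitions=1
)
print(ddf.max())            # lazy dask Series
print(ddf.max().compute())  # concrete pandas Series with per-column maxima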
Example #25
Source File: load.py    From predictatops with MIT License 5 votes
def makeDF(well_list):
    """
    Changes format of well list into a pandas dataframe with one column called "UWI_file".
    """
    formatted_well_list = []
    for eachW in well_list:
        formatted_well_list.append({"UWI_file": eachW})
    wells_df = pd.DataFrame(formatted_well_list)
    return wells_df 
Example #26
Source File: dask.py    From PyMove with MIT License 5 votes
def min(self, axis=None, skipna=True, split_every=False, out=None):
        """
        Return the minimum of the values for the requested axis.

        Parameters
        ----------
        axis: int, optional, default None, {index (0), columns (1)}.
            Axis for the function to be applied on.
        skipna: bool, optional, default True.
            Exclude NA/null values when computing the result.
        split_every:
            ?
        out:
            ?

        Returns
        -------
        min: Series or DataFrame (if level specified)
            The minimum values for the requested axis.

        References
        ----------
        https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.min

        """

        return self._data.min(axis, skipna, split_every, out) 
Example #27
Source File: predictionclasses.py    From predictatops with MIT License 5 votes
def predict_from_model(self, model, df_X_toPredict):
        """
        The predict_from_model function takes as arguments a model that is already trained on training data
        (in the demo case a scikit-learn-style XGBoost model) and a dataframe of the columns to predict from.
        It fills in the self.result_df_from_prediction attribute and returns nothing.
    
        """
        self.result_df_from_prediction = model.predict(df_X_toPredict) 
Example #28
Source File: predictionclasses.py    From predictatops with MIT License 5 votes
def __init__(self, ML, vs, distClassDF_wRollingCols_training):
        # self.knn_dir = ML.knn_dir
        # self.load_dir = ML.load_dir
        # self.features_dir = ML.features_dir
        # self.machine_learning_dir = ML.machine_learning_dir
        # self.h5_to_load = ML.h5_to_load
        self.train_X = ML.train_X
        self.train_y = ML.train_y
        self.test_X = ML.test_X
        self.test_y = ML.test_y
        self.train_index = ML.train_index
        self.test_index = ML.test_index
        self.preSplitpreBal = ML.preSplitpreBal
        self.result_df_from_prediction = None  # df
        ####
        ####
        self.vs = vs  # object instance from variables class
        self.depth_str = vs["depth_str"]
        self.pick_class_str = vs["pick_class_str"]
        self.UWI_str = vs["UWI_str"]
        self.rollingWindows = vs["rollingWindows"]
        self.distClassIntegersArray = vs["distClassIntegersArray"]
        ####
        self.calc_pred = distClassDF_wRollingCols_training
        self.excludeWellsThatOnlyHaveTheseClasses = (
            []
        )  ### aka dropIfOnlyClasses in optionallyExcludeWellsWithoutStrongPredictions()
        self.NoGoodWellsToExclude = (
            []
        )  #### UWIs of wells that only had zeros in the predicted distance class so these wells were excluded from accuracy prediction
        ####
        self.calc_pred_TopMcMr_Pick_pred_DEPT_pred = None  # df
        self.calc_pred_TopTarget_DEPTH = None  # df
        self.fullUWIsSet = []  ### set of UWIs in the dataframe
        self.precentWellsKept = 1
        self.UWIsSetSubsetKept = (
            []
        )  #### subset of the wells that have predictions that aren't just zero or something else not wanted

    ## if there are zeros, calc_pred is changed to a version without zeros and the zerosExcluded array is populated 
Example #29
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0 5 votes
def drop_cols(cls, df, col_names):
        """Drops any columns which are not required by the user.

        Arguments:
                df : dask dataframe, Dataframe of input data
                col_names : list, Columns in the data to be dropped

        Returns:
                dask dataframe, Updated dataframe with columns dropped
        """
        return df.drop(col_names, axis=1) 
Example #30
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0 5 votes
def dropping_zero_var_cols(cls, df, target_var, stddev_list):
        """Check columns which have zero variance and removes the from the dataframe.
            As the zero variance columns or contant columns can't be considered as output column

        Arguments:
                df : dask dataframe, The dataframe to validate
                stddev_list : dask series, Series containing the standard deviation values for the columns
                target_var : string, Dependent variable for the analysis

        Returns:
                df : dask dataframe, Dataframe with redundant columns removed

        Raises:
                AssertionError : If the target column has zero deviation
        """
        continuous_cols = [
            col for col in df.columns if df[col].dtype != 'object']
        for col in continuous_cols:
            if stddev_list[col] == 0.0:
                df = df.drop(col, axis=1)
                if col == target_var:
                    err_msg = 'Target variable has zero standard deviation or is a constant column. ' \
                              'Please check the data'
                    tf.logging.error(err_msg)
                    raise AssertionError(err_msg)
        return df
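A usage sketch under stated assumptions: InputReader is a hypothetical stand-in for the class in input_pipeline_dask.py that defines this classmethod, and the column values are invented.

import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(
    pd.DataFrame(
        {"constant": [1.0, 1.0, 1.0], "feature": [1.0, 2.0, 3.0], "label": [0, 1, 0]}
    ),
    npartitions=1,
)
(stddev,) = dask.compute(ddf.std())
# "constant" has zero standard deviation, so it gets dropped; "feature" and "label" are kept.
cleaned = InputReader.dropping_zero_var_cols(ddf, target_var="label", stddev_list=stddev)
print(list(cleaned.columns))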