Python dask.dataframe() Examples
The following are 30 code examples of dask.dataframe(), drawn from open-source projects. The originating project, source file, and license are noted above each example.
Example #1
Source File: load.py From predictatops with MIT License
def turn_dict_of_well_dfs_to_single_df(dictOfWellDf):
    """
    Takes in a dict of dataframes, where each dataframe is for a well created by LASIO.
    The dict is likely created by the load_all_wells_in function and is the first item
    in the returned list. Returns a single dataframe of all wells.
    """
    # start by creating empty dataframe and list
    data_df = pd.DataFrame()
    list_of_df = []
    keys = list(dictOfWellDf.keys())
    # get dict of well data frames into values format
    values = dictOfWellDf.values()
    # go through each item in values and add to a list
    count = 0
    for each in values:
        each["UWI"] = keys[count]
        count += 1
        list_of_df.append(each)
    # concat the list into a single dataframe
    data_df = pd.concat(list_of_df)
    return data_df
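A minimal usage sketch, assuming the function above is in scope; the well names and log columns below are made up and not taken from predictatops:

import pandas as pd

wells = {
    "well_A": pd.DataFrame({"DEPT": [100.0, 100.5], "GR": [55.2, 61.8]}),
    "well_B": pd.DataFrame({"DEPT": [200.0, 200.5], "GR": [40.1, 38.7]}),
}
combined = turn_dict_of_well_dfs_to_single_df(wells)
# the two per-well frames are stacked, with the dict key recorded in a "UWI" column
print(combined[["UWI", "DEPT", "GR"]])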
Example #2
Source File: input_pipeline_dask.py From professional-services with Apache License 2.0
def normalize(cls, df, target_var, mean_list, stddev_list):
    """Normalizes the numerical columns in a dataframe.

    Arguments:
        df : dask dataframe, The dataframe to normalize
        target_var : string, Dependent variable for the analysis
        mean_list : dask series, Series with all the mean values
        stddev_list : dask series, Series with all the standard deviation values

    Returns:
        df : Dataframe with mean normalized numerical columns
    """
    continuous_cols = [
        col for col in df.columns
        if df[col].dtype != 'object' and col != target_var]
    for col in continuous_cols:
        df[col] = df[col].sub(mean_list[col]).div(stddev_list[col])
    return df
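For context, a hedged standalone sketch of the inputs this method expects: mean_list and stddev_list are ordinary dask reductions evaluated with dask.compute, and the normalization itself is plain column arithmetic (the toy column names below are illustrative):

import dask
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"age": [20.0, 30.0, 40.0, 50.0], "label": ["a", "b", "a", "b"]})
df = dd.from_pandas(pdf, npartitions=2)

# the same column-wise transformation the method applies, shown for one column
mean_age, std_age = dask.compute(df["age"].mean(), df["age"].std())
df["age"] = df["age"].sub(mean_age).div(std_age)
print(df.compute())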
Example #3
Source File: test_data.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_transformed_shape(self):
    # checks if the transformed objects have the correct columns
    a = dpp.PolynomialFeatures()
    a.fit(X)
    n_cols = len(a.get_feature_names())
    # dask array
    assert a.transform(X).shape[1] == n_cols
    # numpy array
    assert a.transform(X.compute()).shape[1] == n_cols
    # dask dataframe
    assert a.transform(df).shape[1] == n_cols
    # pandas dataframe
    assert a.transform(df.compute()).shape[1] == n_cols
    X_nan_rows = df.values
    df_none_divisions = X_nan_rows.to_dask_dataframe(columns=df.columns)
    # dask array with nan rows
    assert a.transform(X_nan_rows).shape[1] == n_cols
    # dask data frame with nan rows
    assert a.transform(df_none_divisions).shape[1] == n_cols
Example #4
Source File: test_parallel_post_fit.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_transform(kind):
    X, y = make_classification(chunks=100)

    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = PCA(random_state=0)
    wrap = ParallelPostFit(PCA(random_state=0))

    base.fit(X, y)
    wrap.fit(X, y)

    assert_estimator_equal(wrap.estimator, base)

    result = base.transform(X)
    expected = wrap.transform(X)
    assert_eq_ar(result, expected)
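Outside the test harness, the same wrapper pattern looks roughly like this; a sketch assuming dask_ml.wrappers.ParallelPostFit and dask_ml.datasets.make_classification, which this test suite builds on:

from sklearn.decomposition import PCA
from dask_ml.datasets import make_classification
from dask_ml.wrappers import ParallelPostFit

X, y = make_classification(chunks=100)       # dask arrays
wrap = ParallelPostFit(PCA(random_state=0))
wrap.fit(X, y)                               # delegates to PCA.fit; scikit-learn coerces the dask array
Xt = wrap.transform(X)                       # lazy, block-wise transform
print(Xt)                                    # a dask array; call .compute() for the values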
Example #5
Source File: test_update.py From kartothek with MIT License
def test_hash_bucket(col, num_buckets=5):
    df = pd.DataFrame(
        {
            "range": np.arange(10),
            "range_duplicated": np.repeat(np.arange(2), 5),
            "random": np.random.randint(0, 100, 10),
        }
    )
    hashed = _hash_bucket(df, [col], num_buckets)
    assert (hashed.groupby(col).agg({_KTK_HASH_BUCKET: "nunique"}) == 1).all().all()

    # Check that hashing is consistent for small dataframe sizes
    # (where df.col.nunique() < num_buckets)
    df_sample = df.iloc[[0, 7]]
    hashed_sample = _hash_bucket(df_sample, [col], num_buckets)
    expected = hashed.loc[df_sample.index]
    pdt.assert_frame_equal(expected, hashed_sample)
Example #6
Source File: dataframe.py From kartothek with MIT License
def _get_dask_meta_for_dataset(
    ds_factory, table, columns, categoricals, dates_as_object
):
    """
    Calculate a schema suitable for the dask dataframe meta from the dataset.
    """
    table_schema = ds_factory.table_meta[table]
    meta = empty_dataframe_from_schema(
        table_schema, columns=columns, date_as_object=dates_as_object
    )

    if categoricals:
        meta = meta.astype({col: "category" for col in categoricals})
        meta = dd.utils.clear_known_categories(meta, categoricals)

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, {table: categoricals}
    )
    if categoricals_from_index:
        meta = meta.astype(categoricals_from_index[table])
    return meta
Example #7
Source File: input_pipeline_dask.py From professional-services with Apache License 2.0
def calculate_stats(cls, df, target_var):
    """Calculates descriptive stats of the dataframe required for cleaning.

    Arguments:
        df : dask dataframe, The dataframe at hand
        target_var : string, Dependent variable for the analysis

    Returns:
        mean : dask series, mean of each column
        median : dask series, median of each column
        dict(zip(categorical_cols, mode)) : dict, Dictionary containing
            categorical columns as keys and their modes as values
        std : dask series, standard deviation of each column
    """
    categorical_columns = [
        col for col in df.columns
        if col != target_var and df[col].dtype == 'object']
    mean_op = df.mean()
    std_op = df.std()
    median_op = df.quantile(0.5)
    mode_op = [
        df[col].value_counts().idxmax()
        for col in categorical_columns]
    mean, median, mode, std = dask.compute(
        mean_op, median_op, mode_op, std_op)
    return mean, median, dict(zip(categorical_columns, mode)), std
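The single-pass pattern used above, sketched standalone on a toy frame (column names are illustrative): build the lazy reductions first, then evaluate them together in one dask.compute call.

import dask
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"age": [20, 30, 30, 40], "city": ["NY", "NY", "SF", "SF"]})
df = dd.from_pandas(pdf, npartitions=2)

# all three statistics are evaluated in a single graph traversal
mean, median, mode = dask.compute(
    df["age"].mean(),
    df["age"].quantile(0.5),
    df["city"].value_counts().idxmax(),
)
print(mean, median, mode)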
Example #8
Source File: test_base.py From intake with BSD 2-Clause "Simplified" License
def test_datasource_discover(source_dataframe):
    r = source_dataframe.discover()

    assert source_dataframe.container == 'dataframe'

    row_dtype = np.dtype([('x', np.int64), ('y', np.int64)])
    assert r == {
        'datashape': 'datashape',
        'dtype': row_dtype,
        'shape': (6,),
        'npartitions': 2,
        'metadata': dict(a=1, b=2, c=3, d=4),
    }

    # check attributes have been set
    assert source_dataframe.datashape == 'datashape'
    assert source_dataframe.dtype == row_dtype
    assert source_dataframe.shape == (6,)
    assert source_dataframe.npartitions == 2
    assert source_dataframe.metadata == dict(a=1, b=2, c=3, d=4)

    # check that _get_schema is only called once
    assert source_dataframe.call_count['_get_schema'] == 1
    source_dataframe.discover()
    assert source_dataframe.call_count['_get_schema'] == 1
Example #9
Source File: test_dask.py From eliot with Apache License 2.0
def test_persist_pandas(self):
    """persist_with_trace() with a Pandas dataframe.

    This ensures we don't blow up, which used to be the case.
    """
    df = pd.DataFrame()
    df = dd.from_pandas(df, npartitions=1)
    persist_with_trace(df)
Example #10
Source File: csv.py From intake with BSD 2-Clause "Simplified" License
def _open_dataset(self, urlpath):
    """Open dataset using dask and use pattern fields to set new columns
    """
    import dask.dataframe

    if self.pattern is None:
        self._dataframe = dask.dataframe.read_csv(
            urlpath, storage_options=self._storage_options, **self._csv_kwargs)
        return

    if not (DASK_VERSION >= '0.19.0'):
        raise ValueError("Your version of dask is '{}'. "
                         "The ability to include filenames in read_csv output "
                         "(``include_path_column``) was added in 0.19.0, so "
                         "pattern urlpaths are not supported.".format(DASK_VERSION))

    drop_path_column = 'include_path_column' not in self._csv_kwargs
    path_column = self._path_column()

    self._dataframe = dask.dataframe.read_csv(
        urlpath, storage_options=self._storage_options, **self._csv_kwargs)

    # add the new columns to the dataframe
    self._set_pattern_columns(path_column)

    if drop_path_column:
        self._dataframe = self._dataframe.drop([path_column], axis=1)
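From the user's side, this pattern handling is reached through the intake CSV driver. A hedged sketch, assuming the driver is registered as intake.open_csv as usual; the file paths are purely illustrative:

import intake

# a plain path behaves like dask.dataframe.read_csv
src = intake.open_csv("data/site_a.csv")
ddf = src.to_dask()

# a pattern path adds the captured field (here `site`) as a new column
src = intake.open_csv("data/{site}.csv")
ddf = src.to_dask()
print(ddf.columns)   # includes a 'site' column derived from the file names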
Example #11
Source File: utils.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def _pandas_indexing(X, key, key_dtype, axis):
    """Index a pandas dataframe or a series."""
    if hasattr(key, "shape"):
        # Work-around for indexing with read-only key in pandas
        # FIXME: solved in pandas 0.25
        key = np.asarray(key)
        key = key if key.flags.writeable else key.copy()
    # check whether we should index with loc or iloc
    indexer = X.iloc if key_dtype == "int" else X.loc
    return indexer[:, key] if axis else indexer[key]
Example #12
Source File: utils.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def _num_samples(X):
    result = sk_validation._num_samples(X)
    if dask.is_dask_collection(result):
        # dask dataframe
        result = result.compute()
    return result
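The compute() call above matters because, for a dask dataframe, the row count is itself lazy; a quick illustration:

import dask
import dask.dataframe as dd
import pandas as pd

df = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)
n = df.shape[0]
print(dask.is_dask_collection(n))   # True: still a lazy dask scalar
print(n.compute())                  # 10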
Example #13
Source File: text.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def transform(self, raw_X):
    """Transform a sequence of documents to a document-term matrix.

    Transformation is done in parallel, and correctly handles dask
    collections.

    Parameters
    ----------
    raw_X : dask.bag.Bag or dask.dataframe.Series, length = n_samples
        Each sample must be a text document (either bytes or unicode
        strings, file name or file object depending on the constructor
        argument) which will be tokenized and hashed.

    Returns
    -------
    X : dask.array.Array, shape = (n_samples, self.n_features)
        Document-term matrix. Each block of the array is a scipy sparse
        matrix.

    Notes
    -----
    The returned dask Array is composed of scipy sparse matrices. If you
    need to compute on the result immediately, you may need to convert
    the individual blocks to ndarrays or pydata/sparse matrices.

    >>> import sparse
    >>> X.map_blocks(sparse.COO.from_scipy_sparse, dtype=X.dtype)  # doctest: +SKIP

    See the :doc:`examples/text-vectorization` for more.
    """
    return super().transform(raw_X)
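A short usage sketch of the documented behaviour, assuming this transform belongs to dask_ml.feature_extraction.text.HashingVectorizer (where dask-ml defines it):

import dask.bag as db
from dask_ml.feature_extraction.text import HashingVectorizer

docs = db.from_sequence(
    ["the quick brown fox", "jumped over the lazy dog"], npartitions=2
)
X = HashingVectorizer().fit_transform(docs)
print(X)   # lazy dask array whose blocks are scipy sparse matrices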
Example #14
Source File: _blockwise.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def _collect_probas(self, X):
    if isinstance(X, da.Array):
        chunks = (len(self.estimators_), X.chunks[0], len(self.classes_))
        meta = np.array([], dtype="float64")
        # (n_estimators, len(X), n_classes)
        combined = X.map_blocks(
            _predict_proba_stack,
            estimators=self.estimators_,
            chunks=chunks,
            meta=meta,
        )
    elif isinstance(X, dd._Frame):
        # TODO: replace with a _predict_proba_stack version.
        # This currently raises; dask.dataframe doesn't like map_partitions that
        # return new axes.
        # meta = np.empty((len(self.estimators_), 0, len(self.classes_)),
        #                 dtype="float64")
        # combined = X.map_partitions(_predict_proba_stack, meta=meta,
        #                             estimators=self.estimators_)
        # combined._chunks = ((len(self.estimators_),),
        #                     (np.nan,) * X.npartitions,
        #                     (len(X.columns),))
        meta = np.empty((0, len(self.classes_)), dtype="float64")
        probas = [
            X.map_partitions(_predict_proba, meta=meta, estimator=estimator)
            for estimator in self.estimators_
        ]
        # TODO(https://github.com/dask/dask/issues/6177): replace with da.stack
        chunks = probas[0]._chunks
        for proba in probas:
            proba._chunks = ((1,) * len(chunks[0]), chunks[1])

        combined = da.stack(probas)
        combined._chunks = ((1,) * len(self.estimators_),) + chunks
    else:
        # ndarray, etc.
        combined = np.stack(
            [estimator.predict_proba(X) for estimator in self.estimators_]
        )

    return combined
Example #15
Source File: test_read.py From kartothek with MIT License
def test_reconstruct_dask_index_sorting(store_factory, monkeypatch):
    # Make sure we're not shuffling anything
    monkeypatch.delattr(
        dask.dataframe.shuffle, dask.dataframe.shuffle.set_index.__name__
    )
    dataset_uuid = "dataset_uuid"
    colA = "ColumnA"
    colB = "ColumnB"

    df = pd.DataFrame(
        {colA: np.random.randint(high=100000, low=-100000, size=(50,)), colB: 0}
    )
    store_dataframes_as_dataset(
        store=store_factory, dataset_uuid=dataset_uuid, dfs=[df], partition_on=colA
    )
    ddf = read_dataset_as_ddf(
        dataset_uuid=dataset_uuid,
        store=store_factory,
        table="table",
        dask_index_on=colA,
    )

    assert all(
        ddf.map_partitions(lambda df: df.index.min()).compute().values
        == ddf.divisions[:-1]
    )
Example #16
Source File: test_update.py From kartothek with MIT License
def test_pack_payload_pandas_empty(df_all_types):
    # For a single row dataframe the packing actually has a few more bytes
    df_empty = df_all_types.iloc[:0]
    group_key = [df_all_types.columns[-1]]
    pdt.assert_frame_equal(
        df_empty,
        unpack_payload_pandas(
            pack_payload_pandas(df_empty, group_key=group_key), unpack_meta=df_empty
        ),
    )
Example #17
Source File: test_update.py From kartothek with MIT License
def test_pack_payload_pandas(df_all_types):
    # For a single row dataframe the packing actually has a few more bytes
    df = pd.concat([df_all_types] * 10, ignore_index=True)

    size_before = df.memory_usage(deep=True).sum()

    packed_df = pack_payload_pandas(df, group_key=list(df.columns[-2:]))

    size_after = packed_df.memory_usage(deep=True).sum()

    assert size_after < size_before
Example #18
Source File: test_update.py From kartothek with MIT License
def test_pack_payload(df_all_types):
    # For a single row dataframe the packing actually has a few more bytes
    df = dd.from_pandas(
        pd.concat([df_all_types] * 10, ignore_index=True), npartitions=3
    )

    size_before = df.memory_usage(deep=True).sum()

    packed_df = pack_payload(df, group_key=list(df.columns[-2:]))

    size_after = packed_df.memory_usage(deep=True).sum()

    assert (size_after < size_before).compute()
Example #19
Source File: test_update.py From kartothek with MIT License
def test_update_dataset_from_ddf_empty(store_factory, shuffle):
    with pytest.raises(ValueError, match="Cannot store empty datasets"):
        update_dataset_from_ddf(
            dask.dataframe.from_delayed([], meta=(("a", int),)),
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=shuffle,
            partition_on=["a"],
        ).compute()
Example #20
Source File: dask.py From PyMove with MIT License
def to_data_frame(self):
    """
    Converts trajectory data to DataFrame format.

    Returns
    -------
    dask.dataframe.DataFrame
        Represents the trajectory in DataFrame format.
    """
    return self._data
Example #21
Source File: dask.py From PyMove with MIT License
def generate_weekend_features(self):
    """Create or update the weekend feature in the dataframe."""
    raise NotImplementedError('To be implemented')
Example #22
Source File: input_pipeline_dask.py From professional-services with Apache License 2.0
def find_vocab(self, df):
    """Finds the number of levels in each categorical column.

    Helps with the creation of feature columns for use in the tf.data API.

    Arguments:
        df : dask dataframe, Dataframe to extract the levels from

    Returns:
        A dictionary of column names and the levels in each variable
        [0 for numerical columns and the number of levels for categorical columns]
    """
    self.is_not_used()
    cat_columns = [
        col for col in df.columns if df[col].dtype == 'object']
    continuous_cols = [
        col for col in df.columns if df[col].dtype != 'object']
    temp = dask.compute([df[col].drop_duplicates() for col in cat_columns])
    column_mapping = dict()
    for col in continuous_cols:
        column_mapping[col] = 0
    for index, col in enumerate(cat_columns):
        column_mapping[col] = np.array(temp[0][index])
    return column_mapping
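The core of the method, sketched standalone on a toy frame (column names are made up): collect the distinct levels of every object-dtype column in a single dask.compute call.

import dask
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"color": ["red", "blue", "red"], "size": [1, 2, 3]})
df = dd.from_pandas(pdf, npartitions=1)

cat_cols = [col for col in df.columns if df[col].dtype == 'object']
# dask.compute returns a 1-tuple holding the list of computed series
(levels,) = dask.compute([df[col].drop_duplicates() for col in cat_cols])
vocab = {col: list(levels[i]) for i, col in enumerate(cat_cols)}
print(vocab)   # {'color': ['red', 'blue']}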
Example #23
Source File: dask.py From PyMove with MIT License
def show_trajectories_info(self):
    """Show dataset information from dataframe."""
    raise NotImplementedError('To be implemented')
Example #24
Source File: dask.py From PyMove with MIT License
def max(self, axis=None, skipna=True, split_every=False, out=None):
    """
    Return the maximum of the values for the requested axis.

    Parameters
    ----------
    axis: int, optional, default None, {index (0), columns (1)}.
        Axis for the function to be applied on.
    skipna: bool, optional, default True.
        Exclude NA/null values when computing the result.
    split_every: ?
    out: ?

    Returns
    -------
    max : Series or DataFrame (if level specified)
        The maximum values for the requested axis.

    References
    ----------
    https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.max
    """
    return self._data.max(axis, skipna, split_every, out)
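The delegation above simply forwards to dask's own DataFrame.max; on a plain dask dataframe the equivalent call is (toy data):

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(
    pd.DataFrame({"lat": [39.98, 40.01], "lon": [116.31, 116.32]}), npartitions=1
)
print(ddf.max(axis=0, skipna=True).compute())   # column-wise maxima as a pandas Series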
Example #25
Source File: load.py From predictatops with MIT License
def makeDF(well_list):
    """
    Changes the format of a well list into a pandas dataframe with one column
    called "UWI_file".
    """
    formatted_well_list = []
    for eachW in well_list:
        formatted_well_list.append({"UWI_file": eachW})
    wells_df = pd.DataFrame(formatted_well_list)
    return wells_df
Example #26
Source File: dask.py From PyMove with MIT License
def min(self, axis=None, skipna=True, split_every=False, out=None):
    """
    Return the minimum of the values for the requested axis.

    Parameters
    ----------
    axis: int, optional, default None, {index (0), columns (1)}.
        Axis for the function to be applied on.
    skipna: bool, optional, default True.
        Exclude NA/null values when computing the result.
    split_every: ?
    out: ?

    Returns
    -------
    min : Series or DataFrame (if level specified)
        The minimum values for the requested axis.

    References
    ----------
    https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.min
    """
    return self._data.min(axis, skipna, split_every, out)
Example #27
Source File: predictionclasses.py From predictatops with MIT License
def predict_from_model(self, model, df_X_toPredict):
    """
    Takes as arguments a model that is already trained on training data
    (in the demo case a scikit-learn XGBoost model) and the dataframe of
    columns to predict on. From this, it fills in the
    self.result_df_from_prediction attribute and returns nothing.
    """
    self.result_df_from_prediction = model.predict(df_X_toPredict)
Example #28
Source File: predictionclasses.py From predictatops with MIT License
def __init__(self, ML, vs, distClassDF_wRollingCols_training):
    # self.knn_dir = ML.knn_dir
    # self.load_dir = ML.load_dir
    # self.features_dir = ML.features_dir
    # self.machine_learning_dir = ML.machine_learning_dir
    # self.h5_to_load = ML.h5_to_load
    self.train_X = ML.train_X
    self.train_y = ML.train_y
    self.test_X = ML.test_X
    self.test_y = ML.test_y
    self.train_index = ML.train_index
    self.test_index = ML.test_index
    self.preSplitpreBal = ML.preSplitpreBal
    self.result_df_from_prediction = None  # df
    ####
    self.vs = vs  # object instance from variables class
    self.depth_str = vs["depth_str"]
    self.pick_class_str = vs["pick_class_str"]
    self.UWI_str = vs["UWI_str"]
    self.rollingWindows = vs["rollingWindows"]
    self.distClassIntegersArray = vs["distClassIntegersArray"]
    ####
    self.calc_pred = distClassDF_wRollingCols_training
    # aka dropIfOnlyClasses in optionallyExcludeWellsWithoutStrongPredictions()
    self.excludeWellsThatOnlyHaveTheseClasses = []
    # UWIs of wells that only had zeros in the predicted distance class,
    # so these wells were excluded from accuracy prediction
    self.NoGoodWellsToExclude = []
    ####
    self.calc_pred_TopMcMr_Pick_pred_DEPT_pred = None  # df
    self.calc_pred_TopTarget_DEPTH = None  # df
    self.fullUWIsSet = []  # set of UWIs in the dataframe
    self.precentWellsKept = 1
    # subset of the wells that have predictions that aren't just zero or
    # something else not wanted; if zeros, calc_pred is changed to one without
    # zeros and the zerosExcluded array is populated
    self.UWIsSetSubsetKept = []
Example #29
Source File: input_pipeline_dask.py From professional-services with Apache License 2.0
def drop_cols(cls, df, col_names):
    """Drops any columns which are not required by the user.

    Arguments:
        df : dask dataframe, Dataframe of input data
        col_names : list, Columns in the data to be dropped

    Returns:
        dask dataframe, Updated dataframe with columns dropped
    """
    return df.drop(col_names, axis=1)
Example #30
Source File: input_pipeline_dask.py From professional-services with Apache License 2.0
def dropping_zero_var_cols(cls, df, target_var, stddev_list):
    """Checks for columns with zero variance and removes them from the dataframe,
    since zero-variance (constant) columns can't be used as the output column.

    Arguments:
        df : dask dataframe, The dataframe to validate
        stddev_list : dask series, Series containing the standard deviation values
            for the columns
        target_var : string, Dependent variable for the analysis

    Returns:
        df : dask dataframe, Dataframe with redundant columns removed

    Raises:
        AssertionError : If the target column has zero deviation
    """
    continuous_cols = [
        col for col in df.columns if df[col].dtype != 'object']
    for col in continuous_cols:
        if stddev_list[col] == 0.0:
            df = df.drop(col, axis=1)
            if col == target_var:
                err_msg = 'Target variable has zero standard deviation or is a constant column. ' \
                          'Please check the data'
                tf.logging.error(err_msg)
                raise AssertionError(err_msg)
    return df
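A standalone illustration of the same check under simplified assumptions (toy column names; the class wrapper and tf logging are omitted): compute per-column standard deviations with dask, then drop the constant columns.

import dask
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [5.0, 5.0, 5.0], "y": [0.0, 1.0, 0.0]})
df = dd.from_pandas(pdf, npartitions=1)

numeric_cols = [col for col in df.columns if df[col].dtype != 'object']
(stddev_list,) = dask.compute(df.std())
for col in numeric_cols:
    if stddev_list[col] == 0.0:
        df = df.drop(col, axis=1)
print(list(df.columns))   # ['a', 'y']: the constant column 'b' is dropped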