Python dask.dataframe() Examples
The following are 30 code examples that use the dask.dataframe module.
Example #1
Source File: From predictatops with MIT License | 7 votes |
def turn_dict_of_well_dfs_to_single_df(dictOfWellDf):
    """Combine a dict of per-well dataframes into a single dataframe.

    Each dataframe is tagged with a ``UWI`` column holding its dictionary key,
    and the tagged frames are concatenated in the dict's iteration order.

    Parameters
    ----------
    dictOfWellDf : dict
        Maps a well identifier to that well's dataframe (created by LASIO;
        likely the first item returned by ``load_all_wells_in``).

    Returns
    -------
    pandas.DataFrame
        All wells stacked row-wise, including the added ``UWI`` column.
        An empty dataframe is returned when ``dictOfWellDf`` is empty.

    Notes
    -----
    The ``UWI`` column is written onto the input dataframes themselves, so
    the caller's frames are modified in place.
    """
    if not dictOfWellDf:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()

    list_of_df = []
    for uwi, well_df in dictOfWellDf.items():
        well_df["UWI"] = uwi
        list_of_df.append(well_df)
    return pd.concat(list_of_df)
Example #2
Source File: From professional-services with Apache License 2.0 | 6 votes |
def normalize(cls, df, target_var, mean_list, stddev_list):
    """Standardize the non-object columns of a dataframe.

    Every column whose dtype is not ``'object'`` and which is not the target
    is replaced by ``(column - mean) / stddev``.

    Arguments:
        df : dask dataframe, The dataframe to normalize
        target_var : string, Dependent variable for the analysis
        mean_list : dask series, Series with all the mean values
        stddev_list : dask series, Series with all the standard deviation
            values
    Returns:
        df : The same dataframe object with its numerical columns
            normalized (columns are reassigned on the input)
    """
    numeric_columns = []
    for name in df.columns:
        if name == target_var:
            continue
        if df[name].dtype != 'object':
            numeric_columns.append(name)

    for name in numeric_columns:
        centered = df[name].sub(mean_list[name])
        df[name] = centered.div(stddev_list[name])
    return df
Example #3
Source File: From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_transformed_shape(self):
    """Transformed output must have one column per generated feature name.

    Checked for dask and numpy arrays, dask and pandas dataframes, and for
    the array/dataframe pair derived from ``df.values``.
    """
    poly = dpp.PolynomialFeatures()
    expected_width = len(poly.get_feature_names())

    def width_of(data):
        return poly.transform(data).shape[1]

    # dask array
    assert width_of(X) == expected_width
    # numpy array
    assert width_of(X.compute()) == expected_width
    # dask dataframe
    assert width_of(df) == expected_width
    # pandas dataframe
    assert width_of(df.compute()) == expected_width

    array_nan_rows = df.values
    frame_no_divisions = array_nan_rows.to_dask_dataframe(columns=df.columns)
    # dask array with nan rows
    assert width_of(array_nan_rows) == expected_width
    # dask data frame with nan rows
    assert width_of(frame_no_divisions) == expected_width
Example #4
Source File: From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_transform(kind):
    """ParallelPostFit-wrapped PCA must transform like the bare estimator.

    ``kind`` selects the container used for the data: ``"numpy"`` computes
    the generated arrays eagerly, ``"dask.dataframe"`` converts them to dask
    dataframes, and any other value keeps the dask arrays as generated.
    """
    X, y = make_classification(chunks=100)

    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = PCA(random_state=0)
    wrap = ParallelPostFit(PCA(random_state=0))

    # Both estimators are fit on the same data so their learned state and
    # their transforms can be compared.
    base.fit(X, y)
    wrap.fit(X, y)

    assert_estimator_equal(wrap.estimator, base)

    result = base.transform(X)
    expected = wrap.transform(X)
    assert_eq_ar(result, expected)
Example #5
Source File: From kartothek with MIT License | 6 votes |
def test_hash_bucket(col, num_buckets=5):
    """Each distinct value of ``col`` must land in exactly one hash bucket."""
    frame = pd.DataFrame(
        {
            "range": np.arange(10),
            "range_duplicated": np.repeat(np.arange(2), 5),
            "random": np.random.randint(0, 100, 10),
        }
    )
    bucketed = _hash_bucket(frame, [col], num_buckets)
    buckets_per_value = bucketed.groupby(col).agg({_KTK_HASH_BUCKET: "nunique"})
    assert (buckets_per_value == 1).all().all()

    # Hashing must be consistent for small dataframe sizes
    # (where df.col.nunique() < num_buckets): a two-row subset has to receive
    # the same buckets as the matching rows of the full frame.
    subset = frame.iloc[[0, 7]]
    bucketed_subset = _hash_bucket(subset, [col], num_buckets)
    pdt.assert_frame_equal(bucketed.loc[subset.index], bucketed_subset)
Example #6
Source File: From kartothek with MIT License | 6 votes |
def _get_dask_meta_for_dataset(
    ds_factory, table, columns, categoricals, dates_as_object
):
    """
    Build the empty dataframe used as dask ``meta`` for a dataset table.

    The frame is created from the table's schema, the requested
    ``categoricals`` are cast to ``"category"`` with their known categories
    cleared, and any categoricals derived from the index are applied last.
    """
    schema = ds_factory.table_meta[table]
    meta = empty_dataframe_from_schema(
        schema, columns=columns, date_as_object=dates_as_object
    )

    if categoricals:
        category_dtypes = dict.fromkeys(categoricals, "category")
        meta = meta.astype(category_dtypes)
        meta = dd.utils.clear_known_categories(meta, categoricals)

    index_categoricals = _maybe_get_categoricals_from_index(
        ds_factory, {table: categoricals}
    )
    if not index_categoricals:
        return meta
    return meta.astype(index_categoricals[table])
Example #7
Source File: From professional-services with Apache License 2.0 | 6 votes |
def calculate_stats(cls, df, target_var):
    """Compute the descriptive statistics needed for cleaning.

    All statistics are built lazily and evaluated together in a single
    ``dask.compute`` call.

    Arguments:
        df : dask dataframe, The dataframe at hand
        target_var : string, Dependent variable for the analysis
    Returns:
        mean : dask series, mean of each column
        median : dask series, median (0.5 quantile) of each column
        modes : dict, maps each categorical (object dtype, non-target)
            column to its most frequent value
        std : dask series, standard deviation of each column
    """
    categorical_columns = []
    for col in df.columns:
        if col != target_var and df[col].dtype == 'object':
            categorical_columns.append(col)

    lazy_mean = df.mean()
    lazy_std = df.std()
    lazy_median = df.quantile(0.5)
    lazy_modes = [
        df[col].value_counts().idxmax() for col in categorical_columns
    ]

    mean, median, modes, std = dask.compute(
        lazy_mean, lazy_median, lazy_modes, lazy_std)
    mode_by_column = {
        col: value for col, value in zip(categorical_columns, modes)}
    return mean, median, mode_by_column, std
Example #8
Source File: From intake with BSD 2-Clause "Simplified" License | 6 votes |
def test_datasource_discover(source_dataframe):
    """Check ``discover()`` output, the attributes it sets, and schema caching."""
    r = source_dataframe.discover()

    assert source_dataframe.container == 'dataframe'

    row_dtype = np.dtype([('x', np.int64), ('y', np.int64)])
    assert r == {
        'datashape': 'datashape',
        'dtype': row_dtype,
        'shape': (6,),
        'npartitions': 2,
        'metadata': dict(a=1, b=2, c=3, d=4),
    }

    # check attributes have been set
    assert source_dataframe.datashape == 'datashape'
    assert source_dataframe.dtype == row_dtype
    assert source_dataframe.shape == (6,)
    assert source_dataframe.npartitions == 2
    assert source_dataframe.metadata == dict(a=1, b=2, c=3, d=4)

    # check that _get_schema is only called once: a second discover() must
    # not increase the call count
    assert source_dataframe.call_count['_get_schema'] == 1
    source_dataframe.discover()
    assert source_dataframe.call_count['_get_schema'] == 1
Example #9
Source File: From eliot with Apache License 2.0 | 5 votes |
def test_persist_pandas(self):
    """persist_with_trace() with a Pandas dataframe.

    This ensures we don't blow up, which used to be the case.
    """
    dask_frame = dd.from_pandas(pd.DataFrame(), npartitions=1)
    persist_with_trace(dask_frame)
Example #10
Source File: From intake with BSD 2-Clause "Simplified" License | 5 votes |
def _open_dataset(self, urlpath):
    """Open dataset using dask and use pattern fields to set new columns.

    Without a pattern the CSV is read as-is. With a pattern, the path column
    is used to derive the pattern columns and is dropped afterwards unless
    the caller asked for ``include_path_column`` explicitly.

    Raises:
        ValueError: if a pattern is set and the dask version check against
            '0.19.0' fails.
    """
    import dask.dataframe

    def load_csv():
        return dask.dataframe.read_csv(
            urlpath, storage_options=self._storage_options,
            **self._csv_kwargs)

    if self.pattern is None:
        self._dataframe = load_csv()
        return

    if not (DASK_VERSION >= '0.19.0'):
        raise ValueError("Your version of dask is '{}'. "
                         "The ability to include filenames in read_csv output "
                         "(``include_path_column``) was added in 0.19.0, so "
                         "pattern urlpaths are not supported.".format(DASK_VERSION))

    # Recorded before _path_column() is called, in the same order as the
    # membership test and the call were originally made.
    path_column_requested = 'include_path_column' in self._csv_kwargs
    path_column = self._path_column()

    self._dataframe = load_csv()

    # add the new columns to the dataframe
    self._set_pattern_columns(path_column)

    if not path_column_requested:
        self._dataframe = self._dataframe.drop([path_column], axis=1)
Example #11
Source File: From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _pandas_indexing(X, key, key_dtype, axis): """Index a pandas dataframe or a series.""" if hasattr(key, "shape"): # Work-around for indexing with read-only key in pandas # FIXME: solved in pandas 0.25 key = np.asarray(key) key = key if key.flags.writeable else key.copy() # check whether we should index with loc or iloc indexer = X.iloc if key_dtype == "int" else X.loc return indexer[:, key] if axis else indexer[key]
Example #12
Source File: From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _num_samples(X):
    """Return the sample count of ``X``, computing it if it is lazy.

    Delegates to ``sk_validation._num_samples``; when that yields a dask
    collection (e.g. for a dask dataframe) the value is computed first.
    """
    count = sk_validation._num_samples(X)
    # dask dataframe
    return count.compute() if dask.is_dask_collection(count) else count
Example #13
Source File: From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def transform(self, raw_X):
    """Transform a sequence of documents to a document-term matrix.

    Transformation is done in parallel, and correctly handles dask
    collections.

    Parameters
    ----------
    raw_X : dask.bag.Bag or dask.dataframe.Series, length = n_samples
        Each sample must be a text document (either bytes or
        unicode strings, file name or file object depending on the
        constructor argument) which will be tokenized and hashed.

    Returns
    -------
    X : dask.array.Array, shape = (n_samples, self.n_features)
        Document-term matrix. Each block of the array is a scipy sparse
        matrix.

    Notes
    -----
    The returned dask Array is composed of scipy sparse matrices. If you
    need to compute on the result immediately, you may need to convert the
    individual blocks to ndarrays or pydata/sparse matrices.

    >>> import sparse
    >>> X.map_blocks(sparse.COO.from_scipy_sparse, dtype=X.dtype)  # doctest: +SKIP

    See the :doc:`examples/text-vectorization` for more.
    """
    # The work is done entirely by the parent class implementation.
    return super().transform(raw_X)
Example #14
Source File: From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _collect_probas(self, X):
    """Stack the ``predict_proba`` output of every fitted estimator.

    Three input kinds are handled: dask arrays (one ``map_blocks`` call over
    all estimators), dask dataframes (one ``map_partitions`` call per
    estimator, stacked afterwards) and anything else (eager ``np.stack``).
    The estimators form the leading axis of the result.
    """
    if isinstance(X, da.Array):
        chunks = (len(self.estimators_), X.chunks[0], len(self.classes_))
        meta = np.array([], dtype="float64")
        # (n_estimators, len(X), n_classses)
        combined = X.map_blocks(
            _predict_proba_stack,
            estimators=self.estimators_,
            chunks=chunks,
            meta=meta,
        )
    elif isinstance(X, dd._Frame):
        # TODO: replace with a _predict_proba_stack version.
        # This current raises; dask.dataframe doesn't like map_partitions that
        # return new axes.
        # meta = np.empty((len(self.estimators_), 0, len(self.classes_)),
        #                 dtype="float64")
        # combined = X.map_partitions(_predict_proba_stack, meta=meta,
        #                             estimators=self.estimators_)
        # combined._chunks = ((len(self.estimators_),),
        #                     (np.nan,) * X.npartitions,
        #                     (len(X.columns),))
        meta = np.empty((0, len(self.classes_)), dtype="float64")
        probas = [
            X.map_partitions(_predict_proba, meta=meta, estimator=estimator)
            for estimator in self.estimators_
        ]
        # TODO: replace with da.stack
        # The private ``_chunks`` of every per-estimator result is overwritten
        # before stacking, and the stacked result's ``_chunks`` is then set
        # from the values captured from the first result.
        chunks = probas[0]._chunks
        for proba in probas:
            proba._chunks = ((1,) * len(chunks[0]), chunks[1])

        combined = da.stack(probas)
        combined._chunks = ((1,) * len(self.estimators_),) + chunks
    else:
        # ndarray, etc.
        combined = np.stack(
            [estimator.predict_proba(X) for estimator in self.estimators_]
        )

    return combined
Example #15
Source File: From kartothek with MIT License | 5 votes |
def test_reconstruct_dask_index_sorting(store_factory, monkeypatch):
    """Reading with ``dask_index_on`` must give partitions whose minimum
    index value equals the matching division, with ``set_index`` removed
    from ``dask.dataframe.shuffle``."""
    # Make sure we're not shuffling anything
    shuffle_module = dask.dataframe.shuffle
    monkeypatch.delattr(shuffle_module, shuffle_module.set_index.__name__)

    uuid = "dataset_uuid"
    index_col = "ColumnA"
    payload_col = "ColumnB"

    frame = pd.DataFrame(
        {
            index_col: np.random.randint(high=100000, low=-100000, size=(50,)),
            payload_col: 0,
        }
    )
    store_dataframes_as_dataset(
        store=store_factory, dataset_uuid=uuid, dfs=[frame], partition_on=index_col
    )
    ddf = read_dataset_as_ddf(
        dataset_uuid=uuid,
        store=store_factory,
        table="table",
        dask_index_on=index_col,
    )

    partition_minima = ddf.map_partitions(lambda df: df.index.min()).compute().values
    assert all(partition_minima == ddf.divisions[:-1])
Example #16
Source File: From kartothek with MIT License | 5 votes |
def test_pack_payload_pandas_empty(df_all_types):
    """Packing and then unpacking a zero-row frame must return it unchanged."""
    empty = df_all_types.iloc[:0]
    key_columns = [df_all_types.columns[-1]]

    packed = pack_payload_pandas(empty, group_key=key_columns)
    restored = unpack_payload_pandas(packed, unpack_meta=empty)

    pdt.assert_frame_equal(empty, restored)
Example #17
Source File: From kartothek with MIT License | 5 votes |
def test_pack_payload_pandas(df_all_types):
    """Packing a pandas frame must reduce its deep memory usage."""
    # For a single row dataframe the packing actually has a few more bytes,
    # so the fixture is repeated ten times.
    repeated = pd.concat([df_all_types] * 10, ignore_index=True)
    bytes_before = repeated.memory_usage(deep=True).sum()

    key_columns = list(repeated.columns[-2:])
    packed = pack_payload_pandas(repeated, group_key=key_columns)
    bytes_after = packed.memory_usage(deep=True).sum()

    assert bytes_after < bytes_before
Example #18
Source File: From kartothek with MIT License | 5 votes |
def test_pack_payload(df_all_types):
    """Packing a dask frame must reduce its deep memory usage."""
    # For a single row dataframe the packing actually has a few more bytes,
    # so the fixture is repeated ten times.
    repeated = pd.concat([df_all_types] * 10, ignore_index=True)
    ddf = dd.from_pandas(repeated, npartitions=3)
    bytes_before = ddf.memory_usage(deep=True).sum()

    key_columns = list(ddf.columns[-2:])
    packed = pack_payload(ddf, group_key=key_columns)
    bytes_after = packed.memory_usage(deep=True).sum()

    shrunk = bytes_after < bytes_before
    assert shrunk.compute()
Example #19
Source File: From kartothek with MIT License | 5 votes |
def test_update_dataset_from_ddf_empty(store_factory, shuffle):
    """Updating a dataset from a partition-less ddf must raise ValueError."""
    with pytest.raises(ValueError, match="Cannot store empty datasets"):
        empty_ddf = dask.dataframe.from_delayed([], meta=(("a", int),))
        pending = update_dataset_from_ddf(
            empty_ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=shuffle,
            partition_on=["a"],
        )
        pending.compute()
Example #20
Source File: From PyMove with MIT License | 5 votes |
def to_data_frame(self):
    """
    Converts trajectory data to DataFrame format.

    Returns
    -------
    dask.dataframe.DataFrame
        Represents the trajectory in DataFrame format.

    Notes
    -----
    The stored ``self._data`` object is returned as-is; no copy is made
    here.
    """
    return self._data
Example #21
Source File: From PyMove with MIT License | 5 votes |
def generate_weekend_features(self):
    """Create or update the feature weekend to the dataframe.

    Raises
    ------
    NotImplementedError
        Always; this method is not implemented yet.
    """
    raise NotImplementedError('To be implemented')
Example #22
Source File: From professional-services with Apache License 2.0 | 5 votes |
def find_vocab(self, df):
    """Collect the levels of each categorical column.

    Helps for creation of feature columns for use in API.

    Arguments:
        df : dask dataframe, Dataframe to extract the levels from
    Returns:
        A dictionary keyed by column name: 0 for numerical (non-object)
        columns and an array of the distinct values for categorical
        (object dtype) columns.
    """
    self.is_not_used()

    categorical = [name for name in df.columns if df[name].dtype == 'object']
    numerical = [name for name in df.columns if df[name].dtype != 'object']

    # dask.compute returns a tuple with one entry per argument; the single
    # argument here is the list of de-duplicated columns.
    (distinct_values,) = dask.compute(
        [df[name].drop_duplicates() for name in categorical])

    column_mapping = {name: 0 for name in numerical}
    for name, values in zip(categorical, distinct_values):
        column_mapping[name] = np.array(values)
    return column_mapping
Example #23
Source File: From PyMove with MIT License | 5 votes |
def show_trajectories_info(self):
    """Show dataset information from dataframe.

    Raises
    ------
    NotImplementedError
        Always; this method is not implemented yet.
    """
    raise NotImplementedError('To be implemented')
Example #24
Source File: From PyMove with MIT License | 5 votes |
def max(self, axis=None, skipna=True, split_every=False, out=None):
    """
    Return the maximum of the values for the requested axis.

    All arguments are forwarded positionally to ``self._data.max``.

    Parameters
    ----------
    axis: int, optional, default None, {index (0), columns (1)}.
        Axis for the function to be applied on.
    skipna: bool, optional, default True.
        Exclude NA/null values when computing the result.
    split_every: optional, default False.
        Forwarded unchanged to the underlying ``max``.
    out: optional, default None.
        Forwarded unchanged to the underlying ``max``.

    Returns
    -------
    max: Series or DataFrame (if level specified)
        The maximum values for the requested axis.
    """
    return self._data.max(axis, skipna, split_every, out)
Example #25
Source File: From predictatops with MIT License | 5 votes |
def makeDF(well_list):
    """
    Turn a list of wells into a pandas dataframe with a single column
    called "UWI_file", one row per list entry.
    """
    rows = [{"UWI_file": well} for well in well_list]
    return pd.DataFrame(rows)
Example #26
Source File: From PyMove with MIT License | 5 votes |
def min(self, axis=None, skipna=True, split_every=False, out=None):
    """
    Return the minimum of the values for the requested axis.

    All arguments are forwarded positionally to ``self._data.min``.

    Parameters
    ----------
    axis: int, optional, default None, {index (0), columns (1)}.
        Axis for the function to be applied on.
    skipna: bool, optional, default True.
        Exclude NA/null values when computing the result.
    split_every: optional, default False.
        Forwarded unchanged to the underlying ``min``.
    out: optional, default None.
        Forwarded unchanged to the underlying ``min``.

    Returns
    -------
    min: Series or DataFrame (if level specified)
        The minimum values for the requested axis.
    """
    return self._data.min(axis, skipna, split_every, out)
Example #27
Source File: From predictatops with MIT License | 5 votes |
def predict_from_model(self, model, df_X_toPredict):
    """
    Run an already-trained model on the given feature dataframe.

    ``model`` is a model that is already trained on training data (in the
    demo case a scikit-learn XGBoost model) and ``df_X_toPredict`` is the
    dataframe of the columns to predict from. The output of
    ``model.predict`` is stored in ``self.result_df_from_prediction``;
    nothing is returned.
    """
    self.result_df_from_prediction = model.predict(df_X_toPredict)
Example #28
Source File: From predictatops with MIT License | 5 votes |
def __init__(self, ML, vs, distClassDF_wRollingCols_training):
    """Copy split data and configuration onto the instance.

    ``ML`` supplies the train/test splits, their indexes and
    ``preSplitpreBal``; ``vs`` is indexed by string key for the column-name
    and window settings; ``distClassDF_wRollingCols_training`` is stored as
    ``calc_pred``. The remaining attributes start out empty or ``None``.
    """
    # self.knn_dir = ML.knn_dir
    # self.load_dir = ML.load_dir
    # self.features_dir = ML.features_dir
    # self.machine_learning_dir = ML.machine_learning_dir
    # self.h5_to_load = ML.h5_to_load
    self.train_X = ML.train_X
    self.train_y = ML.train_y
    self.test_X = ML.test_X
    self.test_y = ML.test_y
    self.train_index = ML.train_index
    self.test_index = ML.test_index
    self.preSplitpreBal = ML.preSplitpreBal
    self.result_df_from_prediction = None  # df
    ####
    ####
    self.vs = vs  # object instance from variables class
    self.depth_str = vs["depth_str"]
    self.pick_class_str = vs["pick_class_str"]
    self.UWI_str = vs["UWI_str"]
    self.rollingWindows = vs["rollingWindows"]
    self.distClassIntegersArray = vs["distClassIntegersArray"]
    ####
    self.calc_pred = distClassDF_wRollingCols_training
    self.excludeWellsThatOnlyHaveTheseClasses = (
        []
    )  ### aka dropIfOnlyClasses in optionallyExcludeWellsWithoutStrongPredictions()
    self.NoGoodWellsToExclude = (
        []
    )  #### UWIs of wells that only had zeros in the predicted distance class so these wells were excluded from accuracy prediction
    ####
    self.calc_pred_TopMcMr_Pick_pred_DEPT_pred = None  # df
    self.calc_pred_TopTarget_DEPTH = None  # df
    self.fullUWIsSet = []  ### set of UWIs in the dataframe
    self.precentWellsKept = 1
    self.UWIsSetSubsetKept = (
        []
    )  #### subset of the wells that have predictions that aren't just zero or something else not wanted
    ## if zeros, calc_pred is changed to without zeros and zerosExcluded Array is populated
Example #29
Source File: From professional-services with Apache License 2.0 | 5 votes |
def drop_cols(cls, df, col_names):
    """Drops any columns which are not required by the user.

    Arguments:
        df : dask dataframe, Dataframe of input data
        col_names : list, Columns in the data to be dropped
    Returns:
        dask dataframe, the result of ``df.drop(col_names, axis=1)``,
        i.e. the dataframe with the listed columns dropped
    """
    return df.drop(col_names, axis=1)
Example #30
Source File: From professional-services with Apache License 2.0 | 5 votes |
def dropping_zero_var_cols(cls, df, target_var, stddev_list):
    """Remove the zero-variance (constant) columns from the dataframe.

    A constant column cannot serve as the output column, so a target column
    with zero standard deviation is reported as an error.

    Arguments:
        df : dask dataframe, The dataframe to validate
        stddev_list : dask series, Series containing the standard deviation
            values for columns
        target_var : string, Dependent variable for the analysis
    Returns:
        df : dask dataframe, Dataframe with redundant columns removed
    Raises:
        AssertionError : If the target column has zero deviation
    """
    numeric_columns = [
        name for name in df.columns if df[name].dtype != 'object']

    for name in numeric_columns:
        if stddev_list[name] != 0.0:
            continue
        df = df.drop(name, axis=1)
        if name != target_var:
            continue
        err_msg = 'Target variable has zero standard deviation or a contant column. ' \
            'Please check the data'
        tf.logging.error(err_msg)
        raise AssertionError(err_msg)
    return df