Python dask.dataframe.DataFrame() Examples
The following are 30 code examples of dask.dataframe.DataFrame(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module dask.dataframe, or try the search function.
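Before the examples, a minimal sketch (not taken from any of the projects below): a dask.dataframe.DataFrame is normally obtained through a constructor function such as dd.from_pandas rather than by calling dd.DataFrame() directly.

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": range(10), "y": range(10)})
ddf = dd.from_pandas(pdf, npartitions=2)  # a dd.DataFrame split into two partitions

assert isinstance(ddf, dd.DataFrame)
print(ddf["x"].sum().compute())           # operations stay lazy until .compute()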
Example #1
Source File: dask_io.py From lambda-packs with MIT License | 6 votes |
def extract_dask_data(data):
  """Extract data from dask.Series or dask.DataFrame for predictors.

  Given a distributed dask.DataFrame or dask.Series containing columns or names
  for one or more predictors, this operation returns a single dask.DataFrame or
  dask.Series that can be iterated over.

  Args:
    data: A distributed dask.DataFrame or dask.Series.

  Returns:
    A dask.DataFrame or dask.Series that can be iterated over.
    If the supplied argument is neither a dask.DataFrame nor a dask.Series this
    operation returns it without modification.
  """
  if isinstance(data, allowed_classes):
    return _construct_dask_df_with_divisions(data)
  else:
    return data
Example #2
Source File: _update.py From kartothek with MIT License | 6 votes |
def pack_payload_pandas(partition: pd.DataFrame, group_key: List[str]) -> pd.DataFrame:
    try:
        # Technically distributed is an optional dependency
        from distributed.protocol import serialize_bytes
    except ImportError:
        _logger.warning(
            "Shuffle payload columns cannot be compressed since distributed is not installed."
        )
        return partition

    if partition.empty:
        res = partition[group_key]
        res[_PAYLOAD_COL] = b""
    else:
        res = partition.groupby(
            group_key,
            sort=False,
            observed=True,
            # Keep the as_index s.t. the group values are not dropped. With this
            # the behaviour seems to be consistent along pandas versions
            as_index=True,
        ).apply(lambda x: pd.Series({_PAYLOAD_COL: serialize_bytes(x)}))

        res = res.reset_index()

    return res
Example #3
Source File: _update.py From kartothek with MIT License | 6 votes |
def unpack_payload_pandas(
    partition: pd.DataFrame, unpack_meta: pd.DataFrame
) -> pd.DataFrame:
    """
    Revert ``pack_payload_pandas`` and restore the packed payload.

    unpack_meta:
        A dataframe indicating the schema of the unpacked payload.
    """
    try:
        # Technically distributed is an optional dependency
        from distributed.protocol import deserialize_bytes
    except ImportError:
        _logger.warning(
            "Shuffle payload columns cannot be compressed since distributed is not installed."
        )
        return partition

    if partition.empty:
        return unpack_meta.iloc[:0]

    mapped = partition[_PAYLOAD_COL].map(deserialize_bytes)

    return pd.concat(mapped.values, copy=False, ignore_index=True)
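A hedged sketch of how the two helpers above might round-trip a frame; it assumes `distributed` is installed and relies on the module-private `_PAYLOAD_COL`, so treat it as illustrative rather than a supported API:

import pandas as pd

# Hypothetical round trip using kartothek's module-level _PAYLOAD_COL.
df = pd.DataFrame({"g": [1, 1, 2], "v": [10.0, 20.0, 30.0]})
packed = pack_payload_pandas(df, group_key=["g"])    # one serialized blob per group
restored = unpack_payload_pandas(packed, unpack_meta=df.iloc[:0])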
Example #4
Source File: _update.py From kartothek with MIT License | 6 votes |
def unpack_payload(df: dd.DataFrame, unpack_meta: pd.DataFrame) -> dd.DataFrame:
    """Revert the payload packing of ``pack_payload`` and restore the full dataframe."""
    if (
        # https://github.com/pandas-dev/pandas/issues/34455
        isinstance(df._meta.index, pd.Float64Index)
        # TODO: Try to find out what's going on and file a bug report
        # For datetime indices the apply seems to be corrupt
        # s.t. apply(lambda x: x) returns different values
        or isinstance(df._meta.index, pd.DatetimeIndex)
    ):
        return df

    return df.map_partitions(
        unpack_payload_pandas, unpack_meta=unpack_meta, meta=unpack_meta
    )
Example #5
Source File: dask.py From cooler with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _get_group_info(path, grouppath, keys):
    with h5py.File(path, "r") as f:
        grp = f[grouppath]

        if keys is None:
            keys = list(grp.keys())

        nrows = len(grp[keys[0]])

        categoricals = {}
        for key in keys:
            dt = h5py.check_dtype(enum=grp[key].dtype)
            if dt is not None:
                categoricals[key] = sorted(dt, key=dt.__getitem__)

        # Meta is an empty dataframe that serves as a compound "dtype"
        meta = pd.DataFrame(
            {key: np.array([], dtype=grp[key].dtype) for key in keys}, columns=keys
        )
        for key in categoricals:
            meta[key] = pd.Categorical([], categories=categoricals[key], ordered=True)

    return nrows, keys, meta, categoricals
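The empty `meta` frame built above acts as a schema for lazily constructed partitions. A minimal sketch of how such a meta is typically consumed; `load_chunk` and `chunk_bounds` are hypothetical stand-ins for the real HDF5 chunk reader:

from dask import delayed
import dask.dataframe as dd

# `load_chunk` is a hypothetical reader returning a pd.DataFrame for rows
# [lo, hi); `chunk_bounds` is a hypothetical list of partition boundaries.
parts = [delayed(load_chunk)(path, grouppath, keys, lo, hi)
         for lo, hi in chunk_bounds]
ddf = dd.from_delayed(parts, meta=meta)  # meta supplies dtypes/categories; no data is read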
Example #6
Source File: utils.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def to_indexable(*args, **kwargs):
    """Ensure that all args are an indexable type.

    Conversion runs lazily for dask objects, immediately otherwise.

    Parameters
    ----------
    args : array_like or scalar
    allow_scalars : bool, optional
        Whether to allow scalars in args. Default is False.
    """
    if kwargs.get("allow_scalars", False):
        indexable = _maybe_indexable
    else:
        indexable = _indexable
    for x in args:
        if x is None or isinstance(x, (da.Array, dd.DataFrame)):
            yield x
        elif is_dask_collection(x):
            yield delayed(indexable, pure=True)(x)
        else:
            yield indexable(x)
Example #7
Source File: utils.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def to_keys(dsk, *args):
    for x in args:
        if x is None:
            yield None
        elif isinstance(x, (da.Array, dd.DataFrame)):
            x = delayed(x)
            dsk.update(x.dask)
            yield x.key
        elif isinstance(x, Delayed):
            dsk.update(x.dask)
            yield x.key
        else:
            assert not is_dask_collection(x)
            key = type(x).__name__ + "-" + tokenize(x)
            dsk[key] = x
            yield key
Example #8
Source File: util.py From holoviews with BSD 3-Clause "New" or "Revised" License | 6 votes |
def groupby_pandas(self_or_cls, ndmapping, dimensions, container_type,
                   group_type, sort=False, **kwargs):
    if 'kdims' in kwargs:
        idims = [ndmapping.get_dimension(d) for d in kwargs['kdims']]
    else:
        idims = [dim for dim in ndmapping.kdims if dim not in dimensions]

    all_dims = [d.name for d in ndmapping.kdims]
    inds = [ndmapping.get_dimension_index(dim) for dim in idims]
    getter = operator.itemgetter(*inds) if inds else lambda x: tuple()

    multi_index = pd.MultiIndex.from_tuples(ndmapping.keys(), names=all_dims)
    df = pd.DataFrame(list(map(wrap_tuple, ndmapping.values())), index=multi_index)

    # TODO: Look at sort here
    kwargs = dict(dict(get_param_values(ndmapping), kdims=idims), sort=sort, **kwargs)
    groups = ((wrap_tuple(k), group_type(OrderedDict(unpack_group(group, getter)), **kwargs))
              for k, group in df.groupby(level=[d.name for d in dimensions], sort=sort))

    if sort:
        selects = list(get_unique_keys(ndmapping, dimensions))
        groups = sorted(groups, key=lambda x: selects.index(x[0]))
    return container_type(groups, kdims=dimensions, sort=sort)
Example #9
Source File: dask_io.py From deep_image_model with Apache License 2.0 | 6 votes |
def extract_dask_data(data):
  """Extract data from dask.Series or dask.DataFrame for predictors.

  Given a distributed dask.DataFrame or dask.Series containing columns or names
  for one or more predictors, this operation returns a single dask.DataFrame or
  dask.Series that can be iterated over.

  Args:
    data: A distributed dask.DataFrame or dask.Series.

  Returns:
    A dask.DataFrame or dask.Series that can be iterated over.
    If the supplied argument is neither a dask.DataFrame nor a dask.Series this
    operation returns it without modification.
  """
  if isinstance(data, allowed_classes):
    return _construct_dask_df_with_divisions(data)
  else:
    return data
Example #10
Source File: data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def inverse_transform(
    self,
    X: Union[ArrayLike, DataFrameType],
    y: Optional[Union[ArrayLike, SeriesType]] = None,
    copy: Optional[bool] = None,
) -> Union[ArrayLike, DataFrameType]:
    if not hasattr(self, "scale_"):
        raise Exception(
            "This %(name)s instance is not fitted yet. "
            "Call 'fit' with appropriate arguments before "
            "using this method."
        )
    X = X.copy()
    if isinstance(X, dd.DataFrame):
        X = X.sub(self.min_)
        X = X.div(self.scale_)
    else:
        X -= self.min_
        X /= self.scale_

    return X
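This inverse_transform matches dask-ml's MinMaxScaler (note the dd.DataFrame branch above). A minimal usage sketch:

import pandas as pd
import dask.dataframe as dd
from dask_ml.preprocessing import MinMaxScaler

ddf = dd.from_pandas(pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}), npartitions=2)
scaler = MinMaxScaler()
scaled = scaler.fit_transform(ddf)            # rescales each column to [0, 1]
restored = scaler.inverse_transform(scaled)   # the dd.DataFrame branch undoes it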
Example #11
Source File: data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _check_inputs(
    self,
    X: Union[ArrayLike, DataFrameType],
    accept_sparse_negative: bool = False,
    copy: bool = False,
    in_fit: bool = True,
) -> Union[ArrayLike, DataFrameType]:
    if isinstance(X, (pd.DataFrame, dd.DataFrame)):
        X = X.values
    if isinstance(X, np.ndarray):
        C = len(X) // min(multiprocessing.cpu_count(), 2)
        X = da.from_array(X, chunks=C)

    rng = check_random_state(self.random_state)
    # TODO: non-float dtypes?
    # TODO: sparse arrays?
    # TODO: mix of sparse, dense?
    sample = rng.uniform(size=(5, X.shape[1])).astype(X.dtype)
    super(QuantileTransformer, self)._check_inputs(
        sample,
        accept_sparse_negative=accept_sparse_negative,
        copy=copy,
        in_fit=in_fit,
    )
    return X
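The `super()` call shows this `_check_inputs` belongs to dask-ml's QuantileTransformer. A minimal usage sketch with a dask array:

import dask.array as da
from dask_ml.preprocessing import QuantileTransformer

X = da.random.uniform(size=(1000, 3), chunks=(250, 3))
qt = QuantileTransformer(output_distribution="uniform")
Xt = qt.fit_transform(X)  # maps each feature to an approximately uniform distribution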
Example #12
Source File: dask_io.py From auto-alt-text-lambda-api with MIT License | 6 votes |
def extract_dask_data(data):
  """Extract data from dask.Series or dask.DataFrame for predictors.

  Given a distributed dask.DataFrame or dask.Series containing columns or names
  for one or more predictors, this operation returns a single dask.DataFrame or
  dask.Series that can be iterated over.

  Args:
    data: A distributed dask.DataFrame or dask.Series.

  Returns:
    A dask.DataFrame or dask.Series that can be iterated over.
    If the supplied argument is neither a dask.DataFrame nor a dask.Series this
    operation returns it without modification.
  """
  if isinstance(data, allowed_classes):
    return _construct_dask_df_with_divisions(data)
  else:
    return data
Example #13
Source File: batch.py From batchflow with Apache License 2.0 | 6 votes |
def _load_table(self, src, fmt, dst=None, post=None, *args, **kwargs):
    """ Load a data frame from table formats: csv, hdf5, feather """
    if fmt == 'csv':
        _data = pd.read_csv(src, *args, **kwargs)
    elif fmt == 'feather':
        _data = feather.read_dataframe(src, *args, **kwargs)
    elif fmt == 'hdf5':
        _data = pd.read_hdf(src, *args, **kwargs)

    # Put into this batch only part of it (defined by index)
    if isinstance(_data, pd.DataFrame):
        _data = _data.loc[self.indices]
    elif isinstance(_data, dd.DataFrame):
        # dask.DataFrame.loc supports advanced indexing only with lists
        _data = _data.loc[list(self.indices)].compute()

    if callable(post):
        _data = post(_data, src=src, fmt=fmt, dst=dst, **kwargs)
    self.load(src=_data, dst=dst)
Example #14
Source File: dask.py From holoviews with BSD 3-Clause "New" or "Revised" License | 5 votes |
def aggregate(cls, dataset, dimensions, function, **kwargs):
    data = dataset.data
    cols = [d.name for d in dataset.kdims if d in dimensions]
    vdims = dataset.dimensions('value', label='name')
    dtypes = data.dtypes
    numeric = [c for c, dtype in zip(dtypes.index, dtypes.values)
               if dtype.kind in 'iufc' and c in vdims]
    reindexed = data[cols + numeric]

    inbuilts = {'amin': 'min', 'amax': 'max', 'mean': 'mean',
                'std': 'std', 'sum': 'sum', 'var': 'var'}
    if len(dimensions):
        groups = reindexed.groupby(cols)
        if function.__name__ in inbuilts:
            agg = getattr(groups, inbuilts[function.__name__])()
        else:
            agg = groups.apply(function)
        df = agg.reset_index()
    else:
        if function.__name__ in inbuilts:
            agg = getattr(reindexed, inbuilts[function.__name__])()
        else:
            raise NotImplementedError
        df = pd.DataFrame(agg.compute()).T

    dropped = []
    for vd in vdims:
        if vd not in df.columns:
            dropped.append(vd)
    return df, dropped
Example #15
Source File: data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def fit(
    self,
    X: Union[ArrayLike, DataFrameType],
    y: Optional[Union[ArrayLike, SeriesType]] = None,
) -> "StandardScaler":
    self._reset()
    attributes = OrderedDict()
    if isinstance(X, (pd.DataFrame, dd.DataFrame)):
        X = X.values

    if self.with_mean:
        mean_ = nanmean(X, 0)
        attributes["mean_"] = mean_
    if self.with_std:
        var_ = nanvar(X, 0)
        scale_ = var_.copy()
        scale_[scale_ == 0] = 1
        scale_ = da.sqrt(scale_)
        attributes["scale_"] = scale_
        attributes["var_"] = var_

    attributes["n_samples_seen_"] = np.nan
    values = compute(*attributes.values())
    for k, v in zip(attributes, values):
        setattr(self, k, v)
    self.n_features_in_ = X.shape[1]
    return self
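A minimal usage sketch for the StandardScaler.fit above, on a chunked dask array:

import dask.array as da
from dask_ml.preprocessing import StandardScaler

X = da.random.normal(10, 2, size=(1000, 4), chunks=(250, 4))
scaler = StandardScaler().fit(X)   # computes mean_ / scale_ across all chunks
Xt = scaler.transform(X)           # lazily centered and scaled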
Example #16
Source File: data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def fit(
    self, X: DataFrameType, y: Optional[Union[ArrayLike, SeriesType]] = None
) -> "Categorizer":
    """Find the categorical columns.

    Parameters
    ----------
    X : pandas.DataFrame or dask.DataFrame
    y : ignored

    Returns
    -------
    self
    """
    X = self._check_array(X)

    if self.categories is not None:
        # some basic validation
        columns = pd.Index(self.categories)
        categories = self.categories
    elif isinstance(X, pd.DataFrame):
        columns, categories = self._fit(X)
    else:
        columns, categories = self._fit_dask(X)

    self.columns_ = columns
    self.categories_ = categories
    return self
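A minimal usage sketch for Categorizer, which converts object-dtype columns of a pandas or dask DataFrame to categorical dtype:

import pandas as pd
import dask.dataframe as dd
from dask_ml.preprocessing import Categorizer

ddf = dd.from_pandas(pd.DataFrame({"a": ["x", "y", "x"], "b": [1, 2, 3]}), npartitions=1)
out = Categorizer().fit_transform(ddf)  # "a" becomes categorical; numeric "b" is untouched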
Example #17
Source File: data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _check_array(self, X: DataFrameType) -> DataFrameType:
    # TODO: refactor to check_array
    if not isinstance(X, (pd.DataFrame, dd.DataFrame)):
        raise TypeError(
            "Expected a pandas or dask DataFrame, got "
            "{} instead".format(type(X))
        )
    return X
Example #18
Source File: data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def fit(
    self,
    X: Union[ArrayLike, DataFrameType],
    y: Optional[Union[ArrayLike, SeriesType]] = None,
) -> "RobustScaler":
    q_min, q_max = self.quantile_range
    if not 0 <= q_min <= q_max <= 100:
        raise ValueError("Invalid quantile range: %s" % str(self.quantile_range))

    if isinstance(X, dd.DataFrame):
        n_columns = len(X.columns)
        partition_lengths = X.map_partitions(len).compute()
        dtype = np.find_common_type(X.dtypes, [])
        blocks = X.to_delayed()
        X = da.vstack(
            [
                da.from_delayed(
                    block.values, shape=(length, n_columns), dtype=dtype
                )
                for block, length in zip(blocks, partition_lengths)
            ]
        )

    quantiles: Any = [da.percentile(col, [q_min, 50.0, q_max]) for col in X.T]
    quantiles = da.vstack(quantiles).compute()
    self.center_: List[float] = quantiles[:, 1]
    self.scale_: List[float] = quantiles[:, 2] - quantiles[:, 0]
    self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
    self.n_features_in_ = X.shape[1]
    return self
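A minimal usage sketch for the RobustScaler.fit above; it centers on the median and scales by the given quantile range instead of mean and standard deviation:

import dask.array as da
from dask_ml.preprocessing import RobustScaler

X = da.random.normal(0, 1, size=(1000, 3), chunks=(250, 3))
rs = RobustScaler(quantile_range=(25.0, 75.0)).fit(X)
Xt = rs.transform(X)  # (X - center_) / scale_, robust to outliers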
Example #19
Source File: data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def transform(
    self, X: DataFrameType, y: Optional[Union[ArrayLike, SeriesType]] = None
) -> DataFrameType:
    """Dummy encode the categorical columns in X

    Parameters
    ----------
    X : pd.DataFrame or dd.DataFrame
    y : ignored

    Returns
    -------
    transformed : pd.DataFrame or dd.DataFrame
        Same type as the input
    """
    if not X.columns.equals(self.columns_):
        raise ValueError(
            "Columns of 'X' do not match the training "
            "columns. Got {!r}, expected {!r}".format(X.columns, self.columns_)
        )
    if isinstance(X, pd.DataFrame):
        return pd.get_dummies(X, drop_first=self.drop_first, columns=self.columns)
    elif isinstance(X, dd.DataFrame):
        return dd.get_dummies(X, drop_first=self.drop_first, columns=self.columns)
    else:
        raise TypeError("Unexpected type {}".format(type(X)))
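DummyEncoder requires its input columns to already be categorical with known categories, so it is typically chained after Categorizer; a minimal sketch:

import pandas as pd
import dask.dataframe as dd
from dask_ml.preprocessing import Categorizer, DummyEncoder

ddf = dd.from_pandas(pd.DataFrame({"a": ["x", "y", "x"]}), npartitions=1)
encoded = DummyEncoder().fit_transform(Categorizer().fit_transform(ddf))
print(encoded.columns)  # one indicator column per category of "a"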
Example #20
Source File: util.py From hvplot with BSD 3-Clause "New" or "Revised" License | 5 votes |
def is_geodataframe(data):
    if 'spatialpandas' in sys.modules:
        import spatialpandas as spd
        if isinstance(data, spd.GeoDataFrame):
            return True
    return (isinstance(data, pd.DataFrame) and
            hasattr(data, 'geom_type') and hasattr(data, 'geometry'))
Example #21
Source File: data.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def transform(
    self, X: DataFrameType, y: Optional[Union[ArrayLike, SeriesType]] = None
) -> DataFrameType:
    """Ordinal encode the categorical columns in X

    Parameters
    ----------
    X : pd.DataFrame or dd.DataFrame
    y : ignored

    Returns
    -------
    transformed : pd.DataFrame or dd.DataFrame
        Same type as the input
    """
    if not X.columns.equals(self.columns_):
        raise ValueError(
            "Columns of 'X' do not match the training "
            "columns. Got {!r}, expected {!r}".format(X.columns, self.columns_)
        )
    if not isinstance(X, (pd.DataFrame, dd.DataFrame)):
        raise TypeError("Unexpected type {}".format(type(X)))

    X = X.copy()
    for col in self.categorical_columns_:
        X[col] = X[col].cat.codes
    return X
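OrdinalEncoder likewise expects categorical columns, so chaining it after Categorizer is the usual pattern; a minimal sketch:

import pandas as pd
import dask.dataframe as dd
from dask_ml.preprocessing import Categorizer, OrdinalEncoder

ddf = dd.from_pandas(pd.DataFrame({"a": ["x", "y", "x"]}), npartitions=1)
codes = OrdinalEncoder().fit_transform(Categorizer().fit_transform(ddf))
print(codes["a"].dtype)  # integer category codes instead of one-hot columns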
Example #22
Source File: dask.py From hvplot with BSD 3-Clause "New" or "Revised" License | 5 votes |
def patch(name='hvplot', extension='bokeh', logo=False):
    from . import hvPlotTabular, post_patch

    try:
        import dask.dataframe as dd
    except ImportError:
        raise ImportError('Could not patch plotting API onto dask. '
                          'Dask could not be imported.')
    _patch_plot = lambda self: hvPlotTabular(self)
    _patch_plot.__doc__ = hvPlotTabular.__call__.__doc__
    patch_property = property(_patch_plot)
    setattr(dd.DataFrame, name, patch_property)
    setattr(dd.Series, name, patch_property)

    post_patch(extension, logo)
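In practice this patch is applied by importing hvplot's dask module rather than calling patch() directly; a minimal sketch:

import pandas as pd
import dask.dataframe as dd
import hvplot.dask  # noqa: importing this module attaches .hvplot to dd.DataFrame/dd.Series

ddf = dd.from_pandas(pd.DataFrame({"x": range(10), "y": range(10)}), npartitions=2)
plot = ddf.hvplot.line(x="x", y="y")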
Example #23
Source File: testdaskinterface.py From holoviews with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_dataset_from_multi_index(self):
    raise SkipTest("Temporarily skipped")
    df = pd.DataFrame({'x': np.arange(10), 'y': np.arange(10),
                       'z': np.random.rand(10)})
    ddf = dd.from_pandas(df, 1)
    ds = Dataset(ddf.groupby(['x', 'y']).mean(), ['x', 'y'])
    self.assertEqual(ds, Dataset(df, ['x', 'y']))
Example #24
Source File: dask.py From holoviews with BSD 3-Clause "New" or "Revised" License | 5 votes |
def init(cls, eltype, data, kdims, vdims):
    import dask.dataframe as dd

    data, dims, extra = PandasInterface.init(eltype, data, kdims, vdims)
    if not isinstance(data, dd.DataFrame):
        data = dd.from_pandas(data, npartitions=cls.default_partitions, sort=False)

    kdims = [d.name if isinstance(d, Dimension) else d for d in dims['kdims']]

    # If a key dimension can be found, speculatively reset index
    # to work around lacking dask support for MultiIndex
    if any(d for d in kdims if d not in data.columns):
        reset = data.reset_index()
        if all(d for d in kdims if d in reset.columns):
            data = reset

    return data, dims, extra
Example #25
Source File: testdaskinterface.py From holoviews with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_dataset_from_multi_index_tuple_dims(self):
    raise SkipTest("Temporarily skipped")
    df = pd.DataFrame({'x': np.arange(10), 'y': np.arange(10),
                       'z': np.random.rand(10)})
    ddf = dd.from_pandas(df, 1)
    ds = Dataset(ddf.groupby(['x', 'y']).mean(), [('x', 'X'), ('y', 'Y')])
    self.assertEqual(ds, Dataset(df, [('x', 'X'), ('y', 'Y')]))
Example #26
Source File: dask.py From holoviews with BSD 3-Clause "New" or "Revised" License | 5 votes |
def applies(cls, obj):
    if not cls.loaded():
        return False
    import dask.dataframe as dd
    return isinstance(obj, (dd.DataFrame, dd.Series))
Example #27
Source File: testdaskinterface.py From holoviews with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_dataset_range_categorical_dimension(self):
    ddf = dd.from_pandas(pd.DataFrame({'a': ['1', '2', '3']}), 1)
    ds = Dataset(ddf)
    self.assertEqual(ds.range(0), ('1', '3'))
Example #28
Source File: util.py From holoviews with BSD 3-Clause "New" or "Revised" License | 5 votes |
def is_dataframe(data):
    """
    Checks whether the supplied data is of DataFrame type.
    """
    dd = None
    if 'dask.dataframe' in sys.modules and 'pandas' in sys.modules:
        import dask.dataframe as dd
    return ((pd is not None and isinstance(data, pd.DataFrame)) or
            (dd is not None and isinstance(data, dd.DataFrame)))
Example #29
Source File: testdaskinterface.py From holoviews with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_select_expression_lazy(self):
    df = pd.DataFrame({
        'a': [1, 2, 3, 4, 5],
        'b': [10, 10, 11, 11, 10],
    })
    ddf = dd.from_pandas(df, npartitions=2)
    ds = Dataset(ddf)
    new_ds = ds.select(selection_expr=dim('b') == 10)

    # Make sure that selecting by expression didn't cause evaluation
    self.assertIsInstance(new_ds.data, dd.DataFrame)
    self.assertEqual(new_ds.data.compute(), df[df.b == 10])
Example #30
Source File: testdaskinterface.py From holoviews with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_dataset_range_categorical_dimension_empty(self):
    ddf = dd.from_pandas(pd.DataFrame({'a': ['1', '2', '3']}), 1)
    ds = Dataset(ddf).iloc[:0]
    ds_range = ds.range(0)
    self.assertTrue(np.isnan(ds_range[0]))
    self.assertTrue(np.isnan(ds_range[1]))