Python dask.array.concatenate() Examples
The following are 30 code examples of dask.array.concatenate(), collected from open-source projects and ordered by community votes (the count shown next to each example's license). You may also want to check out the other functions and classes available in the dask.array module.
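
Before the project examples, here is a minimal, self-contained sketch of the basic behaviour of dask.array.concatenate: it lazily joins dask arrays along an existing axis, and the chunk structure of the inputs carries over into the result. The shapes and chunk sizes below are made up purely for illustration.

import dask.array as da

# Two lazy arrays chunked along the first axis.
x = da.ones((4, 3), chunks=(2, 3))
y = da.zeros((2, 3), chunks=(2, 3))

# Join them along axis 0; this only builds a task graph, nothing is computed yet.
z = da.concatenate([x, y], axis=0)
print(z.shape)   # (6, 3)
print(z.chunks)  # ((2, 2, 2), (3,)) -- the input chunks are preserved

# compute() materializes the result as a NumPy array.
print(z.compute().sum())  # 12.0

Most of the examples below apply the same operation at a larger scale: build per-block results (often via dask.delayed and da.from_delayed) and stitch them back together with da.concatenate.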
Example #1
Source File: test_split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_blockwise_shufflesplit():
    splitter = dask_ml.model_selection.ShuffleSplit(random_state=0)
    assert splitter.get_n_splits() == 10
    gen = splitter.split(dX)

    train_idx, test_idx = next(gen)
    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (99,)  # 90% of 110
    assert test_idx.shape == (11,)

    assert train_idx.chunks == ((45, 45, 9),)
    assert test_idx.chunks == ((5, 5, 1),)

    counts = pd.value_counts(train_idx.compute())
    assert counts.max() == 1

    N = len(X)
    np.testing.assert_array_equal(
        np.unique(da.concatenate([train_idx, test_idx])), np.arange(N)
    )
Example #2
Source File: __init__.py From pyresample with GNU Lesser General Public License v3.0 | 6 votes |
def _concatenate_chunks(chunks):
    """Concatenate chunks to full output array."""
    # Form the full array
    col, res = [], []
    prev_y = 0
    for y, x in sorted(chunks):
        if len(chunks[(y, x)]) > 1:
            chunk = da.nanmax(da.stack(chunks[(y, x)], axis=-1), axis=-1)
        else:
            chunk = chunks[(y, x)][0]
        if y == prev_y:
            col.append(chunk)
            continue
        res.append(da.concatenate(col, axis=1))
        col = [chunk]
        prev_y = y
    res.append(da.concatenate(col, axis=1))
    res = da.concatenate(res, axis=2).squeeze()

    return res
Example #3
Source File: utils.py From xmitgcm with MIT License | 6 votes |
def find_concat_dim(da, possible_concat_dims):
    """ look for available dimensions in dataarray and pick the one
    from a list of candidates

    PARAMETERS
    ----------
    da : xarray.DataArray
        xmitgcm llc data array
    possible_concat_dims : list
        list of potential dims

    RETURNS
    -------
    out : str
        dimension on which to concatenate
    """
    out = None
    for d in possible_concat_dims:
        if d in da.dims:
            out = d
    return out
Example #4
Source File: utilities.py From minian with GNU General Public License v3.0 | 6 votes |
def handle_crash(varr, vpath, ssname, vlist, varr_list, frame_dict):
    seg1_list = list(filter(lambda v: re.search('seg1', v), vlist))
    seg2_list = list(filter(lambda v: re.search('seg2', v), vlist))
    if seg1_list and seg2_list:
        tframe = frame_dict[ssname]
        varr1 = darr.concatenate(
            list(compress(varr_list, seg1_list)), axis=0)
        varr2 = darr.concatenate(
            list(compress(varr_list, seg2_list)), axis=0)
        fm1, fm2 = varr1.shape[0], varr2.shape[0]
        fm_crds = varr.coords['frame']
        fm_crds1 = fm_crds.sel(frame=slice(None, fm1 - 1)).values
        fm_crds2 = fm_crds.sel(frame=slice(fm1, None)).values
        fm_crds2 = fm_crds2 + (tframe - fm_crds2.max())
        fm_crds_new = np.concatenate([fm_crds1, fm_crds2], axis=0)
        return varr.assign_coords(frame=fm_crds_new)
    else:
        return varr
Example #5
Source File: test__order.py From dask-image with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_order_comprehensions(da_func, kwargs):
    np.random.seed(0)

    a = np.random.random((3, 12, 14))
    d = da.from_array(a, chunks=(3, 6, 7))

    l2s = [da_func(d[i], **kwargs) for i in range(len(d))]
    l2c = [da_func(d[i], **kwargs)[None] for i in range(len(d))]

    dau.assert_eq(np.stack(l2s), da.stack(l2s))
    dau.assert_eq(np.concatenate(l2c), da.concatenate(l2c))
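
A short aside on the pattern used in this test and in the dask-image tests that follow (Examples #14 to #18): indexing a slice with [None] adds a length-1 leading axis, so concatenating those slices along axis 0 gives the same result as stacking the original slices. Here is a minimal sketch of that equivalence, with an invented array:

import numpy as np
import dask.array as da

d = da.from_array(np.arange(24).reshape(2, 3, 4), chunks=(1, 3, 4))

slices = [d[i] for i in range(len(d))]          # each slice has shape (3, 4)
slices_1 = [d[i][None] for i in range(len(d))]  # each has shape (1, 3, 4)

stacked = da.stack(slices)               # shape (2, 3, 4), via a new leading axis
concatenated = da.concatenate(slices_1)  # shape (2, 3, 4), along the existing axis

assert (stacked.compute() == concatenated.compute()).all()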
Example #6
Source File: test_split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_kfold(shuffle):
    splitter = dask_ml.model_selection.KFold(
        n_splits=5, random_state=0, shuffle=shuffle
    )
    assert splitter.get_n_splits() == 5
    gen = splitter.split(dX)

    train_idx, test_idx = next(gen)
    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (88,)  # 80% of 110
    assert test_idx.shape == (22,)

    assert train_idx.chunks == ((28, 50, 10),)
    assert test_idx.chunks == ((22,),)

    counts = pd.value_counts(train_idx.compute())
    assert counts.max() == 1

    N = len(X)
    np.testing.assert_array_equal(
        np.unique(da.concatenate([train_idx, test_idx])), np.arange(N)
    )

    expected_chunks = [
        (((22, 6, 50, 10),), ((22,),)),
        (((44, 34, 10),), ((6, 16),)),
        (((50, 16, 12, 10),), ((22,),)),
        (((50, 38),), ((12, 10),)),
    ]

    for (exp_train_idx, exp_test_idx), (train_idx, test_idx) in zip(
        expected_chunks, gen
    ):
        assert train_idx.chunks == exp_train_idx
        assert test_idx.chunks == exp_test_idx
Example #7
Source File: spectral.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _slice_mostly_sorted(array, keep, rest, ind=None):
    """Slice dask array `array` that is almost entirely sorted already.

    We perform approximately `2 * len(keep)` slices on `array`.
    This is OK, since `keep` is small. Individually, each of these slices
    is entirely sorted.

    Parameters
    ----------
    array : dask.array.Array
    keep : ndarray[Int]
        This must be sorted.
    rest : ndarray[Bool]
    ind : ndarray[Int], optional

    Returns
    -------
    sliced : dask.array.Array
    """
    if ind is None:
        ind = np.arange(len(array))
    idx = np.argsort(np.concatenate([keep, ind[rest]]))
    slices = []
    if keep[0] > 0:  # avoid creating empty slices
        slices.append(slice(None, keep[0]))
    slices.append([keep[0]])
    windows = zip(keep[:-1], keep[1:])

    for l, r in windows:
        if r > l + 1:  # avoid creating empty slices
            slices.append(slice(l + 1, r))
        slices.append([r])

    if keep[-1] < len(array) - 1:  # avoid creating empty slices
        slices.append(slice(keep[-1] + 1, None))
    result = da.concatenate([array[idx[slice_]] for slice_ in slices])
    return result
Example #8
Source File: _split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _blockwise_slice(arr, idx):
    """Slice an array that is blockwise-aligned with idx.

    Parameters
    ----------
    arr : Dask array
    idx : Dask array
        Should have the following properties

        * Same blocks as `arr` along the first dimension
        * Contains only integers
        * Each block's values should be between ``[0, len(block))``

    Returns
    -------
    sliced : dask.Array
    """
    objs = []
    offsets = np.hstack([0, np.cumsum(arr.chunks[0])[:-1]])

    for i, (x, idx2) in enumerate(
        zip(arr.to_delayed().ravel(), idx.to_delayed().ravel())
    ):
        idx3 = idx2 - offsets[i]
        objs.append(x[idx3])

    shapes = idx.chunks[0]
    if arr.ndim == 2:
        P = arr.shape[1]
        shapes = [(x, P) for x in shapes]
    else:
        shapes = [(x,) for x in shapes]

    sliced = da.concatenate(
        [
            da.from_delayed(x, shape=shape, dtype=arr.dtype)
            for x, shape in zip(objs, shapes)
        ]
    )
    return sliced
Example #9
Source File: _split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _split_blockwise(self, X, seeds):
    chunks = X.chunks[0]
    train_pct, test_pct = _maybe_normalize_split_sizes(
        self.train_size, self.test_size
    )
    sizes = [_validate_shuffle_split(c, test_pct, train_pct) for c in chunks]

    objs = [
        dask.delayed(_generate_idx, nout=2)(chunksize, seed, n_train, n_test)
        for chunksize, seed, (n_train, n_test) in zip(chunks, seeds, sizes)
    ]

    train_objs, test_objs = zip(*objs)
    offsets = np.hstack([0, np.cumsum(chunks)])
    train_idx = da.concatenate(
        [
            da.from_delayed(x + offset, (train_size,), np.dtype("int"))
            for x, chunksize, (train_size, _), offset in zip(
                train_objs, chunks, sizes, offsets
            )
        ]
    )
    test_idx = da.concatenate(
        [
            da.from_delayed(x + offset, (test_size,), np.dtype("int"))
            for x, chunksize, (_, test_size), offset in zip(
                test_objs, chunks, sizes, offsets
            )
        ]
    )
    return train_idx, test_idx
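
Several of the dask-ml examples in this listing (this one, and Examples #8, #11, and #28) share the same recipe: run a function per block with dask.delayed, wrap each delayed result back into a dask array with da.from_delayed, and glue the pieces together with da.concatenate. Below is a standalone sketch of that recipe; the per_block function, the chunk sizes, and the offsets are invented for illustration only.

import numpy as np
import dask
import dask.array as da

def per_block(n, offset):
    # Hypothetical per-block work: return the global indices covered by this block.
    return np.arange(n) + offset

chunks = (3, 4, 2)
offsets = np.concatenate([[0], np.cumsum(chunks)[:-1]])

delayed_blocks = [dask.delayed(per_block)(n, off) for n, off in zip(chunks, offsets)]
arrays = [
    da.from_delayed(blk, shape=(n,), dtype=np.int64)
    for blk, n in zip(delayed_blocks, chunks)
]
result = da.concatenate(arrays)
print(result.compute())  # [0 1 2 3 4 5 6 7 8]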
Example #10
Source File: text.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def transform(self, raw_X):
    msg = "'X' should be a 1-dimensional array with length 'num_samples'."

    if not dask.is_dask_collection(raw_X):
        return self._hasher(**self.get_params()).transform(raw_X)

    if isinstance(raw_X, db.Bag):
        bag2 = raw_X.map_partitions(self._transformer)
        objs = bag2.to_delayed()
        arrs = [
            da.from_delayed(obj, (np.nan, self.n_features), self.dtype)
            for obj in objs
        ]
        result = da.concatenate(arrs, axis=0)
    elif isinstance(raw_X, dd.Series):
        result = raw_X.map_partitions(self._transformer)
    elif isinstance(raw_X, da.Array):  # dask.Array
        chunks = ((np.nan,) * raw_X.numblocks[0], (self.n_features,))
        if raw_X.ndim == 1:
            result = raw_X.map_blocks(
                self._transformer, dtype="f8", chunks=chunks, new_axis=1
            )
        else:
            raise ValueError(msg)
    else:
        raise ValueError(msg)

    meta = scipy.sparse.eye(0, format="csr")
    result._meta = meta
    return result
Example #11
Source File: pairwise.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def pairwise_distances_argmin_min(
    X: ArrayLike,
    Y: ArrayLike,
    axis: int = 1,
    metric: Union[str, Callable[[ArrayLike, ArrayLike], float]] = "euclidean",
    batch_size: Optional[int] = None,
    metric_kwargs: Optional[Dict[str, Any]] = None,
):
    if batch_size is not None:
        msg = "'batch_size' is deprecated. Use sklearn.config_context instead."
        warnings.warn(msg, FutureWarning)
    XD = X.to_delayed().flatten().tolist()
    func = delayed(metrics.pairwise_distances_argmin_min, pure=True, nout=2)
    blocks = [func(x, Y, metric=metric, metric_kwargs=metric_kwargs) for x in XD]
    argmins, mins = zip(*blocks)

    argmins = [
        da.from_delayed(block, (chunksize,), np.int64)
        for block, chunksize in zip(argmins, X.chunks[0])
    ]
    # Scikit-learn seems to always use float64
    mins = [
        da.from_delayed(block, (chunksize,), "f8")
        for block, chunksize in zip(mins, X.chunks[0])
    ]
    argmins = da.concatenate(argmins)
    mins = da.concatenate(mins)

    return argmins, mins
Example #12
Source File: seviri_l2_bufr.py From satpy with GNU General Public License v3.0 | 5 votes |
def get_array(self, key):
    """Get all data from file for the given BUFR key."""
    with open(self.filename, "rb") as fh:
        msgCount = 0
        while True:
            bufr = ec.codes_bufr_new_from_file(fh)
            if bufr is None:
                break

            ec.codes_set(bufr, 'unpack', 1)

            # if it is the first message initialise our final array
            if (msgCount == 0):
                arr = da.from_array(ec.codes_get_array(
                    bufr, key, float), chunks=CHUNK_SIZE)
            else:
                tmpArr = da.from_array(ec.codes_get_array(
                    bufr, key, float), chunks=CHUNK_SIZE)
                arr = da.concatenate((arr, tmpArr))

            msgCount = msgCount + 1
            ec.codes_release(bufr)

    if arr.size == 1:
        arr = arr[0]

    return arr
Example #13
Source File: iasi_l2_so2_bufr.py From satpy with GNU General Public License v3.0 | 5 votes |
def get_array(self, key):
    """Get all data from file for the given BUFR key."""
    with open(self.filename, "rb") as fh:
        msgCount = 0
        while True:
            bufr = ec.codes_bufr_new_from_file(fh)
            if bufr is None:
                break

            ec.codes_set(bufr, 'unpack', 1)
            values = ec.codes_get_array(bufr, key, float)

            if len(values) == 1:
                values = np.repeat(values, 120)

            # if it is the first message initialise our final array
            if (msgCount == 0):
                arr = da.from_array([values], chunks=CHUNK_SIZE)
            else:
                tmpArr = da.from_array([values], chunks=CHUNK_SIZE)
                arr = da.concatenate((arr, tmpArr), axis=0)

            msgCount = msgCount + 1
            ec.codes_release(bufr)

    if arr.size == 1:
        arr = arr[0]

    return arr
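
Examples #12 and #13 above grow their output one BUFR message at a time by concatenating inside a loop. A condensed sketch of that incremental pattern follows, with small made-up arrays standing in for the decoded messages:

import numpy as np
import dask.array as da

# Stand-ins for the arrays decoded from successive BUFR messages.
messages = [np.full(5, i, dtype=float) for i in range(3)]

arr = None
for values in messages:
    tmp = da.from_array(values, chunks=5)
    arr = tmp if arr is None else da.concatenate((arr, tmp))

print(arr.shape)      # (15,)
print(arr.compute())  # [0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 2. 2. 2. 2. 2.]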
Example #14
Source File: test__diff.py From dask-image with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_laplace_comprehensions():
    np.random.seed(0)

    a = np.random.random((3, 12, 14))
    d = da.from_array(a, chunks=(3, 6, 7))

    l2s = [da_ndf.laplace(d[i]) for i in range(len(d))]
    l2c = [da_ndf.laplace(d[i])[None] for i in range(len(d))]

    dau.assert_eq(np.stack(l2s), da.stack(l2s))
    dau.assert_eq(np.concatenate(l2c), da.concatenate(l2c))
Example #15
Source File: test__conv.py From dask-image with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_convolutions_comprehensions(da_func):
    np.random.seed(0)

    a = np.random.random((3, 12, 14))
    d = da.from_array(a, chunks=(3, 6, 7))

    weights = np.ones((1, 1))

    l2s = [da_func(d[i], weights) for i in range(len(d))]
    l2c = [da_func(d[i], weights)[None] for i in range(len(d))]

    dau.assert_eq(np.stack(l2s), da.stack(l2s))
    dau.assert_eq(np.concatenate(l2c), da.concatenate(l2c))
Example #16
Source File: test__generic.py From dask-image with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_generic_filter_comprehensions(da_func):
    da_wfunc = lambda arr: da_func(arr, lambda x: x, 1)  # noqa: E731

    np.random.seed(0)

    a = np.random.random((3, 12, 14))
    d = da.from_array(a, chunks=(3, 6, 7))

    l2s = [da_wfunc(d[i]) for i in range(len(d))]
    l2c = [da_wfunc(d[i])[None] for i in range(len(d))]

    dau.assert_eq(np.stack(l2s), da.stack(l2s))
    dau.assert_eq(np.concatenate(l2c), da.concatenate(l2c))
Example #17
Source File: test__edge.py From dask-image with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_edge_comprehensions(da_func):
    np.random.seed(0)

    a = np.random.random((3, 12, 14))
    d = da.from_array(a, chunks=(3, 6, 7))

    l2s = [da_func(d[i]) for i in range(len(d))]
    l2c = [da_func(d[i])[None] for i in range(len(d))]

    dau.assert_eq(np.stack(l2s), da.stack(l2s))
    dau.assert_eq(np.concatenate(l2c), da.concatenate(l2c))
Example #18
Source File: test__smooth.py From dask-image with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_uniform_comprehensions():
    da_func = lambda arr: da_ndf.uniform_filter(arr, 1, origin=0)  # noqa: E731

    np.random.seed(0)

    a = np.random.random((3, 12, 14))
    d = da.from_array(a, chunks=(3, 6, 7))

    l2s = [da_func(d[i]) for i in range(len(d))]
    l2c = [da_func(d[i])[None] for i in range(len(d))]

    dau.assert_eq(np.stack(l2s), da.stack(l2s))
    dau.assert_eq(np.concatenate(l2c), da.concatenate(l2c))
Example #19
Source File: utils.py From xmitgcm with MIT License | 5 votes |
def llc_facets_3d_spatial_to_compact(facets, dimname, extra_metadata):
    """ Write in compact form a list of 3d facets

    PARAMETERS
    ----------
    facets : dict
        dict of xarray.dataarrays for the facets
    extra_metadata : dict
        extra_metadata from get_extra_metadata

    RETURNS
    -------
    flatdata : numpy.array
        all the data in vector form
    """
    nz = len(facets['facet0'][dimname])
    nfacets = len(facets)

    flatdata = np.array([])

    for kz in range(nz):
        # rebuild the dict
        tmpdict = {}
        for kfacet in range(nfacets):
            this_facet = facets['facet' + str(kfacet)]
            if this_facet is not None:
                tmpdict['facet' + str(kfacet)] = this_facet.isel(k=kz)
            else:
                tmpdict['facet' + str(kfacet)] = None
        # concatenate all 2d arrays
        compact2d = llc_facets_2d_to_compact(tmpdict, extra_metadata)
        flatdata = np.concatenate([flatdata, compact2d])

    return flatdata
Example #20
Source File: utils.py From xmitgcm with MIT License | 5 votes |
def find_concat_dim_facet(da, facet, extra_metadata):
    """ In llc grids, find along which horizontal dimension to concatenate
    facet between i, i_g and j, j_g. If the order of the facet is F, concat
    along i or i_g. If order is C, concat along j or j_g. Also return
    horizontal dim not to concatenate

    PARAMETERS
    ----------
    da : xarray.DataArray
        xmitgcm llc data array
    facet : int
        facet number
    extra_metadata : dict
        dict of extra_metadata from get_extra_metadata

    RETURNS
    -------
    concat_dim, nonconcat_dim : str, str
        names of the dimensions for concatenation or not
    """
    order = extra_metadata['facet_orders'][facet]
    if order == 'C':
        possible_concat_dims = ['j', 'j_g']
    elif order == 'F':
        possible_concat_dims = ['i', 'i_g']

    concat_dim = find_concat_dim(da, possible_concat_dims)

    # we also need the other horizontal dimension for vector indexing
    all_dims = list(da.dims)
    # discard face
    all_dims.remove('face')
    # remove the concat_dim to find horizontal non_concat dimension
    all_dims.remove(concat_dim)
    non_concat_dim = all_dims[0]
    return concat_dim, non_concat_dim
Example #21
Source File: __init__.py From pyresample with GNU Lesser General Public License v3.0 | 5 votes |
def get_border_lonlats(geo_def):
    """Get the border x- and y-coordinates."""
    if geo_def.proj_dict['proj'] == 'geos':
        lon_b, lat_b = get_geostationary_bounding_box(geo_def, 3600)
    else:
        lons, lats = geo_def.get_boundary_lonlats()
        lon_b = np.concatenate((lons.side1, lons.side2, lons.side3, lons.side4))
        lat_b = np.concatenate((lats.side1, lats.side2, lats.side3, lats.side4))

    return lon_b, lat_b
Example #22
Source File: _bed_read.py From pandas-plink with MIT License | 5 votes |
def read_bed(filepath, nrows, ncols):
    from dask.array import concatenate, from_delayed
    from dask.delayed import delayed

    chunk_size = 1024

    row_start = 0
    col_xs = []
    while row_start < nrows:
        row_end = min(row_start + chunk_size, nrows)
        col_start = 0
        row_xs = []
        while col_start < ncols:
            col_end = min(col_start + chunk_size, ncols)

            x = delayed(_read_bed_chunk)(
                filepath, nrows, ncols, row_start, row_end, col_start, col_end
            )

            shape = (row_end - row_start, col_end - col_start)
            row_xs += [from_delayed(x, shape, float64)]
            col_start = col_end

        col_xs += [concatenate(row_xs, axis=1)]
        row_start = row_end
    X = concatenate(col_xs, axis=0)
    return X
Example #23
Source File: meta.py From gbdxtools with MIT License | 5 votes |
def _slice_padded(self, _bounds):
    pads = (max(-_bounds[0], 0), max(-_bounds[1], 0),
            max(_bounds[2] - self.shape[2], 0), max(_bounds[3] - self.shape[1], 0))
    bounds = (max(_bounds[0], 0),
              max(_bounds[1], 0),
              max(min(_bounds[2], self.shape[2]), 0),
              max(min(_bounds[3], self.shape[1]), 0))
    result = self[:, bounds[1]:bounds[3], bounds[0]:bounds[2]]
    if pads[0] > 0:
        dims = (result.shape[0], result.shape[1], pads[0])
        result = da.concatenate([da.zeros(dims, chunks=dims, dtype=result.dtype),
                                 result], axis=2)
    if pads[2] > 0:
        dims = (result.shape[0], result.shape[1], pads[2])
        result = da.concatenate([result,
                                 da.zeros(dims, chunks=dims, dtype=result.dtype)], axis=2)
    if pads[1] > 0:
        dims = (result.shape[0], pads[1], result.shape[2])
        result = da.concatenate([da.zeros(dims, chunks=dims, dtype=result.dtype),
                                 result], axis=1)
    if pads[3] > 0:
        dims = (result.shape[0], pads[3], result.shape[2])
        result = da.concatenate([result,
                                 da.zeros(dims, chunks=dims, dtype=result.dtype)], axis=1)
    return (result, _bounds[0], _bounds[1])
Example #24
Source File: transform.py From nbodykit with GNU General Public License v3.0 | 4 votes |
def ConcatenateSources(*sources, **kwargs):
    """
    Concatenate CatalogSource objects together, optionally including only
    certain columns in the returned source.

    .. note::
        The returned catalog object carries the meta-data from only
        the first catalog supplied to this function (in the ``attrs`` dict).

    Parameters
    ----------
    *sources : subclass of :class:`~nbodykit.base.catalog.CatalogSource`
        the catalog source objects to concatenate together
    columns : str, list of str, optional
        the columns to include in the concatenated catalog

    Returns
    -------
    CatalogSource :
        the concatenated catalog source object

    Examples
    --------
    >>> from nbodykit.lab import *
    >>> source1 = UniformCatalog(nbar=100, BoxSize=1.0)
    >>> source2 = UniformCatalog(nbar=100, BoxSize=1.0)
    >>> print(source1.csize, source2.csize)
    >>> combined = transform.ConcatenateSources(source1, source2, columns=['Position', 'Velocity'])
    >>> print(combined.csize)
    """
    from nbodykit.base.catalog import CatalogSource

    columns = kwargs.get('columns', None)

    if isinstance(columns, string_types):
        columns = [columns]

    # concatenate all columns, if none provided
    if columns is None or columns == []:
        columns = sources[0].columns

    # check comms
    if not all(src.comm == sources[0].comm for src in sources):
        raise ValueError("cannot concatenate sources: comm mismatch")

    # check all columns are there
    for source in sources:
        if not all(col in source for col in columns):
            raise ValueError(("cannot concatenate sources: columns are missing "
                              "from some sources"))

    # the total size
    size = numpy.sum([src.size for src in sources], dtype='intp')

    data = {}
    for col in columns:
        data[col] = da.concatenate([src[col] for src in sources], axis=0)

    toret = CatalogSource._from_columns(size, sources[0].comm, **data)
    toret.attrs.update(sources[0].attrs)
    return toret
Example #25
Source File: utilities.py From minian with GNU General Public License v3.0 | 4 votes |
def save_video(movpath, fname_mov_orig, fname_mov_rig, fname_AC, fname_ACbf,
               dsratio):
    """
    Parameters
    ----------
    movpath :
    fname_mov_orig :
    fname_mov_rig :
    fname_AC :
    fname_ACbf :
    dsratio :

    Returns
    -------
    """
    mov_orig = np.load(fname_mov_orig, mmap_mode='r')
    mov_rig = np.load(fname_mov_rig, mmap_mode='r')
    mov_ac = np.load(fname_AC, mmap_mode='r')
    mov_acbf = np.load(fname_ACbf, mmap_mode='r')
    vw = skv.FFmpegWriter(
        movpath, inputdict={'-framerate': '30'}, outputdict={'-r': '30'})
    for fidx in range(0, mov_orig.shape[0], dsratio):
        print("writing frame: " + str(fidx))
        fm_orig = mov_orig[fidx, :, :] * 255
        fm_rig = mov_rig[fidx, :, :] * 255
        fm_acbf = mov_acbf[fidx, :, :] * 255
        fm_ac = mov_ac[fidx, :, :] * 255
        fm = np.concatenate(
            [
                np.concatenate([fm_orig, fm_rig], axis=1),
                np.concatenate([fm_acbf, fm_ac], axis=1)
            ],
            axis=0)
        vw.writeFrame(fm)
    vw.close()
Example #26
Source File: utils.py From xmitgcm with MIT License | 4 votes |
def _pad_array(data, file_metadata, face=0):
    """
    Return a padded array. If input data is a numpy.memmap and no padding
    is necessary, the function preserves its type. Otherwise, the concatenate
    forces it to load into memory.

    Parameters
    ----------
    data : numpy array or memmap
        input data
    file_metadata : dict
        metadata for file
    face : int, optional
        llc face if applicable

    Returns
    -------
    numpy.array or numpy.memmap
    """

    # Pad data before in y direction
    if 'pad_before_y' in file_metadata:
        if file_metadata['has_faces']:
            facet_origin = file_metadata['face_facets'][face]
            nypad_before = file_metadata['pad_before_y'][facet_origin]
        else:
            nypad_before = file_metadata['pad_before_y']

        pad_before = np.zeros((nypad_before, file_metadata['nx']))
        data_padded_before = np.concatenate(
            (pad_before, data), axis=0)
    else:
        data_padded_before = data

    # Pad data after in y direction
    if 'pad_after_y' in file_metadata:
        if file_metadata['has_faces']:
            facet_origin = file_metadata['face_facets'][face]
            nypad_after = file_metadata['pad_after_y'][facet_origin]
        else:
            nypad_after = file_metadata['pad_after_y']

        pad_after = np.zeros((nypad_after, file_metadata['nx']))
        data_padded_after = np.concatenate(
            (data_padded_before, pad_after), axis=0)
    else:
        data_padded_after = data_padded_before

    return data_padded_after
Example #27
Source File: utils.py From xmitgcm with MIT License | 4 votes |
def _reshape_llc_data(data, jdim):  # pragma: no cover
    """Fix the weird problem with llc data array order."""
    # Can we do this without copying any data?
    # If not, we need to go upstream and implement this at the MDS level
    # Or can we fudge it with dask?
    # this is all very specific to the llc file output
    # would be nice to generalize more, but how?
    nside = data.shape[jdim] // LLC_NUM_FACES
    # how the LLC data is laid out along the j dimension
    strides = ((0, 3), (3, 6), (6, 7), (7, 10), (10, 13))
    # whether to reshape each face
    reshape = (False, False, False, True, True)
    # this will slice the data into 5 facets
    slices = [jdim * (slice(None),) + (slice(nside * st[0], nside * st[1]),)
              for st in strides]
    facet_arrays = [data[sl] for sl in slices]
    face_arrays = []
    for ar, rs, st in zip(facet_arrays, reshape, strides):
        nfaces_in_facet = st[1] - st[0]
        shape = list(ar.shape)
        if rs:
            # we assume the other horizontal dimension is immediately after jdim
            shape[jdim] = ar.shape[jdim + 1]
            shape[jdim + 1] = ar.shape[jdim]
        # insert a length-1 dimension along which to concatenate
        shape.insert(jdim, 1)
        # this modifies the array shape in place, with no copies allowed
        # but it doesn't work with dask arrays
        # ar.shape = shape
        ar = ar.reshape(shape)
        # now ar is properly shaped, but we still need to slice it into faces
        face_slice_dim = jdim + 1 + rs
        for n in range(nfaces_in_facet):
            face_slice = (face_slice_dim * (slice(None),) +
                          (slice(nside * n, nside * (n + 1)),))
            data_face = ar[face_slice]
            face_arrays.append(data_face)

    # We can't concatenate using numpy (hcat etc.) because it makes a copy,
    # presumably loading the memmaps into memory.
    # Using dask gets around this.
    # But what if we want different chunks, or already chunked the data
    # upstream? Doesn't seem like this is ideal
    # TODO: Refactor handling of dask arrays and chunking
    #return np.concatenate(face_arrays, axis=jdim)
    # the dask version doesn't work because of this:
    # https://github.com/dask/dask/issues/1645
    face_arrays_dask = [dsa.from_array(fa, chunks=fa.shape) for fa in face_arrays]
    concat = dsa.concatenate(face_arrays_dask, axis=jdim)
    return concat
Example #28
Source File: _split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 4 votes |
def _split(self, test_start, test_stop, n_samples, chunks, seeds):
    train_objs = []
    test_objs = []
    train_sizes = []
    test_sizes = []

    offset = 0
    for chunk, seed in zip(chunks, seeds):
        start, stop = offset, offset + chunk

        test_id_start = max(test_start, start)
        test_id_stop = min(test_stop, stop)

        if test_id_start < test_id_stop:
            test_objs.append(
                dask.delayed(_generate_offset_idx)(
                    chunk, test_id_start, test_id_stop, offset, seed
                )
            )
            test_sizes.append(test_id_stop - test_id_start)

        train_id_stop = min(test_id_start, stop)
        if train_id_stop > start:
            train_objs.append(
                dask.delayed(_generate_offset_idx)(
                    chunk, start, train_id_stop, offset, seed
                )
            )
            train_sizes.append(train_id_stop - start)

        train_id_start = max(test_id_stop, start)
        if train_id_start < stop:
            train_objs.append(
                dask.delayed(_generate_offset_idx)(
                    chunk, train_id_start, stop, offset, seed
                )
            )
            train_sizes.append(stop - train_id_start)

        offset = stop

    train_idx = da.concatenate(
        [
            da.from_delayed(obj, (train_size,), np.dtype("int"))
            for obj, train_size in zip(train_objs, train_sizes)
        ]
    )
    test_idx = da.concatenate(
        [
            da.from_delayed(obj, (test_size,), np.dtype("int"))
            for obj, test_size in zip(test_objs, test_sizes)
        ]
    )

    return train_idx, test_idx
Example #29
Source File: io_utils.py From pyxem with GNU General Public License v3.0 | 4 votes |
def _untangle_raw(data, hdr_info, stack_size):
    """
    Corrects for the tangled raw mib format - Only the case for quad chip is
    considered here.

    Parameters
    ----------
    data: dask array
        as stack with the detector array unreshaped, e.g. for a single frame
        512*512: (1, 262144)
    hdr_info: dict
        info read from the header - output of the _parse_hdr function
    stack_size: int
        The number of frames in the data

    Outputs
    -------
    untangled_data: dask array
        corrected dask array object reshaped on the detector plane, e.g. for
        a single frame case as above: (1, 512, 512)
    """
    width = hdr_info["width"]
    height = hdr_info["height"]

    width_height = width * height

    if (
        hdr_info["Counter Depth (number)"] == 24
        or hdr_info["Counter Depth (number)"] == 12
    ):
        cols = 4
    elif hdr_info["Counter Depth (number)"] == 1:
        cols = 64
    elif hdr_info["Counter Depth (number)"] == 6:
        cols = 8

    data = data.reshape((stack_size * width_height))
    data = data.reshape(stack_size, height * (height // cols), cols)
    data = da.flip(data, 2)

    if hdr_info["Assembly Size"] == "2x2":
        data = data.reshape((stack_size * width_height))
        data = data.reshape(stack_size, 512 // 2, 512 * 2)

        det1 = data[:, :, 0:256]
        det2 = data[:, :, 256:512]
        det3 = data[:, :, 512 : 512 + 256]
        det4 = data[:, :, 512 + 256 :]

        det3 = da.flip(det3, 2)
        det3 = da.flip(det3, 1)

        det4 = da.flip(det4, 2)
        det4 = da.flip(det4, 1)

        untangled_data = da.concatenate(
            (da.concatenate((det1, det3), 1), da.concatenate((det2, det4), 1)), 2
        )
    return untangled_data
Example #30
Source File: _encoders.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 4 votes |
def _transform(
    self, X: Union[ArrayLike, DataFrameType], handle_unknown: str = "error"
) -> Union[ArrayLike, DataFrameType]:
    X = check_array(
        X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True
    )

    is_array = isinstance(X, da.Array)

    if is_array:
        _, n_features = X.shape
    else:
        n_features = len(X.columns)

    if is_array:
        # We encode each column independently, as they have different categories.
        Xs = [
            _encode_dask_array(
                X[:, i],
                uniques=self.categories_[i],
                encode=True,
                onehot_dtype=self.dtype,
            )[1]
            for i in range(n_features)
        ]
        X = da.concatenate(Xs, axis=1)

        if not self.sparse:
            X = X.map_blocks(lambda x: x.toarray(), dtype=self.dtype)
    else:
        import dask.dataframe as dd

        # Validate that all are categorical.
        if not (X.dtypes == "category").all():
            raise ValueError("Must be all categorical.")

        if not len(X.columns) == len(self.categories_):
            raise ValueError(
                "Number of columns ({}) does not match number "
                "of categories_ ({})".format(len(X.columns), len(self.categories_))
            )

        for col, dtype in zip(X.columns, self.dtypes_):
            if not (X[col].dtype == dtype):
                raise ValueError(
                    "Different CategoricalDtype for fit and "
                    "transform. '{}' != '{}'".format(dtype, X[col].dtype)
                )

        return dd.get_dummies(X, sparse=self.sparse, dtype=self.dtype)

    return X