Python dask.array.concatenate() Examples
The following are 30 code examples of dask.array.concatenate(), collected from open-source projects and ordered by community votes (the count shown next to each example's license). You may also want to check out the other functions and classes available in the dask.array module.
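
Before the project examples, here is a minimal, self-contained sketch of the basic behaviour of dask.array.concatenate: it lazily joins dask arrays along an existing axis, and the chunk structure of the inputs carries over into the result. The shapes and chunk sizes below are made up purely for illustration.

import dask.array as da

# Two lazy arrays chunked along the first axis.
x = da.ones((4, 3), chunks=(2, 3))
y = da.zeros((2, 3), chunks=(2, 3))

# Join them along axis 0; this only builds a task graph, nothing is computed yet.
z = da.concatenate([x, y], axis=0)
print(z.shape)   # (6, 3)
print(z.chunks)  # ((2, 2, 2), (3,)) -- the input chunks are preserved

# compute() materializes the result as a NumPy array.
print(z.compute().sum())  # 12.0

Most of the examples below apply the same operation at a larger scale: build per-block results (often via dask.delayed and da.from_delayed) and stitch them back together with da.concatenate.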
Example #1
Source File: test_split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_blockwise_shufflesplit():
    splitter = dask_ml.model_selection.ShuffleSplit(random_state=0)
    assert splitter.get_n_splits() == 10
    gen = splitter.split(dX)

    train_idx, test_idx = next(gen)
    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (99,)  # 90% of 110
    assert test_idx.shape == (11,)

    assert train_idx.chunks == ((45, 45, 9),)
    assert test_idx.chunks == ((5, 5, 1),)

    counts = pd.value_counts(train_idx.compute())
    assert counts.max() == 1

    N = len(X)
    np.testing.assert_array_equal(
        np.unique(da.concatenate([train_idx, test_idx])), np.arange(N)
    )
Example #2
Source File: __init__.py From pyresample with GNU Lesser General Public License v3.0 | 6 votes |
def _concatenate_chunks(chunks):
    """Concatenate chunks to full output array."""
    # Form the full array
    col, res = [], []
    prev_y = 0
    for y, x in sorted(chunks):
        if len(chunks[(y, x)]) > 1:
            chunk = da.nanmax(da.stack(chunks[(y, x)], axis=-1), axis=-1)
        else:
            chunk = chunks[(y, x)][0]
        if y == prev_y:
            col.append(chunk)
            continue
        res.append(da.concatenate(col, axis=1))
        col = [chunk]
        prev_y = y
    res.append(da.concatenate(col, axis=1))
    res = da.concatenate(res, axis=2).squeeze()

    return res
Example #3
Source File: utils.py From xmitgcm with MIT License | 6 votes |
def find_concat_dim(da, possible_concat_dims):
    """ look for available dimensions in dataarray and pick the one
    from a list of candidates

    PARAMETERS
    ----------
    da : xarray.DataArray
        xmitgcm llc data array
    possible_concat_dims : list
        list of potential dims

    RETURNS
    -------
    out : str
        dimension on which to concatenate
    """
    out = None
    for d in possible_concat_dims:
        if d in da.dims:
            out = d
    return out
Example #4
Source File: utilities.py From minian with GNU General Public License v3.0 | 6 votes |
def handle_crash(varr, vpath, ssname, vlist, varr_list, frame_dict):
    seg1_list = list(filter(lambda v: re.search('seg1', v), vlist))
    seg2_list = list(filter(lambda v: re.search('seg2', v), vlist))
    if seg1_list and seg2_list:
        tframe = frame_dict[ssname]
        varr1 = darr.concatenate(
            list(compress(varr_list, seg1_list)), axis=0)
        varr2 = darr.concatenate(
            list(compress(varr_list, seg2_list)), axis=0)
        fm1, fm2 = varr1.shape[0], varr2.shape[0]
        fm_crds = varr.coords['frame']
        fm_crds1 = fm_crds.sel(frame=slice(None, fm1 - 1)).values
        fm_crds2 = fm_crds.sel(frame=slice(fm1, None)).values
        fm_crds2 = fm_crds2 + (tframe - fm_crds2.max())
        fm_crds_new = np.concatenate([fm_crds1, fm_crds2], axis=0)
        return varr.assign_coords(frame=fm_crds_new)
    else:
        return varr
Example #5
Source File: test__order.py From dask-image with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_order_comprehensions(da_func, kwargs):
    np.random.seed(0)

    a = np.random.random((3, 12, 14))
    d = da.from_array(a, chunks=(3, 6, 7))

    l2s = [da_func(d[i], **kwargs) for i in range(len(d))]
    l2c = [da_func(d[i], **kwargs)[None] for i in range(len(d))]

    dau.assert_eq(np.stack(l2s), da.stack(l2s))
    dau.assert_eq(np.concatenate(l2c), da.concatenate(l2c))
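
A short aside on the pattern used in this test and in the dask-image tests that follow (Examples #14 to #18): indexing a slice with [None] adds a length-1 leading axis, so concatenating those slices along axis 0 gives the same result as stacking the original slices. Here is a minimal sketch of that equivalence, with an invented array:

import numpy as np
import dask.array as da

d = da.from_array(np.arange(24).reshape(2, 3, 4), chunks=(1, 3, 4))

slices = [d[i] for i in range(len(d))]          # each slice has shape (3, 4)
slices_1 = [d[i][None] for i in range(len(d))]  # each has shape (1, 3, 4)

stacked = da.stack(slices)               # shape (2, 3, 4), via a new leading axis
concatenated = da.concatenate(slices_1)  # shape (2, 3, 4), along the existing axis

assert (stacked.compute() == concatenated.compute()).all()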
Example #6
Source File: test_split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_kfold(shuffle):
    splitter = dask_ml.model_selection.KFold(
        n_splits=5, random_state=0, shuffle=shuffle
    )
    assert splitter.get_n_splits() == 5
    gen = splitter.split(dX)

    train_idx, test_idx = next(gen)
    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (88,)  # 80% of 110
    assert test_idx.shape == (22,)

    assert train_idx.chunks == ((28, 50, 10),)
    assert test_idx.chunks == ((22,),)

    counts = pd.value_counts(train_idx.compute())
    assert counts.max() == 1

    N = len(X)
    np.testing.assert_array_equal(
        np.unique(da.concatenate([train_idx, test_idx])), np.arange(N)
    )

    expected_chunks = [
        (((22, 6, 50, 10),), ((22,),)),
        (((44, 34, 10),), ((6, 16),)),
        (((50, 16, 12, 10),), ((22,),)),
        (((50, 38),), ((12, 10),)),
    ]

    for (exp_train_idx, exp_test_idx), (train_idx, test_idx) in zip(
        expected_chunks, gen
    ):
        assert train_idx.chunks == exp_train_idx
        assert test_idx.chunks == exp_test_idx
Example #7
Source File: spectral.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _slice_mostly_sorted(array, keep, rest, ind=None):
    """Slice dask array `array` that is almost entirely sorted already.

    We perform approximately `2 * len(keep)` slices on `array`.
    This is OK, since `keep` is small. Individually, each of these slices
    is entirely sorted.

    Parameters
    ----------
    array : dask.array.Array
    keep : ndarray[Int]
        This must be sorted.
    rest : ndarray[Bool]
    ind : ndarray[Int], optional

    Returns
    -------
    sliced : dask.array.Array
    """
    if ind is None:
        ind = np.arange(len(array))
    idx = np.argsort(np.concatenate([keep, ind[rest]]))
    slices = []
    if keep[0] > 0:  # avoid creating empty slices
        slices.append(slice(None, keep[0]))
    slices.append([keep[0]])
    windows = zip(keep[:-1], keep[1:])

    for l, r in windows:
        if r > l + 1:  # avoid creating empty slices
            slices.append(slice(l + 1, r))
        slices.append([r])

    if keep[-1] < len(array) - 1:  # avoid creating empty slices
        slices.append(slice(keep[-1] + 1, None))
    result = da.concatenate([array[idx[slice_]] for slice_ in slices])
    return result
Example #8
Source File: _split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _blockwise_slice(arr, idx):
    """Slice an array that is blockwise-aligned with idx.

    Parameters
    ----------
    arr : Dask array
    idx : Dask array
        Should have the following properties

        * Same blocks as `arr` along the first dimension
        * Contains only integers
        * Each block's values should be between ``[0, len(block))``

    Returns
    -------
    sliced : dask.Array
    """
    objs = []
    offsets = np.hstack([0, np.cumsum(arr.chunks[0])[:-1]])

    for i, (x, idx2) in enumerate(
        zip(arr.to_delayed().ravel(), idx.to_delayed().ravel())
    ):
        idx3 = idx2 - offsets[i]
        objs.append(x[idx3])

    shapes = idx.chunks[0]
    if arr.ndim == 2:
        P = arr.shape[1]
        shapes = [(x, P) for x in shapes]
    else:
        shapes = [(x,) for x in shapes]

    sliced = da.concatenate(
        [
            da.from_delayed(x, shape=shape, dtype=arr.dtype)
            for x, shape in zip(objs, shapes)
        ]
    )
    return sliced
Example #9
Source File: _split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _split_blockwise(self, X, seeds):
    chunks = X.chunks[0]
    train_pct, test_pct = _maybe_normalize_split_sizes(
        self.train_size, self.test_size
    )
    sizes = [_validate_shuffle_split(c, test_pct, train_pct) for c in chunks]

    objs = [
        dask.delayed(_generate_idx, nout=2)(chunksize, seed, n_train, n_test)
        for chunksize, seed, (n_train, n_test) in zip(chunks, seeds, sizes)
    ]

    train_objs, test_objs = zip(*objs)
    offsets = np.hstack([0, np.cumsum(chunks)])
    train_idx = da.concatenate(
        [
            da.from_delayed(x + offset, (train_size,), np.dtype("int"))
            for x, chunksize, (train_size, _), offset in zip(
                train_objs, chunks, sizes, offsets
            )
        ]
    )
    test_idx = da.concatenate(
        [
            da.from_delayed(x + offset, (test_size,), np.dtype("int"))
            for x, chunksize, (_, test_size), offset in zip(
                test_objs, chunks, sizes, offsets
            )
        ]
    )
    return train_idx, test_idx
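
Several of the dask-ml examples in this listing (this one, and Examples #8, #11, and #28) share the same recipe: run a function per block with dask.delayed, wrap each delayed result back into a dask array with da.from_delayed, and glue the pieces together with da.concatenate. Below is a standalone sketch of that recipe; the per_block function, the chunk sizes, and the offsets are invented for illustration only.

import numpy as np
import dask
import dask.array as da

def per_block(n, offset):
    # Hypothetical per-block work: return the global indices covered by this block.
    return np.arange(n) + offset

chunks = (3, 4, 2)
offsets = np.concatenate([[0], np.cumsum(chunks)[:-1]])

delayed_blocks = [dask.delayed(per_block)(n, off) for n, off in zip(chunks, offsets)]
arrays = [
    da.from_delayed(blk, shape=(n,), dtype=np.int64)
    for blk, n in zip(delayed_blocks, chunks)
]
result = da.concatenate(arrays)
print(result.compute())  # [0 1 2 3 4 5 6 7 8]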
Example #10
Source File: text.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def transform(self, raw_X):
    msg = "'X' should be a 1-dimensional array with length 'num_samples'."

    if not dask.is_dask_collection(raw_X):
        return self._hasher(**self.get_params()).transform(raw_X)

    if isinstance(raw_X, db.Bag):
        bag2 = raw_X.map_partitions(self._transformer)
        objs = bag2.to_delayed()
        arrs = [
            da.from_delayed(obj, (np.nan, self.n_features), self.dtype)
            for obj in objs
        ]
        result = da.concatenate(arrs, axis=0)
    elif isinstance(raw_X, dd.Series):
        result = raw_X.map_partitions(self._transformer)
    elif isinstance(raw_X, da.Array):  # dask.Array
        chunks = ((np.nan,) * raw_X.numblocks[0], (self.n_features,))
        if raw_X.ndim == 1:
            result = raw_X.map_blocks(
                self._transformer, dtype="f8", chunks=chunks, new_axis=1
            )
        else:
            raise ValueError(msg)
    else:
        raise ValueError(msg)

    meta = scipy.sparse.eye(0, format="csr")
    result._meta = meta
    return result
Example #11
Source File: pairwise.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def pairwise_distances_argmin_min(
    X: ArrayLike,
    Y: ArrayLike,
    axis: int = 1,
    metric: Union[str, Callable[[ArrayLike, ArrayLike], float]] = "euclidean",
    batch_size: Optional[int] = None,
    metric_kwargs: Optional[Dict[str, Any]] = None,
):
    if batch_size is not None:
        msg = "'batch_size' is deprecated. Use sklearn.config_context instead."
        warnings.warn(msg, FutureWarning)
    XD = X.to_delayed().flatten().tolist()
    func = delayed(metrics.pairwise_distances_argmin_min, pure=True, nout=2)
    blocks = [func(x, Y, metric=metric, metric_kwargs=metric_kwargs) for x in XD]
    argmins, mins = zip(*blocks)

    argmins = [
        da.from_delayed(block, (chunksize,), np.int64)
        for block, chunksize in zip(argmins, X.chunks[0])
    ]
    # Scikit-learn seems to always use float64
    mins = [
        da.from_delayed(block, (chunksize,), "f8")
        for block, chunksize in zip(mins, X.chunks[0])
    ]
    argmins = da.concatenate(argmins)
    mins = da.concatenate(mins)

    return argmins, mins
Example #12
Source File: seviri_l2_bufr.py From satpy with GNU General Public License v3.0 | 5 votes |
def get_array(self, key):
    """Get all data from file for the given BUFR key."""
    with open(self.filename, "rb") as fh:
        msgCount = 0
        while True:
            bufr = ec.codes_bufr_new_from_file(fh)
            if bufr is None:
                break

            ec.codes_set(bufr, 'unpack', 1)

            # if it is the first message initialise our final array
            if (msgCount == 0):
                arr = da.from_array(ec.codes_get_array(
                    bufr, key, float), chunks=CHUNK_SIZE)
            else:
                tmpArr = da.from_array(ec.codes_get_array(
                    bufr, key, float), chunks=CHUNK_SIZE)
                arr = da.concatenate((arr, tmpArr))

            msgCount = msgCount + 1
            ec.codes_release(bufr)

    if arr.size == 1:
        arr = arr[0]

    return arr
Example #13
Source File: iasi_l2_so2_bufr.py From satpy with GNU General Public License v3.0 | 5 votes |
def get_array(self, key):
    """Get all data from file for the given BUFR key."""
    with open(self.filename, "rb") as fh:
        msgCount = 0
        while True:
            bufr = ec.codes_bufr_new_from_file(fh)
            if bufr is None:
                break

            ec.codes_set(bufr, 'unpack', 1)
            values = ec.codes_get_array(bufr, key, float)

            if len(values) == 1:
                values = np.repeat(values, 120)

            # if it is the first message initialise our final array
            if (msgCount == 0):
                arr = da.from_array([values], chunks=CHUNK_SIZE)
            else:
                tmpArr = da.from_array([values], chunks=CHUNK_SIZE)
                arr = da.concatenate((arr, tmpArr), axis=0)

            msgCount = msgCount + 1
            ec.codes_release(bufr)

    if arr.size == 1:
        arr = arr[0]

    return arr
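
Examples #12 and #13 above grow their output one BUFR message at a time by concatenating inside a loop. A condensed sketch of that incremental pattern follows, with small made-up arrays standing in for the decoded messages:

import numpy as np
import dask.array as da

# Stand-ins for the arrays decoded from successive BUFR messages.
messages = [np.full(5, i, dtype=float) for i in range(3)]

arr = None
for values in messages:
    tmp = da.from_array(values, chunks=5)
    arr = tmp if arr is None else da.concatenate((arr, tmp))

print(arr.shape)      # (15,)
print(arr.compute())  # [0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 2. 2. 2. 2. 2.]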
Example #14
Source File: test__diff.py From dask-image with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_laplace_comprehensions():
    np.random.seed(0)

    a = np.random.random((3, 12, 14))
    d = da.from_array(a, chunks=(3, 6, 7))

    l2s = [da_ndf.laplace(d[i]) for i in range(len(d))]
    l2c = [da_ndf.laplace(d[i])[None] for i in range(len(d))]

    dau.assert_eq(np.stack(l2s), da.stack(l2s))
    dau.assert_eq(np.concatenate(l2c), da.concatenate(l2c))
Example #15
Source File: test__conv.py From dask-image with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_convolutions_comprehensions(da_func):
    np.random.seed(0)

    a = np.random.random((3, 12, 14))
    d = da.from_array(a, chunks=(3, 6, 7))

    weights = np.ones((1, 1))

    l2s = [da_func(d[i], weights) for i in range(len(d))]
    l2c = [da_func(d[i], weights)[None] for i in range(len(d))]

    dau.assert_eq(np.stack(l2s), da.stack(l2s))
    dau.assert_eq(np.concatenate(l2c), da.concatenate(l2c))
Example #16
Source File: test__generic.py From dask-image with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_generic_filter_comprehensions(da_func):
    da_wfunc = lambda arr: da_func(arr, lambda x: x, 1)  # noqa: E731

    np.random.seed(0)

    a = np.random.random((3, 12, 14))
    d = da.from_array(a, chunks=(3, 6, 7))

    l2s = [da_wfunc(d[i]) for i in range(len(d))]
    l2c = [da_wfunc(d[i])[None] for i in range(len(d))]

    dau.assert_eq(np.stack(l2s), da.stack(l2s))
    dau.assert_eq(np.concatenate(l2c), da.concatenate(l2c))
Example #17
Source File: test__edge.py From dask-image with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_edge_comprehensions(da_func):
    np.random.seed(0)

    a = np.random.random((3, 12, 14))
    d = da.from_array(a, chunks=(3, 6, 7))

    l2s = [da_func(d[i]) for i in range(len(d))]
    l2c = [da_func(d[i])[None] for i in range(len(d))]

    dau.assert_eq(np.stack(l2s), da.stack(l2s))
    dau.assert_eq(np.concatenate(l2c), da.concatenate(l2c))
Example #18
Source File: test__smooth.py From dask-image with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_uniform_comprehensions():
    da_func = lambda arr: da_ndf.uniform_filter(arr, 1, origin=0)  # noqa: E731

    np.random.seed(0)

    a = np.random.random((3, 12, 14))
    d = da.from_array(a, chunks=(3, 6, 7))

    l2s = [da_func(d[i]) for i in range(len(d))]
    l2c = [da_func(d[i])[None] for i in range(len(d))]

    dau.assert_eq(np.stack(l2s), da.stack(l2s))
    dau.assert_eq(np.concatenate(l2c), da.concatenate(l2c))
Example #19
Source File: utils.py From xmitgcm with MIT License | 5 votes |
def llc_facets_3d_spatial_to_compact(facets, dimname, extra_metadata):
    """ Write in compact form a list of 3d facets

    PARAMETERS
    ----------
    facets : dict
        dict of xarray.dataarrays for the facets
    extra_metadata : dict
        extra_metadata from get_extra_metadata

    RETURNS
    -------
    flatdata : numpy.array
        all the data in vector form
    """
    nz = len(facets['facet0'][dimname])
    nfacets = len(facets)

    flatdata = np.array([])

    for kz in range(nz):
        # rebuild the dict
        tmpdict = {}
        for kfacet in range(nfacets):
            this_facet = facets['facet' + str(kfacet)]
            if this_facet is not None:
                tmpdict['facet' + str(kfacet)] = this_facet.isel(k=kz)
            else:
                tmpdict['facet' + str(kfacet)] = None
        # concatenate all 2d arrays
        compact2d = llc_facets_2d_to_compact(tmpdict, extra_metadata)
        flatdata = np.concatenate([flatdata, compact2d])

    return flatdata
Example #20
Source File: utils.py From xmitgcm with MIT License | 5 votes |
def find_concat_dim_facet(da, facet, extra_metadata):
    """ In llc grids, find along which horizontal dimension to concatenate
    facet between i, i_g and j, j_g. If the order of the facet is F, concat
    along i or i_g. If order is C, concat along j or j_g. Also return
    horizontal dim not to concatenate

    PARAMETERS
    ----------
    da : xarray.DataArray
        xmitgcm llc data array
    facet : int
        facet number
    extra_metadata : dict
        dict of extra_metadata from get_extra_metadata

    RETURNS
    -------
    concat_dim, nonconcat_dim : str, str
        names of the dimensions for concatenation or not
    """
    order = extra_metadata['facet_orders'][facet]
    if order == 'C':
        possible_concat_dims = ['j', 'j_g']
    elif order == 'F':
        possible_concat_dims = ['i', 'i_g']

    concat_dim = find_concat_dim(da, possible_concat_dims)

    # we also need the other horizontal dimension for vector indexing
    all_dims = list(da.dims)
    # discard face
    all_dims.remove('face')
    # remove the concat_dim to find horizontal non_concat dimension
    all_dims.remove(concat_dim)
    non_concat_dim = all_dims[0]
    return concat_dim, non_concat_dim
Example #21
Source File: __init__.py From pyresample with GNU Lesser General Public License v3.0 | 5 votes |
def get_border_lonlats(geo_def):
    """Get the border x- and y-coordinates."""
    if geo_def.proj_dict['proj'] == 'geos':
        lon_b, lat_b = get_geostationary_bounding_box(geo_def, 3600)
    else:
        lons, lats = geo_def.get_boundary_lonlats()
        lon_b = np.concatenate((lons.side1, lons.side2, lons.side3, lons.side4))
        lat_b = np.concatenate((lats.side1, lats.side2, lats.side3, lats.side4))

    return lon_b, lat_b
Example #22
Source File: _bed_read.py From pandas-plink with MIT License | 5 votes |
def read_bed(filepath, nrows, ncols):
    from dask.array import concatenate, from_delayed
    from dask.delayed import delayed

    chunk_size = 1024

    row_start = 0
    col_xs = []
    while row_start < nrows:
        row_end = min(row_start + chunk_size, nrows)
        col_start = 0
        row_xs = []
        while col_start < ncols:
            col_end = min(col_start + chunk_size, ncols)

            x = delayed(_read_bed_chunk)(
                filepath, nrows, ncols, row_start, row_end, col_start, col_end
            )

            shape = (row_end - row_start, col_end - col_start)
            row_xs += [from_delayed(x, shape, float64)]
            col_start = col_end

        col_xs += [concatenate(row_xs, axis=1)]
        row_start = row_end
    X = concatenate(col_xs, axis=0)
    return X
Example #23
Source File: meta.py From gbdxtools with MIT License | 5 votes |
def _slice_padded(self, _bounds):
    pads = (max(-_bounds[0], 0), max(-_bounds[1], 0),
            max(_bounds[2] - self.shape[2], 0), max(_bounds[3] - self.shape[1], 0))
    bounds = (max(_bounds[0], 0),
              max(_bounds[1], 0),
              max(min(_bounds[2], self.shape[2]), 0),
              max(min(_bounds[3], self.shape[1]), 0))
    result = self[:, bounds[1]:bounds[3], bounds[0]:bounds[2]]
    if pads[0] > 0:
        dims = (result.shape[0], result.shape[1], pads[0])
        result = da.concatenate([da.zeros(dims, chunks=dims, dtype=result.dtype),
                                 result], axis=2)
    if pads[2] > 0:
        dims = (result.shape[0], result.shape[1], pads[2])
        result = da.concatenate([result,
                                 da.zeros(dims, chunks=dims, dtype=result.dtype)], axis=2)
    if pads[1] > 0:
        dims = (result.shape[0], pads[1], result.shape[2])
        result = da.concatenate([da.zeros(dims, chunks=dims, dtype=result.dtype),
                                 result], axis=1)
    if pads[3] > 0:
        dims = (result.shape[0], pads[3], result.shape[2])
        result = da.concatenate([result,
                                 da.zeros(dims, chunks=dims, dtype=result.dtype)], axis=1)
    return (result, _bounds[0], _bounds[1])
Example #24
Source File: transform.py From nbodykit with GNU General Public License v3.0 | 4 votes |
def ConcatenateSources(*sources, **kwargs):
    """
    Concatenate CatalogSource objects together, optionally including only
    certain columns in the returned source.

    .. note::
        The returned catalog object carries the meta-data from only
        the first catalog supplied to this function (in the ``attrs`` dict).

    Parameters
    ----------
    *sources : subclass of :class:`~nbodykit.base.catalog.CatalogSource`
        the catalog source objects to concatenate together
    columns : str, list of str, optional
        the columns to include in the concatenated catalog

    Returns
    -------
    CatalogSource :
        the concatenated catalog source object

    Examples
    --------
    >>> from nbodykit.lab import *
    >>> source1 = UniformCatalog(nbar=100, BoxSize=1.0)
    >>> source2 = UniformCatalog(nbar=100, BoxSize=1.0)
    >>> print(source1.csize, source2.csize)
    >>> combined = transform.ConcatenateSources(source1, source2, columns=['Position', 'Velocity'])
    >>> print(combined.csize)
    """
    from nbodykit.base.catalog import CatalogSource

    columns = kwargs.get('columns', None)

    if isinstance(columns, string_types):
        columns = [columns]

    # concatenate all columns, if none provided
    if columns is None or columns == []:
        columns = sources[0].columns

    # check comms
    if not all(src.comm == sources[0].comm for src in sources):
        raise ValueError("cannot concatenate sources: comm mismatch")

    # check all columns are there
    for source in sources:
        if not all(col in source for col in columns):
            raise ValueError(("cannot concatenate sources: columns are missing "
                              "from some sources"))

    # the total size
    size = numpy.sum([src.size for src in sources], dtype='intp')

    data = {}
    for col in columns:
        data[col] = da.concatenate([src[col] for src in sources], axis=0)

    toret = CatalogSource._from_columns(size, sources[0].comm, **data)
    toret.attrs.update(sources[0].attrs)
    return toret
Example #25
Source File: utilities.py From minian with GNU General Public License v3.0 | 4 votes |
def save_video(movpath, fname_mov_orig, fname_mov_rig, fname_AC, fname_ACbf,
               dsratio):
    """
    Parameters
    ----------
    movpath :
    fname_mov_orig :
    fname_mov_rig :
    fname_AC :
    fname_ACbf :
    dsratio :

    Returns
    -------
    """
    mov_orig = np.load(fname_mov_orig, mmap_mode='r')
    mov_rig = np.load(fname_mov_rig, mmap_mode='r')
    mov_ac = np.load(fname_AC, mmap_mode='r')
    mov_acbf = np.load(fname_ACbf, mmap_mode='r')
    vw = skv.FFmpegWriter(
        movpath, inputdict={'-framerate': '30'}, outputdict={'-r': '30'})
    for fidx in range(0, mov_orig.shape[0], dsratio):
        print("writing frame: " + str(fidx))
        fm_orig = mov_orig[fidx, :, :] * 255
        fm_rig = mov_rig[fidx, :, :] * 255
        fm_acbf = mov_acbf[fidx, :, :] * 255
        fm_ac = mov_ac[fidx, :, :] * 255
        fm = np.concatenate(
            [
                np.concatenate([fm_orig, fm_rig], axis=1),
                np.concatenate([fm_acbf, fm_ac], axis=1)
            ],
            axis=0)
        vw.writeFrame(fm)
    vw.close()
Example #26
Source File: utils.py From xmitgcm with MIT License | 4 votes |
def _pad_array(data, file_metadata, face=0):
    """
    Return a padded array. If input data is a numpy.memmap and no padding
    is necessary, the function preserves its type. Otherwise, the concatenate
    forces it to load into memory.

    Parameters
    ----------
    data : numpy array or memmap
        input data
    file_metadata : dict
        metadata for file
    face : int, optional
        llc face if applicable

    Returns
    -------
    numpy.array or numpy.memmap
    """

    # Pad data before in y direction
    if 'pad_before_y' in file_metadata:
        if file_metadata['has_faces']:
            facet_origin = file_metadata['face_facets'][face]
            nypad_before = file_metadata['pad_before_y'][facet_origin]
        else:
            nypad_before = file_metadata['pad_before_y']

        pad_before = np.zeros((nypad_before, file_metadata['nx']))
        data_padded_before = np.concatenate(
            (pad_before, data), axis=0)
    else:
        data_padded_before = data

    # Pad data after in y direction
    if 'pad_after_y' in file_metadata:
        if file_metadata['has_faces']:
            facet_origin = file_metadata['face_facets'][face]
            nypad_after = file_metadata['pad_after_y'][facet_origin]
        else:
            nypad_after = file_metadata['pad_after_y']

        pad_after = np.zeros((nypad_after, file_metadata['nx']))
        data_padded_after = np.concatenate(
            (data_padded_before, pad_after), axis=0)
    else:
        data_padded_after = data_padded_before

    return data_padded_after
Example #27
Source File: utils.py From xmitgcm with MIT License | 4 votes |
def _reshape_llc_data(data, jdim):  # pragma: no cover
    """Fix the weird problem with llc data array order."""
    # Can we do this without copying any data?
    # If not, we need to go upstream and implement this at the MDS level
    # Or can we fudge it with dask?
    # this is all very specific to the llc file output
    # would be nice to generalize more, but how?
    nside = data.shape[jdim] // LLC_NUM_FACES
    # how the LLC data is laid out along the j dimension
    strides = ((0, 3), (3, 6), (6, 7), (7, 10), (10, 13))
    # whether to reshape each face
    reshape = (False, False, False, True, True)
    # this will slice the data into 5 facets
    slices = [jdim * (slice(None),) + (slice(nside * st[0], nside * st[1]),)
              for st in strides]
    facet_arrays = [data[sl] for sl in slices]
    face_arrays = []
    for ar, rs, st in zip(facet_arrays, reshape, strides):
        nfaces_in_facet = st[1] - st[0]
        shape = list(ar.shape)
        if rs:
            # we assume the other horizontal dimension is immediately after jdim
            shape[jdim] = ar.shape[jdim + 1]
            shape[jdim + 1] = ar.shape[jdim]
        # insert a length-1 dimension along which to concatenate
        shape.insert(jdim, 1)
        # this modifies the array shape in place, with no copies allowed
        # but it doesn't work with dask arrays
        # ar.shape = shape
        ar = ar.reshape(shape)
        # now ar is properly shaped, but we still need to slice it into faces
        face_slice_dim = jdim + 1 + rs
        for n in range(nfaces_in_facet):
            face_slice = (face_slice_dim * (slice(None),) +
                          (slice(nside * n, nside * (n + 1)),))
            data_face = ar[face_slice]
            face_arrays.append(data_face)

    # We can't concatenate using numpy (hcat etc.) because it makes a copy,
    # presumably loading the memmaps into memory.
    # Using dask gets around this.
    # But what if we want different chunks, or already chunked the data
    # upstream? Doesn't seem like this is ideal
    # TODO: Refactor handling of dask arrays and chunking
    #return np.concatenate(face_arrays, axis=jdim)
    # the dask version doesn't work because of this:
    # https://github.com/dask/dask/issues/1645
    face_arrays_dask = [dsa.from_array(fa, chunks=fa.shape) for fa in face_arrays]
    concat = dsa.concatenate(face_arrays_dask, axis=jdim)
    return concat
Example #28
Source File: _split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 4 votes |
def _split(self, test_start, test_stop, n_samples, chunks, seeds):
    train_objs = []
    test_objs = []
    train_sizes = []
    test_sizes = []

    offset = 0
    for chunk, seed in zip(chunks, seeds):
        start, stop = offset, offset + chunk

        test_id_start = max(test_start, start)
        test_id_stop = min(test_stop, stop)

        if test_id_start < test_id_stop:
            test_objs.append(
                dask.delayed(_generate_offset_idx)(
                    chunk, test_id_start, test_id_stop, offset, seed
                )
            )
            test_sizes.append(test_id_stop - test_id_start)

        train_id_stop = min(test_id_start, stop)
        if train_id_stop > start:
            train_objs.append(
                dask.delayed(_generate_offset_idx)(
                    chunk, start, train_id_stop, offset, seed
                )
            )
            train_sizes.append(train_id_stop - start)

        train_id_start = max(test_id_stop, start)
        if train_id_start < stop:
            train_objs.append(
                dask.delayed(_generate_offset_idx)(
                    chunk, train_id_start, stop, offset, seed
                )
            )
            train_sizes.append(stop - train_id_start)

        offset = stop

    train_idx = da.concatenate(
        [
            da.from_delayed(obj, (train_size,), np.dtype("int"))
            for obj, train_size in zip(train_objs, train_sizes)
        ]
    )
    test_idx = da.concatenate(
        [
            da.from_delayed(obj, (test_size,), np.dtype("int"))
            for obj, test_size in zip(test_objs, test_sizes)
        ]
    )

    return train_idx, test_idx
Example #29
Source File: io_utils.py From pyxem with GNU General Public License v3.0 | 4 votes |
def _untangle_raw(data, hdr_info, stack_size):
    """
    Corrects for the tangled raw mib format - Only the case for quad chip is
    considered here.

    Parameters
    ----------
    data: dask array
        as stack with the detector array unreshaped, e.g. for a single frame
        512*512: (1, 262144)
    hdr_info: dict
        info read from the header - output of the _parse_hdr function
    stack_size: int
        The number of frames in the data

    Outputs
    -------
    untangled_data: dask array
        corrected dask array object reshaped on the detector plane, e.g. for
        a single frame case as above: (1, 512, 512)
    """
    width = hdr_info["width"]
    height = hdr_info["height"]

    width_height = width * height

    if (
        hdr_info["Counter Depth (number)"] == 24
        or hdr_info["Counter Depth (number)"] == 12
    ):
        cols = 4
    elif hdr_info["Counter Depth (number)"] == 1:
        cols = 64
    elif hdr_info["Counter Depth (number)"] == 6:
        cols = 8

    data = data.reshape((stack_size * width_height))
    data = data.reshape(stack_size, height * (height // cols), cols)
    data = da.flip(data, 2)

    if hdr_info["Assembly Size"] == "2x2":
        data = data.reshape((stack_size * width_height))
        data = data.reshape(stack_size, 512 // 2, 512 * 2)

        det1 = data[:, :, 0:256]
        det2 = data[:, :, 256:512]
        det3 = data[:, :, 512 : 512 + 256]
        det4 = data[:, :, 512 + 256 :]

        det3 = da.flip(det3, 2)
        det3 = da.flip(det3, 1)

        det4 = da.flip(det4, 2)
        det4 = da.flip(det4, 1)

        untangled_data = da.concatenate(
            (da.concatenate((det1, det3), 1), da.concatenate((det2, det4), 1)), 2
        )
    return untangled_data
Example #30
Source File: _encoders.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 4 votes |
def _transform(
    self, X: Union[ArrayLike, DataFrameType], handle_unknown: str = "error"
) -> Union[ArrayLike, DataFrameType]:
    X = check_array(
        X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True
    )

    is_array = isinstance(X, da.Array)

    if is_array:
        _, n_features = X.shape
    else:
        n_features = len(X.columns)

    if is_array:
        # We encode each column independently, as they have different categories.
        Xs = [
            _encode_dask_array(
                X[:, i],
                uniques=self.categories_[i],
                encode=True,
                onehot_dtype=self.dtype,
            )[1]
            for i in range(n_features)
        ]
        X = da.concatenate(Xs, axis=1)

        if not self.sparse:
            X = X.map_blocks(lambda x: x.toarray(), dtype=self.dtype)
    else:
        import dask.dataframe as dd

        # Validate that all are categorical.
        if not (X.dtypes == "category").all():
            raise ValueError("Must be all categorical.")

        if not len(X.columns) == len(self.categories_):
            raise ValueError(
                "Number of columns ({}) does not match number "
                "of categories_ ({})".format(len(X.columns), len(self.categories_))
            )

        for col, dtype in zip(X.columns, self.dtypes_):
            if not (X[col].dtype == dtype):
                raise ValueError(
                    "Different CategoricalDtype for fit and "
                    "transform. '{}' != '{}'".format(dtype, X[col].dtype)
                )

        return dd.get_dummies(X, sparse=self.sparse, dtype=self.dtype)

    return X