Python numpy.searchsorted() Examples
The following are 30 code examples of numpy.searchsorted(), extracted from open-source projects. Each example notes its original project and source file.
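Before the project examples, a minimal sketch of what numpy.searchsorted does: it returns the indices at which values would have to be inserted into an already-sorted array to keep it sorted, with side='left'/'right' controlling where ties land.

import numpy as np

a = np.array([1, 3, 3, 7])                   # must already be sorted
print(np.searchsorted(a, 3))                 # 1  (leftmost insertion point for a tie)
print(np.searchsorted(a, 3, side='right'))   # 3  (after the existing 3s)
print(np.searchsorted(a, [0, 4, 9]))         # [0 3 4]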
Example #1
Source File: sparse.py From recruit with Apache License 2.0

def _first_fill_value_loc(self):
    """
    Get the location of the first missing value.

    Returns
    -------
    int
    """
    if len(self) == 0 or self.sp_index.npoints == len(self):
        return -1

    indices = self.sp_index.to_int_index().indices
    if not len(indices) or indices[0] > 0:
        return 0

    diff = indices[1:] - indices[:-1]
    return np.searchsorted(diff, 2) + 1
Example #2
Source File: Collection.py From fullrmc with GNU Affero General Public License v3.0

def get_real_index(self, relativeIndex):
    """
    Compute real index of the given relativeIndex considering
    already collected indexes.

    :Parameters:
        #. relativeIndex (int): Atom relative index to already collected indexes.

    :Returns:
        #. index (int): Atom real index.
    """
    ### THIS IS NOT TESTED YET.
    indexes = np.array(sorted(self.indexes))
    shift = np.searchsorted(a=indexes, v=relativeIndex, side='left')
    index = relativeIndex + shift
    for idx in indexes[shift:]:
        if idx > index:
            break
        index += 1
    return index
Example #3
Source File: utils.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0

def sample_categorical(prob, rng):
    """Sample from independent categorical distributions

    Each batch is an independent categorical distribution.

    Parameters
    ----------
    prob : numpy.ndarray
        Probability of the categorical distribution. Shape --> (batch_num, category_num)
    rng : numpy.random.RandomState

    Returns
    -------
    ret : numpy.ndarray
        Sampling result. Shape --> (batch_num,)
    """
    ret = numpy.empty(prob.shape[0], dtype=numpy.float32)
    for ind in range(prob.shape[0]):
        ret[ind] = numpy.searchsorted(numpy.cumsum(prob[ind]), rng.rand()).clip(
            min=0.0, max=prob.shape[1] - 0.5)
    return ret
Example #4
Source File: pregenerate_training_data.py From tpu_pretrain with Apache License 2.0

def sample_doc(self, current_idx, sentence_weighted=True):
    # Uses the current iteration counter to ensure we don't sample the same doc twice
    if sentence_weighted:
        # With sentence weighting, we sample docs proportionally to their sentence length
        if self.doc_cumsum is None or len(self.doc_cumsum) != len(self.doc_lengths):
            self._precalculate_doc_weights()
        rand_start = self.doc_cumsum[current_idx]
        rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
        sentence_index = randrange(rand_start, rand_end) % self.cumsum_max
        sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
    else:
        # If we don't use sentence weighting, then every doc has an equal chance to be chosen
        sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths)
    assert sampled_doc_index != current_idx
    if self.reduce_memory:
        return self.document_shelf[str(sampled_doc_index)]
    else:
        return self.documents[sampled_doc_index]
Example #5
Source File: MoveGenerator.py From fullrmc with GNU Affero General Public License v3.0

def move(self, coordinates):
    """
    Move coordinates.

    :Parameters:
        #. coordinates (np.ndarray): The coordinates on which to apply the transformation.

    :Returns:
        #. coordinates (np.ndarray): The new coordinates after applying the transformation.
    """
    if self.__randomize:
        index = INT_TYPE(np.searchsorted(self.__selectionScheme, generate_random_float()))
        moveGenerator = self.__collection[index]
    else:
        moveGenerator = self.__collection[self.__step]
        self.__step = (self.__step + 1) % len(self.__collection)
    # perform the move
    return moveGenerator.move(coordinates)
Example #6
Source File: Collection.py From fullrmc with GNU Affero General Public License v3.0

def collect(self, index, dataDict, check=True):
    """
    Collect atom given its index.

    :Parameters:
        #. index (int): The atom index to collect.
        #. dataDict (dict): The atom data dict to collect.
        #. check (boolean): Whether to check dataDict keys before collecting.
           If set to False, user promises that collected data is a dictionary
           and contains the needed keys.
    """
    assert not self.is_collected(index), LOGGER.error("attempting to collect an already collected atom of index '%i'" % index)
    # add data
    if check:
        assert isinstance(dataDict, dict), LOGGER.error("dataDict must be a dictionary of data where keys are dataKeys")
        assert tuple(sorted(dataDict)) == self.__dataKeys, LOGGER.error("dataDict keys don't match promised dataKeys")
    self.__collectedData[index] = dataDict
    # set indexes sorted array
    idx = np.searchsorted(a=self.__indexesSortedArray, v=index, side='left')
    self.__indexesSortedArray = np.insert(self.__indexesSortedArray, idx, index)
    # set state
    self.__state = str(uuid.uuid1())
Example #7
Source File: Collection.py From fullrmc with GNU Affero General Public License v3.0

def release(self, index):
    """
    Release atom from list of collected atoms and return its collected data.

    :Parameters:
        #. index (int): The atom index to release.

    :Returns:
        #. dataDict (dict): The released atom collected data.
    """
    if not self.is_collected(index):
        LOGGER.warn("Attempting to release atom %i that is not collected." % index)
        return
    index = self.__collectedData.pop(index)
    # set indexes sorted array
    idx = np.searchsorted(a=self.__indexesSortedArray, v=index, side='left')
    self.__indexesSortedArray = np.insert(self.__indexesSortedArray, idx, index)
    # set state
    self.__state = str(uuid.uuid1())
    # return
    return index
Example #8
Source File: pfilter.py From pfilter with MIT License

def residual_resample(weights):
    n = len(weights)
    indices = np.zeros(n, np.uint32)
    # take int(N*w) copies of each weight
    num_copies = (n * weights).astype(np.uint32)
    k = 0
    for i in range(n):
        for _ in range(num_copies[i]):  # make n copies
            indices[k] = i
            k += 1
    # use multinomial resample on the residual to fill up the rest.
    residual = weights - num_copies  # get fractional part
    residual /= np.sum(residual)
    cumsum = np.cumsum(residual)
    cumsum[-1] = 1
    indices[k:n] = np.searchsorted(cumsum, np.random.uniform(0, 1, n - k))
    return indices
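The last three lines above are the standard inverse-CDF (multinomial) resampling trick. A standalone numpy-only sketch of that pattern, with a hypothetical weight vector assumed to be normalized:

import numpy as np

weights = np.array([0.1, 0.4, 0.2, 0.3])   # hypothetical normalized particle weights
cumsum = np.cumsum(weights)
cumsum[-1] = 1.0                            # guard against floating-point round-off
draws = np.random.uniform(0, 1, 10)
indices = np.searchsorted(cumsum, draws)    # index i is drawn with probability weights[i]
print(indices)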
Example #9
Source File: selection.py From pyshgp with MIT License

def select(self, population: Population, n: int = 1) -> Sequence[Individual]:
    """Return `n` individuals from the population.

    Parameters
    ----------
    population
        A Population of Individuals.
    n : int
        The number of parents to select from the population. Default is 1.

    Returns
    -------
    Sequence[Individual]
        The selected Individuals.
    """
    super().select(population, n)
    population_total_errors = np.array([i.total_error for i in population])
    sum_of_total_errors = np.sum(population_total_errors)
    probabilities = 1.0 - (population_total_errors / sum_of_total_errors)
    selected_ndxs = np.searchsorted(np.cumsum(probabilities), random(n))
    return [population[ndx] for ndx in selected_ndxs]
Example #10
Source File: carbonara.py From gnocchi with Apache License 2.0

def __init__(self, ts, granularity, start=None):
    # NOTE(sileht): The whole class assumes ts is ordered and doesn't have
    # duplicate timestamps. It uses numpy.unique, which sorts, but we always
    # assume the order to be the same as the input.
    self.granularity = granularity
    self.can_derive = isinstance(granularity, numpy.timedelta64)
    self.start = start
    if start is None:
        self._ts = ts
        self._ts_for_derive = ts
    else:
        self._ts = ts[numpy.searchsorted(ts['timestamps'], start):]
        if self.can_derive:
            start_derive = start - granularity
            self._ts_for_derive = ts[
                numpy.searchsorted(ts['timestamps'], start_derive):
            ]
    if self.can_derive:
        self.indexes = round_timestamp(self._ts['timestamps'], granularity)
    elif calendar.GROUPINGS.get(granularity):
        self.indexes = calendar.GROUPINGS.get(granularity)(
            self._ts['timestamps'])
    self.tstamps, self.counts = numpy.unique(self.indexes, return_counts=True)
Example #11
Source File: carbonara.py From gnocchi with Apache License 2.0

def __getitem__(self, key):
    if isinstance(key, numpy.datetime64):
        idx = numpy.searchsorted(self.timestamps, key)
        if self.timestamps[idx] == key:
            return self[idx]
        raise KeyError(key)
    if isinstance(key, slice):
        if isinstance(key.start, numpy.datetime64):
            start = numpy.searchsorted(self.timestamps, key.start)
        else:
            start = key.start
        if isinstance(key.stop, numpy.datetime64):
            stop = numpy.searchsorted(self.timestamps, key.stop)
        else:
            stop = key.stop
        key = slice(start, stop, key.step)
    return self.ts[key]
Example #12
Source File: carbonara.py From gnocchi with Apache License 2.0

def set_values(self, values, before_truncate_callback=None):
    """Set the timestamps and values in this timeseries.

    :param values: A sorted timeseries array.
    :param before_truncate_callback: A callback function to call before
                                     truncating the BoundTimeSerie to its
                                     maximum size.
    :return: None or the return value of before_truncate_callback
    """
    if self.block_size is not None and len(self.ts) != 0:
        index = numpy.searchsorted(values['timestamps'],
                                   self.first_block_timestamp())
        values = values[index:]
    super(BoundTimeSerie, self).set_values(values)
    if before_truncate_callback:
        return_value = before_truncate_callback(self)
    else:
        return_value = None
    self._truncate()
    return return_value
Example #13
Source File: inference_utils.py From ffn with Apache License 2.0

def compute_histogram_lut(image):
    """Computes the inverted CDF of image intensity.

    Args:
        image: 2d numpy array containing the image

    Returns:
        a 256-element numpy array representing a lookup table `lut`, such that
        lut[uniform_image] will transform `uniform_image` with a uniform
        intensity distribution to have an intensity distribution matching
        `image`.
    """
    cdf, bins = skimage.exposure.cumulative_distribution(image)
    lut = np.zeros(256, dtype=np.uint8)
    for i in range(0, 256):
        lut[i] = bins[np.searchsorted(cdf, i / 255.0)]
    return lut
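A minimal usage sketch for the function above, assuming it is importable together with scikit-image; the reference and uniform images here are hypothetical random arrays used only for illustration:

import numpy as np

rng = np.random.default_rng(0)
reference = rng.integers(0, 256, size=(64, 64)).astype(np.uint8)       # hypothetical target image
uniform_image = rng.integers(0, 256, size=(64, 64)).astype(np.uint8)   # image to be remapped

lut = compute_histogram_lut(reference)
matched = lut[uniform_image]   # intensities of `matched` now roughly follow `reference`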
Example #14
Source File: test_bayestar.py From dustmaps with GNU General Public License v2.0

def _interp_ebv(self, datum, dist):
    """
    Calculate samples of E(B-V) at an arbitrary distance (in kpc) for one
    test coordinate.
    """
    dm = 5. * (np.log10(dist) + 2.)
    idx_ceil = np.searchsorted(datum['DM_bin_edges'], dm)
    if idx_ceil == 0:
        dist_0 = 10.**(datum['DM_bin_edges'][0]/5. - 2.)
        return dist/dist_0 * datum['samples'][:, 0]
    elif idx_ceil == len(datum['DM_bin_edges']):
        return datum['samples'][:, -1]
    else:
        dm_ceil = datum['DM_bin_edges'][idx_ceil]
        dm_floor = datum['DM_bin_edges'][idx_ceil-1]
        a = (dm_ceil - dm) / (dm_ceil - dm_floor)
        return ((1.-a) * datum['samples'][:, idx_ceil]
                + a * datum['samples'][:, idx_ceil-1])
Example #15
Source File: period.py From recruit with Apache License 2.0

def asof_locs(self, where, mask):
    """
    where : array of timestamps
    mask : array of booleans where data is not NA
    """
    where_idx = where
    if isinstance(where_idx, DatetimeIndex):
        where_idx = PeriodIndex(where_idx.values, freq=self.freq)

    locs = self._ndarray_values[mask].searchsorted(
        where_idx._ndarray_values, side='right')

    locs = np.where(locs > 0, locs - 1, 0)
    result = np.arange(len(self))[mask].take(locs)

    first = mask.argmax()
    result[(locs == 0) & (where_idx._ndarray_values <
                          self._ndarray_values[first])] = -1

    return result
Example #16
Source File: _pandas_ndarray_store.py From arctic with GNU Lesser General Public License v2.1

def _index_range(self, version, symbol, date_range=None, **kwargs):
    """
    Given a version, read the segment_index and return the chunks associated
    with the date_range. As the segment index is (id -> last datetime) we need
    to take care in choosing the correct chunks.
    """
    if date_range and 'segment_index' in version:
        # index is read-only but it's never written to
        index = np.frombuffer(decompress(version['segment_index']), dtype=INDEX_DTYPE)
        dtcol = self._datetime64_index(index)
        if dtcol and len(index):
            dts = index[dtcol]
            start, end = _start_end(date_range, dts)
            if start > dts[-1]:
                return -1, -1
            idxstart = min(np.searchsorted(dts, start), len(dts) - 1)
            idxend = min(np.searchsorted(dts, end, side='right'), len(dts) - 1)
            return int(index['index'][idxstart]), int(index['index'][idxend] + 1)
    return super(PandasStore, self)._index_range(version, symbol, **kwargs)
Example #17
Source File: ls_fap.py From feets with MIT License

def fap_bootstrap(
    Z, fmax, t, y, dy, normalization="standard", n_bootstraps=1000,
    random_seed=None,
):
    rng = np.random.RandomState(random_seed)

    def bootstrapped_power():
        resample = rng.randint(0, len(y), len(y))  # sample with replacement
        ls_boot = LombScargle(t, y[resample], dy[resample])
        freq, power = ls_boot.autopower(
            normalization=normalization, maximum_frequency=fmax
        )
        return power.max()

    pmax = np.array([bootstrapped_power() for i in range(n_bootstraps)])
    pmax.sort()
    return 1 - np.searchsorted(pmax, Z) / len(pmax)
Example #18
Source File: period.py From recruit with Apache License 2.0

def searchsorted(self, value, side='left', sorter=None):
    if isinstance(value, Period):
        if value.freq != self.freq:
            msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
                                        own_freq=self.freqstr,
                                        other_freq=value.freqstr)
            raise IncompatibleFrequency(msg)
        value = value.ordinal
    elif isinstance(value, compat.string_types):
        try:
            value = Period(value, freq=self.freq).ordinal
        except DateParseError:
            raise KeyError("Cannot interpret '{}' as period".format(value))

    return self._ndarray_values.searchsorted(value, side=side, sorter=sorter)
Example #19
Source File: population.py From ibllib with MIT License

def _index_of(arr, lookup):
    """Replace scalars in an array by their indices in a lookup table.

    Implicitly assume that:

    * All elements of arr and lookup are non-negative integers.
    * All elements of arr belong to lookup.

    This is not checked for performance reasons.
    """
    # Equivalent of np.digitize(arr, lookup) - 1, but much faster.
    # TODO: assertions to disable in production for performance reasons.
    # TODO: np.searchsorted(lookup, arr) is faster on small arrays with large
    # values.
    lookup = np.asarray(lookup, dtype=np.int32)
    m = (lookup.max() if len(lookup) else 0) + 1
    tmp = np.zeros(m + 1, dtype=np.int)
    # Ensure that -1 values are kept.
    tmp[-1] = -1
    if len(lookup):
        tmp[lookup] = np.arange(len(lookup))
    return tmp[arr]
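The second TODO comment above points at a searchsorted-based alternative. A minimal numpy-only sketch of that variant, assuming lookup is already sorted (which the lookup-table version does not require):

import numpy as np

lookup = np.array([2, 5, 7, 11])      # sorted lookup table
arr = np.array([7, 2, 11, 5])
print(np.searchsorted(lookup, arr))   # [2 0 3 1] -- position of each value in lookup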
Example #20
Source File: graph.py From jwalk with Apache License 2.0

def encode_edges(edges, nodes):
    """Encode data with dictionary

    Args:
        edges (np.ndarray): np array of the form [node1, node2].
        nodes (np.array): list of unique nodes

    Returns:
        np.ndarray: relabeled edges

    Examples:
        >>> import numpy as np
        >>> edges = np.array([['A', 'B'], ['A', 'C']])
        >>> nodes = np.array(['C', 'B', 'A'])
        >>> print(encode_edges(edges, nodes))
        [[2 1]
         [2 0]]
    """
    sidx = nodes.argsort()
    relabeled_edges = sidx[np.searchsorted(nodes, edges, sorter=sidx)]
    return relabeled_edges
Example #21
Source File: dmc.py From pyqmc with MIT License

def branch(configs, weights):
    """
    Perform branching on a set of walkers by stochastic reconfiguration

    Walkers are resampled with probability proportional to the weights, and
    the new weights are all set to be equal to the average weight.

    Args:
        configs: (nconfig,nelec,3) walker coordinates
        weights: (nconfig,) walker weights

    Returns:
        configs: resampled walker configurations
        weights: (nconfig,) all weights are equal to average weight
    """
    nconfig = configs.configs.shape[0]
    wtot = np.sum(weights)
    probability = np.cumsum(weights / wtot)
    base = np.random.rand()
    newinds = np.searchsorted(probability, (base + np.arange(nconfig) / nconfig) % 1.0)
    configs.resample(newinds)
    weights.fill(wtot / nconfig)
    return configs, weights
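The cumsum + searchsorted pattern above is systematic (comb) resampling. A standalone numpy-only sketch with hypothetical weights, independent of the walker objects used in the example:

import numpy as np

weights = np.array([0.5, 1.5, 1.0, 2.0])           # hypothetical unnormalized walker weights
probability = np.cumsum(weights / weights.sum())   # normalized CDF, ends at 1.0

base = np.random.rand()                             # single random offset for the whole comb
comb = (base + np.arange(len(weights)) / len(weights)) % 1.0
newinds = np.searchsorted(probability, comb)        # indices drawn roughly proportionally to weight
print(newinds)   # heavy walkers appear multiple times, light ones may vanish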
Example #22
Source File: cascade_lifetime.py From news-popularity-prediction with Apache License 2.0

def get_k_based_on_lifetime(data_frame, lifetime, min_k, max_k):
    lifetime_col = data_frame["timestamp"] - data_frame["timestamp"].iloc[0]
    lifetime_col = lifetime_col.iloc[min_k:]

    index = np.searchsorted(lifetime_col, lifetime)
    index = max(0, index[0] - 1)

    k = min_k + index

    if lifetime_col.size > (index + 1):
        next_t = lifetime_col.iloc[index + 1]
        if k == min_k:
            if lifetime_col.iloc[index] == lifetime_col.iloc[index + 1]:
                k += 1
                if lifetime_col.size > (index + 2):
                    next_t = lifetime_col.iloc[index + 2]
                else:
                    next_t = np.nan
    else:
        next_t = np.nan

    return k, next_t
Example #23
Source File: carbonara.py From gnocchi with Apache License 2.0

def truncate(self, oldest_point=None):
    """Truncate the time series up to oldest_point excluded.

    :param oldest_point: Oldest point to keep from, this excluded.
                         Default is the aggregation timespan.
    :type oldest_point: numpy.datetime64 or numpy.timedelta64
    :return: The oldest point that could have been kept.
    """
    last = self.last
    if last is None:
        return
    if oldest_point is None:
        oldest_point = self.aggregation.timespan
        if oldest_point is None:
            return
    if isinstance(oldest_point, numpy.timedelta64):
        oldest_point = last - oldest_point
    index = numpy.searchsorted(self.ts['timestamps'], oldest_point, side='right')
    self.ts = self.ts[index:]
    return oldest_point
Example #24
Source File: period.py From recruit with Apache License 2.0

def _get_string_slice(self, key):
    if not self.is_monotonic:
        raise ValueError('Partial indexing only valid for '
                         'ordered time series')

    key, parsed, reso = parse_time_string(key, self.freq)
    grp = resolution.Resolution.get_freq_group(reso)
    freqn = resolution.get_freq_group(self.freq)
    if reso in ['day', 'hour', 'minute', 'second'] and not grp < freqn:
        raise KeyError(key)

    t1, t2 = self._parsed_string_to_bounds(reso, parsed)
    return slice(self.searchsorted(t1.ordinal, side='left'),
                 self.searchsorted(t2.ordinal, side='right'))
Example #25
Source File: base.py From recruit with Apache License 2.0

def _searchsorted_monotonic(self, label, side='left'):
    if self.is_monotonic_increasing:
        return self.searchsorted(label, side=side)
    elif self.is_monotonic_decreasing:
        # np.searchsorted expects ascending sort order, have to reverse
        # everything for it to work (element ordering, search side and
        # resulting value).
        pos = self[::-1].searchsorted(label, side='right' if side == 'left'
                                      else 'left')
        return len(self) - pos

    raise ValueError('index must be monotonic increasing or decreasing')
Example #26
Source File: utils.py From mars with Apache License 2.0

def calc_columns_index(column_name, df):
    """
    Calculate the chunk index on the axis 1 according to the selected column.

    :param column_name: selected column name
    :param df: input tiled DataFrame
    :return: chunk index on the columns axis
    """
    column_nsplits = df.nsplits[1]
    column_loc = df.columns_value.to_pandas().get_loc(column_name)
    return np.searchsorted(np.cumsum(column_nsplits), column_loc + 1)
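The same pattern works outside Mars: a minimal numpy-only sketch (with hypothetical chunk widths) that maps a global column position to the chunk holding it, by searching the cumulative split sizes:

import numpy as np

nsplits = [3, 4, 2]             # hypothetical chunk widths along axis 1
bounds = np.cumsum(nsplits)     # [3 7 9] -> exclusive upper bound of each chunk

for column_loc in (0, 3, 8):
    chunk_index = np.searchsorted(bounds, column_loc + 1)
    print(column_loc, "->", chunk_index)   # 0 -> 0, 3 -> 1, 8 -> 2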
Example #27
Source File: test_datetime_index.py From recruit with Apache License 2.0

def test_resample_size():
    n = 10000
    dr = date_range('2015-09-19', periods=n, freq='T')
    ts = Series(np.random.randn(n), index=np.random.choice(dr, n))

    left = ts.resample('7T').size()
    ix = date_range(start=left.index.min(), end=ts.index.max(), freq='7T')

    bins = np.searchsorted(ix.values, ts.index.values, side='right')
    val = np.bincount(bins, minlength=len(ix) + 1)[1:].astype('int64',
                                                              copy=False)

    right = Series(val, index=ix)
    assert_series_equal(left, right)
Example #28
Source File: histograms.py From recruit with Apache License 2.0

def _search_sorted_inclusive(a, v):
    """
    Like `searchsorted`, but where the last item in `v` is placed on the right.

    In the context of a histogram, this makes the last bin edge inclusive
    """
    return np.concatenate((
        a.searchsorted(v[:-1], 'left'),
        a.searchsorted(v[-1:], 'right')
    ))
Example #29
Source File: sparse.py From recruit with Apache License 2.0

def searchsorted(self, v, side="left", sorter=None):
    msg = "searchsorted requires high memory usage."
    warnings.warn(msg, PerformanceWarning, stacklevel=2)
    if not is_scalar(v):
        v = np.asarray(v)
    v = np.asarray(v)
    return np.asarray(self, dtype=self.dtype.subtype).searchsorted(
        v, side, sorter
    )
Example #30
Source File: test_datetime_index.py From recruit with Apache License 2.0

def test_resample_group_info(n, k):
    # GH10914

    # use a fixed seed to always have the same uniques
    prng = np.random.RandomState(1234)

    dr = date_range(start='2015-08-27', periods=n // 10, freq='T')
    ts = Series(prng.randint(0, n // k, n).astype('int64'),
                index=prng.choice(dr, n))

    left = ts.resample('30T').nunique()
    ix = date_range(start=ts.index.min(), end=ts.index.max(),
                    freq='30T')

    vals = ts.values
    bins = np.searchsorted(ix.values, ts.index, side='right')

    sorter = np.lexsort((vals, bins))
    vals, bins = vals[sorter], bins[sorter]

    mask = np.r_[True, vals[1:] != vals[:-1]]
    mask |= np.r_[True, bins[1:] != bins[:-1]]

    arr = np.bincount(bins[mask] - 1,
                      minlength=len(ix)).astype('int64', copy=False)
    right = Series(arr, index=ix)

    assert_series_equal(left, right)