Python Examples of pandas.core.algorithms.unique

Source File: test_algos.py From vnpy_crypto with MIT License

7 votes

def test_datetime64_dtype_array_returned(self):
        """Check that ``algos.unique`` returns an ``M8[ns]`` array for datetimes.

        The same duplicated timestamps are fed in three forms: the result of
        ``pd.to_datetime``, a Series built from it, and that Series' values.
        """
        # GH 9431
        def check_unique(values):
            # Uniques must match ``expected`` both in content and in dtype.
            uniques = algos.unique(values)
            tm.assert_numpy_array_equal(uniques, expected)
            assert uniques.dtype == expected.dtype

        later_stamp = '2015-01-03T00:00:00.000000000+0000'
        earlier_stamp = '2015-01-01T00:00:00.000000000+0000'
        expected = np_array_datetime64_compat([later_stamp, earlier_stamp],
                                              dtype='M8[ns]')

        dt_index = pd.to_datetime([later_stamp, earlier_stamp, earlier_stamp])
        check_unique(dt_index)

        as_series = Series(dt_index)
        check_unique(as_series)

        check_unique(as_series.values)

Source File: frequencies.py From recruit with Apache License 2.0

6 votes

def _get_wom_rule(self):
        """Return a ``'WOM-<week><weekday>'`` rule string, or None.

        None is returned when ``self.index`` covers more than one weekday, or
        when its week-of-month values below 4 do not reduce to exactly one.
        """
        distinct_weekdays = unique(self.index.weekday)
        if len(distinct_weekdays) > 1:
            return None

        # Zero-based week of the month for each entry: days 1-7 -> 0, etc.
        week_slots = unique((self.index.day - 1) // 7)
        # Only attempt to infer up to WOM-4. See #9425
        week_slots = week_slots[week_slots < 4]
        if len(week_slots) != 1:
            return None

        # Rule strings use a one-based week number.
        week_number = week_slots[0] + 1
        weekday_name = int_to_weekday[distinct_weekdays[0]]
        return 'WOM-{week}{weekday}'.format(week=week_number,
                                            weekday=weekday_name)

Source File: multi.py From recruit with Apache License 2.0

6 votes

def _engine(self):
        """Build the lookup engine for this index from per-level bit offsets.

        Returns a ``MultiIndexUIntEngine`` when the combined codes need at
        most 64 bits, and a ``MultiIndexPyIntEngine`` otherwise.
        """
        # Bits needed per level: log2 of the level size plus one extra slot
        # (the -1 code used for NaN), rounded up.
        level_sizes = [len(level) + 1 for level in self.levels]
        bits_per_level = np.ceil(np.log2(level_sizes))

        # Running total of bits, accumulated from the right-most level.
        cumulative_bits = np.cumsum(bits_per_level[::-1])[::-1]

        # The offset of a level is the total bit count of every level to its
        # right, so the last level is not shifted at all. Combining the
        # shifted codes gives one integer per row whose ordering matches the
        # lexicographic ordering of the codes.
        shifts = np.concatenate([cumulative_bits[1:], [0]]).astype('uint64')

        # More than 64 bits would overflow a uint64, so fall back to the
        # engine based on Python integers in that case.
        engine_cls = (MultiIndexPyIntEngine if cumulative_bits[0] > 64
                      else MultiIndexUIntEngine)
        return engine_cls(self.levels, self.codes, shifts)

Source File: pytables.py From Computable with MIT License

6 votes

def _reindex_axis(obj, axis, labels, other=None):
    """Select ``labels`` along ``axis`` of ``obj``, avoiding needless work.

    Parameters
    ----------
    obj : object
        Must expose ``_get_axis``, ``ndim`` and ``.loc``.
    axis : int
        Axis number; passed to ``obj._get_axis`` and used as the position of
        the label selector.
    labels : list-like
        Labels to keep along ``axis``.
    other : list-like, optional
        When given, only labels that also occur in ``other`` are kept.

    Returns
    -------
    ``obj`` itself when the requested labels already equal the axis,
    otherwise the result of ``obj.loc[...]`` restricted to those labels.
    """
    ax = obj._get_axis(axis)
    labels = _ensure_index(labels)

    # try not to reindex even if other is provided
    # if it equals our current index
    if other is not None:
        other = _ensure_index(other)
    if (other is None or labels.equals(other)) and labels.equals(ax):
        return obj

    labels = _ensure_index(labels.unique())
    if other is not None:
        # Spell the set intersection out explicitly: later pandas releases
        # deprecate ``&`` as a set operation on Index objects.
        labels = labels.intersection(_ensure_index(other.unique()))
    if not labels.equals(ax):
        # Full slice on every axis except the one being restricted.
        slicer = [slice(None, None)] * obj.ndim
        slicer[axis] = labels
        obj = obj.loc[tuple(slicer)]
    return obj

Source File: frequencies.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def _get_wom_rule(self):
        """Infer a week-of-month rule string from ``self.index``.

        Returns ``'WOM-<week><weekday>'`` when all entries share one weekday
        and exactly one week-of-month value below 4 remains; otherwise None.
        """
        seen_weekdays = unique(self.index.weekday)
        if len(seen_weekdays) > 1:
            return None

        # (day - 1) // 7 maps days 1-7 to 0, 8-14 to 1, and so on.
        seen_weeks = unique((self.index.day - 1) // 7)
        # Only attempt to infer up to WOM-4. See #9425
        seen_weeks = seen_weeks[seen_weeks < 4]
        if len(seen_weeks) != 1:
            return None

        # The rule string carries a one-based week number.
        one_based_week = seen_weeks[0] + 1
        weekday_label = int_to_weekday[seen_weekdays[0]]
        return 'WOM-{week}{weekday}'.format(week=one_based_week,
                                            weekday=weekday_label)

Source File: frequencies.py From Computable with MIT License

6 votes

def _get_wom_rule(self):
        """Infer a week-of-month rule string from ``self.index``.

        Returns
        -------
        str or None
            ``'WOM-<week><weekday alias>'`` when every entry of the index
            falls on the same weekday and in the same week of the month, and
            that week is one of the first four. None otherwise.
        """
        weekdays = unique(self.index.weekday)
        if len(weekdays) > 1:
            return None

        # (day - 1) // 7 is the zero-based week of the month.
        week_of_months = unique((self.index.day - 1) // 7)
        if len(week_of_months) > 1:
            return None

        # get which week
        week = week_of_months[0] + 1
        # Days 29-31 fall into a fifth week. Only attempt to infer up to
        # WOM-4 (pandas issue #9425) instead of returning a 'WOM-5...' rule.
        if week > 4:
            return None
        wd = _weekday_rule_aliases[weekdays[0]]

        return 'WOM-%d%s' % (week, wd)

Source File: pytables.py From vnpy_crypto with MIT License

6 votes

def _reindex_axis(obj, axis, labels, other=None):
    """Select ``labels`` along ``axis`` of ``obj``, avoiding needless work.

    Parameters
    ----------
    obj : object
        Must expose ``_get_axis``, ``ndim`` and ``.loc``.
    axis : int
        Axis number; passed to ``obj._get_axis`` and used as the position of
        the label selector.
    labels : list-like
        Labels to keep along ``axis``.
    other : list-like, optional
        When given, only labels that also occur in ``other`` are kept.

    Returns
    -------
    ``obj`` itself when the requested labels already equal the axis,
    otherwise the result of ``obj.loc[...]`` restricted to those labels.
    """
    ax = obj._get_axis(axis)
    labels = _ensure_index(labels)

    # try not to reindex even if other is provided
    # if it equals our current index
    if other is not None:
        other = _ensure_index(other)
    if (other is None or labels.equals(other)) and labels.equals(ax):
        return obj

    labels = _ensure_index(labels.unique())
    if other is not None:
        # Spell the set intersection out explicitly: later pandas releases
        # deprecate ``&`` as a set operation on Index objects.
        labels = _ensure_index(other.unique()).intersection(labels)
    if not labels.equals(ax):
        # Full slice on every axis except the one being restricted.
        slicer = [slice(None, None)] * obj.ndim
        slicer[axis] = labels
        obj = obj.loc[tuple(slicer)]
    return obj

Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_datetime64_dtype_array_returned(self):
        """Check that ``algos.unique`` returns an ``M8[ns]`` array for datetimes.

        The same duplicated timestamps are fed in three forms: the result of
        ``pd.to_datetime``, a Series built from it, and that Series' values.
        """
        # GH 9431
        def check_unique(values):
            # Uniques must match ``expected`` both in content and in dtype.
            uniques = algos.unique(values)
            tm.assert_numpy_array_equal(uniques, expected)
            assert uniques.dtype == expected.dtype

        expected = np_array_datetime64_compat(
            ['2015-01-03T00:00:00.000000000+0000',
             '2015-01-01T00:00:00.000000000+0000'],
            dtype='M8[ns]')

        later_stamp = '2015-01-03T00:00:00.000000000'
        earlier_stamp = '2015-01-01T00:00:00.000000000'
        dt_index = pd.to_datetime([later_stamp, earlier_stamp, earlier_stamp])
        check_unique(dt_index)

        as_series = Series(dt_index)
        check_unique(as_series)

        check_unique(as_series.values)

Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_timedelta64_dtype_array_returned(self):
        """Check that ``algos.unique`` returns an ``m8[ns]`` array for timedeltas.

        The same duplicated values are fed in three forms: the result of
        ``pd.to_timedelta``, a Series built from it, and that Series' values.
        """
        # GH 9431
        def check_unique(values):
            # Uniques must match ``expected`` both in content and in dtype.
            uniques = algos.unique(values)
            tm.assert_numpy_array_equal(uniques, expected)
            assert uniques.dtype == expected.dtype

        # Expected uniques are in order of first appearance in the input.
        expected = np.array([31200, 45678, 10000], dtype='m8[ns]')

        td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
        check_unique(td_index)

        as_series = Series(td_index)
        check_unique(as_series)

        check_unique(as_series.values)

Source File: pytables.py From vnpy_crypto with MIT License

6 votes

def write(self, obj, **kwargs):
        """Write the axes and blocks of ``obj`` after the parent class' write.

        Raises
        ------
        ValueError
            If the first axis of the (consolidated) data is not unique.
        """
        super(BlockManagerFixed, self).write(obj, **kwargs)

        manager = obj._data
        if not manager.is_consolidated():
            manager = manager.consolidate()

        self.attrs.ndim = manager.ndim
        for axis_no, axis_index in enumerate(manager.axes):
            # Only the first axis is required to be unique.
            if axis_no == 0 and not axis_index.is_unique:
                raise ValueError(
                    "Columns index has to be unique for fixed format")
            self.write_index('axis%d' % axis_no, axis_index)

        # Supporting mixed-type DataFrame objects...nontrivial
        self.attrs.nblocks = len(manager.blocks)
        for block_no, block in enumerate(manager.blocks):
            # Keep this order: writing values before items fixed #2299.
            block_items = manager.items.take(block.mgr_locs)
            self.write_array('block%d_values' % block_no, block.values,
                             items=block_items)
            self.write_index('block%d_items' % block_no, block_items)

Source File: pytables.py From recruit with Apache License 2.0

6 votes

def _reindex_axis(obj, axis, labels, other=None):
    ax = obj._get_axis(axis)
    labels = ensure_index(labels)

    # try not to reindex even if other is provided
    # if it equals our current index
    if other is not None:
        other = ensure_index(other)
    if (other is None or labels.equals(other)) and labels.equals(ax):
        return obj

    labels = ensure_index(labels.unique())
    if other is not None:
        labels = ensure_index(other.unique()).intersection(labels, sort=False)
    if not labels.equals(ax):
        slicer = [slice(None, None)] * obj.ndim
        slicer[axis] = labels
        obj = obj.loc[tuple(slicer)]
    return obj

Source File: test_algos.py From recruit with Apache License 2.0

6 votes

def test_timedelta64_dtype_array_returned(self):
        """Check that ``algos.unique`` returns an ``m8[ns]`` array for timedeltas.

        The same duplicated values are fed in three forms: the result of
        ``pd.to_timedelta``, a Series built from it, and that Series' values.
        """
        # GH 9431
        def check_unique(values):
            # Uniques must match ``expected`` both in content and in dtype.
            uniques = algos.unique(values)
            tm.assert_numpy_array_equal(uniques, expected)
            assert uniques.dtype == expected.dtype

        # Expected uniques are in order of first appearance in the input.
        expected = np.array([31200, 45678, 10000], dtype='m8[ns]')

        td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
        check_unique(td_index)

        as_series = Series(td_index)
        check_unique(as_series)

        check_unique(as_series.values)

Source File: test_algos.py From recruit with Apache License 2.0

6 votes

def test_datetime64_dtype_array_returned(self):
        """Check that ``algos.unique`` returns an ``M8[ns]`` array for datetimes.

        The same duplicated timestamps are fed in three forms: the result of
        ``pd.to_datetime``, a Series built from it, and that Series' values.
        """
        # GH 9431
        def check_unique(values):
            # Uniques must match ``expected`` both in content and in dtype.
            uniques = algos.unique(values)
            tm.assert_numpy_array_equal(uniques, expected)
            assert uniques.dtype == expected.dtype

        expected = np_array_datetime64_compat(
            ['2015-01-03T00:00:00.000000000+0000',
             '2015-01-01T00:00:00.000000000+0000'],
            dtype='M8[ns]')

        later_stamp = '2015-01-03T00:00:00.000000000'
        earlier_stamp = '2015-01-01T00:00:00.000000000'
        dt_index = pd.to_datetime([later_stamp, earlier_stamp, earlier_stamp])
        check_unique(dt_index)

        as_series = Series(dt_index)
        check_unique(as_series)

        check_unique(as_series.values)

Source File: test_algos.py From vnpy_crypto with MIT License

5 votes

def test_get_unique(self):
        """``Series.unique`` on uint64 data keeps 2**63 and drops its repeat."""
        # 2**63 does not fit in int64, so this exercises the unsigned path.
        large = 2**63
        ser = Series([1, 2, large, large], dtype=np.uint64)
        expected = np.array([1, 2, large], dtype=np.uint64)
        tm.assert_numpy_array_equal(ser.unique(), expected)

Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_hashtable_unique(self, htable, tm_dtype, writable):
        """Compare ``htable().unique`` with ``drop_duplicates`` on sampled data.

        Also checks, via ``return_inverse=True``, that indexing the uniques
        with the inverse rebuilds the original values.
        """
        # output of maker has guaranteed unique elements
        make_index = getattr(tm, 'make' + tm_dtype + 'Index')
        base = Series(make_index(1000))
        if htable == ht.Float64HashTable:
            # add NaN for float column
            base.loc[500] = np.nan
        elif htable == ht.PyObjectHashTable:
            # use different NaN types for object column
            base.loc[500:502] = [np.nan, None, pd.NaT]

        # Sampling with replacement at three times the size creates
        # duplicates.
        duplicated = base.sample(frac=3, replace=True).reset_index(drop=True)
        duplicated.values.setflags(write=writable)

        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
        # and is tested separately; keeps first occurrence like ht.unique()
        expected_uniques = duplicated.drop_duplicates(keep='first').values
        uniques = htable().unique(duplicated.values)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

        # With return_inverse=True the reconstruction below can only
        # succeed if the inverse is correct.
        uniques, inverse = htable().unique(duplicated.values,
                                           return_inverse=True)
        tm.assert_numpy_array_equal(uniques, expected_uniques)
        rebuilt = uniques[inverse]
        tm.assert_numpy_array_equal(rebuilt, duplicated.values)

Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_do_not_mangle_na_values(self, unique_nulls_fixture,
                                     unique_nulls_fixture2):
        """``pd.unique`` keeps two distinct null objects apart.

        After the call, the input array must still hold the very same two
        objects it was built from.
        """
        # GH 22295
        if unique_nulls_fixture is unique_nulls_fixture2:
            return  # skip it, values not unique
        # ``np.object`` was only a deprecated alias of the builtin ``object``
        # and is missing from newer NumPy releases; use the builtin directly.
        a = np.array([unique_nulls_fixture,
                      unique_nulls_fixture2], dtype=object)
        result = pd.unique(a)
        assert result.size == 2
        # Identity checks on the input: it must not have been modified.
        assert a[0] is unique_nulls_fixture
        assert a[1] is unique_nulls_fixture2

Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_different_nans(self):
        """``pd.unique`` collapses NaNs with different bit patterns into one."""
        # GH 21866
        def nan_from_bits(bits):
            # Reinterpret a 64-bit integer pattern as a double.
            return struct.unpack("d", struct.pack("=Q", bits))[0]

        first_nan = nan_from_bits(0x7ff8000000000000)
        second_nan = nan_from_bits(0x7ff8000000000001)
        # Both values really are NaN: neither compares equal to itself.
        assert first_nan != first_nan
        assert second_nan != second_nan

        # The two NaNs are expected to be treated as equivalent.
        values = np.array([first_nan, second_nan])
        uniques = pd.unique(values)
        tm.assert_numpy_array_equal(uniques, np.array([np.nan]))

Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_signed_zero(self):
        """``pd.unique`` treats ``-0.0`` and ``0.0`` as one value."""
        # GH 21866
        zeros = np.array([-0.0, 0.0])
        uniques = pd.unique(zeros)
        # 0.0 and -0.0 are equivalent, so only the first one is expected.
        tm.assert_numpy_array_equal(uniques, np.array([-0.0]))

Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_unique_tuples(self, arr, unique):
        """``pd.unique(arr)`` must equal ``unique`` as a 1-D object array."""
        # https://github.com/pandas-dev/pandas/issues/16519
        # Allocate first and fill by slice so the expected array has one
        # element per entry of ``unique``.
        wanted = np.empty(len(unique), dtype=object)
        wanted[:] = unique

        tm.assert_numpy_array_equal(pd.unique(arr), wanted)

Source File: test_algos.py From vnpy_crypto with MIT License

5 votes

def test_obj_none_preservation(self):
        """``pd.unique`` on an object array keeps ``None`` as ``None``."""
        # GH 20866
        values = np.array(['foo', None], dtype=object)
        uniques = pd.unique(values)
        wanted = np.array(['foo', None], dtype=object)

        tm.assert_numpy_array_equal(uniques, wanted, strict_nan=True)

Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_hashtable_factorize(self, htable, tm_dtype, writable):
        """Compare ``htable().factorize`` with ``dropna().drop_duplicates()``.

        Also checks that indexing the uniques with the inverse (missing
        entries excluded) rebuilds the non-missing values.
        """
        # output of maker has guaranteed unique elements
        make_index = getattr(tm, 'make' + tm_dtype + 'Index')
        base = Series(make_index(1000))
        if htable == ht.Float64HashTable:
            # add NaN for float column
            base.loc[500] = np.nan
        elif htable == ht.PyObjectHashTable:
            # use different NaN types for object column
            base.loc[500:502] = [np.nan, None, pd.NaT]

        # Sampling with replacement at three times the size creates
        # duplicates.
        duplicated = base.sample(frac=3, replace=True).reset_index(drop=True)
        duplicated.values.setflags(write=writable)
        is_missing = duplicated.isna().values

        uniques, inverse = htable().factorize(duplicated.values)

        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
        # and is tested separately; keeps first occurrence like ht.factorize()
        # since factorize removes all NaNs, we do the same here
        expected_uniques = duplicated.dropna().drop_duplicates().values
        tm.assert_numpy_array_equal(uniques, expected_uniques)

        # reconstruction can only succeed if the inverse is correct. Since
        # factorize removes the NaNs, those have to be excluded here as well
        rebuilt = uniques[inverse[~is_missing]]
        expected_rebuilt = duplicated.dropna().values
        tm.assert_numpy_array_equal(rebuilt, expected_rebuilt)

Source File: pytables.py From Computable with MIT License

5 votes

def unique(self, key, column, **kwargs):
        """Deprecated: return the unique values of ``column`` stored under ``key``.

        Emits a ``FutureWarning`` and then delegates to
        ``self.get_storer(key).read_column(column=column, **kwargs).unique()``.
        """
        # stacklevel=2 makes the warning point at the caller's line rather
        # than at this wrapper.
        warnings.warn("unique(key,column) is deprecated\n"
                      "use select_column(key,column).unique() instead",
                      FutureWarning, stacklevel=2)
        return self.get_storer(key).read_column(column=column,
                                                **kwargs).unique()

Source File: test_algos.py From vnpy_crypto with MIT License

5 votes

def test_unique_tuples(self, arr, unique):
        """``pd.unique(arr)`` must equal ``unique`` as a 1-D object array."""
        # https://github.com/pandas-dev/pandas/issues/16519
        # Allocate first and fill by slice so the expected array has one
        # element per entry of ``unique``.
        wanted = np.empty(len(unique), dtype=object)
        wanted[:] = unique

        tm.assert_numpy_array_equal(pd.unique(arr), wanted)

Source File: multi.py From vnpy_crypto with MIT License

5 votes

def unique(self, level=None):
        """Return unique values, optionally restricted to a single level.

        With ``level`` given, it is resolved through ``_get_level_number``
        and the unique values of that level are returned; otherwise the
        parent class' ``unique`` is used.
        """
        if level is not None:
            level_number = self._get_level_number(level)
            return self._get_level_values(level=level_number, unique=True)
        return super(MultiIndex, self).unique()

Source File: frequencies.py From Computable with MIT License

5 votes

def _get_annual_rule(self):
        if len(self.ydiffs) > 1:
            return None

        if len(algos.unique(self.fields['M'])) > 1:
            return None

        pos_check = self.month_position_check()
        return {'cs': 'AS', 'bs': 'BAS',
                'ce': 'A', 'be': 'BA'}.get(pos_check)

Source File: test_algos.py From vnpy_crypto with MIT License

5 votes

def test_unique_label_indices():
    """Compare ``unique_label_indices`` with ``np.unique(..., return_index=True)``."""
    labels = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')

    result = unique_label_indices(labels)
    _, first_seen = np.unique(labels, return_index=True)
    tm.assert_numpy_array_equal(result, first_seen, check_dtype=False)

    # Overwrite a few positions with -1. All other labels are >= 1, so
    # np.unique lists -1 first; its index is dropped from the expectation.
    labels[np.random.choice(len(labels), 10)] = -1
    result = unique_label_indices(labels)
    _, first_seen = np.unique(labels, return_index=True)
    tm.assert_numpy_array_equal(result, first_seen[1:], check_dtype=False)

Source File: multi.py From vnpy_crypto with MIT License

5 votes

def _verify_integrity(self, labels=None, levels=None):
        """

        Parameters
        ----------
        labels : optional list
            Labels to check for validity. Defaults to current labels.
        levels : optional list
            Levels to check for validity. Defaults to current levels.

        Raises
        ------
        ValueError
            If length of levels and labels don't match, if any label would
            exceed level bounds, or there are any duplicate levels.
        """
        # NOTE: Currently does not check, among other things, that cached
        # nlevels matches nor that sortorder matches actually sortorder.
        labels = labels or self.labels
        levels = levels or self.levels

        if len(levels) != len(labels):
            raise ValueError("Length of levels and labels must match. NOTE:"
                             " this index is in an inconsistent state.")
        label_length = len(self.labels[0])
        for i, (level, label) in enumerate(zip(levels, labels)):
            if len(label) != label_length:
                raise ValueError("Unequal label lengths: %s" %
                                 ([len(lab) for lab in labels]))
            if len(label) and label.max() >= len(level):
                raise ValueError("On level %d, label max (%d) >= length of"
                                 " level  (%d). NOTE: this index is in an"
                                 " inconsistent state" % (i, label.max(),
                                                          len(level)))
            if not level.is_unique:
                raise ValueError("Level values must be unique: {values} on "
                                 "level {level}".format(
                                     values=[value for value in level],
                                     level=i))

Source File: multi.py From vnpy_crypto with MIT License

5 votes

def _get_level_values(self, level, unique=False):
        """
        Return the label values for the requested level.

        **this is an internal method**

        Parameters
        ----------
        level : int level
            Position used to index ``self.levels`` and ``self.labels``.
        unique : bool, default False
            if True, drop duplicated values

        Returns
        -------
        values
            The result of ``_shallow_copy`` on the level, holding the taken
            values. Its length equals the index length unless ``unique`` is
            True.
        """
        level_index = self.levels[level]
        codes = self.labels[level]
        if unique:
            codes = algos.unique(codes)
        # Map each code to its level value; fill_value is the level's own
        # NA value.
        taken = algos.take_1d(level_index._values, codes,
                              fill_value=level_index._na_value)
        return level_index._shallow_copy(taken)

Source File: datetimes.py From vnpy_crypto with MIT License

5 votes

def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
    """
    Convert array of dates with a cache and box the result

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
    cache_array : Series
        Cache of converted, unique dates
    box : boolean
        True boxes result as an Index-like, False returns an ndarray
    errors : string
        'ignore' plus box=True will convert result to Index
    name : string, default None
        Name for a DatetimeIndex

    Returns
    -------
    result : datetime of converted dates
        Returns:

        - Index-like if box=True
        - ndarray if box=False
    """
    from pandas import Series, DatetimeIndex, Index
    result = Series(arg).map(cache_array)
    if box:
        if errors == 'ignore':
            return Index(result)
        else:
            return DatetimeIndex(result, name=name)
    return result.values

Source File: datetimes.py From vnpy_crypto with MIT License

5 votes

def _maybe_cache(arg, format, cache, tz, convert_listlike):
    """
    Create a cache of unique dates from an array of dates

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
    format : string
        Strftime format to parse time
    cache : boolean
        True attempts to create a cache of converted values
    tz : string
        Timezone of the dates
    convert_listlike : function
        Conversion function to apply on dates

    Returns
    -------
    cache_array : Series
        Cache of converted, unique dates. Can be empty
    """
    from pandas import Series
    cache_array = Series()
    if cache:
        # Perform a quicker unique check
        from pandas import Index
        if not Index(arg).is_unique:
            unique_dates = algorithms.unique(arg)
            cache_dates = convert_listlike(unique_dates, True, format, tz=tz)
            cache_array = Series(cache_dates, index=unique_dates)
    return cache_array

Python pandas.core.algorithms.unique() Examples