Python Examples of pandas.MultiIndex

Source File: test_base.py From recruit with Apache License 2.0

6 votes

def setup_method(self, method):
        """Populate ``self.indices`` with one sample index per index kind.

        The mapping goes from a descriptive key to an index object; most
        entries come from the ``tm.make*Index`` helpers called with 100, the
        rest are written out by hand.  ``self.setup_indices()`` is called once
        the mapping is in place.  ``method`` is not used here.
        """
        self.indices = {
            'unicodeIndex': tm.makeUnicodeIndex(100),
            'strIndex': tm.makeStringIndex(100),
            'dateIndex': tm.makeDateIndex(100),
            'periodIndex': tm.makePeriodIndex(100),
            'tdIndex': tm.makeTimedeltaIndex(100),
            'intIndex': tm.makeIntIndex(100),
            'uintIndex': tm.makeUIntIndex(100),
            'rangeIndex': tm.makeRangeIndex(100),
            'floatIndex': tm.makeFloatIndex(100),
            'boolIndex': Index([True, False]),
            'catIndex': tm.makeCategoricalIndex(100),
            'empty': Index([]),
            # Three (str, int) tuples -> a two-level MultiIndex.
            'tuples': MultiIndex.from_tuples(
                lzip(['foo', 'bar', 'baz'], [1, 2, 3])),
            'repeats': Index([0, 0, 1, 1, 2, 2]),
        }
        self.setup_indices()

Source File: test_common.py From recruit with Apache License 2.0

6 votes

def test_constructor_non_hashable_name(self, indices):
        """Giving a flat index an unhashable name must raise ``TypeError``.

        MultiIndex inputs are skipped; the skip message says they are handled
        in test_multi.py.
        """
        # GH 20527
        if isinstance(indices, MultiIndex):
            pytest.skip("multiindex handled in test_multi.py")

        expected_error = "Index.name must be a hashable type"
        unhashable_name = [['1']]

        # Both renaming entry points are expected to reject the name with
        # the same error message.
        with pytest.raises(TypeError, match=expected_error):
            # With .rename()
            indices.rename(name=unhashable_name)

        with pytest.raises(TypeError, match=expected_error):
            # With .set_names()
            indices.set_names(names=unhashable_name)

Source File: base.py From recordlinkage with BSD 3-Clause "New" or "Revised" License

6 votes

def _link_index(self, df_a, df_b):
        """Build an index for linking two datasets (abstract hook).

        This implementation does no work and always raises.

        Parameters
        ----------
        df_a : (tuple of) pandas.Series
            The data of the left DataFrame to build the index with.
        df_b : (tuple of) pandas.Series
            The data of the right DataFrame to build the index with.

        Returns
        -------
        pandas.MultiIndex
            The documented contract is a pandas.MultiIndex of record pairs,
            each pair holding the index values of two records.

        Raises
        ------
        NotImplementedError
            Always, in this implementation.

        """
        message = "Not possible to call index for the BaseEstimator"
        raise NotImplementedError(message)

Source File: test_common.py From recruit with Apache License 2.0

6 votes

def test_droplevel(self, indices):
        """Exercise ``droplevel`` on flat indexes.

        Dropping no levels must give an equal index, dropping the index's own
        name must raise ``ValueError``, and dropping an unknown level must
        raise ``KeyError``.
        """
        # GH 21115
        if isinstance(indices, MultiIndex):
            # Tested separately in test_multi.py
            return

        untouched = indices.droplevel([])
        assert untouched.equals(indices)

        for level in (indices.name, [indices.name]):
            is_bare_tuple_name = (level is indices.name
                                  and isinstance(indices.name, tuple))
            if is_bare_tuple_name:
                # GH 21121 : droplevel with tuple name
                continue
            with pytest.raises(ValueError):
                indices.droplevel(level)

        for level in ('wrong', ['wrong']):
            with pytest.raises(KeyError):
                indices.droplevel(level)

Source File: test_common.py From recruit with Apache License 2.0

6 votes

def test_duplicated(self, indices, keep):
        """Compare ``Index.duplicated`` with ``Series.duplicated`` on a resample.

        A duplicate-free version of the index is resampled with replacement;
        the duplication mask of the sampled positions (computed through a
        Series) is the expected result for the resampled index.
        """
        if isinstance(indices, (MultiIndex, RangeIndex)) or not len(indices):
            # MultiIndex tested separately in:
            # tests/indexes/multi/test_unique_and_duplicates
            pytest.skip('Skip check for empty Index, MultiIndex, RangeIndex')

        index_type = type(indices)

        unique_idx = index_type(indices)
        if unique_idx.has_duplicates:
            # The expected mask is only knowable when the starting values are
            # unique, so existing duplicates are dropped first.  That looks
            # circular (drop_duplicates invokes duplicated), but the result is
            # cross-checked against Series.duplicated, which is tested
            # separately.
            unique_idx = unique_idx.drop_duplicates()

        size = len(unique_idx)
        repeat_factor = 10
        positions = np.random.choice(size, repeat_factor * size)
        expected = pd.Series(positions).duplicated(keep=keep).values
        resampled = index_type(unique_idx.values[positions])

        result = resampled.duplicated(keep=keep)
        tm.assert_numpy_array_equal(result, expected)

Source File: base.py From recordlinkage with BSD 3-Clause "New" or "Revised" License

6 votes

def fit_predict(self, comparison_vectors, match_index=None):
        """Train the classifier, then predict on the same comparison vectors.

        Equivalent to calling ``self.fit(comparison_vectors, match_index)``
        followed by ``self.predict(comparison_vectors)``.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The comparison vectors.
        match_index : pandas.MultiIndex
            The true matches. Defaults to None and is passed to ``fit``
            unchanged.

        Returns
        -------
        object
            Whatever ``self.predict(comparison_vectors)`` returns.

        """
        self.fit(comparison_vectors, match_index)
        return self.predict(comparison_vectors)

Source File: febrl.py From recordlinkage with BSD 3-Clause "New" or "Revised" License

6 votes

def _febrl_links(df):
    """Get the links of a FEBRL dataset.

    The digits following ``rec-`` in each index label are used as a key;
    rows sharing a key are paired.  Only pairs whose first row position is
    greater than the second are kept, so each pair appears once and no row
    is paired with itself.

    Returns a two-level ``pandas.MultiIndex`` (unnamed levels) whose entries
    are index labels of ``df``.
    """
    row_positions = numpy.arange(len(df))
    record_keys = df.index.to_series().str.extract(
        r'rec-(\d+)', expand=True)[0]

    helper = pandas.DataFrame({'key': record_keys, 'index': row_positions})

    # Self-merge on the key: every combination of rows with the same key.
    combos = helper.merge(helper, on='key')
    combos = combos[['index_x', 'index_y']]
    kept = combos[combos['index_x'] > combos['index_y']]

    # Row positions double as codes into the index labels of ``df``.
    first_codes = kept['index_x'].values
    second_codes = kept['index_y'].values

    return pandas.MultiIndex(
        levels=[df.index.values, df.index.values],
        codes=[first_codes, second_codes],
        names=[None, None],
        verify_integrity=False
    )

Source File: test_pandas_store.py From arctic with GNU Lesser General Public License v2.1

6 votes

def test_data_info_cols(library):
    """Check the metadata ``library.get_info`` reports for a MultiIndex frame."""
    index = MultiIndex.from_tuples([(1, "ab"), (2, "bb"), (3, "cb")])
    frame = DataFrame(data=[100, 200, 300], index=index)
    library.write('test_data', frame)
    info = library.get_info('test_data')
    # Example of the metadata being checked:
    # {'dtype': [('level_0', '<i8'), ('level_1', 'S2'), ('0', '<i8')],
    #                  'col_names': {u'index': [u'level_0', u'level_1'], u'columns': [u'0'], 'index_tz': [None, None]},
    #                  'type': u'pandasdf',
    #                  'handler': 'PandasDataFrameStore',
    #                  'rows': 3,
    #                  'segment_count': 1,
    #                  'size': 50}
    assert 'size' in info

    scalar_expectations = (
        ('segment_count', 1),
        ('rows', 3),
        ('handler', 'PandasDataFrameStore'),
        ('type', 'pandasdf'),
    )
    for key, expected in scalar_expectations:
        assert info[key] == expected

    assert info['col_names'] == {'index': ['level_0', u'level_1'], 'columns': [u'0'], 'index_tz': [None, None]}

    dtype_fields = info['dtype']
    assert len(dtype_fields) == 3
    # First element of each dtype entry is the field name.
    for position, field_name in enumerate(('level_0', 'level_1', '0')):
        assert dtype_fields[position][0] == field_name

Source File: measures.py From recordlinkage with BSD 3-Clause "New" or "Revised" License

6 votes

def true_positives(links_true, links_pred):
    """Count the number of True Positives.

    Returns the number of correctly predicted links, also called the number of
    True Positives (TP).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of correctly predicted links.
    """

    links_true = _get_multiindex(links_true)
    links_pred = _get_multiindex(links_pred)

    # Use the explicit set method rather than the ``&`` operator: the
    # operator's set-intersection meaning on Index objects is not stable
    # across pandas versions, while ``intersection`` is.
    return len(links_true.intersection(links_pred))

Source File: measures.py From recordlinkage with BSD 3-Clause "New" or "Revised" License

6 votes

def false_positives(links_true, links_pred):
    """Count the number of False Positives (FP).

    A false positive is a pair that was predicted as a link but is not among
    the true links.

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of false positives.

    """
    actual = _get_multiindex(links_true)
    predicted = _get_multiindex(links_pred)

    # Predicted pairs that do not occur among the actual links.
    wrongly_predicted = predicted.difference(actual)
    return len(wrongly_predicted)

Source File: measures.py From recordlinkage with BSD 3-Clause "New" or "Revised" License

6 votes

def false_negatives(links_true, links_pred):
    """Count the number of False Negatives (FN).

    A false negative is a true link that was not predicted as a link.

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of false negatives.

    """
    actual = _get_multiindex(links_true)
    predicted = _get_multiindex(links_pred)

    # Actual links that are missing from the predictions.
    missed = actual.difference(predicted)
    return len(missed)

Source File: test_indexing.py From recordlinkage with BSD 3-Clause "New" or "Revised" License

6 votes

def test_iterative(self):
        """Test the iterative behaviour.

        Indexing ``self.a`` in one call must give the same pairs as indexing
        the slices ``[0:50]`` and ``[50:100]`` separately and appending the
        results.
        """
        # SINGLE STEP
        single_indexer = Full()
        all_pairs = single_indexer.index((self.a, self.b))
        pairs = pd.DataFrame(index=all_pairs).sort_index()

        # MULTI STEP
        chunked_indexer = Full()
        first_chunk = chunked_indexer.index((self.a[0:50], self.b))
        second_chunk = chunked_indexer.index((self.a[50:100], self.b))
        appended = first_chunk.append(second_chunk)
        pairs_split = pd.DataFrame(index=appended).sort_index()

        # The MultiIndex is wrapped in a frame so it can be sorted before
        # comparing.
        pdt.assert_frame_equal(pairs, pairs_split)

Source File: test_indexing.py From recordlinkage with BSD 3-Clause "New" or "Revised" License

6 votes

def test_index_names_pandas023(self, index_class):
        """Level names of generated pair indexes must not collide.

        Both input frames get an index named ``'index'``; when the resulting
        MultiIndex levels are named, the two level names must differ.  This
        is checked for the link index and for the dedup index.
        """
        # Pandas changes the behaviour of MultiIndex names.
        # https://github.com/pandas-dev/pandas/pull/18882
        # https://github.com/J535D165/recordlinkage/issues/55
        # This test tests compatibility.

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # make the index
        pairs_link = index_class._link_index(df_a, df_b)

        if pairs_link.names[0] is not None:
            assert pairs_link.names[0] != pairs_link.names[1]

        # make the index
        pairs_dedup = index_class._dedup_index(df_a)

        # Guard on the dedup result's own names so that unnamed levels
        # (None, None) are skipped instead of failing the inequality.
        if pairs_dedup.names[0] is not None:
            assert pairs_dedup.names[0] != pairs_dedup.names[1]

Source File: test_indexing.py From recordlinkage with BSD 3-Clause "New" or "Revised" License

6 votes

def test_lower_triangular(self, index_class):
        """Every generated pair must lie strictly below the diagonal.

        The reference index is built from ``np.tril_indices`` with ``k=-1``
        over the frame's index labels.
        """
        # make an index for each dataframe with a new index name
        named_index = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=named_index)
        pairs = index_class.index(df_a)

        # expected
        labels = df_a.index.values
        lower_codes = np.tril_indices(len(df_a.index), k=-1)
        lower_triangle = pd.MultiIndex(levels=[labels, labels],
                                       codes=lower_codes,
                                       verify_integrity=False)

        # all pairs are in the lower triangle of the matrix.
        outside = pairs.difference(lower_triangle)
        assert len(outside) == 0

Source File: test_datasets.py From recordlinkage with BSD 3-Clause "New" or "Revised" License

6 votes

def test_krebs_dataset_download():
    """Load the krebsregister dataset from a clean data home and check it.

    Verifies that the ten ``block_<i>.zip`` files exist under the data home
    after loading, and checks the types and sizes of the returned objects.
    """

    # remove downloaded datasets
    clear_data_home()

    krebs_data, krebs_matches = load_krebsregister()

    for i in range(1, 11):
        assert Path(get_data_home(), "krebsregister",
                    "block_{}.zip".format(i)).is_file()

    # Use isinstance: in ``assert type(x), T`` the second operand is only the
    # failure message, so such an assert can never fail.
    assert isinstance(krebs_data, pandas.DataFrame)
    assert isinstance(krebs_matches, pandas.MultiIndex)

    # count the number of records
    assert len(krebs_data) == 5749132
    assert len(krebs_matches) == 20931

Source File: multi_index.py From arctic with GNU Lesser General Public License v2.1

6 votes

def multi_index_insert_row(df, index_row, values_row):
    """ Return a new dataframe with a row inserted for a multi-index dataframe.
        This will sort the rows according to the ordered multi-index levels.

        ``index_row`` supplies one value per index level and ``values_row``
        the row's data; the columns are taken from ``df``.
    """
    # The MultiIndex constructor keyword is chosen by pandas version.
    # NOTE(review): PD_VER is compared against a string literal here, which
    # presumably means a lexicographic comparison - confirm it is intended.
    codes_keyword = 'labels' if PD_VER < '0.24.0' else 'codes'
    single_row_index = pd.MultiIndex(
        levels=[[value] for value in index_row],
        **{codes_keyword: [[0] for _ in index_row]})

    new_row = pd.DataFrame(values_row, index=single_row_index,
                           columns=df.columns)
    combined = pd.concat((df, new_row))

    combined_index = combined.index
    appended_in_order = (combined_index.lexsort_depth == len(index_row)
                         and combined_index[-2] < combined_index[-1])
    if not appended_in_order:
        # The df wasn't sorted or the row has to be put in the middle
        # somewhere
        return combined.sort_index()
    # We've just appended a row to an already-sorted dataframe
    return combined

Source File: model_processing.py From respy with MIT License

6 votes

def _infer_choices_with_experience(params, options):
    """Infer choices with experiences.

    Collects every name that follows an ``exp_`` prefix, looking at the
    second index level of ``params`` and at the formulas of those covariates
    whose name occurs in that level.  The result is sorted and free of
    duplicates.

    Example
    -------
    >>> options = {"covariates": {"a": "exp_white_collar + exp_a", "b": "exp_b >= 2"}}
    >>> index = pd.MultiIndex.from_product([["category"], ["a", "b"]])
    >>> params = pd.Series(index=index, dtype="object")
    >>> _infer_choices_with_experience(params, options)
    ['a', 'b', 'white_collar']

    """
    formulas = options["covariates"]
    level_values = params.index.get_level_values(1)

    # Texts to scan: the parameter names themselves, then the formulas of
    # covariates that are referenced by name in the parameters.
    texts = [str(value) for value in level_values]
    texts.extend(formulas[name] for name in formulas if name in level_values)

    choices = set()
    for text in texts:
        choices.update(re.findall(r"\bexp_([A-Za-z_]+)\b", text))

    return sorted(choices)

Source File: multi_index.py From arctic with GNU Lesser General Public License v2.1

5 votes

def groupby_asof(df, as_of=None, dt_col='sample_dt', asof_col='observed_dt'):
    ''' Select the latest rows from a bitemporal dataframe as-of a certain date.

    Delegates to ``fancy_group_by`` with ``method='last'``, grouping on
    ``dt_col`` and aggregating on ``asof_col``.

    Parameters
    ----------
    df: ``pd.DataFrame``
        Dataframe with a MultiIndex index
    as_of: ``datetime``
        Return a timeseries with values observed <= this as-of date. By default, the latest observed
        values will be returned.
    dt_col: ``str`` or ``int``
        Name or index of the column in the MultiIndex that is the sample date
    asof_col: ``str`` or ``int``
        Name or index of the column in the MultiIndex that is the observed date
    '''
    # A timezone-naive as_of gets the timezone returned by mktz() when the
    # observed-date level itself carries a timezone.
    needs_timezone = (as_of
                      and as_of.tzinfo is None
                      and df.index.get_level_values(asof_col).tz is not None)
    if needs_timezone:
        as_of = as_of.replace(tzinfo=mktz())

    return fancy_group_by(df,
                          grouping_level=dt_col,
                          aggregate_level=asof_col,
                          method='last',
                          max_=as_of)


# ----------------------- Insert/Append ---------------------------- #

Source File: test_pandas_store.py From arctic with GNU Lesser General Public License v2.1

5 votes

def test_save_read_pandas_empty_series_with_datetime_multiindex_with_timezone(library):
    """An empty Series keeps its tz-aware MultiIndex levels after a round trip."""

    def _empty_levels():
        return (pd.DatetimeIndex([], tz="America/Chicago"), pd.Index([]))

    try:
        # hack to support modern and older versions of pandas: try the
        # ``codes`` keyword first, fall back to ``labels``.
        expected_index = pd.MultiIndex(levels=_empty_levels(), codes=([], []))
    except Exception:
        expected_index = pd.MultiIndex(levels=_empty_levels(), labels=([], []))

    series = Series(data=[], index=expected_index)
    library.write('pandas', series)
    restored = library.read('pandas').data
    assert expected_index.equal_levels(restored.index), "Index timezone information should be maintained, even when empty"

Source File: test_market.py From pyTD with MIT License

5 votes

def test_batch_history_pandas(self):
        """Batch price history in pandas format has MultiIndex columns per symbol."""
        data = pyTD.market.get_price_history(["AAPL", "TSLA", "MSFT"],
                                             output_format='pandas')

        assert isinstance(data, pd.DataFrame)
        assert isinstance(data.columns, pd.MultiIndex)

        # Each requested symbol must be present in the columns.
        for symbol in ("AAPL", "TSLA", "MSFT"):
            assert symbol in data.columns

        first_row_date = data.iloc[0].name.date()
        assert first_row_date == datetime.date(2018, 1, 2)

Source File: method_of_simulated_moments.py From respy with MIT License

5 votes

def _create_tidy_data(data, moment_set_labels):
    """Create tidy data from list of pandas.DataFrames.

    Each element of ``data`` (a Series or DataFrame) is reshaped into rows
    with the columns ``moment_column``, ``moment_index``, ``value`` and
    ``moment_set``; the latter holds the matching entry of
    ``moment_set_labels``.  All pieces are concatenated with a fresh index.
    """
    unnamed_ids = itertools.count()
    pieces = []
    for moments, set_label in zip(data, moment_set_labels):
        if isinstance(moments.index, pd.MultiIndex):
            # Flatten a MultiIndex into one level: stringify the labels and
            # join the levels with "_".
            moments = moments.rename(index=str)
            moments.index = moments.index.to_flat_index().str.join("_")

        if isinstance(moments, pd.Series):
            # A Series becomes a one-column frame; unnamed ones get the next
            # number of a counter shared across all inputs as column name.
            if moments.name is not None:
                moments = moments.to_frame()
            else:
                moments = moments.to_frame(name=next(unnamed_ids))

        # Long format: one row per (column, index label) combination.
        long_form = moments.unstack()
        long_form.index.names = ("moment_column", "moment_index")
        long_form = long_form.rename("value").reset_index()
        long_form["moment_set"] = set_label
        pieces.append(long_form)

    return pd.concat(pieces, ignore_index=True)

Source File: test_classify.py From recordlinkage with BSD 3-Clause "New" or "Revised" License

5 votes

def test_fit_predict_unsupervised(self, classifier):
        """``fit`` + ``predict`` and ``fit_predict`` must give the same index."""
        # Two-step path on one classifier instance.
        two_step = classifier()
        two_step.fit(self.X_train)
        result = two_step.predict(self.X_train)

        assert isinstance(result, pd.MultiIndex)

        # One-step path on a fresh instance.
        one_step = classifier()
        expected = one_step.fit_predict(self.X_train)

        assert isinstance(expected, pd.MultiIndex)
        assert result.values.shape == expected.values.shape

        pdt.assert_index_equal(result, expected)

Source File: mot.py From PoseWarper with Apache License 2.0

5 votes

def new_event_dataframe():
        """Create a new, empty DataFrame for event tracking.

        Returns
        -------
        pandas.DataFrame
            A frame without rows, indexed by an empty two-level MultiIndex
            named ``('FrameId', 'Event')``, with the columns ``Type``
            (categorical), ``OId``, ``HId`` and ``D`` (float).
        """
        index_names = ['FrameId', 'Event']
        try:
            idx = pd.MultiIndex(levels=[[], []], codes=[[], []],
                                names=index_names)
        except TypeError:
            # Older pandas releases only accept the ``labels`` spelling of
            # this keyword; newer ones only accept ``codes``.
            idx = pd.MultiIndex(levels=[[], []], labels=[[], []],
                                names=index_names)

        cats = pd.Categorical([], categories=['FP', 'MISS', 'SWITCH', 'MATCH'])
        df = pd.DataFrame(
            OrderedDict([
                # Type of event. One of FP (false positive), MISS, SWITCH, MATCH
                ('Type', pd.Series(cats)),
                # Object ID or -1 if FP.
                ('OId', pd.Series(dtype=str)),
                # Hypothesis ID or NaN if MISS.
                ('HId', pd.Series(dtype=str)),
                # Distance or NaN when FP or MISS
                ('D', pd.Series(dtype=float)),
            ]),
            index=idx
        )
        return df

Source File: test_common.py From recruit with Apache License 2.0

5 votes

def test_copy_and_deepcopy(self, indices):
        """``copy``/``deepcopy`` give equal but distinct indexes; ``copy`` can rename."""
        from copy import copy, deepcopy

        if isinstance(indices, MultiIndex):
            pytest.skip('Skip check for MultiIndex')

        for duplicate in (copy, deepcopy):
            clone = duplicate(indices)
            # A new object with the same contents.
            assert clone is not indices
            assert clone.equals(indices)

        renamed_clone = indices.copy(deep=True, name="banana")
        assert renamed_clone.name == "banana"

Source File: test_pandas_store.py From arctic with GNU Lesser General Public License v2.1

5 votes

def test_save_read_pandas_series_with_multiindex_and_name(library):
    """A named Series with a MultiIndex keeps its values and name after a round trip."""
    df = Series(data=['A', 'BC', 'DEF'],
                index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)]),
                name='Foo')
    library.write('pandas', df)
    saved_df = library.read('pandas').data
    assert np.all(df.values == saved_df.values)
    # Check the name on the object read back from the library; asserting on
    # the input Series would pass regardless of what was stored.
    assert saved_df.name == 'Foo'

Source File: test_pandas_store.py From arctic with GNU Lesser General Public License v2.1

5 votes

def test_save_read_pandas_dataframe_with_multiindex(library):
    """A DataFrame with a MultiIndex keeps its values after a write/read round trip."""
    index = MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)])
    original = DataFrame(data=['A', 'BC', 'DEF'], index=index)
    library.write('pandas', original)
    restored = library.read('pandas').data
    values_match = original.values == restored.values
    assert np.all(values_match)

Source File: test_pandas_store.py From arctic with GNU Lesser General Public License v2.1

5 votes

def test_save_read_pandas_dataframe_with_unicode_index_name(library):
    """A DataFrame whose one-level MultiIndex has a unicode name survives a round trip."""
    # One-element tuples: a MultiIndex with a single datetime level.
    index_tuples = [(np.datetime64(dt(2013, 1, day)),) for day in (1, 2, 3)]
    index = MultiIndex.from_tuples(index_tuples, names=[u'DATETIME'])
    original = DataFrame(data=['A', 'BC', 'DEF'], index=index)
    library.write('pandas', original)
    restored = library.read('pandas').data
    values_match = original.values == restored.values
    assert np.all(values_match)

Source File: test_common.py From recruit with Apache License 2.0

5 votes

def test_set_name_methods(self, indices):
        """Exercise ``set_names`` and in-place ``rename`` on flat indexes."""
        # A MultiIndex is not tested here (it is tested separately).
        if isinstance(indices, MultiIndex):
            pytest.skip('Skip check for MultiIndex')

        new_name = "This is the new name for this index"
        name_before = indices.name

        # set_names returns a renamed index and leaves the original alone.
        renamed = indices.set_names([new_name])
        assert renamed.name == new_name
        assert indices.name == name_before

        # In-place rename: should return None and change the index itself.
        outcome = indices.rename(new_name, inplace=True)
        assert outcome is None
        assert indices.name == new_name
        assert indices.names == [new_name]
        # with pytest.raises(TypeError, match="list-like"):
        #    # should still fail even if it would be the right length
        #    ind.set_names("a")
        with pytest.raises(ValueError, match="Level must be None"):
            indices.set_names("a", level=0)

        # rename in place just leaves tuples and other containers alone
        tuple_name = ('A', 'B')
        indices.rename(tuple_name, inplace=True)
        assert indices.name == tuple_name
        assert indices.names == [tuple_name]

Source File: test_common.py From recruit with Apache License 2.0

5 votes

def test_to_flat_index(self, indices):
        """``to_flat_index`` on a flat index returns an equal index."""
        # 22866
        if isinstance(indices, MultiIndex):
            pytest.skip("Separate expectation for MultiIndex")

        flattened = indices.to_flat_index()
        tm.assert_index_equal(flattened, indices)

Source File: test_pandas_store.py From arctic with GNU Lesser General Public License v2.1

5 votes

def test_save_read_pandas_series_with_multiindex(library):
    """A Series with a MultiIndex keeps its values after a write/read round trip."""
    index = MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)])
    original = Series(data=['A', 'BC', 'DEF'], index=index)
    library.write('pandas', original)
    restored = library.read('pandas').data
    values_match = original.values == restored.values
    assert np.all(values_match)

Python pandas.MultiIndex() Examples