Python sklearn.metrics.mutual_info_score() Examples

The following are 7 code examples of sklearn.metrics.mutual_info_score(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.metrics, or try the search function.
Example #1
Source File: stats.py    From scprep with GNU General Public License v3.0 6 votes vote down vote up
def mutual_information(x, y, bins=8):
    """Mutual information of two vectors, estimated from a binned 2-D histogram.

    Thin wrapper around `sklearn.metrics.mutual_info_score`: the inputs are
    binned with `np.histogram2d` and the resulting counts are handed over as
    the contingency table.
    Credit: `Warren Weckesser <https://stackoverflow.com/a/20505476/3996580>`_.

    Parameters
    ----------
    x : array-like, shape=[n_samples]
        Input data (feature 1)
    y : array-like, shape=[n_samples]
        Input data (feature 2)
    bins : int or array-like, (default: 8)
        Bin specification forwarded unchanged to `np.histogram2d`.

    Returns
    -------
    mi : float
        Mutual information between x and y.

    Examples
    --------
    >>> import scprep
    >>> data = scprep.io.load_csv("my_data.csv")
    >>> mi = scprep.stats.mutual_information(data['GENE1'], data['GENE2'])
    """
    x, y = _vector_coerce_two_dense(x, y)
    # histogram2d returns (counts, x_edges, y_edges); only the counts are needed.
    contingency, _x_edges, _y_edges = np.histogram2d(x, y, bins)
    return metrics.mutual_info_score(None, None, contingency=contingency)
Example #2
Source File: pypsr.py    From pypsr with MIT License 5 votes vote down vote up
def ami(x, y=None, n_bins=10):
    """Average mutual information between the series $x(t)$ and $y(t)$.

    The two series are binned into a joint 2-D histogram whose counts are
    used as the contingency table for `metrics.mutual_info_score`.

    Parameters
    ----------
    x : array-like
    y : array-like, optional
        $x(t)$ and $y(t)$.
        If only `x` is passed, it must have two columns;
        the first column defines $x(t)$ and the second $y(t)$.
    n_bins : int
        The number of bins to use when computing the joint histogram.

    Returns
    -------
    scalar
        Average mutual information between $x(t)$ and $y(t)$, in nats (natural log equivalent of bits).

    Raises
    ------
    ValueError
        If the two series do not have the same length.

    See Also
    --------
    lagged_ami

    References
    ----------
    Abarbanel, H. D. I. (1996). *Analysis of Observed Chaotic Data* (p. 28). New York: Springer.

    """
    x, y = _vector_pair(x, y)
    if x.shape[0] != y.shape[0]:
        raise ValueError('timeseries must have the same length')

    # Joint occupancy counts of (x, y); bin edges are not needed.
    joint_counts = np.histogram2d(x, y, bins=n_bins)[0]
    return metrics.mutual_info_score(None, None, contingency=joint_counts)
Example #3
Source File: correlate.py    From rapidtide with Apache License 2.0 5 votes vote down vote up
def calc_MI(x, y, bins):
    """Return the mutual information of x and y from their joint histogram.

    `x` and `y` are binned with `np.histogram2d` (using `bins` as given) and
    the bin counts are passed to `mutual_info_score` as the contingency table.
    """
    joint_hist = np.histogram2d(x, y, bins)[0]
    return mutual_info_score(None, None, contingency=joint_hist)
Example #4
Source File: hypothesis_test.py    From fairtest with Apache License 2.0 4 votes vote down vote up
def permutation_test_ct2(data, num_samples=10000):
    """
    Monte-Carlo permutation test for a 2-way contingency table

    The table is expanded into one (row label, column label) pair per counted
    observation, and the mutual information of the two label lists is the
    test statistic. The row labels are then shuffled `num_samples` times; the
    p-value is the fraction of shuffles whose statistic is strictly greater
    than the observed one.

    Parameters
    ----------
    data :
        the contingency table (a pandas DataFrame is converted to an array)

    num_samples :
        the number of random permutations to perform

    Returns
    -------
    pval :
        the p-value, never smaller than ``1 / num_samples``

    References
    ----------
    https://en.wikipedia.org/wiki/Resampling_(statistics)
    """
    if isinstance(data, pd.DataFrame):
        data = np.array(data)

    # Expand cell counts into per-observation row/column labels.
    row_labels = []
    col_labels = []
    for row in range(data.shape[0]):
        for col in range(data.shape[1]):
            count = data[row, col]
            row_labels.extend([row] * count)
            col_labels.extend([col] * count)

    observed = metrics.mutual_info_score(row_labels, col_labels)

    # Count permutations that produce a strictly larger statistic.
    exceed_count = 0
    for _ in range(num_samples):
        np.random.shuffle(row_labels)
        permuted = metrics.mutual_info_score(row_labels, col_labels)
        exceed_count += observed < permuted

    # Floor the estimate at the resolution of the test so it is never zero.
    resolution = 1.0/num_samples
    pval = (1.0*exceed_count) / num_samples
    return max(pval, resolution)
Example #5
Source File: scikit_base.py    From dffml with MIT License 4 votes vote down vote up
async def accuracy(self, sources: Sources) -> Accuracy:
    """Score the trained estimator on the records provided by `sources`.

    Declared ``async`` because the body iterates `sources` with ``async for``,
    which is only valid inside a coroutine function.

    When a ground-truth target is configured (only looked up for estimators
    whose ``_estimator_type`` is ``"clusterer"``), the score is
    `mutual_info_score` between the ground truth and the cluster labels;
    otherwise it is `silhouette_score` of the input data and the labels.
    Labels come from ``self.clf.predict`` when the estimator has one, and
    from ``self.clf.labels_`` otherwise.

    Parameters
    ----------
    sources : Sources
        Source of records; must support ``with_features(self.features)``
        as an async iterator.

    Returns
    -------
    Accuracy
        The computed score, which is also stored on ``self.confidence``.

    Raises
    ------
    ModelNotTrained
        If ``self._filepath`` is not an existing file.
    """
    if not self._filepath.is_file():
        raise ModelNotTrained("Train model before assessing for accuracy.")
    xdata = []
    ydata = []
    target = []
    if self.clf._estimator_type == "clusterer":
        target = (
            []
            if self.parent.config.tcluster is None
            else [self.parent.config.tcluster.name]
        )
    async for record in sources.with_features(self.features):
        feature_data = record.features(self.features)
        xdata.append(list(feature_data.values()))
        ydata.append(list(record.features(target).values()))
    xdata = self.np.array(xdata)
    self.logger.debug("Number of input records: {}".format(len(xdata)))
    if target:
        ydata = self.np.array(ydata).flatten()

    if hasattr(self.clf, "predict"):
        # inductive clusterer: xdata can be training data or unseen data
        labels = self.clf.predict(xdata)
    else:
        # transductive clusterer: labels_ only describes the training data,
        # so xdata must be that same training data
        self.logger.critical(
            "Accuracy found transductive clusterer, ensure data being passed is training data"
        )
        labels = self.clf.labels_

    if target:
        # ground truth available: compare it with the assigned labels
        self.confidence = mutual_info_score(ydata, labels)
    else:
        # no ground truth: fall back to an internal clustering metric
        self.confidence = silhouette_score(xdata, labels)
    self.logger.debug("Model Accuracy: {}".format(self.confidence))
    return self.confidence
Example #6
Source File: misc.py    From audit-ai with MIT License 4 votes vote down vote up
def one_way_mi(df, feature_list, group_column, y_var, bins):
    """
    Calculates, for every feature in a feature list, the one-way mutual
    information with a group variable and with a target variable (y).

    Each feature is binned jointly with the group column (and separately
    with the target column) via `np.histogramdd`; the bin counts are used as
    the contingency table for `mutual_info_score`.

    Parameters
    ----------
    df : pandas DataFrame
         df with features used to train model, plus a target variable
         and a group column.
    feature_list : list
        List of strings, feature names.
    group_column : string
        name of column for testing bias, should contain numeric categories
    y_var : string
        name of target variable column
    bins : tuple
        number of bins for each dimension

    Returns
    -------
    mi_table : pandas DataFrame
        data frame with mutual information values, with one row per feature
        in the feature_list, columns for group and y, plus a
        ``<name>_scaled`` column for each in which the values are divided
        by the column maximum.
    """
    group_cats = df[group_column].values
    y_cats = df[y_var].values

    def _joint_counts(other):
        # One 2-D histogram (feature vs. `other`) per feature; edges discarded.
        return [
            np.histogramdd([np.array(df[feature]), other], bins=bins)[0]
            for feature in feature_list
        ]

    group_tables = _joint_counts(group_cats)
    target_tables = _joint_counts(y_cats)

    # compute mutual information (MI) between each feature and group / y
    mi_g = [
        mutual_info_score(None, None, contingency=table)
        for table in group_tables
    ]
    mi_y = [
        mutual_info_score(None, None, contingency=table)
        for table in target_tables
    ]
    mi_table = pd.DataFrame({'feature': feature_list,
                             group_column: mi_g,
                             y_var: mi_y})

    # NOTE: Scale group and y so that the highest MI becomes 1, which makes
    # the relative importance to bias and performance easier to read.
    for column in (group_column, y_var):
        mi_table["{}_scaled".format(column)] = (
            mi_table[column] / mi_table[column].max()
        )

    return mi_table
Example #7
Source File: test_metrics.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def test_mutual_info_score(self):
    """The frame's metrics accessor must agree with `metrics.mutual_info_score`
    evaluated on the fixture's target and prediction."""
    self.assertEqual(
        self.df.metrics.mutual_info_score(),
        metrics.mutual_info_score(self.target, self.pred),
    )