Python sklearn.metrics.mutual_info_score() Examples
The following are 7 code examples of sklearn.metrics.mutual_info_score().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.metrics, or try the search function.
Example #1
Source File: stats.py From scprep with GNU General Public License v3.0 | 6 votes |
def mutual_information(x, y, bins=8):
    """Mutual information of two vectors, estimated from a binned histogram.

    Convenience wrapper around `sklearn.metrics.mutual_info_score`: the
    two inputs are binned jointly with `np.histogram2d` and the resulting
    counts are passed on as the contingency table.

    Credit: `Warren Weckesser <https://stackoverflow.com/a/20505476/3996580>`_.

    Parameters
    ----------
    x : array-like, shape=[n_samples]
        Input data (feature 1)
    y : array-like, shape=[n_samples]
        Input data (feature 2)
    bins : int or array-like, (default: 8)
        Forwarded to `np.histogram2d` when building the contingency table.

    Returns
    -------
    mi : float
        Mutual information between x and y.

    Examples
    --------
    >>> import scprep
    >>> data = scprep.io.load_csv("my_data.csv")
    >>> mi = scprep.stats.mutual_information(data['GENE1'], data['GENE2'])
    """
    x, y = _vector_coerce_two_dense(x, y)
    # histogram2d returns (counts, x_edges, y_edges); only the counts matter.
    contingency, _x_edges, _y_edges = np.histogram2d(x, y, bins)
    return metrics.mutual_info_score(None, None, contingency=contingency)
Example #2
Source File: pypsr.py From pypsr with MIT License | 5 votes |
def ami(x, y=None, n_bins=10):
    """Average mutual information between $x(t)$ and $y(t)$.

    The two series are binned into a joint histogram with
    `np.histogram2d`, and that histogram is passed to
    `metrics.mutual_info_score` as the contingency table.

    Parameters
    ----------
    x : array-like
    y : array-like, optional
        $x(t)$ and $y(t)$.  If only `x` is passed, it must have two
        columns; the first column defines $x(t)$ and the second $y(t)$.
    n_bins : int
        The number of bins to use when computing the joint histogram.

    Returns
    -------
    scalar
        Average mutual information between $x(t)$ and $y(t)$, in nats
        (natural log equivalent of bits).

    Raises
    ------
    ValueError
        If the two series differ in length along their first axis.

    See Also
    --------
    lagged_ami

    References
    ----------
    Abarbanel, H. D. (1996). *Analysis of Observed Chaotic Data* (p. 28).
    New York: Springer.
    """
    x, y = _vector_pair(x, y)
    if x.shape[0] != y.shape[0]:
        raise ValueError('timeseries must have the same length')

    joint_counts = np.histogram2d(x, y, bins=n_bins)[0]
    return metrics.mutual_info_score(None, None, contingency=joint_counts)
Example #3
Source File: correlate.py From rapidtide with Apache License 2.0 | 5 votes |
def calc_MI(x, y, bins):
    """Return the mutual information of ``x`` and ``y`` from a 2-D histogram.

    Parameters
    ----------
    x, y : array-like
        The two samples to compare; passed straight to ``np.histogram2d``.
    bins : int or array-like
        Bin specification forwarded to ``np.histogram2d``.

    Returns
    -------
    The value of ``mutual_info_score`` evaluated on the joint histogram
    counts, supplied as its ``contingency`` argument.
    """
    # Only the counts are needed; the bin edges are discarded.
    joint_hist, _, _ = np.histogram2d(x, y, bins)
    return mutual_info_score(None, None, contingency=joint_hist)
Example #4
Source File: hypothesis_test.py From fairtest with Apache License 2.0 | 4 votes |
def permutation_test_ct2(data, num_samples=10000):
    """
    Monte-Carlo permutation test for a 2-way contingency table.

    The table is expanded into one (row index, column index) pair per
    counted observation.  The mutual information of those two label lists
    is the observed statistic; the row labels are then shuffled
    ``num_samples`` times and the statistic recomputed each time.

    Parameters
    ----------
    data :
        the contingency table (a ``pd.DataFrame`` is converted with
        ``np.array`` first)

    num_samples :
        the number of random permutations to perform

    Returns
    -------
    pval :
        the fraction of permutations whose statistic is strictly greater
        than the observed one, but never less than ``1 / num_samples``

    References
    ----------
    https://en.wikipedia.org/wiki/Resampling_(statistics)
    """
    if isinstance(data, pd.DataFrame):
        data = np.array(data)

    shape = data.shape

    # Rebuild per-observation labels: cell (i, j) contributes data[i, j]
    # copies of i to the row labels and of j to the column labels.
    row_labels = []
    col_labels = []
    for i in range(0, shape[0]):
        for j in range(0, shape[1]):
            row_labels.extend([i] * data[i, j])
            col_labels.extend([j] * data[i, j])

    observed = metrics.mutual_info_score(row_labels, col_labels)

    # Shuffling happens in place, so each permutation starts from the
    # previous one rather than from the original ordering.
    exceed_count = 0
    for _ in range(num_samples):
        np.random.shuffle(row_labels)
        permuted = metrics.mutual_info_score(row_labels, col_labels)
        exceed_count += observed < permuted

    pval = (1.0 * exceed_count) / num_samples
    # Floor the estimate at the resolution of the test.
    return max(pval, 1.0 / num_samples)
Example #5
Source File: scikit_base.py From dffml with MIT License | 4 votes |
async def accuracy(self, sources: Sources) -> Accuracy:
    """
    Score the fitted estimator ``self.clf`` on the records in ``sources``.

    Declared ``async`` because the body iterates ``sources`` with
    ``async for``, which is only valid inside a coroutine function.

    When the estimator type is ``"clusterer"`` and
    ``self.parent.config.tcluster`` is set, that feature is read from each
    record as the ground truth and the score is
    ``mutual_info_score(truth, labels)``.  Otherwise the score is
    ``silhouette_score(xdata, labels)``.  ``labels`` come from
    ``self.clf.predict(xdata)`` when the estimator has a ``predict``
    method, else from ``self.clf.labels_``.

    Parameters
    ----------
    sources : Sources
        Async-iterable via ``sources.with_features(self.features)``.

    Returns
    -------
    The computed score, which is also stored on ``self.confidence``.

    Raises
    ------
    ModelNotTrained
        If ``self._filepath`` is not an existing file.
    """
    if not self._filepath.is_file():
        raise ModelNotTrained("Train model before assessing for accuracy.")

    # Ground-truth column name, if one is configured for a clusterer.
    target = []
    if self.clf._estimator_type == "clusterer":
        tcluster = self.parent.config.tcluster
        target = [] if tcluster is None else [tcluster.name]

    xdata = []
    ydata = []
    async for record in sources.with_features(self.features):
        feature_data = record.features(self.features)
        xdata.append(list(feature_data.values()))
        ydata.append(list(record.features(target).values()))
    xdata = self.np.array(xdata)
    self.logger.debug("Number of input records: {}".format(len(xdata)))

    if target:
        ydata = self.np.array(ydata).flatten()

    if hasattr(self.clf, "predict"):
        # Inductive clusterer: xdata can be training data or unseen data.
        labels = self.clf.predict(xdata)
    else:
        # Transductive clusterer: labels_ only describe the training data,
        # so xdata must be that same training data.
        self.logger.critical(
            "Accuracy found transductive clusterer, ensure data being passed is training data"
        )
        labels = self.clf.labels_

    if target:
        # With ground truth: compare the labels against the truth.
        self.confidence = mutual_info_score(ydata, labels)
    else:
        # Without ground truth: score the labels against the inputs.
        self.confidence = silhouette_score(xdata, labels)

    self.logger.debug("Model Accuracy: {}".format(self.confidence))
    return self.confidence
Example #6
Source File: misc.py From audit-ai with MIT License | 4 votes |
def one_way_mi(df, feature_list, group_column, y_var, bins):
    """
    Compute, for every feature in a list, its one-way mutual information
    with a group variable and with a target variable (y).

    Parameters
    ----------
    df : pandas DataFrame
        df with features used to train model, plus a target variable
        and a group column.
    feature_list : list
        List of strings, feature names.
    group_column : string
        name of column for testing bias, should contain numeric categories
    y_var : string
        name of target variable column
    bins : tuple
        number of bins for each dimension

    Returns
    -------
    mi_table : pandas DataFrame
        data frame with mutual information values, with one row per
        feature in the feature_list and columns for group and y, plus a
        ``<name>_scaled`` column for each of the two.
    """
    group_cats = df[group_column].values
    y_cats = df[y_var].values

    def _binned_mi(values, categories):
        # Joint histogram of the pair, used as the contingency table.
        counts = np.histogramdd([values, categories], bins=bins)[0]
        return mutual_info_score(None, None, contingency=counts)

    # compute mutual information (MI) between trait and gender/eth/y
    mi_g = []
    mi_y = []
    for feature in feature_list:
        values = np.array(df[feature])
        mi_g.append(_binned_mi(values, group_cats))
        mi_y.append(_binned_mi(values, y_cats))

    mi_table = pd.DataFrame({'feature': feature_list,
                             group_column: mi_g,
                             y_var: mi_y})

    # NOTE: Scale group and y where the highest MI is scaled to 1 to
    # facilitate interpreting relative importance to bias and performance
    for column in (group_column, y_var):
        column_max = mi_table[column].max()
        mi_table["{}_scaled".format(column)] = mi_table[column] / column_max

    return mi_table
Example #7
Source File: test_metrics.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 4 votes |
def test_mutual_info_score(self):
    """``self.df.metrics.mutual_info_score()`` must equal the direct call
    ``metrics.mutual_info_score(self.target, self.pred)``."""
    self.assertEqual(
        self.df.metrics.mutual_info_score(),
        metrics.mutual_info_score(self.target, self.pred),
    )