Python Examples of scipy.stats.f_oneway

Source File: TargetAnalysisCategorical.py From exploripy with MIT License

8 votes

def Anova(self):
    """
    Run a one-way ANOVA of every continuous feature against the target column.

    For each column in ``self.ContinuousFeatures`` the rows of ``self.df``
    that have a null in either that column or ``self.target`` are dropped,
    the remaining values are grouped by the distinct values of the target,
    and the groups are handed to ``stats.f_oneway``.

    Returns:
        A DataFrame with the columns ``Continuous`` and ``PValue`` holding
        the features whose p-value is <= 0.05, sorted by ascending p-value.
        If no feature could be tested at all, the DataFrame is empty and
        has no columns.
    """
    target = self.target
    AnovaList = []
    for ContinuousVar in self.ContinuousFeatures:
        temp_df = self.df[[ContinuousVar, target]].dropna()
        try:
            groups = [
                list(temp_df[temp_df[target] == name][ContinuousVar])
                for name in set(temp_df[target])
            ]
            _, p = stats.f_oneway(*groups)
        except Exception:
            # Best effort: a feature that cannot be tested is skipped.
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
        AnovaList.append(dict(Continuous=ContinuousVar, PValue=p))

    Anova_df = pd.DataFrame(AnovaList)
    if Anova_df.shape[0] > 0:
        # Filter and sort in one expression instead of sorting a filtered
        # slice in place.
        significant = Anova_df['PValue'] <= 0.05
        Anova_df = Anova_df[significant].sort_values(['PValue'], ascending=True)

    return Anova_df

Source File: TargetAnalysisContinuous.py From exploripy with MIT License

6 votes

def Anova(self):
    """
    Run a one-way ANOVA of the target column against every categorical feature.

    For each column in ``self.CategoricalFeatures`` the rows of ``self.df``
    that have a null in either that column or ``self.target`` are dropped,
    the target values are grouped by the distinct values of the categorical
    column, and the groups are handed to ``stats.f_oneway``. Progress is
    reported through ``tqdm``.

    Returns:
        A DataFrame with the columns ``Categorical`` and ``PValue`` holding
        the categorical features whose p-value is <= 0.05, sorted by
        ascending p-value. If no feature could be tested at all, the
        DataFrame is empty and has no columns.
    """
    target = self.target
    AnovaList = []
    print('Performing ANOVA...')
    for CategoricalVar in tqdm(self.CategoricalFeatures):
        temp_df = self.df[[CategoricalVar, target]].dropna()
        try:
            groups = [
                list(temp_df[temp_df[CategoricalVar] == name][target])
                for name in set(temp_df[CategoricalVar])
            ]
            _, p = stats.f_oneway(*groups)
        except Exception:
            # Best effort: a feature that cannot be tested is skipped.
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
        AnovaList.append(dict(Categorical=CategoricalVar, PValue=p))

    Anova_df = pd.DataFrame(AnovaList)
    if Anova_df.shape[0] > 0:
        # Filter and sort in one expression instead of sorting a filtered
        # slice in place.
        significant = Anova_df['PValue'] <= 0.05
        Anova_df = Anova_df[significant].sort_values(['PValue'], ascending=True)

    return Anova_df

Source File: test_correct.py From abagen with BSD 3-Clause "New" or "Revised" License

6 votes

def test__batch():
    """Exercise ``correct._batch_correct`` on three groups, one of them offset."""
    rng = np.random.RandomState(1234)

    # Before batch correction the first group is shifted by 5, so the ANOVA
    # p-values should all be ~0 (large group differences).
    groups = []
    for offset in [5, 0, 0]:
        groups.append(rng.normal(size=(100, 1000)) + offset)
    p_before = sstats.f_oneway(*groups)[1]
    assert np.allclose(p_before, 0)

    # After batch correction the F-values should all be ~0 (no group
    # differences); the p-values are sometimes NaN here, so they are not
    # checked.
    corrected = correct._batch_correct(groups)
    f_after = sstats.f_oneway(*corrected)[0]
    assert np.allclose(f_after, 0)

    # Mean expression of every corrected group should be ~equal.
    group_means = [arr.mean() for arr in corrected]
    assert np.allclose(group_means, 1.24871965683026)

    # A single group must be rejected with ValueError.
    with pytest.raises(ValueError):
        correct._batch_correct(groups[:1])

Source File: test_stats.py From GraphicDesignPatternByPython with MIT License

5 votes

def test_result_attributes(self):
    # The object returned by f_oneway is checked for the named fields
    # 'statistic' and 'pvalue'.
    first, second = (
        np.array(values, dtype=np.uint16)
        for values in ([655, 788], [789, 772])
    )
    check_named_results(stats.f_oneway(first, second), ('statistic', 'pvalue'))

Source File: EDA.py From exploripy with MIT License

5 votes

def Anova(self):
    """
    One-way ANOVA (F-score) of every categorical feature against every
    continuous feature.

    Rows of ``self.df`` containing any null are dropped before testing.
    Returns a 2-tuple: the list of per-pair result dicts (keys
    ``Categorical``, ``Continuous``, ``f``, ``p``, ``Binary``, ``Insight``,
    ``TukeyResult``) and a DataFrame summary with the columns
    ``Categorical``, ``Continuous`` and ``PValue``.
    """
    started = time.time()
    # Only complete records take part in the analysis
    complete = self.df.dropna()

    influenced_msg = "With Confidence interval of 0.05, the variable - \"{0}\" is influenced by the categorical variable - \"{1}\". "
    binary_msg = "As the Categorical variable - \"{0}\" is binary, Tukey's HSD test is not necessary. "
    not_influenced_msg = "As the p-Value is higher than the Confidence Interval 0.05, the variable - \"{0}\" is not influenced by the categorical variable - \"{1}\". "

    AnovaList = []
    for cat in self.CategoricalFeatures:
        if cat in self.BinaryCategoricalFeatures:
            binary_flag = 'Yes'
        else:
            binary_flag = 'No'
        for cont in self.ContinuousFeatures:
            samples = []
            for level in set(complete[cat]):
                samples.append(list(complete[complete[cat] == level][cont]))
            f, p = stats.f_oneway(*samples)

            tukey = None
            if not p < 0.05:
                # Also reached when p is NaN, exactly like the final else of
                # an ``if p<0.05`` chain.
                insight = not_influenced_msg.format(cont, cat)
            elif cat in self.BinaryCategoricalFeatures:
                insight = influenced_msg.format(cont, cat) + binary_msg.format(cat)
            else:
                # Significant and more than two levels: run the post-hoc test
                tukey = self.Tukey(cat, cont)
                insight = influenced_msg.format(cont, cat)

            AnovaList.append(dict(
                Categorical=cat,
                Continuous=cont,
                f=f,
                p=p,
                Binary=binary_flag,
                Insight=insight,
                TukeyResult=tukey,
            ))

    SummaryAnovaList = [
        dict(Categorical=row['Categorical'], Continuous=row['Continuous'], PValue=row['p'])
        for row in AnovaList
    ]

    finished = time.time()
    if self.debug == 'YES':
        print('Anova', finished - started)
    return AnovaList, pd.DataFrame(SummaryAnovaList)

Source File: test_feature_select.py From twitter-stock-recommendation with MIT License

5 votes

def test_f_oneway_ints():
    # Smoke test f_oneway on integers: that it does not raise casting errors
    # with recent numpys
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(10, 10))
    y = np.arange(10)
    fint, pint = f_oneway(X, y)

    # test that it gives the same result as with float.
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy, so the explicit float64 dtype is used instead.
    f, p = f_oneway(X.astype(np.float64), y)
    assert_array_almost_equal(f, fint, decimal=4)
    assert_array_almost_equal(p, pint, decimal=4)

Source File: test_feature_select.py From twitter-stock-recommendation with MIT License

5 votes

def test_f_oneway_vs_scipy_stats():
    # Test that our f_oneway gives the same result as scipy.stats
    rng = np.random.RandomState(0)
    X1 = rng.randn(10, 3)
    X2 = 1 + rng.randn(10, 3)
    f, pv = stats.f_oneway(X1, X2)
    f2, pv2 = f_oneway(X1, X2)
    # Plain asserts instead of the nose-style ``assert_true`` helper: pytest
    # reports them just as well and no extra import is needed.
    assert np.allclose(f, f2)
    assert np.allclose(pv, pv2)

Source File: misc.py From audit-ai with MIT License

5 votes

def anova(labels, results, subset_labels=None):
    """
    Compute the one-way ANOVA f-statistic and p-value for numeric results
    grouped by their categorical labels.

    Parameters
    ------------
    labels : array_like
        categorical values, e.g. ['M', 'F'], one per result
    results : array_like
        real numbers, one per label
    subset_labels : list of strings, optional
        when given, only rows whose label is in this list are used

    Returns
    ----------
    F_onewayResult : scipy.stats object (essentially a 2-tuple)
        one-way f-statistic and p-value, indicating whether the groups'
        scores have the same sample mean

    """
    check_consistent_length(labels, results)

    pairs = list(zip(labels, results))
    frame = pd.DataFrame(pairs, columns=['label', 'result'])
    if subset_labels is not None:
        keep = frame['label'].isin(subset_labels)
        frame = frame.loc[keep]

    # One vector of results per distinct non-null label
    score_vectors = []
    for lab in frame['label'].dropna().unique():
        score_vectors.append(frame.loc[frame['label'] == lab, 'result'])
    return f_oneway(*score_vectors)

Source File: plot.py From SCALE with MIT License

5 votes

def feature_specifity(feature, ref, classes, figsize=(6,6), save=None):
    """
    Calculate the feature specifity and draw it as a heatmap.

    For every cluster in ``classes`` and every feature column, a one-way
    ANOVA compares the feature values inside the cluster (``ref == cluster``)
    with those outside it (``ref != cluster``). The heatmap shows
    ``-log10(p-value)`` with one row per feature and one column per cluster.

    Input:
        feature: latent feature; indexed with ``.iloc`` and ``.shape[1]``,
            so presumably a pandas DataFrame (samples x features) - confirm
            against callers
        ref: cluster assignments, compared element-wise against each class
        classes: cluster classes, used as the x tick labels
        figsize: figure size passed to ``plt.figure``
        save: if truthy, path the figure is saved to as PDF; otherwise the
            figure is shown
    """
    from scipy.stats import f_oneway
    n_cluster = len(classes)
    dim = feature.shape[1] # feature dimension
    pvalue_mat = np.zeros((dim, n_cluster))
    for i, cluster in enumerate(classes):
        # The membership masks do not depend on the feature column.
        inside = ref == cluster
        outside = ref != cluster
        for feat in range(dim):
            column = feature.iloc[:, feat]
            pvalue_mat[feat, i] = f_oneway(column[inside], column[outside])[1]

    # One label per feature row. The previous hard-coded ``np.arange(10)+1``
    # only matched the matrix when there were exactly 10 features.
    feature_labels = np.arange(dim) + 1

    plt.figure(figsize=figsize)
    grid = sns.heatmap(-np.log10(pvalue_mat), cmap='RdBu_r',
                       vmax=20,
                       yticklabels=feature_labels,
                       xticklabels=classes,
                       )
    grid.set_ylabel('Feature', fontsize=18)
    grid.set_xticklabels(labels=classes, rotation=45, fontsize=18)
    grid.set_yticklabels(labels=feature_labels, fontsize=16)
    cbar = grid.collections[0].colorbar
    cbar.set_label('-log10 (Pvalue)', fontsize=18)

    if save:
        plt.savefig(save, format='pdf', bbox_inches='tight')
    else:
        plt.show()

Source File: test_stats.py From GraphicDesignPatternByPython with MIT License

5 votes

def test_nist(self):
    # Compare stats.f_oneway with the certified F statistics of the NIST
    # ANOVA reference files. They can be found at:
    # http://www.itl.nist.gov/div898/strd/anova/anova.html
    filenames = ['SiRstv.dat', 'SmLs01.dat', 'SmLs02.dat', 'SmLs03.dat',
                 'AtmWtAg.dat', 'SmLs04.dat', 'SmLs05.dat', 'SmLs06.dat',
                 'SmLs07.dat', 'SmLs08.dat', 'SmLs09.dat']
    # With the hard test cases we relax the tolerance a bit.
    hard_tc = ('SmLs07.dat', 'SmLs08.dat', 'SmLs09.dat')
    data_dir = os.path.join(os.path.dirname(__file__), 'data/nist_anova')

    for test_case in filenames:
        fname = os.path.abspath(os.path.join(data_dir, test_case))

        # Lines 40-47 (0-based) of the file are read as the certified block;
        # the last token of its first non-blank line is the expected F.
        with open(fname, 'r') as handle:
            raw_lines = handle.read().split('\n')
        certified = [row.split() for row in raw_lines[40:48] if row.strip()]

        # The numeric table starts after 60 header lines: group id, value.
        table = np.loadtxt(fname, skiprows=60)
        group_ids, values = table.T
        group_ids = group_ids.astype(int)
        levels = np.unique(group_ids)
        expected_f = float(certified[0][-1])

        samples = [values[group_ids == level] for level in levels]
        res = stats.f_oneway(*samples)

        rtol = 1e-4 if test_case in hard_tc else 1e-7
        assert_allclose(res[0], expected_f, rtol=rtol,
                        err_msg='Failing testcase: %s' % test_case)

Source File: pancreas_tests.py From scanorama with MIT License

5 votes

def print_oneway(X, genes, ds_labels):
    """
    Print one tab-separated line per gene: gene name, one-way ANOVA
    statistic and p-value across datasets.

    X is indexed as ``X[mask, gene_idx]`` and ``ds_labels == ds`` is used as
    the row mask, so both are presumably NumPy arrays (rows of X aligned
    with ds_labels) - confirm against callers.
    """
    # The dataset names do not depend on the gene, so they are computed once
    # instead of once per gene.
    ds_names = sorted(set(ds_labels))
    for gene_idx, gene in enumerate(genes):
        dist = [X[ds_labels == ds, gene_idx] for ds in ds_names]
        sys.stdout.write('{}\t'.format(gene))
        print('{}\t{}'.format(*f_oneway(*dist)))

Source File: test_stats.py From GraphicDesignPatternByPython with MIT License

5 votes

def test_large_integer_array(self):
        a = np.array([655, 788], dtype=np.uint16)
        b = np.array([789, 772], dtype=np.uint16)
        F, p = stats.f_oneway(a, b)
        assert_almost_equal(F, 0.77450216931805538)

Source File: test_stats.py From GraphicDesignPatternByPython with MIT License

5 votes

def test_basic(self):
        # Despite being a floating point calculation, this data should
        # result in F being exactly 2.0.
        F, p = stats.f_oneway([0,2], [2,4])
        assert_equal(F, 2.0)

Source File: test_stats.py From GraphicDesignPatternByPython with MIT License

5 votes

def test_trivial(self):
        # A trivial test of stats.f_oneway, with F=0.
        F, p = stats.f_oneway([0,2], [0,2])
        assert_equal(F, 0.0)

Source File: eda.py From xam with MIT License

5 votes

def feature_importance_regression(features, target, n_neighbors=3, random_state=None):
    """
    Score each feature's relationship with a regression target.

    Floating-point columns of ``features`` are treated as continuous and get
    a Pearson correlation (``pearson_r``, ``pearson_r_p_value``) plus
    ``mutual_information``. Integer and boolean columns are treated as
    discrete and get a one-way ANOVA of the target grouped by the feature's
    values (``f_statistic``, ``f_p_value``) plus ``mutual_information``.

    Parameters:
        features: DataFrame of candidate features.
        target: target values; indexed with the row labels of ``features``,
            so presumably a Series sharing its index - confirm against
            callers.
        n_neighbors: forwarded to ``mutual_info_regression``.
        random_state: forwarded to ``mutual_info_regression``.

    Returns:
        ``(cont_imp, disc_imp)``: two DataFrames indexed by the continuous
        and the discrete column names respectively. A frame has no score
        columns when there are no features of its kind.
    """

    cont = features.select_dtypes(include=[np.floating])
    # ``np.bool`` was a deprecated alias that NumPy has removed; ``np.bool_``
    # is the boolean scalar type and selects the same columns.
    disc = features.select_dtypes(include=[np.integer, np.bool_])

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:

        # Pearson correlation (``DataFrame.iteritems`` was removed in
        # pandas 2.0; ``items`` yields the same (name, column) pairs)
        pearson = np.array([stats.pearsonr(feature, target) for _, feature in cont.items()])
        cont_imp['pearson_r'] = pearson[:, 0]
        cont_imp['pearson_r_p_value'] = pearson[:, 1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_regression(cont, target, discrete_features=False,
                                                           n_neighbors=n_neighbors,
                                                           random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:

        # F-test
        f_tests = defaultdict(dict)

        for feature in disc.columns:
            # One group of target values per distinct value of the feature
            groups = [target[idxs] for idxs in disc.groupby(feature).groups.values()]
            statistic, p_value = stats.f_oneway(*groups)
            f_tests[feature]['f_statistic'] = statistic
            f_tests[feature]['f_p_value'] = p_value

        f_tests_df = pd.DataFrame.from_dict(f_tests, orient='index')
        disc_imp['f_statistic'] = f_tests_df['f_statistic']
        disc_imp['f_p_value'] = f_tests_df['f_p_value']

        # Mutual information
        mut_inf = feature_selection.mutual_info_regression(disc, target, discrete_features=True,
                                                           n_neighbors=n_neighbors,
                                                           random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp

Source File: test_feature_select.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_f_oneway_ints():
    # Smoke test f_oneway on integers: that it does not raise casting errors
    # with recent numpys
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(10, 10))
    y = np.arange(10)
    fint, pint = f_oneway(X, y)

    # test that it gives the same result as with float.
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy, so the explicit float64 dtype is used instead.
    f, p = f_oneway(X.astype(np.float64), y)
    assert_array_almost_equal(f, fint, decimal=4)
    assert_array_almost_equal(p, pint, decimal=4)

Source File: test_feature_select.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_f_oneway_vs_scipy_stats():
    # Our f_oneway must agree with scipy.stats.f_oneway on both the
    # F statistic and the p-value.
    rng = np.random.RandomState(0)
    sample_a = rng.randn(10, 3)
    sample_b = 1 + rng.randn(10, 3)
    reference = stats.f_oneway(sample_a, sample_b)
    ours = f_oneway(sample_a, sample_b)
    for expected, actual in zip(reference, ours):
        assert np.allclose(expected, actual)

Source File: test_stats.py From Computable with MIT License

5 votes

def test_basic(self):
        # A test of stats.f_oneway, with F=2.
        F, p = stats.f_oneway([0,2], [2,4])
        # Despite being a floating point calculation, this data should
        # result in F being exactly 2.0.
        assert_equal(F, 2.0)

Source File: test_stats.py From Computable with MIT License

5 votes

def test_trivial(self):
        # A trivial test of stats.f_oneway, with F=0.
        F, p = stats.f_oneway([0,2], [0,2])
        assert_equal(F, 0.0)

Source File: ANOVA.py From TabPy with MIT License

5 votes

def anova(_arg1, _arg2, *_argN):
    """
    ANOVA is a statistical hypothesis test that is used to compare
    two or more group means for equality. For more information on
    the function and how to use it please refer to tabpy-tools.md

    Each argument is one group of values. Only the first element of each
    group is type-checked. Returns the p-value of ``stats.f_oneway`` over
    all groups.

    Raises:
        ValueError: if the first element of any group is not a real number.
    """
    # Local import so the block does not depend on a module-level import.
    import numbers

    cols = [_arg1, _arg2] + list(_argN)
    for col in cols:
        # numbers.Real covers int and float as before, and additionally
        # NumPy integer scalars, which are not instances of ``int``.
        if not isinstance(col[0], numbers.Real):
            print("values must be numeric")
            raise ValueError("values must be numeric")
    _, p_value = stats.f_oneway(*cols)
    return p_value

Source File: numerical_comparison.py From DIVE-backend with GNU General Public License v3.0

4 votes

def get_valid_tests(equal_var, independent, normal, num_samples):
    '''
    Get valid tests given number of samples and statistical characterization of
    samples:

    Equal variance
    Independence
    Normality

    Returns a dict mapping a test name to the scipy.stats function that
    implements it. The dict is empty when num_samples is less than 1.
    '''
    # Start from an empty mapping so that every branch, including the
    # dependent case for three or more samples and num_samples < 1, has a
    # dict to fill and return.
    valid_tests = {}

    if num_samples == 1:
        valid_tests = {
            'chisquare': stats.chisquare,
            'power_divergence': stats.power_divergence,
            'kstest': stats.kstest
        }
        if normal:
            # Stored flat like every other entry; the former nested
            # ['input'] lookup raised KeyError.
            valid_tests['one_sample_ttest'] = stats.ttest_1samp

    elif num_samples == 2:
        if independent:
            valid_tests = {
                'mannwhitneyu': stats.mannwhitneyu,
                'kruskal': stats.kruskal,
                'ks_2samp': stats.ks_2samp
            }
            if normal:
                valid_tests['two_sample_ttest'] = stats.ttest_ind
                if equal_var:
                    valid_tests['f_oneway'] = stats.f_oneway
        else:
            valid_tests = {
                'two_sample_ks': stats.ks_2samp,
                'wilcoxon': stats.wilcoxon
            }
            if normal:
                valid_tests['two_sample_related_ttest'] = stats.ttest_rel

    elif num_samples >= 3:
        if independent:
            valid_tests = {
                'kruskal': stats.kruskal
            }
            if normal and equal_var:
                valid_tests['f_oneway'] = stats.f_oneway

        else:
            valid_tests['friedmanchisquare'] = stats.friedmanchisquare

    return valid_tests

Python scipy.stats.f_oneway() Examples