Python scipy.stats.ttest_ind() Examples

The following are 30 code examples of scipy.stats.ttest_ind(), drawn from open-source projects. Each example notes its source file, the project it comes from, and that project's license. You may also want to check out the other functions and classes available in the scipy.stats module.
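Before the project examples, here is a minimal, self-contained sketch of a typical call; the synthetic arrays and variable names below are illustrative only and are not taken from any of the projects that follow.

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
a = rng.normal(loc=0.0, scale=1.0, size=50)   # first sample
b = rng.normal(loc=0.5, scale=1.0, size=50)   # second sample

# Student's t-test (assumes equal variances by default)
t_stat, p_value = stats.ttest_ind(a, b)

# Welch's t-test (equal_var=False drops the equal-variance assumption)
t_welch, p_welch = stats.ttest_ind(a, b, equal_var=False)

print(t_stat, p_value)
print(t_welch, p_welch)

Both calls return a result object that unpacks into the test statistic and the two-sided p-value; several of the examples below also use the nan_policy, axis, and equal_var keyword arguments.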
Example #1
Source File: param_sensitivity.py    From scanorama with MIT License
def test_knn(datasets_dimred, genes, labels, idx, distr, xlabels):
    knns = [ 5, 10, 50, 100 ]
    len_distr = len(distr)
    for knn in knns:
        integrated = assemble(datasets_dimred[:], knn=knn, sigma=150)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(knn))
    print('')
    
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('knn')) 
Example #2
Source File: param_sensitivity.py    From scanorama with MIT License
def test_sigma(datasets_dimred, genes, labels, idx, distr, xlabels):
    sigmas = [ 10, 50, 100, 200 ]
    len_distr = len(distr)
    for sigma in sigmas:
        integrated = assemble(datasets_dimred[:], sigma=sigma)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(sigma))
    print('')
    
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('sigma')) 
Example #3
Source File: scdiff.py    From scdiff with MIT License
def getDiffGeneTTest(self):
    cut = 5e-2
    AE = [item.E for item in self.fromNode.cells]
    BE = [item.E for item in self.toNode.cells]
    G = range(len(AE[0]))
    TT = []
    for i in G:
        X = [item[i] for item in AE]
        Y = [item[i] for item in BE]
        pxy = ttest_ind(X, Y)[-1]
        TT.append([pxy, i])
    TT.sort()
    TT = [item for item in TT if item[0] < cut]
    DG = [[self.GL[item[1]], item[0]] for item in TT if self.GL[item[1]]]
    return DG

        #------------------------------------------------------------------- 
Example #4
Source File: scdiff.py    From scdiff with MIT License
def tellDifference(nodeCells,nodePCells,geneIndex,fcut=0.6):
	if len(nodePCells)==0:
		return [0,1,0]
	X=[item.E[geneIndex] for item in nodeCells]
	Y=[item.E[geneIndex] for item in nodePCells]
	fc=sum(X)/len(X)-sum(Y)/len(Y)
	pv=ttest_ind(X,Y)[1]
	
	pcut=0.05
	if pv<pcut and fc>fcut:
		return [1,pv,fc]
	if pv<pcut and fc<-1*fcut:
		return [-1,pv,fc]	
	return [0,pv,fc]
#-----------------------------------------------------------------------------
# filter X 
Example #5
Source File: numerical_comparison.py    From DIVE-backend with GNU General Public License v3.0
def ttest(df, fields, indep, dep):
    # Ensure single field
    dep_field_name = dep[0]
    indep_field_name = indep[0]
    unique_indep_values = get_unique(df[indep_field_name])

    subsets = {}
    for v in unique_indep_values:
        subsets[v] = np.array(df[df[indep_field_name] == v][dep_field_name])

    result = {}
    for (x, y) in combinations(unique_indep_values, 2):
        (statistic, pvalue) = ttest_ind(subsets[x], subsets[y])
        result[str([x, y])] = {
            'statistic': statistic,
            'pvalue': pvalue
        }

    return result



##################
#Functions to determine which tests could be run
################## 
Example #6
Source File: test_stats.py    From GraphicDesignPatternByPython with MIT License
def test_ttest_ind_nan_2nd_arg():
    # regression test for gh-6134: nans in the second arg were not handled
    x = [np.nan, 2.0, 3.0, 4.0]
    y = [1.0, 2.0, 1.0, 2.0]

    r1 = stats.ttest_ind(x, y, nan_policy='omit')
    r2 = stats.ttest_ind(y, x, nan_policy='omit')
    assert_allclose(r2.statistic, -r1.statistic, atol=1e-15)
    assert_allclose(r2.pvalue, r1.pvalue, atol=1e-15)

    # NB: arguments are not paired when NaNs are dropped
    r3 = stats.ttest_ind(y, x[1:])
    assert_allclose(r2, r3, atol=1e-15)

    # .. and this is consistent with R. R code:
    # x = c(NA, 2.0, 3.0, 4.0)
    # y = c(1.0, 2.0, 1.0, 2.0)
    # t.test(x, y, var.equal=TRUE)
    assert_allclose(r2, (-2.5354627641855498, 0.052181400457057901), atol=1e-15) 
Example #7
Source File: statistics.py    From HPOlib with GNU General Public License v3.0
def calculate_statistics(best_dict, keys, round_=0):
    p_values = dict()
    for key in keys:
        p_values[key] = defaultdict(float)

    for idx, key0 in enumerate(keys):
        if len(keys) > 1:
            for j, key1 in enumerate(keys[idx+1:]):
                if round_ > 0:
                    t_true, p_true = stats.ttest_ind(numpy.round(best_dict[key0], round_),
                                                        numpy.round(best_dict[key1], round_))
                else:
                    t_true, p_true = stats.ttest_ind(best_dict[key0], best_dict[key1])
                p_values[key0][key1] = p_true

    return p_values 
Example #8
Source File: match.py    From pscore_match with BSD 2-Clause "Simplified" License
def t_test(covariates, groups):
    """ 
    Two sample t test for the distribution of treatment and control covariates
    
    Parameters
    ----------
    covariates : DataFrame 
        Dataframe with one covariate per column.
        If matches are with replacement, then duplicates should be 
        included as additional rows.
    groups : array-like
        treatment assignments, must be 2 groups
    
    Returns
    -------
    A list of p-values, one for each column in covariates
    """
    colnames = list(covariates.columns)
    J = len(colnames)
    pvalues = np.zeros(J)
    for j in range(J):
        var = covariates[colnames[j]]
        res = ttest_ind(var[groups == 1], var[groups == 0])
        pvalues[j] = res.pvalue
    return pvalues 
Example #9
Source File: test_stats.py    From scprep with GNU General Public License v3.0
def test_t_statistic():
    X = data.generate_positive_sparse_matrix(shape=(500, 3), seed=42, poisson_mean=0.2)
    Y = data.generate_positive_sparse_matrix(shape=(500, 3), seed=42, poisson_mean=0.3)
    u_stat = [
        stats.ttest_ind(X[:, i], Y[:, i], equal_var=False)[0] for i in range(X.shape[1])
    ]

    def test_fun(X):
        return scprep.stats.t_statistic(
            scprep.select.select_rows(X, idx=np.arange(500)),
            scprep.select.select_rows(X, idx=np.arange(500, 1000)),
        )

    matrix.test_all_matrix_types(
        np.vstack([X, Y]),
        utils.assert_transform_equals,
        Y=u_stat,
        transform=test_fun,
        check=partial(utils.assert_all_close, rtol=2e-3),
    ) 
Example #10
Source File: analyser.py    From spotpy with MIT License
def compare_different_objectivefunctions(like1,like2):
    """
    Performs Welch's t-test (aka the unequal variances t-test)

    :like1: objective function values
    :type: list

    :like2: other objective function values
    :type: list

    :return: t-statistic and p-value
    :rtype: tuple
    """
    from scipy import stats
    out = stats.ttest_ind(like1, like2, equal_var=False)
    print(out)
    if out[1]>0.05:
        print('like1 is NOT significantly different from like2: p>0.05')
    else:
        print('like1 is significantly different from like2: p<0.05')
    return out 
Example #11
Source File: loss.py    From CausalDiscoveryToolbox with MIT License
def loop(self, xy, yx):
        """ Tests the loop condition based on the new results and the
        parameters.

        Args:
            xy (list): list containing all the results for one set of samples
            yx (list): list containing all the results for the other set.

        Returns:
            bool: True if the loop has to continue, False otherwise.
        """
        if self.iter < 2:
            self.iter += self.runs_per_iter
            return True
        t_test, self.p_value = ttest_ind(xy, yx, equal_var=False)
        if self.p_value > self.threshold and self.iter < self.max_iter:
            self.iter += self.runs_per_iter
            return True
        else:
            return False 
Example #12
Source File: models.py    From redtide with MIT License
def price_trend(self, symbol=None, k=5, metric='rvalue', baseval=0):
        if k < 3:
            raise ValueError('Stocks.price_trend: k >= 3 is a must')
        if metric not in ['slope', 'rvalue', 'pvalue', 'stderr']:
            raise ValueError('Stocks.price_trend: invalid metric: {}'.format(metric))
        if symbol is None:
            values = []
            for s in self.stocks.values():
                r = s.price_trend(k)
                if r:
                    values.append(getattr(r, metric))
            n = len(values)
            if n >= 2:
                # n depends on how many stocks are being watched; meaningless if few
                return ttest_ind(values, [baseval]*n, equal_var=False)
        elif self.has_stock(symbol):
            return self.stocks[symbol].price_trend(k)
        return None 
Example #13
Source File: models.py    From redtide with MIT License
def volume_trend(self, symbol=None, k=5, metric='rvalue', baseval=0):
        if k < 3:
            raise ValueError('Stocks.volume_trend: k >= 3 is a must')
        if metric not in ['slope', 'rvalue', 'pvalue', 'stderr']:
            raise ValueError('Stocks.volume_trend: invalid metric: {}'.format(metric))
        if symbol is None:
            values = []
            for s in self.stocks.values():
                r = s.volume_trend(k)
                if r:
                    values.append(getattr(r, metric))
            n = len(values)
            if n >= 2:
                return ttest_ind(values, [baseval]*n, equal_var=False)
        elif self.has_stock(symbol):
            return self.stocks[symbol].volume_trend(k)
        return None 
Example #14
Source File: test_weightstats.py    From vnpy_crypto with MIT License
def test_weightstats_1(self):
        x1, x2 = self.x1, self.x2
        w1, w2 = self.w1, self.w2
        w1_ = 2. * np.ones(len(x1))
        w2_ = 2. * np.ones(len(x2))

        d1 = DescrStatsW(x1)
#        print ttest_ind(x1, x2)
#        print ttest_ind(x1, x2, usevar='unequal')
#        #print ttest_ind(x1, x2, usevar='unequal')
#        print stats.ttest_ind(x1, x2)
#        print ttest_ind(x1, x2, usevar='unequal', alternative='larger')
#        print ttest_ind(x1, x2, usevar='unequal', alternative='smaller')
#        print ttest_ind(x1, x2, usevar='unequal', weights=(w1_, w2_))
#        print stats.ttest_ind(np.r_[x1, x1], np.r_[x2,x2])
        assert_almost_equal(ttest_ind(x1, x2, weights=(w1_, w2_))[:2],
                            stats.ttest_ind(np.r_[x1, x1], np.r_[x2, x2])) 
Example #15
Source File: param_sensitivity.py    From scanorama with MIT License
def test_alpha(datasets_dimred, genes, labels, idx, distr, xlabels):
    alphas = [ 0, 0.05, 0.20, 0.50 ]
    len_distr = len(distr)
    for alpha in alphas:
        integrated = assemble(datasets_dimred[:], alpha=alpha, sigma=150)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(alpha))
    print('')
    
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('alpha')) 
Example #16
Source File: param_sensitivity.py    From scanorama with MIT License
def test_approx(datasets_dimred, genes, labels, idx, distr, xlabels):
    integrated = assemble(datasets_dimred[:], approx=False, sigma=150)
    X = np.concatenate(integrated)
    distr.append(sil(X[idx, :], labels[idx]))
    len_distr = len(distr)
    for d in distr[:len_distr]:
        print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
    xlabels.append('Exact NN')
    print('')
    
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('approx')) 
Example #17
Source File: param_sensitivity.py    From scanorama with MIT License
def test_perplexity(datasets_dimred, genes, labels, idx,
                    distr, xlabels):
    X = np.concatenate(datasets_dimred)

    perplexities = [ 10, 100, 500, 2000 ]
    len_distr = len(distr)
    for perplexity in perplexities:
        embedding = fit_tsne(X, perplexity=perplexity)
        distr.append(sil(embedding[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(perplexity))
    print('')
    
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('perplexity')) 
Example #18
Source File: compare_genomes.py    From mCaller with MIT License
def compare_by_position(bed1,bed2,xmfa):
    pos_dict = {}

    for i,bed in enumerate([bed1,bed2]):
        pos_dict[i] = {}
        with open(bed,'r') as fi:
            for line in fi:
                # example line: 2  1892198 1892199 TCMMTMTTMMM 0.5 -   16
                csome,start,end,motif,perc_meth,strand,num_reads,probabilities = tuple(line.split('\t'))
                pos_dict[i][(csome,start,end,strand)] = ((perc_meth,num_reads),np.asarray([float(p) for p in probabilities.strip().split(',')]))

    for pos in pos_dict[0]:
        if pos in pos_dict[1]:
            # compare the probability distributions from the two bed files at this position
            try:
                u,pval = mannwhitneyu(pos_dict[0][pos][1],pos_dict[1][pos][1],alternative='two-sided')
            except ValueError:
                u,pval = 'none','identical'
            u2,pval2 = ranksums(pos_dict[0][pos][1],pos_dict[1][pos][1])
            try:
                t,pval3 = ttest_ind(pos_dict[0][pos][1],pos_dict[1][pos][1])
            except:
                t,pval3 = 'none','missing df'
            d,pval4 = ks_2samp(pos_dict[0][pos][1],pos_dict[1][pos][1])
            if pval4 < 0.9:
                print(pos, pos_dict[0][pos][0], pos_dict[1][pos][0], pval, pval2, pval3, pval4)
Example #19
Source File: EDA.py    From exploripy with MIT License
def TTest(self):
		"""
		Calculate p-values using a two-sample t-test. Applicable only to each binary categorical variable paired with the continuous variables.
		"""
		temp_df = self.df.dropna()
		start = time.time()
		TList = []
		Insight1 = "At a significance level of 0.05, the distribution of the variable \"{0}\" varies significantly based on the categorical variable \"{1}\". "
		Insight2 = "At a significance level of 0.05, the distribution of the variable \"{0}\" does not vary significantly based on the categorical variable \"{1}\". "
		for CategoricalVar in self.BinaryCategoricalFeatures:
			for ContinuousVar in self.ContinuousFeatures:
				binary1 = temp_df[CategoricalVar].unique()[0]
				binary2 = temp_df[CategoricalVar].unique()[1]
				TStat,p = stats.ttest_ind(temp_df[temp_df[CategoricalVar]==binary1][ContinuousVar],temp_df[temp_df[CategoricalVar]==binary2][ContinuousVar])
				if p <= 0.05:
					Insight = Insight1.format(ContinuousVar,CategoricalVar)
				else:
					Insight = Insight2.format(ContinuousVar,CategoricalVar)
				TList.append(dict(Continuous=ContinuousVar,Categorical=CategoricalVar,TStat=TStat,PValue=p,Insight=Insight))
		
		end = time.time()
		if self.debug == 'YES':
			print('T-Test',end-start)
			print(pd.DataFrame(TList))
		return pd.DataFrame(TList) 
Example #20
Source File: basenji_test_reps.py    From basenji with Apache License 2.0
def stat_tests(ref_cors, exp_cors, alternative):
  _, mwp = mannwhitneyu(ref_cors, exp_cors, alternative=alternative)
  tt, tp = ttest_ind(ref_cors, exp_cors)

  if alternative == 'less':
    if tt > 0:
      tp = 1 - (1-tp)/2
    else:
      tp /= 2
  elif alternative == 'greater':
    if tt <= 0:
      tp /= 2
    else:
      tp = 1 - (1-tp)/2

  return mwp, tp

################################################################################
# __main__
################################################################################ 
Example #21
Source File: Statistics.py    From ClearMap with GNU General Public License v3.0
def tTestVoxelization(group1, group2, signed = False, removeNaN = True, pcutoff = None):
    """t-Test on differences between the individual voxels in group1 and group2, group is a array of voxelizations"""
    
    g1 = self.readDataGroup(group1);  
    g2 = self.readDataGroup(group2);  
    
    tvals, pvals = stats.ttest_ind(g1, g2, axis = 0, equal_var = True);

    #remove nans
    if removeNaN: 
        pi = numpy.isnan(pvals);
        pvals[pi] = 1.0;
        tvals[pi] = 0;

    pvals = self.cutoffPValues(pvals, pcutoff = pcutoff);

    #return
    if signed:
        return pvals, numpy.sign(tvals);
    else:
        return pvals; 
Example #22
Source File: Statistics.py    From ClearMap with GNU General Public License v3.0
def tTestPointsInRegions(pointCounts1, pointCounts2, labeledImage = lbl.DefaultLabeledImageFile, signed = False, removeNaN = True, pcutoff = None, equal_var = False):
    """t-Test on differences in counts of points in labeled regions"""
    
    #ids, p1 = countPointsGroupInRegions(pointGroup1, labeledImage = labeledImage, withIds = True);
    #p2 = countPointsGroupInRegions(pointGroup2,  labeledImage = labeledImage, withIds = False);   
    
    tvals, pvals = stats.ttest_ind(pointCounts1, pointCounts2, axis = 1, equal_var = equal_var);
    
    #remove nans
    if removeNaN: 
        pi = numpy.isnan(pvals);
        pvals[pi] = 1.0;
        tvals[pi] = 0;

    pvals = self.cutoffPValues(pvals, pcutoff = pcutoff);
    
    #pvals.shape = (1,) + pvals.shape;
    #ids.shape = (1,) + ids.shape;
    #pvals = numpy.concatenate((ids.T, pvals.T), axis = 1);
    
    if signed:
        return pvals, numpy.sign(tvals);
    else:
        return pvals; 
Example #23
Source File: data.py    From multipy with BSD 3-Clause "New" or "Revised" License
def two_group_model(N=25, m=1000, pi0=0.1, delta=0.7):
    """A two-group model for generating test data (described in [3] and
    elsewhere). The default input arguments can be used to reproduce the
    result reported by Reiss and colleagues in Figure 2.A.

    Input arguments:
    N     - Number of samples in each group or condition.
    m     - Number of variables
    pi0   - Proportion of null effects among the m variables.
    delta - Location parameter of the non-null part of the distribution of Y,
            which controls the effect size.

    Output arguments:
    tstat - Test statistics (Student's t's)
    pvals - P-values corresponding to the test statistics
    """
    X = normal(loc=0, scale=1, size=(N, m))
    Y = np.hstack([normal(loc=0, scale=1, size=(N, int(pi0*m))),
                   normal(loc=delta, scale=1, size=(N, int(round((1-pi0)*m, 1))))])
    # Two-sample t-test
    tstat, pvals = ttest_ind(X, Y, axis=0)
    return tstat, pvals 
Example #24
Source File: evaluation.py    From sktime with BSD 3-Clause "New" or "Revised" License
def t_test(self, metric_name=None):
        """
        Runs t-test on all possible combinations between the estimators.
        """
        self._check_is_evaluated()
        metric_name = self._validate_metric_name(metric_name)
        metrics_per_estimator_dataset = \
            self._get_metrics_per_estimator_dataset(
                metric_name)

        t_df = pd.DataFrame()
        perms = itertools.product(metrics_per_estimator_dataset.keys(),
                                  repeat=2)
        values = np.array([])
        for perm in perms:
            x = np.array(metrics_per_estimator_dataset[perm[0]])
            y = np.array(metrics_per_estimator_dataset[perm[1]])
            t_stat, p_val = ttest_ind(x, y)

            t_test = {
                "estimator_1": perm[0],
                "estimator_2": perm[1],
                "t_stat": t_stat,
                "p_val": p_val
            }

            t_df = t_df.append(t_test, ignore_index=True)
            values = np.append(values, t_stat)
            values = np.append(values, p_val)

        index = t_df["estimator_1"].unique()
        values_names = ["t_stat", "p_val"]
        col_idx = pd.MultiIndex.from_product([index, values_names])
        values_reshaped = values.reshape(len(index),
                                         len(values_names) * len(index))

        values_df_multiindex = pd.DataFrame(values_reshaped, index=index,
                                            columns=col_idx)

        return t_df, values_df_multiindex 
Example #25
Source File: evaluate.py    From lentil with Apache License 2.0
def compare_validation_aucs(self, model_a, model_b):
        """
        Use a two-sample t-test to check the statistical significance
        of the difference between the validation AUCs (across CV runs) of two models

        :param str model_a: The name of a model
        :param str model_b: The name of another model
        :rtype: float
        :return: p-value
        """

        return stats.ttest_ind(
                self.validation_aucs(model_a), self.validation_aucs(model_b), equal_var=True)[1] 
Example #26
Source File: continuous_mdr.py    From scikit-mdr with MIT License
def score(self, features, targets):
        """Estimates the quality of the ContinuousMDR model using a t-statistic.

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix to predict from
        targets: array-like {n_samples}
            List of true target values

        Returns
        -------
        quality_score: float
            The estimated quality of the Continuous MDR model

        """
        if self.feature_map is None:
            raise ValueError('The Continuous MDR model must be fit before score() can be called.')

        group_0_trait_values = []
        group_1_trait_values = []

        for feature_instance in self.feature_map:
            if self.feature_map[feature_instance] == 0:
                group_0_trait_values.extend(self.mdr_matrix_values[feature_instance])
            else:
                group_1_trait_values.extend(self.mdr_matrix_values[feature_instance])

        return abs(ttest_ind(group_0_trait_values, group_1_trait_values).statistic) 
Example #27
Source File: ptest.py    From pregel with MIT License
def _compare_gaussians(dist1, dist2):
    '''Method to compare samples from two Gaussian distributions to determine whether they are likely to be drawn
    from the same distribution.
    Here we assume that we already know that the two distributions are indeed Gaussian.
    Returns True if the two lists of samples are likely to be drawn from the same Gaussian.'''

    _, pvalue = ttest_ind(dist1, dist2, equal_var=False)

    if (pvalue > 0.05):
        # Likely to be drawn from same gaussian
        return True
    else:
        return False 
Example #28
Source File: filters.py    From causallib with Apache License 2.0
def compute_pvals(self, X, y):
        # TODO: export to stats_utils?
        is_y_binary = (len(np.unique(y)) == 2)
        # is_binary_feature = np.sum(((X != np.nanmin(X, axis=0)[np.newaxis, :]) &
        #                             (X != np.nanmax(X, axis=0)[np.newaxis, :])), axis=0) == 0
        is_binary_feature = areColumnsBinary(X)
        p_vals = np.zeros(X.shape[1])
        if is_y_binary:
            # Process non-binary columns:
            for i in np.where(~is_binary_feature)[0]:
                x0 = X.loc[y == 0, i]
                x1 = X.loc[y == 1, i]
                if self.is_linear:
                    _, p_vals[i] = stats.ttest_ind(x0, x1)
                else:
                    _, p_vals[i] = stats.ks_2samp(x0, x1)

            # Process binary features:
            _, p_vals[is_binary_feature] = feature_selection.chi2(X.loc[:, is_binary_feature], y)

        else:
            # Process non-binary features:
            _, p_vals[~is_binary_feature] = feature_selection.f_regression(X.loc[:, ~is_binary_feature], y)

            # Process binary features:
            y_mat = np.row_stack(y)
            for i in np.where(is_binary_feature)[0]:
                _, p_vals[i] = feature_selection.f_regression(y_mat, X.loc[:, i])
        return p_vals 
Example #29
Source File: test_single_external_libs.py    From diffxpy with BSD 3-Clause "New" or "Revised" License
def test_t_test_ref(self, n_cells: int = 2000, n_genes: int = 100):
        """
        Test if de.test.t_test() generates the same p-value distribution as scipy t-test.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        """
        logging.getLogger("tensorflow").setLevel(logging.ERROR)
        logging.getLogger("batchglm").setLevel(logging.WARNING)
        logging.getLogger("diffxpy").setLevel(logging.INFO)

        np.random.seed(1)
        sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes)
        test = de.test.t_test(
            data=sim.input_data,
            grouping="condition",
            sample_description=sim.sample_description
        )

        # Run scipy t-tests as a reference.
        conds = np.unique(sim.sample_description["condition"].values)
        ind_a = np.where(sim.sample_description["condition"] == conds[0])[0]
        ind_b = np.where(sim.sample_description["condition"] == conds[1])[0]
        scipy_pvals = stats.ttest_ind(
            a=sim.x[ind_a, :],
            b=sim.x[ind_b, :],
            axis=0,
            equal_var=False
        ).pvalue
        self._eval(test=test, ref_pvals=scipy_pvals)
        return True 
Example #30
Source File: calc_Utilities.py    From IceVarFigs with MIT License
def calc_indttest(varx,vary):
    """
    Function calculates statistical difference for 2 independent
    sample t-test

    Parameters
    ----------
    varx : 3d array
    vary : 3d array
    
    Returns
    -------
    stat = calculated t-statistic
    pvalue = two-tailed p-value

    Usage
    -----
    stat,pvalue = calc_indttest(varx,vary)
    """
    print('\n>>> Using calc_ttest function!')
    
    ### Import modules
    import numpy as np
    import scipy.stats as sts
    
    ### 2-independent sample t-test
    stat,pvalue = sts.ttest_ind(varx,vary,nan_policy='omit')
    
    ### Significant at 95% confidence level
    pvalue[np.where(pvalue >= 0.05)] = np.nan
    pvalue[np.where(pvalue < 0.05)] = 1.
    
    print('*Completed: Finished calc_ttest function!')
    return stat,pvalue

###############################################################################
###############################################################################
###############################################################################