Python scipy.stats.chi2_contingency() Examples

The following are 22 code examples of scipy.stats.chi2_contingency(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scipy.stats, or try the search function.
Example #1
Source File: DataSeparate.py    From FAE with GNU General Public License v3.0 6 votes vote down vote up
def _CompareCategoricalFeatures(self, array1, array2):
        df1 = pd.DataFrame(Counter(array1), index=[1])
        df2 = pd.DataFrame(Counter(array2), index=[2])
        df = pd.concat((df1, df2), axis=0)
        df = df.fillna(0)

        descrip1, descrip2 = df.iloc[0, :], df.iloc[1, :]
        descrip1 = ['{}: {}'.format(descrip1.index[x], descrip1.iloc[x]) for x in range(descrip1.size)]
        descrip2 = ['{}: {}'.format(descrip2.index[x], descrip2.iloc[x]) for x in range(descrip2.size)]

        description = {}
        _, description['p-value'], _, _ = chi2_contingency(df.values, correction=True)
        description['method'] = 'Chi-Square'
        description['description'] = [', '.join(descrip1),
                                      ', '.join(descrip2)]
        return description 
Example #2
Source File: continuous_variable.py    From intro_ds with Apache License 2.0 6 votes vote down vote up
def divideData(data, minValue, maxValue):
    """
    Try every integer cut point strictly between minValue and maxValue and
    return the one whose two-bin split of "hours_per_week" yields the
    largest chi-square statistic against "label".

    Returns (p-value of the best split, its chi-square statistic, the cut
    point). When no split produces a statistic above 0 the result is
    (0, 0, -1).
    """
    bestPValue, bestChi2, bestCut = 0, 0, -1
    hours = data["hours_per_week"]
    for cut in range(minValue + 1, maxValue):
        bins = pd.cut(hours, [minValue, cut, maxValue], include_lowest=True)
        result = scs.chi2_contingency(pd.crosstab(data["label"], bins))
        # Strict comparison: on ties the earliest cut point wins.
        if result[0] > bestChi2:
            bestPValue, bestChi2, bestCut = result[1], result[0], cut
    return bestPValue, bestChi2, bestCut
Example #3
Source File: predict_enriched.py    From PIDGINv3 with GNU General Public License v3.0 6 votes vote down vote up
def doHitProcess(inp):
	"""Compare the hit counts of one entry between two prediction files.

	inp is a tuple (idx, hits, n_f1_hits, n_f2_hits): hits[0] / hits[1] are
	the hit counts for file 1 / file 2, and n_f1_hits / n_f2_hits are the
	totals the hit counts are subtracted from to get the non-hit counts.

	Returns None when neither file has a hit. Otherwise returns
	(odds ratio, idx, hits1, ratio1, hits2, ratio2, risk ratio, p-value),
	where ratioN is hits / non-hits for file N and the p-value comes from
	chi2_contingency on the 2x2 hit/non-hit table. When neither file has a
	non-hit, the odds ratio and risk ratio are 1.0 and the p-value is 'NA'.
	"""
	idx, hits, n_f1_hits, n_f2_hits = inp
	p1_0, p1_1 = n_f1_hits-hits[0], hits[0]
	p2_0, p2_1 = n_f2_hits-hits[1], hits[1]
	#if no actives in either set return
	if p1_1 == 0 and p2_1 == 0:
		return
	#hit-to-non-hit ratio for file1 and file2; a zero non-hit count gives
	#inf (same fallback as the odds ratio below) instead of raising, which
	#previously made the "no inactives" branch unreachable
	#NOTE(review): the original comment called this a "percentage of hits",
	#but the denominator is the non-hit count, not the total - confirm intent
	pcp1_1 = float(p1_1)/float(p1_0) if p1_0 != 0 else np.inf
	pcp2_1 = float(p2_1)/float(p2_0) if p2_0 != 0 else np.inf
	#if no inactives in either set, report odds/risk ratio 1.0 and p-value 'NA'
	if p1_0 == 0 and p2_0 == 0:
		return 1.0, idx, p1_1, pcp1_1, p2_1, pcp2_1, 1.0, 'NA'
	pvalue = chi2_contingency([[p1_1,p1_0],[p2_1,p2_0]])[1]
	#calculate odds ratio
	try:
		odr = (float(p1_1)/float(p1_0))/(float(p2_1)/float(p2_0))
	except ZeroDivisionError:
		odr = np.inf
	#calculate risk ratio
	try:
		rr = (float(p1_1)/(float(p1_1)+float(p1_0)))/(float(p2_1)/(float(p2_1)+float(p2_0)))
	except ZeroDivisionError:
		rr = np.inf
	return odr, idx, p1_1, pcp1_1, p2_1, pcp2_1, rr, pvalue
	
#calculate the chi2 and odds ratio between pathway and disease predictions 
Example #4
Source File: test_ibmq_job.py    From qiskit-ibmq-provider with Apache License 2.0 5 votes vote down vote up
def test_run_simulator(self):
        """Run a Bell circuit and a two-qubit Hadamard circuit on the simulator
        backend and check the measured counts against the ideal distributions
        with a chi-square test (p-value must exceed 0.01)."""
        qreg = QuantumRegister(2, 'q')
        creg = ClassicalRegister(2, 'c')
        hadamard = QuantumCircuit(qreg, creg, name='hadamard')
        hadamard.h(qreg)
        hadamard.measure(qreg, creg)

        transpiled = transpile([ReferenceCircuits.bell(), hadamard], backend=self.sim_backend)
        qobj = assemble(transpiled, backend=self.sim_backend)
        shots = qobj.config.shots
        result = self.sim_backend.run(qobj, validate_qobj=True).result()

        observed_bell = result.get_counts(0)
        observed_hadamard = result.get_counts(1)
        ideal_bell = {'00': shots / 2, '11': shots / 2}
        ideal_hadamard = {'00': shots / 4, '11': shots / 4, '10': shots / 4, '01': shots / 4}
        states1 = observed_bell.keys() | ideal_bell.keys()
        states2 = observed_hadamard.keys() | ideal_hadamard.keys()

        def _contingency_table(observed, ideal, states):
            # Row 0: measured counts, row 1: ideal counts, one column per state.
            return numpy.array([[observed.get(key, 0) for key in states],
                                [ideal.get(key, 0) for key in states]])

        ctable1 = _contingency_table(observed_bell, ideal_bell, states1)
        ctable2 = _contingency_table(observed_hadamard, ideal_hadamard, states2)
        for label, value in (('states1', states1), ('states2', states2),
                             ('ctable1', ctable1), ('ctable2', ctable2)):
            self.log.info(label + ': %s', str(value))

        contingency1 = chi2_contingency(ctable1)
        contingency2 = chi2_contingency(ctable2)
        self.log.info('chi2_contingency1: %s', str(contingency1))
        self.log.info('chi2_contingency2: %s', str(contingency2))
        for outcome in (contingency1, contingency2):
            self.assertGreater(outcome[1], 0.01)
Example #5
Source File: TargetAnalysisCategorical.py    From exploripy with MIT License 5 votes vote down vote up
def ChiSquareOfDFCols(self, c1, c2):
		"""Chi-square test of independence between columns c1 and c2 of self.df.

		The (c1, c2) group sizes are pivoted into a contingency table whose
		missing combinations are filled with 0.

		Returns a two-element list: [chi-square statistic, p-value].
		"""
		table = self.df.groupby([c1, c2]).size().unstack(c1).fillna(0)
		statistic, p_value = chi2_contingency(table)[0:2]
		return [statistic, p_value]
Example #6
Source File: EDA.py    From exploripy with MIT License 5 votes vote down vote up
def ChiSquareOfDFCols(self, c1, c2):
		"""Chi-square test of independence between columns c1 and c2 of self.df.

		Builds the contingency table from the (c1, c2) group sizes and, when
		self.debug == 'YES', prints how long building it took.

		Returns a two-element list: [chi-square statistic, p-value].
		"""
		began = time.time()
		table = self.df.groupby([c1, c2]).size().unstack(c1)
		elapsed = time.time() - began
		if self.debug == 'YES':
			print('ChiSquareOfDFCols',elapsed)

		# Combinations that never occur show up as NaN after unstacking.
		statistic, p_value = chi2_contingency(table.fillna(0))[0:2]
		return [statistic, p_value]
Example #7
Source File: stats.py    From audit-ai with MIT License 5 votes vote down vote up
def chi2_test(labels, results, threshold=None):
    """
    Chi-square test of independence between group labels and binary
    decisions.

    The results are converted to booleans with `boolean_array`,
    cross-tabulated against the labels, and the resulting contingency table
    is passed to scipy.stats.chi2_contingency:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html

    Parameters
    ----------
    labels : array_like
        categorical labels for each corresponding value of `results`, e.g. M/F

    results : array_like
        binary decision values; if continuous values are supplied then
        `threshold` must also be supplied to generate binary decisions

    threshold : numeric
        value dividing scores into True/False, where result>=threshold == True

    Returns
    -------
    chi2_stat : float
        The test statistic.
    pvalue : float
        P-value, the probability of obtaining a distribution at least
        as extreme as the one that was actually observed, assuming that
        the null hypothesis is true.
    """

    check_consistent_length(labels, results)

    # Binarise first so the contingency table has one column per decision.
    decisions = boolean_array(np.array(results), threshold=threshold)
    table = crosstab(labels, decisions)

    statistic, p_value = chi2_contingency(table)[:2]
    return statistic, p_value
Example #8
Source File: stat_utils.py    From causallib with Apache License 2.0 5 votes vote down vote up
def chi2_test(X, y):
    """
    Run a chi-square test of independence between each binary feature and a
    binary response.

    Args:
        X (np.ndarray): Binary feature matrix
        y (np.ndarray): Binary response vector

    Returns:
        np.array: A vector of p-values, one for every feature. A feature
            that is constant (all zeros or all ones) gets a p-value of 1
            without running the test.
    """
    X0 = 1 - X
    if hasattr(y, "values"):
        y = y.values
    Y = y.reshape((-1, 1))
    # Columns of Y are the indicators for y == 0 and y == 1.
    Y = np.append(1 - Y, Y, axis=1)
    # Per-feature response counts where the feature is 1 / where it is 0.
    Tbl1 = np.dot(Y.T, X)
    Tbl0 = np.dot(Y.T, X0)

    m = X.shape[1]
    # np.NaN was removed in NumPy 2.0; np.nan works on every version.
    pvals = np.full(m, np.nan)
    for i in range(m):
        if np.all(Tbl1[:, i] == 0) or np.all(Tbl0[:, i] == 0):
            # An all-zero row would make chi2_contingency raise.
            pvals[i] = 1
        else:
            r = stats.chi2_contingency([Tbl0[:, i], Tbl1[:, i]], correction=True)
            pvals[i] = r[1]
    return pvals
Example #9
Source File: questionnaire.py    From reportgen with MIT License 5 votes vote down vote up
def chi2_test(fo,alpha=0.05):
    """Chi-square test of independence on the observed frequency table `fo`.

    Returns a tuple (significant, p_value), where `significant` is whether
    p_value <= alpha.
    """
    import scipy.stats as stats
    observed = pd.DataFrame(fo)
    p_value = stats.chi2_contingency(observed=observed)[1]
    # Testing p_value against alpha is equivalent to comparing the observed
    # chi-square statistic with the critical value
    # stats.chi2.ppf(q=1-alpha, df=<degrees of freedom>).
    return (p_value <= alpha, p_value)
Example #10
Source File: preprocessing.py    From reportgen with MIT License 5 votes vote down vote up
def _chisqure_fo(fo):
    if any(fo==0):
        fo=fo+1
    s=stats.chi2_contingency(fo)
    return s[0],s[1] 
Example #11
Source File: metrics.py    From reportgen with MIT License 5 votes vote down vote up
def chi2(X,y):
    '''Compute the chi-square statistic of each feature against the target.

    Author's note: meant to make up for sklearn's chi2 only supporting
    2*2 tables.

    parameter
    ----------
    X: a single feature or a set of features
    y: the target variable

    return
    ------
    chi2_value: np.array
    chi2_pvalue: np.array
    '''
    features = np.asarray(X)
    if features.ndim == 1:
        # A single feature becomes a one-column matrix.
        features = features[:, np.newaxis]
    features = pd.DataFrame(features)
    outcomes = [
        stats.chi2_contingency(pd.crosstab(features[column], y))
        for column in features.columns
    ]
    chi2_value = np.array([outcome[0] for outcome in outcomes])
    chi2_pvalue = np.array([outcome[1] for outcome in outcomes])
    return (chi2_value, chi2_pvalue)



# TBD (to be decided)
Example #12
Source File: hypothesis_test.py    From fairtest with Apache License 2.0 5 votes vote down vote up
def g_test(data, correction=False):
    """
    G-test (likelihood ratio test).

    Rows and columns that contain only zeros are dropped before testing.

    Parameters
    ----------
    data :
        the contingency table (a DataFrame is reduced to its values)

    correction :
        whether to apply continuity corrections

    Returns
    -------
    g :
        the test statistic
    p :
        the p-value
    df:
        the number of degrees of freedom
    expected:
        the expected frequencies

    When nothing is left after dropping the zero rows/columns, the result
    is (0, 1.0, 1, None).

    References
    ----------
    https://en.wikipedia.org/wiki/G-test
    """
    table = data.values if isinstance(data, pd.DataFrame) else data

    # Keep only the rows, then the columns, with at least one non-zero cell.
    nonzero_rows = np.any(table != 0, axis=1)
    table = table[nonzero_rows]
    nonzero_cols = np.any(table != 0, axis=0)
    table = table[:, nonzero_cols]

    if table.sum() == 0:
        return 0, 1.0, 1, None

    return stats.chi2_contingency(table, correction=correction,
                                  lambda_="log-likelihood")
Example #13
Source File: test_morestats.py    From GraphicDesignPatternByPython with MIT License 5 votes vote down vote up
def test_basic(self):
        # median_test calls chi2_contingency to compute the test statistic
        # and p-value.  Make sure it hasn't screwed up the call...

        x = [1, 2, 3, 4, 5]
        y = [2, 4, 6, 8]

        stat, p, m, tbl = stats.median_test(x, y)
        assert_equal(m, 4)
        assert_equal(tbl, [[1, 2], [4, 2]])

        exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl)
        assert_allclose(stat, exp_stat)
        assert_allclose(p, exp_p)

        stat, p, m, tbl = stats.median_test(x, y, lambda_=0)
        assert_equal(m, 4)
        assert_equal(tbl, [[1, 2], [4, 2]])

        exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, lambda_=0)
        assert_allclose(stat, exp_stat)
        assert_allclose(p, exp_p)

        stat, p, m, tbl = stats.median_test(x, y, correction=False)
        assert_equal(m, 4)
        assert_equal(tbl, [[1, 2], [4, 2]])

        exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, correction=False)
        assert_allclose(stat, exp_stat)
        assert_allclose(p, exp_p) 
Example #14
Source File: predict_enriched_decision_tree.py    From PIDGINv2 with MIT License 5 votes vote down vote up
def doHitProcess(inp):
	"""Compute the enrichment of one entry between two prediction sets.

	inp is a tuple (idx, hits, n_f1_hits, n_f2_hits): hits[0] / hits[1] are
	the hit counts and n_f1_hits / n_f2_hits the totals of set 1 / set 2.

	Returns None when neither set has a hit, otherwise the tuple
	(idx, ratio, hits1, fraction1, hits2, fraction2, chi2, p-value) where
	ratio is fraction2 / fraction1 rounded to 3 decimals. When only one set
	has a hit the ratio is 999.0 (no hits in set 1) or 0.0 (no hits in
	set 2) and both chi2 and p-value are 'NA'.
	"""
	idx, hits, n_f1_hits, n_f2_hits = inp
	hits1, hits2 = hits[0], hits[1]
	if hits1 == 0 and hits2 == 0:
		return None
	if hits1 == 0:
		return idx, 999.0, 0, 0, hits2, float(hits2)/float(n_f2_hits), 'NA', 'NA'
	if hits2 == 0:
		return idx, 0.0, hits1, float(hits1)/float(n_f1_hits), 0, 0, 'NA', 'NA'
	fraction1 = float(hits1)/float(n_f1_hits)
	fraction2 = float(hits2)/float(n_f2_hits)
	table = [[hits2, n_f2_hits-hits2], [hits1, n_f1_hits-hits1]]
	test = stats.chi2_contingency(table)
	return idx, round(fraction2/fraction1, 3), hits1, fraction1, hits2, fraction2, test[0], test[1]

#calculate the enrichment ratio between predictions 
Example #15
Source File: predict_enriched_two_libraries_decision_tree.py    From PIDGINv2 with MIT License 5 votes vote down vote up
def doHitProcess(inp):
	"""Compute the enrichment of one entry between two libraries.

	inp is a tuple (idx, hits, n_f1_hits, n_f2_hits): hits[0] / hits[1] are
	the hit counts and n_f1_hits / n_f2_hits the totals of library 1 / 2.

	Returns None when neither library has a hit, otherwise the tuple
	(idx, ratio, hits1, fraction1, hits2, fraction2, chi2, p-value) where
	ratio is fraction2 / fraction1 rounded to 3 decimals. With hits in only
	one library the ratio is the sentinel 999.0 (none in library 1) or 0.0
	(none in library 2) and chi2 / p-value are both 'NA'.
	"""
	idx, hits, n_f1_hits, n_f2_hits = inp
	first, second = hits[0], hits[1]
	if first == 0:
		if second == 0:
			return None
		return idx, 999.0, 0, 0, second, float(second)/float(n_f2_hits), 'NA', 'NA'
	if second == 0:
		return idx, 0.0, first, float(first)/float(n_f1_hits), 0, 0, 'NA', 'NA'
	h1_frac = float(first)/float(n_f1_hits)
	h2_frac = float(second)/float(n_f2_hits)
	# Rows: library 2, then library 1; columns: hits, non-hits.
	observed = [[second, n_f2_hits-second], [first, n_f1_hits-first]]
	chi, pvalue = stats.chi2_contingency(observed)[:2]
	enrichment = round(h2_frac/h1_frac, 3)
	return idx, enrichment, first, h1_frac, second, h2_frac, chi, pvalue

#calculate the enrichment ratio between predictions 
Example #16
Source File: chicDifferentialTest.py    From HiCExplorer with GNU General Public License v3.0 5 votes vote down vote up
def chisquare_test(pDataFile1, pDataFile2, pAlpha):
    """Run a chi-square test on every pair of rows from the two inputs.

    Each pair (group1, group2) is tested with chi2_contingency without
    continuity correction. H0 is rejected when the statistic reaches the
    critical value of the chi-square distribution with one degree of
    freedom at level pAlpha.

    Returns (test_result, accepted, rejected):
    test_result holds one p-value per pair (NaN when the test raised
    ValueError); accepted and rejected hold [pair index, p-value] entries.
    Pairs that could not be tested are listed as accepted with p-value 1.0.
    """
    p_values = []
    accepted = []
    rejected = []
    # Find the critical value for alpha confidence level
    critical_value = stats.chi2.ppf(q=1 - pAlpha, df=1)
    untested = 0
    for i, (group1, group2) in enumerate(zip(pDataFile1, pDataFile2)):
        try:
            statistic, p_value = stats.chi2_contingency(
                [group1, group2], correction=False)[:2]
        except ValueError:
            untested += 1
            p_values.append(np.nan)
            accepted.append([i, 1.0])
            continue

        p_values.append(p_value)
        outcome = rejected if statistic >= critical_value else accepted
        outcome.append([i, p_value])

    if untested > 0:
        log.info('{} samples were not tested because at least one condition contained no data in both groups.'.format(
            untested))
    return p_values, accepted, rejected
Example #17
Source File: eda.py    From xam with MIT License 5 votes vote down vote up
def cramers_v_corrected_stat(confusion_matrix):
    """Calculate the bias-corrected Cramér's V statistic for
    categorical-categorical association.

    Uses the correction from Wicher Bergsma, Journal of the Korean
    Statistical Society 42 (2013): 323-328.
    """
    statistic = stats.chi2_contingency(confusion_matrix)[0]
    total = confusion_matrix.sum()
    n_rows, n_cols = confusion_matrix.shape

    # Bias-corrected phi squared, clipped at zero.
    phi2_corr = max(0, statistic / total - ((n_cols-1)*(n_rows-1)) / (total-1))
    # Bias-corrected table dimensions.
    rows_corr = n_rows - ((n_rows-1)**2) / (total-1)
    cols_corr = n_cols - ((n_cols-1)**2) / (total-1)

    denominator = min((rows_corr-1), (cols_corr-1))
    return math.sqrt(phi2_corr / denominator)
Example #18
Source File: eda.py    From xam with MIT License 5 votes vote down vote up
def cramers_v_stat(confusion_matrix):
    """Calculate Cramér's V statistic for categorical-categorical association."""
    statistic = stats.chi2_contingency(confusion_matrix)[0]
    total = confusion_matrix.sum()
    n_rows, n_cols = confusion_matrix.shape
    smaller_dim = min((n_rows-1), (n_cols-1))
    return math.sqrt((statistic / total) / smaller_dim)
Example #19
Source File: test_contingency_tables.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def test_chi2_association():
    """Table.test_nominal_association must agree with scipy's
    chi2_contingency on a random 4x4 table."""
    from scipy.stats import chi2_contingency

    np.random.seed(8743)
    counts = np.random.randint(10, 30, size=(4, 4))

    expected = chi2_contingency(counts)
    actual = ctab.Table(counts).test_nominal_association()

    assert_allclose(actual.statistic, expected[0])
    assert_allclose(actual.pvalue, expected[1])
Example #20
Source File: nominal.py    From dython with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def cramers_v(x,
              y,
              bias_correction=True,
              nan_strategy=_REPLACE,
              nan_replace_value=_DEFAULT_REPLACE_VALUE):
    """
    Compute Cramer's V, a measure of association between two categorical
    variables. The coefficient is symmetric: V(x,y) = V(y,x)

    Original function taken from: https://stackoverflow.com/a/46498792/5863503
    Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    bias_correction : Boolean, default = True
        Apply the bias correction from Wicher Bergsma,
        Journal of the Korean Statistical Society 42 (2013): 323-328.
    nan_strategy : string, default = 'replace'
        How to handle missing values: either 'drop' to remove samples
        with missing values, or 'replace' to replace all missing values with
        the nan_replace_value. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        The value that replaces missing values. Only applicable when
        nan_strategy is set to 'replace'.

    Returns:
    --------
    float in the range of [0,1]; np.nan (with a RuntimeWarning) when the
    bias-corrected denominator is zero.
    """
    if nan_strategy == _REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == _DROP:
        x, y = remove_incomplete_samples(x, y)

    table = pd.crosstab(x, y)
    statistic = ss.chi2_contingency(table)[0]
    total = table.sum().sum()
    phi2 = statistic / total
    n_rows, n_cols = table.shape

    if not bias_correction:
        return np.sqrt(phi2 / min(n_cols - 1, n_rows - 1))

    phi2_corrected = max(0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (total - 1))
    rows_corrected = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    cols_corrected = n_cols - ((n_cols - 1) ** 2) / (total - 1)
    denominator = min((cols_corrected - 1), (rows_corrected - 1))
    if denominator == 0:
        warnings.warn(
            "Unable to calculate Cramer's V using bias correction. Consider using bias_correction=False",
            RuntimeWarning)
        return np.nan
    return np.sqrt(phi2_corrected / denominator)
Example #21
Source File: preprocessing.py    From reportgen with MIT License 4 votes vote down vote up
def chimerge(x,y,max_intervals=30,threshold=5,sample=None):
    '''Chi-square binning (ChiMerge).

    Starts from one interval per distinct value of x and repeatedly merges
    the adjacent intervals whose class distributions are the most similar
    (smallest chi-square statistic), until at most max_intervals intervals
    remain or the smallest statistic exceeds threshold.

    parameter
    ---------
    x: {array-like}, shape [n_samples, 1]
    y: target, cannot contain nan
    max_intervals: maximum number of intervals
    threshold: chi-square threshold (between two adjacent intervals)
    sample: int, when the number of samples is too large, draw a random
        sample of the data first

    return
    ------
    bins: list of bin edges; the first edge is based on min(x) and the
        last on max(x)

    '''
    
    x=pd.Series(x)
    y=pd.Series(y)
    # Distinct non-null classes of the target.
    class_y=list(pd.unique(y[pd.notnull(y)]))
    value_max=x.max()
    #value_max=np.sort(x)[-1]
    value_min=x.min()
    # Random sampling; redraw until the sampled y contains every class in class_y.
    # Each element is kept independently with probability sample/len(x), so
    # the sample size is only approximately `sample`.
    if isinstance(sample,int):
        sample=min(sample,len(x))
        tmp=set()
        while tmp!=set(class_y):
            cc=np.random.choice([True,False],size=len(x),p=[sample/len(x),1-sample/len(x)])
            tmp=set(np.unique(y[cc]))
        x=x[cc]
        y=y[cc]
    fo=pd.crosstab(x,y)# contingency table: one row per distinct x value
    fo=fo.sort_index()
   
    while fo.shape[0] > max_intervals:
        # Maps each chi-square value to the adjacent row pairs that produced it.
        chitest={}
        index=list(fo.index)
        for r in range(len(fo)-1):
            #chi2,_=stats.chi2_contingency(fo.iloc[[r,r+1],:])
            chi2,_=_chisqure_fo(fo.iloc[[r,r+1],:])
            if chi2 not in chitest:
                chitest[chi2]=[]
            chitest[chi2].append((r,r+1))
        smallest = min(chitest.keys())
        if smallest <= threshold:
            #print('smallest chi2 value: {}'.format(smallest))
            #print([(index[r[0]],index[r[1]]) for r in list(reversed(chitest[smallest]))])
            # Merge every pair sharing the smallest statistic: add the upper row
            # to the lower one, then drop the upper row. Rows are addressed by
            # the labels captured in `index` before any drop; pairs are walked
            # from last to first, presumably so chained pairs collapse correctly.
            for (lower,upper) in list(reversed(chitest[smallest])):
                fo.loc[index[lower],:]=fo.loc[index[lower],:]+fo.loc[index[upper],:]
                fo = fo.drop(index[upper],axis=0)
                #print('dropped {}'.format(index[upper]))
        else:
            break
    # Remaining row labels are the lower edges; close the last bin with max(x).
    bins=list(fo.index)+[value_max]
    bins[0]=value_min
    # If the bins are numeric, widen the leftmost and rightmost edges by 1% so
    # that the minimum and maximum values are included.
    # NOTE(review): bins[-1]*1.01 moves the upper edge down when it is negative - confirm.
    if np.issubdtype(type(bins[0]),np.number):
        bins[0]=bins[0]*0.99 if bins[0]>0 else bins[0]-0.01
        bins[-1]=bins[-1]*1.01
    return bins 
Example #22
Source File: eda.py    From xam with MIT License 4 votes vote down vote up
def feature_importance_classification(features, target, n_neighbors=3, random_state=None):
    """Score every feature against a categorical target.

    Floating-point columns are treated as continuous, integer and boolean
    columns as discrete; columns of any other dtype are ignored.

    Parameters
    ----------
    features : pandas.DataFrame
        The candidate features.
    target : array-like
        The class labels, one per row of `features`.
    n_neighbors : int
        Forwarded to `mutual_info_classif`.
    random_state : optional
        Forwarded to `mutual_info_classif`.

    Returns
    -------
    cont_imp : pandas.DataFrame
        One row per continuous feature with the columns `f_statistic`,
        `f_p_value` and `mutual_information`.
    disc_imp : pandas.DataFrame
        One row per discrete feature with the columns `chi2_statistic`,
        `chi2_p_value`, `cramers_v` and `mutual_information`.

    A frame whose feature group is empty has no columns.
    """

    cont = features.select_dtypes(include=[np.floating])
    # np.bool was removed in NumPy 1.24; np.bool_ selects the same columns.
    disc = features.select_dtypes(include=[np.integer, np.bool_])

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:

        # F-test
        f_test = feature_selection.f_classif(cont, target)
        cont_imp['f_statistic'] = f_test[0]
        cont_imp['f_p_value'] = f_test[1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(cont, target, discrete_features=False,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:

        # Chi²-test
        chi2_tests = defaultdict(dict)

        for feature in disc.columns:
            # Named `table` so it no longer shadows the continuous frame `cont`.
            table = pd.crosstab(disc[feature], target)
            statistic, p_value = stats.chi2_contingency(table)[:2]
            chi2_tests[feature]['chi2_statistic'] = statistic
            chi2_tests[feature]['chi2_p_value'] = p_value

        # Assigning from a frame aligns the results on the feature names.
        chi2_tests_df = pd.DataFrame.from_dict(chi2_tests, orient='index')
        disc_imp['chi2_statistic'] = chi2_tests_df['chi2_statistic']
        disc_imp['chi2_p_value'] = chi2_tests_df['chi2_p_value']

        # Cramér's V (corrected)
        # DataFrame.iteritems was removed in pandas 2.0; items() is the replacement.
        disc_imp['cramers_v'] = [
            cramers_v_corrected_stat(pd.crosstab(column, target).values)
            for _, column in disc.items()
        ]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(disc, target, discrete_features=True,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp