Python Examples of scipy.stats.chi2

Source File: DataSeparate.py From FAE with GNU General Public License v3.0

6 votes

def _CompareCategoricalFeatures(self, array1, array2):
        df1 = pd.DataFrame(Counter(array1), index=[1])
        df2 = pd.DataFrame(Counter(array2), index=[2])
        df = pd.concat((df1, df2), axis=0)
        df = df.fillna(0)

        descrip1, descrip2 = df.iloc[0, :], df.iloc[1, :]
        descrip1 = ['{}: {}'.format(descrip1.index[x], descrip1.iloc[x]) for x in range(descrip1.size)]
        descrip2 = ['{}: {}'.format(descrip2.index[x], descrip2.iloc[x]) for x in range(descrip2.size)]

        description = {}
        _, description['p-value'], _, _ = chi2_contingency(df.values, correction=True)
        description['method'] = 'Chi-Square'
        description['description'] = [', '.join(descrip1),
                                      ', '.join(descrip2)]
        return description

Source File: continuous_variable.py From intro_ds with Apache License 2.0

6 votes

def divideData(data, minValue, maxValue):
    """
    Try every candidate split point and keep the one with the highest
    chi-square statistic.

    For each integer ``i`` strictly between ``minValue`` and ``maxValue`` the
    "hours_per_week" column is cut into the two intervals
    ``[minValue, i]`` and ``(i, maxValue]`` and cross-tabulated against the
    "label" column.

    Returns a tuple ``(p_value, chi2, split)`` for the best split; when no
    split yields a statistic above zero the result is ``(0, 0, -1)``.
    """
    best = (0, 0, -1)  # (p-value, chi2 statistic, split point)
    for split in range(minValue + 1, maxValue):
        binned = pd.cut(data["hours_per_week"], [minValue, split, maxValue],
                        include_lowest=True)
        table = pd.crosstab(data["label"], binned)
        statistic, p_value = scs.chi2_contingency(table)[:2]
        # Strict comparison: on ties the earliest split point wins.
        if statistic > best[1]:
            best = (p_value, statistic, split)
    return best

Source File: predict_enriched.py From PIDGINv3 with GNU General Public License v3.0

6 votes

def doHitProcess(inp):
    """Compare the hit counts of two prediction sets for one target.

    Parameters
    ----------
    inp : tuple
        ``(idx, hits, n_f1_hits, n_f2_hits)`` where ``hits[0]`` / ``hits[1]``
        are the number of actives in file 1 / file 2 and ``n_f1_hits`` /
        ``n_f2_hits`` are the corresponding set sizes.

    Returns
    -------
    None
        When neither set contains an active.
    tuple
        ``(odds_ratio, idx, actives1, fraction1, actives2, fraction2,
        risk_ratio, pvalue)``. When neither set contains an inactive the
        ratios are 1.0 and ``pvalue`` is the string ``'NA'``. A ratio whose
        denominator is zero is reported as ``np.inf``.

    Notes
    -----
    The hit fraction is ``actives / set size``. It was previously divided by
    the number of inactives, which is not a percentage and raised
    ZeroDivisionError before the "no inactives" case could be reached.
    ``chi2_contingency`` may still raise ValueError for tables with a zero
    expected frequency (for example an empty set).
    """
    idx, hits, n_f1_hits, n_f2_hits = inp
    p1_1, p2_1 = hits[0], hits[1]
    p1_0, p2_0 = n_f1_hits - p1_1, n_f2_hits - p2_1
    # if no actives in either set return
    if p1_1 == 0 and p2_1 == 0:
        return None
    # fraction of hits for file1 and file2 (0.0 for an empty set)
    pcp1_1 = float(p1_1) / float(n_f1_hits) if n_f1_hits else 0.0
    pcp2_1 = float(p2_1) / float(n_f2_hits) if n_f2_hits else 0.0
    # if no inactives in either set, report neutral ratios and no p-value
    if p1_0 == 0 and p2_0 == 0:
        return 1.0, idx, p1_1, pcp1_1, p2_1, pcp2_1, 1.0, 'NA'
    _, pvalue = chi2_contingency([[p1_1, p1_0], [p2_1, p2_0]])[:2]
    # calculate odds ratio
    try:
        odr = (float(p1_1) / float(p1_0)) / (float(p2_1) / float(p2_0))
    except ZeroDivisionError:
        odr = np.inf
    # calculate risk ratio
    try:
        rr = (float(p1_1) / (float(p1_1) + float(p1_0))) / (float(p2_1) / (float(p2_1) + float(p2_0)))
    except ZeroDivisionError:
        rr = np.inf
    return odr, idx, p1_1, pcp1_1, p2_1, pcp2_1, rr, pvalue
	
#calculate the chi2 and odds ratio between pathway and disease predictions

Source File: test_ibmq_job.py From qiskit-ibmq-provider with Apache License 2.0

5 votes

def test_run_simulator(self):
    """Test running in a simulator."""
    qreg = QuantumRegister(2, 'q')
    creg = ClassicalRegister(2, 'c')
    hadamard = QuantumCircuit(qreg, creg, name='hadamard')
    hadamard.h(qreg)
    hadamard.measure(qreg, creg)

    transpiled = transpile([ReferenceCircuits.bell(), hadamard],
                           backend=self.sim_backend)
    qobj = assemble(transpiled, backend=self.sim_backend)
    shots = qobj.config.shots
    result = self.sim_backend.run(qobj, validate_qobj=True).result()

    counts_qx1 = result.get_counts(0)
    counts_qx2 = result.get_counts(1)
    # Reference distributions the measured counts are compared against.
    half, quarter = shots / 2, shots / 4
    counts_ex1 = {'00': half, '11': half}
    counts_ex2 = {'00': quarter, '11': quarter, '10': quarter, '01': quarter}

    def _contingency_table(observed, expected, states):
        # Row 0: measured counts, row 1: reference counts, one column per state.
        return numpy.array([[observed.get(key, 0) for key in states],
                            [expected.get(key, 0) for key in states]])

    states1 = counts_qx1.keys() | counts_ex1.keys()
    states2 = counts_qx2.keys() | counts_ex2.keys()
    ctable1 = _contingency_table(counts_qx1, counts_ex1, states1)
    ctable2 = _contingency_table(counts_qx2, counts_ex2, states2)
    self.log.info('states1: %s', str(states1))
    self.log.info('states2: %s', str(states2))
    self.log.info('ctable1: %s', str(ctable1))
    self.log.info('ctable2: %s', str(ctable2))

    contingency1 = chi2_contingency(ctable1)
    contingency2 = chi2_contingency(ctable2)
    self.log.info('chi2_contingency1: %s', str(contingency1))
    self.log.info('chi2_contingency2: %s', str(contingency2))
    # Element 1 of the chi2_contingency result is the p-value.
    self.assertGreater(contingency1[1], 0.01)
    self.assertGreater(contingency2[1], 0.01)

Source File: TargetAnalysisCategorical.py From exploripy with MIT License

5 votes

def ChiSquareOfDFCols(self, c1, c2):
    """Run a chi-square test of independence between two columns of ``self.df``.

    The rows are grouped by ``(c1, c2)``; group sizes form the contingency
    table (``c1`` values as columns) and combinations that never occur count
    as zero. Returns ``[chi2_statistic, p_value]``.
    """
    contingency = self.df.groupby([c1, c2]).size().unstack(c1).fillna(0)
    statistic, p_value = chi2_contingency(contingency)[:2]
    return [statistic, p_value]

Source File: EDA.py From exploripy with MIT License

5 votes

def ChiSquareOfDFCols(self, c1, c2):
    """Run a chi-square test of independence between two columns of ``self.df``.

    The contingency table is built from the sizes of the ``(c1, c2)`` groups
    (``c1`` values as columns, missing combinations counted as zero). When
    ``self.debug == 'YES'`` the time spent building the table is printed.
    Returns ``[chi2_statistic, p_value]``.
    """
    started = time.time()
    ctsum = self.df.groupby([c1, c2]).size().unstack(c1)
    elapsed = time.time() - started
    if self.debug == 'YES':
        # Only the group-by / unstack step is timed, not the test itself.
        print('ChiSquareOfDFCols', elapsed)

    statistic, p_value = chi2_contingency(ctsum.fillna(0))[:2]
    return [statistic, p_value]

Source File: stats.py From audit-ai with MIT License

5 votes

def chi2_test(labels, results, threshold=None):
    """
    Chi-square test of independence between group labels and decisions.

    Takes a list of labels and results and returns the chi-square statistic
    and p-value computed by scipy.stats.chi2_contingency on an Rx2
    contingency table.
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html

    Parameters
    ----------
    labels : array_like
        categorical labels for each corresponding value of `result` ie. M/F

    results : array_like
        binary decision values, if continuous values are supplied then
        the `threshold` must also be supplied to generate binary decisions

    threshold : numeric
        value dividing scores into True/False, where result>=threshold == True

    Returns
    -------
    chi2_stat : float
        The test statistic.
    pvalue : float
        P-value, the probability of obtaining a distribution at least
        as extreme as the one that was actually observed, assuming that
        the null hypothesis is true.
    """
    check_consistent_length(labels, results)

    # Turn the raw results into True/False decisions before tabulating.
    decisions = boolean_array(np.array(results), threshold=threshold)
    contingency = crosstab(labels, decisions)

    outcome = chi2_contingency(contingency)
    return outcome[0], outcome[1]

Source File: stat_utils.py From causallib with Apache License 2.0

5 votes

def chi2_test(X, y):
    """Chi-square test of every binary feature against a binary response.

    Args:
        X (np.ndarray): Binary feature matrix
        y (np.ndarray): Binary response vector (objects exposing ``.values``,
            such as a pandas Series, are unwrapped first)

    Returns:
        np.array: A vector of p-values, one for every feature. A feature that
            is constant (all zeros or all ones) gets a p-value of 1.
    """
    X0 = 1 - X
    if hasattr(y, "values"):
        y = y.values
    # Two indicator columns: [y == 0, y == 1].
    Y = y.reshape((-1, 1))
    Y = np.append(1 - Y, Y, axis=1)
    # Column i of Tbl1 / Tbl0 counts, per response class, the samples whose
    # feature i is 1 / 0.
    Tbl1 = np.dot(Y.T, X)
    Tbl0 = np.dot(Y.T, X0)

    m = X.shape[1]
    # np.NaN was removed in NumPy 2.0; np.nan works on every version.
    pvals = np.full(m, np.nan)
    for i in range(m):
        if np.all(Tbl1[:, i] == 0) or np.all(Tbl0[:, i] == 0):
            # Constant feature: the test is undefined, report "no evidence".
            pvals[i] = 1
        else:
            r = stats.chi2_contingency([Tbl0[:, i], Tbl1[:, i]], correction=True)
            pvals[i] = r[1]
    return pvals

Source File: questionnaire.py From reportgen with MIT License

5 votes

def chi2_test(fo,alpha=0.05):
    """Chi-square test of independence on an observed frequency table.

    ``fo`` is converted to a DataFrame and passed to
    ``scipy.stats.chi2_contingency``. Returns ``(significant, p_value)``
    where ``significant`` is ``p_value <= alpha``.
    """
    import scipy.stats as stats
    observed = pd.DataFrame(fo)
    p_value = stats.chi2_contingency(observed=observed)[1]
    # p <= alpha is equivalent to the observed chi-square statistic reaching
    # the critical value stats.chi2.ppf(q=1-alpha, df=dof).
    return (p_value <= alpha, p_value)

Source File: preprocessing.py From reportgen with MIT License

5 votes

def _chisqure_fo(fo):
    if any(fo==0):
        fo=fo+1
    s=stats.chi2_contingency(fo)
    return s[0],s[1]

Source File: metrics.py From reportgen with MIT License

5 votes

def chi2(X,y):
    '''Compute chi-square statistics of one or more features against a target.

    Each feature column is cross-tabulated against ``y`` and passed to
    ``scipy.stats.chi2_contingency``. (Original author's note: written to
    make up for sklearn's chi2 only supporting 2*2 tables.)

    parameter
    ----------
    X: a single feature (1-D) or a group of features (one per column)
    y: target variable

    return
    ------
    chi2_value: np.array of test statistics, one per feature
    chi2_pvalue: np.array of p-values, one per feature
    '''
    features = np.asarray(X)
    if features.ndim == 1:
        # A single feature becomes a one-column matrix.
        features = features.reshape((len(features), 1))
    frame = pd.DataFrame(features)
    outcomes = [stats.chi2_contingency(pd.crosstab(frame[col], y))
                for col in frame.columns]
    chi2_value = np.array([outcome[0] for outcome in outcomes])
    chi2_pvalue = np.array([outcome[1] for outcome in outcomes])
    return (chi2_value, chi2_pvalue)



# To be determined

Source File: hypothesis_test.py From fairtest with Apache License 2.0

5 votes

def g_test(data, correction=False):
    """
    G-test (likelihood ratio test).

    Rows and columns that contain only zeros are dropped before testing.
    If nothing is left (the table sums to zero) the fixed result
    ``(0, 1.0, 1, None)`` is returned without running a test.

    Parameters
    ----------
    data :
        the contingency table (a DataFrame is reduced to its values)

    correction :
        whether to apply continuity corrections

    Returns
    -------
    g :
        the test statistic
    p :
        the p-value
    df:
        the number of degrees of freedom
    expected:
        the expected frequencies

    References
    ----------
    https://en.wikipedia.org/wiki/G-test
    """
    table = data.values if isinstance(data, pd.DataFrame) else data

    # Keep only rows, then only columns, with at least one non-zero cell.
    table = table[np.any(table != 0, axis=1)]
    table = table[:, np.any(table != 0, axis=0)]

    if table.sum() == 0:
        return 0, 1.0, 1, None

    # lambda_="log-likelihood" turns chi2_contingency into the G-test.
    return stats.chi2_contingency(table, correction=correction,
                                  lambda_="log-likelihood")

Source File: test_morestats.py From GraphicDesignPatternByPython with MIT License

5 votes

def test_basic(self):
        # median_test calls chi2_contingency to compute the test statistic
        # and p-value.  Make sure it hasn't screwed up the call...

        x = [1, 2, 3, 4, 5]
        y = [2, 4, 6, 8]

        stat, p, m, tbl = stats.median_test(x, y)
        assert_equal(m, 4)
        assert_equal(tbl, [[1, 2], [4, 2]])

        exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl)
        assert_allclose(stat, exp_stat)
        assert_allclose(p, exp_p)

        stat, p, m, tbl = stats.median_test(x, y, lambda_=0)
        assert_equal(m, 4)
        assert_equal(tbl, [[1, 2], [4, 2]])

        exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, lambda_=0)
        assert_allclose(stat, exp_stat)
        assert_allclose(p, exp_p)

        stat, p, m, tbl = stats.median_test(x, y, correction=False)
        assert_equal(m, 4)
        assert_equal(tbl, [[1, 2], [4, 2]])

        exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, correction=False)
        assert_allclose(stat, exp_stat)
        assert_allclose(p, exp_p)

Source File: predict_enriched_decision_tree.py From PIDGINv2 with MIT License

5 votes

def doHitProcess(inp):
    """Compute the enrichment of file-2 hits over file-1 hits for one target.

    ``inp`` is ``(idx, hits, n_f1_hits, n_f2_hits)`` with ``hits[0]`` /
    ``hits[1]`` the hit counts in file 1 / file 2 and ``n_f1_hits`` /
    ``n_f2_hits`` the totals they are divided by.

    Returns None when both hit counts are zero, otherwise the tuple
    ``(idx, ratio, hits1, fraction1, hits2, fraction2, chi2, pvalue)``.
    With no file-1 hits the ratio is 999.0, with no file-2 hits it is 0.0;
    in both cases chi2 and pvalue are the string 'NA'.
    """
    idx, hits, n_f1_hits, n_f2_hits = inp
    f1_hits, f2_hits = hits[0], hits[1]
    if f1_hits == 0:
        if f2_hits == 0:
            return None
        return idx, 999.0, 0, 0, f2_hits, float(f2_hits)/float(n_f2_hits), 'NA', 'NA'
    if f2_hits == 0:
        return idx, 0.0, f1_hits, float(f1_hits)/float(n_f1_hits), 0, 0, 'NA', 'NA'

    h1_p = float(f1_hits)/float(n_f1_hits)
    h2_p = float(f2_hits)/float(n_f2_hits)
    # 2x2 table: rows are file 2 / file 1, columns are hits / non-hits.
    table = [[f2_hits, n_f2_hits-f2_hits], [f1_hits, n_f1_hits-f1_hits]]
    chi, pvalue = stats.chi2_contingency(table)[:2]
    return idx, round(h2_p/h1_p,3), f1_hits, h1_p, f2_hits, h2_p, chi, pvalue

#calculate the enrichment ratio between predictions

Source File: predict_enriched_two_libraries_decision_tree.py From PIDGINv2 with MIT License

5 votes

def doHitProcess(inp):
    """Compute the enrichment of library-2 hits over library-1 hits.

    ``inp`` unpacks to ``(idx, hits, n_f1_hits, n_f2_hits)``; ``hits[0]`` and
    ``hits[1]`` are the hit counts of the two libraries and ``n_f1_hits`` /
    ``n_f2_hits`` the totals used as denominators.

    Returns None when neither library has a hit. Otherwise returns
    ``(idx, ratio, hits1, fraction1, hits2, fraction2, chi2, pvalue)``;
    if only one library has hits the ratio is fixed (999.0 or 0.0) and
    chi2 / pvalue are 'NA'.
    """
    idx, hits, n_f1_hits, n_f2_hits = inp
    first, second = hits[0], hits[1]
    no_first, no_second = first == 0, second == 0

    if no_first and no_second:
        return None
    if no_first:
        return idx, 999.0, 0, 0, second, float(second)/float(n_f2_hits), 'NA', 'NA'
    if no_second:
        return idx, 0.0, first, float(first)/float(n_f1_hits), 0, 0, 'NA', 'NA'

    h1_p = float(first)/float(n_f1_hits)
    h2_p = float(second)/float(n_f2_hits)
    outcome = stats.chi2_contingency(
        [[second, n_f2_hits-second], [first, n_f1_hits-first]])
    enrichment = round(h2_p/h1_p,3)
    return idx, enrichment, first, h1_p, second, h2_p, outcome[0], outcome[1]

#calculate the enrichment ratio between predictions

Source File: chicDifferentialTest.py From HiCExplorer with GNU General Public License v3.0

5 votes

def chisquare_test(pDataFile1, pDataFile2, pAlpha):
    """Run a chi-square test on each pair of groups from the two inputs.

    The inputs are iterated in lockstep; every pair ``[group1, group2]`` is
    tested with ``chi2_contingency(correction=False)``. H0 is rejected when
    the statistic reaches the critical value for ``1 - pAlpha`` with one
    degree of freedom.

    Returns ``(test_result, accepted, rejected)``:
    ``test_result`` holds one p-value per pair (``np.nan`` when the test
    raised ValueError), ``accepted`` / ``rejected`` hold ``[index, p_value]``
    entries. Pairs that could not be tested are listed under ``accepted``
    with a p-value of 1.0.
    """
    test_result, accepted, rejected = [], [], []
    # Critical value for the requested level; df=1 is hard-coded here.
    critical_value = stats.chi2.ppf(q=1 - pAlpha, df=1)
    zero_values_counter = 0

    for i, (group1, group2) in enumerate(zip(pDataFile1, pDataFile2)):
        try:
            chi2, p_value = stats.chi2_contingency(
                [group1, group2], correction=False)[:2]
        except ValueError:
            # The pair could not be tested: keep it as "accepted".
            zero_values_counter += 1
            test_result.append(np.nan)
            accepted.append([i, 1.0])
            continue

        test_result.append(p_value)
        target = rejected if chi2 >= critical_value else accepted
        target.append([i, p_value])

    if zero_values_counter > 0:
        log.info('{} samples were not tested because at least one condition contained no data in both groups.'.format(
            zero_values_counter))
    return test_result, accepted, rejected

Source File: eda.py From xam with MIT License

5 votes

def cramers_v_corrected_stat(confusion_matrix):
    """Calculate Cramérs V statistic for categorial-categorial association.

    Uses correction from Bergsma and Wicher, Journal of the Korean Statistical
    Society 42 (2013): 323-328.

    ``confusion_matrix`` must expose ``.sum()`` and ``.shape`` (the callers
    in this file pass a 2-D array of counts).
    """
    statistic = stats.chi2_contingency(confusion_matrix)[0]
    total = confusion_matrix.sum()
    n_rows, n_cols = confusion_matrix.shape

    # Bias-corrected phi squared, clipped at zero.
    phi2 = statistic / total
    bias = ((n_cols-1)*(n_rows-1)) / (total-1)
    phi2_corr = max(0, phi2 - bias)

    # Bias-corrected table dimensions.
    rows_corr = n_rows - ((n_rows-1)**2) / (total-1)
    cols_corr = n_cols - ((n_cols-1)**2) / (total-1)
    return math.sqrt(phi2_corr / min((rows_corr-1), (cols_corr-1)))

Source File: eda.py From xam with MIT License

5 votes

def cramers_v_stat(confusion_matrix):
    """Calculate Cramérs V statistic for categorial-categorial association.

    Computed as ``sqrt((chi2 / n) / min(rows - 1, columns - 1))`` with no
    bias correction.
    """
    statistic = stats.chi2_contingency(confusion_matrix)[0]
    total = confusion_matrix.sum()
    n_rows, n_cols = confusion_matrix.shape
    smaller_dim = min((n_rows-1), (n_cols-1))
    return math.sqrt((statistic / total) / smaller_dim)

Source File: test_contingency_tables.py From vnpy_crypto with MIT License

5 votes

def test_chi2_association():
    # Table.test_nominal_association must agree with scipy's
    # chi2_contingency on a reproducible random 4x4 table.
    from scipy.stats import chi2_contingency

    np.random.seed(8743)
    table = np.random.randint(10, 30, size=(4, 4))

    expected_statistic, expected_pvalue = chi2_contingency(table)[:2]
    association = ctab.Table(table).test_nominal_association()

    assert_allclose(association.statistic, expected_statistic)
    assert_allclose(association.pvalue, expected_pvalue)

Source File: nominal.py From dython with BSD 3-Clause "New" or "Revised" License

4 votes

def cramers_v(x,
              y,
              bias_correction=True,
              nan_strategy=_REPLACE,
              nan_replace_value=_DEFAULT_REPLACE_VALUE):
    """
    Calculates Cramer's V statistic for categorical-categorical association.
    This is a symmetric coefficient: V(x,y) = V(y,x)

    Original function taken from: https://stackoverflow.com/a/46498792/5863503
    Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    bias_correction : Boolean, default = True
        Use bias correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328.
    nan_strategy : string, default = 'replace'
        How to handle missing values: can be either 'drop' to remove samples
        with missing values, or 'replace' to replace all missing values with
        the nan_replace_value. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        The value used to replace missing values with. Only applicable when
        nan_strategy is set to 'replace'.

    Returns:
    --------
    float in the range of [0,1]
    (np.nan, with a RuntimeWarning, when the bias-corrected denominator
    is zero)
    """
    if nan_strategy == _REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == _DROP:
        x, y = remove_incomplete_samples(x, y)

    contingency = pd.crosstab(x, y)
    statistic = ss.chi2_contingency(contingency)[0]
    total = contingency.sum().sum()
    n_rows, n_cols = contingency.shape
    phi2 = statistic / total

    if not bias_correction:
        return np.sqrt(phi2 / min(n_cols - 1, n_rows - 1))

    # Bias-corrected phi squared (clipped at zero) and table dimensions.
    phi2corr = max(0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (total - 1))
    rcorr = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    kcorr = n_cols - ((n_cols - 1) ** 2) / (total - 1)
    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator == 0:
        warnings.warn(
            "Unable to calculate Cramer's V using bias correction. Consider using bias_correction=False",
            RuntimeWarning)
        return np.nan
    return np.sqrt(phi2corr / denominator)

Source File: preprocessing.py From reportgen with MIT License

4 votes

def chimerge(x,y,max_intervals=30,threshold=5,sample=None):
    '''Chi-square binning (ChiMerge).

    Starts with one interval per distinct value of ``x`` and repeatedly
    merges the adjacent intervals with the smallest chi-square statistic,
    for as long as there are more than ``max_intervals`` intervals and that
    smallest statistic does not exceed ``threshold``.

    parameter
    ---------
    x: {array-like}, shape [n_samples, 1]
    y: target, cannot contain nan
    max_intervals: maximum number of intervals; merging only happens while
        the number of intervals is greater than this
    threshold: chi-square threshold (between two adjacent intervals); pairs
        whose statistic is <= threshold may be merged
    sample: int, when the number of samples is too large, draw a random
        subset of the data (each row is kept with probability
        sample/len(x)); other types disable sampling

    return
    ------
    bins: list of bin edges built from the remaining interval labels, with
        the first edge replaced by the minimum of x and the maximum of x
        appended as the last edge

    '''
    
    x=pd.Series(x)
    y=pd.Series(y)
    class_y=list(pd.unique(y[pd.notnull(y)]))
    # min / max are taken before sampling, i.e. over the full data
    value_max=x.max()
    #value_max=np.sort(x)[-1]
    value_min=x.min()
    # Random sampling; redraw until the sampled y contains every class in class_y
    if isinstance(sample,int):
        sample=min(sample,len(x))
        tmp=set()
        while tmp!=set(class_y):
            cc=np.random.choice([True,False],size=len(x),p=[sample/len(x),1-sample/len(x)])
            tmp=set(np.unique(y[cc]))
        x=x[cc]
        y=y[cc]
    fo=pd.crosstab(x,y)# contingency table
    fo=fo.sort_index()
   
    while fo.shape[0] > max_intervals:
        # chi2 statistic -> list of adjacent row-position pairs having that statistic
        chitest={}
        # snapshot of the row labels; positions below refer to this snapshot
        index=list(fo.index)
        for r in range(len(fo)-1):
            #chi2,_=stats.chi2_contingency(fo.iloc[[r,r+1],:])
            chi2,_=_chisqure_fo(fo.iloc[[r,r+1],:])
            if chi2 not in chitest:
                chitest[chi2]=[]
            chitest[chi2].append((r,r+1))
        smallest = min(chitest.keys())
        if smallest <= threshold:
            #print('smallest chi2 value: {}'.format(smallest))
            #print([(index[r[0]],index[r[1]]) for r in list(reversed(chitest[smallest]))])
            # Merge from the last pair backwards: the upper row is added
            # into the lower row and then dropped.
            for (lower,upper) in list(reversed(chitest[smallest])):
                fo.loc[index[lower],:]=fo.loc[index[lower],:]+fo.loc[index[upper],:]
                fo = fo.drop(index[upper],axis=0)
                #print('deleted {}'.format(index[upper]))
        else:
            # even the most similar adjacent pair is above the threshold: stop
            break
    bins=list(fo.index)+[value_max]
    bins[0]=value_min
    # If the bins are numeric, widen the leftmost and rightmost edges by 1% so
    # that the minimum and maximum are included
    # NOTE(review): for a negative value_max, multiplying by 1.01 moves the
    # last edge below the maximum - confirm whether that is intended.
    if np.issubdtype(type(bins[0]),np.number):
        bins[0]=bins[0]*0.99 if bins[0]>0 else bins[0]-0.01
        bins[-1]=bins[-1]*1.01
    return bins

Source File: eda.py From xam with MIT License

4 votes

def feature_importance_classification(features, target, n_neighbors=3, random_state=None):
    """Score every feature's association with a classification target.

    Floating-point columns are treated as continuous and scored with an
    F-test and mutual information. Integer and boolean columns are treated
    as discrete and scored with a chi-square test, the bias-corrected
    Cramér's V and mutual information. Columns of any other dtype are
    ignored.

    Parameters
    ----------
    features : pandas.DataFrame
        One column per feature.
    target : array-like
        Class label of every row of ``features``.
    n_neighbors : int
        Forwarded to ``mutual_info_classif``.
    random_state :
        Forwarded to ``mutual_info_classif``.

    Returns
    -------
    (cont_imp, disc_imp) : tuple of pandas.DataFrame
        Indexed by feature name. ``cont_imp`` has the columns
        'f_statistic', 'f_p_value', 'mutual_information'; ``disc_imp`` has
        'chi2_statistic', 'chi2_p_value', 'cramers_v', 'mutual_information'.
        A frame has no columns when there is no feature of that kind.
    """
    cont = features.select_dtypes(include=[np.floating])
    # np.bool was removed in NumPy 1.24; np.bool_ is the supported scalar type.
    disc = features.select_dtypes(include=[np.integer, np.bool_])

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:

        # F-test
        f_test = feature_selection.f_classif(cont, target)
        cont_imp['f_statistic'] = f_test[0]
        cont_imp['f_p_value'] = f_test[1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(cont, target, discrete_features=False,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:

        # Chi²-test
        chi2_tests = defaultdict(dict)

        for feature in disc.columns:
            # Separate name so the continuous frame `cont` is not shadowed.
            contingency = pd.crosstab(disc[feature], target)
            statistic, p_value = stats.chi2_contingency(contingency)[:2]
            chi2_tests[feature]['chi2_statistic'] = statistic
            chi2_tests[feature]['chi2_p_value'] = p_value

        chi2_tests_df = pd.DataFrame.from_dict(chi2_tests, orient='index')
        disc_imp['chi2_statistic'] = chi2_tests_df['chi2_statistic']
        disc_imp['chi2_p_value'] = chi2_tests_df['chi2_p_value']

        # Cramér's V (corrected)
        # DataFrame.iteritems was removed in pandas 2.0; items() is equivalent.
        disc_imp['cramers_v'] = [
            cramers_v_corrected_stat(pd.crosstab(column, target).values)
            for _, column in disc.items()
        ]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(disc, target, discrete_features=True,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp

Python scipy.stats.chi2_contingency() Examples