Python scipy.stats.chi2_contingency() Examples
The following are 22 code examples of scipy.stats.chi2_contingency().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module scipy.stats, or try the search function.
Example #1
Source File: DataSeparate.py From FAE with GNU General Public License v3.0 | 6 votes |
def _CompareCategoricalFeatures(self, array1, array2):
    """Compare the category frequencies of two samples with a chi-square test.

    Parameters
    ----------
    array1, array2 : iterables of hashable category labels.

    Returns
    -------
    dict with three keys:
        'p-value'     : p-value of the chi-square test on the 2 x K count table
        'method'      : the fixed label 'Chi-Square'
        'description' : two strings listing "category: count" for each sample
    """
    # One row of counts per sample. A category missing from one sample shows
    # up as NaN after the concat and is replaced by a zero count.
    counts = pd.concat(
        (
            pd.DataFrame(Counter(array1), index=[1]),
            pd.DataFrame(Counter(array2), index=[2]),
        ),
        axis=0,
    ).fillna(0)

    def _summarize(row):
        # Render one row of the count table as "label: count, label: count".
        return ', '.join(
            '{}: {}'.format(label, row.iloc[position])
            for position, label in enumerate(row.index)
        )

    first_summary = _summarize(counts.iloc[0, :])
    second_summary = _summarize(counts.iloc[1, :])

    _, p_value, _, _ = chi2_contingency(counts.values, correction=True)
    return {
        'p-value': p_value,
        'method': 'Chi-Square',
        'description': [first_summary, second_summary],
    }
Example #2
Source File: continuous_variable.py From intro_ds with Apache License 2.0 | 6 votes |
def divideData(data, minValue, maxValue, feature="hours_per_week", target="label"):
    """
    Try every two-bin split of a numeric column and return the best one.

    For each integer cut point ``i`` strictly between ``minValue`` and
    ``maxValue`` the column is binned into ``[minValue, i]`` and
    ``(i, maxValue]``, cross-tabulated against the target column, and tested
    with ``chi2_contingency``. The split with the highest chi-square
    statistic wins; on ties the smallest cut point is kept.

    Parameters
    ----------
    data : DataFrame containing the ``feature`` and ``target`` columns
    minValue, maxValue : int, outer bin edges (also bound the candidate cuts)
    feature : name of the numeric column to split
        (default "hours_per_week", the column the original code hard-coded)
    target : name of the label column
        (default "label", the column the original code hard-coded)

    Returns
    -------
    (maxPValue, maxChi2, index) : p-value and statistic of the best split and
        its cut point. ``(0, 0, -1)`` when no split has a positive statistic.
    """
    maxChi2 = 0
    index = -1
    maxPValue = 0
    for i in range(minValue + 1, maxValue):
        category = pd.cut(data[feature], [minValue, i, maxValue],
                          include_lowest=True)
        cross = pd.crosstab(data[target], category)
        chi2, pValue, _, _ = scs.chi2_contingency(cross)
        # Strict comparison: the first cut point reaching the maximum is kept.
        if chi2 > maxChi2:
            maxPValue = pValue
            maxChi2 = chi2
            index = i
    return maxPValue, maxChi2, index
Example #3
Source File: predict_enriched.py From PIDGINv3 with GNU General Public License v3.0 | 6 votes |
def doHitProcess(inp):
    """Compute enrichment statistics for one entry across two prediction sets.

    Parameters
    ----------
    inp : tuple ``(idx, hits, n_f1_hits, n_f2_hits)``
        ``hits[0]`` / ``hits[1]`` are the hit counts for file 1 / file 2 and
        ``n_f1_hits`` / ``n_f2_hits`` are the totals they are subtracted from
        to obtain the non-hit counts.

    Returns
    -------
    None when neither set has a hit, otherwise the tuple
    ``(odds_ratio, idx, hits1, ratio1, hits2, ratio2, risk_ratio, pvalue)``.
    ``ratio1`` / ``ratio2`` are hits divided by non-hits and are ``np.inf``
    when a set has no non-hits. When neither set has non-hits the odds ratio
    and risk ratio are reported as 1.0 and the p-value as ``'NA'``.
    """
    idx, hits, n_f1_hits, n_f2_hits = inp
    p1_0, p1_1 = n_f1_hits - hits[0], hits[0]
    p2_0, p2_1 = n_f2_hits - hits[1], hits[1]
    # if no actives in either set return
    if p1_1 == 0 and p2_1 == 0:
        return
    # Ratio of hits to non-hits for file1 and file2. Guarded because a set
    # without non-hits used to raise ZeroDivisionError here, which also made
    # the "no inactives" branch below unreachable.
    pcp1_1 = float(p1_1) / float(p1_0) if p1_0 != 0 else np.inf
    pcp2_1 = float(p2_1) / float(p2_0) if p2_0 != 0 else np.inf
    # if no inactives in either set the test is undefined: report neutral
    # ratios (1.0) and 'NA' for the p-value
    if p1_0 == 0 and p2_0 == 0:
        return 1.0, idx, p1_1, pcp1_1, p2_1, pcp2_1, 1.0, 'NA'
    chi, pvalue = chi2_contingency([[p1_1, p1_0], [p2_1, p2_0]])[:2]
    # calculate odds ratio
    try:
        odr = (float(p1_1) / float(p1_0)) / (float(p2_1) / float(p2_0))
    except ZeroDivisionError:
        odr = np.inf
    # calculate risk ratio
    try:
        rr = (float(p1_1) / (float(p1_1) + float(p1_0))) / (float(p2_1) / (float(p2_1) + float(p2_0)))
    except ZeroDivisionError:
        rr = np.inf
    return odr, idx, p1_1, pcp1_1, p2_1, pcp2_1, rr, pvalue
#calculate the chi2 and odds ratio between pathway and disease predictions
Example #4
Source File: test_ibmq_job.py From qiskit-ibmq-provider with Apache License 2.0 | 5 votes |
def test_run_simulator(self):
    """Test running in a simulator."""
    qreg = QuantumRegister(2, 'q')
    creg = ClassicalRegister(2, 'c')
    hadamard = QuantumCircuit(qreg, creg, name='hadamard')
    hadamard.h(qreg)
    hadamard.measure(qreg, creg)

    # Submit the reference Bell circuit (index 0) and the Hadamard circuit
    # (index 1) as a single job.
    transpiled = transpile([ReferenceCircuits.bell(), hadamard],
                           backend=self.sim_backend)
    qobj = assemble(transpiled, backend=self.sim_backend)
    shots = qobj.config.shots
    result = self.sim_backend.run(qobj, validate_qobj=True).result()

    counts_qx1 = result.get_counts(0)
    counts_qx2 = result.get_counts(1)
    # Counts the test compares against: shots split evenly over the listed states.
    counts_ex1 = {'00': shots / 2, '11': shots / 2}
    counts_ex2 = {'00': shots / 4, '11': shots / 4,
                  '10': shots / 4, '01': shots / 4}
    states1 = counts_qx1.keys() | counts_ex1.keys()
    states2 = counts_qx2.keys() | counts_ex2.keys()

    # contingency table: first row measured counts, second row expected counts
    ctable1 = numpy.array([[counts.get(key, 0) for key in states1]
                           for counts in (counts_qx1, counts_ex1)])
    ctable2 = numpy.array([[counts.get(key, 0) for key in states2]
                           for counts in (counts_qx2, counts_ex2)])

    self.log.info('states1: %s', str(states1))
    self.log.info('states2: %s', str(states2))
    self.log.info('ctable1: %s', str(ctable1))
    self.log.info('ctable2: %s', str(ctable2))

    contingency1 = chi2_contingency(ctable1)
    contingency2 = chi2_contingency(ctable2)
    self.log.info('chi2_contingency1: %s', str(contingency1))
    self.log.info('chi2_contingency2: %s', str(contingency2))

    # Element 1 of the result is the p-value; the test requires it to exceed
    # 0.01 for both circuits.
    self.assertGreater(contingency1[1], 0.01)
    self.assertGreater(contingency2[1], 0.01)
Example #5
Source File: TargetAnalysisCategorical.py From exploripy with MIT License | 5 votes |
def ChiSquareOfDFCols(self, c1, c2):
    """Chi-square test of independence between two columns of ``self.df``.

    Builds the contingency table of ``c1`` against ``c2`` (combinations that
    never occur count as 0) and returns ``[statistic, p_value]`` as a list.
    """
    contingency = self.df.groupby([c1, c2]).size().unstack(c1).fillna(0)
    outcome = list(chi2_contingency(contingency))
    return outcome[:2]
Example #6
Source File: EDA.py From exploripy with MIT License | 5 votes |
def ChiSquareOfDFCols(self, c1, c2):
    """Chi-square test of independence between two columns of ``self.df``.

    Returns ``[statistic, p_value]`` as a list. When ``self.debug`` equals
    'YES' the time spent building the contingency table is printed (the
    chi-square call itself is not included in that timing).
    """
    started = time.time()
    ctsum = self.df.groupby([c1, c2]).size().unstack(c1)
    finished = time.time()
    if self.debug == 'YES':
        print('ChiSquareOfDFCols', finished - started)
    # Combinations of c1/c2 that never occur are counted as 0.
    outcome = list(chi2_contingency(ctsum.fillna(0)))
    return outcome[:2]
Example #7
Source File: stats.py From audit-ai with MIT License | 5 votes |
def chi2_test(labels, results, threshold=None):
    """
    Chi-square test of independence between group labels and binary results.

    Returns the chi-square statistic and p-value computed by
    scipy.stats.chi2_contingency on the labels-by-results contingency table
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html

    Parameters
    ----------
    labels : array_like
        categorical labels for each corresponding value of `results`, ie. M/F
    results : array_like
        binary decision values; if continuous values are supplied then
        `threshold` must also be supplied to generate binary decisions
    threshold : numeric
        value dividing scores into True/False; it is forwarded to the
        `boolean_array` helper (the original documentation states that
        result>=threshold maps to True)

    Returns
    -------
    chi2_stat : float
        The test statistic.
    pvalue : float
        P-value, the probability of obtaining a distribution at least as
        extreme as the one that was actually observed, assuming that the
        null hypothesis is true.
    """
    check_consistent_length(labels, results)
    # convert the results to True/False
    decisions = boolean_array(np.array(results), threshold=threshold)
    outcome = chi2_contingency(crosstab(labels, decisions))
    return outcome[0], outcome[1]
Example #8
Source File: stat_utils.py From causallib with Apache License 2.0 | 5 votes |
def chi2_test(X, y):
    """
    Per-feature chi-square test of a binary feature against a binary response.

    Args:
        X (np.ndarray): Binary feature matrix, one column per feature
        y (np.ndarray): Binary response vector (objects exposing ``.values``,
            such as a pandas Series, are unwrapped first)

    Returns:
        np.array: A vector of p-values, one for every feature. A feature that
            is constant (all zeros or all ones) gets a p-value of 1.
    """
    X0 = 1 - X
    if hasattr(y, "values"):
        y = y.values
    Y = y.reshape((-1, 1))
    # Two indicator columns: [y == 0, y == 1].
    Y = np.append(1 - Y, Y, axis=1)
    # Tbl1[:, i] counts samples per response class where feature i is 1,
    # Tbl0[:, i] the same where feature i is 0.
    Tbl1 = np.dot(Y.T, X)
    Tbl0 = np.dot(Y.T, X0)
    m = X.shape[1]
    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
    pvals = np.full(m, np.nan)
    for i in range(m):
        if np.all(Tbl1[:, i] == 0) or np.all(Tbl0[:, i] == 0):
            # Constant feature: the test is undefined, report no evidence.
            pvals[i] = 1
        else:
            r = stats.chi2_contingency([Tbl0[:, i], Tbl1[:, i]], correction=True)
            pvals[i] = r[1]
    return pvals
Example #9
Source File: questionnaire.py From reportgen with MIT License | 5 votes |
def chi2_test(fo,alpha=0.05):
    """Chi-square test of independence on a table of observed frequencies.

    Parameters
    ----------
    fo : table of observed frequencies (anything ``pd.DataFrame`` accepts)
    alpha : significance level

    Returns
    -------
    (significant, p_value) : ``significant`` is True when ``p_value <= alpha``
    """
    import scipy.stats as stats
    observed = pd.DataFrame(fo)
    # Deciding on the p-value is equivalent to comparing the statistic with
    # the chi-square critical value at 1 - alpha.
    p_value = stats.chi2_contingency(observed=observed)[1]
    return (p_value <= alpha, p_value)
Example #10
Source File: preprocessing.py From reportgen with MIT License | 5 votes |
def _chisqure_fo(fo):
    """Chi-square statistic and p-value of a frequency table, zero-safe.

    When any cell of ``fo`` is zero, 1 is added to every cell before testing
    so that ``chi2_contingency`` does not fail on zero expected frequencies.

    Parameters
    ----------
    fo : 2-D table of observed frequencies (DataFrame, ndarray or nested list)

    Returns
    -------
    (chi2, p_value)
    """
    table = np.asarray(fo)
    # Test the cell values. The builtin any() applied to `fo == 0` iterates a
    # DataFrame's column labels (not its cells) and raises on 2-D arrays.
    if np.any(table == 0):
        table = table + 1
    s = stats.chi2_contingency(table)
    return s[0], s[1]
Example #11
Source File: metrics.py From reportgen with MIT License | 5 votes |
def chi2(X,y):
    '''Compute the chi-square statistic of each feature against the target.

    Each feature is cross-tabulated against ``y`` and tested with
    ``chi2_contingency``. The original author wrote this to work around what
    they describe as sklearn's chi2 only supporting 2*2 tables.

    parameter
    ----------
    X: a single feature (1-D) or a group of features (2-D, one per column)
    y: target variable

    return
    ------
    chi2_value: np.array of statistics, one per feature
    chi2_pvalue: np.array of p-values, one per feature
    '''
    features = np.asarray(X)
    if features.ndim == 1:
        # A single feature becomes a one-column matrix.
        features = features.reshape((len(features), 1))
    features = pd.DataFrame(features)
    outcomes = [
        stats.chi2_contingency(pd.crosstab(features[column], y))
        for column in features.columns
    ]
    chi2_value = np.array([outcome[0] for outcome in outcomes])
    chi2_pvalue = np.array([outcome[1] for outcome in outcomes])
    return (chi2_value, chi2_pvalue)
# to be determined
Example #12
Source File: hypothesis_test.py From fairtest with Apache License 2.0 | 5 votes |
def g_test(data, correction=False):
    """
    G-test (likelihood ratio test).

    All-zero rows and columns are removed before testing.

    Parameters
    ----------
    data :
        the contingency table (DataFrame, ndarray or nested sequence)

    correction :
        whether to apply continuity corrections

    Returns
    -------
    g :
        the test statistic
    p :
        the p-value
    df:
        the number of degrees of freedom
    expected:
        the expected frequencies

    For a table with no observations at all, ``(0, 1.0, 1, None)`` is
    returned instead of calling scipy.

    References
    ----------
    https://en.wikipedia.org/wiki/G-test
    """
    if isinstance(data, pd.DataFrame):
        data = data.values
    # Coerce nested lists/tuples too: the boolean masks below need an ndarray.
    data = np.asarray(data)

    # remove zero rows/columns
    data = data[~np.all(data == 0, axis=1)]
    data = data[:, ~np.all(data == 0, axis=0)]

    if data.sum() == 0:
        return 0, 1.0, 1, None

    return stats.chi2_contingency(data, correction=correction,
                                  lambda_="log-likelihood")
Example #13
Source File: test_morestats.py From GraphicDesignPatternByPython with MIT License | 5 votes |
def test_basic(self): # median_test calls chi2_contingency to compute the test statistic # and p-value. Make sure it hasn't screwed up the call... x = [1, 2, 3, 4, 5] y = [2, 4, 6, 8] stat, p, m, tbl = stats.median_test(x, y) assert_equal(m, 4) assert_equal(tbl, [[1, 2], [4, 2]]) exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl) assert_allclose(stat, exp_stat) assert_allclose(p, exp_p) stat, p, m, tbl = stats.median_test(x, y, lambda_=0) assert_equal(m, 4) assert_equal(tbl, [[1, 2], [4, 2]]) exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, lambda_=0) assert_allclose(stat, exp_stat) assert_allclose(p, exp_p) stat, p, m, tbl = stats.median_test(x, y, correction=False) assert_equal(m, 4) assert_equal(tbl, [[1, 2], [4, 2]]) exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, correction=False) assert_allclose(stat, exp_stat) assert_allclose(p, exp_p)
Example #14
Source File: predict_enriched_decision_tree.py From PIDGINv2 with MIT License | 5 votes |
def doHitProcess(inp):
    """Enrichment ratio and chi-square test for one entry across two files.

    ``inp`` is ``(idx, hits, n_f1_hits, n_f2_hits)``. Returns None when
    neither file has a hit, otherwise
    ``(idx, ratio, hits1, fraction1, hits2, fraction2, chi2, pvalue)`` where
    ``ratio`` is fraction2 / fraction1 rounded to 3 decimals. When only one
    file has hits the ratio is the sentinel 999.0 (file 1 empty) or 0.0
    (file 2 empty) and chi2 / pvalue are 'NA'.
    """
    idx, hits, n_f1_hits, n_f2_hits = inp
    f1_hits, f2_hits = hits[0], hits[1]
    if f1_hits == 0 and f2_hits == 0:
        return
    if f1_hits == 0:
        return idx, 999.0, 0, 0, f2_hits, float(f2_hits)/float(n_f2_hits), 'NA', 'NA'
    if f2_hits == 0:
        return idx, 0.0, f1_hits, float(f1_hits)/float(n_f1_hits), 0, 0, 'NA', 'NA'
    h1_p = float(f1_hits)/float(n_f1_hits)
    h2_p = float(f2_hits)/float(n_f2_hits)
    # 2x2 table of [hits, non-hits], file 2 first.
    table = [[f2_hits, n_f2_hits - f2_hits],
             [f1_hits, n_f1_hits - f1_hits]]
    chi, pvalue, _, _ = stats.chi2_contingency(table)
    return idx, round(h2_p/h1_p, 3), f1_hits, h1_p, f2_hits, h2_p, chi, pvalue
#calculate the enrichment ratio between predictions
Example #15
Source File: predict_enriched_two_libraries_decision_tree.py From PIDGINv2 with MIT License | 5 votes |
def doHitProcess(inp):
    """Enrichment ratio and chi-square test for one entry across two libraries.

    ``inp`` is ``(idx, hits, n_f1_hits, n_f2_hits)``. Returns None when
    neither library has a hit, otherwise
    ``(idx, ratio, hits1, fraction1, hits2, fraction2, chi2, pvalue)`` where
    ``ratio`` is fraction2 / fraction1 rounded to 3 decimals. When only one
    library has hits the ratio is the sentinel 999.0 (library 1 empty) or
    0.0 (library 2 empty) and chi2 / pvalue are 'NA'.
    """
    idx, hits, n_f1_hits, n_f2_hits = inp
    count1 = hits[0]
    count2 = hits[1]
    if count1 == 0 and count2 == 0:
        return
    if count1 == 0:
        return idx, 999.0, 0, 0, count2, float(count2)/float(n_f2_hits), 'NA', 'NA'
    if count2 == 0:
        return idx, 0.0, count1, float(count1)/float(n_f1_hits), 0, 0, 'NA', 'NA'
    h1_p = float(count1)/float(n_f1_hits)
    h2_p = float(count2)/float(n_f2_hits)
    # Rows are [hits, non-hits]; library 2 is the first row.
    observed = [[count2, n_f2_hits - count2],
                [count1, n_f1_hits - count1]]
    chi, pvalue, _, _ = stats.chi2_contingency(observed)
    enrichment = round(h2_p/h1_p, 3)
    return idx, enrichment, count1, h1_p, count2, h2_p, chi, pvalue
#calculate the enrichment ratio between predictions
Example #16
Source File: chicDifferentialTest.py From HiCExplorer with GNU General Public License v3.0 | 5 votes |
def chisquare_test(pDataFile1, pDataFile2, pAlpha):
    """Run a chi-square test on each pair of count rows from two data sets.

    For every position ``i`` the rows ``pDataFile1[i]`` and ``pDataFile2[i]``
    form a two-row contingency table tested without continuity correction.
    H0 is rejected when the p-value is at most ``pAlpha``. For a 2x2 table
    this is the same decision as comparing the statistic with the
    1-degree-of-freedom critical value, and it stays correct for wider
    tables, which have more degrees of freedom.

    Parameters
    ----------
    pDataFile1, pDataFile2 : iterables of count rows, paired by position
    pAlpha : significance level

    Returns
    -------
    test_result : list with one p-value per pair (``np.nan`` if untestable)
    accepted : list of ``[i, p_value]`` where H0 was not rejected; pairs that
        could not be tested are listed here with p-value 1.0
    rejected : list of ``[i, p_value]`` where H0 was rejected
    """
    test_result = []
    accepted = []
    rejected = []
    zero_values_counter = 0
    for i, (group1, group2) in enumerate(zip(pDataFile1, pDataFile2)):
        try:
            chi2, p_value, dof, ex = stats.chi2_contingency(
                [group1, group2], correction=False)
        except ValueError:
            # chi2_contingency refuses tables with a zero expected frequency;
            # record the pair as untested and keep H0.
            zero_values_counter += 1
            test_result.append(np.nan)
            accepted.append([i, 1.0])
            continue
        test_result.append(p_value)
        if p_value <= pAlpha:
            rejected.append([i, p_value])
        else:
            accepted.append([i, p_value])
    if zero_values_counter > 0:
        log.info('{} samples were not tested because at least one condition contained no data in both groups.'.format(
            zero_values_counter))
    return test_result, accepted, rejected
Example #17
Source File: eda.py From xam with MIT License | 5 votes |
def cramers_v_corrected_stat(confusion_matrix):
    """Bias-corrected Cramér's V for categorical-categorical association.

    Applies the correction from Bergsma and Wicher, Journal of the Korean
    Statistical Society 42 (2013): 323-328.

    ``confusion_matrix`` must provide ``.sum()`` returning the grand total
    and a two-element ``.shape`` (a 2-D NumPy array does).
    """
    chi2 = stats.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    r, k = confusion_matrix.shape

    # Corrected phi-squared, clipped at zero.
    phi2_corr = max(0, chi2 / n - ((k-1)*(r-1)) / (n-1))
    # Corrected row and column counts.
    r_corr = r - ((r-1)**2) / (n-1)
    k_corr = k - ((k-1)**2) / (n-1)

    smaller_dim = min((r_corr-1), (k_corr-1))
    return math.sqrt(phi2_corr / smaller_dim)
Example #18
Source File: eda.py From xam with MIT License | 5 votes |
def cramers_v_stat(confusion_matrix):
    """Cramér's V for categorical-categorical association (uncorrected).

    ``confusion_matrix`` must provide ``.sum()`` returning the grand total
    and a two-element ``.shape`` (a 2-D NumPy array does).
    """
    chi2 = stats.chi2_contingency(confusion_matrix)[0]
    total = confusion_matrix.sum()
    n_rows, n_cols = confusion_matrix.shape
    smaller_dim = min((n_rows-1), (n_cols-1))
    return math.sqrt((chi2 / total) / smaller_dim)
Example #19
Source File: test_contingency_tables.py From vnpy_crypto with MIT License | 5 votes |
def test_chi2_association():
    # Table.test_nominal_association must reproduce scipy's chi2_contingency
    # statistic and p-value on a seeded random 4x4 table.
    from scipy.stats import chi2_contingency

    np.random.seed(8743)
    table = np.random.randint(10, 30, size=(4, 4))

    reference = chi2_contingency(table)
    outcome = ctab.Table(table).test_nominal_association()

    assert_allclose(outcome.statistic, reference[0])
    assert_allclose(outcome.pvalue, reference[1])
Example #20
Source File: nominal.py From dython with BSD 3-Clause "New" or "Revised" License | 4 votes |
def cramers_v(x, y, bias_correction=True, nan_strategy=_REPLACE,
              nan_replace_value=_DEFAULT_REPLACE_VALUE):
    """
    Compute Cramer's V, a symmetric measure of association between two
    categorical variables: V(x,y) = V(y,x).

    Original function taken from: https://stackoverflow.com/a/46498792/5863503
    Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    bias_correction : Boolean, default = True
        Use bias correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328.
    nan_strategy : string, default = the module's _REPLACE constant
        How to handle missing values: the module's _DROP value removes
        samples with missing values, _REPLACE substitutes
        nan_replace_value for them. Any other value leaves the inputs
        untouched.
    nan_replace_value : any, default = the module's _DEFAULT_REPLACE_VALUE
        The value used to replace missing values with. Only applicable when
        nan_strategy selects replacement.

    Returns:
    --------
    float in the range of [0,1]; np.nan (with a RuntimeWarning) when the
    bias-corrected denominator is zero.
    """
    if nan_strategy == _REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == _DROP:
        x, y = remove_incomplete_samples(x, y)

    confusion_matrix = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape

    if not bias_correction:
        return np.sqrt(phi2 / min(k - 1, r - 1))

    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator == 0:
        warnings.warn(
            "Unable to calculate Cramer's V using bias correction. Consider using bias_correction=False",
            RuntimeWarning)
        return np.nan
    return np.sqrt(phi2corr / denominator)
Example #21
Source File: preprocessing.py From reportgen with MIT License | 4 votes |
def chimerge(x,y,max_intervals=30,threshold=5,sample=None):
    '''Chi-square binning (ChiMerge).

    Starting from one interval per distinct value of ``x``, adjacent
    intervals with the smallest chi-square statistic are merged. Merging
    stops once at most ``max_intervals`` intervals remain or the smallest
    statistic exceeds ``threshold``.

    parameter
    ---------
    x: {array-like}, shape [n_samples, 1]
    y: target, cannot contain nan
    max_intervals: maximum number of intervals
    threshold: chi-square threshold (between two adjacent intervals)
    sample: int, draw a random subsample when the data set is large

    return
    ------
    bins: list of bin edges. For numeric data the outer edges are pushed
        outwards so the minimum and maximum fall inside the range.
    '''
    x = pd.Series(x)
    y = pd.Series(y)
    class_y = list(pd.unique(y[pd.notnull(y)]))
    value_max = x.max()
    value_min = x.min()
    # Random subsampling; redraw until the sample contains every class of y.
    if isinstance(sample, int):
        sample = min(sample, len(x))
        tmp = set()
        while tmp != set(class_y):
            cc = np.random.choice([True, False], size=len(x),
                                  p=[sample/len(x), 1-sample/len(x)])
            tmp = set(np.unique(y[cc]))
        x = x[cc]
        y = y[cc]
    fo = pd.crosstab(x, y)  # contingency table
    fo = fo.sort_index()

    while fo.shape[0] > max_intervals:
        # Map each chi-square value to the adjacent row pairs producing it.
        chitest = {}
        index = list(fo.index)
        for r in range(len(fo)-1):
            chi2, _ = _chisqure_fo(fo.iloc[[r, r+1], :])
            if chi2 not in chitest:
                chitest[chi2] = []
            chitest[chi2].append((r, r+1))
        smallest = min(chitest.keys())
        if smallest <= threshold:
            # Merge from the bottom up so every row label referenced by a
            # later pair still exists when that pair is processed.
            for (lower, upper) in list(reversed(chitest[smallest])):
                fo.loc[index[lower], :] = fo.loc[index[lower], :] + fo.loc[index[upper], :]
                fo = fo.drop(index[upper], axis=0)
        else:
            break
    bins = list(fo.index) + [value_max]
    bins[0] = value_min
    # For numeric bins widen both outer edges so that the minimum and the
    # maximum are included. A non-positive edge is shifted by 0.01 instead of
    # scaled, because scaling would move it inwards (or leave 0 unchanged).
    if np.issubdtype(type(bins[0]), np.number):
        bins[0] = bins[0]*0.99 if bins[0] > 0 else bins[0]-0.01
        bins[-1] = bins[-1]*1.01 if bins[-1] > 0 else bins[-1]+0.01
    return bins
Example #22
Source File: eda.py From xam with MIT License | 4 votes |
def feature_importance_classification(features, target, n_neighbors=3, random_state=None):
    """Score every feature's relationship with a classification target.

    Floating-point columns are treated as continuous and scored with an
    F-test and mutual information. Integer and boolean columns are treated
    as discrete and scored with a chi-square test, bias-corrected Cramér's V
    and mutual information. Columns of any other dtype are ignored.

    Parameters
    ----------
    features : DataFrame of candidate features
    target : class labels aligned with ``features``
    n_neighbors, random_state : forwarded to ``mutual_info_classif``

    Returns
    -------
    (cont_imp, disc_imp) : two DataFrames indexed by feature name, holding
        the scores of the continuous and the discrete features.
    """
    cont = features.select_dtypes(include=[np.floating])
    # np.bool_ rather than the np.bool alias, which NumPy 1.24 removed.
    disc = features.select_dtypes(include=[np.integer, np.bool_])

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:

        # F-test
        f_test = feature_selection.f_classif(cont, target)
        cont_imp['f_statistic'] = f_test[0]
        cont_imp['f_p_value'] = f_test[1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(cont, target, discrete_features=False,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:

        # Chi²-test
        chi2_tests = defaultdict(dict)

        for feature in disc.columns:
            # Named `table` so the continuous frame `cont` is not shadowed.
            table = pd.crosstab(disc[feature], target)
            statistic, p_value, _, _ = stats.chi2_contingency(table)
            chi2_tests[feature]['chi2_statistic'] = statistic
            chi2_tests[feature]['chi2_p_value'] = p_value

        chi2_tests_df = pd.DataFrame.from_dict(chi2_tests, orient='index')
        disc_imp['chi2_statistic'] = chi2_tests_df['chi2_statistic']
        disc_imp['chi2_p_value'] = chi2_tests_df['chi2_p_value']

        # Cramér's V (corrected); .items() replaces iteritems(), which
        # pandas 2.0 removed.
        disc_imp['cramers_v'] = [
            cramers_v_corrected_stat(pd.crosstab(feature, target).values)
            for _, feature in disc.items()
        ]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(disc, target, discrete_features=True,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp