Python pandas.crosstab() Examples
The following are 30 code examples of pandas.crosstab().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: test_pivot.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_crosstab_with_empties(self): # Check handling of empties df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], 'c': [np.nan, np.nan, np.nan, np.nan, np.nan]}) empty = pd.DataFrame([[0.0, 0.0], [0.0, 0.0]], index=pd.Index([1, 2], name='a', dtype='int64'), columns=pd.Index([3, 4], name='b')) for i in [True, 'index', 'columns']: calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count', normalize=i) tm.assert_frame_equal(empty, calculated) nans = pd.DataFrame([[0.0, np.nan], [0.0, 0.0]], index=pd.Index([1, 2], name='a', dtype='int64'), columns=pd.Index([3, 4], name='b')) calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count', normalize=False) tm.assert_frame_equal(nans, calculated)
Example #2
Source File: test_pivot.py From vnpy_crypto with MIT License | 6 votes |
def test_crosstab_ndarray(self): a = np.random.randint(0, 5, size=100) b = np.random.randint(0, 3, size=100) c = np.random.randint(0, 10, size=100) df = DataFrame({'a': a, 'b': b, 'c': c}) result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c')) expected = crosstab(df['a'], [df['b'], df['c']]) tm.assert_frame_equal(result, expected) result = crosstab([b, c], a, colnames=['a'], rownames=('b', 'c')) expected = crosstab([df['b'], df['c']], df['a']) tm.assert_frame_equal(result, expected) # assign arbitrary names result = crosstab(self.df['A'].values, self.df['C'].values) assert result.index.name == 'row_0' assert result.columns.name == 'col_0'
Example #3
Source File: contingency_tables.py From vnpy_crypto with MIT License | 6 votes |
def from_data(cls, data, shift_zeros=True):
    """
    Build a Table from raw two-column data.

    Parameters
    ----------
    data : array-like
        Raw observations. Column 0 supplies the row variable and column 1
        the column variable of the contingency table.
    shift_zeros : boolean
        Passed through to the constructor unchanged. Per the original
        documentation: if True and the table contains any zeros, 0.5 is
        added to all four cells.
    """
    # DataFrames need positional access through .iloc; other inputs are
    # sliced directly.
    if isinstance(data, pd.DataFrame):
        row_var, col_var = data.iloc[:, 0], data.iloc[:, 1]
    else:
        row_var, col_var = data[:, 0], data[:, 1]
    return cls(pd.crosstab(row_var, col_var), shift_zeros)
Example #4
Source File: contingency_tables.py From vnpy_crypto with MIT License | 6 votes |
def from_data(cls, data, shift_zeros=True):
    """
    Create a Table by cross-tabulating the first two columns of `data`.

    Parameters
    ----------
    data : array-like
        Raw observations; only the first two columns are used.
    shift_zeros : boolean
        Passed through to the constructor unchanged. Per the original
        documentation: if True and any cell count is zero, 0.5 is added
        to all values in the table.

    Returns
    -------
    A Table instance.
    """
    # A DataFrame is indexed positionally via .iloc; any other input is
    # indexed directly with the same [rows, column] slices.
    indexer = data.iloc if isinstance(data, pd.DataFrame) else data
    table = pd.crosstab(indexer[:, 0], indexer[:, 1])
    return cls(table, shift_zeros)
Example #5
Source File: test_contingency_tables.py From vnpy_crypto with MIT License | 6 votes |
def test_from_data(self):
    """Compare StratifiedTable built from tables with one built from data.

    Ten per-stratum 2x2 tables are produced by hand from seeded random
    data; the summary text must match the one from ``from_data`` called on
    the full frame.
    """
    np.random.seed(241)
    frame = pd.DataFrame(index=range(100), columns=("v1", "v2", "strat"))
    frame["v1"] = np.random.randint(0, 2, 100)
    frame["v2"] = np.random.randint(0, 2, 100)
    # Stratum label k is repeated for rows 10*k .. 10*k + 9.
    frame["strat"] = np.kron(np.arange(10), np.ones(10))

    per_stratum = []
    for start in range(0, 100, 10):
        rows = np.arange(start, start + 10)
        per_stratum.append(
            pd.crosstab(frame.loc[rows, "v1"], frame.loc[rows, "v2"]))

    from_tables = ctab.StratifiedTable(per_stratum)
    from_frame = ctab.StratifiedTable.from_data("v1", "v2", "strat", frame)
    assert_equal(from_tables.summary().as_text(),
                 from_frame.summary().as_text())
Example #6
Source File: test_contingency_tables.py From vnpy_crypto with MIT License | 6 votes |
def test_SquareTable_from_data():
    """Check that SquareTable agrees across its three construction routes.

    The same seeded 5x5 data is given as a crosstab, as the raw frame via
    ``from_data``, and as a plain ndarray; all summaries must match.
    """
    np.random.seed(434)
    frame = pd.DataFrame(index=range(100), columns=["v1", "v2"])
    frame["v1"] = np.random.randint(0, 5, 100)
    frame["v2"] = np.random.randint(0, 5, 100)
    counts = pd.crosstab(frame["v1"], frame["v2"])

    from_crosstab = ctab.SquareTable(counts)
    from_frame = ctab.SquareTable.from_data(frame)
    from_array = ctab.SquareTable(np.asarray(counts))

    assert_equal(from_crosstab.summary().as_text(),
                 from_frame.summary().as_text())
    assert_equal(from_frame.summary().as_text(),
                 from_array.summary().as_text())

    # The string form starts with a fixed header; the first cell is pinned
    # by the seed above.
    text = str(from_crosstab)
    assert_equal(text.startswith('A 5x5 contingency table with counts:'), True)
    assert_equal(from_crosstab.table[0, 0], 8.)
Example #7
Source File: test_mosaicplot.py From vnpy_crypto with MIT License | 6 votes |
def test_mosaic_empty_cells():
    """Smoke test (see #2286): mosaic on data with empty cells.

    Several (id1, id2) combinations never occur in the data, so the
    contingency table has empty cells. Both the crosstab-based call and the
    DataFrame-based call only need to run without raising.
    """
    import pandas as pd

    id2_by_row = {64: 'Angelica', 65: 'DXW_UID', 66: 'casuid01',
                  67: 'casuid01', 68: 'EC93_uid', 69: 'EC93_uid',
                  70: 'EC93_uid', 60: 'DXW_UID', 61: 'AtmosFox',
                  62: 'DXW_UID', 63: 'DXW_UID'}
    id1_by_row = {64: 'TGP', 65: 'Retention01', 66: 'default',
                  67: 'default', 68: 'Musa_EC_9_3', 69: 'Musa_EC_9_3',
                  70: 'Musa_EC_9_3', 60: 'default', 61: 'default',
                  62: 'default', 63: 'default'}
    mydata = pd.DataFrame({'id2': id2_by_row, 'id1': id1_by_row})

    # First route: a precomputed contingency table, flattened to a Series.
    counts = pd.crosstab(mydata.id1, mydata.id2)
    fig, vals = mosaic(counts.T.unstack())
    pylab.close('all')

    # Second route: the raw frame plus the two column names.
    fig, vals = mosaic(mydata, ['id1', 'id2'])
    pylab.close('all')
Example #8
Source File: test_pivot.py From vnpy_crypto with MIT License | 6 votes |
def test_crosstab_errors(self):
    """Check the ValueError messages for invalid crosstab arguments.

    Issue 12578.
    """
    frame = pd.DataFrame({'a': [1, 2, 2, 2, 2],
                          'b': [3, 3, 4, 4, 4],
                          'c': [1, 1, np.nan, 1, 1]})

    # (expected message, offending keyword arguments), in the original order.
    bad_calls = [
        ('values cannot be used without an aggfunc.', {'values': frame.c}),
        ('aggfunc cannot be used without values', {'aggfunc': np.mean}),
        ('Not a valid normalize argument', {'normalize': '42'}),
        ('Not a valid normalize argument', {'normalize': 42}),
        ('Not a valid margins argument', {'normalize': 'all', 'margins': 42}),
    ]
    for message, kwargs in bad_calls:
        with tm.assert_raises_regex(ValueError, message):
            pd.crosstab(frame.a, frame.b, **kwargs)
Example #9
Source File: test_pivot.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_crosstab_errors(self):
    """Check the ValueError messages for invalid crosstab arguments.

    Issue 12578.
    """
    frame = pd.DataFrame({'a': [1, 2, 2, 2, 2],
                          'b': [3, 3, 4, 4, 4],
                          'c': [1, 1, np.nan, 1, 1]})

    # (expected message, offending keyword arguments), in the original order.
    bad_calls = [
        ('values cannot be used without an aggfunc.', {'values': frame.c}),
        ('aggfunc cannot be used without values', {'aggfunc': np.mean}),
        ('Not a valid normalize argument', {'normalize': '42'}),
        ('Not a valid normalize argument', {'normalize': 42}),
        ('Not a valid margins argument', {'normalize': 'all', 'margins': 42}),
    ]
    for message, kwargs in bad_calls:
        with pytest.raises(ValueError, match=message):
            pd.crosstab(frame.a, frame.b, **kwargs)
Example #10
Source File: decisionTree.py From statistical_learning with Apache License 2.0 | 6 votes |
def SplitData(self, df):
    """Pick the best feature to split on and partition ``df`` by it.

    The last column of ``df`` holds the labels; every other column is a
    candidate feature. Each feature is scored from its feature-by-label
    contingency table via ``self.calg``, and the label distribution via
    ``self.calH``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by one label column.

    Returns
    -------
    generator or None
        ``None`` when there are no feature columns or the best score is
        below ``self.eps``. Otherwise a generator of
        ``(feature_name, feature_value, subframe)`` tuples, where
        ``subframe`` is the matching group of rows with the chosen feature
        column dropped.

    Raises
    ------
    ValueError
        If ``self.method`` is neither ``"ID3"`` nor ``"C4.5"``.
    """
    # Local import keeps this block self-contained; numpy is always
    # installed alongside pandas.
    import numpy as np

    labels = df.iloc[:, -1]
    data = df.iloc[:, :-1]
    # Use crosstab to count label frequencies per feature value.
    cbs = (pd.crosstab(data.iloc[:, i], labels)
           for i in range(data.columns.size))
    y_c = labels.groupby(labels).count()
    # Entropy of y.
    HD = self.calH(y_c)
    # An ndarray makes the arithmetic below element-wise even when calH
    # returns a plain Python float (a list would raise TypeError).
    HDA = np.asarray([self.calg(cb) for cb in cbs])
    if HDA.size == 0:
        # No feature columns left, so there is nothing to split on.
        return None
    if self.method == "ID3":
        g = HD - HDA
    elif self.method == "C4.5":
        g = 1 - HDA / HD
    else:
        raise ValueError(
            "unsupported method %r; expected 'ID3' or 'C4.5'" % (self.method,))
    if g.max() < self.eps:
        return None
    # Position of the best-scoring feature.
    split = g.argmax()
    name = df.columns[split]
    # Divide the frame into one part per value of the chosen feature.
    gp = df.groupby(df.iloc[:, split])
    return ((name, i, d.drop(name, axis=1)) for i, d in gp)
Example #11
Source File: test_replication_kw_97.py From respy with MIT License | 6 votes |
def test_distribution_of_lagged_choices():
    """Compare simulated and reference lagged-choice shares in period 0.

    Uses the ``kw_97_extended`` example model with one period and 10,000
    simulated agents. Shares of ``Lagged_Choice_1`` within each
    ``Experience_School`` level must match the reference data to within an
    absolute tolerance of 0.04.
    """
    params, options, reference = rp.get_example_model("kw_97_extended")
    options["n_periods"] = 1
    options["simulated_agents"] = 10_000

    simulate = rp.get_simulate_func(params, options)
    simulated = simulate(params)

    def initial_shares(frame):
        # Column-normalized cross-table restricted to the first period.
        first = frame.query("Period == 0")
        return pd.crosstab(
            first.Lagged_Choice_1, first.Experience_School, normalize="columns"
        )

    expected = initial_shares(reference)
    calculated = initial_shares(simulated)

    # Allow 4% differences, which are likely for small subsets.
    np.testing.assert_allclose(expected, calculated, atol=0.04)
Example #12
Source File: test_pivot.py From recruit with Apache License 2.0 | 6 votes |
def test_crosstab_errors(self):
    """Check that invalid crosstab argument combinations raise ValueError.

    Issue 12578.
    """
    df = pd.DataFrame({'a': [1, 2, 2, 2, 2],
                       'b': [3, 3, 4, 4, 4],
                       'c': [1, 1, np.nan, 1, 1]})

    def expect_value_error(message, **kwargs):
        # Each check calls crosstab on the same two columns and differs
        # only in the extra keyword arguments.
        with pytest.raises(ValueError, match=message):
            pd.crosstab(df.a, df.b, **kwargs)

    expect_value_error('values cannot be used without an aggfunc.',
                       values=df.c)
    expect_value_error('aggfunc cannot be used without values',
                       aggfunc=np.mean)
    expect_value_error('Not a valid normalize argument', normalize='42')
    expect_value_error('Not a valid normalize argument', normalize=42)
    expect_value_error('Not a valid margins argument',
                       normalize='all', margins=42)
Example #13
Source File: test_pivot.py From recruit with Apache License 2.0 | 6 votes |
def test_crosstab_with_empties(self): # Check handling of empties df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], 'c': [np.nan, np.nan, np.nan, np.nan, np.nan]}) empty = pd.DataFrame([[0.0, 0.0], [0.0, 0.0]], index=pd.Index([1, 2], name='a', dtype='int64'), columns=pd.Index([3, 4], name='b')) for i in [True, 'index', 'columns']: calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count', normalize=i) tm.assert_frame_equal(empty, calculated) nans = pd.DataFrame([[0.0, np.nan], [0.0, 0.0]], index=pd.Index([1, 2], name='a', dtype='int64'), columns=pd.Index([3, 4], name='b')) calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count', normalize=False) tm.assert_frame_equal(nans, calculated)
Example #14
Source File: contingency_tables.py From Splunking-Crime with GNU Affero General Public License v3.0 | 6 votes |
def from_data(cls, data, shift_zeros=True):
    """
    Construct a Table from the first two columns of raw data.

    Parameters
    ----------
    data : array-like
        Raw observations; the contingency table is built from its first
        two columns.
    shift_zeros : boolean
        Passed through to the constructor unchanged. Per the original
        documentation: if True and any cell count is zero, 0.5 is added
        to all values in the table.

    Returns
    -------
    A Table instance.
    """
    # Guard clause: a DataFrame is sliced positionally through .iloc.
    if isinstance(data, pd.DataFrame):
        return cls(pd.crosstab(data.iloc[:, 0], data.iloc[:, 1]), shift_zeros)

    # Anything else is sliced directly with [rows, column].
    return cls(pd.crosstab(data[:, 0], data[:, 1]), shift_zeros)
Example #15
Source File: contingency_tables.py From Splunking-Crime with GNU Affero General Public License v3.0 | 6 votes |
def from_data(cls, data, shift_zeros=True):
    """
    Construct a Table object by cross-tabulating raw data.

    Parameters
    ----------
    data : array-like
        Raw observations. The first column defines the table rows and the
        second column defines the table columns.
    shift_zeros : boolean
        Passed through to the constructor unchanged. Per the original
        documentation: if True and the contingency table contains any
        zeros, 0.5 is added to all four cells.
    """
    is_frame = isinstance(data, pd.DataFrame)

    def column(position):
        # DataFrames are addressed positionally via .iloc; other inputs
        # are sliced directly.
        return data.iloc[:, position] if is_frame else data[:, position]

    table = pd.crosstab(column(0), column(1))
    return cls(table, shift_zeros)
Example #16
Source File: test_pivot.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_crosstab_ndarray(self): a = np.random.randint(0, 5, size=100) b = np.random.randint(0, 3, size=100) c = np.random.randint(0, 10, size=100) df = DataFrame({'a': a, 'b': b, 'c': c}) result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c')) expected = crosstab(df['a'], [df['b'], df['c']]) tm.assert_frame_equal(result, expected) result = crosstab([b, c], a, colnames=['a'], rownames=('b', 'c')) expected = crosstab([df['b'], df['c']], df['a']) tm.assert_frame_equal(result, expected) # assign arbitrary names result = crosstab(self.df['A'].values, self.df['C'].values) assert result.index.name == 'row_0' assert result.columns.name == 'col_0'
Example #17
Source File: evaluate.py From toad with MIT License | 6 votes |
def crosstab_data(columns_var, row_var, data, unique_num, *args):
    """Cross-tabulate two variables of ``data`` after merging/binning them.

    Both variables are first passed through ``merger_data``. The resulting
    series are cross-tabulated with margins, and wherever ``merger_data``
    returned bins for a variable, that axis is relabelled through
    ``rename_columns`` with the ``'All'`` margin label kept last.

    Parameters
    ----------
    columns_var, row_var
        Identifiers of the column and row variables, forwarded to
        ``merger_data``.
    data
        Source data, forwarded to ``merger_data``.
    unique_num
        Forwarded to ``merger_data`` for both variables.
    *args
        Four positional extras: ``args[0]`` and ``args[1]`` go to
        ``merger_data`` for the column and row variable respectively;
        ``args[2]`` and ``args[3]`` go to ``rename_columns`` for the column
        and row labels respectively.

    Returns
    -------
    pandas.DataFrame
        The contingency table including the ``'All'`` margins.
    """
    # The middle element returned by merger_data is not needed here.
    columns_data, _, columns_bins = merger_data(
        data, columns_var, unique_num, args[0])
    row_data, _, row_bins = merger_data(data, row_var, unique_num, args[1])

    result = pd.crosstab(row_data, columns_data, margins=True, dropna=False)

    if columns_bins is not None:
        columns = result.columns.tolist()
        columns.remove('All')
        columns_bins_list = rename_columns(columns, columns_bins, args[2])
        columns_bins_list.append('All')
        # Direct assignment replaces set_axis(..., inplace=True), whose
        # inplace argument was removed in pandas 2.0.
        result.columns = columns_bins_list

    if row_bins is not None:
        index = result.index.tolist()
        index.remove('All')
        index_bins_list = rename_columns(index, row_bins, args[3])
        index_bins_list.append('All')
        result.index = index_bins_list

    return result
# Write all high-IV variable groupings and plots to Excel
Example #18
Source File: crosstabs.py From audit-ai with MIT License | 6 votes |
def crosstab_df(labels, decisions):
    """
    Cross-tabulate group labels against decisions.

    Parameters
    ------------
    labels : array_like
        categorical values such as ['M', 'F']; one table row per value
    decisions : array_like
        boolean / binary values; one table column per value

    Returns
    --------
    crosstab : table of counts, e.g. for two groups and two outcomes
                    False   True
        TopGroup        5      4
        BottomGroup     3      4
        so, crosstab = array([[5, 4], [3, 4]])
    """
    # Wrapping in Series lets plain lists be passed; rows come from the
    # label values and columns from the decision values.
    return pd.crosstab(pd.Series(labels), pd.Series(decisions))
Example #19
Source File: test_pivot.py From recruit with Apache License 2.0 | 6 votes |
def test_crosstab_ndarray(self): a = np.random.randint(0, 5, size=100) b = np.random.randint(0, 3, size=100) c = np.random.randint(0, 10, size=100) df = DataFrame({'a': a, 'b': b, 'c': c}) result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c')) expected = crosstab(df['a'], [df['b'], df['c']]) tm.assert_frame_equal(result, expected) result = crosstab([b, c], a, colnames=['a'], rownames=('b', 'c')) expected = crosstab([df['b'], df['c']], df['a']) tm.assert_frame_equal(result, expected) # assign arbitrary names result = crosstab(self.df['A'].values, self.df['C'].values) assert result.index.name == 'row_0' assert result.columns.name == 'col_0'
Example #20
Source File: test_pivot.py From vnpy_crypto with MIT License | 6 votes |
def test_crosstab_with_empties(self): # Check handling of empties df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], 'c': [np.nan, np.nan, np.nan, np.nan, np.nan]}) empty = pd.DataFrame([[0.0, 0.0], [0.0, 0.0]], index=pd.Index([1, 2], name='a', dtype='int64'), columns=pd.Index([3, 4], name='b')) for i in [True, 'index', 'columns']: calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count', normalize=i) tm.assert_frame_equal(empty, calculated) nans = pd.DataFrame([[0.0, np.nan], [0.0, 0.0]], index=pd.Index([1, 2], name='a', dtype='int64'), columns=pd.Index([3, 4], name='b')) calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count', normalize=False) tm.assert_frame_equal(nans, calculated)
Example #21
Source File: test_pivot.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_crosstab_with_numpy_size(self): # GH 4003 df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 6, 'B': ['A', 'B', 'C'] * 8, 'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, 'D': np.random.randn(24), 'E': np.random.randn(24)}) result = pd.crosstab(index=[df['A'], df['B']], columns=[df['C']], margins=True, aggfunc=np.size, values=df['D']) expected_index = pd.MultiIndex(levels=[['All', 'one', 'three', 'two'], ['', 'A', 'B', 'C']], codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], names=['A', 'B']) expected_column = pd.Index(['bar', 'foo', 'All'], dtype='object', name='C') expected_data = np.array([[2., 2., 4.], [2., 2., 4.], [2., 2., 4.], [2., np.nan, 2.], [np.nan, 2., 2.], [2., np.nan, 2.], [np.nan, 2., 2.], [2., np.nan, 2.], [np.nan, 2., 2.], [12., 12., 24.]]) expected = pd.DataFrame(expected_data, index=expected_index, columns=expected_column) tm.assert_frame_equal(result, expected)
Example #22
Source File: _histograms.py From epiScanpy with BSD 3-Clause "New" or "Revised" License | 5 votes |
def cluster_composition(adata, cluster, condition, xlabel='cell cluster', ylabel='cell count', title=None, save=False):
    """
    Draw a bar chart of cell counts per cluster, one bar series per condition.

    Parameters
    ----------
    adata
        Object whose ``obs`` table contains the `cluster` and `condition`
        columns.
    cluster
        Name of the ``obs`` column holding the cluster assignment (x axis).
    condition
        Name of the ``obs`` column holding the condition (legend entries).
    xlabel, ylabel, title
        Axis labels and title passed to matplotlib.
    save
        ``False`` to skip saving. ``True``, or a string whose part after the
        last '.' is not 'png' or 'pdf', saves 'cluster_composition.png'.
        Otherwise the figure is saved as 'cluster_composition_' + `save`.
    """
    obs = adata.obs
    table = pd.crosstab(obs[condition], obs[cluster], margins=True)

    cluster_names = sorted(list(set(obs[cluster])))
    condition_names = sorted(set(obs[condition]))

    # One bar series per condition; row position follows the sorted
    # condition order and [0:-1] leaves out the trailing margin column.
    bar_series = [
        plt.bar(cluster_names, table.iloc[position][0:-1].values)
        for position in range(len(condition_names))
    ]

    plt.legend(tuple(series[0] for series in bar_series),
               tuple(condition_names))
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)

    # Explicit comparisons with False/True are kept on purpose: values such
    # as '' or None must take the same path as before.
    if save != False:
        use_default_name = (save == True) or (
            save.split('.')[-1] not in ['png', 'pdf'])
        if use_default_name:
            target = 'cluster_composition.png'
        else:
            target = '_'.join(['cluster_composition', save])
        plt.savefig(target, dpi=300, bbox_inches="tight")

    plt.show()
Example #23
Source File: random_forest.py From Speculator with MIT License | 5 votes |
def confusion_matrix(self, actual, preds):
    """Return the cross-tabulation of actual values against predictions.

    Rows are the actual values (axis name '(A)') and columns are the
    predicted values (axis name '(P)').
    """
    axis_names = {'rownames': ['(A)'], 'colnames': ['(P)']}
    matrix = crosstab(actual, preds, **axis_names)
    return matrix
Example #24
Source File: NaiveBayes.py From statistical_learning with Apache License 2.0 | 5 votes |
def __init__(self, data, lam=0):
    """Estimate label and per-feature frequency tables from `data`.

    `data` is converted to a DataFrame; its last column (looked up by the
    integer label ``n_columns - 1``) is the label and all earlier columns
    are features. `lam` is added to every count before normalizing.

    Sets
    ----
    self.y_p : label counts plus `lam`, divided by their sum.
    self.cb : one table per feature; feature-by-label counts plus `lam`,
        divided by the per-label column sums.
    """
    frame = pd.DataFrame(data)
    label_key = frame.shape[1] - 1
    target = frame[label_key]

    label_counts = target.groupby(target).count() + lam
    self.y_p = label_counts / label_counts.sum()

    tables = []
    for feature_key in range(label_key):
        smoothed = pd.crosstab(frame[feature_key], target) + lam
        tables.append(smoothed / smoothed.sum())
    self.cb = tables
Example #25
Source File: test_pivot.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_crosstab_unsorted_order(self): df = pd.DataFrame({"b": [3, 1, 2], 'a': [5, 4, 6]}, index=['C', 'A', 'B']) result = pd.crosstab(df.index, [df.b, df.a]) e_idx = pd.Index(['A', 'B', 'C'], name='row_0') e_columns = pd.MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], names=['b', 'a']) expected = pd.DataFrame([[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns) tm.assert_frame_equal(result, expected)
Example #26
Source File: test_pivot.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_crosstab_tuple_name(self, names): s1 = pd.Series(range(3), name=names[0]) s2 = pd.Series(range(1, 4), name=names[1]) mi = pd.MultiIndex.from_arrays([range(3), range(1, 4)], names=names) expected = pd.Series(1, index=mi).unstack(1, fill_value=0) result = pd.crosstab(s1, s2) tm.assert_frame_equal(result, expected)
Example #27
Source File: test_pivot.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_crosstab_dup_index_names(self): # GH 13279 s = pd.Series(range(3), name='foo') result = pd.crosstab(s, s) expected_index = pd.Index(range(3), name='foo') expected = pd.DataFrame(np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index) tm.assert_frame_equal(result, expected)
Example #28
Source File: metrics.py From reportgen with MIT License | 5 votes |
def info_value(X, y, bins='auto'):
    r"""Compute the information value (IV) between a feature and a label.

    With the feature split into groups k, and for a two-class ``y``:

        IV = \sum_k (g_k/n_g - b_k/n_b) * log2((g_k/n_g) / (b_k/n_b))

    where g_k and b_k are the counts of the two classes in group k and
    n_g, n_b are the class totals. With more than two classes, a
    one-vs-rest IV is computed per class and the results are averaged,
    weighted by each class's share of the observations.

    Parameters
    ----------
    X : pandas.Series
        Feature values. Must be numeric unless ``bins`` is None.
    y : pandas.Series
        Class labels aligned with ``X``.
    bins : str, int, sequence or None, default 'auto'
        How to discretize ``X``. A string is read as a NumPy bin-selection
        rule (as accepted by ``numpy.histogram_bin_edges``) and the
        resulting edges are passed to ``pandas.cut`` with the lowest edge
        included. An int or a sequence is passed to ``pandas.cut``
        unchanged. ``None`` uses the values of ``X`` as the groups.

    Returns
    -------
    float
        The information value. It is infinite when a group contains
        observations of only one of the two compared classes.
    """
    if bins is not None:
        if isinstance(bins, str):
            # pandas.cut rejects rule names such as 'auto', so the edges
            # are derived from the finite values with NumPy first.
            finite = np.asarray(X, dtype=float)
            finite = finite[np.isfinite(finite)]
            edges = np.histogram_bin_edges(finite, bins=bins)
            # include_lowest keeps the minimum value inside the first bin.
            X = pd.cut(X, edges, include_lowest=True)
        else:
            X = pd.cut(X, bins)

    ctable = pd.crosstab(X, y)
    # Share of each class among all observations (multi-class weights).
    p = ctable.sum() / ctable.sum().sum()

    if ctable.shape[1] == 2:
        ctable = ctable / ctable.sum()
        first, second = ctable.iloc[:, 0], ctable.iloc[:, 1]
        IV = ((first - second) * np.log2(first / second)).sum()
        return IV

    IV = 0
    for cc in ctable.columns:
        # One-vs-rest table: class cc against the sum of all other classes.
        rest = ctable.loc[:, ~(ctable.columns == cc)].sum(axis=1)
        ctable_bin = pd.concat([ctable[cc], rest], axis=1)
        ctable_bin = ctable_bin / ctable_bin.sum()
        first, second = ctable_bin.iloc[:, 0], ctable_bin.iloc[:, 1]
        IV_bin = ((first - second) * np.log2(first / second)).sum()
        IV += IV_bin * p[cc]
    return IV
# Compute the entropy of a discrete random variable
Example #29
Source File: test_pivot.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_crosstab_no_overlap(self): # GS 10291 s1 = pd.Series([1, 2, 3], index=[1, 2, 3]) s2 = pd.Series([4, 5, 6], index=[4, 5, 6]) actual = crosstab(s1, s2) expected = pd.DataFrame() tm.assert_frame_equal(actual, expected)
Example #30
Source File: test_pivot.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_crosstab_dropna(self): # GH 3820 a = np.array(['foo', 'foo', 'foo', 'bar', 'bar', 'foo', 'foo'], dtype=object) b = np.array(['one', 'one', 'two', 'one', 'two', 'two', 'two'], dtype=object) c = np.array(['dull', 'dull', 'dull', 'dull', 'dull', 'shiny', 'shiny'], dtype=object) res = pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'], dropna=False) m = MultiIndex.from_tuples([('one', 'dull'), ('one', 'shiny'), ('two', 'dull'), ('two', 'shiny')], names=['b', 'c']) tm.assert_index_equal(res.columns, m)