Python pandas.value_counts() Examples
The following are 27 code examples of pandas.value_counts(), collected from open-source projects. You can follow the link above each example to the original project or source file, or browse the other available functions and classes of the pandas module.
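Before the project examples, here is a minimal, self-contained sketch of the call the examples rely on (the sample array is illustrative). pd.value_counts returns a Series mapping each unique value to its frequency, sorted by count in descending order by default; newer pandas releases deprecate the top-level function in favor of the equivalent Series.value_counts method, which takes the same normalize, bins, and dropna options.

import numpy as np
import pandas as pd

values = np.array([3, 1, 2, 3, 4, np.nan])

# Frequency of each unique value; NaN is dropped by default and the result
# is sorted by count in descending order.
print(pd.value_counts(values))

# Variations that appear in the examples below:
print(pd.value_counts(values, normalize=True))   # relative frequencies
print(pd.value_counts(values, bins=2))           # bin numeric data before counting
print(pd.value_counts(values, dropna=False))     # keep NaN as its own entry

# Equivalent method form on a Series:
print(pd.Series(values).value_counts())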
Example #1
Source File: spatial_heatmap.py From NanoPlot with GNU General Public License v3.0 | 6 votes |
def spatial_heatmap(array, path, title=None, color="Greens", figformat="png"):
    """Taking channel information and creating post run channel activity plots."""
    logging.info("Nanoplotter: Creating heatmap of reads per channel using {} reads."
                 .format(array.size))
    activity_map = Plot(
        path=path + "." + figformat,
        title="Number of reads generated per channel")
    layout = make_layout(maxval=np.amax(array))
    valueCounts = pd.value_counts(pd.Series(array))
    for entry in valueCounts.keys():
        layout.template[np.where(layout.structure == entry)] = valueCounts[entry]
    plt.figure()
    ax = sns.heatmap(
        data=pd.DataFrame(layout.template, index=layout.yticks, columns=layout.xticks),
        xticklabels="auto",
        yticklabels="auto",
        square=True,
        cbar_kws={"orientation": "horizontal"},
        cmap=color,
        linewidths=0.20)
    ax.set_title(title or activity_map.title)
    activity_map.fig = ax.get_figure()
    activity_map.save(format=figformat)
    plt.close("all")
    return [activity_map]
Example #2
Source File: test_split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_blockwise_shufflesplit():
    splitter = dask_ml.model_selection.ShuffleSplit(random_state=0)
    assert splitter.get_n_splits() == 10
    gen = splitter.split(dX)

    train_idx, test_idx = next(gen)
    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (99,)  # 90% of 110
    assert test_idx.shape == (11,)

    assert train_idx.chunks == ((45, 45, 9),)
    assert test_idx.chunks == ((5, 5, 1),)

    counts = pd.value_counts(train_idx.compute())
    assert counts.max() == 1

    N = len(X)
    np.testing.assert_array_equal(
        np.unique(da.concatenate([train_idx, test_idx])), np.arange(N)
    )
Example #3
Source File: tagged_corpus.py From underthesea with GNU General Public License v3.0 | 6 votes |
def _analyze_field(self, df, id, output_folder=".", n_head=10):
    id = str(id)
    m = df.shape[1]
    df.columns = [str(i) for i in range(m)]
    agg_dict = dict()
    agg_dict[id] = "size"
    for i in range(int(id)):
        agg_dict[str(i)] = lambda x: ", ".join(
            pd.value_counts(x).index[:n_head])
    name_dict = dict()
    name_dict[id] = "count"
    df_analyze = df.groupby(id).agg(agg_dict).rename(
        columns=name_dict).reset_index()
    filename = join(output_folder, "column-%s-analyze.xlsx" % id)
    log = u""
    log += u"Tags : {}\n".format(df_analyze.shape[0])
    tags = df_analyze[id].to_dict().values()
    tags = sorted(tags)
    log += u"List tags : {}\n".format(u", ".join(tags))
    df_analyze.to_excel(filename, index=False)
    return log
Example #4
Source File: tc_.py From underthesea with GNU General Public License v3.0 | 6 votes |
def _analyze_field(self, df, id, output_folder=".", n_head=10):
    id = str(id)
    m = df.shape[1]
    df.columns = [str(i) for i in range(m)]
    agg_dict = dict()
    agg_dict[id] = "size"
    for i in range(int(id)):
        agg_dict[str(i)] = lambda x: ", ".join(
            pd.value_counts(x).index[:n_head])
    name_dict = dict()
    name_dict[id] = "count"
    df_analyze = df.groupby(id).agg(agg_dict).rename(
        columns=name_dict).reset_index()
    filename = join(output_folder, "column-%s-analyze.xlsx" % id)
    log = u""
    log += u"Tags : {}\n".format(df_analyze.shape[0])
    tags = df_analyze[id].to_dict().values()
    tags = sorted(tags)
    log += u"List tags : {}\n".format(u", ".join(tags))
    df_analyze.to_excel(filename, index=False)
    return log
Example #5
Source File: test_example_sleeping_giant.py From postman_problems with MIT License | 5 votes |
def test_get_shortest_paths_distances():
    df = read_edgelist(EDGELIST)
    graph = create_networkx_graph_from_edgelist(df, edge_id='id')
    odd_nodes = get_odd_nodes(graph)
    odd_node_pairs = list(itertools.combinations(odd_nodes, 2))

    # coarsely checking structure of `get_shortest_paths_distances` return value
    odd_node_pairs_shortest_paths = get_shortest_paths_distances(graph, odd_node_pairs, 'distance')
    assert len(odd_node_pairs_shortest_paths) == 630
    assert type(odd_node_pairs_shortest_paths) == dict

    # check that each node name appears the same number of times in `get_shortest_paths_distances` return value
    node_names = list(itertools.chain(*[i[0] for i in odd_node_pairs_shortest_paths.items()]))
    assert set(pd.value_counts(node_names)) == set([35])
Example #6
Source File: testPlotting.py From fitbit-analyzer with Apache License 2.0 | 5 votes |
def test_plottingOnIntradayStats(self):
    filepath = RESOURCE_PATH + "\\unittest\\test_sleep_basic01.csv"
    data1 = utils.loadIntradayData(filepath)
    filepath = RESOURCE_PATH + "\\unittest\\test_sleep_basic02.csv"
    data2 = utils.loadIntradayData(filepath)
    stats = sleepStats.generateStatsFrom([data1, data2],
                                         sleepStats.STATS_NAME_INTRADAY)
    data = stats.apply(pd.value_counts)
    mplot.plotSleepValueHeatmap(data, sleepValue=1)
Example #7
Source File: sleepStats.py From fitbit-analyzer with Apache License 2.0 | 5 votes |
def normalizedIntradayCountStats(intradayStats, limitCount=5):
    # For each minute, number of days for which we have a valid measure (record)
    notNullCount = intradayStats.count()
    # Ignore minutes where we have low level of records
    notNullCount[notNullCount < limitCount] = None
    # Count how many times each value appears for each minute
    valueCount = intradayStats.apply(pd.value_counts)
    # Normalize each minute by records count
    res = valueCount.div(notNullCount, axis=1)
    return res
Example #8
Source File: test_split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_kfold(shuffle):
    splitter = dask_ml.model_selection.KFold(
        n_splits=5, random_state=0, shuffle=shuffle
    )
    assert splitter.get_n_splits() == 5
    gen = splitter.split(dX)

    train_idx, test_idx = next(gen)
    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (88,)  # 80% of 110
    assert test_idx.shape == (22,)

    assert train_idx.chunks == ((28, 50, 10),)
    assert test_idx.chunks == ((22,),)

    counts = pd.value_counts(train_idx.compute())
    assert counts.max() == 1

    N = len(X)
    np.testing.assert_array_equal(
        np.unique(da.concatenate([train_idx, test_idx])), np.arange(N)
    )

    expected_chunks = [
        (((22, 6, 50, 10),), ((22,),)),
        (((44, 34, 10),), ((6, 16),)),
        (((50, 16, 12, 10),), ((22,),)),
        (((50, 38),), ((12, 10),)),
    ]

    for (exp_train_idx, exp_test_idx), (train_idx, test_idx) in zip(
        expected_chunks, gen
    ):
        assert train_idx.chunks == exp_train_idx
        assert test_idx.chunks == exp_test_idx
Example #9
Source File: core.py From econtools with BSD 3-Clause "New" or "Revised" License | 5 votes |
def df_cluster(n, k, cluster_id):
    g = len(pd.value_counts(cluster_id))
    df = g - 1
    vce_correct = ((n - 1) / (n - k)) * (g / (g - 1))
    return df, vce_correct, g
Example #10
Source File: prepare_dataset.py From moses with MIT License | 5 votes |
def split_dataset(dataset, seed):
    logger.info('Splitting the dataset')
    scaffolds = pd.value_counts(dataset['scaffold'])
    scaffolds = sorted(scaffolds.items(), key=lambda x: (-x[1], x[0]))
    test_scaffolds = set([x[0] for x in scaffolds[9::10]])
    dataset['SPLIT'] = 'train'
    test_scaf_idx = [x in test_scaffolds for x in dataset['scaffold']]
    dataset.loc[test_scaf_idx, 'SPLIT'] = 'test_scaffolds'
    test_idx = dataset.loc[dataset['SPLIT'] == 'train'].sample(
        frac=0.1, random_state=seed
    ).index
    dataset.loc[test_idx, 'SPLIT'] = 'test'
    dataset.drop('scaffold', axis=1, inplace=True)
    return dataset
Example #11
Source File: others.py From arche with MIT License | 5 votes |
def garbage_symbols(df: pd.DataFrame) -> Result:
    """Find unwanted symbols in `np.object` columns.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
        r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?\s*?})"
        r"|(?P<html_tags></??(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*?/??>|<!--|-->)"
    )

    errors = {}
    row_keys: Set = set()
    rule_result = Result("Garbage Symbols", items_count=len(df))

    for column in tqdm(df.select_dtypes([np.object]).columns, desc="Garbage Symbols"):
        matches = df[column].apply(str).str.extractall(garbage, flags=re.IGNORECASE)
        if not matches.empty:
            error_keys = df.loc[matches.unstack().index.values].index
            bad_texts = matches.stack().value_counts().index.sort_values().tolist()
            # escape backslashes for markdown repr, `\n > \\n`
            bad_texts = [
                f"'{codecs.encode(bx, 'unicode_escape').decode()[:20]}'"
                for bx in bad_texts
            ]
            error = (
                f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
                f"values contain `{', '.join(bad_texts)}`"
            )

            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)
    if errors:
        rule_result.add_error(
            f"{len(row_keys)/len(df) * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )

    return rule_result
Example #12
Source File: tagged_corpus.py From underthesea with GNU General Public License v3.0 | 5 votes |
def _analyze_first_token(self, df, id, output_folder="."):
    filename = join(output_folder, "column-%s-analyze.xlsx" % id)
    df_analyze = df[id].value_counts().reset_index(name="count")
    df_analyze = df_analyze.rename(columns={"index": "0"})
    df_analyze.to_excel(filename, index=False)
    log = u""
    log += u"Unique words : {}\n".format(df_analyze.shape[0])
    log += u"Top words : {}\n".format(
        u", ".join(list(df_analyze["0"].to_dict().values())[:20]))
    return log
Example #13
Source File: tc_.py From underthesea with GNU General Public License v3.0 | 5 votes |
def _analyze_first_token(self, df, id, output_folder="."):
    filename = join(output_folder, "column-%s-analyze.xlsx" % id)
    df_analyze = df[id].value_counts().reset_index(name="count")
    df_analyze = df_analyze.rename(columns={"index": "0"})
    df_analyze.to_excel(filename, index=False)
    log = u""
    log += u"Unique words : {}\n".format(df_analyze.shape[0])
    log += u"Top words : {}\n".format(
        u", ".join(list(df_analyze["0"].to_dict().values())[:20]))
    return log
Example #14
Source File: __init__.py From pandas-summary with MIT License | 5 votes |
def _get_bool_summary(self, column):
    series = self.df[column]

    stats = {}
    for class_name, class_value in dict(series.value_counts()).items():
        stats['"{}" count'.format(class_name)] = '{}'.format(class_value)
        stats['"{}" perc'.format(class_name)] = '{}'.format(
            self._percent(class_value / self.length))

    return pd.concat([pd.Series(stats, name=column), self.columns_stats[column]], sort=True)
Example #15
Source File: __init__.py From pandas-summary with MIT License | 5 votes |
def _get_categorical_summary(self, column):
    series = self.df[column]
    # Only run if at least 1 non-missing value
    value_counts = series.value_counts()
    stats = {
        'top': '{}: {}'.format(value_counts.index[0], value_counts.iloc[0]),
    }
    return pd.concat([pd.Series(stats, name=column), self.columns_stats[column]], sort=True)
Example #16
Source File: __init__.py From pandas-summary with MIT License | 5 votes |
def _get_median_absolute_deviation(self, series, multiplier=3):
    """
    Returns count of values larger than `multiplier` * `mad`

    :type series:
    :param multiplier:
    :return (array):
    """
    capped_series = np.minimum(
        series, series.median() + multiplier * series.mad())
    count = pd.value_counts(series != capped_series)
    count = count[True] if True in count else 0
    perc = self._percent(count / self.length)
    return count, perc
Example #17
Source File: __init__.py From pandas-summary with MIT License | 5 votes |
def _get_deviation_of_mean(self, series, multiplier=3):
    """
    Returns count of values deviating of the mean, i.e. larger than `multiplier` * `std`.

    :type series:
    :param multiplier:
    :return:
    """
    capped_series = np.minimum(
        series, series.mean() + multiplier * series.std())
    count = pd.value_counts(series != capped_series)
    count = count[True] if True in count else 0
    perc = self._percent(count / self.length)
    return count, perc
Example #18
Source File: __init__.py From pandas-summary with MIT License | 5 votes |
def columns_types(self):
    return pd.value_counts(self.columns_stats.loc['types'])
Example #19
Source File: test_randomness_index_map.py From vivarium with GNU General Public License v3.0 | 5 votes |
def test_hash_uniformity(map_size_and_hashed_values):
    n, h = map_size_and_hashed_values
    k = len(h)
    num_bins = k // 5  # Want about 5 items per bin for chi-squared
    bins = np.linspace(0, n + 1, num_bins)
    binned_data = pd.cut(h, bins)
    distribution = pd.value_counts(binned_data).sort_index()
    c, p = chisquare(distribution)
    assert p > 0.05, "Data not uniform"
Example #20
Source File: train_main.py From ibeis with Apache License 2.0 | 5 votes |
def class_weights(self):
    import pandas as pd
    label_freq = pd.value_counts(self.labels)
    class_weights = label_freq.median() / label_freq
    class_weights = class_weights.sort_index().values
    class_weights = torch.from_numpy(class_weights.astype(np.float32))
    return class_weights
Example #21
Source File: pairwise.py From CausalGAN with MIT License | 5 votes |
def calc_tvd(label_dict, attr):
    '''
    attr should be a 0,1 pandas dataframe with
    columns corresponding to label names

    for example:
    names=zip(*self.graph)[0]
    calc_tvd(label_dict,attr[names])

    label_dict should be a dictionary key:1d-array of samples
    '''
    ####Calculate Total Variation####
    if np.min(attr.values) < 0:
        raise ValueError('calc_tvd received \
            attr that may not have been in {0,1}')

    label_names = label_dict.keys()
    attr = attr[label_names]

    df2 = attr.drop_duplicates()
    df2 = df2.reset_index(drop=True).reset_index()
    df2 = df2.rename(columns={'index': 'ID'})
    real_data_id = pd.merge(attr, df2)
    real_counts = pd.value_counts(real_data_id['ID'])
    real_pdf = real_counts / len(attr)

    label_list_dict = {k: np.round(v.ravel()) for k, v in label_dict.items()}
    df_dat = pd.DataFrame.from_dict(label_list_dict)
    dat_id = pd.merge(df_dat, df2, on=label_names, how='left')
    dat_counts = pd.value_counts(dat_id['ID'])
    dat_pdf = dat_counts / dat_counts.sum()
    diff = real_pdf.subtract(dat_pdf, fill_value=0)
    tvd = 0.5 * diff.abs().sum()

    return tvd
Example #22
Source File: visualize.py From adversarial-policies with MIT License | 5 votes |
def _visualize_helper(
    model_dir, output_dir, subsample_rate, save_type, ordering, external_legend_params
):
    logger.info("Generating figures")

    # Data
    metadata_df = pd.read_csv(os.path.join(model_dir, "metadata.csv"))
    cluster_ids = np.load(os.path.join(model_dir, "cluster_ids.npy"))
    metadata_df["ax_1"] = cluster_ids[:, 0]
    metadata_df["ax_2"] = cluster_ids[:, 1]
    metadata_df["opponent_id"] = metadata_df["opponent_id"].apply(ABBREVIATIONS.get)

    def save_path(prefix):
        return osp.join(output_dir, f"{prefix}.{save_type}")

    counts = pd.value_counts(metadata_df["opponent_id"])
    min_counts = counts.min()
    opponent_groups = metadata_df.groupby("opponent_id")
    opponent_dfs = {name: group.sample(n=min_counts) for name, group in opponent_groups}
    opponent_dfs = [opponent_dfs[label] for label in ordering]
    metadata_df = pd.concat(opponent_dfs)

    _plot_and_save_chart(save_path("combined"), [metadata_df])
    _plot_and_save_chart(save_path("subsampled"), [metadata_df.sample(frac=subsample_rate)])
    _plot_and_save_chart(save_path("sidebyside"), opponent_dfs)

    if external_legend_params is not None:
        _external_legend(osp.join(output_dir, "external_legend.pdf"))

    logger.info("Visualization complete")
Example #23
Source File: views.py From dtale with GNU Lesser General Public License v2.1 | 4 votes |
def describe(data_id, column):
    """
    :class:`flask:flask.Flask` route which returns standard details about column data using
    :meth:`pandas:pandas.DataFrame.describe` to the front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param column: required dash separated string "START-END" stating a range of row indexes to be returned
                   to the screen
    :return: JSON {
        describe: object representing output from :meth:`pandas:pandas.Series.describe`,
        unique_data: array of unique values when data has <= 100 unique values
        success: True/False
    }
    """
    data = global_state.get_data(data_id)[[column]]
    additional_aggs = None
    curr_dtypes = global_state.get_dtypes(data_id)
    dtype = next(
        (
            dtype_info["dtype"]
            for dtype_info in curr_dtypes
            if dtype_info["name"] == column
        ),
        None,
    )
    if classify_type(dtype) in ["I", "F"]:
        additional_aggs = ["sum", "median", "mode", "var", "sem", "skew", "kurt"]
    code = build_code_export(data_id)
    desc, desc_code = load_describe(data[column], additional_aggs=additional_aggs)
    code += desc_code
    return_data = dict(describe=desc, success=True)
    uniq_vals = data[column].value_counts().sort_values(ascending=False)
    total_uniq_vals = len(uniq_vals)
    if "unique" not in return_data["describe"]:
        return_data["describe"]["unique"] = json_int(total_uniq_vals, as_string=True)

    uniq_vals.index.name = "value"
    uniq_vals.name = "count"
    uniq_vals = uniq_vals.reset_index()

    uniq_f, _ = build_formatters(uniq_vals)
    if total_uniq_vals <= 100:
        code.append("uniq_vals = data['{}'].unique()".format(column))
        return_data["uniques"] = dict(
            data=uniq_f.format_dicts(uniq_vals.itertuples()),
            total=total_uniq_vals,
            top=False,
        )
    else:  # get top 100 most common values
        return_data["uniques"] = dict(
            data=uniq_f.format_dicts(uniq_vals.head(100).itertuples()),
            total=total_uniq_vals,
            top=True,
        )
        uniq_code = "uniq_vals = data['{}'].value_counts().sort_values(ascending=False).head(100).index.values"
        code.append(uniq_code.format(column))
    return_data["code"] = "\n".join(code)
    return jsonify(return_data)
Example #24
Source File: format_utils.py From pyseqlogo with MIT License | 4 votes |
def read_alignment(infile, data_type='fasta', seq_type='dna', pseudo_count=1):
    """Read alignment file as motif

    Parameters
    ----------
    infile: str
        Path to input alignment file

    data_type: str
        'fasta', 'stockholm', etc/. as supported by Bio.AlignIO

    seq_type: str
        'dna', 'rna' or 'aa'

    pseudo_count: int
        psuedo counts to add before calculating information cotent

    Returns
    -------
    (motif, information_content) : tuple
        A motif instance followd by total informatio content of the motif
    """
    alignment = AlignIO.read(infile, data_type)
    data = []
    for aln in alignment:
        data.append([x for x in str(aln.seq)])
    df = pd.DataFrame(data)
    df_counts = df.apply(pd.value_counts, 0)
    total = df_counts[[0]].sum()
    df_counts = df_counts[df_counts.index != '-']  # Remove - from counts
    counts_dict = df_counts.to_dict(orient='index')
    counts = {}
    for key, val in counts_dict.items():
        counts[key] = list(val.values())
    return counts, total
    """
    summary_align = AlignInfo.SummaryInfo(alignment)
    if seq_type == 'dna':
        info_content = summary_align.information_content(e_freq_table = naive_freq_tables['dna'],
                                                          chars_to_ignore = ['N'],
                                                          pseudo_count = pseudo_count)
    elif seq_type == 'rna':
        info_content = summary_align.information_content(e_freq_table = naive_freq_tables['rna'],
                                                          chars_to_ignore = ['N'],
                                                          pseudo_count = pseudo_count)
    else:
        info_content = summary_align.information_content(e_freq_table = naive_freq_tables['aa'],
                                                          pseudo_count = pseudo_count)
    motif = create_motif_from_alignment(alignment)
    return (motif, summary_align.ic_vector)
    """
Example #25
Source File: others.py From arche with MIT License | 4 votes |
def compare_boolean_fields(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    err_thr: float = 0.10,
    warn_thr: float = 0.05,
) -> Result:
    """Compare booleans distribution between two dataframes

    Returns:
        A result containing dataframe with distributions and messages if differences
        are in thresholds
    """
    source_bool = source_df.select_dtypes(include="bool")
    target_bool = target_df.select_dtypes(include="bool")

    result = Result("Boolean Fields")
    if not fields_to_compare(source_bool, target_bool):
        result.outcome = Outcome.SKIPPED
        return result

    dummy = pd.DataFrame(columns=[True, False])
    source_counts = pd.concat(
        [dummy, source_bool.apply(pd.value_counts, normalize=True).T], sort=False
    ).fillna(0.0)
    target_counts = pd.concat(
        [dummy, target_bool.apply(pd.value_counts, normalize=True).T], sort=False
    ).fillna(0.0)
    difs = (source_counts - target_counts)[True]

    bool_covs = pd.concat(
        [
            source_counts.rename("{}_source".format),
            target_counts.rename("{}_target".format),
        ]
    ).sort_index()
    bool_covs.name = "Coverage for boolean fields"
    result.stats.append(bool_covs)

    err_diffs = difs[difs.abs() > err_thr]
    if not err_diffs.empty:
        result.add_error(
            f"{', '.join(err_diffs.index)} relative frequencies differ "
            f"by more than {err_thr:.0%}"
        )

    warn_diffs = difs[(difs.abs() > warn_thr) & (difs.abs() <= err_thr)]
    if not warn_diffs.empty:
        result.add_warning(
            f"{', '.join(warn_diffs.index)} relative frequencies differ by "
            f"{warn_thr:.0%}-{err_thr:.0%}"
        )

    return result
Example #26
Source File: test_general.py From modin with Apache License 2.0 | 4 votes |
def test_value_counts(normalize, bins, dropna):
    def sort_index_for_equal_values(result, ascending):
        is_range = False
        is_end = False
        i = 0
        new_index = np.empty(len(result), dtype=type(result.index))
        while i < len(result):
            j = i
            if i < len(result) - 1:
                while result[result.index[i]] == result[result.index[i + 1]]:
                    i += 1
                    if is_range is False:
                        is_range = True
                    if i == len(result) - 1:
                        is_end = True
                        break
            if is_range:
                k = j
                for val in sorted(result.index[j : i + 1], reverse=not ascending):
                    new_index[k] = val
                    k += 1
                if is_end:
                    break
                is_range = False
            else:
                new_index[j] = result.index[j]
            i += 1
        return pandas.Series(result, index=new_index)

    # We sort indices for pandas result because of issue #1650
    values = np.array([3, 1, 2, 3, 4, np.nan])

    modin_result = pd.value_counts(values, normalize=normalize, ascending=False)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, normalize=normalize, ascending=False), False
    )
    df_equals(modin_result, pandas_result)

    modin_result = pd.value_counts(values, bins=bins, ascending=False)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, bins=bins, ascending=False), False
    )
    df_equals(modin_result, pandas_result)

    modin_result = pd.value_counts(values, dropna=dropna, ascending=True)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, dropna=dropna, ascending=True), True
    )
    df_equals(modin_result, pandas_result)
Example #27
Source File: kaggle_titanic.py From stacked_generalization with Apache License 2.0 | 4 votes |
def pre_process(self, drop=True, title_to_onehot=True, norm_fare=True):
    def get_title(name):
        title_search = re.search(' ([A-Za-z]+)\.', name)
        if title_search:
            return title_search.group(1)
        return ""

    def normalize_fare(data):
        new_data = None
        for embarked in (0, 1, 2):
            temp = data[data.Embarked == embarked]
            temp['Fare'] /= temp['Fare'].values.mean()
            if new_data is None:
                new_data = temp
            else:
                new_data = pd.concat([new_data, temp])
        new_data = new_data.sort('PassengerId')
        return new_data

    data = pd.read_csv(self.file_name).replace('male', 0).replace('female', 1)
    data['Age'].fillna(data.Age.median(), inplace=True)
    data['Fare'].fillna(data.Fare.median(), inplace=True)
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['Embarked'] = data['Embarked'].replace('S', 0).replace('C', 1).replace('Q', 2)
    data['Embarked'].fillna(0, inplace=True)
    if norm_fare:
        data = normalize_fare(data)

    # Get all the titles and print how often each one occurs.
    titles = data["Name"].apply(get_title)
    print(pd.value_counts(titles))

    # Map each title to an integer. Some titles are very rare, and are compressed into the same codes as other titles.
    title_mapping = {"Dona": 1, "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
                     "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10,
                     "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
    for k, v in title_mapping.items():
        titles[titles == k] = v

    # Add in the title column.
    data['Title'] = titles
    data['Title'].fillna(1, inplace=True)
    #data['Pos'] = data["Title"] + data['Pclass']
    if drop:
        #data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Pclass', 'Cabin', 'Embarked'], axis=1)
        data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1)
        #data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Pclass', 'Title'], axis=1)
    print(data.keys())
    if title_to_onehot:
        self.encode(data, 'Title', [i for i in range(1, 11)])
        data = data.drop(['Title'], axis=1)
    return data