Python pandas.factorize() Examples
The following are 30 code examples of pandas.factorize().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: scatter.py From scprep with GNU General Public License v3.0 | 6 votes |
def c_discrete(self):
    """Return the integer-coded (discretized) form of ``c``, computing it once.

    The result is cached on ``self._c_discrete``. When the colormap is a
    dict, its keys that occur in ``self.c_unique`` become the labels and each
    value of ``self._c`` equal to a label receives that label's position.
    Otherwise the masked values are factorized in sorted order and the codes
    are written at the positions selected by ``self._mask``.
    """
    if self._c_discrete is not None:
        # Already computed on an earlier access.
        return self._c_discrete

    if isinstance(self._cmap, dict):
        self._labels = np.array(
            [key for key in self._cmap.keys() if key in self.c_unique]
        )
        self._c_discrete = np.zeros_like(self._c, dtype=int)
        for code, label in enumerate(self._labels):
            self._c_discrete[self._c == label] = code
    else:
        self._c_discrete = np.zeros_like(self._c, dtype=int)
        codes, labels = pd.factorize(self._c_masked, sort=True)
        # Entries outside the mask keep their initial value of 0.
        self._c_discrete[self._mask] = codes
        self._labels = labels
    return self._c_discrete
Example #2
Source File: sandwich_covariance.py From vnpy_crypto with MIT License | 6 votes |
def group_sums(x, group):
    '''Sum the columns of ``x`` within each group using ``np.bincount``.

    group : array, integer
        assumed to be consecutive integers

    No dtype checking is done on purpose, so that invalid input raises.
    The columns of ``x`` are processed one at a time; the result has one row
    per column of ``x`` and one entry per group label.

    #TODO: remove this, already copied to tools/grouputils
    '''
    # TODO: transpose return in group_sum, need test coverage first

    # re-label groups or bincount takes too much memory
    if np.max(group) > 2 * x.shape[0]:
        group, _ = pd.factorize(group)

    per_column = []
    for col in range(x.shape[1]):
        per_column.append(np.bincount(group, weights=x[:, col]))
    return np.array(per_column)
Example #3
Source File: feature_engineering_titanic.py From Deep-Learning-By-Example with MIT License | 6 votes |
def process_cabin():
    """Build cabin-letter and cabin-number features on the global Titanic frame.

    Reads and rebinds the module-level ``df_titanic_data`` and honours the
    module-level ``keep_binary`` / ``keep_scaled`` switches. Relies on the
    ``get_cabin_letter`` and ``get_cabin_num`` helpers defined elsewhere in
    the file.
    """
    # referring to the global variable that contains the titanic examples
    global df_titanic_data

    # replacing the missing values in the cabin variable with "U0".
    # Whole-column assignment is used instead of chained indexing
    # (df[col][mask] = ...), which may write to a temporary copy.
    df_titanic_data['Cabin'] = df_titanic_data['Cabin'].fillna('U0')

    # the cabin number is a sequence of alphanumerical digits, so we are going
    # to create some features from the alphabetical part of it
    df_titanic_data['CabinLetter'] = df_titanic_data['Cabin'].map(get_cabin_letter)
    df_titanic_data['CabinLetter'] = pd.factorize(df_titanic_data['CabinLetter'])[0]

    # binarizing the cabin letters features
    if keep_binary:
        cletters = pd.get_dummies(df_titanic_data['CabinLetter']).rename(
            columns=lambda x: 'CabinLetter_' + str(x))
        df_titanic_data = pd.concat([df_titanic_data, cletters], axis=1)

    # creating features from the numerical side of the cabin
    df_titanic_data['CabinNumber'] = df_titanic_data['Cabin'].map(get_cabin_num).astype(int) + 1

    # scaling the feature
    if keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        # Series has no reshape(); reshape the underlying ndarray into the
        # single-feature 2-D layout and flatten the result back to 1-D.
        cabin_numbers = df_titanic_data['CabinNumber'].values.reshape(-1, 1)
        df_titanic_data['CabinNumber_scaled'] = scaler_processing.fit_transform(cabin_numbers).ravel()
Example #4
Source File: test_algos.py From vnpy_crypto with MIT License | 6 votes |
def test_mixed(self):
    """Factorize a Series mixing strings, floats, NaN and inf.

    Doc example from reshaping.rst.
    """
    mixed = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])

    # Default call: uniques in order of first appearance, NaN coded as -1.
    codes, levels = algos.factorize(mixed)
    tm.assert_numpy_array_equal(
        codes, np.array([0, 0, -1, 1, 2, 3], dtype=np.intp))
    tm.assert_index_equal(levels, Index(['A', 'B', 3.14, np.inf]))

    # sort=True: the expected uniques list the numbers ahead of the strings.
    codes, levels = algos.factorize(mixed, sort=True)
    tm.assert_numpy_array_equal(
        codes, np.array([2, 2, -1, 3, 0, 1], dtype=np.intp))
    tm.assert_index_equal(levels, Index([3.14, np.inf, 'A', 'B']))
Example #5
Source File: test_algos.py From recruit with Apache License 2.0 | 6 votes |
def test_mixed(self):
    """Check factorize codes and uniques for a mixed-type Series.

    Doc example from reshaping.rst. Each scenario is
    (keyword arguments, expected codes, expected uniques).
    """
    values = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])
    scenarios = (
        ({}, [0, 0, -1, 1, 2, 3], ['A', 'B', 3.14, np.inf]),
        ({'sort': True}, [2, 2, -1, 3, 0, 1], [3.14, np.inf, 'A', 'B']),
    )
    for kwargs, want_codes, want_uniques in scenarios:
        got_codes, got_uniques = algos.factorize(values, **kwargs)
        tm.assert_numpy_array_equal(
            got_codes, np.array(want_codes, dtype=np.intp))
        tm.assert_index_equal(got_uniques, Index(want_uniques))
Example #6
Source File: feature_engineering_titanic.py From Deep-Learning-By-Example with MIT License | 6 votes |
def process_embarked():
    """Impute and integer-encode the ``Embarked`` column of the global frame.

    Reads and rebinds the module-level ``df_titanic_data``; when the
    module-level ``keep_binary`` switch is set, one ``Embarked_<code>``
    indicator column is appended per encoded value.
    """
    global df_titanic_data

    # replacing the missing values with the most common value in the variable.
    # mode() can return several values on a tie, so only the first is used;
    # assigning the whole array to the missing rows would fail on a length
    # mismatch. The column is reassigned as a whole rather than through
    # chained indexing, which may write to a temporary copy.
    most_common = df_titanic_data['Embarked'].dropna().mode()
    if not most_common.empty:
        df_titanic_data['Embarked'] = df_titanic_data['Embarked'].fillna(most_common.iloc[0])

    # converting the values into numbers
    df_titanic_data['Embarked'] = pd.factorize(df_titanic_data['Embarked'])[0]

    # binarizing the constructed features
    if keep_binary:
        embarked_dummies = pd.get_dummies(df_titanic_data['Embarked']).rename(
            columns=lambda x: 'Embarked_' + str(x))
        df_titanic_data = pd.concat([df_titanic_data, embarked_dummies], axis=1)


# Define a helper function that can use RandomForestClassifier for handling the missing values of the age variable
Example #7
Source File: test_algos.py From vnpy_crypto with MIT License | 6 votes |
def test_uint64_factorize(self):
    """Factorize arrays holding 2**63, as uint64 and as object dtype.

    Each case is (dtype, second value); the repeated first value is always
    2**63, so the expected codes are identical in both cases.
    """
    want_codes = np.array([0, 1, 0], dtype=np.intp)
    for dtype, second in ((np.uint64, 1), (object, -1)):
        values = np.array([2**63, second, 2**63], dtype=dtype)
        want_uniques = np.array([2**63, second], dtype=dtype)

        got_codes, got_uniques = algos.factorize(values)

        tm.assert_numpy_array_equal(got_codes, want_codes)
        tm.assert_numpy_array_equal(got_uniques, want_uniques)
Example #8
Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_mixed(self):
    """Verify factorize on a Series of mixed strings and floats.

    Doc example from reshaping.rst.
    """
    ser = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])

    # Without sorting.
    unsorted_codes, unsorted_uniques = algos.factorize(ser)
    want_codes = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
    want_uniques = Index(['A', 'B', 3.14, np.inf])
    tm.assert_numpy_array_equal(unsorted_codes, want_codes)
    tm.assert_index_equal(unsorted_uniques, want_uniques)

    # With sorting.
    sorted_codes, sorted_uniques = algos.factorize(ser, sort=True)
    want_codes = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
    want_uniques = Index([3.14, np.inf, 'A', 'B'])
    tm.assert_numpy_array_equal(sorted_codes, want_codes)
    tm.assert_index_equal(sorted_uniques, want_uniques)
Example #9
Source File: sandwich_covariance.py From Splunking-Crime with GNU Affero General Public License v3.0 | 6 votes |
def group_sums(x, group):
    '''Per-group column sums of ``x``, computed with ``np.bincount``.

    group : array, integer
        assumed to be consecutive integers

    There is deliberately no dtype checking, so bad input raises. Each column
    of ``x`` is summed separately; row ``j`` of the result holds the group
    sums for column ``j``.

    #TODO: remove this, already copied to tools/grouputils
    '''
    # TODO: transpose return in group_sum, need test coverage first

    # re-label groups or bincount takes too much memory
    needs_relabel = np.max(group) > 2 * x.shape[0]
    if needs_relabel:
        group = pd.factorize(group)[0]

    n_cols = x.shape[1]
    return np.array(
        [np.bincount(group, weights=x[:, j]) for j in range(n_cols)]
    )
Example #10
Source File: test_algos.py From vnpy_crypto with MIT License | 6 votes |
def test_factorize_nan(self):
    """Exercise ``Factorizer.factorize`` on object arrays containing NaN.

    Only checks that the calls do not raise; the returned ids are never
    compared with the expected arrays.
    """
    # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
    # rizer.factorize should not raise an exception if na_sentinel indexes
    # outside of reverse_indexer
    key = np.array([1, 2, 1, np.nan], dtype='O')
    rizer = ht.Factorizer(len(key))
    for sentinel in (-1, 20):
        rizer.factorize(key, sort=True, na_sentinel=sentinel)
        expected = np.array([0, 1, 0, sentinel], dtype='int32')
        assert len(set(key)) == len(set(expected))
        tm.assert_numpy_array_equal(pd.isna(key), expected == sentinel)

    # nan still maps to na_sentinel when sort=False
    key = np.array([0, np.nan, 1], dtype='O')
    sentinel = -1

    # TODO(wesm): unused?
    rizer.factorize(key, sort=False, na_sentinel=sentinel)

    expected = np.array([2, -1, 0], dtype='int32')
    assert len(set(key)) == len(set(expected))
    tm.assert_numpy_array_equal(pd.isna(key), expected == sentinel)
Example #11
Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_int64_factorize(self, writable):
    """Factorize the int64 extremes, with the input's write flag set to ``writable``."""
    int64_max = 2**63 - 1
    int64_min = -2**63
    values = np.array([int64_max, int64_min, int64_max], dtype=np.int64)
    values.setflags(write=writable)

    got_codes, got_uniques = algos.factorize(values)

    tm.assert_numpy_array_equal(
        got_codes, np.array([0, 1, 0], dtype=np.intp))
    tm.assert_numpy_array_equal(
        got_uniques, np.array([int64_max, int64_min], dtype=np.int64))
Example #12
Source File: test_ip.py From cyberpandas with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_factorize():
    """``IPArray.factorize`` must agree with ``pd.factorize`` on the object view."""
    addresses = ip.IPArray([3, 3, 1, 2, 3, _U8_MAX + 1])

    got_labels, got_uniques = addresses.factorize()
    want_labels, want_uniques = pd.factorize(addresses.astype(object))

    # Uniques stay an IPArray; compare them after casting to object.
    assert isinstance(got_uniques, ip.IPArray)
    tm.assert_numpy_array_equal(got_labels, want_labels)
    tm.assert_numpy_array_equal(got_uniques.astype(object), want_uniques)
Example #13
Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_object_factorize(self, writable):
    """Factorize an object array where None, NaN and NaT are all expected to code as -1."""
    values = np.array(
        ['a', 'c', None, np.nan, 'a', 'b', pd.NaT, 'c'], dtype=object)
    values.setflags(write=writable)

    got_codes, got_uniques = algos.factorize(values)

    want_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
    want_uniques = np.array(['a', 'c', 'b'], dtype=object)
    tm.assert_numpy_array_equal(got_codes, want_codes)
    tm.assert_numpy_array_equal(got_uniques, want_uniques)
Example #14
Source File: feature_engineering_titanic.py From Deep-Learning-By-Example with MIT License | 5 votes |
def process_fare():
    """Clean the ``Fare`` column and derive binned / scaled fare features.

    Reads and rebinds the module-level ``df_titanic_data`` and honours the
    module-level ``keep_binary``, ``keep_bins``, ``keep_scaled`` and
    ``keep_strings`` switches.
    """
    global df_titanic_data

    # handling the missing values by replacing them with the median fare.
    # The column is reassigned as a whole rather than through chained
    # indexing, which may write to a temporary copy.
    df_titanic_data['Fare'] = df_titanic_data['Fare'].fillna(df_titanic_data['Fare'].median())

    # zeros in the fare will cause some division problems so we are going to
    # set them to 1/10th of the lowest non-zero fare. Boolean masks replace
    # positional indices (np.where / Series.nonzero) used as labels, which
    # is only valid for a default RangeIndex.
    zero_fare = df_titanic_data['Fare'] == 0
    lowest_fare = df_titanic_data.loc[~zero_fare, 'Fare'].min()
    df_titanic_data.loc[zero_fare, 'Fare'] = lowest_fare / 10

    # Binarizing the features by binning them into quantiles
    df_titanic_data['Fare_bin'] = pd.qcut(df_titanic_data['Fare'], 4)
    if keep_binary:
        fare_dummies = pd.get_dummies(df_titanic_data['Fare_bin']).rename(
            columns=lambda x: 'Fare_' + str(x))
        df_titanic_data = pd.concat([df_titanic_data, fare_dummies], axis=1)

    # binning
    if keep_bins:
        df_titanic_data['Fare_bin_id'] = pd.factorize(df_titanic_data['Fare_bin'])[0] + 1

    # scaling the value
    if keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        # Series has no reshape(); reshape the underlying ndarray instead.
        fares = df_titanic_data['Fare'].values.reshape(-1, 1)
        df_titanic_data['Fare_scaled'] = scaler_processing.fit_transform(fares).ravel()

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        fare_bin_ids = df_titanic_data['Fare_bin_id'].values.reshape(-1, 1)
        df_titanic_data['Fare_bin_id_scaled'] = scaler_processing.fit_transform(fare_bin_ids).ravel()

    if not keep_strings:
        df_titanic_data.drop('Fare_bin', axis=1, inplace=True)


# Helper function for constructing features from the ticket variable
Example #15
Source File: grouputils.py From Splunking-Crime with GNU Affero General Public License v3.0 | 5 votes |
def group_sums(x, group, use_bincount=True):
    """Sum ``x`` within each group.

    Parameters
    ----------
    x : array_like
        Data to sum; a 1-D input is treated as a single column.
    group : array, integer
        Group label of each row of ``x``; assumed to be consecutive integers.
        No dtype checking is done on purpose, so that invalid input raises.
    use_bincount : bool
        If True (default), sum each column of ``x`` with ``np.bincount``.
        If False, use a plain Python loop over the unique labels (kept for
        comparison).

    Returns
    -------
    ndarray
        With ``use_bincount=True`` the result has one row per column of ``x``
        and one entry per group; with ``use_bincount=False`` it has one row
        per unique group followed by the trailing dimensions of ``x``.

    Raises
    ------
    ValueError
        If ``x`` has more than two dimensions and ``use_bincount`` is True.
    """
    x = np.asarray(x)
    # An array is required so that ``group == cat`` below is elementwise even
    # when a plain list is passed.
    group = np.asarray(group)
    if x.ndim == 1:
        x = x[:, None]
    elif x.ndim > 2 and use_bincount:
        raise ValueError('not implemented yet')

    if use_bincount:
        # re-label groups or bincount takes too much memory
        if np.max(group) > 2 * x.shape[0]:
            group = pd.factorize(group)[0]

        return np.array([np.bincount(group, weights=x[:, col])
                         for col in range(x.shape[1])])

    uniques = np.unique(group)
    result = np.zeros([len(uniques)] + list(x.shape[1:]))
    for ii, cat in enumerate(uniques):
        # The original indexed with an undefined name ``g`` here, which raised
        # NameError whenever this branch ran.
        result[ii] = x[group == cat].sum(0)
    return result
Example #16
Source File: feature_transformer.py From py_ml_utils with Apache License 2.0 | 5 votes |
def _fit_special_process(self, data, target=None):
    """Store the sorted unique values of column ``self._name`` as ``self.encoder``.

    ``target`` is accepted but not used.
    """
    column = data[self._name]
    # factorize returns (codes, uniques); only the uniques are kept.
    self.encoder = pd.factorize(column, sort=True)[1]
Example #17
Source File: feature_engineering_titanic.py From Deep-Learning-By-Example with MIT License | 5 votes |
def process_age():
    """Impute missing ages and derive child / age-bin features.

    Reads and rebinds the module-level ``df_titanic_data`` and honours the
    module-level ``keep_binary``, ``keep_bins``, ``keep_scaled`` and
    ``keep_strings`` switches. Calls ``set_missing_ages``, defined elsewhere
    in the file, before any feature is built.
    """
    global df_titanic_data

    # calling the set_missing_ages helper function to use random forest
    # regression for predicting missing values of age
    set_missing_ages()

    # # scale the age variable by centering it around the mean with a unit variance
    # if keep_scaled:
    #     scaler_preprocessing = preprocessing.StandardScaler()
    #     df_titanic_data['Age_scaled'] = scaler_preprocessing.fit_transform(df_titanic_data.Age.reshape(-1, 1))

    # construct a feature for children
    df_titanic_data['isChild'] = np.where(df_titanic_data.Age < 13, 1, 0)

    # bin into quartiles and create binary features
    df_titanic_data['Age_bin'] = pd.qcut(df_titanic_data['Age'], 4)
    if keep_binary:
        age_dummies = pd.get_dummies(df_titanic_data['Age_bin']).rename(
            columns=lambda y: 'Age_' + str(y))
        df_titanic_data = pd.concat([df_titanic_data, age_dummies], axis=1)

    if keep_bins:
        df_titanic_data['Age_bin_id'] = pd.factorize(df_titanic_data['Age_bin'])[0] + 1

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        # Series has no reshape(); reshape the underlying ndarray into the
        # single-feature 2-D layout and flatten the result back to 1-D.
        age_bin_ids = df_titanic_data['Age_bin_id'].values.reshape(-1, 1)
        df_titanic_data['Age_bin_id_scaled'] = scaler_processing.fit_transform(age_bin_ids).ravel()

    if not keep_strings:
        df_titanic_data.drop('Age_bin', axis=1, inplace=True)


# Helper function for constructing features from the passengers/crew names
Example #18
Source File: feature_engineering_titanic.py From Deep-Learning-By-Example with MIT License | 5 votes |
def process_name():
    """Derive name-length and title features from the ``Name`` column.

    Reads and rebinds the module-level ``df_titanic_data`` and honours the
    module-level ``keep_binary``, ``keep_scaled`` and ``keep_bins`` switches.
    """
    global df_titanic_data

    # getting the number of space-separated parts in each name
    df_titanic_data['Names'] = df_titanic_data['Name'].map(lambda y: len(y.split(' ')))

    # Getting titles for each person: the text between ", " and the next ".".
    # The pattern is a raw string (a plain "\." is an invalid escape) and is
    # compiled once instead of once per row.
    title_pattern = re.compile(r", (.*?)\.")
    df_titanic_data['Title'] = df_titanic_data['Name'].map(lambda y: title_pattern.findall(y)[0])

    # handling the low occurring titles. A single replace() on the column
    # avoids chained indexing (df[col][mask] = ...), which may write to a
    # temporary copy and leave the frame unchanged.
    title_groups = {
        'Jonkheer': 'Master',
        'Ms': 'Miss', 'Mlle': 'Miss',
        'Mme': 'Mrs',
        'Capt': 'Sir', 'Don': 'Sir', 'Major': 'Sir', 'Col': 'Sir',
        'Dona': 'Lady', 'the Countess': 'Lady',
    }
    df_titanic_data['Title'] = df_titanic_data['Title'].replace(title_groups)

    # binarizing all the features
    if keep_binary:
        title_dummies = pd.get_dummies(df_titanic_data['Title']).rename(
            columns=lambda x: 'Title_' + str(x))
        df_titanic_data = pd.concat([df_titanic_data, title_dummies], axis=1)

    # scaling
    if keep_scaled:
        scaler_preprocessing = preprocessing.StandardScaler()
        # Series has no reshape(); reshape the underlying ndarray instead.
        name_counts = df_titanic_data['Names'].values.reshape(-1, 1)
        df_titanic_data['Names_scaled'] = scaler_preprocessing.fit_transform(name_counts).ravel()

    # binning
    if keep_bins:
        df_titanic_data['Title_id'] = pd.factorize(df_titanic_data['Title'])[0] + 1

    if keep_bins and keep_scaled:
        scaler = preprocessing.StandardScaler()
        title_ids = df_titanic_data['Title_id'].values.reshape(-1, 1)
        df_titanic_data['Title_id_scaled'] = scaler.fit_transform(title_ids).ravel()


# Generate features from the cabin input variable
Example #19
Source File: test_ip_pandas.py From cyberpandas with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_factorize():
    """``pd.factorize`` on an IPArray yields first-appearance codes and IPArray-comparable uniques."""
    addresses = ip.IPArray([1, 1, 10, 10])

    got_labels, got_uniques = pd.factorize(addresses)

    tm.assert_numpy_array_equal(got_labels, np.array([0, 0, 1, 1]))
    assert got_uniques.equals(ip.IPArray([1, 10]))
Example #20
Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_factorize_na_sentinel(self, sort, na_sentinel):
    """The missing entry must be coded as ``na_sentinel``, with and without sorting."""
    values = np.array(['b', 'a', None, 'b'], dtype=object)
    got_labels, got_uniques = algos.factorize(
        values, sort=sort, na_sentinel=na_sentinel)

    # (codes, uniques) expected for the requested ordering.
    if sort:
        want_labels, want_uniques = [1, 0, na_sentinel, 1], ['a', 'b']
    else:
        want_labels, want_uniques = [0, 1, na_sentinel, 0], ['b', 'a']

    tm.assert_numpy_array_equal(
        got_labels, np.array(want_labels, dtype=np.intp))
    tm.assert_numpy_array_equal(
        got_uniques, np.array(want_uniques, dtype=object))
Example #21
Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_uint64_factorize(self, writable):
    """Factorize the largest uint64 value, with the input's write flag set to ``writable``."""
    uint64_max = 2**64 - 1
    values = np.array([uint64_max, 1, uint64_max], dtype=np.uint64)
    values.setflags(write=writable)

    got_codes, got_uniques = algos.factorize(values)

    tm.assert_numpy_array_equal(
        got_codes, np.array([0, 1, 0], dtype=np.intp))
    tm.assert_numpy_array_equal(
        got_uniques, np.array([uint64_max, 1], dtype=np.uint64))
Example #22
Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_float64_factorize(self, writable):
    """Factorize float64 values of very different magnitudes; uniques keep first-appearance order."""
    values = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
    values.setflags(write=writable)

    got_codes, got_uniques = algos.factorize(values)

    want_codes = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp)
    want_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64)
    tm.assert_numpy_array_equal(got_codes, want_codes)
    tm.assert_numpy_array_equal(got_uniques, want_uniques)
Example #23
Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_complex_sorting(self):
    """Sorting complex objects in factorize must raise TypeError (gh 12666 - check no segfault)."""
    complex_values = np.array([complex(i) for i in range(17)], dtype=object)
    descending = complex_values[::-1]

    with pytest.raises(TypeError):
        algos.factorize(descending, sort=True)
Example #24
Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_factorize_tuple_list(self, data, expected_label, expected_level):
    """Compare ``pd.factorize(data)`` with the supplied labels and levels (GH9454)."""
    codes, levels = pd.factorize(data)

    tm.assert_numpy_array_equal(
        codes, np.array(expected_label, dtype=np.intp))

    # Expected levels are built with the tuple-safe array helper.
    want_levels = com.asarray_tuplesafe(expected_level, dtype=object)
    tm.assert_numpy_array_equal(levels, want_levels)
Example #25
Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_basic(self):
    """Basic factorize checks for strings, ints and floats, sorted and unsorted."""
    letters = ['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']

    # Unsorted strings: only the uniques are checked.
    _, uniques = algos.factorize(letters)
    tm.assert_numpy_array_equal(
        uniques, np.array(['a', 'b', 'c'], dtype=object))

    codes, uniques = algos.factorize(letters, sort=True)
    tm.assert_numpy_array_equal(
        codes, np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp))
    tm.assert_numpy_array_equal(
        uniques, np.array(['a', 'b', 'c'], dtype=object))

    # Descending numeric input, once as ints and once as floats. Unsorted
    # codes follow first appearance; sorted codes are the reverse.
    first_seen_codes = np.array([0, 1, 2, 3, 4], dtype=np.intp)
    reversed_codes = np.array([4, 3, 2, 1, 0], dtype=np.intp)
    numeric_cases = (
        (list(reversed(range(5))), np.int64),
        (list(reversed(np.arange(5.))), np.float64),
    )
    for values, dtype in numeric_cases:
        codes, uniques = algos.factorize(values)
        tm.assert_numpy_array_equal(codes, first_seen_codes)
        tm.assert_numpy_array_equal(
            uniques, np.array([4, 3, 2, 1, 0], dtype=dtype))

        codes, uniques = algos.factorize(values, sort=True)
        tm.assert_numpy_array_equal(codes, reversed_codes)
        tm.assert_numpy_array_equal(
            uniques, np.array([0, 1, 2, 3, 4], dtype=dtype))
Example #26
Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_factorized_sort_ordered():
    """With an ordered Categorical, sorted factorize follows the category order ('c', 'b', 'a')."""
    categories = ['c', 'b', 'a']
    cat = pd.Categorical(['b', 'b', None, 'a'],
                         categories=categories, ordered=True)

    got_labels, got_uniques = pd.factorize(cat, sort=True)

    # 'b' precedes 'a' in the declared order; the missing value codes as -1.
    want_labels = np.array([0, 0, -1, 1], dtype=np.intp)
    want_uniques = pd.Categorical(['b', 'a'],
                                  categories=categories, ordered=True)
    tm.assert_numpy_array_equal(got_labels, want_labels)
    tm.assert_categorical_equal(got_uniques, want_uniques)
Example #27
Source File: test_algos.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_factorized_sort():
    """Sorted factorize of an unordered Categorical; the missing value codes as -1."""
    cat = pd.Categorical(['b', 'b', None, 'a'])

    got_labels, got_uniques = pd.factorize(cat, sort=True)

    tm.assert_numpy_array_equal(
        got_labels, np.array([1, 1, -1, 0], dtype=np.intp))
    tm.assert_categorical_equal(got_uniques, pd.Categorical(['a', 'b']))
Example #28
Source File: groupby.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_groupby_extension_no_sort(self, data_for_grouping):
    """Group by an extension-array column with ``sort=False`` and compare the group means."""
    frame = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
                          "B": data_for_grouping})
    observed = frame.groupby("B", sort=False).A.mean()

    # The expected index is the unsorted uniques of the grouping data.
    uniques = pd.factorize(data_for_grouping, sort=False)[1]
    expected = pd.Series([1, 3, 4],
                         index=pd.Index(uniques, name="B"),
                         name="A")
    self.assert_series_equal(observed, expected)
Example #29
Source File: groupby.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_groupby_extension_agg(self, as_index, data_for_grouping):
    """Group by an extension-array column and compare the means, with and without ``as_index``."""
    frame = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
                          "B": data_for_grouping})
    observed = frame.groupby("B", as_index=as_index).A.mean()

    # The expected index is the sorted uniques of the grouping data.
    uniques = pd.factorize(data_for_grouping, sort=True)[1]
    expected = pd.Series([3, 1, 4],
                         index=pd.Index(uniques, name="B"),
                         name="A")

    if not as_index:
        # as_index=False returns a frame with the group key as a column.
        self.assert_frame_equal(observed, expected.reset_index())
        return
    self.assert_series_equal(observed, expected)
Example #30
Source File: methods.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_factorize_empty(self, data):
    """Factorizing an empty slice gives empty codes and an empty array of the same type and dtype."""
    empty = data[:0]

    got_labels, got_uniques = pd.factorize(empty)

    want_labels = np.array([], dtype=np.intp)
    want_uniques = type(data)._from_sequence([], dtype=empty.dtype)
    tm.assert_numpy_array_equal(got_labels, want_labels)
    self.assert_extension_array_equal(got_uniques, want_uniques)