Python sklearn.preprocessing.OneHotEncoder() Examples
The following are 30 code examples of sklearn.preprocessing.OneHotEncoder(), collected from open-source projects. The source file, project, and license are noted above each example. For other classes and functions, see the rest of the sklearn.preprocessing module.
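Before the project-specific examples, here is a minimal sketch of the basic API these snippets exercise. It assumes the older scikit-learn interface used throughout this page (where the constructor still accepts sparse=; newer releases renamed it to sparse_output), and the toy data is made up for illustration.

import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Toy data: one string column and one integer column.
X = np.array([['Male', 1],
              ['Female', 3],
              ['Female', 2]], dtype=object)

enc = OneHotEncoder(categories='auto')   # infer categories from the data
X_onehot = enc.fit_transform(X)          # sparse matrix by default

print(enc.categories_)       # categories found for each column
print(X_onehot.toarray())    # dense 0/1 indicator columns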
Example #1
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_one_hot_encoder_categorical_features():
    X = np.array([[3, 2, 1], [0, 1, 1]])
    X2 = np.array([[1, 1, 1]])

    cat = [True, False, False]
    _check_one_hot(X, X2, cat, 4)

    # Edge case: all non-categorical
    cat = [False, False, False]
    _check_one_hot(X, X2, cat, 3)

    # Edge case: all categorical
    cat = [True, True, True]
    _check_one_hot(X, X2, cat, 5)

    # check error raised if also specifying categories
    oh = OneHotEncoder(categories=[range(3)],
                       categorical_features=[True, False, False])
    assert_raises(ValueError, oh.fit, X)
Example #2
Source File: feature_expansion.py From KDDCup2019_admin with MIT License | 6 votes |
def cat_onehot_encoder(df, y, col, selection=True):
    feat_x = df.values.reshape(-1, 1)

    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(feat_x)
    feat_x = le.transform(feat_x)

    mlbs = OneHotEncoder(sparse=True).fit(feat_x.reshape(-1, 1))

    from scipy.sparse import csr_matrix
    features_tmp = mlbs.transform(feat_x.reshape(-1, 1))
    features_tmp = csr_matrix(features_tmp, dtype=float).tocsr()

    models = None
    auc_score = None
    if selection is True:
        auc_score, models = train_lightgbm_for_feature_selection(features_tmp, y)
        print(col, "auc", auc_score)

    #new_feature = pd.DataFrame(features_tmp,columns=["mul_feature_"+col])
    new_feature = features_tmp

    return new_feature, mlbs, models, auc_score, le
Example #3
Source File: test_one_hot_encoder.py From coremltools with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_conversion_one_column_of_several(self):
    scikit_model = OneHotEncoder(categorical_features=[0])
    scikit_model.fit(copy(self.scikit_data_multiple_cols))
    spec = sklearn.convert(
        scikit_model, ["feature_1", "feature_2"], "out"
    ).get_spec()

    test_data = [
        {"feature_1": row[0], "feature_2": row[1]}
        for row in self.scikit_data_multiple_cols
    ]
    scikit_output = [
        {"out": row}
        for row in scikit_model.transform(self.scikit_data_multiple_cols).toarray()
    ]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    self.assertEquals(metrics["num_errors"], 0)
Example #4
Source File: features_binarizer_test.py From tick with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_binarizer_remove_first(self):
    """...Test binarizer fit when remove_first=True
    """
    n_cuts = 3
    one_hot_encoder = OneHotEncoder(sparse=True)
    expected_binarization = one_hot_encoder.fit_transform(
        self.default_expected_intervals)

    binarizer = FeaturesBinarizer(method='quantile', n_cuts=n_cuts,
                                  detect_column_type="auto",
                                  remove_first=True)
    binarizer.fit(self.features)
    binarized_array = binarizer.transform(self.features)
    self.assertEqual(binarized_array.__class__, csr.csr_matrix)

    expected_binarization_without_first = \
        np.delete(expected_binarization.toarray(), [0, 4, 8, 10], 1)

    np.testing.assert_array_equal(expected_binarization_without_first,
                                  binarized_array.toarray())
    return
Example #5
Source File: datasets.py From torchkit with MIT License | 6 votes |
def load_cifar10_image(root='dataset', labels=False):
    helpers.create(root, 'cifar10')
    droot = root + '/' + 'cifar10'
    if not os.path.exists('{}/cifar10.pkl'.format(droot)):
        from downloader import download_cifar10
        download_cifar10(droot)

    f = lambda d: d.astype(floatX)
    filename = '{}/cifar10.pkl'.format(droot)
    tr_x, tr_y, te_x, te_y = pickle.load(open(filename, 'r'))
    if tr_x.max() == 255:
        tr_x = tr_x / 256.
        te_x = te_x / 256.

    if labels:
        enc = OneHotEncoder(10)
        tr_y = enc.fit_transform(tr_y).toarray().reshape(50000, 10).astype(int)
        te_y = enc.fit_transform(te_y).toarray().reshape(10000, 10).astype(int)
        return (f(d) for d in [tr_x, tr_y, te_x, te_y])
    else:
        return (f(d) for d in [tr_x, te_x])
Example #6
Source File: test_one_hot_encoder.py From coremltools with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_conversion_many_columns(self):
    scikit_model = OneHotEncoder()
    scikit_model.fit(self.scikit_data_multiple_cols)
    spec = sklearn.convert(
        scikit_model, ["feature_1", "feature_2"], "out"
    ).get_spec()

    test_data = [
        {"feature_1": row[0], "feature_2": row[1]}
        for row in self.scikit_data_multiple_cols
    ]
    scikit_output = [
        {"out": row}
        for row in scikit_model.transform(self.scikit_data_multiple_cols).toarray()
    ]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    self.assertEquals(metrics["num_errors"], 0)
Example #7
Source File: forest_embedding.py From RandomForestClustering with MIT License | 6 votes |
def fit_transform(self, X, y=None, sample_weight=None):
    X = check_array(X, accept_sparse=['csc'], ensure_2d=False)
    if sp.issparse(X):
        # Pre-sort indices to avoid that each individual tree of the
        # ensemble sorts the indices.
        X.sort_indices()

    X_, y_ = generate_discriminative_dataset(X)

    super(RandomForestEmbedding, self).fit(X_, y_,
                                           sample_weight=sample_weight)

    self.one_hot_encoder_ = OneHotEncoder(sparse=True)
    if self.sparse_output:
        return self.one_hot_encoder_.fit_transform(self.apply(X))
    return self.apply(X)
Example #8
Source File: sakaguchi.py From PyShortTextCategorization with MIT License | 6 votes |
def loadmodel(self, prefix):
    """ Load the model.

    :param prefix: prefix of the model path
    :return: None
    :type prefix: str
    """
    self.dictionary = Dictionary.load(prefix + '_vocabs.gensimdict')
    parameters = json.load(open(prefix + '_config.json', 'r'))
    self.operation = parameters['operation']
    self.alph = parameters['alph']
    self.specialsignals = parameters['special_signals']
    self.binarizer = SCRNNBinarizer(self.alph, self.specialsignals)
    self.concatcharvec_encoder = SpellingToConcatCharVecEncoder(self.alph)
    self.batchsize = parameters['batchsize']
    self.nb_hiddenunits = parameters['nb_hiddenunits']
    self.onehotencoder = OneHotEncoder()
    self.onehotencoder.fit(np.arange(len(self.dictionary)).reshape((len(self.dictionary), 1)))
    self.model = kerasio.load_model(prefix)
    self.trained = True
Example #9
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_categories(density, drop):
    ohe_base = OneHotEncoder(sparse=density)
    ohe_test = OneHotEncoder(sparse=density, drop=drop)
    X = [['c', 1, 'a'],
         ['a', 2, 'b']]
    ohe_base.fit(X)
    ohe_test.fit(X)
    assert_array_equal(ohe_base.categories_, ohe_test.categories_)
    if drop == 'first':
        assert_array_equal(ohe_test.drop_idx_, 0)
    else:
        for drop_cat, drop_idx, cat_list in zip(drop,
                                                ohe_test.drop_idx_,
                                                ohe_test.categories_):
            assert cat_list[drop_idx] == drop_cat
    assert isinstance(ohe_test.drop_idx_, np.ndarray)
    assert ohe_test.drop_idx_.dtype == np.int_
Example #10
Source File: 05_scaling_non_linear_models.py From dirty_cat with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_X_y(**kwargs):
    """simple wrapper around pd.read_csv that extracts features and labels

    Some systematic preprocessing is also carried out to avoid doing this
    transformation repeatedly in the code.
    """
    global label_encoder
    df = pd.read_csv(info['path'], sep='\t', **kwargs)
    return preprocess(df, label_encoder)


###############################################################################
# Classifier objects in |sklearn| often require :code:`y` to be integer
# labels. Additionally, |APS| requires a binary version of the labels.
# For these two purposes, we create:
#
# * a |LabelEncoder|, that we pre-fitted on the known :code:`y` classes
# * a |OneHotEncoder|, pre-fitted on the resulting integer labels.
#
# Their |transform| methods can then be called at appropriate times.
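The comment block above describes pre-fitting a |LabelEncoder| on the known :code:`y` classes and a |OneHotEncoder| on the resulting integer labels. A minimal sketch of that pattern follows; the toy labels and variable names are assumptions for illustration, not taken from the dirty_cat script.

import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

y_known = np.array(['ok', 'broken', 'ok', 'worn'])   # known y classes (assumed)

label_encoder = LabelEncoder().fit(y_known)          # pre-fitted on the known classes
y_int = label_encoder.transform(y_known)             # integer labels

one_hot_encoder = OneHotEncoder(categories='auto').fit(y_int.reshape(-1, 1))

# Later, the transform methods can be called on new batches of the same classes.
y_new_int = label_encoder.transform(['worn', 'ok'])
y_new_bin = one_hot_encoder.transform(y_new_int.reshape(-1, 1)).toarray()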
Example #11
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_one_hot_encoder_drop_manual():
    cats_to_drop = ['def', 12, 3, 56]
    enc = OneHotEncoder(drop=cats_to_drop)
    X = [['abc', 12, 2, 55],
         ['def', 12, 1, 55],
         ['def', 12, 3, 56]]
    trans = enc.fit_transform(X).toarray()
    exp = [[1, 0, 1, 1],
           [0, 1, 0, 1],
           [0, 0, 0, 0]]
    assert_array_equal(trans, exp)
    dropped_cats = [cat[feature]
                    for cat, feature in zip(enc.categories_,
                                            enc.drop_idx_)]
    assert_array_equal(dropped_cats, cats_to_drop)
    assert_array_equal(np.array(X, dtype=object),
                       enc.inverse_transform(trans))
Example #12
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_encoder_dtypes_pandas():
    # check dtype (similar to test_categorical_encoder_dtypes for dataframes)
    pd = pytest.importorskip('pandas')

    enc = OneHotEncoder(categories='auto')
    exp = np.array([[1., 0., 1., 0., 1., 0.],
                    [0., 1., 0., 1., 0., 1.]], dtype='float64')

    X = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}, dtype='int64')
    enc.fit(X)
    assert all([enc.categories_[i].dtype == 'int64' for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp)

    X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
    X_type = [int, object, float]
    enc.fit(X)
    assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
    assert_array_equal(enc.transform(X).toarray(), exp)
Example #13
Source File: test_one_hot_encoder.py From coremltools with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_boston_OHE_pipeline(self):
    data = load_boston()

    for categorical_features in [[3], [8], [3, 8], [8, 3]]:
        # Put it in a pipeline so that we can test whether the output
        # dimension handling is correct.
        model = Pipeline(
            [
                ("OHE", OneHotEncoder(categorical_features=categorical_features)),
                ("Normalizer", Normalizer()),
            ]
        )
        model.fit(data.data.copy(), data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, "out").get_spec()

        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": row} for row in model.transform(data.data.copy())]

        result = evaluate_transformer(spec, input_data, output_data)

        assert result["num_errors"] == 0
Example #14
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_one_hot_encoder_force_new_behaviour():
    # ambiguous integer case (non-consecutive range of categories)
    X = np.array([[1, 2]]).T
    X2 = np.array([[0, 1]]).T

    # without argument -> by default using legacy behaviour with warnings
    enc = OneHotEncoder()
    with ignore_warnings(category=FutureWarning):
        enc.fit(X)
        res = enc.transform(X2)
    exp = np.array([[0, 0], [1, 0]])
    assert_array_equal(res.toarray(), exp)

    # with explicit auto argument -> don't use legacy behaviour
    # (so will raise an error on unseen value within range)
    enc = OneHotEncoder(categories='auto')
    enc.fit(X)
    assert_raises(ValueError, enc.transform, X2)
Example #15
Source File: test_discretization.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_encode_options():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='ordinal').fit(X)
    Xt_1 = est.transform(X)

    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot-dense').fit(X)
    Xt_2 = est.transform(X)
    assert not sp.issparse(Xt_2)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=False)
                       .fit_transform(Xt_1), Xt_2)

    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot').fit(X)
    Xt_3 = est.transform(X)
    assert sp.issparse(Xt_3)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=True)
                       .fit_transform(Xt_1).toarray(),
                       Xt_3.toarray())
Example #16
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
    enc = OneHotEncoder(categories=cats)
    exp = np.array([[1., 0., 0.],
                    [0., 1., 0.]])
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OneHotEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)

    enc = OneHotEncoder(categories=cats, handle_unknown='ignore')
    exp = np.array([[1., 0., 0.],
                    [0., 0., 0.]])
    assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp)
Example #17
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_one_hot_encoder_unsorted_categories():
    X = np.array([['a', 'b']], dtype=object).T

    enc = OneHotEncoder(categories=[['b', 'a', 'c']])
    exp = np.array([[0., 1., 0.],
                    [1., 0., 0.]])
    assert_array_equal(enc.fit(X).transform(X).toarray(), exp)
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert enc.categories_[0].tolist() == ['b', 'a', 'c']
    assert np.issubdtype(enc.categories_[0].dtype, np.object_)

    # unsorted passed categories still raise for numerical values
    X = np.array([[1, 2]]).T
    enc = OneHotEncoder(categories=[[2, 1, 3]])
    msg = 'Unsorted categories are not supported'
    with pytest.raises(ValueError, match=msg):
        enc.fit_transform(X)
Example #18
Source File: test_column_transformer.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])
    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    assert_array_equal(ct.fit_transform(X_list), expected_result)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
Example #19
Source File: bls.py From Broad-Learning-System with MIT License | 6 votes |
def __init__(self,
             maptimes=10,
             enhencetimes=10,
             map_function='linear',
             enhence_function='linear',
             batchsize='auto',
             reg=0.001):

    self._maptimes = maptimes
    self._enhencetimes = enhencetimes
    self._batchsize = batchsize
    self._reg = reg
    self._map_function = map_function
    self._enhence_function = enhence_function

    self.W = 0
    self.pesuedoinverse = 0
    self.normalscaler = scaler()
    self.onehotencoder = preprocessing.OneHotEncoder(sparse=False)
    self.mapping_generator = node_generator()
    self.enhence_generator = node_generator(whiten=True)
Example #20
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
    if as_data_frame:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X)

    ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit_transform(X)

    if as_data_frame:
        X_partial = X.iloc[:1, :]
    else:
        X_partial = X[:1, :]

    ohe.fit(X_partial)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.transform(X)
Example #21
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_encoder_dtypes():
    # check that dtypes are preserved when determining categories
    enc = OneHotEncoder(categories='auto')
    exp = np.array([[1., 0., 1., 0.],
                    [0., 1., 0., 1.]], dtype='float64')

    for X in [np.array([[1, 2], [3, 4]], dtype='int64'),
              np.array([[1, 2], [3, 4]], dtype='float64'),
              np.array([['a', 'b'], ['c', 'd']]),  # string dtype
              np.array([[1, 'a'], [3, 'b']], dtype='object')]:
        enc.fit(X)
        assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
        assert_array_equal(enc.transform(X).toarray(), exp)

    X = [[1, 2], [3, 4]]
    enc.fit(X)
    assert all([np.issubdtype(enc.categories_[i].dtype, np.integer)
                for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp)

    X = [[1, 'a'], [3, 'b']]
    enc.fit(X)
    assert all([enc.categories_[i].dtype == 'object' for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp)
Example #22
Source File: transforms.py From nussl with MIT License | 6 votes |
def __call__(self, data):
    if 'metadata' not in data:
        raise TransformException(
            f"Expected metadata in data, got {list(data.keys())}")
    if 'labels' not in data['metadata']:
        raise TransformException(
            f"Expected labels in data['metadata'], got "
            f"{list(data['metadata'].keys())}")

    enc = OneHotEncoder(categories=[data['metadata']['labels']])
    sources = data[self.source_key]
    source_keys = [k.split('::')[0] for k in list(sources.keys())]
    source_labels = [[l] for l in sorted(source_keys)]

    one_hot_labels = enc.fit_transform(source_labels)
    data['one_hot_labels'] = one_hot_labels.toarray()
    return data
Example #23
Source File: test_one_hot_encoder.py From coremltools with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_conversion_one_column(self):
    # Fit a single OHE
    scikit_model = OneHotEncoder()
    scikit_model.fit(self.scikit_data)
    spec = sklearn.convert(scikit_model, "single_feature", "out").get_spec()

    test_data = [{"single_feature": row} for row in self.scikit_data]
    scikit_output = [
        {"out": row} for row in scikit_model.transform(self.scikit_data).toarray()
    ]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    self.assertEquals(metrics["num_errors"], 0)
Example #24
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_one_hot_encoder_feature_names():
    enc = OneHotEncoder()
    X = [['Male', 1, 'girl', 2, 3],
         ['Female', 41, 'girl', 1, 10],
         ['Male', 51, 'boy', 12, 3],
         ['Male', 91, 'girl', 21, 30]]

    enc.fit(X)
    feature_names = enc.get_feature_names()
    assert isinstance(feature_names, np.ndarray)

    assert_array_equal(['x0_Female', 'x0_Male',
                        'x1_1', 'x1_41', 'x1_51', 'x1_91',
                        'x2_boy', 'x2_girl',
                        'x3_1', 'x3_2', 'x3_12', 'x3_21',
                        'x4_3', 'x4_10', 'x4_30'], feature_names)

    feature_names2 = enc.get_feature_names(['one', 'two',
                                            'three', 'four', 'five'])

    assert_array_equal(['one_Female', 'one_Male',
                        'two_1', 'two_41', 'two_51', 'two_91',
                        'three_boy', 'three_girl',
                        'four_1', 'four_2', 'four_12', 'four_21',
                        'five_3', 'five_10', 'five_30'], feature_names2)

    with pytest.raises(ValueError, match="input_features should have length"):
        enc.get_feature_names(['one', 'two'])
Example #25
Source File: test_one_hot_encoder.py From coremltools with BSD 3-Clause "New" or "Revised" License | 5 votes |
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    scikit_data = [[0], [1], [2], [4], [3], [2], [4], [5], [6], [7]]
    scikit_data_multiple_cols = [[0, 1], [1, 0], [2, 2], [3, 3], [4, 4]]
    scikit_model = OneHotEncoder()
    scikit_model.fit(scikit_data)

    # Save the data and the model
    self.scikit_data = np.asarray(scikit_data, dtype="d")
    self.scikit_data_multiple_cols = np.asarray(
        scikit_data_multiple_cols, dtype="d"
    )
    self.scikit_model = scikit_model
Example #26
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_X_is_not_1D_pandas(method):
    pd = pytest.importorskip('pandas')
    X = pd.Series([6, 3, 4, 6])
    oh = OneHotEncoder()

    msg = ("Expected 2D array, got 1D array instead")
    with pytest.raises(ValueError, match=msg):
        getattr(oh, method)(X)
Example #27
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_one_hot_encoder(X):
    Xtr = check_categorical_onehot(np.array(X)[:, [0]])
    assert_allclose(Xtr, [[0, 1], [1, 0]])

    Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
    assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]])

    Xtr = OneHotEncoder(categories='auto').fit_transform(X)
    assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])
Example #28
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_one_hot_encoder_dtype_pandas(output_dtype):
    pd = pytest.importorskip('pandas')

    X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
    X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype)

    oh = OneHotEncoder(dtype=output_dtype)
    assert_array_equal(oh.fit_transform(X_df).toarray(), X_expected)
    assert_array_equal(oh.fit(X_df).transform(X_df).toarray(), X_expected)

    oh = OneHotEncoder(dtype=output_dtype, sparse=False)
    assert_array_equal(oh.fit_transform(X_df), X_expected)
    assert_array_equal(oh.fit(X_df).transform(X_df), X_expected)
Example #29
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def check_categorical_onehot(X):
    enc = OneHotEncoder(categories='auto')
    Xtr1 = enc.fit_transform(X)

    enc = OneHotEncoder(categories='auto', sparse=False)
    Xtr2 = enc.fit_transform(X)

    assert_allclose(Xtr1.toarray(), Xtr2)

    assert sparse.isspmatrix_csr(Xtr1)
    return Xtr1.toarray()
Example #30
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_X_is_not_1D(X, method):
    oh = OneHotEncoder()

    msg = ("Expected 2D array, got 1D array instead")
    with pytest.raises(ValueError, match=msg):
        getattr(oh, method)(X)