Python sklearn.preprocessing.OneHotEncoder() Examples

The following are 30 code examples of sklearn.preprocessing.OneHotEncoder(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.preprocessing, or try the search function.
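
Before the project examples, here is a minimal, self-contained usage sketch written against a recent scikit-learn release (the handle_unknown setting and the sparse default are assumptions about the installed version; several of the project examples below target older APIs):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder expects a 2-D array: one row per sample, one column per feature.
X = np.array([['red', 'small'],
              ['blue', 'large'],
              ['red', 'large']], dtype=object)

enc = OneHotEncoder(handle_unknown='ignore')  # unseen categories encode to all zeros
X_onehot = enc.fit_transform(X)               # sparse matrix by default

print(enc.categories_)       # learned categories, one array per input column
print(X_onehot.toarray())    # dense 0/1 matrix, one output column per category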
Example #1
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_one_hot_encoder_categorical_features():
    X = np.array([[3, 2, 1], [0, 1, 1]])
    X2 = np.array([[1, 1, 1]])

    cat = [True, False, False]
    _check_one_hot(X, X2, cat, 4)

    # Edge case: all non-categorical
    cat = [False, False, False]
    _check_one_hot(X, X2, cat, 3)

    # Edge case: all categorical
    cat = [True, True, True]
    _check_one_hot(X, X2, cat, 5)

    # check error raised if also specifying categories
    oh = OneHotEncoder(categories=[range(3)],
                       categorical_features=[True, False, False])
    assert_raises(ValueError, oh.fit, X) 
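
The categorical_features argument used above was deprecated and later removed from OneHotEncoder; in current scikit-learn the same selective encoding is usually expressed with a ColumnTransformer. A hedged sketch of the equivalent setup, encoding only column 0 as in the first case above:

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X = np.array([[3, 2, 1], [0, 1, 1]])

# One-hot encode column 0 only and pass the remaining columns through unchanged,
# mirroring categorical_features=[True, False, False].
ct = ColumnTransformer([('onehot', OneHotEncoder(), [0])],
                       remainder='passthrough')
X_out = ct.fit_transform(X)   # 2 one-hot columns + 2 passthrough columns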
Example #2
Source File: feature_expansion.py    From KDDCup2019_admin with MIT License
def cat_onehot_encoder(df,y,col,selection=True):
    feat_x = df.values.reshape(-1,1)

    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()
    le.fit(feat_x)
    feat_x = le.transform(feat_x)

    mlbs = OneHotEncoder(sparse=True).fit(feat_x.reshape(-1,1))
    from scipy.sparse import csr_matrix
    features_tmp = mlbs.transform(feat_x.reshape(-1,1))
    features_tmp = csr_matrix(features_tmp,dtype=float).tocsr()
    models = None
    auc_score = None
    if selection is True:
        auc_score, models = train_lightgbm_for_feature_selection(features_tmp, y)
        print(col, "auc", auc_score)
    #new_feature = pd.DataFrame(features_tmp,columns=["mul_feature_"+col])
    new_feature = features_tmp




    return new_feature,mlbs,models,auc_score,le 
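
Two notes on the snippet above for newer scikit-learn versions: OneHotEncoder accepts string/object columns directly, so the intermediate LabelEncoder step is generally unnecessary, and the sparse flag has been renamed sparse_output (around release 1.2). A hedged sketch with a hypothetical categorical column:

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

col = pd.Series(['a', 'b', 'a', 'c'], name='cat_col')   # hypothetical categorical column

enc = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
features = enc.fit_transform(col.to_frame())            # CSR sparse matrix, one column per category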
Example #3
Source File: test_one_hot_encoder.py    From coremltools with BSD 3-Clause "New" or "Revised" License
def test_conversion_one_column_of_several(self):
        scikit_model = OneHotEncoder(categorical_features=[0])
        scikit_model.fit(copy(self.scikit_data_multiple_cols))
        spec = sklearn.convert(
            scikit_model, ["feature_1", "feature_2"], "out"
        ).get_spec()

        test_data = [
            {"feature_1": row[0], "feature_2": row[1]}
            for row in self.scikit_data_multiple_cols
        ]
        scikit_output = [
            {"out": row}
            for row in scikit_model.transform(self.scikit_data_multiple_cols).toarray()
        ]
        metrics = evaluate_transformer(spec, test_data, scikit_output)

        self.assertIsNotNone(spec)
        self.assertIsNotNone(spec.description)
        self.assertEquals(metrics["num_errors"], 0) 
Example #4
Source File: features_binarizer_test.py    From tick with BSD 3-Clause "New" or "Revised" License
def test_binarizer_remove_first(self):
        """...Test binarizer fit when remove_first=True
        """
        n_cuts = 3
        one_hot_encoder = OneHotEncoder(sparse=True)
        expected_binarization = one_hot_encoder.fit_transform(
            self.default_expected_intervals)

        binarizer = FeaturesBinarizer(method='quantile', n_cuts=n_cuts,
                                      detect_column_type="auto",
                                      remove_first=True)

        binarizer.fit(self.features)
        binarized_array = binarizer.transform(self.features)
        self.assertEqual(binarized_array.__class__, csr.csr_matrix)

        expected_binarization_without_first = \
            np.delete(expected_binarization.toarray(), [0, 4, 8, 10], 1)

        np.testing.assert_array_equal(expected_binarization_without_first,
                                      binarized_array.toarray())

        return 
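
Dropping the first indicator column of every feature, as this binarizer test does to avoid collinearity, is available directly on OneHotEncoder via drop='first'. A minimal sketch:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([['a', 'x'], ['b', 'y'], ['a', 'y']], dtype=object)

enc = OneHotEncoder(drop='first')        # drop the first category of each feature
X_enc = enc.fit_transform(X).toarray()   # 2 columns here instead of 4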
Example #5
Source File: datasets.py    From torchkit with MIT License
def load_cifar10_image(root='dataset',labels=False):
    helpers.create(root, 'cifar10')
    droot = root+'/'+'cifar10'
    
    if not os.path.exists('{}/cifar10.pkl'.format(droot)):
        from downloader import download_cifar10
        download_cifar10(droot)
    
    f = lambda d:d.astype(floatX)
    filename = '{}/cifar10.pkl'.format(droot)
    tr_x, tr_y, te_x, te_y = pickle.load(open(filename,'r'))
    if tr_x.max() == 255:
        tr_x = tr_x / 256.
        te_x = te_x / 256.
        
    if labels:
        enc = OneHotEncoder(10)
        tr_y = enc.fit_transform(tr_y).toarray().reshape(50000,10).astype(int)
        te_y = enc.fit_transform(te_y).toarray().reshape(10000,10).astype(int)
        
        return (f(d) for d in [tr_x, tr_y, te_x, te_y])   
    else:
        return (f(d) for d in [tr_x, te_x]) 
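
For one-hot encoding integer class labels as above, a common pattern with current scikit-learn is to fit the encoder on the full label range so every class gets a column even if it is absent from a batch; an identity-matrix lookup is an equivalent NumPy-only shortcut. A hedged sketch:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

y = np.array([3, 0, 7, 9])                        # hypothetical integer class labels

enc = OneHotEncoder(categories=[np.arange(10)])   # fix the 10 CIFAR-10 classes up front
y_onehot = enc.fit_transform(y.reshape(-1, 1)).toarray()

y_onehot_alt = np.eye(10, dtype=int)[y]           # equivalent without scikit-learn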
Example #6
Source File: test_one_hot_encoder.py    From coremltools with BSD 3-Clause "New" or "Revised" License
def test_conversion_many_columns(self):
        scikit_model = OneHotEncoder()
        scikit_model.fit(self.scikit_data_multiple_cols)
        spec = sklearn.convert(
            scikit_model, ["feature_1", "feature_2"], "out"
        ).get_spec()

        test_data = [
            {"feature_1": row[0], "feature_2": row[1]}
            for row in self.scikit_data_multiple_cols
        ]
        scikit_output = [
            {"out": row}
            for row in scikit_model.transform(self.scikit_data_multiple_cols).toarray()
        ]
        metrics = evaluate_transformer(spec, test_data, scikit_output)

        self.assertIsNotNone(spec)
        self.assertIsNotNone(spec.description)
        self.assertEquals(metrics["num_errors"], 0) 
Example #7
Source File: forest_embedding.py    From RandomForestClustering with MIT License
def fit_transform(self, X, y=None, sample_weight=None):
        X = check_array(X, accept_sparse=['csc'], ensure_2d=False)

        if sp.issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()

        X_, y_ = generate_discriminative_dataset(X)

        super(RandomForestEmbedding, self).fit(X_, y_,
                                               sample_weight=sample_weight)

        self.one_hot_encoder_ = OneHotEncoder(sparse=True)
        if self.sparse_output:
            return self.one_hot_encoder_.fit_transform(self.apply(X))
        return self.apply(X) 
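
scikit-learn ships a similar leaf-index embedding out of the box as sklearn.ensemble.RandomTreesEmbedding: each sample is mapped to the leaves it falls into across all trees, and those leaf ids are one-hot encoded into a sparse representation. A minimal sketch:

from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomTreesEmbedding

X, _ = make_blobs(n_samples=100, random_state=0)

embedder = RandomTreesEmbedding(n_estimators=10, random_state=0)
X_embedded = embedder.fit_transform(X)   # sparse one-hot matrix of leaf indices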
Example #8
Source File: sakaguchi.py    From PyShortTextCategorization with MIT License
def loadmodel(self, prefix):
        """ Load the model.

        :param prefix: prefix of the model path
        :return: None
        :type prefix: str
        """
        self.dictionary = Dictionary.load(prefix+'_vocabs.gensimdict')
        parameters = json.load(open(prefix+'_config.json', 'r'))
        self.operation = parameters['operation']
        self.alph = parameters['alph']
        self.specialsignals = parameters['special_signals']
        self.binarizer = SCRNNBinarizer(self.alph, self.specialsignals)
        self.concatcharvec_encoder = SpellingToConcatCharVecEncoder(self.alph)
        self.batchsize = parameters['batchsize']
        self.nb_hiddenunits = parameters['nb_hiddenunits']
        self.onehotencoder = OneHotEncoder()
        self.onehotencoder.fit(np.arange(len(self.dictionary)).reshape((len(self.dictionary), 1)))
        self.model = kerasio.load_model(prefix)
        self.trained = True 
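
The key idea in loadmodel is pre-fitting the encoder on the full index range of the vocabulary, so that every later transform yields a fixed-width one-hot vector per word. A stripped-down sketch of the same pattern with a hypothetical vocabulary size:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

vocab_size = 1000                                     # hypothetical vocabulary size

onehotencoder = OneHotEncoder()
onehotencoder.fit(np.arange(vocab_size).reshape((vocab_size, 1)))

word_indices = np.array([[5], [42], [999]])           # indices looked up in the dictionary
vectors = onehotencoder.transform(word_indices)       # sparse (3, vocab_size) matrix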
Example #9
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_categories(density, drop):
    ohe_base = OneHotEncoder(sparse=density)
    ohe_test = OneHotEncoder(sparse=density, drop=drop)
    X = [['c', 1, 'a'],
         ['a', 2, 'b']]
    ohe_base.fit(X)
    ohe_test.fit(X)
    assert_array_equal(ohe_base.categories_, ohe_test.categories_)
    if drop == 'first':
        assert_array_equal(ohe_test.drop_idx_, 0)
    else:
        for drop_cat, drop_idx, cat_list in zip(drop,
                                                ohe_test.drop_idx_,
                                                ohe_test.categories_):
            assert cat_list[drop_idx] == drop_cat
    assert isinstance(ohe_test.drop_idx_, np.ndarray)
    assert ohe_test.drop_idx_.dtype == np.int_ 
Example #10
Source File: 05_scaling_non_linear_models.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def get_X_y(**kwargs):
    """simple wrapper around pd.read_csv that extracts features and labels

    Some systematic preprocessing is also carried out to avoid doing this
    transformation repeatedly in the code.
    """
    global label_encoder
    df = pd.read_csv(info['path'], sep='\t', **kwargs)
    return preprocess(df, label_encoder)

###############################################################################
# Classifier objects in |sklearn| often require :code:`y` to be integer labels.
# Additionally, |APS| requires a binary version of the labels.  For these two
# purposes, we create:
#
# * a |LabelEncoder|, that we pre-fitted on the known :code:`y` classes
# * a |OneHotEncoder|, pre-fitted on the resulting integer labels.
#
# Their |transform| methods can then be called at appropriate times.
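
A hedged sketch of that pre-fitting pattern (the class names and data here are illustrative, not the actual dirty_cat example code):

import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

y_classes = ['functional', 'non functional', 'needs repair']   # hypothetical known classes

label_encoder = LabelEncoder().fit(y_classes)
one_hot_encoder = OneHotEncoder().fit(
    label_encoder.transform(y_classes).reshape(-1, 1))

y = ['functional', 'needs repair', 'functional']
y_int = label_encoder.transform(y)                        # integer labels for the classifier
y_bin = one_hot_encoder.transform(y_int.reshape(-1, 1))   # binary indicator matrix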
Example #11
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_one_hot_encoder_drop_manual():
    cats_to_drop = ['def', 12, 3, 56]
    enc = OneHotEncoder(drop=cats_to_drop)
    X = [['abc', 12, 2, 55],
         ['def', 12, 1, 55],
         ['def', 12, 3, 56]]
    trans = enc.fit_transform(X).toarray()
    exp = [[1, 0, 1, 1],
           [0, 1, 0, 1],
           [0, 0, 0, 0]]
    assert_array_equal(trans, exp)
    dropped_cats = [cat[feature]
                    for cat, feature in zip(enc.categories_,
                                            enc.drop_idx_)]
    assert_array_equal(dropped_cats, cats_to_drop)
    assert_array_equal(np.array(X, dtype=object),
                       enc.inverse_transform(trans)) 
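
As the last assertion shows, the encoding is invertible; inverse_transform recovers the original categories from an encoded matrix. A minimal sketch:

from sklearn.preprocessing import OneHotEncoder

X = [['abc', 55], ['def', 56]]

enc = OneHotEncoder()
X_enc = enc.fit_transform(X).toarray()
X_back = enc.inverse_transform(X_enc)   # array([['abc', 55], ['def', 56]], dtype=object)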
Example #12
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_encoder_dtypes_pandas():
    # check dtype (similar to test_categorical_encoder_dtypes for dataframes)
    pd = pytest.importorskip('pandas')

    enc = OneHotEncoder(categories='auto')
    exp = np.array([[1., 0., 1., 0., 1., 0.],
                    [0., 1., 0., 1., 0., 1.]], dtype='float64')

    X = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}, dtype='int64')
    enc.fit(X)
    assert all([enc.categories_[i].dtype == 'int64' for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp)

    X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
    X_type = [int, object, float]
    enc.fit(X)
    assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
    assert_array_equal(enc.transform(X).toarray(), exp) 
Example #13
Source File: test_one_hot_encoder.py    From coremltools with BSD 3-Clause "New" or "Revised" License
def test_boston_OHE_pipeline(self):
        data = load_boston()

        for categorical_features in [[3], [8], [3, 8], [8, 3]]:
            # Put it in a pipeline so that we can test whether the output dimension
            # handling is correct.

            model = Pipeline(
                [
                    ("OHE", OneHotEncoder(categorical_features=categorical_features)),
                    ("Normalizer", Normalizer()),
                ]
            )

            model.fit(data.data.copy(), data.target)

            # Convert the model
            spec = sklearn.convert(model, data.feature_names, "out").get_spec()

            input_data = [dict(zip(data.feature_names, row)) for row in data.data]
            output_data = [{"out": row} for row in model.transform(data.data.copy())]

            result = evaluate_transformer(spec, input_data, output_data)

            assert result["num_errors"] == 0 
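
With categorical_features removed from current releases (and load_boston removed from recent scikit-learn as well), the equivalent pipeline is usually written with a ColumnTransformer in front of the Normalizer. A hedged sketch using the same column indices:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, OneHotEncoder

categorical_features = [3, 8]   # column indices to one-hot encode

model = Pipeline([
    ("OHE", ColumnTransformer(
        [("onehot", OneHotEncoder(handle_unknown="ignore"), categorical_features)],
        remainder="passthrough")),
    ("Normalizer", Normalizer()),
])
# model.fit_transform(X), for a feature matrix X with at least 9 columns,
# one-hot encodes columns 3 and 8 and then normalizes each row.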
Example #14
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_one_hot_encoder_force_new_behaviour():
    # ambiguous integer case (non-consecutive range of categories)
    X = np.array([[1, 2]]).T
    X2 = np.array([[0, 1]]).T

    # without argument -> by default using legacy behaviour with warnings
    enc = OneHotEncoder()

    with ignore_warnings(category=FutureWarning):
        enc.fit(X)

    res = enc.transform(X2)
    exp = np.array([[0, 0], [1, 0]])
    assert_array_equal(res.toarray(), exp)

    # with explicit auto argument -> don't use legacy behaviour
    # (so will raise an error on unseen value within range)
    enc = OneHotEncoder(categories='auto')
    enc.fit(X)
    assert_raises(ValueError, enc.transform, X2) 
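
On current releases the legacy integer behaviour is gone entirely; whether an unseen value raises or is silently ignored is controlled by handle_unknown. A minimal sketch:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([[1], [2]])
X_new = np.array([[0], [1]])                 # 0 was never seen during fit

enc = OneHotEncoder(handle_unknown='ignore').fit(X)
print(enc.transform(X_new).toarray())        # [[0. 0.], [1. 0.]] -- unseen value -> all zeros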
Example #15
Source File: test_discretization.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_encode_options():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='ordinal').fit(X)
    Xt_1 = est.transform(X)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot-dense').fit(X)
    Xt_2 = est.transform(X)
    assert not sp.issparse(Xt_2)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=False)
                       .fit_transform(Xt_1), Xt_2)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot').fit(X)
    Xt_3 = est.transform(X)
    assert sp.issparse(Xt_3)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=True)
                       .fit_transform(Xt_1).toarray(),
                       Xt_3.toarray()) 
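
In application code the same combination is usually requested directly from KBinsDiscretizer rather than chaining a separate OneHotEncoder. A minimal sketch:

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.array([[-2., 1.], [-1., 2.], [0., 3.], [1., 4.]])

# Bin each column into 2 quantile bins and one-hot encode the bin ids in one step.
est = KBinsDiscretizer(n_bins=2, encode='onehot-dense')
Xt = est.fit_transform(X)   # dense 0/1 array, n_bins columns per feature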
Example #16
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
    enc = OneHotEncoder(categories=cats)
    exp = np.array([[1., 0., 0.],
                    [0., 1., 0.]])
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OneHotEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)
    enc = OneHotEncoder(categories=cats, handle_unknown='ignore')
    exp = np.array([[1., 0., 0.], [0., 0., 0.]])
    assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp) 
Example #17
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_one_hot_encoder_unsorted_categories():
    X = np.array([['a', 'b']], dtype=object).T

    enc = OneHotEncoder(categories=[['b', 'a', 'c']])
    exp = np.array([[0., 1., 0.],
                    [1., 0., 0.]])
    assert_array_equal(enc.fit(X).transform(X).toarray(), exp)
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert enc.categories_[0].tolist() == ['b', 'a', 'c']
    assert np.issubdtype(enc.categories_[0].dtype, np.object_)

    # unsorted passed categories still raise for numerical values
    X = np.array([[1, 2]]).T
    enc = OneHotEncoder(categories=[[2, 1, 3]])
    msg = 'Unsorted categories are not supported'
    with pytest.raises(ValueError, match=msg):
        enc.fit_transform(X) 
Example #18
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])

    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    assert_array_equal(ct.fit_transform(X_list), expected_result)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected_result) 
Example #19
Source File: bls.py    From Broad-Learning-System with MIT License
def __init__(self, 
                 maptimes = 10, 
                 enhencetimes = 10,
                 map_function = 'linear',
                 enhence_function = 'linear',
                 batchsize = 'auto', 
                 reg = 0.001):
        
        self._maptimes = maptimes
        self._enhencetimes = enhencetimes
        self._batchsize = batchsize
        self._reg = reg
        self._map_function = map_function
        self._enhence_function = enhence_function
        
        self.W = 0
        self.pesuedoinverse = 0
        self.normalscaler = scaler()
        self.onehotencoder = preprocessing.OneHotEncoder(sparse = False)
        self.mapping_generator = node_generator()
        self.enhence_generator = node_generator(whiten = True) 
Example #20
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
    if as_data_frame:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X)

    ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit_transform(X)

    if as_data_frame:
        X_partial = X.iloc[:1, :]
    else:
        X_partial = X[:1, :]

    ohe.fit(X_partial)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.transform(X) 
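
If the data can contain missing values, one common route that works across scikit-learn versions is to impute before encoding (depending on the installed version, the encoders may also offer some native missing-value handling; check its changelog). A hedged sketch:

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

X = np.array([['a'], [np.nan], ['b']], dtype=object)

# Replace missing entries with the most frequent category, then one-hot encode.
pipe = make_pipeline(SimpleImputer(strategy='most_frequent'),
                     OneHotEncoder(handle_unknown='ignore'))
X_enc = pipe.fit_transform(X)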
Example #21
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_encoder_dtypes():
    # check that dtypes are preserved when determining categories
    enc = OneHotEncoder(categories='auto')
    exp = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]], dtype='float64')

    for X in [np.array([[1, 2], [3, 4]], dtype='int64'),
              np.array([[1, 2], [3, 4]], dtype='float64'),
              np.array([['a', 'b'], ['c', 'd']]),  # string dtype
              np.array([[1, 'a'], [3, 'b']], dtype='object')]:
        enc.fit(X)
        assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
        assert_array_equal(enc.transform(X).toarray(), exp)

    X = [[1, 2], [3, 4]]
    enc.fit(X)
    assert all([np.issubdtype(enc.categories_[i].dtype, np.integer)
                for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp)

    X = [[1, 'a'], [3, 'b']]
    enc.fit(X)
    assert all([enc.categories_[i].dtype == 'object' for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp) 
Example #22
Source File: transforms.py    From nussl with MIT License
def __call__(self, data):
        if 'metadata' not in data:
            raise TransformException(
                f"Expected metadata in data, got {list(data.keys())}")
        if 'labels' not in data['metadata']:
            raise TransformException(
                f"Expected labels in data['metadata'], got "
                f"{list(data['metadata'].keys())}")

        enc = OneHotEncoder(categories=[data['metadata']['labels']])

        sources = data[self.source_key]
        source_keys = [k.split('::')[0] for k in list(sources.keys())]
        source_labels = [[l] for l in sorted(source_keys)]

        one_hot_labels = enc.fit_transform(source_labels)
        data['one_hot_labels'] = one_hot_labels.toarray()

        return data 
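
Passing categories explicitly, as this transform does with the dataset's label list, fixes both the number and the order of the output columns regardless of which labels appear in a particular batch. A minimal sketch with hypothetical labels:

from sklearn.preprocessing import OneHotEncoder

labels = ['bass', 'drums', 'vocals']                      # hypothetical full label set

enc = OneHotEncoder(categories=[labels])
one_hot = enc.fit_transform([['drums'], ['bass']]).toarray()
# Always 3 columns, ordered as in `labels`, even though only 2 labels appear here.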
Example #23
Source File: test_one_hot_encoder.py    From coremltools with BSD 3-Clause "New" or "Revised" License
def test_conversion_one_column(self):
        # Fit a single OHE
        scikit_model = OneHotEncoder()
        scikit_model.fit(self.scikit_data)
        spec = sklearn.convert(scikit_model, "single_feature", "out").get_spec()

        test_data = [{"single_feature": row} for row in self.scikit_data]
        scikit_output = [
            {"out": row} for row in scikit_model.transform(self.scikit_data).toarray()
        ]
        metrics = evaluate_transformer(spec, test_data, scikit_output)

        self.assertIsNotNone(spec)
        self.assertIsNotNone(spec.description)
        self.assertEquals(metrics["num_errors"], 0) 
Example #24
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_one_hot_encoder_feature_names():
    enc = OneHotEncoder()
    X = [['Male', 1, 'girl', 2, 3],
         ['Female', 41, 'girl', 1, 10],
         ['Male', 51, 'boy', 12, 3],
         ['Male', 91, 'girl', 21, 30]]

    enc.fit(X)
    feature_names = enc.get_feature_names()
    assert isinstance(feature_names, np.ndarray)

    assert_array_equal(['x0_Female', 'x0_Male',
                        'x1_1', 'x1_41', 'x1_51', 'x1_91',
                        'x2_boy', 'x2_girl',
                        'x3_1', 'x3_2', 'x3_12', 'x3_21',
                        'x4_3',
                        'x4_10', 'x4_30'], feature_names)

    feature_names2 = enc.get_feature_names(['one', 'two',
                                            'three', 'four', 'five'])

    assert_array_equal(['one_Female', 'one_Male',
                        'two_1', 'two_41', 'two_51', 'two_91',
                        'three_boy', 'three_girl',
                        'four_1', 'four_2', 'four_12', 'four_21',
                        'five_3', 'five_10', 'five_30'], feature_names2)

    with pytest.raises(ValueError, match="input_features should have length"):
        enc.get_feature_names(['one', 'two']) 
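
In recent scikit-learn releases get_feature_names has been replaced by get_feature_names_out (the old name was removed around 1.2). A minimal sketch of the current call:

from sklearn.preprocessing import OneHotEncoder

X = [['Male', 'girl'], ['Female', 'boy']]

enc = OneHotEncoder().fit(X)
print(enc.get_feature_names_out())                    # ['x0_Female' 'x0_Male' 'x1_boy' 'x1_girl']
print(enc.get_feature_names_out(['gender', 'word']))  # ['gender_Female' 'gender_Male' 'word_boy' 'word_girl']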
Example #25
Source File: test_one_hot_encoder.py    From coremltools with BSD 3-Clause "New" or "Revised" License
def setUpClass(self):
        """
        Set up the unit test by loading the dataset and training a model.
        """
        scikit_data = [[0], [1], [2], [4], [3], [2], [4], [5], [6], [7]]
        scikit_data_multiple_cols = [[0, 1], [1, 0], [2, 2], [3, 3], [4, 4]]
        scikit_model = OneHotEncoder()
        scikit_model.fit(scikit_data)

        # Save the data and the model
        self.scikit_data = np.asarray(scikit_data, dtype="d")
        self.scikit_data_multiple_cols = np.asarray(
            scikit_data_multiple_cols, dtype="d"
        )
        self.scikit_model = scikit_model 
Example #26
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_X_is_not_1D_pandas(method):
    pd = pytest.importorskip('pandas')
    X = pd.Series([6, 3, 4, 6])
    oh = OneHotEncoder()

    msg = ("Expected 2D array, got 1D array instead")
    with pytest.raises(ValueError, match=msg):
        getattr(oh, method)(X) 
Example #27
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_one_hot_encoder(X):
    Xtr = check_categorical_onehot(np.array(X)[:, [0]])
    assert_allclose(Xtr, [[0, 1], [1, 0]])

    Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
    assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]])

    Xtr = OneHotEncoder(categories='auto').fit_transform(X)
    assert_allclose(Xtr.toarray(), [[0, 1, 1, 0,  1], [1, 0, 0, 1, 1]]) 
Example #28
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_one_hot_encoder_dtype_pandas(output_dtype):
    pd = pytest.importorskip('pandas')

    X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
    X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype)

    oh = OneHotEncoder(dtype=output_dtype)
    assert_array_equal(oh.fit_transform(X_df).toarray(), X_expected)
    assert_array_equal(oh.fit(X_df).transform(X_df).toarray(), X_expected)

    oh = OneHotEncoder(dtype=output_dtype, sparse=False)
    assert_array_equal(oh.fit_transform(X_df), X_expected)
    assert_array_equal(oh.fit(X_df).transform(X_df), X_expected) 
Example #29
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def check_categorical_onehot(X):
    enc = OneHotEncoder(categories='auto')
    Xtr1 = enc.fit_transform(X)

    enc = OneHotEncoder(categories='auto', sparse=False)
    Xtr2 = enc.fit_transform(X)

    assert_allclose(Xtr1.toarray(), Xtr2)

    assert sparse.isspmatrix_csr(Xtr1)
    return Xtr1.toarray() 
Example #30
Source File: test_encoders.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_X_is_not_1D(X, method):
    oh = OneHotEncoder()

    msg = ("Expected 2D array, got 1D array instead")
    with pytest.raises(ValueError, match=msg):
        getattr(oh, method)(X)
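
The usual fix for this error is to reshape the 1-D data into a single feature column before encoding. A minimal sketch:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

x = np.array([6, 3, 4, 6])                    # 1-D input would raise ValueError

enc = OneHotEncoder()
X_enc = enc.fit_transform(x.reshape(-1, 1))   # reshape to a single column first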