Python sklearn.preprocessing.LabelEncoder() Examples
The following are 30 code examples of sklearn.preprocessing.LabelEncoder(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.preprocessing, or try the search function.
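Before the project-specific examples, here is a minimal, self-contained sketch of the usual LabelEncoder workflow — fit on string labels, transform them to integer codes, and map predictions back with inverse_transform. The toy labels below are invented purely for illustration.

import numpy as np
from sklearn.preprocessing import LabelEncoder

# toy string labels, invented for illustration only
y = np.array(["cat", "dog", "dog", "fish", "cat"])

le = LabelEncoder()
y_int = le.fit_transform(y)            # integer codes, e.g. [0, 1, 1, 2, 0]
print(le.classes_)                     # sorted unique classes: ['cat' 'dog' 'fish']
print(le.transform(["fish", "cat"]))   # reuse the fitted mapping: [2, 0]
print(le.inverse_transform(y_int))     # back to the original strings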
Example #1
Source File: prototypicalNet.py From DeepResearch with MIT License | 6 votes |
def get_query_y(self, Qy, Qyc, class_label):
    """
    Returns labeled representation of classes of Query set and a list of labels.
    """
    labels = []
    m = len(Qy)
    for i in range(m):
        labels += [Qy[i]] * Qyc[i]
    labels = np.array(labels).reshape(len(labels), 1)
    label_encoder = LabelEncoder()
    Query_y = torch.Tensor(
        label_encoder.fit_transform(labels).astype(int)).long()
    if self.gpu:
        Query_y = Query_y.cuda()
    Query_y_labels = np.unique(labels)
    return Query_y, Query_y_labels
Example #2
Source File: f4_score.py From driverlessai-recipes with Apache License 2.0 | 6 votes |
def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[np.array] = None,
          **kwargs) -> float:
    lb = LabelEncoder()
    labels = lb.fit_transform(labels)
    actual = lb.transform(actual)

    method = "binary"
    if len(labels) > 2:
        predicted = np.argmax(predicted, axis=1)
        method = "micro"
    else:
        predicted = (predicted > self._threshold)

    f4_score = fbeta_score(actual, predicted, labels=labels, average=method,
                           sample_weight=sample_weight, beta=4)
    return f4_score
Example #3
Source File: feature_expansion.py From KDDCup2019_admin with MIT License | 6 votes |
def cat_onehot_encoder(df, y, col, selection=True):
    feat_x = df.values.reshape(-1, 1)

    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(feat_x)
    feat_x = le.transform(feat_x)

    mlbs = OneHotEncoder(sparse=True).fit(feat_x.reshape(-1, 1))
    from scipy.sparse import csr_matrix
    features_tmp = mlbs.transform(feat_x.reshape(-1, 1))
    features_tmp = csr_matrix(features_tmp, dtype=float).tocsr()

    models = None
    auc_score = None
    if selection is True:
        auc_score, models = train_lightgbm_for_feature_selection(features_tmp, y)
        print(col, "auc", auc_score)

    # new_feature = pd.DataFrame(features_tmp, columns=["mul_feature_" + col])
    new_feature = features_tmp

    return new_feature, mlbs, models, auc_score, le
Example #4
Source File: cost.py From driverlessai-recipes with Apache License 2.0 | 6 votes |
def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[np.array] = None,
          **kwargs) -> float:
    # label actuals as 1 or 0
    lb = LabelEncoder()
    labels = lb.fit_transform(labels)
    actual = lb.transform(actual)

    # label predictions as 1 or 0
    predicted = predicted >= self._threshold

    # use sklearn to get fp and fn
    cm = confusion_matrix(actual, predicted, sample_weight=sample_weight, labels=labels)
    tn, fp, fn, tp = cm.ravel()

    # calculate `$1*FP + $2*FN`
    return ((fp * self.__class__._fp_cost) + (fn * self.__class__._fn_cost)) / (
            tn + fp + fn + tp)  # divide by total weighted count to make loss invariant to data size
Example #5
Source File: f3_score.py From driverlessai-recipes with Apache License 2.0 | 6 votes |
def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[np.array] = None,
          **kwargs) -> float:
    lb = LabelEncoder()
    labels = lb.fit_transform(labels)
    actual = lb.transform(actual)

    method = "binary"
    if len(labels) > 2:
        predicted = np.argmax(predicted, axis=1)
        method = "micro"
    else:
        predicted = (predicted > self._threshold)

    f3_score = fbeta_score(actual, predicted, labels=labels, average=method,
                           sample_weight=sample_weight, beta=3)
    return f3_score
Example #6
Source File: amazon.py From driverlessai-recipes with Apache License 2.0 | 6 votes |
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    lb = LabelEncoder()
    lb.fit(self.labels)
    y = lb.transform(y)

    orig_cols = list(X.names)
    XX = X.to_pandas()
    params = {
        'train_dir': user_dir(),
        'allow_writing_files': False,
        'thread_count': 10,
        # 'loss_function': 'Logloss'
    }
    from catboost import CatBoostClassifier
    model = CatBoostClassifier(**params)
    model.fit(XX, y=y, sample_weight=sample_weight, verbose=False,
              cat_features=list(X[:, [str, int]].names))  # Amazon specific, also no early stopping

    # must always set best_iterations
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=model.feature_importances_,
                              iterations=0)
Example #7
Source File: test_estimator_checks.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def fit(self, X, y):
    from sklearn.preprocessing import LabelEncoder
    from sklearn.utils import compute_class_weight

    label_encoder = LabelEncoder().fit(y)
    classes = label_encoder.classes_
    class_weight = compute_class_weight(self.class_weight, classes, y)

    # Intentionally modify the balanced class_weight
    # to simulate a bug and raise an exception
    if self.class_weight == "balanced":
        class_weight += 1.

    # Simply assigning coef_ to the class_weight
    self.coef_ = class_weight
    return self
Example #8
Source File: util.py From stock-price-prediction with MIT License | 6 votes |
def preprocessData(dataset):
    le = preprocessing.LabelEncoder()

    # in case of divide-by-zero
    dataset.Open[dataset.Open == 0] = 1

    # add prediction target: next day Up/Down
    threshold = 0.000
    dataset['UpDown'] = (dataset['Close'] - dataset['Open']) / dataset['Open']
    dataset.UpDown[dataset.UpDown >= threshold] = 'Up'
    dataset.UpDown[dataset.UpDown < threshold] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
    dataset.UpDown = dataset.UpDown.shift(-1)  # shift 1, so the y is actually next day's up/down
    dataset = dataset.drop(dataset.index[-1])  # drop last one because it has no up/down value
    return dataset
Example #9
Source File: datasets.py From sato with Apache License 2.0 | 6 votes |
def __init__(self,
             df_dict: Dict[str, pd.DataFrame] = None,
             tensor_dict: Dict[str, torch.FloatTensor] = None,
             labels: List[str] = [],
             label_enc: LabelEncoder = None,
             shuffle_group: str = None):
    super().__init__(df_dict, tensor_dict, labels, label_enc)
    l = self.__len__()
    self.shuffle_group = shuffle_group
    # seeded RNG so the shuffle order is reproducible across runs
    prng = np.random.RandomState(SEED)
    self.shuffle_order = prng.permutation(l)
Example #10
Source File: datasets.py From sato with Apache License 2.0 | 6 votes |
def __init__(self,
             corpus,
             sherlock_features: List[str] = None,
             topic_feature: str = None,
             label_enc: LabelEncoder = None,
             id_filter: List[str] = None,
             max_col_count: int = None,
             shuffle_group: str = None):
    super().__init__(corpus, sherlock_features, topic_feature, label_enc,
                     id_filter, max_col_count)
    l = len(self.df_header)
    self.tempcorpus = corpus
    self.shuffle_group = shuffle_group
    # seeded RNG so the shuffle order is reproducible across runs
    self.prng = np.random.RandomState(SEED)
    self.shuffle_order = self.prng.permutation(l)
Example #11
Source File: create_transactional_data_or_convert_to_iid.py From driverlessai-recipes with Apache License 2.0 | 6 votes |
def test_transactional_to_iid():
    ret = TransactionalToIID.create_data()
    for name, X in ret.items():
        le = LabelEncoder()
        y = le.fit_transform(X[target]).ravel()
        print(name)
        print(X.head(10))
        print(X.tail(10))
        for col in X.names:
            if "_past_" in col:
                auc = roc_auc_score(y, X[col].to_numpy().ravel())
                print("%s: auc = %f" % (col, auc))
                if "leaky" not in col:
                    assert auc > 0.53  # all lags must have signal
                    assert auc < 0.8  # but not too much
                else:
                    assert auc > 0.75  # all leaky lags must have a lot of signal
Example #12
Source File: scikitlearn.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def __init__(self, estimator, dtype=float, sparse=True):
    """
    :param estimator: scikit-learn classifier object.

    :param dtype: data type used when building feature array.
        scikit-learn estimators work exclusively on numeric data. The
        default value should be fine for almost all situations.

    :param sparse: Whether to use sparse matrices internally.
        The estimator must support these; not all scikit-learn classifiers
        do (see their respective documentation and look for "sparse matrix").
        The default value is True, since most NLP problems involve sparse
        feature sets. Setting this to False may take a great amount of memory.
    :type sparse: boolean.
    """
    self._clf = estimator
    self._encoder = LabelEncoder()
    self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Example #13
Source File: loaders.py From category_encoders with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_cars_data():
    """
    Load the cars dataset, split it into X and y, and then call the label
    encoder to get an integer y column.

    :return:
    """
    df = pd.read_csv('source_data/cars/car.data.txt')
    X = df.reindex(columns=[x for x in df.columns.values if x != 'class'])
    y = df.reindex(columns=['class'])
    y = preprocessing.LabelEncoder().fit_transform(y.values.reshape(-1, ))

    mapping = [
        {'col': 'buying', 'mapping': [('vhigh', 0), ('high', 1), ('med', 2), ('low', 3)]},
        {'col': 'maint', 'mapping': [('vhigh', 0), ('high', 1), ('med', 2), ('low', 3)]},
        {'col': 'doors', 'mapping': [('2', 0), ('3', 1), ('4', 2), ('5more', 3)]},
        {'col': 'persons', 'mapping': [('2', 0), ('4', 1), ('more', 2)]},
        {'col': 'lug_boot', 'mapping': [('small', 0), ('med', 1), ('big', 2)]},
        {'col': 'safety', 'mapping': [('high', 0), ('med', 1), ('low', 2)]},
    ]

    return X, y, mapping
Example #14
Source File: test_classical_explainer.py From interpret-text with MIT License | 6 votes |
def test_explain_model_local_with_predicted_label(self):
    """
    Test for explain_local of classical explainer
    :return:
    """
    X_train, X_test, y_train, y_test = setup_mnli_test_train_split()

    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    explainer = ClassicalTextExplainer()
    classifier, best_params = explainer.fit(X_train, y_train)
    explainer.preprocessor.labelEncoder = label_encoder

    y = classifier.predict(DOCUMENT)
    predicted_label = label_encoder.inverse_transform(y)
    local_explanation = explainer.explain_local(DOCUMENT, predicted_label)

    assert len(local_explanation.local_importance_values) == len(local_explanation.features)
Example #15
Source File: loaders.py From category_encoders with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_mushroom_data():
    """
    Load the mushroom dataset, split it into X and y, and then call the label
    encoder to get an integer y column.

    :return:
    """
    df = pd.read_csv('source_data/mushrooms/agaricus-lepiota.csv')
    X = df.reindex(columns=[x for x in df.columns.values if x != 'class'])
    y = df.reindex(columns=['class'])
    y = preprocessing.LabelEncoder().fit_transform(y.values.reshape(-1, ))

    # this data is truly categorical, with no known concept of ordering
    mapping = None

    return X, y, mapping
Example #16
Source File: loaders.py From category_encoders with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_splice_data():
    """
    Load the splice dataset, split it into X and y, and then call the label
    encoder to get an integer y column.

    :return:
    """
    df = pd.read_csv('source_data/splice/splice.csv')
    X = df.reindex(columns=[x for x in df.columns.values if x != 'class'])
    X['dna'] = X['dna'].map(lambda x: list(str(x).strip()))
    for idx in range(60):
        X['dna_%d' % (idx, )] = X['dna'].map(lambda x: x[idx])
    del X['dna']

    y = df.reindex(columns=['class'])
    y = preprocessing.LabelEncoder().fit_transform(y.values.reshape(-1, ))

    # this data is truly categorical, with no known concept of ordering
    mapping = None

    return X, y, mapping
Example #17
Source File: 05_scaling_non_linear_models.py From dirty_cat with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_X_y(**kwargs):
    """simple wrapper around pd.read_csv that extracts features and labels

    Some systematic preprocessing is also carried out to avoid doing this
    transformation repeatedly in the code.
    """
    global label_encoder
    df = pd.read_csv(info['path'], sep='\t', **kwargs)
    return preprocess(df, label_encoder)


###############################################################################
# Classifier objects in |sklearn| often require :code:`y` to be integer labels.
# Additionally, |APS| requires a binary version of the labels. For these two
# purposes, we create:
#
# * a |LabelEncoder|, that we pre-fitted on the known :code:`y` classes
# * a |OneHotEncoder|, pre-fitted on the resulting integer labels.
#
# Their |transform| methods can then be called at appropriate times.
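The comment block above describes pre-fitting a LabelEncoder on the known target classes and a OneHotEncoder on the resulting integer labels. A minimal sketch of that pairing (with invented class names rather than the actual dirty_cat example data, and using the older `sparse=` keyword that this page's other examples rely on; newer scikit-learn releases use `sparse_output=` instead) might look like this:

import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

known_classes = np.array(["negative", "neutral", "positive"])  # assumed class set, for illustration

# pre-fit both encoders once on the known classes
label_encoder = LabelEncoder().fit(known_classes)
one_hot_encoder = OneHotEncoder(sparse=False).fit(
    label_encoder.transform(known_classes).reshape(-1, 1))

# later, transform any batch of labels with the already-fitted encoders
y = np.array(["positive", "negative", "positive"])
y_int = label_encoder.transform(y)                        # integer labels
y_bin = one_hot_encoder.transform(y_int.reshape(-1, 1))   # one indicator column per class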
Example #18
Source File: text_tfidf_model.py From driverlessai-recipes with Apache License 2.0 | 5 votes |
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    orig_cols = list(X.names)

    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
        model = LogisticRegression(random_state=2019)
    else:
        model = LinearRegression()

    self.tfidf_objs = []
    new_X = None
    for col in X.names:
        XX = X[:, col].to_pandas()
        XX = XX[col].astype(str).fillna("NA").values.tolist()
        tfidf_vec = TfidfVectorizer(**self.params)
        XX = tfidf_vec.fit_transform(XX)
        self.tfidf_objs.append(tfidf_vec)
        if new_X is None:
            new_X = XX
        else:
            new_X = sp.sparse.hstack([new_X, XX])

    model.fit(new_X, y)
    model = (model, self.tfidf_objs)
    self.tfidf_objs = []

    importances = [1] * len(orig_cols)
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=importances,
                              iterations=0)
Example #19
Source File: firstNCharCVTE.py From driverlessai-recipes with Apache License 2.0 | 5 votes |
def fit_transform(self, X: dt.Frame, y: np.array = None):
    self.binner = firstNChars()
    X = self.binner.fit_transform(X, 4)

    # Compute mean target (out of fold) per same string
    self.cvte = CVTargetEncodeTransformer(cat_cols=X.names)

    if self.labels is not None:
        # for classification, always turn y into numeric form, even if already integer
        y = dt.Frame(LabelEncoder().fit(self.labels).transform(y))

    X = self.cvte.fit_transform(X, y)
    return X
Example #20
Source File: feature_selection.py From driverlessai-recipes with Apache License 2.0 | 5 votes |
def get_feature_importances(data, shuffle, cats=[], seed=None):
    # Gather real features
    train_features = [f for f in data if f not in [target] + cols2ignore]

    # Shuffle target if required
    y = data[target].copy()
    if shuffle:
        y = data[target].copy().sample(frac=1.0, random_state=seed + 4)

    from h2oaicore.lightgbm_dynamic import got_cpu_lgb, got_gpu_lgb
    import lightgbm as lgbm
    if is_regression:
        model = lgbm.LGBMRegressor(random_state=seed, importance_type=importance, **lgbm_params)
    else:
        model = lgbm.LGBMClassifier(random_state=seed, importance_type=importance, **lgbm_params)
        y = LabelEncoder().fit_transform(y)

    # Fit LightGBM in RF mode, yes it's quicker than sklearn RandomForest
    model.fit(data[train_features], y, categorical_feature=cats)

    # Get feature importances
    imp_df = pd.DataFrame()
    imp_df["feature"] = list(train_features)
    imp_df["importance"] = model.feature_importances_
    return imp_df
Example #21
Source File: firstNCharCVTE.py From driverlessai-recipes with Apache License 2.0 | 5 votes |
def fit_transform(self, X: dt.Frame, y: np.array = None):
    self.binner = firstNChars()
    X = self.binner.fit_transform(X, 3)

    # Compute mean target (out of fold) per same string
    self.cvte = CVTargetEncodeTransformer(cat_cols=X.names)

    if self.labels is not None:
        # for classification, always turn y into numeric form, even if already integer
        y = dt.Frame(LabelEncoder().fit(self.labels).transform(y))

    X = self.cvte.fit_transform(X, y)
    return X
Example #22
Source File: cost_access_to_data.py From driverlessai-recipes with Apache License 2.0 | 5 votes |
def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[np.array] = None,
          X: typing.Optional[dt.Frame] = None,
          **kwargs) -> float:
    # can compute arbitrary cost from all original features
    if X is not None:
        assert X.nrows == len(actual)
        assert X.ncols >= 1
        X_pd = X.to_pandas()

    # label actuals as 1 or 0
    lb = LabelEncoder()
    labels = lb.fit_transform(labels)
    actual = lb.transform(actual)

    # label predictions as 1 or 0
    predicted = predicted >= self._threshold

    # use sklearn to get fp and fn
    cm = confusion_matrix(actual, predicted, sample_weight=sample_weight, labels=labels)
    tn, fp, fn, tp = cm.ravel()

    # calculate `$1*FP + $2*FN`
    return ((fp * self.__class__._fp_cost) + (fn * self.__class__._fn_cost)) / (
            tn + fp + fn + tp)  # divide by total weighted count to make loss invariant to data size
Example #23
Source File: cost_smooth.py From driverlessai-recipes with Apache License 2.0 | 5 votes |
def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[np.array] = None,
          **kwargs) -> float:
    lb = LabelEncoder()
    labels = list(lb.fit_transform(labels))
    actual = lb.transform(actual)

    if sample_weight is None:
        sample_weight = np.ones(actual.shape[0])

    return np.sum(((1 - actual) * predicted * self.__class__._fp_cost +
                   actual * (1 - predicted) * self.__class__._fn_cost) * sample_weight) / np.sum(sample_weight)
Example #24
Source File: huber_loss.py From driverlessai-recipes with Apache License 2.0 | 5 votes |
def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[np.array] = None,
          **kwargs) -> float:
    if sample_weight is None:
        sample_weight = np.ones(actual.shape[0])

    isRegression = labels is None
    delta = MyHuberLossScorer._delta_regression if isRegression else MyHuberLossScorer._delta_binary

    if isRegression:
        abs_error = np.abs(np.subtract(actual, predicted))
        loss = np.where(abs_error < delta, .5 * (abs_error) ** 2,
                        delta * (abs_error - 0.5 * delta))
    else:
        lb = LabelEncoder()
        labels = lb.fit_transform(labels)
        actual = lb.transform(actual)

        all0s = np.zeros(actual.shape[0])
        predicted = np.subtract(np.multiply(predicted, 2), 1)
        actual = np.where(actual == 0, -1, 1)
        actual_mult_predict = np.multiply(actual, predicted)
        loss = np.where(actual_mult_predict >= -1,
                        np.square(np.maximum(all0s, np.subtract(1, actual_mult_predict))),
                        -4 * actual_mult_predict)

    loss = np.sum(np.multiply(sample_weight, loss)) / np.sum(sample_weight)
    return float(loss) if actual.shape[0] > 0 else 0
Example #25
Source File: test_logistic.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_multinomial_logistic_regression_string_inputs():
    # Test with string labels for LogisticRegression(CV)
    n_samples, n_features, n_classes = 50, 5, 3
    X_ref, y = make_classification(n_samples=n_samples, n_features=n_features,
                                   n_classes=n_classes, n_informative=3,
                                   random_state=0)
    y_str = LabelEncoder().fit(['bar', 'baz', 'foo']).inverse_transform(y)
    # For numerical labels, let y values be taken from set (-1, 0, 1)
    y = np.array(y) - 1
    # Test for string labels
    lr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    lr_cv = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial')
    lr_str = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    lr_cv_str = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial')

    lr.fit(X_ref, y)
    lr_cv.fit(X_ref, y)
    lr_str.fit(X_ref, y_str)
    lr_cv_str.fit(X_ref, y_str)

    assert_array_almost_equal(lr.coef_, lr_str.coef_)
    assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo'])
    assert_array_almost_equal(lr_cv.coef_, lr_cv_str.coef_)
    assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo'])
    assert_equal(sorted(lr_cv_str.classes_), ['bar', 'baz', 'foo'])

    # The predictions should be in original labels
    assert_equal(sorted(np.unique(lr_str.predict(X_ref))),
                 ['bar', 'baz', 'foo'])
    assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))),
                 ['bar', 'baz', 'foo'])

    # Make sure class weights can be given with string labels
    lr_cv_str = LogisticRegression(
        solver='lbfgs', class_weight={'bar': 1, 'baz': 2, 'foo': 0},
        multi_class='multinomial').fit(X_ref, y_str)
    assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))), ['bar', 'baz'])
Example #26
Source File: test_sgd.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_multiple_fit(klass):
    # Test multiple calls of fit w/ different shaped inputs.
    clf = klass(alpha=0.01, shuffle=False)
    clf.fit(X, Y)
    assert hasattr(clf, "coef_")

    # Non-regression test: try fitting with a different label set.
    y = [["ham", "spam"][i] for i in LabelEncoder().fit_transform(Y)]
    clf.fit(X[:, :-1], y)


###############################################################################
# Regression Test Case
Example #27
Source File: test_validation.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_cross_val_predict_class_subset():
    X = np.arange(200).reshape(100, 2)
    y = np.array([x // 10 for x in range(100)])
    classes = 10

    kfold3 = KFold(n_splits=3)
    kfold4 = KFold(n_splits=4)

    le = LabelEncoder()

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        est = LogisticRegression()

        # Test with n_splits=3
        predictions = cross_val_predict(est, X, y, method=method, cv=kfold3)
        # Runs a naive loop (should be same as cross_val_predict):
        expected_predictions = get_expected_predictions(X, y, kfold3, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Test with n_splits=4
        predictions = cross_val_predict(est, X, y, method=method, cv=kfold4)
        expected_predictions = get_expected_predictions(X, y, kfold4, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Testing unordered labels
        y = shuffle(np.repeat(range(10), 10), random_state=0)
        predictions = cross_val_predict(est, X, y, method=method, cv=kfold3)
        y = le.fit_transform(y)
        expected_predictions = get_expected_predictions(X, y, kfold3, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)
Example #28
Source File: gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def _encode_y(self, y):
    # encode classes into 0 ... n_classes - 1 and set attributes classes_
    # and n_trees_per_iteration_
    check_classification_targets(y)

    label_encoder = LabelEncoder()
    encoded_y = label_encoder.fit_transform(y)
    self.classes_ = label_encoder.classes_
    n_classes = self.classes_.shape[0]
    # only 1 tree for binary classification. For multiclass classification,
    # we build 1 tree per class.
    self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes
    encoded_y = encoded_y.astype(Y_DTYPE, copy=False)
    return encoded_y
Example #29
Source File: main.py From AutoOut with MIT License | 5 votes |
def encode_data(data_frame):
    # replace every string (object dtype) column with its integer codes;
    # numeric columns are left untouched
    for column_name in data_frame.columns:
        if str(data_frame[column_name].dtype) == 'object':
            label_encoder = LabelEncoder()
            data_frame[column_name] = label_encoder.fit_transform(data_frame[column_name])
    return data_frame
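A short usage sketch for the encode_data function above (the DataFrame contents are invented purely for illustration): object-dtype columns are replaced by integer codes, numeric columns pass through unchanged.

import pandas as pd

df = pd.DataFrame({"city": ["Paris", "Oslo", "Paris"], "price": [3.5, 2.0, 4.1]})
df = encode_data(df)   # 'city' becomes [1, 0, 1] (alphabetical codes); 'price' is left as-is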
Example #30
Source File: auto_prep.py From nyaggle with MIT License | 5 votes |
def autoprep_gbdt(algorithm_type: str, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame],
                  categorical_feature_to_treat: Optional[List[str]] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    if categorical_feature_to_treat is None:
        categorical_feature_to_treat = [c for c in X_train.columns
                                        if X_train[c].dtype.name in ['object', 'category']]

    # LightGBM:
    # Can handle categorical dtype. Otherwise, int, float or bool is acceptable for categorical columns.
    # https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support
    #
    # CatBoost:
    # int, float, bool or str is acceptable for categorical columns. NaN should be filled.
    # https://catboost.ai/docs/concepts/faq.html#why-float-and-nan-values-are-forbidden-for-cat-features
    #
    # XGBoost:
    # All categorical columns should be encoded beforehand.

    if algorithm_type == 'lgbm':
        # LightGBM can handle categorical dtype natively
        categorical_feature_to_treat = [c for c in categorical_feature_to_treat
                                        if not is_categorical(X_train[c])]

    if algorithm_type == 'cat' and len(categorical_feature_to_treat) > 0:
        X_train = X_train.copy()
        X_test = X_test.copy() if X_test is not None else X_train.iloc[:1, :].copy()  # dummy
        for c in categorical_feature_to_treat:
            X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])

    if algorithm_type in ('lgbm', 'xgb') and len(categorical_feature_to_treat) > 0:
        assert X_test is not None, "X_test is required for XGBoost with categorical variables"
        X_train = X_train.copy()
        X_test = X_test.copy()
        for c in categorical_feature_to_treat:
            X_train[c], X_test[c] = _fill_na_by_unique_value(X_train[c], X_test[c])
            le = LabelEncoder()
            concat = np.concatenate([X_train[c].values, X_test[c].values])
            concat = le.fit_transform(concat)
            X_train[c] = concat[:len(X_train)]
            X_test[c] = concat[len(X_train):]

    return X_train, X_test