Python sklearn.preprocessing.Normalizer() Examples
The following are 30 code examples of sklearn.preprocessing.Normalizer(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.preprocessing, or try the search function.
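Before diving into the examples, it may help to see what Normalizer itself does: it rescales each sample (row) independently to unit norm, in contrast to scalers such as StandardScaler, which operate per feature (column). A minimal sketch of this behavior (the toy array below is purely illustrative, not drawn from any example on this page):

import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[4.0, 3.0],
              [1.0, 1.0]])

# norm='l2' (the default) divides each row by its Euclidean length,
# so [4, 3] becomes [0.8, 0.6].
print(Normalizer(norm='l2').fit_transform(X))

# norm='l1' divides by the sum of absolute values; norm='max' divides by
# the largest absolute value in the row.
print(Normalizer(norm='l1').fit_transform(X))
print(Normalizer(norm='max').fit_transform(X))

Because Normalizer is stateless (fit() learns nothing from the data), it appears in many of the pipelines below purely as a per-row rescaling step.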
Example #1
Source File: train.py From skorch with BSD 3-Clause "New" or "Revised" License | 7 votes |
def get_model(with_pipeline=False):
    """Get a multi-layer perceptron model.

    Optionally, put it in a pipeline that scales the data.

    """
    model = NeuralNetClassifier(MLPClassifier)
    if with_pipeline:
        model = Pipeline([
            ('scale', FeatureUnion([
                ('minmax', MinMaxScaler()),
                ('normalize', Normalizer()),
            ])),
            ('select', SelectKBest(k=N_FEATURES)),  # keep input size constant
            ('net', model),
        ])
    return model
Example #2
Source File: bow.py From broca with MIT License | 6 votes |
def __init__(self, min_df=1, max_df=0.9, tokenizer=LemmaTokenizer, hash=False):
    """
    `min_df` is set to filter out extremely rare words,
    since we don't want those to dominate the distance metric.

    `max_df` is set to filter out extremely common words,
    since they don't convey much information.
    """
    # Wrap the specified tokenizer
    t = Tokenizer(tokenizer())

    if hash:
        vectr = HashingVectorizer(input='content', stop_words='english',
                                  lowercase=True, tokenizer=t)
    else:
        vectr = CountVectorizer(input='content', stop_words='english',
                                lowercase=True, tokenizer=t,
                                min_df=min_df, max_df=max_df)

    args = [
        ('vectorizer', vectr),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('normalizer', Normalizer(copy=False))
    ]

    self.pipeline = Pipeline(args)
    self.trained = False
Example #3
Source File: gp_repurposer.py From xfer with Apache License 2.0 | 6 votes |
def __init__(self, source_model: mx.mod.Module, feature_layer_names, context_function=mx.context.cpu,
             num_devices=1, max_function_evaluations=100, apply_l2_norm=False):
    # Call base class constructor with parameters required for meta-models
    super().__init__(source_model, feature_layer_names, context_function, num_devices)

    self.max_function_evaluations = max_function_evaluations
    self.apply_l2_norm = apply_l2_norm

    # Mean of features to use for normalization. Computed in training phase.
    # Used to normalize features in training and in prediction.
    self.feature_mean = None

    # Optimizer to use for training GP model
    self.optimizer = 'lbfgs'

    # Number of inducing points to use for sparse GP
    self.NUM_INDUCING_SPARSE_GP = 100

    # Normalizer to use when apply_l2_norm flag is set
    self.l2_normalizer = Normalizer(norm='l2')
Example #4
Source File: test_scikit.py From pliers with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_within_pipeline():
    pytest.importorskip('cv2')
    pytest.importorskip('sklearn')
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import Normalizer
    stim = join(get_test_data_path(), 'image', 'apple.jpg')
    graph = Graph([BrightnessExtractor(), SharpnessExtractor()])
    trans = PliersTransformer(graph)
    normalizer = Normalizer()
    pipeline = Pipeline([('pliers', trans), ('normalizer', normalizer)])
    res = pipeline.fit_transform(stim)
    assert res.shape == (1, 2)
    assert np.isclose(res[0][0], 0.66393, 1e-5)
    assert np.isclose(res[0][1], 0.74780, 1e-5)
    meta = trans.metadata_
    assert 'onset' in meta.columns
    assert meta['class'][0] == 'ImageStim'
Example #5
Source File: test_normalizer.py From coremltools with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_random(self):
    # Generate some random data
    X = _np.random.random(size=(50, 3))

    for param in ("l1", "l2", "max"):
        cur_model = Normalizer(norm=param)

        output = cur_model.fit_transform(X)

        spec = converter.convert(cur_model, ["a", "b", "c"], "out")

        evaluate_transformer(
            spec,
            [dict(zip(["a", "b", "c"], row)) for row in X],
            [{"out": row} for row in output],
        )
Example #6
Source File: test_one_hot_encoder.py From coremltools with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_boston_OHE_pipeline(self):
    data = load_boston()

    for categorical_features in [[3], [8], [3, 8], [8, 3]]:
        # Put it in a pipeline so that we can test whether the output dimension
        # handling is correct.
        model = Pipeline(
            [
                ("OHE", OneHotEncoder(categorical_features=categorical_features)),
                ("Normalizer", Normalizer()),
            ]
        )
        model.fit(data.data.copy(), data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, "out").get_spec()

        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": row} for row in model.transform(data.data.copy())]

        result = evaluate_transformer(spec, input_data, output_data)

        assert result["num_errors"] == 0
Example #7
Source File: test_lsh.py From scikit-hubness with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_kneighbors_with_or_without_self_hit(LSH: callable, metric, n_jobs, verbose):
    X, y = make_classification(random_state=234)
    X = Normalizer().fit_transform(X)
    lsh = LSH(metric=metric, n_jobs=n_jobs, verbose=verbose)
    lsh.fit(X, y)
    neigh_dist, neigh_ind = lsh.kneighbors(return_distance=True)
    neigh_dist_self, neigh_ind_self = lsh.kneighbors(X, return_distance=True)

    ind_only = lsh.kneighbors(return_distance=False)
    ind_only_self = lsh.kneighbors(X, return_distance=False)

    assert_array_equal(neigh_ind, ind_only)
    assert_array_equal(neigh_ind_self, ind_only_self)

    assert (neigh_ind - neigh_ind_self).mean() <= .01, \
        'More than 1% of neighbors mismatch'
    assert ((neigh_dist - neigh_dist_self) < 0.0001).mean() <= 0.01, \
        'Not almost equal to 4 decimals in more than 1% of neighbor slots'
Example #8
Source File: test_lsh.py From scikit-hubness with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_radius_neighbors_with_or_without_self_hit(LSH, metric, n_jobs, verbose):
    X, y = make_classification()
    X = Normalizer().fit_transform(X)
    lsh = LSH(metric=metric, n_jobs=n_jobs, verbose=verbose)
    lsh.fit(X, y)
    radius = lsh.kneighbors(n_candidates=3)[0][:, 2].max()
    neigh_dist, neigh_ind = lsh.radius_neighbors(return_distance=True, radius=radius)
    neigh_dist_self, neigh_ind_self = lsh.radius_neighbors(X, return_distance=True, radius=radius)

    ind_only = lsh.radius_neighbors(return_distance=False, radius=radius)
    ind_only_self = lsh.radius_neighbors(X, return_distance=False, radius=radius)

    assert len(neigh_ind) == len(neigh_ind_self) == len(neigh_dist) == len(neigh_dist_self)

    for i in range(len(neigh_ind)):
        assert_array_equal(neigh_ind[i], ind_only[i])
        assert_array_equal(neigh_ind_self[i], ind_only_self[i])
        assert_array_equal(neigh_ind[i][:3], neigh_ind_self[i][1:4])
        assert_array_almost_equal(neigh_dist[i][:3], neigh_dist_self[i][1:4])
Example #9
Source File: test_lsh.py From scikit-hubness with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_squared_euclidean_same_neighbors_as_euclidean(LSH):
    X, y = make_classification(random_state=234)
    X = Normalizer().fit_transform(X)
    lsh = LSH(metric='minkowski')
    lsh.fit(X, y)
    neigh_dist_eucl, neigh_ind_eucl = lsh.kneighbors()

    lsh_sq = LSH(metric='sqeuclidean')
    lsh_sq.fit(X, y)
    neigh_dist_sqeucl, neigh_ind_sqeucl = lsh_sq.kneighbors()

    assert_array_equal(neigh_ind_eucl, neigh_ind_sqeucl)
    assert_array_almost_equal(neigh_dist_eucl ** 2, neigh_dist_sqeucl)

    if LSH in LSH_WITH_RADIUS:
        radius = neigh_dist_eucl[:, 2].max()
        rad_dist_eucl, rad_ind_eucl = lsh.radius_neighbors(radius=radius)
        rad_dist_sqeucl, rad_ind_sqeucl = lsh_sq.radius_neighbors(radius=radius ** 2)
        for i in range(len(rad_ind_eucl)):
            assert_array_equal(rad_ind_eucl[i], rad_ind_sqeucl[i])
            assert_array_almost_equal(rad_dist_eucl[i] ** 2, rad_dist_sqeucl[i])
Example #10
Source File: models.py From ntua-slp-semeval2018 with MIT License | 6 votes |
def nbow_model(task, embeddings, word2idx):
    if task == "clf":
        algo = LogisticRegression(C=0.6, random_state=0, class_weight='balanced')
    elif task == "reg":
        algo = SVR(kernel='linear', C=0.6)
    else:
        raise ValueError("invalid task!")

    embeddings_features = NBOWVectorizer(aggregation=["mean"],
                                         embeddings=embeddings,
                                         word2idx=word2idx,
                                         stopwords=False)

    model = Pipeline([
        ('embeddings-feats', embeddings_features),
        ('normalizer', Normalizer(norm='l2')),
        ('clf', algo)
    ])

    return model
Example #11
Source File: test_column_transformer.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_make_column_transformer_kwargs():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer((scaler, 'first'), (norm, ['second']),
                                 n_jobs=3, remainder='drop',
                                 sparse_threshold=0.5)
    assert_equal(ct.transformers, make_column_transformer(
        (scaler, 'first'), (norm, ['second'])).transformers)
    assert_equal(ct.n_jobs, 3)
    assert_equal(ct.remainder, 'drop')
    assert_equal(ct.sparse_threshold, 0.5)
    # invalid keyword parameters should raise an error message
    assert_raise_message(
        TypeError,
        'Unknown keyword arguments: "transformer_weights"',
        make_column_transformer, (scaler, 'first'), (norm, ['second']),
        transformer_weights={'pca': 10, 'Transf': 1}
    )
Example #12
Source File: test_preprocessing.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
    self.assertIs(df.preprocessing.FunctionTransformer, pp.FunctionTransformer)
    self.assertIs(df.preprocessing.Imputer, pp.Imputer)
    self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
    self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
    self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
    self.assertIs(df.preprocessing.MultiLabelBinarizer, pp.MultiLabelBinarizer)
    self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
    self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
    self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
    self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
    self.assertIs(df.preprocessing.PolynomialFeatures, pp.PolynomialFeatures)
    self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
    self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler)
Example #13
Source File: test_lsh.py From scikit-hubness with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_falconn_parallel():
    X, y = make_classification(random_state=346)
    X = Normalizer().fit_transform(X)
    lsh = FalconnLSH(n_jobs=1)
    lsh.fit(X, y)
    neigh_dist, neigh_ind = lsh.kneighbors()

    lsh_parallel = FalconnLSH(n_jobs=4)
    lsh_parallel.fit(X, y)
    neigh_dist_parallel, neigh_ind_parallel = lsh_parallel.kneighbors()

    assert_array_equal(neigh_ind, neigh_ind_parallel)
    assert_array_almost_equal(neigh_dist, neigh_dist_parallel)
Example #14
Source File: test_lsh.py From scikit-hubness with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_warn_on_invalid_metric(LSH, metric):
    X, y = make_classification(random_state=24643)
    X = Normalizer().fit_transform(X)
    lsh = LSH(metric='euclidean')
    lsh.fit(X, y)
    neigh_dist, neigh_ind = lsh.kneighbors()

    lsh.metric = metric
    with pytest.warns(UserWarning):
        lsh.fit(X, y)
    neigh_dist_inv, neigh_ind_inv = lsh.kneighbors()

    assert_array_equal(neigh_ind, neigh_ind_inv)
    assert_array_almost_equal(neigh_dist, neigh_dist_inv)
Example #15
Source File: pipeline_builder.py From texta with GNU General Public License v3.0 | 5 votes |
def build(self, fields):
    """ Build model Pipeline and Grid Search params """
    params = {}

    # Field transform pipeline per field + params
    transformer_list = []
    for field in fields:
        pipe_key = 'pipe_{}'.format(field)
        steps = []
        steps.append(tuple(['selector', ItemSelector(key=field)]))
        steps.append(self.extractor_list[self.extractor_op].get_step())
        steps.append(self.reductor_list[self.reductor_op].get_step())
        steps.append(self.normalizer_list[self.normalizer_op].get_step())
        transformer_list.append(tuple([pipe_key, Pipeline(steps)]))

        # Nest params inside the union field - Extractor
        p_dict = self.extractor_list[self.extractor_op].get_param()
        for k in p_dict:
            new_k = '{}__{}__{}'.format('union', pipe_key, k)
            params[new_k] = p_dict[k]
        # Nest params inside the union field - Reductor
        p_dict = self.reductor_list[self.reductor_op].get_param()
        for k in p_dict:
            new_k = '{}__{}__{}'.format('union', pipe_key, k)
            params[new_k] = p_dict[k]
        # Nest params inside the union field - Normalizer
        p_dict = self.normalizer_list[self.normalizer_op].get_param()
        for k in p_dict:
            new_k = '{}__{}__{}'.format('union', pipe_key, k)
            params[new_k] = p_dict[k]

    # Classifier pipeline + params
    steps = []
    steps.append(tuple(['union', FeatureUnion(transformer_list=transformer_list)]))
    steps.append(self.classifier_list[self.classifier_op].get_step())
    pipe = Pipeline(steps)
    params.update(self.classifier_list[self.classifier_op].get_param())

    return pipe, params
Example #16
Source File: models.py From ntua-slp-semeval2018 with MIT License | 5 votes |
def bow_model(task, max_features=10000):
    if task == "clf":
        algo = LogisticRegression(C=0.6, random_state=0, class_weight='balanced')
    elif task == "reg":
        algo = SVR(kernel='linear', C=0.6)
    else:
        raise ValueError("invalid task!")

    word_features = TfidfVectorizer(ngram_range=(1, 1),
                                    tokenizer=lambda x: x,
                                    analyzer='word',
                                    min_df=5,
                                    # max_df=0.9,
                                    lowercase=False,
                                    use_idf=True,
                                    smooth_idf=True,
                                    max_features=max_features,
                                    sublinear_tf=True)

    model = Pipeline([
        ('bow-feats', word_features),
        ('normalizer', Normalizer(norm='l2')),
        ('clf', algo)
    ])

    return model
Example #17
Source File: sklearn.py From datastories-semeval2017-task4 with MIT License | 5 votes |
def nbow_model(task, embeddings, word2idx):
    if task == "clf":
        algo = LogisticRegression(C=0.6, random_state=0, class_weight='balanced')
    elif task == "reg":
        algo = SVR(kernel='linear', C=0.6)
    else:
        raise ValueError("invalid task!")

    embeddings_features = NBOWVectorizer(aggregation=["mean"],
                                         embeddings=embeddings,
                                         word2idx=word2idx,
                                         stopwords=False)

    preprocessor = TextPreProcessor(
        backoff=['url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                 'url', 'date', 'number'],
        include_tags={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    model = Pipeline([
        ('preprocess', CustomPreProcessor(preprocessor, to_list=True)),
        ('embeddings-feats', embeddings_features),
        ('normalizer', Normalizer(norm='l2')),
        ('clf', algo)
    ])

    return model
Example #18
Source File: data_manipulator.py From LSTM_Anomaly_Detector with MIT License | 5 votes |
def normalize(mat):
    return Normalizer(norm='l2').fit_transform(mat)
Example #19
Source File: test_sklearn_normalizer_converter.py From sklearn-onnx with MIT License | 5 votes |
def test_model_normalizer(self):
    model = Normalizer(norm="l2")
    model_onnx = convert_sklearn(
        model,
        "scikit-learn normalizer",
        [("input", Int64TensorType([None, 1]))],
    )
    self.assertTrue(model_onnx is not None)
    self.assertTrue(len(model_onnx.graph.node) == 1)
Example #20
Source File: test_sklearn_normalizer_converter.py From sklearn-onnx with MIT License | 5 votes |
def test_model_normalizer_float(self):
    model = Normalizer(norm="l2")
    model_onnx = convert_sklearn(
        model,
        "scikit-learn normalizer",
        [("input", FloatTensorType([None, 3]))],
    )
    self.assertTrue(model_onnx is not None)
    self.assertTrue(len(model_onnx.graph.node) == 1)
    dump_data_and_model(
        numpy.array([[1, 1, 3], [3, 1, 2]], dtype=numpy.float32),
        model,
        model_onnx,
        basename="SklearnNormalizerL2-SkipDim1",
    )
Example #21
Source File: bnn_classifier.py From xfer with Apache License 2.0 | 5 votes |
def __init__(self, model: gluon.nn.Sequential, var_posterior: VariationalPosterior,
             normalizer: Normalizer):
    self.model = model
    self.var_posterior = var_posterior
    self.normalizer = normalizer
Example #22
Source File: bnn_repurposer.py From xfer with Apache License 2.0 | 5 votes |
def __init__(self, source_model: mx.mod.Module, feature_layer_names, context_function=mx.cpu, num_devices=1,
             bnn_context_function=mx.cpu, sigma=100.0, num_layers=1, n_hidden=10, num_samples_mc=3,
             learning_rate=1e-3, batch_size=20, num_epochs=200, start_annealing=None, end_annealing=None,
             num_samples_mc_prediction=100, verbose=0):
    # Call base class constructor with parameters required for meta-models
    super().__init__(source_model, feature_layer_names, context_function, num_devices)

    # Initialize BNN specific parameters
    self.sigma = sigma
    self.num_layers = num_layers
    self.n_hidden = n_hidden
    self.num_samples_mc = num_samples_mc
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.num_epochs = num_epochs
    self.num_samples_mc_prediction = num_samples_mc_prediction
    self.verbose = verbose

    self.start_annealing = start_annealing
    self.end_annealing = end_annealing
    self.step_annealing_sample_weight = 1.0 / float(self.end_annealing - self.start_annealing)
    self.annealing_weight = 0.0

    # Initialize variables to track performance
    self.train_acc = []
    self.test_acc = []
    self.moving_loss_total = []
    self.current_loss_total = []
    self.average_loss = []
    self.anneal_weights = []

    # L2 normalization of the features
    self.normalizer = Normalizer(norm='l2')

    self.bnn_context_function = bnn_context_function
    self._context_bnn = self.bnn_context_function()

    # init parameters for constructing network to None. These will be set during repurposing
    self.dim_input = None
    self.num_classes = None
Example #23
Source File: sklearn_example.py From Hunch with Apache License 2.0 | 5 votes |
def train(self, training_data_X, training_data_Y):
    self.normalizer = Normalizer()
    self.svc = svm.SVC(gamma=0.001, C=100.)
    normalised_training_data_X = self.normalizer.fit_transform(training_data_X)
    self.svc.fit(normalised_training_data_X, training_data_Y)
Example #24
Source File: preprocessing.py From open-solution-toxic-comments with MIT License | 5 votes |
def __init__(self):
    self.normalizer = sk_prep.Normalizer()
Example #25
Source File: test_feature_optimization.py From hyperparameter_hunter with MIT License | 5 votes |
def normalize(train_inputs, non_train_inputs):
    normalizer = Normalizer()
    train_inputs[train_inputs.columns] = normalizer.fit_transform(train_inputs.values)
    non_train_inputs[train_inputs.columns] = normalizer.transform(non_train_inputs.values)
    return train_inputs, non_train_inputs
Example #26
Source File: topic.py From Python-DevOps with MIT License | 5 votes |
def train_lsa(corpus, n_topics, max_df=0.95, min_df=2, cleaning=clearstring, stop_words='english'):
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    tfidf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, stop_words=stop_words)
    tfidf = tfidf_vectorizer.fit_transform(corpus)
    tfidf_features = tfidf_vectorizer.get_feature_names()
    tfidf = Normalizer().fit_transform(tfidf)
    lsa = TruncatedSVD(n_topics).fit(tfidf)
    return TOPIC(tfidf_features, lsa)
Example #27
Source File: misc.py From steppy-toolkit with MIT License | 5 votes |
def __init__(self):
    super().__init__()
    self.normalizer = Normalizer()
Example #28
Source File: main.py From AutoOut with MIT License | 5 votes |
def data_cleaning_formatting(X):
    # Basic cleaning
    X = X.fillna(0)
    X = X.fillna(method='ffill')  # likely intended as a forward fill; fillna('ffill') would insert the literal string

    # Encode data
    X = encode_data(X)
    X = Normalizer().fit_transform(X)
    return X
Example #29
Source File: test_column_transformer.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_make_column_transformer():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer((scaler, 'first'), (norm, ['second']))
    names, transformers, columns = zip(*ct.transformers)
    assert_equal(names, ("standardscaler", "normalizer"))
    assert_equal(transformers, (scaler, norm))
    assert_equal(columns, ('first', ['second']))

    # XXX remove in v0.22
    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        ct1 = make_column_transformer(([0], norm))
    ct2 = make_column_transformer((norm, [0]))
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    assert_almost_equal(ct1.fit_transform(X_array),
                        ct2.fit_transform(X_array))

    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        make_column_transformer(('first', 'drop'))

    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        make_column_transformer(('passthrough', 'passthrough'),
                                ('first', 'drop'))
Example #30
Source File: test_column_transformer.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_make_column_transformer_pandas():
    pd = pytest.importorskip('pandas')
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    norm = Normalizer()

    # XXX remove in v0.22
    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        ct1 = make_column_transformer((X_df.columns, norm))
    ct2 = make_column_transformer((norm, X_df.columns))
    assert_almost_equal(ct1.fit_transform(X_df),
                        ct2.fit_transform(X_df))