Python sklearn.preprocessing.RobustScaler() Examples

The following are 25 code examples of sklearn.preprocessing.RobustScaler(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.preprocessing, or try the search function.
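Before the project-specific examples, here is a minimal, self-contained sketch of standard RobustScaler behavior: it centers each feature on its median and scales by the interquartile range (IQR), so a single outlier barely moves the fitted statistics.

import numpy as np
from sklearn.preprocessing import RobustScaler

# Toy data with one outlier. The median and IQR are nearly unaffected
# by the 100.0, unlike a mean/std based scaler.
X = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])
scaler = RobustScaler()  # defaults: quantile_range=(25.0, 75.0)
X_scaled = scaler.fit_transform(X)
print(scaler.center_)  # per-feature median: [3.]
print(scaler.scale_)   # per-feature IQR: [2.]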
Example #1
Source File: CategoryProjector.py    From scattertext with Apache License 2.0
def fit_transform(self, X):
        compact_category_counts_catscale = X / X.sum(axis=0)
        compact_category_counts_catscale_std = (
                compact_category_counts_catscale.T - compact_category_counts_catscale.mean(axis=1)).T
        return RobustScaler().fit_transform(compact_category_counts_catscale_std) 
Example #2
Source File: test_preprocessing.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_objectmapper(self):
        df = pdml.ModelFrame([])
        self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
        self.assertIs(df.preprocessing.FunctionTransformer,
                      pp.FunctionTransformer)
        self.assertIs(df.preprocessing.Imputer, pp.Imputer)
        self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
        self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
        self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
        self.assertIs(df.preprocessing.MultiLabelBinarizer, pp.MultiLabelBinarizer)
        self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
        self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
        self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
        self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
        self.assertIs(df.preprocessing.PolynomialFeatures, pp.PolynomialFeatures)
        self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
        self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler) 
Example #3
Source File: test_investigate.py    From sklearn-onnx with MIT License
def test_simple_feature_union(self):
        data = numpy.array([[0, 0], [0, 0], [2, 1], [2, 1]],
                           dtype=numpy.float32)
        model = FeatureUnion([("scaler1", StandardScaler()),
                             ("scaler2", RobustScaler())])
        model.fit(data)
        all_models = list(enumerate_pipeline_models(model))
        steps = collect_intermediate_steps(model, "feature union",
                                           [("input",
                                             FloatTensorType([None, 2]))])

        assert len(steps) == 2
        assert len(all_models) == 3

        model.transform(data)
        for step in steps:
            onnx_step = step['onnx_step']
            sess = onnxruntime.InferenceSession(onnx_step.SerializeToString())
            onnx_outputs = sess.run(None, {'input': data})
            onnx_output = onnx_outputs[0]
            skl_outputs = step['model']._debug.outputs['transform']
            assert_almost_equal(onnx_output, skl_outputs)
            compare_objects(onnx_output, skl_outputs) 
Example #4
Source File: model.py    From revscoring with MIT License
def __init__(self, *args, scale=False, center=False, **kwargs):
        """
        A machine learned model.  Beyond :class:`revscoring.Model`,
        "Learned" models implement
        :func:`~revscoring.scoring.models.Learned.fit` and
        :func:`~revscoring.scoring.models.Learned.cross_validate`.
        """
        super().__init__(*args, **kwargs)
        self.trained = None
        if scale or center:
            self.scaler = RobustScaler(with_centering=center,
                                       with_scaling=scale)
        else:
            self.scaler = None

        self.params.update({
            'scale': scale,
            'center': center
        }) 
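The scale and center keyword arguments above map directly onto RobustScaler's with_scaling and with_centering parameters; a short sketch of their individual effects (standard scikit-learn behavior):

import numpy as np
from sklearn.preprocessing import RobustScaler

X = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])
# with_centering subtracts the median; with_scaling divides by the IQR.
# Either step can be disabled independently.
center_only = RobustScaler(with_centering=True, with_scaling=False)
scale_only = RobustScaler(with_centering=False, with_scaling=True)
print(center_only.fit_transform(X).ravel())  # [-2. -1.  0.  1. 97.]
print(scale_only.fit_transform(X).ravel())   # [ 0.5  1.   1.5  2.  50. ]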
Example #5
Source File: scaler.py    From pyts with BSD 3-Clause "New" or "Revised" License
def transform(self, X):
        """Scale the data.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Data to scale.

        Returns
        -------
        X_new : array-like, shape = (n_samples, n_timestamps)
            Scaled data.

        """
        X = check_array(X, dtype='float64')
        scaler = SklearnRobustScaler(
            with_centering=self.with_centering,
            with_scaling=self.with_scaling,
            quantile_range=self.quantile_range
        )
        X_new = scaler.fit_transform(X.T).T
        return X_new 
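The double transpose is the key detail here: fitting on X.T makes each row (one time series) get its own median and IQR, rather than scaling each timestamp column across samples. A small sketch illustrating the effect, with toy data in place of real time series:

import numpy as np
from sklearn.preprocessing import RobustScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(3, 100)) * np.array([[1.0], [5.0], [20.0]])
# Per-series scaling: each of the 3 rows is centered/scaled on its own.
X_new = RobustScaler().fit_transform(X.T).T
print(np.median(X_new, axis=1))  # each series is now centered near zero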
Example #6
Source File: test_data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_df_values(self):
        est1 = dpp.RobustScaler()
        est2 = dpp.RobustScaler()

        result_ar = est1.fit_transform(X)
        result_df = est2.fit_transform(df)
        if hasattr(result_df, "values"):
            result_df = result_df.values
        assert_eq_ar(result_ar, result_df)

        for attr in ["scale_", "center_"]:
            assert_eq_ar(getattr(est1, attr), getattr(est2, attr))

        assert_eq_ar(est1.transform(X), est2.transform(X))
        assert_eq_ar(est1.transform(df).values, est2.transform(X))
        assert_eq_ar(est1.transform(X), est2.transform(df).values)

        # different data types
        df["0"] = df["0"].astype("float32")
        result_ar = est1.fit_transform(X)
        result_df = est2.fit_transform(df)
        if hasattr(result_df, "values"):
            result_df = result_df.values
        assert_eq_ar(result_ar, result_df) 
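This test references module-level fixtures X and df; a hypothetical setup consistent with the assertions above (the names and shapes are illustrative, not dask-ml's actual test fixtures) might look like:

import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd

# X as a chunked dask array, df as the equivalent dask DataFrame with
# string column names (note the df["0"] access in the test above).
pdf = pd.DataFrame(np.random.rand(100, 2), columns=["0", "1"])
X = da.from_array(pdf.to_numpy(), chunks=(50, 2))
df = dd.from_pandas(pdf, npartitions=2)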
Example #7
Source File: RobustScaler.py    From Splunking-Crime with GNU Affero General Public License v3.0
def __init__(self, options):
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}),
            bools=['with_centering', 'with_scaling'],
            strs=['quantile_range'], 
        )

        if StrictVersion(sklearn_version) < StrictVersion(quantile_range_required_version) and 'quantile_range' in out_params.keys():
            out_params.pop('quantile_range')
            msg = 'The quantile_range option is ignored in this version of scikit-learn ({}): version {} or higher required'
            msg = msg.format(sklearn_version, quantile_range_required_version)
            messages.warn(msg)

        if 'quantile_range' in out_params.keys():
            try:
                out_params['quantile_range'] = tuple(int(i) for i in out_params['quantile_range'].split('-'))
                assert len(out_params['quantile_range']) == 2
            except:
                raise RuntimeError('Syntax Error: quantile_range requires a range, e.g., quantile_range=25-75')

        self.estimator = _RobustScaler(**out_params) 
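For clarity, a short sketch of what that quantile_range parsing produces and how it reaches scikit-learn (the values are percentiles, not fractions):

from sklearn.preprocessing import RobustScaler

# "10-90" becomes the tuple handed to RobustScaler's quantile_range.
quantile_range = tuple(int(i) for i in "10-90".split("-"))
assert quantile_range == (10, 90)
scaler = RobustScaler(quantile_range=quantile_range)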
Example #8
Source File: staticautoencoder.py    From pyodds with MIT License
def fit(self, X):
        """Fit detector.
        Parameters
        ----------
        X : dataframe of shape (n_samples, n_features)
            The input samples.
        """
        scaler = preprocessing.RobustScaler().fit(X)
        X_train = scaler.transform(X)
        if self.hidden_neurons is None:
            self.hidden_neurons = [X_train.shape[1] // 2 + 1,
                                   X_train.shape[1] // 4 + 1,
                                   X_train.shape[1] // 4 + 1,
                                   X_train.shape[1] // 2 + 1]
        self.batch_size = X_train.shape[0] // 10
        self.model = self._build_model()

        self.model.fit(X_train, X_train, epochs=self.epoch,
                       batch_size=self.batch_size)

        return self 
Example #9
Source File: engine.py    From Clairvoyant with MIT License
def fit(self, X, y):
        self.XX = vstack(X)
        self.yy = hstack(y)
        self.scaler = RobustScaler().fit(self.XX)
        self.svc.fit(self.scaler.transform(self.XX), self.yy) 
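A hypothetical companion to this fit(): the same fitted scaler has to transform features at prediction time as well (the standalone names below are illustrative stand-ins for the instance attributes above):

from numpy import hstack, vstack
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC

XX = vstack([[0.0, 1.0], [1.0, 0.0], [2.0, 3.0], [3.0, 2.0]])
yy = hstack([0, 1, 0, 1])
scaler = RobustScaler().fit(XX)
svc = SVC().fit(scaler.transform(XX), yy)
pred = svc.predict(scaler.transform(XX))  # reuse the same scaler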
Example #10
Source File: common.py    From typhon with MIT License
def _iwp_model(self, processes, cv_folds):
        """Return the default model for the IWP regressor
        """
        # Estimators are normally objects that have a fit and predict method
        # (e.g. MLPRegressor from sklearn). To make their training easier we
        # scale the input data in advance. With Pipeline objects from sklearn
        # we can combine such steps easily since they behave like an
        # estimator object as well.
        estimator = Pipeline([
            # SVM or NN work better if we have scaled the data in the first
            # place. MinMaxScaler is the simplest one. RobustScaler or
            # StandardScaler could be an alternative.
            ("scaler", RobustScaler(quantile_range=(15, 85))),
            # The "real" estimator:
            ("estimator", MLPRegressor(max_iter=6000, early_stopping=True)),
        ])

        # To optimize the results, we try different hyper parameters by
        # using a grid search
        hidden_layer_sizes = [
            (15, 10, 3),
            #(50, 20),
        ]
        hyper_parameter = [
            {   # Hyper parameter for lbfgs solver
                'estimator__solver': ['lbfgs'],
                'estimator__activation': ['tanh'],
                'estimator__hidden_layer_sizes': hidden_layer_sizes,
                'estimator__random_state': [0, 42, 100, 3452],
                'estimator__alpha': [0.1, 0.001, 0.0001],
            },
        ]

        return GridSearchCV(
            estimator, hyper_parameter, refit=True,
            n_jobs=processes, cv=cv_folds, verbose=self.verbose,
        ) 
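A minimal runnable sketch of the same scale-then-regress-then-search pattern, with toy data and a tiny grid standing in for the IWP setup (the assumed shapes and targets are illustrative):

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=200)
pipe = Pipeline([
    ("scaler", RobustScaler(quantile_range=(15, 85))),
    ("estimator", MLPRegressor(solver="lbfgs", max_iter=2000)),
])
grid = GridSearchCV(pipe, {"estimator__alpha": [0.1, 0.001]}, cv=3)
grid.fit(X, y)
print(grid.best_params_)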
Example #11
Source File: test_investigate.py    From sklearn-onnx with MIT License
def test_simple_column_transformer(self):
        if ColumnTransformer is None:
            return
        data = numpy.array([[0, 0], [0, 0], [2, 1], [2, 1]],
                           dtype=numpy.float32)
        model = ColumnTransformer([("scaler1", StandardScaler(), [0]),
                                  ("scaler2", RobustScaler(), [1])])
        model.fit(data)
        all_models = list(enumerate_pipeline_models(model))

        steps = collect_intermediate_steps(model, "coulmn transformer",
                                           [("input",
                                             FloatTensorType([None, 2]))])

        assert len(steps) == 2
        assert len(all_models) == 3

        model.transform(data)
        for step in steps:
            onnx_step = step['onnx_step']
            sess = onnxruntime.InferenceSession(onnx_step.SerializeToString())
            onnx_outputs = sess.run(None, {'input': data})
            onnx_output = onnx_outputs[0]
            skl_outputs = step['model']._debug.outputs['transform']
            assert_almost_equal(onnx_output, skl_outputs)
            compare_objects(onnx_output.tolist(), skl_outputs.tolist()) 
Example #12
Source File: gmm.py    From ml-ids with MIT License
def main():
    """Run the IDS using GMM experiment."""
    week3Data = _parseTrainingData()

    # Scale the training data (ignore the timestamp column)
    scaler = preprocessing.RobustScaler().fit(week3Data[:, 1:])
    X_train = scaler.transform(week3Data[:, 1:])
    del week3Data

    try:
        gmm = pickle.load(open("data/gmm.pkl", "rb"))
        print("Loading pre-trained GMM...")
    except IOError:
        print("Training the Gaussian Mixture...")
        gmm = GaussianMixture(n_components=16,
                              covariance_type='full',
                              #  reg_covar=1,
                              verbose=1,
                              verbose_interval=2).fit(X_train)
        pickle.dump(gmm, open("data/gmm.pkl", "wb"))
    del X_train

    X_orig = _parseTestingData()
    print("Scaling the test data...")
    X_test = scaler.transform(X_orig[:, 1:])

    print("Calculating prosterior probabilies of test data...")
    probs = gmm.predict_proba(X_test)
    del X_test

    scores = _score(probs)
    del probs

    results = np.hstack((X_orig, scores.reshape((scores.shape[0], 1))))

    _outputToCSV(results, "data/gmm_results_max.csv") 
Example #13
Source File: test_data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_inverse_transform(self):
        a = dpp.RobustScaler()
        result = a.inverse_transform(a.fit_transform(X))
        assert dask.is_dask_collection(result)
        assert_eq_ar(result, X) 
Example #14
Source File: test_data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_transform(self):
        a = dpp.RobustScaler()
        b = spp.RobustScaler()

        a.fit(X)
        b.fit(X.compute())

        # overwriting dask-ml's fitted attributes to have them exactly equal
        # (the approximate equality is tested above)
        a.scale_ = b.scale_
        a.center_ = b.center_

        assert dask.is_dask_collection(a.transform(X))
        assert_eq_ar(a.transform(X), b.transform(X.compute())) 
Example #15
Source File: test_data.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_fit(self):
        a = dpp.RobustScaler()
        b = spp.RobustScaler()

        # bigger data to make percentile more reliable
        # and not centered around 0 to make rtol work
        X, y = make_classification(n_samples=1000, chunks=200, random_state=0)
        X = X + 3

        a.fit(X)
        b.fit(X.compute())
        assert_estimator_equal(a, b, rtol=0.2) 
Example #16
Source File: test_scale.py    From skoot with MIT License
def test_selective_scale_robust():
    # test the ref for a provided estimator
    rb_scale = RobustScaler().fit(X)
    trans = SelectiveRobustScaler().fit(X)

    assert_array_almost_equal(rb_scale.fit_transform(X),
                              trans.transform(X).values) 
Example #17
Source File: zil.py    From incremental_learning.pytorch with MIT License
def __init__(self, feature_range, robust=0, normalize=False, truncate=False):
        self.feature_range = feature_range
        self.robust = robust
        self.normalize = normalize
        self.truncate = truncate

        if self.robust:
            self.skprepro = skpreprocessing.RobustScaler() 
Example #18
Source File: export_tests.py    From tpot with GNU Lesser General Public License v3.0
def test_generate_import_code():
    """Assert that generate_import_code() returns the correct set of dependancies for a given pipeline."""

    pipeline = creator.Individual.from_string('GaussianNB(RobustScaler(input_matrix))', tpot_obj._pset)

    expected_code = """import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
"""
    assert expected_code == generate_import_code(pipeline, tpot_obj.operators) 
Example #19
Source File: custom_transformers.py    From pandas-pipelines-custom-transformers with MIT License
def fit(self, X, y=None):
        self.rs = RobustScaler()
        self.rs.fit(X)
        self.center_ = pd.Series(self.rs.center_, index=X.columns)
        self.scale_ = pd.Series(self.rs.scale_, index=X.columns)
        return self 
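A hypothetical transform() to pair with this fit(), shown standalone: apply the fitted scaler but rebuild a DataFrame so column names and index survive.

import pandas as pd
from sklearn.preprocessing import RobustScaler

X = pd.DataFrame({"a": [1.0, 2.0, 3.0, 40.0], "b": [2.0, 4.0, 6.0, 8.0]})
rs = RobustScaler().fit(X)
# RobustScaler returns a bare ndarray; wrap it to keep pandas metadata.
X_t = pd.DataFrame(rs.transform(X), index=X.index, columns=X.columns)
print(X_t)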
Example #20
Source File: data_utils.py    From pt-ranking.github.io with MIT License
def ini_scaler(self, joint_transform=False):
        assert self.scaler_id in SCALER_ID
        if self.scaler_id == 'MinMaxScaler':
            self.scaler = MinMaxScaler()
        elif self.scaler_id == 'RobustScaler':
            self.scaler = RobustScaler()
        elif self.scaler_id == 'StandardScaler':
            self.scaler = StandardScaler()

        if self.train and 'DATASET' == self.scaler_level:
            f_mat = self.df[self.feature_cols]
            self.scaler.fit(f_mat)

            if joint_transform: self.df[self.feature_cols] = self.scaler.transform(f_mat) 
Example #21
Source File: transformations.py    From AMPL with MIT License
def __init__(self, params, dataset):
        """Initializes a UMAPTransformer object.

        Args:
            params (Namespace): Contains parameters used to instantiate the transformer.
            dataset (Dataset): Dataset used to "train" the projection mapping.
        """

        # TODO: decide whether to make n_epochs a parameter
        #default_n_epochs = None
        default_n_epochs = 500

        if params.prediction_type == 'classification':
            target_metric = 'categorical'
        else:
            target_metric = 'l2'
        self.scaler = RobustScaler()
        # Use Imputer to replace missing values (NaNs) with means for each column
        self.imputer = Imputer()
        scaled_X = self.scaler.fit_transform(self.imputer.fit_transform(dataset.X))
        self.mapper = umap.UMAP(n_neighbors=params.umap_neighbors, 
                                n_components=params.umap_dim,
                                metric=params.umap_metric,
                                target_metric=target_metric,
                                target_weight=params.umap_targ_wt,
                                min_dist=params.umap_min_dist,
                                n_epochs=default_n_epochs)
        # TODO: How to deal with multitask data?
        self.mapper.fit(scaled_X, y=dataset.y.flatten())

    # **************************************************************************************** 
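Note that sklearn.preprocessing.Imputer was removed in scikit-learn 0.22. On recent versions, the equivalent impute-then-scale preprocessing from the example above would be a sketch like this (SimpleImputer replaces Imputer):

from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

# Impute per-column means, then robust-scale -- the same order as the
# nested fit_transform calls above.
preproc = make_pipeline(SimpleImputer(strategy="mean"), RobustScaler())
# scaled_X = preproc.fit_transform(dataset.X)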
Example #22
Source File: diff.py    From gordo with GNU Affero General Public License v3.0
def __init__(
        self,
        base_estimator: BaseEstimator = KerasAutoEncoder(kind="feedforward_hourglass"),
        scaler: TransformerMixin = RobustScaler(),
        require_thresholds: bool = True,
        window=None,
    ):
        """
        Classifier which wraps a ``base_estimator`` and provides a diff error
        based approach to anomaly detection.

        It trains a ``scaler`` to the target **after** training, purely for
        error calculations. The underlying ``base_estimator`` is trained
        with the original, unscaled, ``y``.

        Parameters
        ----------
        base_estimator: sklearn.base.BaseEstimator
            The model to which normal ``.fit``, ``.predict`` methods will be used.
            Defaults to :py:class:`gordo.machine.model.models.KerasAutoEncoder` with
            ``kind='feedforward_hourglass'``
        scaler: sklearn.base.TransformerMixin
            Defaults to ``sklearn.preprocessing.RobustScaler``
            Used for transforming model output and the original ``y`` to calculate
            the difference/error in model output vs expected.
        require_thresholds: bool
            Requires calculating ``thresholds_`` via a call to :func:`~DiffBasedAnomalyDetector.cross_validate`.
            If this is set (default True), but :func:`~DiffBasedAnomalyDetector.cross_validate`
            was not called before calling :func:`~DiffBasedAnomalyDetector.anomaly` an ``AttributeError``
            will be raised.
        window: int
            Window size for smoothed thresholds
        """
        self.base_estimator = base_estimator
        self.scaler = scaler
        self.require_thresholds = require_thresholds
        self.window = window 
Example #23
Source File: test_sklearn_pipeline_within_pipeline.py    From sklearn-onnx with MIT License
def test_pipeline_column_transformer_pipeline_imputer_scaler_lr(self):
        X = np.array([[1, 2], [3, np.nan], [3, 0]], dtype=np.float32)
        y = np.array([1, 0, 1])
        model = Pipeline([
            (
                "ct",
                ColumnTransformer([
                    (
                        "pipeline1",
                        Pipeline([
                            ("imputer", SimpleImputer()),
                            ("scaler", StandardScaler()),
                        ]),
                        [0],
                    ),
                    (
                        "pipeline2",
                        Pipeline([
                            ("imputer", SimpleImputer()),
                            ("scaler", RobustScaler()),
                        ]),
                        [1],
                    ),
                ]),
            ),
            ("lr", LogisticRegression(solver="liblinear")),
        ])
        model.fit(X, y)
        model_onnx = convert_sklearn(
            model,
            "pipelinewithinpipeline",
            [("input", FloatTensorType([None, X.shape[1]]))],
        )
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            X,
            model,
            model_onnx,
            basename="SklearnPipelineCTPipelineImputerScalerLR",
            allow_failure="StrictVersion(onnxruntime.__version__)"
                          " <= StrictVersion('0.2.1')",
        ) 
Example #24
Source File: test_pipe.py    From skutil with BSD 3-Clause "New" or "Revised" License
def test_random_grid():
    # build a pipeline
    pipe = Pipeline([
        ('retainer',       FeatureRetainer()),  # will retain all
        ('dropper',        FeatureDropper()),  # won't drop any
        ('mapper',         FunctionMapper()),  # pass through
        ('encoder',        OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity',   MulticollinearityFilterer(threshold=0.85)),
        ('imputer',        SelectiveImputer()),  # pass through
        ('scaler',         SelectiveScaler()),
        ('boxcox',         BoxCoxTransformer()),
        ('nzv',            NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca',            SelectivePCA(n_components=0.9)),
        ('model',          RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold':    uniform(loc=.8, scale=.15),
        'collinearity__method':       ['pearson', 'kendall', 'spearman'],
        'scaler__scaler':             [StandardScaler(), RobustScaler()],
        'pca__n_components':          uniform(loc=.75, scale=.2),
        'pca__whiten':                [True, False],
        'model__n_estimators':        randint(5, 10),
        'model__max_depth':           randint(2, 5),
        'model__min_samples_leaf':    randint(1, 5),
        'model__max_features':        uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes':      randint(10, 15)
    }

    # define the gridsearch
    search = RandomizedSearchCV(pipe, hp,
                                n_iter=2,  # just to test it even works
                                scoring='accuracy',
                                cv=2,
                                random_state=42)

    # fit the search
    search.fit(X_train, y_train)

    # test the report
    report_grid_score_detail(search, charts=False) 
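The scaler__scaler entry swaps whole estimator instances during the search, via skutil's SelectiveScaler. Plain scikit-learn supports the same idea by treating a pipeline step itself as a searchable parameter; a minimal sketch:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler

X, y = make_classification(n_samples=200, random_state=0)
pipe = Pipeline([("scaler", StandardScaler()),
                 ("clf", LogisticRegression(max_iter=1000))])
# The whole "scaler" step is a candidate hyperparameter.
grid = GridSearchCV(pipe, {"scaler": [StandardScaler(), RobustScaler()]}, cv=3)
grid.fit(X, y)
print(grid.best_params_)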
Example #25
Source File: test_big.py    From skutil with BSD 3-Clause "New" or "Revised" License
def test_large_grid():
        """In this test, we purposely overfit a RandomForest to completely random data
        in order to assert that the test error will far supercede the train error.
        """

        if not SK18:
            custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
        else:
            custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

        # define the pipe
        pipe = Pipeline([
            ('scaler', SelectiveScaler()),
            ('pca', SelectivePCA(weight=True)),
            ('rf', RandomForestClassifier(random_state=42))
        ])

        # define hyper parameters
        hp = {
            'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
            'pca__whiten': [True, False],
            'pca__weight': [True, False],
            'pca__n_components': uniform(0.75, 0.15),
            'rf__n_estimators': randint(5, 10),
            'rf__max_depth': randint(5, 15)
        }

        # define the grid
        grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42)

        # this will fail because we haven't fit yet
        assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

        # fit the grid
        grid.fit(X_train, y_train)

        # score for coverage -- this might warn...
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            grid.score(X_train, y_train)

        # coverage:
        assert grid._estimator_type == 'classifier'

        # get predictions
        tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

        # evaluate score (SHOULD be better than random...)
        accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

        # grid score reports:
        # assert fails for bad percentile
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})

        # assert fails for bad y_axis
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})

        # assert passes otherwise
        report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works