Python sklearn.preprocessing.RobustScaler() Examples
The following are 25 code examples of sklearn.preprocessing.RobustScaler(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out all other available functions and classes of the sklearn.preprocessing module.
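Before the project examples, here is a minimal sketch of the basic API for orientation. The data below is made up for illustration; the calls and the center_/scale_ attributes are standard scikit-learn.

import numpy as np
from sklearn.preprocessing import RobustScaler

# Toy data with one outlier. RobustScaler centers on the median and
# scales by the interquartile range, so the outlier has little effect
# on how the other rows are transformed.
X = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])

scaler = RobustScaler()  # defaults: with_centering=True, with_scaling=True
X_scaled = scaler.fit_transform(X)

print(scaler.center_)  # per-feature median
print(scaler.scale_)   # per-feature IQR (75th - 25th percentile)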
Example #1
Source File: CategoryProjector.py From scattertext with Apache License 2.0
def fit_transform(self, X):
    # Normalize counts so each category (column) sums to 1.
    compact_category_counts_catscale = X / X.sum(axis=0)
    # Center each term's row by its mean across categories.
    compact_category_counts_catscale_std = (
        compact_category_counts_catscale.T
        - compact_category_counts_catscale.mean(axis=1)).T
    return RobustScaler().fit_transform(compact_category_counts_catscale_std)
Example #2
Source File: test_preprocessing.py From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
    self.assertIs(df.preprocessing.FunctionTransformer,
                  pp.FunctionTransformer)
    self.assertIs(df.preprocessing.Imputer, pp.Imputer)
    self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
    self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
    self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
    self.assertIs(df.preprocessing.MultiLabelBinarizer,
                  pp.MultiLabelBinarizer)
    self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
    self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
    self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
    self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
    self.assertIs(df.preprocessing.PolynomialFeatures,
                  pp.PolynomialFeatures)
    self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
    self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler)
Example #3
Source File: test_investigate.py From sklearn-onnx with MIT License
def test_simple_feature_union(self):
    data = numpy.array([[0, 0], [0, 0], [2, 1], [2, 1]],
                       dtype=numpy.float32)
    model = FeatureUnion([("scaler1", StandardScaler()),
                          ("scaler2", RobustScaler())])
    model.fit(data)
    all_models = list(enumerate_pipeline_models(model))
    steps = collect_intermediate_steps(
        model, "feature union",
        [("input", FloatTensorType([None, 2]))])

    assert len(steps) == 2
    assert len(all_models) == 3

    model.transform(data)
    for step in steps:
        onnx_step = step['onnx_step']
        sess = onnxruntime.InferenceSession(onnx_step.SerializeToString())
        onnx_outputs = sess.run(None, {'input': data})
        onnx_output = onnx_outputs[0]
        skl_outputs = step['model']._debug.outputs['transform']
        assert_almost_equal(onnx_output, skl_outputs)
        compare_objects(onnx_output, skl_outputs)
Example #4
Source File: model.py From revscoring with MIT License
def __init__(self, *args, scale=False, center=False, **kwargs):
    """
    A machine learned model.  Beyond :class:`revscoring.Model`, these
    "Learned" models implement
    :func:`~revscoring.scoring.models.Learned.fit` and
    :func:`~revscoring.scoring.models.Learned.cross_validate`.
    """
    super().__init__(*args, **kwargs)
    self.trained = None

    if scale or center:
        self.scaler = RobustScaler(with_centering=center,
                                   with_scaling=scale)
    else:
        self.scaler = None

    self.params.update({
        'scale': scale,
        'center': center
    })
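A side note on the with_centering and with_scaling flags used above: they toggle the two halves of the transform independently. A quick illustration on made-up data:

import numpy as np
from sklearn.preprocessing import RobustScaler

X = np.array([[1.0], [2.0], [3.0], [4.0]])

# Scale by the IQR only; the median is left in place.
scale_only = RobustScaler(with_centering=False, with_scaling=True)
print(scale_only.fit_transform(X).ravel())

# Center on the median only; the spread is untouched.
center_only = RobustScaler(with_centering=True, with_scaling=False)
print(center_only.fit_transform(X).ravel())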
Example #5
Source File: scaler.py From pyts with BSD 3-Clause "New" or "Revised" License
def transform(self, X):
    """Scale the data.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_timestamps)
        Data to scale.

    Returns
    -------
    X_new : array-like, shape = (n_samples, n_timestamps)
        Scaled data.

    """
    X = check_array(X, dtype='float64')
    scaler = SklearnRobustScaler(
        with_centering=self.with_centering,
        with_scaling=self.with_scaling,
        quantile_range=self.quantile_range
    )
    # Transpose so each sample (row) is scaled along its own timestamps
    # rather than feature-wise across samples.
    X_new = scaler.fit_transform(X.T).T
    return X_new
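The transpose trick above is worth noting: scikit-learn scalers operate column-wise, so fitting on X.T and transposing back scales each time series on its own median and IQR. A minimal check on made-up data:

import numpy as np
from sklearn.preprocessing import RobustScaler

X = np.array([[1.0, 2.0, 3.0, 10.0],
              [5.0, 6.0, 7.0, 8.0]])

# The column-wise scaler applied to X.T scales along each row of X,
# i.e. along each sample's own timestamps.
X_new = RobustScaler().fit_transform(X.T).T
print(np.median(X_new, axis=1))  # 0 for every sample here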
Example #6
Source File: test_data.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_df_values(self):
    est1 = dpp.RobustScaler()
    est2 = dpp.RobustScaler()

    result_ar = est1.fit_transform(X)
    result_df = est2.fit_transform(df)
    if hasattr(result_df, "values"):
        result_df = result_df.values
    assert_eq_ar(result_ar, result_df)

    for attr in ["scale_", "center_"]:
        assert_eq_ar(getattr(est1, attr), getattr(est2, attr))

    assert_eq_ar(est1.transform(X), est2.transform(X))
    assert_eq_ar(est1.transform(df).values, est2.transform(X))
    assert_eq_ar(est1.transform(X), est2.transform(df).values)

    # different data types
    df["0"] = df["0"].astype("float32")
    result_ar = est1.fit_transform(X)
    result_df = est2.fit_transform(df)
    if hasattr(result_df, "values"):
        result_df = result_df.values
    assert_eq_ar(result_ar, result_df)
Example #7
Source File: RobustScaler.py From Splunking-Crime with GNU Affero General Public License v3.0
def __init__(self, options):
    self.handle_options(options)
    out_params = convert_params(
        options.get('params', {}),
        bools=['with_centering', 'with_scaling'],
        strs=['quantile_range'],
    )

    if (StrictVersion(sklearn_version)
            < StrictVersion(quantile_range_required_version)
            and 'quantile_range' in out_params.keys()):
        out_params.pop('quantile_range')
        msg = ('The quantile_range option is ignored in this version of '
               'scikit-learn ({}): version {} or higher required')
        msg = msg.format(sklearn_version, quantile_range_required_version)
        messages.warn(msg)

    if 'quantile_range' in out_params.keys():
        try:
            out_params['quantile_range'] = tuple(
                int(i) for i in out_params['quantile_range'].split('-'))
            assert len(out_params['quantile_range']) == 2
        except:
            raise RuntimeError('Syntax Error: quantile_range requires a '
                               'range, e.g., quantile_range=25-75')

    self.estimator = _RobustScaler(**out_params)
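The try/except above converts a Splunk-style 25-75 option string into the tuple scikit-learn expects for quantile_range. The same idea as a standalone sketch (the helper name and option string are hypothetical):

from sklearn.preprocessing import RobustScaler

def parse_quantile_range(option: str) -> tuple:
    """Turn a '25-75'-style string into RobustScaler's quantile_range tuple."""
    parts = tuple(float(p) for p in option.split('-'))
    if len(parts) != 2:
        raise ValueError("quantile_range requires a range, e.g. 25-75")
    return parts

scaler = RobustScaler(quantile_range=parse_quantile_range("10-90"))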
Example #8
Source File: staticautoencoder.py From pyodds with MIT License
def fit(self, X):
    """Fit detector.

    Parameters
    ----------
    X : dataframe of shape (n_samples, n_features)
        The input samples.
    """
    scaler = preprocessing.RobustScaler().fit(X)
    X_train = scaler.transform(X)
    if self.hidden_neurons is None:
        self.hidden_neurons = [X_train.shape[1] // 2 + 1,
                               X_train.shape[1] // 4 + 1,
                               X_train.shape[1] // 4 + 1,
                               X_train.shape[1] // 2 + 1]
    self.batch_size = X_train.shape[0] // 10
    self.model = self._build_model()
    self.model.fit(X_train, X_train,
                   epochs=self.epoch,
                   batch_size=self.batch_size)
    return self
Example #9
Source File: engine.py From Clairvoyant with MIT License
def fit(self, X, y):
    self.XX = vstack(X)
    self.yy = hstack(y)
    self.scaler = RobustScaler().fit(self.XX)
    self.svc.fit(self.scaler.transform(self.XX), self.yy)
Example #10
Source File: common.py From typhon with MIT License
def _iwp_model(self, processes, cv_folds):
    """Return the default model for the IWP regressor
    """
    # Estimators are normally objects that have a fit and predict method
    # (e.g. MLPRegressor from sklearn). To make their training easier we
    # scale the input data in advance. With Pipeline objects from sklearn
    # we can combine such steps easily since they behave like an
    # estimator object as well.
    estimator = Pipeline([
        # SVM or NN work better if we have scaled the data in the first
        # place. MinMaxScaler is the simplest one. RobustScaler or
        # StandardScaler could be an alternative.
        ("scaler", RobustScaler(quantile_range=(15, 85))),
        # The "real" estimator:
        ("estimator", MLPRegressor(max_iter=6000, early_stopping=True)),
    ])

    # To optimize the results, we try different hyper parameters by
    # using a grid search
    hidden_layer_sizes = [
        (15, 10, 3),
        # (50, 20),
    ]
    hyper_parameter = [
        {
            # Hyper parameter for lbfgs solver
            'estimator__solver': ['lbfgs'],
            'estimator__activation': ['tanh'],
            'estimator__hidden_layer_sizes': hidden_layer_sizes,
            'estimator__random_state': [0, 42, 100, 3452],
            'estimator__alpha': [0.1, 0.001, 0.0001],
        },
    ]

    return GridSearchCV(
        estimator, hyper_parameter, refit=True,
        n_jobs=processes, cv=cv_folds, verbose=self.verbose,
    )
Example #11
Source File: test_investigate.py From sklearn-onnx with MIT License
def test_simple_column_transformer(self):
    if ColumnTransformer is None:
        return
    data = numpy.array([[0, 0], [0, 0], [2, 1], [2, 1]],
                       dtype=numpy.float32)
    model = ColumnTransformer([("scaler1", StandardScaler(), [0]),
                               ("scaler2", RobustScaler(), [1])])
    model.fit(data)
    all_models = list(enumerate_pipeline_models(model))
    steps = collect_intermediate_steps(
        model, "column transformer",
        [("input", FloatTensorType([None, 2]))])

    assert len(steps) == 2
    assert len(all_models) == 3

    model.transform(data)
    for step in steps:
        onnx_step = step['onnx_step']
        sess = onnxruntime.InferenceSession(onnx_step.SerializeToString())
        onnx_outputs = sess.run(None, {'input': data})
        onnx_output = onnx_outputs[0]
        skl_outputs = step['model']._debug.outputs['transform']
        assert_almost_equal(onnx_output, skl_outputs)
        compare_objects(onnx_output.tolist(), skl_outputs.tolist())
Example #12
Source File: gmm.py From ml-ids with MIT License
def main():
    """Run the IDS using GMM experiment."""
    week3Data = _parseTrainingData()

    # Scale the training data (ignore the timestamp column)
    scaler = preprocessing.RobustScaler().fit(week3Data[:, 1:])
    X_train = scaler.transform(week3Data[:, 1:])
    del week3Data

    try:
        gmm = pickle.load(open("data/gmm.pkl", "rb"))
        print("Loading pre-trained GMM...")
    except IOError:
        print("Training the Gaussian Mixture...")
        gmm = GaussianMixture(n_components=16,
                              covariance_type='full',
                              # reg_covar=1,
                              verbose=1,
                              verbose_interval=2).fit(X_train)
        pickle.dump(gmm, open("data/gmm.pkl", "wb"))
    del X_train

    X_orig = _parseTestingData()

    print("Scaling the test data...")
    X_test = scaler.transform(X_orig[:, 1:])

    print("Calculating posterior probabilities of test data...")
    probs = gmm.predict_proba(X_test)
    del X_test

    scores = _score(probs)
    del probs

    results = np.hstack((X_orig, scores.reshape((scores.shape[0], 1))))
    _outputToCSV(results, "data/gmm_results_max.csv")
Example #13
Source File: test_data.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_inverse_transform(self):
    a = dpp.RobustScaler()
    result = a.inverse_transform(a.fit_transform(X))
    assert dask.is_dask_collection(result)
    assert_eq_ar(result, X)
Example #14
Source File: test_data.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_transform(self):
    a = dpp.RobustScaler()
    b = spp.RobustScaler()
    a.fit(X)
    b.fit(X.compute())

    # overwriting dask-ml's fitted attributes to have them exactly equal
    # (the approximate equality is tested above)
    a.scale_ = b.scale_
    a.center_ = b.center_

    assert dask.is_dask_collection(a.transform(X))
    assert_eq_ar(a.transform(X), b.transform(X.compute()))
Example #15
Source File: test_data.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_fit(self):
    a = dpp.RobustScaler()
    b = spp.RobustScaler()

    # bigger data to make percentile more reliable
    # and not centered around 0 to make rtol work
    X, y = make_classification(n_samples=1000, chunks=200, random_state=0)
    X = X + 3

    a.fit(X)
    b.fit(X.compute())
    assert_estimator_equal(a, b, rtol=0.2)
Example #16
Source File: test_scale.py From skoot with MIT License
def test_selective_scale_robust():
    # test the ref for a provided estimator
    rb_scale = RobustScaler().fit(X)
    trans = SelectiveRobustScaler().fit(X)
    assert_array_almost_equal(rb_scale.fit_transform(X),
                              trans.transform(X).values)
Example #17
Source File: zil.py From incremental_learning.pytorch with MIT License
def __init__(self, feature_range, robust=0, normalize=False, truncate=False):
    self.feature_range = feature_range
    self.robust = robust
    self.normalize = normalize
    self.truncate = truncate

    if self.robust:
        self.skprepro = skpreprocessing.RobustScaler()
Example #18
Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0
def test_generate_import_code():
    """Assert that generate_import_code() returns the correct set of
    dependencies for a given pipeline."""
    pipeline = creator.Individual.from_string(
        'GaussianNB(RobustScaler(input_matrix))', tpot_obj._pset)

    expected_code = """import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
"""
    assert expected_code == generate_import_code(pipeline, tpot_obj.operators)
Example #19
Source File: custom_transformers.py From pandas-pipelines-custom-transformers with MIT License
def fit(self, X, y=None):
    self.rs = RobustScaler()
    self.rs.fit(X)
    self.center_ = pd.Series(self.rs.center_, index=X.columns)
    self.scale_ = pd.Series(self.rs.scale_, index=X.columns)
    return self
Example #20
Source File: data_utils.py From pt-ranking.github.io with MIT License
def ini_scaler(self, joint_transform=False):
    assert self.scaler_id in SCALER_ID

    if self.scaler_id == 'MinMaxScaler':
        self.scaler = MinMaxScaler()
    elif self.scaler_id == 'RobustScaler':
        self.scaler = RobustScaler()
    elif self.scaler_id == 'StandardScaler':
        self.scaler = StandardScaler()

    if self.train and 'DATASET' == self.scaler_level:
        f_mat = self.df[self.feature_cols]
        self.scaler.fit(f_mat)
        if joint_transform:
            self.df[self.feature_cols] = self.scaler.transform(f_mat)
Example #21
Source File: transformations.py From AMPL with MIT License
def __init__(self, params, dataset):
    """Initializes a UMAPTransformer object.

    Args:
        params (Namespace): Contains parameters used to instantiate the transformer.

        dataset (Dataset): Dataset used to "train" the projection mapping.
    """
    # TODO: decide whether to make n_epochs a parameter
    # default_n_epochs = None
    default_n_epochs = 500

    if params.prediction_type == 'classification':
        target_metric = 'categorical'
    else:
        target_metric = 'l2'
    self.scaler = RobustScaler()
    # Use Imputer to replace missing values (NaNs) with means for each column
    self.imputer = Imputer()
    scaled_X = self.scaler.fit_transform(self.imputer.fit_transform(dataset.X))
    self.mapper = umap.UMAP(n_neighbors=params.umap_neighbors,
                            n_components=params.umap_dim,
                            metric=params.umap_metric,
                            target_metric=target_metric,
                            target_weight=params.umap_targ_wt,
                            min_dist=params.umap_min_dist,
                            n_epochs=default_n_epochs)
    # TODO: How to deal with multitask data?
    self.mapper.fit(scaled_X, y=dataset.y.flatten())
Example #22
Source File: diff.py From gordo with GNU Affero General Public License v3.0
def __init__(
    self,
    base_estimator: BaseEstimator = KerasAutoEncoder(kind="feedforward_hourglass"),
    scaler: TransformerMixin = RobustScaler(),
    require_thresholds: bool = True,
    window=None,
):
    """
    Classifier which wraps a ``base_estimator`` and provides a diff error
    based approach to anomaly detection.

    It trains a ``scaler`` to the target **after** training, purely for
    error calculations. The underlying ``base_estimator`` is trained with
    the original, unscaled, ``y``.

    Parameters
    ----------
    base_estimator: sklearn.base.BaseEstimator
        The model to which normal ``.fit``, ``.predict`` methods will be used.
        defaults to py:class:`gordo.machine.model.models.KerasAutoEncoder`
        with ``kind='feedforward_hourglass``
    scaler: sklearn.base.TransformerMixin
        Defaults to ``sklearn.preprocessing.RobustScaler``
        Used for transforming model output and the original ``y`` to calculate
        the difference/error in model output vs expected.
    require_thresholds: bool
        Requires calculating ``thresholds_`` via a call to
        :func:`~DiffBasedAnomalyDetector.cross_validate`.
        If this is set (default True), but
        :func:`~DiffBasedAnomalyDetector.cross_validate` was not called
        before calling :func:`~DiffBasedAnomalyDetector.anomaly` an
        ``AttributeError`` will be raised.
    window: int
        Window size for smoothed thresholds
    """
    self.base_estimator = base_estimator
    self.scaler = scaler
    self.require_thresholds = require_thresholds
    self.window = window
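As a rough illustration of the diff-error idea the docstring describes (a sketch, not gordo's actual implementation), one could scale both the true and predicted targets and score each sample by the size of their difference:

import numpy as np
from sklearn.preprocessing import RobustScaler

def diff_error(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    """Per-sample error between scaled truth and scaled prediction.

    Hypothetical helper; both arrays are assumed 2D (n_samples, n_targets).
    """
    # Fit the scaler on the target, as the docstring notes.
    scaler = RobustScaler().fit(y_true)
    diff = scaler.transform(y_pred) - scaler.transform(y_true)
    return np.linalg.norm(diff, axis=1)  # one anomaly score per sample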
Example #23
Source File: test_sklearn_pipeline_within_pipeline.py From sklearn-onnx with MIT License
def test_pipeline_column_transformer_pipeline_imputer_scaler_lr(self):
    X = np.array([[1, 2], [3, np.nan], [3, 0]], dtype=np.float32)
    y = np.array([1, 0, 1])
    model = Pipeline([
        (
            "ct",
            ColumnTransformer([
                (
                    "pipeline1",
                    Pipeline([
                        ("imputer", SimpleImputer()),
                        ("scaler", StandardScaler()),
                    ]),
                    [0],
                ),
                (
                    "pipeline2",
                    Pipeline([
                        ("imputer", SimpleImputer()),
                        ("scaler", RobustScaler()),
                    ]),
                    [1],
                ),
            ]),
        ),
        ("lr", LogisticRegression(solver="liblinear")),
    ])
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model,
        "pipelinewithinpipeline",
        [("input", FloatTensorType([None, X.shape[1]]))],
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X,
        model,
        model_onnx,
        basename="SklearnPipelineCTPipelineImputerScalerLR",
        allow_failure="StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.1')",
    )
Example #24
Source File: test_pipe.py From skutil with BSD 3-Clause "New" or "Revised" License
def test_random_grid():
    # build a pipeline
    pipe = Pipeline([
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        ('encoder', OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [StandardScaler(), RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # define the gridsearch
    search = RandomizedSearchCV(pipe, hp,
                                n_iter=2,  # just to test it even works
                                scoring='accuracy',
                                cv=2,
                                random_state=42)

    # fit the search
    search.fit(X_train, y_train)

    # test the report
    report_grid_score_detail(search, charts=False)
Example #25
Source File: test_big.py From skutil with BSD 3-Clause "New" or "Revised" License
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random
    data in order to assert that the test error will far supersede the train
    error.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True,
                          random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2,
                              scoring='accuracy',
                              n_jobs=1,
                              cv=custom_cv,
                              random_state=42)

    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 1.0})

    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'y_axis': 'bad_axis'})

    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works