Python sklearn.impute.SimpleImputer() Examples
The following are 29 code examples of sklearn.impute.SimpleImputer(), collected from open-source projects. The source file, project, and license are noted above each example. You may also want to check out all available functions and classes of the sklearn.impute module.
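For orientation before the project examples, here is a minimal, self-contained sketch of the basic SimpleImputer workflow (fit learns per-column statistics, transform fills missing entries); the array values are illustrative only:

import numpy as np
from sklearn.impute import SimpleImputer

# Replace each np.nan with the column mean learned during fit().
X_train = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, 6.0]])
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer.fit(X_train)

print(imputer.statistics_)       # per-column fill values: [4.0, 3.666...]
X_new = np.array([[np.nan, np.nan]])
print(imputer.transform(X_new))  # [[4.0, 3.666...]]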
Example #1
Source File: estimator.py From ramp-workflow with BSD 3-Clause "New" or "Revised" License
def get_estimator():
    categorical_cols = ['Sex', 'Pclass', 'Embarked']
    numerical_cols = ['Age', 'SibSp', 'Parch', 'Fare']

    preprocessor = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        (SimpleImputer(strategy='constant', fill_value=-1), numerical_cols),
    )

    pipeline = Pipeline([
        ('transformer', preprocessor),
        ('classifier', LogisticRegression()),
    ])
    return pipeline
Example #2
Source File: test_cml_ImputerConverter.py From onnxmltools with MIT License
def test_imputer(self):
    try:
        model = Imputer(missing_values='NaN', strategy='mean', axis=0)
    except TypeError:
        model = Imputer(missing_values=np.nan, strategy='mean')
        model.axis = 0
    data = [[1, 2], [np.nan, 3], [7, 6]]
    model.fit(data)
    from onnxmltools.convert.coreml.convert import convert
    import coremltools  # noqa
    try:
        model_coreml = coremltools.converters.sklearn.convert(model)
    except ValueError as e:
        if 'not supported' in str(e):
            # Python 2.7 + scikit-learn 0.22
            return
    model_onnx = convert(model_coreml.get_spec())
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(np.array(data, dtype=np.float32),
                        model, model_onnx,
                        basename="CmlImputerMeanFloat32")
Example #3
Source File: test_sklearn_imputer_converter.py From sklearn-onnx with MIT License
def test_simple_imputer_float_inputs(self):
    model = SimpleImputer(strategy="mean", fill_value="nan")
    data = [[1, 2], [np.nan, 3], [7, 6]]
    model.fit(data)

    model_onnx = convert_sklearn(
        model,
        "scikit-learn simple imputer",
        [("input", FloatTensorType([None, 2]))],
        target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx.graph.node is not None)

    # should contain only node
    self.assertEqual(len(model_onnx.graph.node), 1)

    # last node should contain the Imputer
    outputs = model_onnx.graph.output
    self.assertEqual(len(outputs), 1)
    self.assertEqual(
        outputs[0].type.tensor_type.shape.dim[-1].dim_value, 2)
    dump_data_and_model(
        np.array(data, dtype=np.float32),
        model, model_onnx,
        basename="SklearnSimpleImputerMeanFloat32")
Example #4
Source File: base.py From tpot with GNU Lesser General Public License v3.0
def _impute_values(self, features):
    """Impute missing values in a feature set.

    Parameters
    ----------
    features: array-like {n_samples, n_features}
        A feature matrix

    Returns
    -------
    array-like {n_samples, n_features}
    """
    if self.verbosity > 1:
        print('Imputing missing values in feature set')

    if self._fitted_imputer is None:
        self._fitted_imputer = SimpleImputer(strategy="median")
        self._fitted_imputer.fit(features)

    return self._fitted_imputer.transform(features)
Example #5
Source File: test_impute.py From Mastering-Elasticsearch-7.0 with MIT License
def test_simple_imputation_add_indicator_sparse_matrix(arr_type):
    X_sparse = arr_type([
        [np.nan, 1, 5],
        [2, np.nan, 1],
        [6, 3, np.nan],
        [1, 2, 9]
    ])
    X_true = np.array([
        [3., 1., 5., 1., 0., 0.],
        [2., 2., 1., 0., 1., 0.],
        [6., 3., 5., 0., 0., 1.],
        [1., 2., 9., 0., 0., 0.],
    ])

    imputer = SimpleImputer(missing_values=np.nan, add_indicator=True)
    X_trans = imputer.fit_transform(X_sparse)

    assert sparse.issparse(X_trans)
    assert X_trans.shape == X_true.shape
    assert_allclose(X_trans.toarray(), X_true)
Example #6
Source File: test_impute.py From Mastering-Elasticsearch-7.0 with MIT License
def test_imputation_pipeline_grid_search():
    # Test imputation within a pipeline + gridsearch.
    X = sparse_random_matrix(100, 100, density=0.10)
    missing_values = X.data[0]

    pipeline = Pipeline([('imputer',
                          SimpleImputer(missing_values=missing_values)),
                         ('tree',
                          tree.DecisionTreeRegressor(random_state=0))])

    parameters = {
        'imputer__strategy': ["mean", "median", "most_frequent"]
    }

    Y = sparse_random_matrix(100, 1, density=0.10).toarray()
    gs = GridSearchCV(pipeline, parameters)
    gs.fit(X, Y)
Example #7
Source File: test_impute.py From Mastering-Elasticsearch-7.0 with MIT License
def test_imputation_constant_pandas(dtype):
    # Test imputation using the constant strategy on pandas df
    pd = pytest.importorskip("pandas")

    f = io.StringIO("Cat1,Cat2,Cat3,Cat4\n"
                    ",i,x,\n"
                    "a,,y,\n"
                    "a,j,,\n"
                    "b,j,x,")

    df = pd.read_csv(f, dtype=dtype)

    X_true = np.array([
        ["missing_value", "i", "x", "missing_value"],
        ["a", "missing_value", "y", "missing_value"],
        ["a", "j", "missing_value", "missing_value"],
        ["b", "j", "x", "missing_value"]
    ], dtype=object)

    imputer = SimpleImputer(strategy="constant")
    X_trans = imputer.fit_transform(df)

    assert_array_equal(X_trans, X_true)
Example #8
Source File: test_impute.py From Mastering-Elasticsearch-7.0 with MIT License
def test_imputation_constant_object(marker):
    # Test imputation using the constant strategy on objects
    X = np.array([
        [marker, "a", "b", marker],
        ["c", marker, "d", marker],
        ["e", "f", marker, marker],
        ["g", "h", "i", marker]
    ], dtype=object)

    X_true = np.array([
        ["missing", "a", "b", "missing"],
        ["c", "missing", "d", "missing"],
        ["e", "f", "missing", "missing"],
        ["g", "h", "i", "missing"]
    ], dtype=object)

    imputer = SimpleImputer(missing_values=marker, strategy="constant",
                            fill_value="missing")
    X_trans = imputer.fit_transform(X)

    assert_array_equal(X_trans, X_true)
Example #9
Source File: test_impute.py From Mastering-Elasticsearch-7.0 with MIT License
def test_imputation_constant_float(array_constructor):
    # Test imputation using the constant strategy on floats
    X = np.array([
        [np.nan, 1.1, 0, np.nan],
        [1.2, np.nan, 1.3, np.nan],
        [0, 0, np.nan, np.nan],
        [1.4, 1.5, 0, np.nan]
    ])

    X_true = np.array([
        [-1, 1.1, 0, -1],
        [1.2, -1, 1.3, -1],
        [0, 0, -1, -1],
        [1.4, 1.5, 0, -1]
    ])

    X = array_constructor(X)
    X_true = array_constructor(X_true)

    imputer = SimpleImputer(strategy="constant", fill_value=-1)
    X_trans = imputer.fit_transform(X)

    assert_allclose_dense_sparse(X_trans, X_true)
Example #10
Source File: test_impute.py From Mastering-Elasticsearch-7.0 with MIT License
def test_imputation_most_frequent_pandas(dtype):
    # Test imputation using the most frequent strategy on pandas df
    pd = pytest.importorskip("pandas")

    f = io.StringIO("Cat1,Cat2,Cat3,Cat4\n"
                    ",i,x,\n"
                    "a,,y,\n"
                    "a,j,,\n"
                    "b,j,x,")

    df = pd.read_csv(f, dtype=dtype)

    X_true = np.array([
        ["a", "i", "x"],
        ["a", "j", "y"],
        ["a", "j", "x"],
        ["b", "j", "x"]
    ], dtype=object)

    imputer = SimpleImputer(strategy="most_frequent")
    X_trans = imputer.fit_transform(df)

    assert_array_equal(X_trans, X_true)
Example #11
Source File: test_impute.py From Mastering-Elasticsearch-7.0 with MIT License
def test_imputation_most_frequent_objects(marker):
    # Test imputation using the most-frequent strategy.
    X = np.array([
        [marker, marker, "a", "f"],
        [marker, "c", marker, "d"],
        [marker, "b", "d", marker],
        [marker, "c", "d", "h"],
    ], dtype=object)

    X_true = np.array([
        ["c", "a", "f"],
        ["c", "d", "d"],
        ["b", "d", "d"],
        ["c", "d", "h"],
    ], dtype=object)

    imputer = SimpleImputer(missing_values=marker,
                            strategy="most_frequent")
    X_trans = imputer.fit(X).transform(X)

    assert_array_equal(X_trans, X_true)
Example #12
Source File: test_impute.py From Mastering-Elasticsearch-7.0 with MIT License
def test_imputation_most_frequent():
    # Test imputation using the most-frequent strategy.
    X = np.array([
        [-1, -1, 0, 5],
        [-1, 2, -1, 3],
        [-1, 1, 3, -1],
        [-1, 2, 3, 7],
    ])

    X_true = np.array([
        [2, 0, 5],
        [2, 3, 3],
        [1, 3, 3],
        [2, 3, 7],
    ])

    # scipy.stats.mode, used in SimpleImputer, doesn't return the first most
    # frequent as promised in the doc but the lowest most frequent. When this
    # test will fail after an update of scipy, SimpleImputer will need to be
    # updated to be consistent with the new (correct) behaviour
    _check_statistics(X, X_true, "most_frequent", [np.nan, 2, 3, 3], -1)
Example #13
Source File: survey_dataset_reader.py From cs-ranking with Apache License 2.0
def __load_dataset__(self):
    df = pd.io.stata.read_stata(self.train_file)
    orderings = []
    features = []
    for row in df.itertuples():
        orderings.append(row[4:8])
        context_feature = [float(i) if i != "." else np.NAN
                           for i in row[13:33]]
        features.append(context_feature)
    X = np.array(features)
    X = SimpleImputer().fit_transform(X)
    X = np.array([np.log(np.array(X[:, i]) + 1)
                  for i in range(len(features[0]))])
    X = np.array(X.T)
    self.X = StandardScaler().fit_transform(X)
    orderings = np.array(orderings) - 1
    self.Y = ranking_ordering_conversion(orderings)
    self.__check_dataset_validity__()
Example #14
Source File: imputation.py From Auto-PyTorch with Apache License 2.0
def fit(self, hyperparameter_config, X, train_indices, dataset_info):
    hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)
    if dataset_info.is_sparse:
        return {'imputation_preprocessor': None, 'all_nan_columns': None}

    # delete all nan columns
    all_nan = np.all(np.isnan(X), axis=0)
    X = X[:, ~all_nan]
    dataset_info.categorical_features = [
        dataset_info.categorical_features[i]
        for i, is_nan in enumerate(all_nan) if not is_nan]

    strategy = hyperparameter_config['strategy']
    fill_value = int(np.nanmax(X)) + 1 if not dataset_info.is_sparse else 0
    numerical_imputer = SimpleImputer(strategy=strategy, copy=False)
    categorical_imputer = SimpleImputer(strategy='constant', copy=False,
                                        fill_value=fill_value)
    transformer = ColumnTransformer(
        transformers=[
            ('numerical_imputer', numerical_imputer,
             [i for i, c in enumerate(dataset_info.categorical_features) if not c]),
            ('categorical_imputer', categorical_imputer,
             [i for i, c in enumerate(dataset_info.categorical_features) if c])])
    transformer.fit(X[train_indices])
    X = transformer.transform(X)

    dataset_info.categorical_features = sorted(dataset_info.categorical_features)

    return {'X': X,
            'imputation_preprocessor': transformer,
            'dataset_info': dataset_info,
            'all_nan_columns': all_nan}
Example #15
Source File: lr_model.py From autogluon with Apache License 2.0
def preprocess_train(self, X, feature_types, vect_max_features):
    transformer_list = []
    if len(feature_types['language']) > 0:
        pipeline = Pipeline(steps=[
            ("preparator", NlpDataPreprocessor(nlp_cols=feature_types['language'])),
            ("vectorizer", TfidfVectorizer(ngram_range=self.params['proc.ngram_range'],
                                           sublinear_tf=True,
                                           max_features=vect_max_features,
                                           tokenizer=self.tokenize))
        ])
        transformer_list.append(('vect', pipeline))
    if len(feature_types['onehot']) > 0:
        pipeline = Pipeline(steps=[
            ('generator', OheFeaturesGenerator(cats_cols=feature_types['onehot'])),
        ])
        transformer_list.append(('cats', pipeline))
    if len(feature_types['continuous']) > 0:
        pipeline = Pipeline(steps=[
            ('generator', NumericDataPreprocessor(cont_cols=feature_types['continuous'])),
            ('imputer', SimpleImputer(strategy=self.params['proc.impute_strategy'])),
            ('scaler', StandardScaler())
        ])
        transformer_list.append(('cont', pipeline))
    if len(feature_types['skewed']) > 0:
        pipeline = Pipeline(steps=[
            ('generator', NumericDataPreprocessor(cont_cols=feature_types['skewed'])),
            ('imputer', SimpleImputer(strategy=self.params['proc.impute_strategy'])),
            ('quantile', QuantileTransformer(output_distribution='normal')),  # Or output_distribution = 'uniform'
        ])
        transformer_list.append(('skew', pipeline))

    self.pipeline = FeatureUnion(transformer_list=transformer_list)
    self.pipeline.fit(X)
Example #16
Source File: test_model_selection_sklearn.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_grid_search_allows_nans():
    # Test dcv.GridSearchCV with Imputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
    p = Pipeline([("imputer", imputer), ("classifier", MockClassifier())])
    dcv.GridSearchCV(p, {"classifier__foo_param": [1, 2, 3]}, cv=2).fit(X, y)
Example #17
Source File: test_feat_mappers.py From interpret-community with MIT License
def _get_nested_pipelines_and_data(self, last_transformer=None):
    # returns a pipeline that can be used to test nested pipelines. When
    # last_transformer is not None, it is added as the last transformer in
    # pipeline_1
    steps = [("a", SimpleImputer()), ("b", OneHotEncoder())]
    if last_transformer:
        steps.append(("c", last_transformer))
    pipeline_1 = Pipeline(steps)
    pipeline = Pipeline([("a", SimpleImputer()), ("b", pipeline_1)])
    x = np.zeros((5, 2))
    x[0, 0] = 1
    x[0, 1] = 1
    x[1, 0] = 2
    return pipeline.fit(x), x
Example #18
Source File: test_data_mapper.py From interpret-community with MIT License
def test_pipeline_transform_list(self):
    pipeline = Pipeline([("imputer", SimpleImputer()),
                         ("onehotencoder", OneHotEncoder())])
    x = np.ones((3, 2))
    pipeline.fit(x)
    data_mapper = DataMapper([([0, 1], pipeline)])
    result = data_mapper.transform(x)
    assert result.shape == (3, 2)
Example #19
Source File: test_feat_mappers.py From interpret-community with MIT License
def test_identity_mapper(self):
    x = np.zeros((5, 2))
    imputer = SimpleImputer()
    imputer.fit(x)
    mapper = IdentityMapper(imputer)
    mapper.transform(x)
    feature_map = np.eye(2)
    assert np.all(mapper.feature_map == feature_map)
Example #20
Source File: test_feat_mappers.py From interpret-community with MIT License
def test_get_feature_mapper_tuple_for_pipeline(self):
    pipeline = Pipeline([("a", SimpleImputer()),
                         ("b", SimpleImputer()),
                         ("c", OneHotEncoder())])
    x = np.zeros((5, 2))
    x[0, 0] = 1
    x[0, 1] = 1
    pipeline.fit(x)
    feature_mapper = get_feature_mapper_for_pipeline(pipeline)
    feature_mapper.transform(x)
    feature_map = np.zeros((2, 4))
    feature_map[0, :2] = 1
    feature_map[1, 2:] = 1
    assert np.all(feature_mapper.feature_map == feature_map)
Example #21
Source File: test_data_mapper.py From interpret-community with MIT License
def test_pipeline_transform_column_transformer(self):
    pipeline = Pipeline([("imputer", SimpleImputer()),
                         ("onehotencoder", OneHotEncoder())])
    x = np.ones((3, 2))
    column_transformer = ColumnTransformer([
        ("column", pipeline, [0, 1])
    ])
    column_transformer.fit(x)
    data_mapper = DataMapper(column_transformer)
    result = data_mapper.transform(x)
    assert result.shape == (3, 2)
Example #22
Source File: test_ceteris_paribus.py From DALEX with GNU General Public License v3.0
def setUp(self):
    data = dx.datasets.load_titanic()
    data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)
    self.X = data.drop(columns='survived')
    self.y = data.survived

    numeric_features = ['age', 'fare', 'sibsp', 'parch']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_features = ['gender', 'class', 'embarked']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', MLPClassifier(hidden_layer_sizes=(50, 100, 50),
                                                       max_iter=400, random_state=0))])

    clf.fit(self.X, self.y)

    self.exp = dx.Explainer(clf, self.X, self.y, verbose=False)
Example #23
Source File: test_aggregated_profiles.py From DALEX with GNU General Public License v3.0
def setUp(self):
    data = dx.datasets.load_titanic()
    data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)
    self.X = data.drop(columns='survived')
    self.y = data.survived

    numeric_features = ['age', 'fare', 'sibsp', 'parch']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_features = ['gender', 'class', 'embarked']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', MLPClassifier(hidden_layer_sizes=(20, 20),
                                                       max_iter=400, random_state=0))])
    clf2 = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', MLPClassifier(hidden_layer_sizes=(50, 100, 50),
                                                        max_iter=400, random_state=0))])

    clf.fit(self.X, self.y)
    clf2.fit(self.X, self.y)

    self.exp = dx.Explainer(clf, self.X, self.y, label="model1", verbose=False)
    self.exp2 = dx.Explainer(clf2, self.X, self.y, verbose=False)
    self.exp3 = dx.Explainer(clf, self.X, self.y, label="model3", verbose=False)
Example #24
Source File: test_model_performance.py From DALEX with GNU General Public License v3.0
def setUp(self):
    data = dx.datasets.load_titanic()
    data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)
    self.X = data.drop(columns='survived')
    self.y = data.survived

    numeric_features = ['age', 'fare', 'sibsp', 'parch']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_features = ['gender', 'class', 'embarked']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', MLPClassifier(hidden_layer_sizes=(50, 100, 50),
                                                       max_iter=400, random_state=0))])

    clf.fit(self.X, self.y)

    self.exp = dx.Explainer(clf, self.X, self.y, verbose=False)
    self.exp2 = dx.Explainer(clf, self.X, self.y, label="model2", verbose=False)
Example #25
Source File: data_cleaner.py From MAST-ML with MIT License
def imputation(df, strategy, cols_to_leave_out=None):
    """
    Method that imputes values to the missing places based on the median, mean, etc. of the data in the column

    Args:
        df: (dataframe), pandas dataframe containing data
        strategy: (str), method of imputation, e.g. median, mean, etc.
        cols_to_leave_out: (list), list of column indices to not include in imputation

    Returns:
        df: (dataframe): dataframe with NaN or missing values resolved via imputation
    """
    col_names = df.columns.tolist()
    if cols_to_leave_out is None:
        df_imputed = pd.DataFrame(Imputer(missing_values='NaN', strategy=strategy,
                                          axis=0).fit_transform(df))
    else:
        df_include = df.drop(cols_to_leave_out, axis=1)
        df_hold_out = df.drop([c for c in df.columns if c not in cols_to_leave_out], axis=1)
        df_imputed = pd.DataFrame(Imputer(missing_values='NaN', strategy=strategy,
                                          axis=0).fit_transform(df_include),
                                  columns=df_include.columns)
    # Need to join the imputed dataframe with the columns containing strings that were held out
    if cols_to_leave_out is None:
        df = df_imputed
    else:
        df = pd.concat([df_hold_out, df_imputed], axis=1)
    col_names = df.columns.tolist()
    return df
Example #26
Source File: __init__.py From sklearn2pmml with GNU Affero General Public License v3.0
def test_mapper(self):
    domain = ContinuousDomain()
    df = DataFrame([{"X1": 2.0, "X2": 2, "y": 2.0},
                    {"X1": 1.0, "X2": 0.5},
                    {"X1": 2},
                    {"X2": 2},
                    {"X1": 2.0, "y": 1},
                    {"X1": 3.0, "X2": 3.5}])
    mapper = DataFrameMapper([
        (["X1", "X2"], [domain, SimpleImputer(), StandardScaler()]),
        ("y", None)
    ])
    mapper.fit_transform(df)
    self.assertEqual({"totalFreq": [6, 6], "missingFreq": [1, 2], "invalidFreq": [0, 0]},
                     _array_to_list(domain.counts_))
    self.assertEqual({"minimum": [1.0, 0.5], "maximum": [3.0, 3.5], "mean": [2.0, 2.0]},
                     _array_to_list(dict((k, domain.numeric_info_[k])
                                         for k in ["minimum", "maximum", "mean"])))
    self.assertEqual([1.0, 0.5], domain.data_min_.tolist())
    self.assertEqual([3.0, 3.5], domain.data_max_.tolist())
Example #27
Source File: classifier.py From ramp-workflow with BSD 3-Clause "New" or "Revised" License
def __init__(self):
    self.clf = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('classifier', LogisticRegression(C=1., solver='lbfgs'))
    ])
Example #28
Source File: __init__.py From sklearn2pmml with GNU Affero General Public License v3.0
def test_sequence_transform(self):
    X = DataFrame([[None], [1], [None]], columns=["a"])
    mapper = DataFrameMapper([
        (["a"], [ExpressionTransformer("0 if pandas.isnull(X[0]) else X[0]"),
                 SimpleImputer(missing_values=0)])
    ])
    Xt = mapper.fit_transform(X)
    self.assertEqual([[1], [1], [1]], Xt.tolist())
Example #29
Source File: movie_maoyan_knn_font.py From akshare with MIT License
def process_data(data):
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    return pd.DataFrame(imputer.fit_transform(pd.DataFrame(data)))