Python sklearn.preprocessing.Imputer() Examples
The following are 30 code examples of sklearn.preprocessing.Imputer(). You can go to the original project or source file by following the links above each example, or check out all available functions and classes of the module sklearn.preprocessing.
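Note that Imputer was deprecated in scikit-learn 0.20 and removed in 0.22; its replacement is sklearn.impute.SimpleImputer. The sketch below shows the migration on a toy matrix; it assumes scikit-learn 0.20+ for the SimpleImputer branch, with the legacy call left commented out.

import numpy as np
from sklearn.impute import SimpleImputer

data = [[1, 2], [np.nan, 3], [7, 6]]

# Legacy API (scikit-learn < 0.22):
# from sklearn.preprocessing import Imputer
# imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

# Modern equivalent: missing_values takes np.nan rather than the string
# 'NaN', and there is no axis parameter (SimpleImputer is always column-wise).
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
print(imp.fit_transform(data))
# [[1. 2.]
#  [4. 3.]
#  [7. 6.]]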
Example #1
Source File: test_sklearn_imputer_converter.py From sklearn-onnx with MIT License
def test_imputer_float_inputs(self):
    model = Imputer(missing_values="NaN", strategy="mean", axis=0)
    data = [[1, 2], [np.nan, 3], [7, 6]]
    model.fit(data)

    model_onnx = convert_sklearn(
        model,
        "scikit-learn imputer",
        [("input", FloatTensorType([None, 2]))])
    self.assertTrue(model_onnx.graph.node is not None)

    # should contain only node
    self.assertEqual(len(model_onnx.graph.node), 1)

    # last node should contain the Imputer
    outputs = model_onnx.graph.output
    self.assertEqual(len(outputs), 1)
    self.assertEqual(outputs[0].type.tensor_type.shape.dim[-1].dim_value, 2)

    dump_data_and_model(
        np.array(data, dtype=np.float32),
        model,
        model_onnx,
        basename="SklearnImputerMeanFloat32",
    )
Example #2
Source File: test_cml_ImputerConverter.py From onnxmltools with MIT License
def test_imputer(self):
    try:
        model = Imputer(missing_values='NaN', strategy='mean', axis=0)
    except TypeError:
        model = Imputer(missing_values=np.nan, strategy='mean')
        model.axis = 0
    data = [[1, 2], [np.nan, 3], [7, 6]]
    model.fit(data)

    from onnxmltools.convert.coreml.convert import convert
    import coremltools  # noqa

    try:
        model_coreml = coremltools.converters.sklearn.convert(model)
    except ValueError as e:
        if 'not supported' in str(e):
            # Python 2.7 + scikit-learn 0.22
            return
    model_onnx = convert(model_coreml.get_spec())
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(np.array(data, dtype=np.float32),
                        model, model_onnx,
                        basename="CmlImputerMeanFloat32")
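The try/except in this test is a version-compatibility shim for the rename described in the introduction. A minimal import-level version of the same idea (a sketch, assuming only that sklearn.impute.SimpleImputer exists whenever the legacy class is gone):

try:
    from sklearn.preprocessing import Imputer           # scikit-learn < 0.22
except ImportError:
    from sklearn.impute import SimpleImputer as Imputer  # scikit-learn >= 0.22

The two classes still differ in signature (missing_values='NaN' vs. np.nan, and the removed axis parameter), so call sites may need the same kind of guard as the test above.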
Example #3
Source File: Utils.py From Kaggle-Competition-Sberbank with MIT License
def FeatureCombination(Df, s='', num_feature=2):
    feature_set = []
    for c in Df.columns:
        if c.startswith(s):
            feature_set.append(c)
    print('combining', len(feature_set), 'features')
    data = Df[feature_set].values

    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(Df[c].values))
            Df[c] = lbl.transform(list(Df[c].values))

    imp = preprocessing.Imputer()
    data = imp.fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(0, num_feature):
        Df[s + '_%d' % (i + 1)] = trans[:, i]
    Df.drop(feature_set, 1, inplace=True)
    return Df
Example #4
Source File: test_preprocessing.py From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_Imputer(self):
    arr = np.array([1, np.nan, 3, 2])
    s = pdml.ModelSeries(arr)

    mod1 = s.pp.Imputer(axis=0)
    s.fit(mod1)
    result = s.transform(mod1)
    expected = np.array([1, 2, 3, 2])

    self.assertIsInstance(result, pdml.ModelSeries)
    self.assert_numpy_array_almost_equal(result.values, expected)

    mod1 = s.pp.Imputer(axis=0)
    result = s.fit_transform(mod1)

    self.assertIsInstance(result, pdml.ModelSeries)
    self.assert_numpy_array_almost_equal(result.values, expected)
Example #5
Source File: test_preprocessing.py From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_transform_1d_frame_int(self):
    arr = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
    idx = pd.Index('a b c d e f g h i'.split(' '))
    df = pdml.ModelFrame(arr, index=idx, columns=['X'])
    self.assertEqual(len(df.columns), 1)

    # reshape arr to 2d
    arr = arr.reshape(-1, 1)

    if pd.compat.PY3:
        models = ['Binarizer', 'Imputer', 'StandardScaler']
        # MinMaxScaler raises TypeError in ufunc
    else:
        models = ['Binarizer', 'Imputer', 'StandardScaler', 'MinMaxScaler']
    for model in models:
        mod1 = getattr(df.preprocessing, model)()
        mod2 = getattr(pp, model)()
        self._assert_transform(df, arr, mod1, mod2)

        mod1 = getattr(df.preprocessing, model)()
        mod2 = getattr(pp, model)()
        self._assert_fit_transform(df, arr, mod1, mod2)
Example #6
Source File: test_preprocessing.py From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
    self.assertIs(df.preprocessing.FunctionTransformer,
                  pp.FunctionTransformer)
    self.assertIs(df.preprocessing.Imputer, pp.Imputer)
    self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
    self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
    self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
    self.assertIs(df.preprocessing.MultiLabelBinarizer,
                  pp.MultiLabelBinarizer)
    self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
    self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
    self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
    self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
    self.assertIs(df.preprocessing.PolynomialFeatures,
                  pp.PolynomialFeatures)
    self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
    self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler)
Example #7
Source File: test_sklearn_imputer_converter.py From sklearn-onnx with MIT License
def test_simple_imputer_float_inputs(self):
    model = SimpleImputer(strategy="mean", fill_value="nan")
    data = [[1, 2], [np.nan, 3], [7, 6]]
    model.fit(data)

    model_onnx = convert_sklearn(
        model, "scikit-learn simple imputer",
        [("input", FloatTensorType([None, 2]))],
        target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx.graph.node is not None)

    # should contain only node
    self.assertEqual(len(model_onnx.graph.node), 1)

    # last node should contain the Imputer
    outputs = model_onnx.graph.output
    self.assertEqual(len(outputs), 1)
    self.assertEqual(
        outputs[0].type.tensor_type.shape.dim[-1].dim_value, 2)
    dump_data_and_model(
        np.array(data, dtype=np.float32),
        model, model_onnx,
        basename="SklearnSimpleImputerMeanFloat32")
Example #8
Source File: GC_script.py From ClimateVegetationDynamics_GrangerCausality with GNU General Public License v3.0
def readFile(inpath):
    if os.path.isfile(inpath):
        dataset = genfromtxt(open(inpath, 'r'), delimiter=',', dtype='f8')[0:]
        # fill in the missing values with the mean of each column
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        transformedData = imp.fit_transform(dataset)
        rmvedCols = imp.statistics_
        # take the indices of the nan columns
        idxRmved = np.where(np.isnan(rmvedCols))
        # check if the target is a nan column
        nanTarget = dataset.shape[1] - 1 in idxRmved[0]
        if nanTarget:
            raise ValueError("The target variable contains only nan values or inf")
    else:
        raise ValueError("File does not exist")
    return transformedData

#parameters: vector 'target' which is the target variable
#returns: the dataset which includes the previous values of the target
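The statistics_ check in readFile relies on a quirk of the legacy Imputer: a column that contains only missing values gets a NaN entry in statistics_ and is silently dropped by transform. A small sketch of that behavior (assuming scikit-learn < 0.22, where Imputer still exists):

import numpy as np
from sklearn.preprocessing import Imputer

X = np.array([[1.0, np.nan],
              [3.0, np.nan]])
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
Xt = imp.fit_transform(X)        # shape (2, 1): the all-NaN column is dropped
dropped = np.where(np.isnan(imp.statistics_))[0]
print(dropped)                   # [1]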
Example #9
Source File: test_categorical_imputer.py From coremltools with BSD 3-Clause "New" or "Revised" License
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = Imputer(strategy="most_frequent", axis=0)
    scikit_data["data"][1, 8] = np.NaN

    input_data = scikit_data["data"][:, 8].reshape(-1, 1)
    scikit_model.fit(input_data, scikit_data["target"])

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
Example #10
Source File: train_predict.py From loan-default-prediction with MIT License
def get_reg_pipeline():
    clf = models.PartialRegressor(
        GradientBoostingRegressor(loss='ls', learning_rate=0.0075,
                                  n_estimators=5000, subsample=0.5,
                                  min_samples_split=20, min_samples_leaf=20,
                                  max_leaf_nodes=30, random_state=9753,
                                  verbose=0)
    )
    steps = [('features', models.FeatureSelector()),
             ('Impute', Imputer(strategy='median')),
             ('scaler', StandardScaler()),
             ('clf', clf)]
    return Pipeline(steps)
Example #11
Source File: transformations.py From AMPL with MIT License
def __init__(self, params, dataset):
    """Initializes a UMAPTransformer object.

    Args:
        params (Namespace): Contains parameters used to instantiate the transformer.

        dataset (Dataset): Dataset used to "train" the projection mapping.
    """
    # TODO: decide whether to make n_epochs a parameter
    #default_n_epochs = None
    default_n_epochs = 500

    if params.prediction_type == 'classification':
        target_metric = 'categorical'
    else:
        target_metric = 'l2'
    self.scaler = RobustScaler()
    # Use Imputer to replace missing values (NaNs) with means for each column
    self.imputer = Imputer()
    scaled_X = self.scaler.fit_transform(self.imputer.fit_transform(dataset.X))
    self.mapper = umap.UMAP(n_neighbors=params.umap_neighbors,
                            n_components=params.umap_dim,
                            metric=params.umap_metric,
                            target_metric=target_metric,
                            target_weight=params.umap_targ_wt,
                            min_dist=params.umap_min_dist,
                            n_epochs=default_n_epochs)
    # TODO: How to deal with multitask data?
    self.mapper.fit(scaled_X, y=dataset.y.flatten())
Example #12
Source File: test_calibration.py From twitter-stock-recommendation with MIT License
def test_calibration_nan_imputer():
    """Test that calibration can accept nan"""
    X, y = make_classification(n_samples=10, n_features=2,
                               n_informative=2, n_redundant=0,
                               random_state=42)
    X[0, 0] = np.nan
    clf = Pipeline(
        [('imputer', Imputer()),
         ('rf', RandomForestClassifier(n_estimators=1))])
    clf_c = CalibratedClassifierCV(clf, cv=2, method='isotonic')
    clf_c.fit(X, y)
    clf_c.predict(X)
Example #13
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License
def test_permutation_test_score_allow_nans():
    # Check that permutation_test_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cval.permutation_test_score(p, X, y, cv=5)
Example #14
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License
def test_cross_val_score_allow_nans():
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cval.cross_val_score(p, X, y, cv=5)
Example #15
Source File: test_search.py From twitter-stock-recommendation with MIT License
def test_grid_search_allows_nans():
    # Test GridSearchCV with Imputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
Example #16
Source File: test_validation.py From twitter-stock-recommendation with MIT License
def test_cross_val_score_allow_nans():
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cross_val_score(p, X, y, cv=5)
Example #17
Source File: test_validation.py From twitter-stock-recommendation with MIT License
def test_permutation_test_score_allow_nans():
    # Check that permutation_test_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    permutation_test_score(p, X, y, cv=5)
Example #18
Source File: train_predict.py From loan-default-prediction with MIT License
def get_clf_pipeline():
    clf = models.DefaultClassifier(
        GradientBoostingClassifier(
            loss='deviance', learning_rate=0.01, n_estimators=3000,
            subsample=0.6, min_samples_split=12, min_samples_leaf=12,
            max_depth=6, random_state=1357, verbose=0)
    )
    steps = [('features', models.FeatureSelector()),
             ('Impute', Imputer(strategy='median')),
             ('scaler', StandardScaler()),
             ('clf', clf)]
    return Pipeline(steps)
Example #19
Source File: data_utils.py From Benchmarks with MIT License
def impute_and_scale_array(mat, scaling=None):
    """Impute missing values with mean and scale data included in numpy array.

    Parameters
    ----------
    mat : numpy array
        Array to scale
    scaling : string
        String describing type of scaling to apply.
        Options recognized: 'maxabs', 'minmax', 'std'.
        'maxabs' : scales data to range [-1 to 1].
        'minmax' : scales data to range [0 to 1].
        'std'    : scales data to normal variable with mean 0 and standard deviation 1.
        (Default: None, no scaling).

    Return
    ----------
    Returns the numpy array imputed with the mean value of the
    column and scaled by the method specified. If no scaling method is
    specified, it returns the imputed numpy array.
    """
    # imputer = Imputer(strategy='mean', axis=0, copy=False)
    # imputer = SimpleImputer(strategy='mean', copy=False)
    # Next line is from conditional import. axis=0 is default
    # in old version so it is not necessary.
    imputer = Imputer(strategy='mean', copy=False)
    imputer.fit_transform(mat)
    return scale_array(mat, scaling)
Example #20
Source File: uno_data.py From Benchmarks with MIT License
def impute_and_scale(df, scaling='std', imputing='mean', dropna='all'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    if dropna:
        df = df.dropna(axis=1, how=dropna)
    else:
        empty_cols = df.columns[df.notnull().sum() == 0]
        df[empty_cols] = 0

    if imputing is None or imputing.lower() == 'none':
        mat = df.values
    else:
        imputer = Imputer(strategy=imputing)
        mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
Example #21
Source File: NCI60.py From Benchmarks with MIT License
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean')
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
Example #22
Source File: p1b3.py From Benchmarks with MIT License
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')

    #imputer = Imputer(strategy='mean', axis=0)
    imputer = Imputer(strategy='mean')
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
Example #23
Source File: test_imputer.py From coremltools with BSD 3-Clause "New" or "Revised" License
def test_conversion_boston(self):
    from sklearn.datasets import load_boston

    scikit_data = load_boston()

    sh = scikit_data.data.shape

    rn.seed(0)
    missing_value_indices = [
        (rn.randint(sh[0]), rn.randint(sh[1])) for k in range(sh[0])
    ]

    for strategy in ["mean", "median", "most_frequent"]:
        for missing_value in [0, "NaN", -999]:
            X = np.array(scikit_data.data).copy()
            for i, j in missing_value_indices:
                X[i, j] = missing_value

            model = Imputer(missing_values=missing_value, strategy=strategy)
            model = model.fit(X)

            tr_X = model.transform(X.copy())

            spec = converter.convert(model, scikit_data.feature_names, "out")

            input_data = [dict(zip(scikit_data.feature_names, row)) for row in X]

            output_data = [{"out": row} for row in tr_X]

            result = evaluate_transformer(spec, input_data, output_data)

            assert result["num_errors"] == 0
Example #24
Source File: test_preprocessing.py From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_transform_series_int(self):
    arr = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
    s = pdml.ModelSeries(arr, index='a b c d e f g h i'.split(' '))

    # reshape arr to 2d
    arr = arr.reshape(-1, 1)

    if pd.compat.PY3:
        models = ['Binarizer', 'Imputer', 'StandardScaler']
        # MinMaxScaler raises TypeError in ufunc
    else:
        models = ['Binarizer', 'Imputer', 'StandardScaler', 'MinMaxScaler']
    for model in models:
        mod1 = getattr(s.preprocessing, model)()
        mod2 = getattr(pp, model)()

        s.fit(mod1)
        mod2.fit(arr)

        result = s.transform(mod1)
        expected = mod2.transform(arr).flatten()

        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected)

        mod1 = getattr(s.preprocessing, model)()
        mod2 = getattr(pp, model)()

        result = s.fit_transform(mod1)
        expected = mod2.fit_transform(arr).flatten()

        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected)
Example #25
Source File: pipelinecomponents.py From sia-cog with MIT License
def data_handlemissing(dataframe, pipeline):
    try:
        if pipeline['options']['type'] == "dropcolumns":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=1, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=1, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=1, thresh=thresh, inplace=True)
        elif pipeline['options']['type'] == "droprows":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=0, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=0, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=0, thresh=thresh, inplace=True)
        elif pipeline['options']['type'] == "fillmissing":
            strategy = pipeline['options']['strategy']
            imp = Imputer(missing_values='NaN', strategy=strategy, axis=0)
            array = imp.fit_transform(dataframe.values)
            dataframe = pandas.DataFrame(array, columns=dataframe.columns)

        return dataframe
    except Exception as e:
        raise Exception("data_handlemissing: " + str(e))
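Round-tripping the whole frame through Imputer and a fresh DataFrame, as the fillmissing branch does, discards the original index and dtypes. For purely numeric columns, an index-preserving alternative is pandas' own fillna; a sketch (equivalent to strategy='mean' only, and an assumption rather than what the project ships):

import pandas

def fill_missing_mean(dataframe):
    # Column-wise mean imputation; keeps the index and column dtypes intact.
    return dataframe.fillna(dataframe.mean())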
Example #26
Source File: test_categorical_imputer.py From coremltools with BSD 3-Clause "New" or "Revised" License
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = Imputer()
        spec = converter.convert(model, "data", "out")

    # Check the expected class during conversion.
    with self.assertRaises(Exception):
        from sklearn.linear_model import LinearRegression

        model = LinearRegression()
        spec = converter.convert(model, "data", "out")
Example #27
Source File: Numerical.py From keras-pandas with MIT License
def __init__(self):
    self.supports_output = True
    self.default_transformation_pipeline = [Imputer(strategy='mean'),
                                            StandardScaler()]
Example #28
Source File: GC_script.py From ClimateVegetationDynamics_GrangerCausality with GNU General Public License v3.0
def createAuto(target):
    # window size: how many previous values of the target we take
    # (here 12, because the range goes from 1-12 without the 13)
    win = 13
    dataAuto = np.empty((len(target), win - 1))
    for i in range(1, win):
        dataAuto[:, i - 1] = shift2(target, i)
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    transformedDataAuto = imp.fit_transform(dataAuto)
    X_auto = transformedDataAuto
    return X_auto

#parameters: 'X' the predictors, 'y' the target, 'cvFolds' number of folds, 'estimator' machine learning algorithm
#returns: the R squared for each fold
Example #29
Source File: custom_transformers.py From pandas-pipelines-custom-transformers with MIT License
def fit(self, X, y=None):
    self.imp = Imputer(strategy=self.strategy)
    self.imp.fit(X)
    self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
    return self
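Only fit is shown in this excerpt; the matching transform is not. A hypothetical companion sketch, rebuilding the DataFrame so the column labels survive the round-trip through the wrapped Imputer (it assumes no column is all-NaN, since the legacy Imputer drops such columns on transform):

def transform(self, X, y=None):
    # Hypothetical; not part of the excerpt above.
    return pd.DataFrame(self.imp.transform(X),
                        columns=X.columns,
                        index=X.index)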
Example #30
Source File: titanic.py From ml-on-gcp with Apache License 2.0
def train_model(titanic_data_path, model_output_path):
    print('Loading the data...')
    try:
        with tf.gfile.Open(titanic_data_path, 'r') as data_file:
            train_df = pd.read_csv(data_file)
        print('Number of samples: {}'.format(train_df.shape[0]))

        target_name = 'Survived'
        feature_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']

        print('Preparing the features...')
        train_features = train_df[feature_names].copy()
        train_features['Age'] = Imputer().fit_transform(
            train_features['Age'].values.reshape(-1, 1))
        embarked = train_features['Embarked']
        train_features['Embarked'] = embarked.fillna(embarked.mode()[0])
        train_features = pd.get_dummies(train_features)
        train_target = train_df[target_name]

        print('Training the model...')
        parameters = {'max_depth': [2, 3, 4, 5, 6, 7],
                      'n_estimators': [50, 100, 150, 200]}
        gsc = GridSearchCV(GradientBoostingClassifier(), parameters,
                           n_jobs=-1, cv=5)
        gsc.fit(train_features, train_target)

        print('Best Hyper Parameters: {}'.format(gsc.best_params_))
        print('Accuracy: {}'.format(gsc.best_score_))

        with tf.gfile.Open(model_output_path, 'wb') as model_file:
            joblib.dump(gsc.best_estimator_, model_file, protocol=1)
    except Exception as e:
        print('Error: {}'.format(e))