Python pyspark.ml.feature.VectorAssembler() Examples
The following are 11 code examples of pyspark.ml.feature.VectorAssembler(). The originating project, source file, and license are noted above each example.
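Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all build on: VectorAssembler merges several numeric columns into the single vector column that pyspark.ml estimators expect. The session setup, column names, and data below are illustrative, not taken from any of the projects listed.

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.appName("vector-assembler-demo").getOrCreate()

# Toy dataframe with two numeric feature columns and a label (illustrative names).
df = spark.createDataFrame(
    [(1.0, 0.5, 0), (2.0, 1.5, 1), (3.0, 2.5, 0)],
    ["x1", "x2", "label"],
)

# Merge the numeric columns into the single 'features' vector column.
assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
assembled = assembler.transform(df)
assembled.select("features", "label").show(truncate=False)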
Example #1
Source File: feature_selection.py From search-MjoLniR with MIT License
def select_features(
    wiki: str,
    num_features: int,
    metadata: Dict
) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        # Compute the "best" features, per some metric
        sc = df.sql_ctx.sparkSession.sparkContext
        features = metadata['input_feature_meta']['features']
        selected = mjolnir.feature_engineering.select_features(
            sc, df, features, num_features, algo='mrmr')
        metadata['wiki_features'][wiki] = selected

        # Rebuild the `features` col with only the selected features
        keep_cols = metadata['default_cols'] + selected
        df_selected = df.select(*keep_cols)
        assembler = VectorAssembler(
            inputCols=selected, outputCol='features')
        return assembler.transform(df_selected).drop(*selected)
    return transform
Example #2
Source File: test_spark_model_export.py From mlflow with Apache License 2.0
def spark_model_estimator(iris_df, spark_context):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    features_df = assembler.transform(iris_spark_df)
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    # Fit the model
    model = lr.fit(features_df)
    preds_df = model.transform(features_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model,
                              spark_df=features_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds)
Example #3
Source File: random_forest.py From gnomad_methods with MIT License
def get_features_importance(
    rf_pipeline: pyspark.ml.PipelineModel, rf_index: int = -2, assembler_index: int = -3
) -> Dict[str, float]:
    """
    Extract the features importance from a Pipeline model containing a
    RandomForestClassifier stage.

    :param rf_pipeline: Input pipeline
    :param rf_index: index of the RandomForestClassifier stage
    :param assembler_index: index of the VectorAssembler stage
    :return: feature importance for each feature in the RF model
    """
    feature_names = [
        x[: -len("_indexed")] if x.endswith("_indexed") else x
        for x in rf_pipeline.stages[assembler_index].getInputCols()
    ]
    return dict(zip(feature_names, rf_pipeline.stages[rf_index].featureImportances))
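As a rough usage sketch (not part of the gnomad_methods source): the default rf_index=-2 / assembler_index=-3 assume the forest is the second-to-last pipeline stage, so for a minimal two-stage pipeline the indices must be passed explicitly. The pipeline, column names, and train_df below are hypothetical.

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

# train_df is a hypothetical DataFrame with 'qd_indexed', 'length_indexed' and 'label' columns.
assembler = VectorAssembler(inputCols=["qd_indexed", "length_indexed"], outputCol="features")
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
rf_pipeline = Pipeline(stages=[assembler, rf]).fit(train_df)

# With only two stages, the assembler is stages[-2] and the forest is stages[-1].
importances = get_features_importance(rf_pipeline, rf_index=-1, assembler_index=-2)
# -> {'qd': ..., 'length': ...} since the '_indexed' suffix is stripped from the names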
Example #4
Source File: ops_names.py From onnxmltools with MIT License
def build_sparkml_operator_name_map():
    res = {k: "pyspark.ml.feature." + k.__name__ for k in [
        Binarizer, BucketedRandomProjectionLSHModel, Bucketizer, ChiSqSelectorModel,
        CountVectorizerModel, DCT, ElementwiseProduct, HashingTF, IDFModel, ImputerModel,
        IndexToString, MaxAbsScalerModel, MinHashLSHModel, MinMaxScalerModel, NGram,
        Normalizer, OneHotEncoderModel, PCAModel, PolynomialExpansion, QuantileDiscretizer,
        RegexTokenizer, StandardScalerModel, StopWordsRemover, StringIndexerModel,
        Tokenizer, VectorAssembler, VectorIndexerModel, VectorSlicer, Word2VecModel
    ]}
    res.update({k: "pyspark.ml.classification." + k.__name__ for k in [
        LinearSVCModel, LogisticRegressionModel, DecisionTreeClassificationModel,
        GBTClassificationModel, RandomForestClassificationModel, NaiveBayesModel,
        MultilayerPerceptronClassificationModel, OneVsRestModel
    ]})
    res.update({k: "pyspark.ml.regression." + k.__name__ for k in [
        AFTSurvivalRegressionModel, DecisionTreeRegressionModel, GBTRegressionModel,
        GeneralizedLinearRegressionModel, IsotonicRegressionModel, LinearRegressionModel,
        RandomForestRegressionModel
    ]})
    return res
Example #5
Source File: test_pipeline.py From onnxmltools with MIT License
def test_model_pipeline_3_stage(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select(*cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
        # we need the dropLast option otherwise when assembled together (below)
        # we won't be able to expand the features without difficulties
        stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))
    stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))

    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    expected = predicted.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values
    paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlPipeline_3Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
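A brief aside on the dropLast comment in the test above, using a made-up three-category column (spark is an existing SparkSession; the class is named OneHotEncoder from Spark 3.0 on and OneHotEncoderEstimator in the 2.x versions this test targets): with the default dropLast=True one category becomes the all-zeros vector, so expanding the assembled 'features' vector back into per-category columns loses a slot; dropLast=False keeps one slot per category.

from pyspark.ml.feature import StringIndexer, OneHotEncoder

df = spark.createDataFrame([("a",), ("b",), ("c",)], ["cat"])
indexed = StringIndexer(inputCol="cat", outputCol="cat_index").fit(df).transform(df)

# Default dropLast=True: three categories map to a 2-slot vector.
OneHotEncoder(inputCols=["cat_index"], outputCols=["cat_vec"]).fit(indexed) \
    .transform(indexed).show(truncate=False)

# dropLast=False: every category keeps its own slot, which is what the
# pipeline above relies on when it assembles the *_vec columns together.
OneHotEncoder(inputCols=["cat_index"], outputCols=["cat_vec"], dropLast=False).fit(indexed) \
    .transform(indexed).show(truncate=False)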
Example #6
Source File: test_vector_assembler.py From onnxmltools with MIT License
def test_model_vector_assembler(self):
    col_names = ["a", "b", "c"]
    model = VectorAssembler(inputCols=col_names, outputCol='features')
    data = self.spark.createDataFrame([(1., 0., 3.)], col_names)
    model_onnx = convert_sparkml(model, 'Sparkml VectorAssembler', [
        ('a', FloatTensorType([1, 1])),
        ('b', FloatTensorType([1, 1])),
        ('c', FloatTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.select("features").toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values
    data_np = {
        'a': data.select('a').toPandas().values.astype(numpy.float32),
        'b': data.select('b').toPandas().values.astype(numpy.float32),
        'c': data.select('c').toPandas().values.astype(numpy.float32)
    }
    paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlVectorAssembler")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #7
Source File: feature_engineering.py From search-MjoLniR with MIT License
def _bucketize(df, input_cols):
    def j_str_arr(arr):
        gateway = SparkContext._gateway
        j_str = gateway.jvm.java.lang.String
        j_arr = gateway.new_array(j_str, len(arr))
        for i, val in enumerate(arr):
            j_arr[i] = val
        return j_arr

    output_cols = ['{}-bucketed'.format(x) for x in input_cols]
    # Sadly the multi-col versions are only in scala, pyspark doesn't
    # have them yet.
    j_bucketizer = (
        JavaParams._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer")
        .setInputCols(j_str_arr(input_cols))
        .setOutputCols(j_str_arr(output_cols))
        .setNumBuckets(254)
        .setRelativeError(1/2550)
        .setHandleInvalid('error')
        .fit(df._jdf))
    j_df_bucketized = j_bucketizer.transform(df._jdf)
    df_bucketized = DataFrame(j_df_bucketized, df.sql_ctx).drop(*input_cols)
    # Now we need to assemble the bucketized values into vector
    # form for the feature selector to work with.
    assembler = VectorAssembler(
        inputCols=output_cols, outputCol='features')
    return assembler.transform(df_bucketized).drop(*output_cols)
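The Py4J gateway code above works around the multi-column QuantileDiscretizer setters being Scala-only when this was written. Assuming Spark 3.0 or later, where the Python API also exposes inputCols/outputCols, the same bucketize-and-assemble step could be sketched in plain PySpark roughly as follows (same parameters as above; the helper name is made up):

from pyspark.ml.feature import QuantileDiscretizer, VectorAssembler

def bucketize_and_assemble(df, input_cols):
    output_cols = ['{}-bucketed'.format(c) for c in input_cols]
    # Multi-column mode: one discretizer fits all input columns at once.
    discretizer = QuantileDiscretizer(
        inputCols=input_cols,
        outputCols=output_cols,
        numBuckets=254,
        relativeError=1 / 2550,
        handleInvalid='error')
    bucketized = discretizer.fit(df).transform(df).drop(*input_cols)
    # Assemble the bucketized values into a single vector column.
    assembler = VectorAssembler(inputCols=output_cols, outputCol='features')
    return assembler.transform(bucketized).drop(*output_cols)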
Example #8
Source File: ml.py From koalas with Apache License 2.0
def to_numeric_df(kdf: "ks.DataFrame") -> Tuple[pyspark.sql.DataFrame, List[Tuple[str, ...]]]:
    """
    Takes a dataframe and turns it into a dataframe containing a single numerical
    vector of doubles. This dataframe has a single field called '_1'.

    TODO: index is not preserved currently
    :param kdf: the Koalas dataframe.
    :return: a pair of dataframe, list of strings (the name of the columns
             that were converted to numerical types)

    >>> to_numeric_df(ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}))
    (DataFrame[__correlation_output__: vector], [('A',), ('B',)])
    """
    # TODO, it should be more robust.
    accepted_types = {
        np.dtype(dt)
        for dt in [np.int8, np.int16, np.int32, np.int64, np.float32, np.float64, np.bool_]
    }
    numeric_column_labels = [
        label for label in kdf._internal.column_labels if kdf[label].dtype in accepted_types
    ]
    numeric_df = kdf._internal.spark_frame.select(
        *[kdf._internal.spark_column_for(idx) for idx in numeric_column_labels]
    )
    va = VectorAssembler(inputCols=numeric_df.columns, outputCol=CORRELATION_OUTPUT_COLUMN)
    v = va.transform(numeric_df).select(CORRELATION_OUTPUT_COLUMN)
    return v, numeric_column_labels
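The vector column built here is what Koalas hands to Spark's correlation routine. As a standalone illustration of that downstream step (not the Koalas internals; spark, the data, and the column names below are assumed), pyspark.ml.stat.Correlation consumes exactly this kind of assembled column:

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

# Plain Spark DataFrame with two numeric columns (illustrative data).
sdf = spark.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 2.0)], ["A", "B"])

va = VectorAssembler(inputCols=["A", "B"], outputCol="__correlation_output__")
vec_df = va.transform(sdf).select("__correlation_output__")

# Correlation.corr returns a one-row DataFrame whose single cell is the matrix.
corr_matrix = Correlation.corr(vec_df, "__correlation_output__", "pearson").head()[0]
print(corr_matrix.toArray())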
Example #9
Source File: test_spark_model_export.py From mlflow with Apache License 2.0
def spark_model_iris(iris_df):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(iris_spark_df)
    preds_df = model.transform(iris_spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model,
                              spark_df=iris_spark_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds)
Example #10
Source File: test_spark_model_export.py From mlflow with Apache License 2.0
def spark_model_transformer(iris_df):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    # Apply the transformer (no fitting needed for VectorAssembler)
    preds_df = assembler.transform(iris_spark_df)
    preds = [x.features for x in preds_df.select("features").collect()]
    return SparkModelWithData(model=assembler,
                              spark_df=iris_spark_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds)
Example #11
Source File: test_pipeline.py From onnxmltools with MIT License
def test_model_pipeline_4_stage(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select('income', *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
        stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))
    stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))
    stages.append(StringIndexer(inputCol='income', outputCol='label', handleInvalid='skip'))
    stages.append(LogisticRegression(maxIter=100, tol=0.0001))

    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('income', StringTensorType([1, 1])),
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'income': test_data.select('income').toPandas().values,
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    expected = [
        predicted.toPandas().label.values.astype(numpy.float32),
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlPipeline_4Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['label', 'prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
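A final portability note on the two onnxmltools pipeline tests: OneHotEncoderEstimator is the Spark 2.3/2.4 class name; Spark 3.0 renamed it to OneHotEncoder with the same multi-column parameters. On a 3.x cluster the encoder stage would, as far as I know, look roughly like this:

from pyspark.ml.feature import OneHotEncoder  # replaces OneHotEncoderEstimator in Spark 3.0+

stages.append(OneHotEncoder(inputCols=[col + '_index'],
                            outputCols=[col + '_vec'],
                            dropLast=False))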