Python pyspark.ml.feature.VectorAssembler() Examples
The following are 11 code examples of pyspark.ml.feature.VectorAssembler(). The originating project, source file, and license are noted above each example.
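Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all build on: VectorAssembler merges several numeric columns into the single vector column that pyspark.ml estimators expect. The session setup, column names, and data below are illustrative, not taken from any of the projects listed.

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.appName("vector-assembler-demo").getOrCreate()

# Toy dataframe with two numeric feature columns and a label (illustrative names).
df = spark.createDataFrame(
    [(1.0, 0.5, 0), (2.0, 1.5, 1), (3.0, 2.5, 0)],
    ["x1", "x2", "label"],
)

# Merge the numeric columns into the single 'features' vector column.
assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
assembled = assembler.transform(df)
assembled.select("features", "label").show(truncate=False)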
Example #1
Source File: feature_selection.py From search-MjoLniR with MIT License
def select_features(
    wiki: str,
    num_features: int,
    metadata: Dict
) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        # Compute the "best" features, per some metric
        sc = df.sql_ctx.sparkSession.sparkContext
        features = metadata['input_feature_meta']['features']
        selected = mjolnir.feature_engineering.select_features(
            sc, df, features, num_features, algo='mrmr')
        metadata['wiki_features'][wiki] = selected

        # Rebuild the `features` col with only the selected features
        keep_cols = metadata['default_cols'] + selected
        df_selected = df.select(*keep_cols)
        assembler = VectorAssembler(
            inputCols=selected, outputCol='features')
        return assembler.transform(df_selected).drop(*selected)
    return transform
Example #2
Source File: test_spark_model_export.py From mlflow with Apache License 2.0
def spark_model_estimator(iris_df, spark_context):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    features_df = assembler.transform(iris_spark_df)
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    # Fit the model
    model = lr.fit(features_df)
    preds_df = model.transform(features_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model,
                              spark_df=features_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds)
Example #3
Source File: random_forest.py From gnomad_methods with MIT License
def get_features_importance(
    rf_pipeline: pyspark.ml.PipelineModel, rf_index: int = -2, assembler_index: int = -3
) -> Dict[str, float]:
    """
    Extract the features importance from a Pipeline model containing a
    RandomForestClassifier stage.

    :param rf_pipeline: Input pipeline
    :param rf_index: index of the RandomForestClassifier stage
    :param assembler_index: index of the VectorAssembler stage
    :return: feature importance for each feature in the RF model
    """
    feature_names = [
        x[: -len("_indexed")] if x.endswith("_indexed") else x
        for x in rf_pipeline.stages[assembler_index].getInputCols()
    ]
    return dict(zip(feature_names, rf_pipeline.stages[rf_index].featureImportances))
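As a rough usage sketch (not part of the gnomad_methods source): the default rf_index=-2 / assembler_index=-3 assume the forest is the second-to-last pipeline stage, so for a minimal two-stage pipeline the indices must be passed explicitly. The pipeline, column names, and train_df below are hypothetical.

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

# train_df is a hypothetical DataFrame with 'qd_indexed', 'length_indexed' and 'label' columns.
assembler = VectorAssembler(inputCols=["qd_indexed", "length_indexed"], outputCol="features")
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
rf_pipeline = Pipeline(stages=[assembler, rf]).fit(train_df)

# With only two stages, the assembler is stages[-2] and the forest is stages[-1].
importances = get_features_importance(rf_pipeline, rf_index=-1, assembler_index=-2)
# -> {'qd': ..., 'length': ...} since the '_indexed' suffix is stripped from the names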
Example #4
Source File: ops_names.py From onnxmltools with MIT License
def build_sparkml_operator_name_map():
    res = {k: "pyspark.ml.feature." + k.__name__ for k in [
        Binarizer, BucketedRandomProjectionLSHModel, Bucketizer, ChiSqSelectorModel,
        CountVectorizerModel, DCT, ElementwiseProduct, HashingTF, IDFModel, ImputerModel,
        IndexToString, MaxAbsScalerModel, MinHashLSHModel, MinMaxScalerModel, NGram,
        Normalizer, OneHotEncoderModel, PCAModel, PolynomialExpansion, QuantileDiscretizer,
        RegexTokenizer, StandardScalerModel, StopWordsRemover, StringIndexerModel,
        Tokenizer, VectorAssembler, VectorIndexerModel, VectorSlicer, Word2VecModel
    ]}
    res.update({k: "pyspark.ml.classification." + k.__name__ for k in [
        LinearSVCModel, LogisticRegressionModel, DecisionTreeClassificationModel,
        GBTClassificationModel, RandomForestClassificationModel, NaiveBayesModel,
        MultilayerPerceptronClassificationModel, OneVsRestModel
    ]})
    res.update({k: "pyspark.ml.regression." + k.__name__ for k in [
        AFTSurvivalRegressionModel, DecisionTreeRegressionModel, GBTRegressionModel,
        GeneralizedLinearRegressionModel, IsotonicRegressionModel, LinearRegressionModel,
        RandomForestRegressionModel
    ]})
    return res
Example #5
Source File: test_pipeline.py From onnxmltools with MIT License
def test_model_pipeline_3_stage(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select(*cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
        # we need the dropLast option otherwise when assembled together (below)
        # we won't be able to expand the features without difficulties
        stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))
    stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))

    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    expected = predicted.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values
    paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlPipeline_3Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
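A brief aside on the dropLast comment in the test above, using a made-up three-category column (spark is an existing SparkSession; the class is named OneHotEncoder from Spark 3.0 on and OneHotEncoderEstimator in the 2.x versions this test targets): with the default dropLast=True one category becomes the all-zeros vector, so expanding the assembled 'features' vector back into per-category columns loses a slot; dropLast=False keeps one slot per category.

from pyspark.ml.feature import StringIndexer, OneHotEncoder

df = spark.createDataFrame([("a",), ("b",), ("c",)], ["cat"])
indexed = StringIndexer(inputCol="cat", outputCol="cat_index").fit(df).transform(df)

# Default dropLast=True: three categories map to a 2-slot vector.
OneHotEncoder(inputCols=["cat_index"], outputCols=["cat_vec"]).fit(indexed) \
    .transform(indexed).show(truncate=False)

# dropLast=False: every category keeps its own slot, which is what the
# pipeline above relies on when it assembles the *_vec columns together.
OneHotEncoder(inputCols=["cat_index"], outputCols=["cat_vec"], dropLast=False).fit(indexed) \
    .transform(indexed).show(truncate=False)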
Example #6
Source File: test_vector_assembler.py From onnxmltools with MIT License
def test_model_vector_assembler(self):
    col_names = ["a", "b", "c"]
    model = VectorAssembler(inputCols=col_names, outputCol='features')
    data = self.spark.createDataFrame([(1., 0., 3.)], col_names)
    model_onnx = convert_sparkml(model, 'Sparkml VectorAssembler', [
        ('a', FloatTensorType([1, 1])),
        ('b', FloatTensorType([1, 1])),
        ('c', FloatTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.select("features").toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values
    data_np = {
        'a': data.select('a').toPandas().values.astype(numpy.float32),
        'b': data.select('b').toPandas().values.astype(numpy.float32),
        'c': data.select('c').toPandas().values.astype(numpy.float32)
    }
    paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlVectorAssembler")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #7
Source File: feature_engineering.py From search-MjoLniR with MIT License
def _bucketize(df, input_cols):
    def j_str_arr(arr):
        gateway = SparkContext._gateway
        j_str = gateway.jvm.java.lang.String
        j_arr = gateway.new_array(j_str, len(arr))
        for i, val in enumerate(arr):
            j_arr[i] = val
        return j_arr

    output_cols = ['{}-bucketed'.format(x) for x in input_cols]
    # Sadly the multi-col versions are only in scala, pyspark doesn't
    # have them yet.
    j_bucketizer = (
        JavaParams._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer")
        .setInputCols(j_str_arr(input_cols))
        .setOutputCols(j_str_arr(output_cols))
        .setNumBuckets(254)
        .setRelativeError(1/2550)
        .setHandleInvalid('error')
        .fit(df._jdf))
    j_df_bucketized = j_bucketizer.transform(df._jdf)
    df_bucketized = DataFrame(j_df_bucketized, df.sql_ctx).drop(*input_cols)
    # Now we need to assemble the bucketized values into vector
    # form for the feature selector to work with.
    assembler = VectorAssembler(
        inputCols=output_cols, outputCol='features')
    return assembler.transform(df_bucketized).drop(*output_cols)
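The Py4J gateway code above works around the multi-column QuantileDiscretizer setters being Scala-only when this was written. Assuming Spark 3.0 or later, where the Python API also exposes inputCols/outputCols, the same bucketize-and-assemble step could be sketched in plain PySpark roughly as follows (same parameters as above; the helper name is made up):

from pyspark.ml.feature import QuantileDiscretizer, VectorAssembler

def bucketize_and_assemble(df, input_cols):
    output_cols = ['{}-bucketed'.format(c) for c in input_cols]
    # Multi-column mode: one discretizer fits all input columns at once.
    discretizer = QuantileDiscretizer(
        inputCols=input_cols,
        outputCols=output_cols,
        numBuckets=254,
        relativeError=1 / 2550,
        handleInvalid='error')
    bucketized = discretizer.fit(df).transform(df).drop(*input_cols)
    # Assemble the bucketized values into a single vector column.
    assembler = VectorAssembler(inputCols=output_cols, outputCol='features')
    return assembler.transform(bucketized).drop(*output_cols)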
Example #8
Source File: ml.py From koalas with Apache License 2.0
def to_numeric_df(kdf: "ks.DataFrame") -> Tuple[pyspark.sql.DataFrame, List[Tuple[str, ...]]]:
    """
    Takes a dataframe and turns it into a dataframe containing a single numerical
    vector of doubles. This dataframe has a single field called '_1'.

    TODO: index is not preserved currently
    :param kdf: the Koalas dataframe.
    :return: a pair of dataframe, list of strings (the name of the columns
             that were converted to numerical types)

    >>> to_numeric_df(ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}))
    (DataFrame[__correlation_output__: vector], [('A',), ('B',)])
    """
    # TODO, it should be more robust.
    accepted_types = {
        np.dtype(dt)
        for dt in [np.int8, np.int16, np.int32, np.int64, np.float32, np.float64, np.bool_]
    }
    numeric_column_labels = [
        label for label in kdf._internal.column_labels if kdf[label].dtype in accepted_types
    ]
    numeric_df = kdf._internal.spark_frame.select(
        *[kdf._internal.spark_column_for(idx) for idx in numeric_column_labels]
    )
    va = VectorAssembler(inputCols=numeric_df.columns, outputCol=CORRELATION_OUTPUT_COLUMN)
    v = va.transform(numeric_df).select(CORRELATION_OUTPUT_COLUMN)
    return v, numeric_column_labels
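The vector column built here is what Koalas hands to Spark's correlation routine. As a standalone illustration of that downstream step (not the Koalas internals; spark, the data, and the column names below are assumed), pyspark.ml.stat.Correlation consumes exactly this kind of assembled column:

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

# Plain Spark DataFrame with two numeric columns (illustrative data).
sdf = spark.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 2.0)], ["A", "B"])

va = VectorAssembler(inputCols=["A", "B"], outputCol="__correlation_output__")
vec_df = va.transform(sdf).select("__correlation_output__")

# Correlation.corr returns a one-row DataFrame whose single cell is the matrix.
corr_matrix = Correlation.corr(vec_df, "__correlation_output__", "pearson").head()[0]
print(corr_matrix.toArray())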
Example #9
Source File: test_spark_model_export.py From mlflow with Apache License 2.0
def spark_model_iris(iris_df):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(iris_spark_df)
    preds_df = model.transform(iris_spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model,
                              spark_df=iris_spark_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds)
Example #10
Source File: test_spark_model_export.py From mlflow with Apache License 2.0
def spark_model_transformer(iris_df):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    # Apply the transformer (no fitting needed for VectorAssembler)
    preds_df = assembler.transform(iris_spark_df)
    preds = [x.features for x in preds_df.select("features").collect()]
    return SparkModelWithData(model=assembler,
                              spark_df=iris_spark_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds)
Example #11
Source File: test_pipeline.py From onnxmltools with MIT License
def test_model_pipeline_4_stage(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select('income', *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
        stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))
    stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))
    stages.append(StringIndexer(inputCol='income', outputCol='label', handleInvalid='skip'))
    stages.append(LogisticRegression(maxIter=100, tol=0.0001))

    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('income', StringTensorType([1, 1])),
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'income': test_data.select('income').toPandas().values,
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    expected = [
        predicted.toPandas().label.values.astype(numpy.float32),
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlPipeline_4Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['label', 'prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
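A final portability note on the two onnxmltools pipeline tests: OneHotEncoderEstimator is the Spark 2.3/2.4 class name; Spark 3.0 renamed it to OneHotEncoder with the same multi-column parameters. On a 3.x cluster the encoder stage would, as far as I know, look roughly like this:

from pyspark.ml.feature import OneHotEncoder  # replaces OneHotEncoderEstimator in Spark 3.0+

stages.append(OneHotEncoder(inputCols=[col + '_index'],
                            outputCols=[col + '_vec'],
                            dropLast=False))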