Python pyspark.ml.feature.StringIndexer() Examples
The following are 10 code examples of pyspark.ml.feature.StringIndexer(). You can go to the original project or source file via the attribution line above each example. You may also want to check out all available functions/classes of the module pyspark.ml.feature.
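Before the project examples below, here is a minimal, self-contained sketch of the basic StringIndexer workflow: fit() learns a string-to-index mapping from a column, and transform() appends the numeric index column. The session setup and toy data are illustrative assumptions, not taken from any of the projects below.

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer

# illustrative local session; any existing SparkSession works as well
spark = SparkSession.builder.appName("StringIndexerSketch").getOrCreate()

df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])

# fit() learns the mapping; by default labels are ordered by descending
# frequency, so the most frequent label ("a") maps to index 0.0
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)

# transform() appends the numeric index column
model.transform(df).show()
print(model.labels)  # ['a', 'c', 'b']

spark.stop()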
Example #1
Source File: test_index_to_string.py From onnxmltools with MIT License
def test_index_to_string(self):
    original_data = self.spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])
    string_indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    string_indexer_model = string_indexer.fit(original_data)
    data = string_indexer_model.transform(original_data)

    model = IndexToString(inputCol="categoryIndex", outputCol="originalCategory",
                          labels=['A', 'B', 'C'])
    # the input name should match IndexToString's inputCol
    model_onnx = convert_sparkml(model, 'Sparkml IndexToString',
                                 [('categoryIndex', Int64TensorType([1, 1]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.select("originalCategory").toPandas().values
    data_np = data.select('categoryIndex').toPandas().values.astype(numpy.int64)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlIndexToString")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['originalCategory'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #2
Source File: test_pipeline.py From onnxmltools with MIT License
def test_model_pipeline_3_stage(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select(*cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
        # we need the dropLast option, otherwise when assembled together (below)
        # we won't be able to expand the features without difficulty
        stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'],
                                             outputCols=[col+'_vec'], dropLast=False))
    stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))

    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    expected = predicted.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_3Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #3
Source File: sc_classification.py From atap with Apache License 2.0
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction",
        metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    gbtModel = model.stages[2]
    print(gbtModel)  # summary only
Example #4
Source File: random_forest.py From gnomad_methods with MIT License
def get_labels(rf_pipeline: pyspark.ml.PipelineModel) -> List[str]:
    """
    Returns the labels from the StringIndexer stage at index 0 of an RF pipeline model

    :param rf_pipeline: Input pipeline
    :return: labels
    """
    return rf_pipeline.stages[0].labels
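As a usage note for the helper above, a hedged sketch follows. Any PipelineModel whose first stage is a fitted StringIndexerModel would work; the pipeline and toy data constructed here are illustrative assumptions, not how gnomad_methods actually builds its random forest pipeline.

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()

# toy training data, assumed purely for illustration
training_df = spark.createDataFrame(
    [("yes", Vectors.dense([1.0, 0.0])),
     ("no", Vectors.dense([0.0, 1.0])),
     ("yes", Vectors.dense([1.0, 1.0]))],
    ["label", "features"])

# the StringIndexer must be the first stage, matching get_labels' assumption
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=5)
rf_pipeline = Pipeline(stages=[indexer, rf]).fit(training_df)

print(get_labels(rf_pipeline))  # e.g. ['yes', 'no'] — most frequent label first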
Example #5
Source File: test_index_to_string.py From onnxmltools with MIT License
def test_index_to_string_throws(self):
    original_data = self.spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])
    string_indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    string_indexer_model = string_indexer.fit(original_data)
    data = string_indexer_model.transform(original_data)

    model = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    # the input name should match IndexToString's inputCol
    model_onnx = None
    with pytest.raises(SparkMlConversionError):
        model_onnx = convert_sparkml(model, 'Sparkml IndexToString',
                                     [('categoryIndex', Int64TensorType([1, 1]))])
Example #6
Source File: test_random_forest_regressor.py From onnxmltools with MIT License
def test_random_forrest_regression(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    #
    # truncate the features
    #
    feature_count = 5
    self.spark.udf.register(
        "truncateFeatures",
        lambda x: SparseVector(feature_count, range(0, feature_count), x.toArray()[125:130]),
        VectorUDT())
    data = original_data.selectExpr(
        "cast(label as string) as label",
        "truncateFeatures(features) as features")

    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                    maxCategories=10, handleInvalid='error')
    rf = RandomForestRegressor(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(model, 'Sparkml RandomForest Regressor', [
        ('label', StringTensorType([1, 1])),
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data.limit(1))
    data_np = {
        'label': data.limit(1).toPandas().label.values,
        'features': data.limit(1).toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    }
    expected = [
        predicted.toPandas().indexedLabel.values.astype(numpy.int64),
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlRandomForestRegressor")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #7
Source File: test_gbt_classifier.py From onnxmltools with MIT License
def test_gbt_classifier(self):
    raw_data = self.spark.createDataFrame([
        (1.0, Vectors.dense(1.0)),
        (0.0, Vectors.sparse(1, [], []))
    ], ["label", "features"])
    string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = string_indexer.fit(raw_data)
    data = si_model.transform(raw_data)

    gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
    model = gbt.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(model, 'Sparkml GBT Classifier', [
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlGBTClassifier")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #8
Source File: test_random_forest_classifier.py From onnxmltools with MIT License
def test_random_forrest_classification(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    #
    # truncate the features
    #
    feature_count = 5
    self.spark.udf.register(
        "truncateFeatures",
        lambda x: SparseVector(feature_count, range(0, feature_count), x.toArray()[125:130]),
        VectorUDT())
    data = original_data.selectExpr(
        "cast(label as string) as label",
        "truncateFeatures(features) as features")

    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                    maxCategories=10, handleInvalid='keep')
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(model, 'Sparkml RandomForest Classifier', [
        ('label', StringTensorType([1, 1])),
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    data_np = {
        'label': data.toPandas().label.values,
        'features': data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    }
    expected = [
        predicted.toPandas().indexedLabel.values.astype(numpy.int64),
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlRandomForestClassifier")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexedLabel', 'prediction', 'probability'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #9
Source File: test_pipeline.py From onnxmltools with MIT License
def test_model_pipeline_4_stage(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select('income', *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
        stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'],
                                             outputCols=[col+'_vec'], dropLast=False))
    stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))
    stages.append(StringIndexer(inputCol='income', outputCol='label', handleInvalid='skip'))
    stages.append(LogisticRegression(maxIter=100, tol=0.0001))

    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('income', StringTensorType([1, 1])),
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'income': test_data.select('income').toPandas().values,
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    expected = [
        predicted.toPandas().label.values.astype(numpy.float32),
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_4Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['label', 'prediction', 'probability'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #10
Source File: test_pipeline.py From onnxmltools with MIT License
def test_model_pipeline_2_stage(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select(*cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
        stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec']))

    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    predicted_np = [
        predicted.toPandas().workclass_vec.apply(lambda x: pandas.Series(x.toArray())).values,
        predicted.toPandas().education_vec.apply(lambda x: pandas.Series(x.toArray())).values,
        predicted.toPandas().marital_status_vec.apply(lambda x: pandas.Series(x.toArray())).values
    ]
    expected = [numpy.asarray([expand_one_hot_vec(x) for x in row]) for row in predicted_np]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_2Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['workclass_vec', 'education_vec', 'marital_status_vec'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)