Python pyspark.ml.Pipeline() Examples
The following are 22 code examples of pyspark.ml.Pipeline().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pyspark.ml, or try the search function.
Example #1
Source File: named_image_test.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def test_featurizer_in_pipeline(self):
    """Check that DeepImageFeaturizer can act as a stage of an MLlib Pipeline.

    Only verifies that the fitted pipeline reproduces the labels of the rows
    it was trained on; how well the features generalize is not tested.
    """
    stages = [
        DeepImageFeaturizer(inputCol="image", outputCol="features",
                            modelName=self.name),
        LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3,
                           labelCol="label"),
    ]
    pipeline = Pipeline(stages=stages)

    # Attach arbitrary binary labels (derived from the image origin) so that
    # logistic regression has something to fit.
    # TODO: it's weird that the test fails on some combinations of labels. check why.
    to_label = udf(lambda origin: abs(hash(origin)) % 2, IntegerType())
    origin_col = self.imageDF["image"]["origin"]
    labeled_df = self.imageDF.withColumn("label", to_label(origin_col))

    fitted = pipeline.fit(labeled_df)

    # With 5 examples and e.g. 131k features (for InceptionV3), the model
    # ought to get at least the training examples right.
    for row in fitted.transform(labeled_df).collect():
        self.assertEqual(int(row.prediction), row.label)
Example #2
Source File: test_pipeline.py From onnxmltools with MIT License | 6 votes |
def test_model_pipeline_3_stage(self):
    """Convert an indexer -> one-hot -> assembler pipeline to ONNX and compare outputs.

    Fits the pipeline on a sample of the census CSV, converts it with
    ``convert_sparkml`` and checks that the ONNX model's ``features`` output
    matches what Spark produces on the held-out split.
    """
    script_file = inspect.getfile(inspect.currentframe())
    this_script_dir = os.path.dirname(os.path.abspath(script_file))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")

    reader = self.spark.read.format('csv').options(header='true', inferschema='true')
    full_data = reader.load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    sampled = full_data.select(*cols).limit(1000)
    training_data, test_data = sampled.randomSplit([0.9, 0.1], seed=1)

    stages = []
    for name in cols:
        indexer = StringIndexer(inputCol=name, outputCol=name + '_index',
                                handleInvalid='skip')
        # we need the dropLast option otherwise when assembled together (below)
        # we won't be able to expand the features without difficulties
        encoder = OneHotEncoderEstimator(inputCols=[name + '_index'],
                                         outputCols=[name + '_vec'],
                                         dropLast=False)
        stages.extend([indexer, encoder])
    stages.append(VectorAssembler(inputCols=[c + '_vec' for c in cols],
                                  outputCol='features'))

    model = Pipeline(stages=stages).fit(training_data)

    input_types = [(name, StringTensorType([1, 1])) for name in cols]
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', input_types)
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    data_np = {name: test_data.select(name).toPandas().values for name in cols}
    expected = predicted.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_3Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #3
Source File: random_forest.py From gnomad_methods with MIT License | 6 votes |
def save_model(
    rf_pipeline: pyspark.ml.PipelineModel, out_path: str, overwrite: bool = False
) -> None:
    """
    Saves a Random Forest pipeline model.

    :param rf_pipeline: Pipeline to save
    :param out_path: Output path
    :param overwrite: If set, will overwrite existing file(s) at output location
    :return: Nothing
    """
    # Pass the path as a logging argument so interpolation is deferred to the
    # logging framework (and skipped entirely when INFO is disabled).
    logger.info("Saving model to %s", out_path)
    if overwrite:
        rf_pipeline.write().overwrite().save(out_path)
    else:
        rf_pipeline.save(out_path)
Example #4
Source File: random_forest.py From gnomad_methods with MIT License | 6 votes |
def get_features_importance(
    rf_pipeline: pyspark.ml.PipelineModel, rf_index: int = -2, assembler_index: int = -3
) -> Dict[str, float]:
    """
    Map each assembled feature name to its importance in the random forest stage.

    Feature names are the input columns of the assembler stage, with a trailing
    ``_indexed`` suffix removed when present.

    :param rf_pipeline: Input pipeline
    :param rf_index: index of the RandomForestClassifier stage
    :param assembler_index: index of the VectorAssembler stage
    :return: feature importance for each feature in the RF model
    """
    suffix = "_indexed"
    feature_names = []
    for col_name in rf_pipeline.stages[assembler_index].getInputCols():
        if col_name.endswith(suffix):
            col_name = col_name[: -len(suffix)]
        feature_names.append(col_name)

    importances = rf_pipeline.stages[rf_index].featureImportances
    return dict(zip(feature_names, importances))
Example #5
Source File: taar_similarity.py From python_mozetl with MIT License | 6 votes |
def compute_clusters(addons_df, num_clusters, random_seed):
    """ Performs user clustering by using add-on ids as features.

    :param addons_df: DataFrame read through the "addon_ids" and "client_id"
        columns.
    :param num_clusters: value passed as ``k`` to BisectingKMeans.
    :param random_seed: seed forwarded to BisectingKMeans; ``None`` leaves the
        estimator's default seeding in place.
    :return: DataFrame with the "client_id" and "prediction" columns.
    """
    # Build the stages of the pipeline. We need hashing to make the next
    # steps work.
    hashing_stage = HashingTF(inputCol="addon_ids", outputCol="hashed_features")
    idf_stage = IDF(inputCol="hashed_features", outputCol="features", minDocFreq=1)

    # As a future improvement, we may add a sane value for the minimum cluster size
    # to BisectingKMeans (e.g. minDivisibleClusterSize). For now, just make sure
    # to pass along the random seed if needed for tests.
    # Compare against None rather than truthiness so that a seed of 0 is
    # honoured instead of being silently dropped.
    kmeans_kwargs = {"seed": random_seed} if random_seed is not None else {}
    bkmeans_stage = BisectingKMeans(k=num_clusters, **kmeans_kwargs)
    pipeline = Pipeline(stages=[hashing_stage, idf_stage, bkmeans_stage])

    # Run the pipeline and compute the results.
    model = pipeline.fit(addons_df)
    return model.transform(addons_df).select(["client_id", "prediction"])
Example #6
Source File: taar_similarity.py From telemetry-airflow with Mozilla Public License 2.0 | 6 votes |
def compute_clusters(addons_df, num_clusters, random_seed):
    """ Performs user clustering by using add-on ids as features.

    :param addons_df: DataFrame read through the "addon_ids" and "client_id"
        columns.
    :param num_clusters: value passed as ``k`` to BisectingKMeans.
    :param random_seed: seed forwarded to BisectingKMeans; ``None`` leaves the
        estimator's default seeding in place.
    :return: DataFrame with the "client_id" and "prediction" columns.
    """
    # Build the stages of the pipeline. We need hashing to make the next
    # steps work.
    hashing_stage = HashingTF(inputCol="addon_ids", outputCol="hashed_features")
    idf_stage = IDF(
        inputCol="hashed_features", outputCol="features", minDocFreq=1
    )

    # As a future improvement, we may add a sane value for the minimum cluster size
    # to BisectingKMeans (e.g. minDivisibleClusterSize). For now, just make sure
    # to pass along the random seed if needed for tests.
    # Compare against None rather than truthiness so that a seed of 0 is
    # honoured instead of being silently dropped.
    kmeans_kwargs = {"seed": random_seed} if random_seed is not None else {}
    bkmeans_stage = BisectingKMeans(k=num_clusters, **kmeans_kwargs)
    pipeline = Pipeline(stages=[hashing_stage, idf_stage, bkmeans_stage])

    # Run the pipeline and compute the results.
    model = pipeline.fit(addons_df)
    return model.transform(addons_df).select(["client_id", "prediction"])
Example #7
Source File: pipeline_util.py From sparktorch with MIT License | 6 votes |
def unwrap(pipeline):
    """Replace carrier stages in a Pipeline/PipelineModel with the Python objects they carry.

    Nested pipelines are unwrapped recursively. A stage is treated as a carrier
    when it is an instance of ``PysparkObjId._getCarrierClass()`` and the last
    entry of its stop-word list equals ``PysparkObjId._getPyObjId()``; the
    remaining entries are passed to ``load_byte_array`` to rebuild the object.

    :param pipeline: a ``Pipeline`` or ``PipelineModel``; it is modified in place.
    :return: the same ``pipeline`` object with its stages replaced.
    :raises TypeError: if ``pipeline`` is neither a Pipeline nor a PipelineModel.
    """
    if not isinstance(pipeline, (Pipeline, PipelineModel)):
        raise TypeError("Cannot recognize a pipeline of type %s." % type(pipeline))

    is_estimator = isinstance(pipeline, Pipeline)
    stages = pipeline.getStages() if is_estimator else pipeline.stages
    for i, stage in enumerate(stages):
        if isinstance(stage, (Pipeline, PipelineModel)):
            stages[i] = PysparkPipelineWrapper.unwrap(stage)
        if isinstance(stage, PysparkObjId._getCarrierClass()):
            stop_words = stage.getStopWords()
            # Guard against an empty stop-word list: indexing [-1] on it would
            # raise IndexError for a carrier-class stage that carries nothing.
            if stop_words and stop_words[-1] == PysparkObjId._getPyObjId():
                swords = stop_words[:-1]  # strip the id
                stages[i] = load_byte_array(swords)

    if is_estimator:
        pipeline.setStages(stages)
    else:
        pipeline.stages = stages
    return pipeline
Example #8
Source File: pipeline_util.py From sparkflow with MIT License | 6 votes |
def unwrap(pipeline):
    """Replace carrier stages in a Pipeline/PipelineModel with the Python objects they carry.

    Nested pipelines are unwrapped recursively. A stage is treated as a carrier
    when it is an instance of ``PysparkObjId._getCarrierClass()`` and the last
    entry of its stop-word list equals ``PysparkObjId._getPyObjId()``; the
    remaining entries are passed to ``load_byte_array`` to rebuild the object.

    :param pipeline: a ``Pipeline`` or ``PipelineModel``; it is modified in place.
    :return: the same ``pipeline`` object with its stages replaced.
    :raises TypeError: if ``pipeline`` is neither a Pipeline nor a PipelineModel.
    """
    if not isinstance(pipeline, (Pipeline, PipelineModel)):
        raise TypeError("Cannot recognize a pipeline of type %s." % type(pipeline))

    is_estimator = isinstance(pipeline, Pipeline)
    stages = pipeline.getStages() if is_estimator else pipeline.stages
    for i, stage in enumerate(stages):
        if isinstance(stage, (Pipeline, PipelineModel)):
            stages[i] = PysparkPipelineWrapper.unwrap(stage)
        if isinstance(stage, PysparkObjId._getCarrierClass()):
            stop_words = stage.getStopWords()
            # Guard against an empty stop-word list: indexing [-1] on it would
            # raise IndexError for a carrier-class stage that carries nothing.
            if stop_words and stop_words[-1] == PysparkObjId._getPyObjId():
                swords = stop_words[:-1]  # strip the id
                stages[i] = load_byte_array(swords)

    if is_estimator:
        pipeline.setStages(stages)
    else:
        pipeline.stages = stages
    return pipeline
Example #9
Source File: sc_vectorization.py From atap with Apache License 2.0 | 5 votes |
def make_vectorizer(stopwords=True, tfidf=True, n_features=5000):
    """Build an (unfitted) text vectorization Pipeline.

    The pipeline always tokenizes the "text" column and hashes term
    frequencies into ``n_features`` buckets. Stop-word removal and IDF
    weighting are added when ``stopwords`` / ``tfidf`` are truthy. Each stage
    reads the output column of the stage before it.
    """
    # Every pipeline starts with tokenization.
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    stages = [tokenizer]
    previous_col = tokenizer.getOutputCol()

    # Optional stop-word filtering.
    if stopwords:
        remover = StopWordsRemover(
            caseSensitive=False,
            outputCol="filtered_tokens",
            inputCol=previous_col,
        )
        stages.append(remover)
        previous_col = remover.getOutputCol()

    # Hashing term-frequency vectorizer.
    hasher = HashingTF(
        numFeatures=n_features,
        inputCol=previous_col,
        outputCol="frequency",
    )
    stages.append(hasher)
    previous_col = hasher.getOutputCol()

    # Optional IDF re-weighting.
    if tfidf:
        stages.append(IDF(inputCol=previous_col, outputCol="tfidf"))

    return Pipeline(stages=stages)


## Main functionality
Example #10
Source File: sc_classification.py From atap with Apache License 2.0 | 5 votes |
def main(sc, spark):
    """Train a multinomial logistic regression on the vectorized corpus and print its test error."""
    # Load the corpus and fit the vectorization pipeline on all of it.
    corpus = load_corpus(sc, spark)
    vectorizer_model = make_vectorizer().fit(corpus)

    # Fit the label indexer on the full corpus as well.
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    label_indexer_model = label_indexer.fit(corpus)

    # Hold out 20% of the rows for evaluation.
    training, test = corpus.randomSplit([0.8, 0.2])

    classifier = LogisticRegression(
        maxIter=10,
        regParam=0.3,
        elasticNetParam=0.8,
        family="multinomial",
        labelCol="indexedLabel",
        featuresCol="tfidf",
    )

    # Chain the fitted transformers with the classifier and train.
    stages = [vectorizer_model, label_indexer_model, classifier]
    model = Pipeline(stages=stages).fit(training)

    # Predict on the held-out rows and show a few of them.
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Accuracy of (prediction, true label); the test error is its complement.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel",
        predictionCol="prediction",
        metricName="accuracy",
    )
    test_error = 1.0 - evaluator.evaluate(predictions)
    print("Test Error = %g" % test_error)

    # Third stage is the fitted classifier; printing it gives a summary only.
    print(model.stages[2])
Example #11
Source File: sc_clustering.py From atap with Apache License 2.0 | 5 votes |
def main(sc, spark):
    """Cluster the corpus with Word2Vec + BisectingKMeans and print a per-cluster summary table."""
    corpus = load_corpus(sc, spark)

    # Tokenize, embed and cluster in a single pipeline.
    stages = [
        Tokenizer(inputCol="text", outputCol="tokens"),
        Word2Vec(vectorSize=7, minCount=0, inputCol="tokens", outputCol="vecs"),
        BisectingKMeans(k=10, featuresCol="vecs", maxIter=10),
    ]
    model = Pipeline(stages=stages).fit(corpus)
    corpus = model.transform(corpus)

    # Evaluate the clustering using the fitted k-means stage (last in the pipeline).
    kmeans_model = model.stages[-1]
    cost = kmeans_model.computeCost(corpus)
    cluster_sizes = kmeans_model.summary.clusterSizes
    # TODO: compute cost of each cluster individually

    # Describe each cluster by the words closest to its center.
    word2vec_model = model.stages[-2]
    table = [["Cluster", "Size", "Terms"]]
    for index, center in enumerate(kmeans_model.clusterCenters()):
        synonyms = word2vec_model.findSynonyms(center, 7)
        terms = " ".join(row.word for row in synonyms.take(7))
        table.append([index, cluster_sizes[index], terms])

    # Print results
    print(tabulate(table))
    print("Sum of square distance to center: {:0.3f}".format(cost))
Example #12
Source File: spark_ml_pipline.py From Hanhan-Spark-Python with MIT License | 5 votes |
def main():
    """Compare an untuned text classifier with a cross-validated one and save both scores.

    Reads the training/testing parquet inputs, evaluates a fixed
    tokenizer -> HashingTF -> LogisticRegression pipeline and a 2-fold
    cross-validated variant on the test set, and writes the two evaluator
    outputs (one per line) as a text file.
    """
    # Read training and testing data as DataFrames
    sql_context = SQLContext(sc)
    train_df = sql_context.read.parquet(training_input)
    test_df = sql_context.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    words_col = tokenizer.getOutputCol()
    evaluator = BinaryClassificationEvaluator()

    # Baseline: fixed hyper-parameters, no tuning
    baseline_tf = HashingTF(inputCol=words_col, outputCol="features", numFeatures=1000)
    baseline_lr = LogisticRegression(maxIter=20, regParam=0.1)
    baseline_pipeline = Pipeline(stages=[tokenizer, baseline_tf, baseline_lr])
    baseline_model = baseline_pipeline.fit(train_df)
    notuning_output = evaluator.evaluate(baseline_model.transform(test_df))

    # Cross validation over the number of hashed features and regularization
    hashing_tf = HashingTF(inputCol=words_col, outputCol="features")
    lr = LogisticRegression(maxIter=20)
    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(hashing_tf.numFeatures, [1000, 5000, 10000])
    grid_builder = grid_builder.addGrid(
        lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
    param_grid = grid_builder.build()
    tuned_pipeline = Pipeline(stages=[tokenizer, hashing_tf, lr])
    cross_validator = CrossValidator(estimator=tuned_pipeline,
                                     estimatorParamMaps=param_grid,
                                     evaluator=evaluator,
                                     numFolds=2)
    cv_model = cross_validator.fit(train_df)

    # Make predictions on test documents. cv_model uses the best model found.
    best_output = evaluator.evaluate(cv_model.transform(test_df))

    report = '\n'.join((str(notuning_output), str(best_output)))
    sc.parallelize([report]).saveAsTextFile(output)
Example #13
Source File: test_ml_model.py From elephas with MIT License | 5 votes |
def test_spark_ml_model(spark_context):
    """Fit an ElephasEstimator inside a Spark ML Pipeline and print its metrics.

    Builds train/test DataFrames from the module-level arrays, configures the
    estimator from the module-level Keras model, fits it as a single-stage
    pipeline and prints precision and recall computed on the test set.
    """
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)

    # Evaluate Spark model by evaluating the underlying model
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    # MulticlassMetrics expects (prediction, label) pairs, in that order;
    # the reverse order would swap the roles of predictions and labels.
    prediction_and_label = pnl.rdd.map(lambda row: (row.prediction, row.label))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.precision())
    print(metrics.recall())
Example #14
Source File: als.py From mlflow with Apache License 2.0 | 5 votes |
def train_als(ratings_data, split_prop, max_iter, reg_param, rank, cold_start_strategy):
    """Train an ALS recommender on parquet ratings and log it to MLflow.

    :param ratings_data: path of the parquet ratings data to read.
    :param split_prop: weight of the training split; the test split gets
        ``1 - split_prop``.
    :param max_iter: value passed to ``ALS.setMaxIter``.
    :param reg_param: value passed to ``ALS.setRegParam``.
    :param rank: value passed to ``ALS.setRank``.
    :param cold_start_strategy: value passed to ``ALS.setColdStartStrategy``.

    Logs the row counts, the train/test MSE and the fitted pipeline model
    (under the name "als-model") to MLflow.
    """
    seed = 42

    spark = pyspark.sql.SparkSession.builder.getOrCreate()

    ratings_df = spark.read.parquet(ratings_data)
    (training_df, test_df) = ratings_df.randomSplit([split_prop, 1 - split_prop], seed=seed)
    training_df.cache()
    test_df.cache()

    # Count each split once and reuse the numbers for both the metrics and the
    # console message instead of triggering a second count() per split.
    training_nrows = training_df.count()
    test_nrows = test_df.count()
    mlflow.log_metric("training_nrows", training_nrows)
    mlflow.log_metric("test_nrows", test_nrows)

    print('Training: {0}, test: {1}'.format(training_nrows, test_nrows))

    als = (ALS()
           .setUserCol("userId")
           .setItemCol("movieId")
           .setRatingCol("rating")
           .setPredictionCol("predictions")
           .setMaxIter(max_iter)
           .setSeed(seed)
           .setRegParam(reg_param)
           .setColdStartStrategy(cold_start_strategy)
           .setRank(rank))

    als_model = Pipeline(stages=[als]).fit(training_df)

    reg_eval = RegressionEvaluator(predictionCol="predictions", labelCol="rating", metricName="mse")

    predicted_test_dF = als_model.transform(test_df)

    test_mse = reg_eval.evaluate(predicted_test_dF)
    train_mse = reg_eval.evaluate(als_model.transform(training_df))

    print('The model had a MSE on the test set of {0}'.format(test_mse))
    print('The model had a MSE on the (train) set of {0}'.format(train_mse))
    mlflow.log_metric("test_mse", test_mse)
    mlflow.log_metric("train_mse", train_mse)
    mlflow.spark.log_model(als_model, "als-model")
Example #15
Source File: __init__.py From pyspark2pmml with GNU Affero General Public License v3.0 | 5 votes |
def testWorkflow(self):
    """Exercise PMMLBuilder end to end on a decision tree fitted to the Iris CSV.

    Checks the type of the built PMML object, the content of the byte-array
    export, and that the non-compact file export is more than 100 bytes
    larger than the compact one.
    """
    df = self.sqlContext.read.csv(os.path.join(os.path.dirname(__file__), "resources/Iris.csv"), header=True, inferSchema=True)

    formula = RFormula(formula="Species ~ .")
    classifier = DecisionTreeClassifier()
    pipeline = Pipeline(stages=[formula, classifier])
    pipelineModel = pipeline.fit(df)

    pmmlBuilder = PMMLBuilder(self.sc, df, pipelineModel) \
        .verify(df.sample(False, 0.1))

    pmml = pmmlBuilder.build()
    self.assertIsInstance(pmml, JavaObject)

    pmmlByteArray = pmmlBuilder.buildByteArray()
    self.assertIsInstance(pmmlByteArray, (bytes, bytearray))

    pmmlString = pmmlByteArray.decode("UTF-8")
    self.assertTrue("<PMML xmlns=\"http://www.dmg.org/PMML-4_3\" xmlns:data=\"http://jpmml.org/jpmml-model/InlineTable\" version=\"4.3\">" in pmmlString)
    self.assertTrue("<VerificationFields>" in pmmlString)

    # Hold both temporary files in context managers so they are closed (and
    # removed) deterministically, even when an assertion fails. The size
    # comparison has to happen while both files still exist.
    with tempfile.NamedTemporaryFile(prefix="pyspark2pmml-", suffix=".pmml") as nonCompactFile, \
            tempfile.NamedTemporaryFile(prefix="pyspark2pmml-", suffix=".pmml") as compactFile:
        pmmlBuilder = pmmlBuilder.putOption(classifier, "compact", False)
        nonCompactPmmlPath = pmmlBuilder.buildFile(nonCompactFile.name)

        pmmlBuilder = pmmlBuilder.putOption(classifier, "compact", True)
        compactPmmlPath = pmmlBuilder.buildFile(compactFile.name)

        self.assertGreater(os.path.getsize(nonCompactPmmlPath), os.path.getsize(compactPmmlPath) + 100)
Example #16
Source File: test_search_2.py From spark-sklearn with Apache License 2.0 | 5 votes |
def test_cv_lasso_with_mllib_featurization(self):
    """Grid-search a scikit-learn Lasso over features produced by an MLlib pipeline.

    Asserts that the grid search reports one result per candidate alpha.
    """
    rows = [
        ('hi there', 0.0),
        ('what is up', 1.0),
        ('huh', 1.0),
        ('now is the time', 5.0),
        ('for what', 0.0),
        ('the spark was there', 5.0),
        ('and so', 3.0),
        ('were many socks', 0.0),
        ('really', 1.0),
        ('too cool', 2.0),
    ]
    reviews = self.sql.createDataFrame(rows, ["review", "rating"])

    # Feature extraction using MLlib
    featurizer = Pipeline(stages=[
        Tokenizer(inputCol="review", outputCol="words"),
        HashingTF(inputCol="words", outputCol="features", numFeatures=20000),
    ])
    featurized = featurizer.fit(reviews).transform(reviews)

    # Expose the hashed features under the name "review" for the sklearn side.
    selected = featurized.select(featurized.features.alias("review"), "rating")
    df = self.converter.toPandas(selected)

    alphas = (0.001, 0.005, 0.01)
    skl_pipeline = SKL_Pipeline([('lasso', SKL_Lasso())])
    parameters = {'lasso__alpha': alphas}

    grid_search = GridSearchCV(self.sc, skl_pipeline, parameters)
    skl_gs = grid_search.fit(df.review.values, df.rating.values)
    assert len(skl_gs.cv_results_['params']) == len(alphas)
Example #17
Source File: test_decision_tree_regressor.py From onnxmltools with MIT License | 5 votes |
def test_decision_tree_regressor_pipeline(self):
    """Convert a VectorIndexer -> DecisionTreeRegressor pipeline to ONNX and compare predictions."""
    import os

    script_file = inspect.getfile(inspect.currentframe())
    this_script_dir = os.path.dirname(os.path.abspath(script_file))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)

    # Keep only five of the original features (positions 125..129).
    feature_count = 5

    def truncate(vector):
        return SparseVector(feature_count, range(0, feature_count),
                            vector.toArray()[125:130])

    self.spark.udf.register("truncateFeatures", truncate, VectorUDT())
    data = original_data.selectExpr("label", "truncateFeatures(features) as features")

    feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                    maxCategories=4, handleInvalid='error')
    (training_data, test_data) = data.randomSplit([0.7, 0.3])
    regressor = DecisionTreeRegressor(featuresCol="indexedFeatures")
    model = Pipeline(stages=[feature_indexer, regressor]).fit(training_data)

    input_types = [('features', FloatTensorType([1, feature_count]))]
    model_onnx = convert_sparkml(model, 'Sparkml Decision Tree Regressor Pipeline',
                                 input_types, spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(test_data)
    features_series = test_data.toPandas().features
    data_np = features_series.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDecisionTreeRegressorPipeline")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #18
Source File: test_decision_tree_classifier.py From onnxmltools with MIT License | 5 votes |
def test_tree_pipeline(self):
    """Convert a label/feature indexer + DecisionTreeClassifier pipeline to ONNX and compare outputs.

    The comparison is done on a single-row sample of the input data.
    """
    import os
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    #
    # truncate the features
    #
    feature_count = 5
    self.spark.udf.register("truncateFeatures",
                            lambda x: SparseVector(feature_count, range(0, feature_count), x.toArray()[125:130]),
                            VectorUDT())
    data = original_data.selectExpr("cast(label as string) as label", "truncateFeatures(features) as features")
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel", handleInvalid='error')
    feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                    maxCategories=10, handleInvalid='error')

    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, dt])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(model, 'Sparkml Decision Tree Pipeline', [
        ('label', StringTensorType([1, 1])),
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    # Build the one-row sample once and collect each DataFrame to pandas a
    # single time, instead of re-running limit(1)/toPandas() for every column.
    sample = data.limit(1)
    sample_pd = sample.toPandas()
    predicted_pd = model.transform(sample).toPandas()
    data_np = {
        'label': sample_pd.label.values,
        'features': sample_pd.features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    }
    expected = [
        predicted_pd.indexedLabel.values.astype(numpy.int64),
        predicted_pd.prediction.values.astype(numpy.int64),
        predicted_pd.probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDecisionTreePipeline")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexedLabel', 'prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #19
Source File: test_random_forest_classifier.py From onnxmltools with MIT License | 5 votes |
def test_random_forrest_classification(self):
    """Convert a label/feature indexer + RandomForestClassifier pipeline to ONNX and compare outputs."""
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    #
    # truncate the features
    #
    feature_count = 5
    self.spark.udf.register("truncateFeatures",
                            lambda x: SparseVector(feature_count, range(0, feature_count), x.toArray()[125:130]),
                            VectorUDT())
    data = original_data.selectExpr("cast(label as string) as label", "truncateFeatures(features) as features")
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                    maxCategories=10, handleInvalid='keep')

    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(model, 'Sparkml RandomForest Classifier', [
        ('label', StringTensorType([1, 1])),
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    # Collect each DataFrame to pandas once rather than once per column.
    predicted = model.transform(data)
    data_pd = data.toPandas()
    predicted_pd = predicted.toPandas()
    data_np = {
        'label': data_pd.label.values,
        'features': data_pd.features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    }
    expected = [
        predicted_pd.indexedLabel.values.astype(numpy.int64),
        predicted_pd.prediction.values.astype(numpy.float32),
        predicted_pd.probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlRandomForestClassifier")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexedLabel', 'prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #20
Source File: sparkRegressor.py From mmtf-pyspark with Apache License 2.0 | 4 votes |
def fit(self, data):
    '''Fit the configured predictor on a training split and score it on the rest.

    Dataset must at least contain the following two columns:
    label : the class labels
    features : feature vector

    Parameters
    ----------
    data : Dataset<Row>

    Returns
    -------
    dict
        mapping of metrics
    '''
    # Hold out `testFraction` of the rows for evaluation.
    train_fraction = 1.0 - self.testFraction
    trainingData, testData = data.randomSplit(
        [train_fraction, self.testFraction], self.seed)

    # Point the predictor at the label and feature columns.
    self.predictor.setLabelCol(self.label).setFeaturesCol("features")

    # Fit the predictor as a single-stage pipeline and predict on the test split.
    model = Pipeline().setStages([self.predictor]).fit(trainingData)
    predictions = model.transform(testData)

    # Name of the method: the part of str(predictor) before the first "_".
    # TODO self.predictor.getClass().getSimpleName
    method_name = str(self.predictor).split('_')[0]

    # Display some sample predictions
    print(f"Sample predictions: {method_name}")
    primaryKey = predictions.columns[0]
    sample = predictions.select(primaryKey, self.label, "prediction")
    sample.sample(False, 0.1, self.seed).show(50)

    # Collect metrics
    evaluator = RegressionEvaluator()
    evaluator = evaluator.setLabelCol(self.label)
    evaluator = evaluator.setPredictionCol("prediction")
    evaluator = evaluator.setMetricName("rmse")

    metrics = OrderedDict()
    metrics["Method"] = method_name
    metrics["rmse"] = str(evaluator.evaluate(predictions))
    return metrics
Example #21
Source File: test_pipeline.py From onnxmltools with MIT License | 4 votes |
def test_model_pipeline_4_stage(self):
    """Convert an indexer/one-hot/assembler + LogisticRegression pipeline to ONNX and compare outputs."""
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select('income', *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
        stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))

    stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))
    stages.append(StringIndexer(inputCol='income', outputCol='label', handleInvalid='skip'))
    stages.append(LogisticRegression(maxIter=100, tol=0.0001))
    pipeline = Pipeline(stages=stages)

    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('income', StringTensorType([1, 1])),
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'income': test_data.select('income').toPandas().values,
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    # Collect the predictions to pandas once; every expected output is a
    # column of the same frame.
    predicted_pd = predicted.toPandas()
    expected = [
        predicted_pd.label.values.astype(numpy.float32),
        predicted_pd.prediction.values.astype(numpy.float32),
        predicted_pd.probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_4Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['label', 'prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #22
Source File: test_pipeline.py From onnxmltools with MIT License | 4 votes |
def test_model_pipeline_2_stage(self):
    """Convert a StringIndexer -> OneHotEncoderEstimator pipeline to ONNX and compare outputs."""
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select(*cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
        stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec']))
    pipeline = Pipeline(stages=stages)

    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    # Collect the predictions to pandas once; the three one-hot columns are
    # all read from the same frame.
    predicted_pd = predicted.toPandas()
    predicted_np = [
        predicted_pd.workclass_vec.apply(lambda x: pandas.Series(x.toArray())).values,
        predicted_pd.education_vec.apply(lambda x: pandas.Series(x.toArray())).values,
        predicted_pd.marital_status_vec.apply(lambda x: pandas.Series(x.toArray())).values
    ]

    expected = [numpy.asarray([expand_one_hot_vec(x) for x in row]) for row in predicted_np]

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_2Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['workclass_vec', 'education_vec', 'marital_status_vec'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)