Python pyspark.ml.classification.LogisticRegression() Examples
The following are 14 code examples of pyspark.ml.classification.LogisticRegression().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pyspark.ml.classification, or try the search function.
Example #1
Source File: named_image_test.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def test_featurizer_in_pipeline(self):
    """
    Check that DeepImageFeaturizer can be used as a stage of an MLlib Pipeline.

    The fitted pipeline is only scored on the rows it was trained on, so this
    says nothing about how well the featurization generalizes.
    """
    stages = [
        DeepImageFeaturizer(inputCol="image", outputCol="features", modelName=self.name),
        LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label"),
    ]
    pipeline = Pipeline(stages=stages)

    # Attach arbitrary binary labels (derived from the image origin) so that
    # logistic regression has something to fit.
    # TODO: it's weird that the test fails on some combinations of labels. check why.
    to_label = udf(lambda x: abs(hash(x)) % 2, IntegerType())
    origin = self.imageDF["image"]["origin"]
    labeled_df = self.imageDF.withColumn("label", to_label(origin))

    fitted = pipeline.fit(labeled_df)

    # With 5 examples and e.g. 131k features (for InceptionV3), the model
    # ought to at least get the training examples right.
    scored_rows = fitted.transform(labeled_df).collect()
    for scored in scored_rows:
        self.assertEqual(int(scored.prediction), scored.label)
Example #2
Source File: test_spark_model_export.py From mlflow with Apache License 2.0 | 6 votes |
def spark_model_estimator(iris_df, spark_context):
    """
    Fit a bare LogisticRegression estimator (no Pipeline) on the iris data.

    :param iris_df: 3-tuple of (feature names, pandas DataFrame, Spark DataFrame).
    :param spark_context: not used in the body; kept for the existing signature.
    :return: SparkModelWithData bundling the fitted model, the assembled Spark
             DataFrame, the pandas DataFrame and the model's predictions.
    """
    feature_names, iris_pandas_df, iris_spark_df = iris_df

    # Pack the individual feature columns into the single "features" vector
    # column that the estimator is fitted on.
    features_df = VectorAssembler(
        inputCols=feature_names, outputCol="features"
    ).transform(iris_spark_df)

    # Fit the model
    estimator = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    model = estimator.fit(features_df)

    prediction_rows = model.transform(features_df).select("prediction").collect()
    preds = [row.prediction for row in prediction_rows]

    return SparkModelWithData(
        model=model,
        spark_df=features_df,
        pandas_df=iris_pandas_df,
        predictions=preds,
    )
Example #3
Source File: test_one_vs_rest.py From onnxmltools with MIT License | 6 votes |
def test_one_vs_rest(self):
    """
    Convert a fitted OneVsRest(LogisticRegression) model to ONNX and compare
    the ONNX predictions with Spark's own predictions on the same data.
    """
    # Locate the sample data relative to this test file.
    script_file = inspect.getfile(inspect.currentframe())
    script_dir = os.path.dirname(os.path.abspath(script_file))
    input_path = os.path.join(script_dir, "data", "sample_multiclass_classification_data.txt")
    data = self.spark.read.format("libsvm").load(input_path)

    base_classifier = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
    model = OneVsRest(classifier=base_classifier).fit(data)

    # Second field of the first row is the feature vector; its size fixes
    # the width of the ONNX input tensor.
    feature_count = data.first()[1].size
    initial_types = [('features', FloatTensorType([1, feature_count]))]
    model_onnx = convert_sparkml(model, 'Sparkml OneVsRest', initial_types,
                                 spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    feature_rows = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray()))
    data_np = feature_rows.values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlOneVsRest")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #4
Source File: spark_ml_pipline.py From Hanhan-Spark-Python with MIT License | 5 votes |
def main():
    """
    Score an untuned text-classification pipeline against a cross-validated one.

    Relies on names defined outside this function: `sc` (used to build the
    SQLContext and to write the result), `training_input` / `testing_input`
    (parquet paths) and `output` (destination for the two scores).
    """
    # Read training data as a DataFrame
    sql_context = SQLContext(sc)
    train_df = sql_context.read.parquet(training_input)
    test_df = sql_context.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    fixed_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features",
                         numFeatures=1000)
    fixed_lr = LogisticRegression(maxIter=20, regParam=0.1)
    fixed_model = Pipeline(stages=[tokenizer, fixed_tf, fixed_lr]).fit(train_df)
    notuning_output = evaluator.evaluate(fixed_model.transform(test_df))

    # for cross validation
    hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)
    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(hashing_tf.numFeatures, [1000, 5000, 10000])
    grid_builder = grid_builder.addGrid(
        lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
    param_grid = grid_builder.build()

    tuned_pipeline = Pipeline(stages=[tokenizer, hashing_tf, lr])
    cross_validator = CrossValidator(estimator=tuned_pipeline,
                                     estimatorParamMaps=param_grid,
                                     evaluator=evaluator,
                                     numFolds=2)
    cv_model = cross_validator.fit(train_df)

    # Make predictions on test documents. cvModel uses the best model found.
    best_output = evaluator.evaluate(cv_model.transform(test_df))

    # One text record: untuned score on the first line, tuned score on the second.
    report = '\n'.join([str(notuning_output), str(best_output)])
    sc.parallelize([report]).saveAsTextFile(output)
Example #5
Source File: sc_classification.py From atap with Apache License 2.0 | 5 votes |
def main(sc, spark):
    """
    Train a multinomial logistic regression on the corpus and print its
    test error.

    :param sc: passed through to load_corpus.
    :param spark: passed through to load_corpus.
    """
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vectorizer = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(corpus)

    # Split the data into training and test sets
    train_set, test_set = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    classifier = LogisticRegression(
        maxIter=10,
        regParam=0.3,
        elasticNetParam=0.8,
        family="multinomial",
        labelCol="indexedLabel",
        featuresCol="tfidf",
    )

    # Create the model
    stages = [vectorizer, label_indexer, classifier]
    fitted = Pipeline(stages=stages).fit(train_set)

    # Make predictions
    predictions = fitted.transform(test_set)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    accuracy = MulticlassClassificationEvaluator(
        labelCol="indexedLabel",
        predictionCol="prediction",
        metricName="accuracy",
    ).evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    # Third pipeline stage is the fitted classifier; summary only
    print(fitted.stages[2])
Example #6
Source File: taar_ensemble.py From telemetry-airflow with Mozilla Public License 2.0 | 5 votes |
def dump_training_info(blorModel):
    """
    Print training diagnostics for a fitted model.

    Handy when LogisticRegression does not converge to a solution: shows the
    iteration count, the intercept, the coefficients, and then the objective
    value recorded for every iteration (one per line).

    :param blorModel: object exposing `summary` (with `totalIterations` and
                      `objectiveHistory`), `intercept` and `coefficients`.
    """
    summary = blorModel.summary

    print("Total iterations: %d" % summary.totalIterations)
    print("Intercepts: " + str(blorModel.intercept))
    print("Coefficients: " + str(blorModel.coefficients))

    # Obtain the objective per iteration
    history = summary.objectiveHistory
    print("objectiveHistory:")
    for value in history:
        print(value)
Example #7
Source File: taar_ensemble.py From telemetry-airflow with Mozilla Public License 2.0 | 5 votes |
def compute_regression(spark, rdd_list, regParam, elasticNetParam):
    """
    Fit a weighted LogisticRegression on the union of the given RDDs.

    :param spark: session whose sparkContext is used to union the RDDs.
    :param rdd_list: RDDs to merge into a single training DataFrame.
    :param regParam: forwarded to LogisticRegression as regParam.
    :param elasticNetParam: forwarded to LogisticRegression as elasticNetParam.
    :return: the fitted model.
    """
    # Merge every RDD into one DataFrame; the estimator reads per-row
    # weights from its "weight" column.
    training_df = spark.sparkContext.union(rdd_list).toDF()

    estimator = LogisticRegression(
        maxIter=50,
        regParam=regParam,
        weightCol="weight",
        elasticNetParam=elasticNetParam,
    )
    return estimator.fit(training_df)
Example #8
Source File: taar_ensemble.py From python_mozetl with MIT License | 5 votes |
def dump_training_info(blorModel):
    """
    Write a fitted model's training diagnostics to stdout.

    Intended as a debugging aid for LogisticRegression runs that fail to
    converge. Output is the total iteration count, the intercept, the
    coefficients, and the objective value of each iteration on its own line.

    :param blorModel: object exposing `summary` (with `totalIterations` and
                      `objectiveHistory`), `intercept` and `coefficients`.
    """
    training_summary = blorModel.summary

    print("Total iterations: %d" % training_summary.totalIterations)
    print("Intercepts: " + str(blorModel.intercept))
    print("Coefficients: " + str(blorModel.coefficients))

    # Obtain the objective per iteration
    per_iteration = training_summary.objectiveHistory
    print("objectiveHistory:")
    for step_objective in per_iteration:
        print(step_objective)
Example #9
Source File: taar_ensemble.py From python_mozetl with MIT License | 5 votes |
def compute_regression(spark, rdd_list, regParam, elasticNetParam):
    """
    Train a LogisticRegression model on all of `rdd_list` combined.

    :param spark: session providing the sparkContext used for the union.
    :param rdd_list: RDDs that together form the training set.
    :param regParam: regParam setting handed to LogisticRegression.
    :param elasticNetParam: elasticNetParam setting handed to LogisticRegression.
    :return: the model returned by fitting the estimator.
    """
    combined_rdd = spark.sparkContext.union(rdd_list)
    df0 = combined_rdd.toDF()

    # weightCol="weight": per-row weights come from the "weight" column.
    settings = dict(
        maxIter=50,
        regParam=regParam,
        weightCol="weight",
        elasticNetParam=elasticNetParam,
    )
    return LogisticRegression(**settings).fit(df0)
Example #10
Source File: test_spark_model_export.py From mlflow with Apache License 2.0 | 5 votes |
def spark_model_iris(iris_df):
    """
    Fit a two-stage Pipeline (VectorAssembler -> LogisticRegression) on iris.

    :param iris_df: 3-tuple of (feature names, pandas DataFrame, Spark DataFrame).
    :return: SparkModelWithData bundling the fitted pipeline model, the raw
             Spark DataFrame, the pandas DataFrame and the model's predictions.
    """
    feature_names, iris_pandas_df, iris_spark_df = iris_df

    stages = [
        VectorAssembler(inputCols=feature_names, outputCol="features"),
        LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8),
    ]

    # Fit the model
    model = Pipeline(stages=stages).fit(iris_spark_df)

    prediction_rows = model.transform(iris_spark_df).select("prediction").collect()
    preds = [row.prediction for row in prediction_rows]

    return SparkModelWithData(
        model=model,
        spark_df=iris_spark_df,
        pandas_df=iris_pandas_df,
        predictions=preds,
    )
Example #11
Source File: converter_test.py From spark-sklearn with Apache License 2.0 | 5 votes |
def test_LogisticRegression_spark2skl(self):
    """
    Convert a fitted Spark LogisticRegression model to scikit-learn and check
    that the result has the right type and matches the Spark model.
    """
    spark_model = LogisticRegression().fit(self.df)
    converted = self.converter.toSKLearn(spark_model)

    is_sklearn_lr = isinstance(converted, SKL_LogisticRegression)
    self.assertTrue(
        is_sklearn_lr,
        "Expected sklearn LogisticRegression but found type %s" % type(converted))

    self._compare_GLMs(converted, spark_model)

    # Make sure this doesn't throw an error
    converted.predict_proba(self.X)
Example #12
Source File: test_linear_classifier.py From onnxmltools with MIT License | 5 votes |
def test_model_logistic_regression_binary_class(self):
    """
    Convert a binary LogisticRegression model to ONNX and compare the ONNX
    prediction/probability outputs with Spark's on the same data.
    """
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    #
    # truncate the features
    #
    # Keeps only entries 125..129 of each feature vector, as a 5-wide SparseVector.
    self.spark.udf.register("truncateFeatures", lambda x: SparseVector(5, range(0,5), x.toArray()[125:130]), VectorUDT())
    data = original_data.selectExpr("label", "truncateFeatures(features) as features")
    lr = LogisticRegression(maxIter=100, tol=0.0001)
    model = lr.fit(data)
    # the name of the input for Logistic Regression is 'features'
    C = model.numFeatures
    model_onnx = convert_sparkml(model, 'sparkml logistic regression', [('features', FloatTensorType([1, C]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    import pandas
    predicted = model.transform(data)
    # Collect the scored DataFrame once: every toPandas() call re-runs the
    # Spark job, and both expected arrays must come from the same rows.
    predicted_pd = predicted.toPandas()
    data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted_pd.prediction.values.astype(numpy.float32),
        predicted_pd.probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    # known error in onnxruntime 0.3.0 case
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlLogisticRegression")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #13
Source File: datasetClassifier.py From mmtf-pyspark with Apache License 2.0 | 4 votes |
def main(argv):
    """
    Benchmark several Spark classifiers on one parquet dataset.

    :param argv: argv[0] is the parquet path to read, argv[1] the name of the
                 label column.

    Prints dataset statistics before and after class balancing, then the
    metrics of DecisionTree, RandomForest, LogisticRegression and
    MultilayerPerceptron classifiers, and finally the elapsed time.
    """
    # Name of prediction column
    label = argv[1]

    start = time.time()

    builder = SparkSession.builder.master("local[*]").appName("datasetClassifier")
    spark = builder.getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    # The first row's feature vector fixes the input width.
    featureCount = len(data.first()["features"])
    print(f"Feature count    : {featureCount}")

    classCount = int(data.select(label).distinct().count())
    print(f"Class count    : {classCount}")

    print(f"Dataset size (unbalanced)    : {data.count()}")
    data.groupby(label).count().show(classCount)

    data = datasetBalancer.downsample(data, label, 1)

    print(f"Dataset size (balanced)  : {data.count()}")
    data.groupby(label).count().show(classCount)

    testFraction = 0.3
    seed = 123

    def report(classifier):
        # Fit/evaluate one classifier and print each metric as "name<TAB>value".
        scores = SparkMultiClassClassifier(classifier, label, testFraction, seed).fit(data)
        for k, v in scores.items():
            print(f"{k}\t{v}")

    # DecisionTree
    report(DecisionTreeClassifier())

    # RandomForest
    report(RandomForestClassifier())

    # LogisticRegression
    report(LogisticRegression())

    # MultilayerPerceptronClassifier
    layers = [featureCount, 10, classCount]
    perceptron = MultilayerPerceptronClassifier().setLayers(layers)
    perceptron = perceptron.setBlockSize(128).setSeed(1234).setMaxIter(200)
    report(perceptron)

    end = time.time()
    print("Time: %f  sec." %(end-start))
Example #14
Source File: test_pipeline.py From onnxmltools with MIT License | 4 votes |
def test_model_pipeline_4_stage(self):
    """
    Convert a multi-stage Pipeline (StringIndexer + one-hot encoder per
    categorical column, VectorAssembler, label StringIndexer,
    LogisticRegression) to ONNX and compare its label/prediction/probability
    outputs with Spark's on a held-out split.
    """
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv')\
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select('income', *cols).limit(1000).randomSplit([0.9, 0.1],seed=1)

    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
        stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))

    stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))
    stages.append(StringIndexer(inputCol='income', outputCol='label', handleInvalid='skip'))
    stages.append(LogisticRegression(maxIter=100, tol=0.0001))
    pipeline = Pipeline(stages=stages)

    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('income', StringTensorType([1, 1])),
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)

    # Collect each Spark DataFrame exactly once. Every toPandas() call re-runs
    # the whole limit/randomSplit plan, so collecting column by column is slow
    # and does not guarantee that the separately collected columns line up.
    test_pd = test_data.toPandas()
    predicted_pd = predicted.toPandas()

    # Double brackets keep each input as an (n, 1) column, matching what
    # select(<name>).toPandas().values produced.
    data_np = {
        'income': test_pd[['income']].values,
        'workclass': test_pd[['workclass']].values,
        'education': test_pd[['education']].values,
        'marital_status': test_pd[['marital_status']].values
    }
    expected = [
        predicted_pd.label.values.astype(numpy.float32),
        predicted_pd.prediction.values.astype(numpy.float32),
        predicted_pd.probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_4Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['label', 'prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)