Python Examples of pyspark.ml.classification.LogisticRegression

Source File: named_image_test.py From spark-deep-learning with Apache License 2.0

6 votes

def test_featurizer_in_pipeline(self):
        """
        Tests that featurizer fits into an MLlib Pipeline.
        Does not test how good the featurization is for generalization.
        """
        featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                         modelName=self.name)
        lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
        pipeline = Pipeline(stages=[featurizer, lr])

        # add arbitrary labels to run logistic regression
        # TODO: it's weird that the test fails on some combinations of labels. check why.
        label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
        train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["image"]["origin"]))

        lrModel = pipeline.fit(train_df)
        # see if we at least get the training examples right.
        # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
        pred_df_collected = lrModel.transform(train_df).collect()
        for row in pred_df_collected:
            self.assertEqual(int(row.prediction), row.label)

Source File: test_spark_model_export.py From mlflow with Apache License 2.0

6 votes

def spark_model_estimator(iris_df, spark_context):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    features_df = assembler.transform(iris_spark_df)
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    # Fit the model
    model = lr.fit(features_df)
    preds_df = model.transform(features_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model,
                              spark_df=features_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds)

Source File: test_one_vs_rest.py From onnxmltools with MIT License

6 votes

def test_one_vs_rest(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "sample_multiclass_classification_data.txt")
        data = self.spark.read.format("libsvm").load(input_path)
        lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(data)

        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(model, 'Sparkml OneVsRest', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlOneVsRest")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)

Source File: spark_ml_pipline.py From Hanhan-Spark-Python with MIT License

5 votes

def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)

Source File: sc_classification.py From atap with Apache License 2.0

5 votes

def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    gbtModel = model.stages[2]
    print(gbtModel)  # summary only

Source File: taar_ensemble.py From telemetry-airflow with Mozilla Public License 2.0

5 votes

def dump_training_info(blorModel):
    """
    This function is useful for debugging when we do not converge to a
    solution during LogisticRegression.
    """
    trainingSummary = blorModel.summary

    print("Total iterations: %d" % trainingSummary.totalIterations)
    print("Intercepts: " + str(blorModel.intercept))
    print("Coefficients: " + str(blorModel.coefficients))
    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)

Source File: taar_ensemble.py From telemetry-airflow with Mozilla Public License 2.0

5 votes

def compute_regression(spark, rdd_list, regParam, elasticNetParam):
    df0 = spark.sparkContext.union(rdd_list).toDF()
    blor = LogisticRegression(
        maxIter=50,
        regParam=regParam,
        weightCol="weight",
        elasticNetParam=elasticNetParam,
    )

    blorModel = blor.fit(df0)
    return blorModel

Source File: taar_ensemble.py From python_mozetl with MIT License

5 votes

def dump_training_info(blorModel):
    """
    This function is useful for debugging when we do not converge to a
    solution during LogisticRegression.
    """
    trainingSummary = blorModel.summary

    print("Total iterations: %d" % trainingSummary.totalIterations)
    print("Intercepts: " + str(blorModel.intercept))
    print("Coefficients: " + str(blorModel.coefficients))
    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)

Source File: taar_ensemble.py From python_mozetl with MIT License

5 votes

def compute_regression(spark, rdd_list, regParam, elasticNetParam):
    df0 = spark.sparkContext.union(rdd_list).toDF()
    blor = LogisticRegression(
        maxIter=50,
        regParam=regParam,
        weightCol="weight",
        elasticNetParam=elasticNetParam,
    )

    blorModel = blor.fit(df0)
    return blorModel

Source File: test_spark_model_export.py From mlflow with Apache License 2.0

5 votes

def spark_model_iris(iris_df):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(iris_spark_df)
    preds_df = model.transform(iris_spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model,
                              spark_df=iris_spark_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds)

Source File: converter_test.py From spark-sklearn with Apache License 2.0

5 votes

def test_LogisticRegression_spark2skl(self):
        lr = LogisticRegression().fit(self.df)
        skl_lr = self.converter.toSKLearn(lr)
        self.assertTrue(isinstance(skl_lr, SKL_LogisticRegression),
                        "Expected sklearn LogisticRegression but found type %s" % type(skl_lr))
        self._compare_GLMs(skl_lr, lr)
        # Make sure this doesn't throw an error
        skl_lr.predict_proba(self.X)

Source File: test_linear_classifier.py From onnxmltools with MIT License

5 votes

def test_model_logistic_regression_binary_class(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
        original_data = self.spark.read.format("libsvm").load(input_path)
        #
        # truncate the features
        #
        self.spark.udf.register("truncateFeatures", lambda x: SparseVector(5, range(0,5), x.toArray()[125:130]),
                                VectorUDT())
        data = original_data.selectExpr("label", "truncateFeatures(features) as features")
        lr = LogisticRegression(maxIter=100, tol=0.0001)
        model = lr.fit(data)
        # the name of the input for Logistic Regression is 'features'
        C = model.numFeatures
        model_onnx = convert_sparkml(model, 'sparkml logistic regression', [('features', FloatTensorType([1, C]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        import pandas
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        ]
        # known error in onnxruntime 0.3.0 case
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlLogisticRegression")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)

Source File: datasetClassifier.py From mmtf-pyspark with Apache License 2.0

4 votes

def main(argv):

    # Name of prediction column
    label = argv[1]

    start = time.time()

    spark = SparkSession.builder \
                        .master("local[*]") \
                        .appName("datasetClassifier") \
                        .getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    vector = data.first()["features"]
    featureCount = len(vector)

    print(f"Feature count    : {featureCount}")
    classCount = int(data.select(label).distinct().count())
    print(f"Class count    : {classCount}")
    print(f"Dataset size (unbalanced)    : {data.count()}")
    data.groupby(label).count().show(classCount)

    data = datasetBalancer.downsample(data, label, 1)

    print(f"Dataset size (balanced)  : {data.count()}")
    data.groupby(label).count().show(classCount)

    testFraction = 0.3
    seed = 123

    # DecisionTree
    dtc = DecisionTreeClassifier()
    mcc = SparkMultiClassClassifier(dtc, label, testFraction, seed)
    matrics = mcc.fit(data)
    for k,v in matrics.items(): print(f"{k}\t{v}")

    # RandomForest
    rfc = RandomForestClassifier()
    mcc = SparkMultiClassClassifier(rfc, label, testFraction, seed)
    matrics = mcc.fit(data)
    for k,v in matrics.items(): print(f"{k}\t{v}")

    # LogisticRegression
    lr = LogisticRegression()
    mcc = SparkMultiClassClassifier(lr, label, testFraction, seed)
    matrics = mcc.fit(data)
    for k,v in matrics.items(): print(f"{k}\t{v}")

    # MultilayerPerceptronClassifier
    layers = [featureCount, 10, classCount]
    mpc = MultilayerPerceptronClassifier().setLayers(layers) \
                                          .setBlockSize(128) \
                                          .setSeed(1234) \
                                          .setMaxIter(200)
    mcc = SparkMultiClassClassifier(mpc, label, testFraction, seed)
    matrics = mcc.fit(data)
    for k,v in matrics.items(): print(f"{k}\t{v}")

    end = time.time()
    print("Time: %f  sec." %(end-start))

Source File: test_pipeline.py From onnxmltools with MIT License

4 votes

def test_model_pipeline_4_stage(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv')\
            .options(header='true', inferschema='true').load(input_path)
        cols = ['workclass', 'education', 'marital_status']
        training_data, test_data = full_data.select('income', *cols).limit(1000).randomSplit([0.9, 0.1],seed=1)

        stages = []
        for col in cols:
            stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
            stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))

        stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))
        stages.append(StringIndexer(inputCol='income', outputCol='label', handleInvalid='skip'))
        stages.append(LogisticRegression(maxIter=100, tol=0.0001))
        pipeline = Pipeline(stages=stages)

        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
            ('income', StringTensorType([1, 1])),
            ('workclass', StringTensorType([1, 1])),
            ('education', StringTensorType([1, 1])),
            ('marital_status', StringTensorType([1, 1]))
        ])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(test_data)
        data_np = {
            'income': test_data.select('income').toPandas().values,
            'workclass': test_data.select('workclass').toPandas().values,
            'education': test_data.select('education').toPandas().values,
            'marital_status': test_data.select('marital_status').toPandas().values
        }
        expected = [
            predicted.toPandas().label.values.astype(numpy.float32),
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                basename="SparkmlPipeline_4Stage")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['label', 'prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)

Python pyspark.ml.classification.LogisticRegression() Examples