Python pyspark.ml.classification.LogisticRegression() Examples

The following are 14 code examples of pyspark.ml.classification.LogisticRegression(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.ml.classification , or try the search function .
Example #1
Source File: named_image_test.py    From spark-deep-learning with Apache License 2.0 6 votes vote down vote up
def test_featurizer_in_pipeline(self):
        """
        Tests that featurizer fits into an MLlib Pipeline.
        Does not test how good the featurization is for generalization.
        """
        featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                         modelName=self.name)
        lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
        pipeline = Pipeline(stages=[featurizer, lr])

        # add arbitrary labels to run logistic regression
        # TODO: it's weird that the test fails on some combinations of labels. check why.
        label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
        train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["image"]["origin"]))

        lrModel = pipeline.fit(train_df)
        # see if we at least get the training examples right.
        # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
        pred_df_collected = lrModel.transform(train_df).collect()
        for row in pred_df_collected:
            self.assertEqual(int(row.prediction), row.label) 
Example #2
Source File: test_spark_model_export.py    From mlflow with Apache License 2.0 6 votes vote down vote up
def spark_model_estimator(iris_df, spark_context):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    features_df = assembler.transform(iris_spark_df)
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    # Fit the model
    model = lr.fit(features_df)
    preds_df = model.transform(features_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model,
                              spark_df=features_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds) 
Example #3
Source File: test_one_vs_rest.py    From onnxmltools with MIT License 6 votes vote down vote up
def test_one_vs_rest(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "sample_multiclass_classification_data.txt")
        data = self.spark.read.format("libsvm").load(input_path)
        lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(data)

        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(model, 'Sparkml OneVsRest', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlOneVsRest")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #4
Source File: spark_ml_pipline.py    From Hanhan-Spark-Python with MIT License 5 votes vote down vote up
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output) 
Example #5
Source File: sc_classification.py    From atap with Apache License 2.0 5 votes vote down vote up
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    gbtModel = model.stages[2]
    print(gbtModel)  # summary only 
Example #6
Source File: taar_ensemble.py    From telemetry-airflow with Mozilla Public License 2.0 5 votes vote down vote up
def dump_training_info(blorModel):
    """
    This function is useful for debugging when we do not converge to a
    solution during LogisticRegression.
    """
    trainingSummary = blorModel.summary

    print("Total iterations: %d" % trainingSummary.totalIterations)
    print("Intercepts: " + str(blorModel.intercept))
    print("Coefficients: " + str(blorModel.coefficients))
    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective) 
Example #7
Source File: taar_ensemble.py    From telemetry-airflow with Mozilla Public License 2.0 5 votes vote down vote up
def compute_regression(spark, rdd_list, regParam, elasticNetParam):
    df0 = spark.sparkContext.union(rdd_list).toDF()
    blor = LogisticRegression(
        maxIter=50,
        regParam=regParam,
        weightCol="weight",
        elasticNetParam=elasticNetParam,
    )

    blorModel = blor.fit(df0)
    return blorModel 
Example #8
Source File: taar_ensemble.py    From python_mozetl with MIT License 5 votes vote down vote up
def dump_training_info(blorModel):
    """
    This function is useful for debugging when we do not converge to a
    solution during LogisticRegression.
    """
    trainingSummary = blorModel.summary

    print("Total iterations: %d" % trainingSummary.totalIterations)
    print("Intercepts: " + str(blorModel.intercept))
    print("Coefficients: " + str(blorModel.coefficients))
    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective) 
Example #9
Source File: taar_ensemble.py    From python_mozetl with MIT License 5 votes vote down vote up
def compute_regression(spark, rdd_list, regParam, elasticNetParam):
    df0 = spark.sparkContext.union(rdd_list).toDF()
    blor = LogisticRegression(
        maxIter=50,
        regParam=regParam,
        weightCol="weight",
        elasticNetParam=elasticNetParam,
    )

    blorModel = blor.fit(df0)
    return blorModel 
Example #10
Source File: test_spark_model_export.py    From mlflow with Apache License 2.0 5 votes vote down vote up
def spark_model_iris(iris_df):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(iris_spark_df)
    preds_df = model.transform(iris_spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model,
                              spark_df=iris_spark_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds) 
Example #11
Source File: converter_test.py    From spark-sklearn with Apache License 2.0 5 votes vote down vote up
def test_LogisticRegression_spark2skl(self):
        lr = LogisticRegression().fit(self.df)
        skl_lr = self.converter.toSKLearn(lr)
        self.assertTrue(isinstance(skl_lr, SKL_LogisticRegression),
                        "Expected sklearn LogisticRegression but found type %s" % type(skl_lr))
        self._compare_GLMs(skl_lr, lr)
        # Make sure this doesn't throw an error
        skl_lr.predict_proba(self.X) 
Example #12
Source File: test_linear_classifier.py    From onnxmltools with MIT License 5 votes vote down vote up
def test_model_logistic_regression_binary_class(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
        original_data = self.spark.read.format("libsvm").load(input_path)
        #
        # truncate the features
        #
        self.spark.udf.register("truncateFeatures", lambda x: SparseVector(5, range(0,5), x.toArray()[125:130]),
                                VectorUDT())
        data = original_data.selectExpr("label", "truncateFeatures(features) as features")
        lr = LogisticRegression(maxIter=100, tol=0.0001)
        model = lr.fit(data)
        # the name of the input for Logistic Regression is 'features'
        C = model.numFeatures
        model_onnx = convert_sparkml(model, 'sparkml logistic regression', [('features', FloatTensorType([1, C]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        import pandas
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        ]
        # known error in onnxruntime 0.3.0 case
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlLogisticRegression")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Example #13
Source File: datasetClassifier.py    From mmtf-pyspark with Apache License 2.0 4 votes vote down vote up
def main(argv):

    # Name of prediction column
    label = argv[1]

    start = time.time()

    spark = SparkSession.builder \
                        .master("local[*]") \
                        .appName("datasetClassifier") \
                        .getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    vector = data.first()["features"]
    featureCount = len(vector)

    print(f"Feature count    : {featureCount}")
    classCount = int(data.select(label).distinct().count())
    print(f"Class count    : {classCount}")
    print(f"Dataset size (unbalanced)    : {data.count()}")
    data.groupby(label).count().show(classCount)

    data = datasetBalancer.downsample(data, label, 1)

    print(f"Dataset size (balanced)  : {data.count()}")
    data.groupby(label).count().show(classCount)

    testFraction = 0.3
    seed = 123

    # DecisionTree
    dtc = DecisionTreeClassifier()
    mcc = SparkMultiClassClassifier(dtc, label, testFraction, seed)
    matrics = mcc.fit(data)
    for k,v in matrics.items(): print(f"{k}\t{v}")

    # RandomForest
    rfc = RandomForestClassifier()
    mcc = SparkMultiClassClassifier(rfc, label, testFraction, seed)
    matrics = mcc.fit(data)
    for k,v in matrics.items(): print(f"{k}\t{v}")

    # LogisticRegression
    lr = LogisticRegression()
    mcc = SparkMultiClassClassifier(lr, label, testFraction, seed)
    matrics = mcc.fit(data)
    for k,v in matrics.items(): print(f"{k}\t{v}")

    # MultilayerPerceptronClassifier
    layers = [featureCount, 10, classCount]
    mpc = MultilayerPerceptronClassifier().setLayers(layers) \
                                          .setBlockSize(128) \
                                          .setSeed(1234) \
                                          .setMaxIter(200)
    mcc = SparkMultiClassClassifier(mpc, label, testFraction, seed)
    matrics = mcc.fit(data)
    for k,v in matrics.items(): print(f"{k}\t{v}")

    end = time.time()
    print("Time: %f  sec." %(end-start)) 
Example #14
Source File: test_pipeline.py    From onnxmltools with MIT License 4 votes vote down vote up
def test_model_pipeline_4_stage(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv')\
            .options(header='true', inferschema='true').load(input_path)
        cols = ['workclass', 'education', 'marital_status']
        training_data, test_data = full_data.select('income', *cols).limit(1000).randomSplit([0.9, 0.1],seed=1)

        stages = []
        for col in cols:
            stages.append(StringIndexer(inputCol=col, outputCol=col+'_index', handleInvalid='skip'))
            stages.append(OneHotEncoderEstimator(inputCols=[col+'_index'], outputCols=[col+'_vec'], dropLast=False))

        stages.append(VectorAssembler(inputCols=[c+'_vec' for c in cols], outputCol='features'))
        stages.append(StringIndexer(inputCol='income', outputCol='label', handleInvalid='skip'))
        stages.append(LogisticRegression(maxIter=100, tol=0.0001))
        pipeline = Pipeline(stages=stages)

        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
            ('income', StringTensorType([1, 1])),
            ('workclass', StringTensorType([1, 1])),
            ('education', StringTensorType([1, 1])),
            ('marital_status', StringTensorType([1, 1]))
        ])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(test_data)
        data_np = {
            'income': test_data.select('income').toPandas().values,
            'workclass': test_data.select('workclass').toPandas().values,
            'education': test_data.select('education').toPandas().values,
            'marital_status': test_data.select('marital_status').toPandas().values
        }
        expected = [
            predicted.toPandas().label.values.astype(numpy.float32),
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                basename="SparkmlPipeline_4Stage")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['label', 'prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)