org.apache.spark.ml.classification.BinaryLogisticRegressionSummary Scala Examples
The following examples show how to use org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.
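All of the examples below reach the summary by casting the result of LogisticRegressionModel.summary (or an evaluation summary) to BinaryLogisticRegressionSummary. As a minimal, self-contained sketch of that access pattern before the full examples (the app name is a placeholder; the data path points at the sample file shipped with the Spark distribution):

import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression}
import org.apache.spark.sql.SparkSession

object SummarySketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("SummarySketch").getOrCreate()

    // Any DataFrame with "label" and "features" columns works here
    val training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val lrModel = new LogisticRegression().setMaxIter(10).fit(training)

    // For a binary label, the training summary can be cast to
    // BinaryLogisticRegressionSummary to reach the binary metrics
    val binarySummary = lrModel.summary.asInstanceOf[BinaryLogisticRegressionSummary]
    println(s"Area under ROC: ${binarySummary.areaUnderROC}")
    binarySummary.roc.show(5) // DataFrame of (FPR, TPR) points

    spark.stop()
  }
}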
Example 1
Source File: Describe.scala from Scala-Machine-Learning-Projects (MIT License)
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.{ Dataset, Row, SparkSession }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel }
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.linalg.{ Matrix, Vectors }
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

object Describe {
  case class CustomerAccount(
    state_code: String, account_length: Integer, area_code: String,
    international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
    total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
    total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
    total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
    total_international_mins: Double, total_international_calls: Double,
    total_international_charge: Double, total_international_num_calls: Double,
    churn: String)

  val schema = StructType(Array(
    StructField("state_code", StringType, true),
    StructField("account_length", IntegerType, true),
    StructField("area_code", StringType, true),
    StructField("international_plan", StringType, true),
    StructField("voice_mail_plan", StringType, true),
    StructField("num_voice_mail", DoubleType, true),
    StructField("total_day_mins", DoubleType, true),
    StructField("total_day_calls", DoubleType, true),
    StructField("total_day_charge", DoubleType, true),
    StructField("total_evening_mins", DoubleType, true),
    StructField("total_evening_calls", DoubleType, true),
    StructField("total_evening_charge", DoubleType, true),
    StructField("total_night_mins", DoubleType, true),
    StructField("total_night_calls", DoubleType, true),
    StructField("total_night_charge", DoubleType, true),
    StructField("total_international_mins", DoubleType, true),
    StructField("total_international_calls", DoubleType, true),
    StructField("total_international_charge", DoubleType, true),
    StructField("total_international_num_calls", DoubleType, true),
    StructField("churn", StringType, true)))

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Describe")
      .getOrCreate()

    spark.conf.set("spark.debug.maxToStringFields", 10000)

    // Reads the setting back with a default; the result is not used further
    val DEFAULT_MAX_TO_STRING_FIELDS = 2500
    if (SparkEnv.get != null) {
      SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS)
    } else {
      DEFAULT_MAX_TO_STRING_FIELDS
    }

    import spark.implicits._

    // Load the training set using the explicit schema defined above
    val trainSet: Dataset[CustomerAccount] = spark.read
      .option("inferSchema", "false")
      .format("com.databricks.spark.csv")
      .schema(schema)
      .load("data/churn-bigml-80.csv")
      .as[CustomerAccount]

    val statsDF = trainSet.describe()
    statsDF.show()

    trainSet.createOrReplaceTempView("UserAccount")
    spark.catalog.cacheTable("UserAccount")

    // Total call minutes per churn class
    spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()

    // Total charges per churn class, most expensive class first
    spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()

    trainSet.groupBy("churn").count.show()

    // .show() added: the original built this DataFrame without running an action
    spark.sqlContext.sql("SELECT churn, SUM(total_international_num_calls) FROM UserAccount GROUP BY churn").show()
  }
}
Example 2
Source File: ChurnPredictionLR.scala from Scala-Machine-Learning-Projects (MIT License)
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

object ChurnPredictionLR {
  def main(args: Array[String]) {
    val spark: SparkSession = SparkSessionCreate.createSession("ChurnPredictionLogisticRegression")
    import spark.implicits._

    val numFolds = 10
    val MaxIter: Seq[Int] = Seq(100)
    val RegParam: Seq[Double] = Seq(1.0) // L2 regularization parameter; set 0.10 with L1 regularization
    val Tol: Seq[Double] = Seq(1e-8)
    val ElasticNetParam: Seq[Double] = Seq(1.0) // Mixing parameter between L1 and L2

    val lr = new LogisticRegression()
      .setLabelCol("label")
      .setFeaturesCol("features")

    // Chain the indexers, the assembler, and the classifier in a Pipeline
    val pipeline = new Pipeline()
      .setStages(Array(PipelineConstruction.ipindexer,
        PipelineConstruction.labelindexer,
        PipelineConstruction.assembler,
        lr))

    // Search the logistic regression hyperparameter grid for the best model
    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.maxIter, MaxIter)
      .addGrid(lr.regParam, RegParam)
      .addGrid(lr.tol, Tol)
      .addGrid(lr.elasticNetParam, ElasticNetParam)
      .build()

    // Note: the evaluator scores the binary "prediction" column rather than
    // the default "rawPrediction" column
    val evaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("prediction")

    // Set up 10-fold cross-validation
    val crossval = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(numFolds)

    val cvModel = crossval.fit(Preprocessing.trainDF)

    val predictions = cvModel.transform(Preprocessing.testSet)
    val result = predictions.select("label", "prediction", "probability")
    val resultDF = result.withColumnRenamed("prediction", "Predicted_label")
    resultDF.show(10)

    val accuracy = evaluator.evaluate(predictions)
    println("Classification accuracy: " + accuracy)

    // Compute other performance metrics
    val predictionAndLabels = predictions
      .select("prediction", "label")
      .rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))

    val metrics = new BinaryClassificationMetrics(predictionAndLabels)
    val areaUnderPR = metrics.areaUnderPR
    println("Area under the precision-recall curve: " + areaUnderPR)

    val areaUnderROC = metrics.areaUnderROC
    println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)

    val lp = predictions.select("label", "prediction")
    val counttotal = predictions.count()
    val correct = lp.filter($"label" === $"prediction").count()
    val wrong = lp.filter(not($"label" === $"prediction")).count()
    val ratioWrong = wrong.toDouble / counttotal.toDouble
    val ratioCorrect = correct.toDouble / counttotal.toDouble

    // Confusion-matrix ratios (class 0.0 is treated as the positive class)
    val truep = lp.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val truen = lp.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val falsep = lp.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble
    val falsen = lp.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble

    println("Total Count: " + counttotal)
    println("Correct: " + correct)
    println("Wrong: " + wrong)
    println("Ratio wrong: " + ratioWrong)
    println("Ratio correct: " + ratioCorrect)
    println("Ratio true positive: " + truep)
    println("Ratio false positive: " + falsep)
    println("Ratio true negative: " + truen)
    println("Ratio false negative: " + falsen)
  }
}
Example 3
Source File: ModelEstimator.scala from Scala-for-Machine-Learning-Second-Edition (MIT License)
package org.scalaml.spark.mlpipeline

import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegressionModel}
import org.apache.spark.ml._
import org.apache.spark.sql
import sql._

  // Excerpt: the declaration of the enclosing class, whose apply method
  // fits the pipeline, is elided in this snippet.

  @throws(classOf[IllegalArgumentException])
  final def trainWithSummary(
      trainDf: DataFrame,
      stages: Array[PipelineStage]): Option[(Double, Double)] = {
    require(stages.size > 0, "Cannot process a pipeline without stages")

    // Print the schema of the training data frame
    trainDf.printSchema

    this(trainDf, stages).stages.last match {
      // For a binary classifier, the training summary can be cast to
      // BinaryLogisticRegressionSummary
      case lrModel: LogisticRegressionModel =>
        val binarySummary = lrModel.summary.asInstanceOf[BinaryLogisticRegressionSummary]

        // F1 score at the first threshold of the summary
        val f1: Double = binarySummary.fMeasureByThreshold.select("F-Measure").head.getDouble(0)
        Some(f1, binarySummary.areaUnderROC)
      case _ => None
    }
  }
}
// ------------------------------ EOF --------------------------------------------------------
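Example 3 only reads the F-measure at the first threshold; the same fMeasureByThreshold DataFrame can drive threshold selection. A sketch in the style of the Spark documentation, assuming the lrModel and binarySummary bound inside the match above:

import org.apache.spark.sql.functions.max

// Choose the decision threshold that maximizes the F-measure,
// then configure the model to use it
val fMeasure = binarySummary.fMeasureByThreshold
val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0)
val bestThreshold = fMeasure
  .where(fMeasure("F-Measure") === maxFMeasure)
  .select("threshold").head().getDouble(0)
lrModel.setThreshold(bestThreshold)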
Example 4
Source File: GBTLRExample.scala from spark-gbtlr (Apache License 2.0)
package org.apache.spark.examples.ml

import org.apache.spark.ml.gbtlr.GBTLRClassifier
import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession

// scalastyle:off println
object GBTLRExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[2]")
      .appName("gbtlr example")
      .getOrCreate()

    val startTime = System.currentTimeMillis()

    val dataset = spark.read.option("header", "true").option("inferSchema", "true")
      .option("delimiter", ";").csv("data/bank/bank-full.csv")

    // Index all categorical columns, including the label column "y"
    val columnNames = Array("job", "marital", "education",
      "default", "housing", "loan", "contact", "month", "poutcome", "y")
    val indexers = columnNames.map(name => new StringIndexer()
      .setInputCol(name).setOutputCol(name + "_index"))
    val pipeline = new Pipeline().setStages(indexers)
    val data1 = pipeline.fit(dataset).transform(dataset)
    val data2 = data1.withColumnRenamed("y_index", "label")

    // Assemble the numeric and indexed columns into a single feature vector
    val assembler = new VectorAssembler()
    assembler.setInputCols(Array("age", "job_index", "marital_index",
      "education_index", "default_index", "balance", "housing_index",
      "loan_index", "contact_index", "day", "month_index", "duration",
      "campaign", "pdays", "previous", "poutcome_index"))
    assembler.setOutputCol("features")
    val data3 = assembler.transform(data2)

    // 80/20 train/test split (randomSplit normalizes the weights)
    val data4 = data3.randomSplit(Array(4, 1))

    val gBTLRClassifier = new GBTLRClassifier()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setGBTMaxIter(10)
      .setLRMaxIter(100)
      .setRegParam(0.01)
      .setElasticNetParam(0.5)

    val model = gBTLRClassifier.fit(data4(0))
    val summary = model.evaluate(data4(1))
    val endTime = System.currentTimeMillis()

    // The evaluation summary exposes the underlying logistic regression
    // summary, which is cast to reach the binary metrics
    val auc = summary.binaryLogisticRegressionSummary
      .asInstanceOf[BinaryLogisticRegressionSummary].areaUnderROC
    println(s"Training and evaluating cost ${(endTime - startTime) / 1000} seconds")
    println(s"The model's AUC: ${auc}")
  }
}
// scalastyle:on println
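Once the gbtlr evaluation summary has been cast to BinaryLogisticRegressionSummary, as above, the standard curve DataFrames from that class become available as well. A hedged continuation, reusing the summary value defined in the example:

// Continuing Example 4: the cast exposes the usual binary metrics
val blrSummary = summary.binaryLogisticRegressionSummary
  .asInstanceOf[BinaryLogisticRegressionSummary]
blrSummary.roc.show(5)                  // (FPR, TPR) points
blrSummary.pr.show(5)                   // (recall, precision) points
blrSummary.precisionByThreshold.show(5) // precision at each decision threshold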