org.apache.spark.ml.classification.LogisticRegression Scala Example

Source File: NaiveBayes.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License

5 votes

package com.chapter12.NaiveBayes

import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}



object NaiveBayesExample {
  def main(args: Array[String]): Unit = {    
    // Create the Spark session 
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    // Load the data stored in LIBSVM format as a DataFrame.
    val data = spark.read.format("libsvm").load("C:/Users/rezkar/Downloads/spark-2.1.0-bin-hadoop2.7/data/sample.data")

    // Split the data into training and test sets (30% held out for testing)
    val Array(trainingData, validationData) = data.randomSplit(Array(0.75, 0.25), seed = 12345L)

    // Train a NaiveBayes model.
    val nb = new NaiveBayes().setSmoothing(0.00001)        
    val model = nb.fit(trainingData)

    // Select example rows to display.
    val predictions = model.transform(validationData)
    predictions.show()

    // Select (prediction, true label) and compute test error obtain evaluator and compute the classification performnce metrics like accuracy, precision, recall and f1 measure. 
    val evaluator = new BinaryClassificationEvaluator().setLabelCol("label").setMetricName("areaUnderROC")
    val evaluator1 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("accuracy")
    val evaluator2 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("weightedPrecision")
    val evaluator3 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("weightedRecall")
    val evaluator4 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("f1")

    // compute the classification accuracy, precision, recall, f1 measure and error on test data.
    val areaUnderROC = evaluator.evaluate(predictions)
    val accuracy = evaluator1.evaluate(predictions)
    val precision = evaluator2.evaluate(predictions)
    val recall = evaluator3.evaluate(predictions)
    val f1 = evaluator4.evaluate(predictions)
    
    // Print the performance metrics
    println("areaUnderROC = " + areaUnderROC)
    println("Accuracy = " + accuracy)
    println("Precision = " + precision)
    println("Recall = " + recall)
    println("F1 = " + f1)
    println(s"Test Error = ${1 - accuracy}")
    
    data.show(20)

    spark.stop()
  }
}

Source File: MulticlassLogisticRegressionWithElasticNetExample.scala From multi-tenancy-spark with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.classification.LogisticRegression
// $example off$
import org.apache.spark.sql.SparkSession

object MulticlassLogisticRegressionWithElasticNetExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MulticlassLogisticRegressionWithElasticNetExample")
      .getOrCreate()

    // $example on$
    // Load training data
    val training = spark
      .read
      .format("libsvm")
      .load("data/mllib/sample_multiclass_classification_data.txt")

    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for multinomial logistic regression
    println(s"Coefficients: \n${lrModel.coefficientMatrix}")
    println(s"Intercepts: ${lrModel.interceptVector}")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println

Source File: SimpleTextClassificationPipeline.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.examples.ml

import scala.beans.BeanInfo

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}

@BeanInfo
case class LabeledDocument(id: Long, text: String, label: Double)

@BeanInfo
case class Document(id: Long, text: String)


object SimpleTextClassificationPipeline {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Prepare training documents, which are labeled.
    val training = sc.parallelize(Seq(
      LabeledDocument(0L, "a b c d e spark", 1.0),
      LabeledDocument(1L, "b d", 0.0),
      LabeledDocument(2L, "spark f g h", 1.0),
      LabeledDocument(3L, "hadoop mapreduce", 0.0)))

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training.toDF())

    // Prepare test documents, which are unlabeled.
    val test = sc.parallelize(Seq(
      Document(4L, "spark i j k"),
      Document(5L, "l m n"),
      Document(6L, "spark hadoop spark"),
      Document(7L, "apache hadoop")))

    // Make predictions on test documents.
    model.transform(test.toDF())
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }

    sc.stop()
  }
}

Source File: OpLogisticRegressionTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License

5 votes

package com.salesforce.op.stages.impl.classification

import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.PredictionEquality
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel}
import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.linalg.Vectors
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner


@RunWith(classOf[JUnitRunner])
class OpLogisticRegressionTest extends OpEstimatorSpec[Prediction, OpPredictorWrapperModel[LogisticRegressionModel],
  OpPredictorWrapper[LogisticRegression, LogisticRegressionModel]] with PredictionEquality {

  override def specName: String = Spec[OpLogisticRegression]

  val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features",
    Seq[(RealNN, OPVector)](
      1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector,
      0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector,
      0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector,
      1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector,
      1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector,
      0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector,
      1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector,
      0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector
    )
  )
  val feature1 = rawFeature1.copy(isResponse = true)
  val estimator = new OpLogisticRegression().setInput(feature1, feature2)

  val expectedResult = Seq(
    Prediction(1.0, Array(-20.88, 20.88), Array(0.0, 1.0)),
    Prediction(0.0, Array(16.70, -16.7), Array(1.0, 0.0)),
    Prediction(0.0, Array(22.2, -22.2), Array(1.0, 0.0)),
    Prediction(1.0, Array(-18.35, 18.35), Array(0.0, 1.0)),
    Prediction(1.0, Array(-31.46, 31.46), Array(0.0, 1.0)),
    Prediction(0.0, Array(24.67, -24.67), Array(1.0, 0.0)),
    Prediction(1.0, Array(-22.07, 22.07), Array(0.0, 1.0)),
    Prediction(0.0, Array(20.9, -20.9), Array(1.0, 0.0))
  )

  it should "allow the user to set the desired spark parameters" in {
    estimator
      .setRegParam(0.1)
      .setElasticNetParam(0.1)
      .setMaxIter(20)
    estimator.fit(inputData)

    estimator.predictor.getRegParam shouldBe 0.1
    estimator.predictor.getElasticNetParam shouldBe 0.1
    estimator.predictor.getMaxIter shouldBe 20
  }
}

Source File: SparkRWrappers.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.ml.api.r

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.DataFrame

private[r] object SparkRWrappers {
  def fitRModelFormula(
      value: String,
      df: DataFrame,
      family: String,
      lambda: Double,
      alpha: Double): PipelineModel = {
    val formula = new RFormula().setFormula(value)
    val estimator = family match {
      case "gaussian" => new LinearRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
      case "binomial" => new LogisticRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
    }
    val pipeline = new Pipeline().setStages(Array(formula, estimator))
    pipeline.fit(df)
  }

  def getModelWeights(model: PipelineModel): Array[Double] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        Array(m.intercept) ++ m.weights.toArray
      case _: LogisticRegressionModel =>
        throw new UnsupportedOperationException(
          "No weights available for LogisticRegressionModel")  // SPARK-9492
    }
  }

  def getModelFeatures(model: PipelineModel): Array[String] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        val attrs = AttributeGroup.fromStructField(
          m.summary.predictions.schema(m.summary.featuresCol))
        Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
      case _: LogisticRegressionModel =>
        throw new UnsupportedOperationException(
          "No features names available for LogisticRegressionModel")  // SPARK-9492
    }
  }
}

Source File: ROC.scala From s4ds with Apache License 2.0

5 votes

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

import breeze.linalg._
import breeze.plot._
import org.jfree.chart.axis.NumberTickUnit


object ROC extends App {

  val conf = new SparkConf().setAppName("ROC")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  import sqlContext._
  import sqlContext.implicits._

  val transformedTest = sqlContext.read.parquet("transformedTest.parquet")

  val labelScores = transformedTest.select("probability", "label").map {
    case Row(probability:Vector, label:Double) => (probability(1), label)
  }

  val bm = new BinaryClassificationMetrics(labelScores, 300)
  val roc = bm.roc.collect
  
  roc.foreach { println }

  val falsePositives = roc.map { _._1 }
  val truePositives = roc.map { _._2 }

  val f = Figure()
  val p = f.subplot(0)
  p += plot(falsePositives, truePositives)
  p.xlabel = "false positives"
  p.ylabel = "true positives"
  p.xlim = (0.0, 0.1)
  p.xaxis.setTickUnit(new NumberTickUnit(0.01))
  p.yaxis.setTickUnit(new NumberTickUnit(0.1))
  f.refresh
  f.saveas("roc.png")
  

}

Source File: LogisticRegressionDemo.scala From s4ds with Apache License 2.0

5 votes

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.SaveMode

case class LabelledDocument(fileName:String, text:String, category:String)

object LogisticRegressionDemo extends App {

  val conf = new SparkConf().setAppName("LrTest")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  import sqlContext._
  import sqlContext.implicits._

  val spamText = sc.wholeTextFiles("spam/*")
  val hamText = sc.wholeTextFiles("ham/*")

  val spamDocuments = spamText.map { 
    case (fileName, text) => LabelledDocument(fileName, text, "spam")
  }
  val hamDocuments = hamText.map {
    case (fileName, text) => LabelledDocument(fileName, text, "ham")
  }

  val documentsDF = spamDocuments.union(hamDocuments).toDF
  documentsDF.persist

  val Array(trainDF, testDF) = documentsDF.randomSplit(Array(0.7, 0.3))

  val indexer = new StringIndexer().setInputCol("category").setOutputCol("label")
  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
  val hasher = new HashingTF().setInputCol("words").setOutputCol("features")
  val lr = new LogisticRegression().setMaxIter(50).setRegParam(0.0)

  val pipeline = new Pipeline().setStages(Array(indexer, tokenizer, hasher, lr))
  val model = pipeline.fit(trainDF)

  val transformedTrain = model.transform(trainDF)
  transformedTrain.persist
  
  val transformedTest = model.transform(testDF)
  transformedTest.persist

  println("in sample misclassified:", transformedTrain.filter($"prediction" !== $"label").count,
    " / ",transformedTrain.count)
  println("out sample misclassified:", transformedTest.filter($"prediction" !== $"label").count,
    " / ",transformedTest.count)

  transformedTrain.select("fileName", "label", "prediction", "probability")
    .write.mode(SaveMode.Overwrite).parquet("transformedTrain.parquet")
  transformedTest.select("fileName", "label", "prediction", "probability")
    .write.mode(SaveMode.Overwrite).parquet("transformedTest.parquet")
}

Source File: LogisticRegressionWithElasticNetExample.scala From BigDatalog with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.classification.LogisticRegression
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object LogisticRegressionWithElasticNetExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LogisticRegressionWithElasticNetExample")
    val sc = new SparkContext(conf)
    val sqlCtx = new SQLContext(sc)

    // $example on$
    // Load training data
    val training = sqlCtx.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for logistic regression
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: SimpleTextClassificationPipeline.scala From BigDatalog with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

import scala.beans.BeanInfo

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}

@BeanInfo
case class LabeledDocument(id: Long, text: String, label: Double)

@BeanInfo
case class Document(id: Long, text: String)


object SimpleTextClassificationPipeline {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Prepare training documents, which are labeled.
    val training = sc.parallelize(Seq(
      LabeledDocument(0L, "a b c d e spark", 1.0),
      LabeledDocument(1L, "b d", 0.0),
      LabeledDocument(2L, "spark f g h", 1.0),
      LabeledDocument(3L, "hadoop mapreduce", 0.0)))

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training.toDF())

    // Prepare test documents, which are unlabeled.
    val test = sc.parallelize(Seq(
      Document(4L, "spark i j k"),
      Document(5L, "l m n"),
      Document(6L, "spark hadoop spark"),
      Document(7L, "apache hadoop")))

    // Make predictions on test documents.
    model.transform(test.toDF())
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }

    sc.stop()
  }
}
// scalastyle:on println

Source File: SparkRWrappers.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.ml.api.r

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.DataFrame

private[r] object SparkRWrappers {
  def fitRModelFormula(
      value: String,
      df: DataFrame,
      family: String,
      lambda: Double,
      alpha: Double,
      standardize: Boolean,
      solver: String): PipelineModel = {
    val formula = new RFormula().setFormula(value)
    val estimator = family match {
      case "gaussian" => new LinearRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
        .setStandardization(standardize)
        .setSolver(solver)
      case "binomial" => new LogisticRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
        .setStandardization(standardize)
    }
    val pipeline = new Pipeline().setStages(Array(formula, estimator))
    pipeline.fit(df)
  }

  def getModelCoefficients(model: PipelineModel): Array[Double] = {
    model.stages.last match {
      case m: LinearRegressionModel => {
        val coefficientStandardErrorsR = Array(m.summary.coefficientStandardErrors.last) ++
          m.summary.coefficientStandardErrors.dropRight(1)
        val tValuesR = Array(m.summary.tValues.last) ++ m.summary.tValues.dropRight(1)
        val pValuesR = Array(m.summary.pValues.last) ++ m.summary.pValues.dropRight(1)
        if (m.getFitIntercept) {
          Array(m.intercept) ++ m.coefficients.toArray ++ coefficientStandardErrorsR ++
            tValuesR ++ pValuesR
        } else {
          m.coefficients.toArray ++ coefficientStandardErrorsR ++ tValuesR ++ pValuesR
        }
      }
      case m: LogisticRegressionModel => {
        if (m.getFitIntercept) {
          Array(m.intercept) ++ m.coefficients.toArray
        } else {
          m.coefficients.toArray
        }
      }
    }
  }

  def getModelDevianceResiduals(model: PipelineModel): Array[Double] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        m.summary.devianceResiduals
      case m: LogisticRegressionModel =>
        throw new UnsupportedOperationException(
          "No deviance residuals available for LogisticRegressionModel")
    }
  }

  def getModelFeatures(model: PipelineModel): Array[String] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        val attrs = AttributeGroup.fromStructField(
          m.summary.predictions.schema(m.summary.featuresCol))
        if (m.getFitIntercept) {
          Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
        } else {
          attrs.attributes.get.map(_.name.get)
        }
      case m: LogisticRegressionModel =>
        val attrs = AttributeGroup.fromStructField(
          m.summary.predictions.schema(m.summary.featuresCol))
        if (m.getFitIntercept) {
          Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
        } else {
          attrs.attributes.get.map(_.name.get)
        }
    }
  }

  def getModelName(model: PipelineModel): String = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        "LinearRegressionModel"
      case m: LogisticRegressionModel =>
        "LogisticRegressionModel"
    }
  }
}

Source File: ACMEModel.scala From cdsw-simple-serving with Apache License 2.0

5 votes

// Don't execute these lines in the workbench -- skip to "Start workbench session"
package acme
import org.apache.spark.ml.PipelineModel


import com.cloudera.datascience.cdsw.acme.ACMEData
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import scala.util.Random

// Read and cache training data prepared from acme-dataeng:
val training = ACMEData.readData()
training.cache()
training.show()

// Build a logistic regression model,
val assembler = new VectorAssembler().
  setInputCols(training.columns.filter(_ != "Occupancy")).
  setOutputCol("featureVec")

val lr = new LogisticRegression().
  setFeaturesCol("featureVec").
  setLabelCol("Occupancy").
  setRawPredictionCol("rawPrediction")

val pipeline =
  new Pipeline().setStages(Array(assembler, lr))

// and tune that model:
val paramGrid = new ParamGridBuilder().
  addGrid(lr.regParam, Seq(0.00001, 0.001, 0.1)).
  addGrid(lr.elasticNetParam, Seq(1.0)).
  build()
    
val eval = new BinaryClassificationEvaluator().
  setLabelCol("Occupancy").
  setRawPredictionCol("rawPrediction")

val validator = new TrainValidationSplit().
  setSeed(Random.nextLong()).
  setEstimator(pipeline).
  setEvaluator(eval).
  setEstimatorParamMaps(paramGrid).
  setTrainRatio(0.9)

val validatorModel = validator.fit(training)
val pipelineModel = validatorModel.bestModel.asInstanceOf[PipelineModel]
val lrModel = pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]
    
// Logistic regression model parameters:
training.columns.zip(lrModel.coefficients.toArray).foreach(println)

// Model hyperparameters:
lrModel.getElasticNetParam
lrModel.getRegParam
    
// Validation metric (accuracy):
validatorModel.validationMetrics.max
    
pipelineModel
// End workbench session

  }
}

Source File: OneHotEncoderDemo2.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License

5 votes

package com.chapter11.SparkMachineLearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{ OneHotEncoder, StringIndexer }
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.functions.year
import org.apache.spark.ml.{ Pipeline, PipelineStage }
import org.apache.spark.ml.classification.{ LogisticRegression, LogisticRegressionModel }
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.{ DataFrame, SparkSession }
import scala.collection.mutable
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

object OneHotEncoderDemo2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, "Jason", "Germany"),
        (1, "David", "France"),
        (2, "Martin", "Spain"),
        (3, "Jason", "USA"),
        (4, "Daiel", "UK"),
        (5, "Moahmed", "Bangladesh"),
        (6, "David", "Ireland"),
        (7, "Jason", "Netherlands"))).toDF("id", "name", "address")

    df.show(false)

    val indexer = new StringIndexer()
      .setInputCol("name")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")

    val encoded = encoder.transform(indexed)
    encoded.show()
    
    spark.stop()
  }
}

Source File: StringIndexerDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License

5 votes

package com.chapter11.SparkMachineLearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{ OneHotEncoder, StringIndexer }
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.functions.year
import org.apache.spark.ml.{ Pipeline, PipelineStage }
import org.apache.spark.ml.classification.{ LogisticRegression, LogisticRegressionModel }
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.{ DataFrame, SparkSession }
import scala.collection.mutable
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql._
import org.apache.spark.sql.SQLContext

object StringIndexerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, "Jason", "Germany"),
        (1, "David", "France"),
        (2, "Martin", "Spain"),
        (3, "Jason", "USA"),
        (4, "Daiel", "UK"),
        (5, "Moahmed", "Bangladesh"),
        (6, "David", "Ireland"),
        (7, "Jason", "Netherlands"))).toDF("id", "name", "address")

    df.show(false)

    val indexer = new StringIndexer()
      .setInputCol("name")
      .setOutputCol("label")
      .fit(df)

    val indexed = indexer.transform(df)
    indexed.show(false)

    spark.stop()
  }
}

Source File: LogisticRegressionRecommender.scala From wordpress-posts-recommender with Apache License 2.0

5 votes

package wordpressworkshop

import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

case class LogisticRegressionRecommender(training: DataFrame) {

  val lr = new LogisticRegression()
  val paramMap = ParamMap(lr.maxIter -> 20)
                 .put(lr.regParam -> 0.01)
                 .put(lr.probabilityCol -> "probability")

  val model: LogisticRegressionModel = lr.fit(training, paramMap)

  def metrics(testData: DataFrame) = {
    val predictionAndLabels: RDD[(Double, Double)] =
      model.transform(testData).map(row => row.getAs[Vector]("probability")(1) -> row.getAs[Double]("label"))

    new BinaryClassificationMetrics(predictionAndLabels)
  }

  def likeScores(testData: DataFrame): RDD[(Long, Long, Double)] =
    model.transform(testData)
    .map(row => (row.getAs[Long]("userId"), row.getAs[Long]("postId"), row.getAs[Vector]("probability")(1)))
}

Source File: TitanicLogisticRegression.scala From spark-spec with MIT License

5 votes

package com.github.mrpowers.spark.spec.ml.classification

import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.DataFrame

object TitanicLogisticRegression extends SparkSessionWrapper {

  def withVectorizedFeatures(
    featureColNames: Array[String] = Array("Gender", "Age", "SibSp", "Parch", "Fare"),
    outputColName: String = "features"
  )(df: DataFrame): DataFrame = {
    val assembler: VectorAssembler = new VectorAssembler()
      .setInputCols(featureColNames)
      .setOutputCol(outputColName)
    assembler.transform(df)
  }

  def withLabel(
    inputColName: String = "Survived",
    outputColName: String = "label"
  )(df: DataFrame) = {
    val labelIndexer: StringIndexer = new StringIndexer()
      .setInputCol(inputColName)
      .setOutputCol(outputColName)

    labelIndexer
      .fit(df)
      .transform(df)
  }

  def model(df: DataFrame = TitanicData.trainingDF()): LogisticRegressionModel = {
    val trainFeatures: DataFrame = df
      .transform(withVectorizedFeatures())
      .transform(withLabel())
      .select("features", "label")

    // only uses the features and label columns
    new LogisticRegression()
      .fit(trainFeatures)
  }

  def persistModel(): Unit = {
    model().save("./tmp/titanic_model/")
  }

}

Source File: LogisticRegressionSuite.scala From aardpfark with Apache License 2.0

5 votes

package com.ibm.aardpfark.spark.ml.classification

import com.ibm.aardpfark.pfa.ProbClassifierResult

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.{DataFrame, Row}

class LogisticRegressionSuite extends SparkClassifierPFASuiteBase[ProbClassifierResult] {
  import spark.implicits._

  def getOutput(df: DataFrame) = {
    df.select(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).map
    {
      case Row(p: Double, raw: Vector, pr: Vector) => (p, raw.toArray, pr.toArray)
    }.toDF(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).toJSON.collect()
  }

  val binaryData = spark.read.format("libsvm").load("data/sample_libsvm_data.txt")
  val multiData = spark.read.format("libsvm").load("data/sample_multiclass_classification_data.txt")

  val clf = new LogisticRegression()

  override val sparkTransformer = clf.fit(binaryData)
  val result = sparkTransformer.transform(binaryData)
  override val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect()
  override val expectedOutput = getOutput(result)

  // Additional tests
  test("LogisticRegression w/o fitIntercept") {
    val sparkTransformer = clf.setFitIntercept(false).fit(binaryData)
    val result = sparkTransformer.transform(binaryData)
    val expectedOutput = getOutput(result)

    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("LogisticRegression w/ non-default threshold") {
    val sparkTransformer = clf.setThreshold(0.0).fit(binaryData)
    val result = sparkTransformer.transform(binaryData)
    val expectedOutput = getOutput(result)

    parityTest(sparkTransformer, input, expectedOutput)

    val sparkTransformer2 = clf.setThreshold(1.0).fit(binaryData)
    val result2 = sparkTransformer2.transform(binaryData)
    val expectedOutput2 = getOutput(result2)

    parityTest(sparkTransformer2, input, expectedOutput2)
  }

  test("MLOR w/ intercept") {
    val sparkTransformer = clf.fit(multiData)
    val result = sparkTransformer.transform(multiData)
    val input =  withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect()
    val expectedOutput = getOutput(result)

    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("MLOR w/o intercept") {
    val sparkTransformer = clf.setFitIntercept(false).fit(multiData)
    val result = sparkTransformer.transform(multiData)
    val input =  withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect()
    val expectedOutput = getOutput(result)

    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("MLOR w/ thresholds") {
    val sparkTransformer = clf.setThresholds(Array(0.1, 0.6, 0.3)).fit(multiData)
    val result = sparkTransformer.transform(multiData)
    val input =  withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect()
    val expectedOutput = getOutput(result)

    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("MLOR w/ thresholds - one zero") {
    val sparkTransformer = clf.setThresholds(Array(0.0, 0.6, 0.3)).fit(multiData)
    val result = sparkTransformer.transform(multiData)
    val input =  withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect()
    val expectedOutput = getOutput(result)

    parityTest(sparkTransformer, input, expectedOutput)
  }

}

Source File: PipelineExampleTest.scala From apache-spark-test with Apache License 2.0

5 votes

package com.github.dnvriend.spark.ml

import com.github.dnvriend.TestSpec
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{ HashingTF, Tokenizer }
import org.apache.spark.ml.{ Pipeline, PipelineModel }
import org.apache.spark.sql.Row

class PipelineExampleTest extends TestSpec {

  it should "PipelineExample" in withSparkSession { spark =>
    import spark.implicits._

    // Prepare training documents from a list of (id, text, label) tuples.
    val training = Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    ).toDF("id", "text", "label")

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.01)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training)

    // Now we can optionally save the fitted pipeline to disk
    model.write.overwrite().save("/tmp/spark-logistic-regression-model")

    // We can also save this unfit pipeline to disk
    pipeline.write.overwrite().save("/tmp/unfit-lr-model")

    // And load it back in during production
    val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model")

    // Prepare test documents, which are unlabeled (id, text) tuples.
    val test = Seq(
      (4L, "spark i j k"),
      (5L, "l m n"),
      (6L, "mapreduce spark"),
      (7L, "apache hadoop"),
      (8L, "spark f g h"),
      (9L, "d e f spark a b c"),
      (10L, "spark baz bar a b c"),
      (11L, "foo bar a b c spark"),
      (12L, "a b c scala d e f"),
      (13L, "spark mapreduce")
    ).toDF("id", "text")

    // Make predictions on test documents.
    model.transform(test)
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach {
        case Row(id: Long, text: String, prob, prediction: Double) =>
          println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }
  }
}

Source File: MyPipeLine.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License

5 votes

package spark.ml.cookbook.chapter4

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession
import org.apache.log4j.{Level, Logger}

object MyPipeLine {

  def main(args: Array[String]): Unit = {


    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("My PipeLine")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val trainset = spark.createDataFrame(Seq(
      (1L, 1, "spark rocks"),
      (2L, 0, "flink is the best"),
      (3L, 1, "Spark rules"),
      (4L, 0, "mapreduce forever"),
      (5L, 0, "Kafka is great")
    )).toDF("id", "label", "words")

    val tokenizer = new Tokenizer()
      .setInputCol("words")
      .setOutputCol("tokens")

    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")

    val lr = new LogisticRegression()
      .setMaxIter(15)
      .setRegParam(0.01)

    // three stage pipeline
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(trainset)

    val testSet = spark.createDataFrame(Seq(
      (10L, 1, "use spark please"),
      (11L, 2, "Kafka")
    )).toDF("id", "label", "words")

    model.transform(testSet).select("probability","prediction").show(false)

    spark.stop()
  }
}

Source File: OnevsRest.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License

5 votes

package spark.ml.cookbook.chapter5

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

object OnevsRest {

  def main(args: Array[String]): Unit = {

    import org.apache.log4j.Logger
    import org.apache.log4j.Level

    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MLP")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val data = spark.read.format("libsvm")
      .load("../data/sparkml2/chapter5/iris.scale.txt")

    data.show(false)

    val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed = System.currentTimeMillis())

    // logistic regression classifier
    val lrc = new LogisticRegression()
      .setMaxIter(15)
      .setTol(1E-3)
      .setFitIntercept(true)

    val ovr = new OneVsRest().setClassifier(lrc)

    val ovrModel = ovr.fit(train)

    val predictions = ovrModel.transform(test)
    predictions.show(false)

    val eval = new MulticlassClassificationEvaluator()
      .setMetricName("accuracy")

    // compute the classification error on test data.
    val accuracy = eval.evaluate(predictions)
    println("Accuracy: " + eval.evaluate(predictions))
  }
}

Source File: MLPipelineTest.scala From Scala-for-Machine-Learning-Second-Edition with MIT License

5 votes

package org.scalaml.spark.mlpipeline

import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.param.ParamMap
import org.scalaml.Logging
import org.scalaml.spark.ResourcesLoader
import org.scalatest.{FlatSpec, Matchers}

final class MLPipelineTest extends FlatSpec with Matchers with Logging {
  protected val name = "Spark ML pipeline"

  final val trainFile = "/data/spark/mlpipeline_training.csv"
  final val testFile = "/data/spark/mlpipeline_test.csv"


  final val columns = Array[String]("date", "asset", "region", "agent")

  it should s"$name simple predictor" in {
    show(s"$name simple predictor")

    (for {
      trainPath <- ResourcesLoader.getPath(trainFile)
      testPath <- ResourcesLoader.getPath(testFile)
    } yield {
      val predictor = new SimplePredictor[LogisticRegressionModel](
        new LogisticRegression().setMaxIter(5).setRegParam(0.1),
        columns,
        trainPath
      )

      (predictor, predictor.classify(predictor(), testPath))
    }).map {
      case (predictor, output) => {
        output.printSchema
        val predictedValues = output.select("prediction").collect.map(_.getDouble(0))
        output.show

        predictor.stop
        predictedValues(0)
      } should be(0.0)
    }
  }

  
  it should s"$name cross validation" in {
    show(s"$name cross validation")

    (for {
      trainPath <- ResourcesLoader.getPath(trainFile)
      testPath <- ResourcesLoader.getPath(testFile)
    } yield {
      val lr = new LogisticRegression().setMaxIter(5).setRegParam(0.1)
      val paramsMap = new ParamMap().put(lr.maxIter -> 30).put(lr.regParam -> 0.1)
      val validator = new ValidatedPredictor[LogisticRegressionModel](lr, columns, trainPath)

      val (f1, auROC) = validator.trainingWithSummary.getOrElse((Double.NaN, Double.NaN))
      println(s"F1-measure = ${f1} auROC = ${auROC}")
      validator.stop
      f1 should be(0.025 +- 0.005)
      auROC should be(0.600 +- 0.005)
    })
  }
}

// --------------------------------  EOF ---------------------------------------------

Source File: ModelPersistence.scala From reactive-machine-learning-systems with MIT License

5 votes

package com.reactivemachinelearning

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{QuantileDiscretizer, VectorAssembler}
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}
import org.apache.spark.sql.SparkSession

object ModelPersistence extends App {

  val session = SparkSession.builder.appName("ModelPersistence").getOrCreate()

  val data = Seq(
    (0, 18.0, 0),
    (1, 20.0, 0),
    (2, 8.0, 1),
    (3, 5.0, 1),
    (4, 2.0, 0),
    (5, 21.0, 0),
    (6, 7.0, 1),
    (7, 18.0, 0),
    (8, 3.0, 1),
    (9, 22.0, 0),
    (10, 8.0, 1),
    (11, 2.0, 0),
    (12, 5.0, 1),
    (13, 4.0, 1),
    (14, 1.0, 0),
    (15, 11.0, 0),
    (16, 7.0, 1),
    (17, 15.0, 0),
    (18, 3.0, 1),
    (19, 20.0, 0))

  val instances = session.createDataFrame(data)
    .toDF("id", "seeds", "label")

  val discretizer = new QuantileDiscretizer()
    .setInputCol("seeds")
    .setOutputCol("discretized")
    .setNumBuckets(3)

  val assembler = new VectorAssembler()
    .setInputCols(Array("discretized"))
    .setOutputCol("features")

  val classifier = new LogisticRegression()
    .setMaxIter(5)

  val pipeline = new Pipeline()
    .setStages(Array(discretizer, assembler, classifier))

  val paramMaps = new ParamGridBuilder()
    .addGrid(classifier.regParam, Array(0.0, 0.1))
    .build()

  val evaluator = new BinaryClassificationEvaluator()

  val crossValidator = new CrossValidator()
    .setEstimator(pipeline)
    .setEvaluator(evaluator)
    .setNumFolds(2)
    .setEstimatorParamMaps(paramMaps)

  val model = crossValidator.fit(instances)

  model.write.overwrite().save("my-model")

  val persistedModel = CrossValidatorModel.load("./my-model")
  println(s"UID: ${persistedModel.uid}")

}

Source File: FeatureCrossSelectorExample.scala From automl with Apache License 2.0

5 votes

package com.tencent.angel.spark.automl.feature.examples

import org.apache.spark.SparkConf
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.operator.{VarianceSelector, VectorCartesian}
import org.apache.spark.sql.SparkSession

object FeatureCrossSelectorExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()

    val input = conf.get("spark.input.path", "data/a9a/a9a_123d_train_trans.libsvm")
    val numFeatures = conf.get("spark.num.feature", "123")
    val twoOrderNumFeatures = conf.getInt("spark.two.order.num.feature", 123)
    val threeOrderNumFeatures = conf.getInt("spark.three.order.num.feature", 123)

    val spark = SparkSession.builder().master("local").config(conf).getOrCreate()

    val data = spark.read.format("libsvm")
      .option("numFeatures", numFeatures)
      .load(input)
      .persist()

    val cartesian = new VectorCartesian()
      .setInputCols(Array("features", "features"))
      .setOutputCol("f_f")

    val selector = new VarianceSelector()
      .setFeaturesCol("f_f")
      .setOutputCol("selected_f_f")
      .setNumTopFeatures(twoOrderNumFeatures)

    val cartesian2 = new VectorCartesian()
      .setInputCols(Array("features", "selected_f_f"))
      .setOutputCol("f_f_f")

    val selector2 = new VarianceSelector()
      .setFeaturesCol("f_f_f")
      .setOutputCol("selected_f_f_f")
      .setNumTopFeatures(threeOrderNumFeatures)

    val assembler = new VectorAssembler()
      .setInputCols(Array("features", "selected_f_f", "selected_f_f_f"))
      .setOutputCol("assembled_features")

    val pipeline = new Pipeline()
      .setStages(Array(cartesian, selector, cartesian2, selector2, assembler))

    val crossDF = pipeline.fit(data).transform(data).persist()
    data.unpersist()
    crossDF.drop("f_f", "f_f_f", "selected_f_f", "selected_f_f_f")
    crossDF.show(1)

    val splitDF = crossDF.randomSplit(Array(0.9, 0.1))

    val trainDF = splitDF(0).persist()
    val testDF = splitDF(1).persist()

    val originalLR = new LogisticRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setMaxIter(20)
      .setRegParam(0.01)

    val originalPredictions = originalLR.fit(trainDF).transform(testDF)
    originalPredictions.show(1)
    val originalEvaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("rawPrediction")
      .setMetricName("areaUnderROC")
    val originalAUC = originalEvaluator.evaluate(originalPredictions)
    println(s"original features auc: $originalAUC")

    val crossLR = new LogisticRegression()
      .setFeaturesCol("assembled_features")
      .setLabelCol("label")
      .setMaxIter(20)
      .setRegParam(0.01)

    val crossPredictions = crossLR.fit(trainDF).transform(testDF)
    crossPredictions.show(1)
    val crossEvaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("rawPrediction")
      .setMetricName("areaUnderROC")
    val crossAUC = crossEvaluator.evaluate(crossPredictions)
    println(s"cross features auc: $crossAUC")

    spark.close()
  }
}

Source File: InnerProductNNCtrModel.scala From CTRmodel with Apache License 2.0

5 votes

package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{LogisticRegression, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class InnerProductNNCtrModel extends BaseCtrModel {

  def train(samples:DataFrame) : Unit = {
    //calculate inner product between item embedding and user embedding
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)

    val prePipelineModel = FeatureEngineering.preProcessInnerProductSamples(samplesWithInnerProduct)

    val preparedSamples = prePipelineModel.transform(samplesWithInnerProduct)

    //network architecture, better to keep tuning it until metrics converge
    val layers = Array[Int](preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length,
      preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length / 2, 2)


    val nnModel = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(150)      //max iterations, keep increasing it if loss function or metrics don't converge
      .setStepSize(0.005)   //learning step size, larger size will lead to loss vibration
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = prePipelineModel.stages ++ Array(nnModel)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samplesWithInnerProduct)
  }

  override def transform(samples:DataFrame):DataFrame = {
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    _pipelineModel.transform(samplesWithInnerProduct)
  }
}

Source File: LogisticRegressionCtrModel.scala From CTRmodel with Apache License 2.0

5 votes

package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.DataFrame

class LogisticRegressionCtrModel extends BaseCtrModel {

  def train(samples:DataFrame) : Unit = {

    val featureEngineeringStages:Array[PipelineStage] = FeatureEngineering.preProcessSamplesStages()

    val model:LogisticRegression = new LogisticRegression()
      .setMaxIter(20)           //max iteration
      .setRegParam(0.0)         //regularization parameter
      .setElasticNetParam(0.0)  //0-L2 regularization 1-L1 regularization
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = featureEngineeringStages ++ Array(model)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samples)
  }
}

Source File: Describe.scala From Scala-Machine-Learning-Projects with MIT License

5 votes

package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel }
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.max
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

import org.apache.spark._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset

import org.apache.spark.ml.linalg.{ Matrix, Vectors }
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row

object Describe {
  case class CustomerAccount(state_code: String, account_length: Integer, area_code: String,
    international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
    total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
    total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
    total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
    total_international_mins: Double, total_international_calls: Double, total_international_charge: Double,
    total_international_num_calls: Double, churn: String)

  val schema = StructType(Array(
    StructField("state_code", StringType, true),
    StructField("account_length", IntegerType, true),
    StructField("area_code", StringType, true),
    StructField("international_plan", StringType, true),
    StructField("voice_mail_plan", StringType, true),
    StructField("num_voice_mail", DoubleType, true),
    StructField("total_day_mins", DoubleType, true),
    StructField("total_day_calls", DoubleType, true),
    StructField("total_day_charge", DoubleType, true),
    StructField("total_evening_mins", DoubleType, true),
    StructField("total_evening_calls", DoubleType, true),
    StructField("total_evening_charge", DoubleType, true),
    StructField("total_night_mins", DoubleType, true),
    StructField("total_night_calls", DoubleType, true),
    StructField("total_night_charge", DoubleType, true),
    StructField("total_international_mins", DoubleType, true),
    StructField("total_international_calls", DoubleType, true),
    StructField("total_international_charge", DoubleType, true),
    StructField("total_international_num_calls", DoubleType, true),
    StructField("churn", StringType, true)))

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Desribe")
      .getOrCreate()

    spark.conf.set("spark.debug.maxToStringFields", 10000)
    val DEFAULT_MAX_TO_STRING_FIELDS = 2500
    if (SparkEnv.get != null) {
      SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS)
    } else {
      DEFAULT_MAX_TO_STRING_FIELDS
    }
    import spark.implicits._

    val trainSet: Dataset[CustomerAccount] = spark.read.
      option("inferSchema", "false")
      .format("com.databricks.spark.csv")
      .schema(schema)
      .load("data/churn-bigml-80.csv")
      .as[CustomerAccount]

    val statsDF = trainSet.describe()   
    statsDF.show()

    trainSet.createOrReplaceTempView("UserAccount")
    spark.catalog.cacheTable("UserAccount")
    
    spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()
    spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()
    trainSet.groupBy("churn").count.show()
    spark.sqlContext.sql("SELECT churn,SUM(total_international_num_calls) FROM UserAccount GROUP BY churn")
    
  }
}

Source File: ChurnPredictionLR.scala From Scala-Machine-Learning-Projects with MIT License

5 votes

package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

object ChurnPredictionLR {
  def main(args: Array[String]) {
    val spark: SparkSession = SparkSessionCreate.createSession("ChurnPredictionLogisticRegression")
    import spark.implicits._

    val numFolds = 10
    val MaxIter: Seq[Int] = Seq(100)
    val RegParam: Seq[Double] = Seq(1.0) // L2 regularization param, set 0.10 with L1 reguarization
    val Tol: Seq[Double] = Seq(1e-8)
    val ElasticNetParam: Seq[Double] = Seq(1.0) // Combination of L1 and L2

    val lr = new LogisticRegression()
                    .setLabelCol("label")
                    .setFeaturesCol("features")

    // Chain indexers and tree in a Pipeline.
    val pipeline = new Pipeline()
      .setStages(Array(PipelineConstruction.ipindexer,
        PipelineConstruction.labelindexer,
        PipelineConstruction.assembler,
        lr))

    // Search through decision tree's maxDepth parameter for best model                               
    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.maxIter, MaxIter)
      .addGrid(lr.regParam, RegParam)
      .addGrid(lr.tol, Tol)
      .addGrid(lr.elasticNetParam, ElasticNetParam)
      .build()

    val evaluator = new BinaryClassificationEvaluator()
                  .setLabelCol("label")
                  .setRawPredictionCol("prediction")

    // Set up 10-fold cross validation
    val crossval = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(numFolds)

    val cvModel = crossval.fit(Preprocessing.trainDF)   

    val predictions = cvModel.transform(Preprocessing.testSet)
    val result = predictions.select("label", "prediction", "probability")
    val resutDF = result.withColumnRenamed("prediction", "Predicted_label")
    resutDF.show(10)
    
    val accuracy = evaluator.evaluate(predictions)
    println("Classification accuracy: " + accuracy)    

    // Compute other performence metrices
    val predictionAndLabels = predictions
      .select("prediction", "label")
      .rdd.map(x => (x(0).asInstanceOf[Double], x(1)
        .asInstanceOf[Double]))

    val metrics = new BinaryClassificationMetrics(predictionAndLabels)
    val areaUnderPR = metrics.areaUnderPR
    println("Area under the precision-recall curve: " + areaUnderPR)
    
    val areaUnderROC = metrics.areaUnderROC
    println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)

    

    val lp = predictions.select("label", "prediction")
    val counttotal = predictions.count()
    val correct = lp.filter($"label" === $"prediction").count()
    val wrong = lp.filter(not($"label" === $"prediction")).count()
    val ratioWrong = wrong.toDouble / counttotal.toDouble
    val ratioCorrect = correct.toDouble / counttotal.toDouble
    val truep = lp.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val truen = lp.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val falsep = lp.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble
    val falsen = lp.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble

    println("Total Count: " + counttotal)
    println("Correct: " + correct)
    println("Wrong: " + wrong)
    println("Ratio wrong: " + ratioWrong)
    println("Ratio correct: " + ratioCorrect)
    println("Ratio true positive: " + truep)
    println("Ratio false positive: " + falsep)
    println("Ratio true negative: " + truen)
    println("Ratio false negative: " + falsen)
  }
}

Source File: VLORRealDataExample.scala From spark-vlbfgs with Apache License 2.0

5 votes

package org.apache.spark.ml.example

import org.apache.spark.ml.classification.{LogisticRegression, VLogisticRegression}
import org.apache.spark.sql.{Dataset, SparkSession}

object VLORRealDataExample {

  // https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#a9a
  def main(args: Array[String]) = {
    val spark = SparkSession
      .builder()
      .appName("VLogistic Regression real data example")
      .getOrCreate()

    val sc = spark.sparkContext

    val dataset1: Dataset[_] = spark.read.format("libsvm").load("data/a9a")

    val trainer = new LogisticRegression()
      .setFitIntercept(false)
      .setRegParam(0.5)
    val model = trainer.fit(dataset1)

    val vtrainer = new VLogisticRegression()
      .setColsPerBlock(100)
      .setRowsPerBlock(10)
      .setColPartitions(3)
      .setRowPartitions(3)
      .setRegParam(0.5)
    val vmodel = vtrainer.fit(dataset1)

    println(s"VLogistic regression coefficients: ${vmodel.coefficients}")
    println(s"Logistic regression coefficients: ${model.coefficients}")

    sc.stop()
  }
}

Source File: OneVsRestParitySpec.scala From mleap with Apache License 2.0

5 votes

package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame


class OneVsRestParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new OneVsRest().setClassifier(new LogisticRegression()).
      setLabelCol("fico_index").
      setFeaturesCol("features").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "classifier", "labelCol")
}

Source File: TestSparkMl.scala From mleap with Apache License 2.0

5 votes

package ml.combust.mleap.databricks.runtime.testkit

import java.io.File
import java.nio.file.{Files, StandardCopyOption}

import ml.combust.bundle.BundleFile
import org.apache.spark.ml.bundle.SparkBundleContext
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.SparkSession
import com.databricks.spark.avro._
import ml.combust.mleap.spark.SparkSupport._
import ml.combust.mleap.runtime.MleapSupport._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression

class TestSparkMl(session: SparkSession) extends Runnable {
  override def run(): Unit = {
    val sqlContext = session.sqlContext

    // Create a temporary file and copy the contents of the resource avro to it
    val path = Files.createTempFile("mleap-databricks-runtime-testkit", ".avro")
    Files.copy(getClass.getClassLoader.getResource("datasources/lending_club_sample.avro").openStream(),
      path,
      StandardCopyOption.REPLACE_EXISTING)

    val sampleData = sqlContext.read.avro(path.toString)
    sampleData.show()

    val stringIndexer = new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index")

    val featureAssembler = new VectorAssembler().
      setInputCols(Array(stringIndexer.getOutputCol, "dti", "loan_amount")).
      setOutputCol("features")

    val logisticRegression = new LogisticRegression().
      setFeaturesCol(featureAssembler.getOutputCol).
      setLabelCol("approved").
      setPredictionCol("prediction")

    val pipeline = new Pipeline().setStages(Array(stringIndexer, featureAssembler, logisticRegression))

    val model = pipeline.fit(sampleData)

    val modelPath = Files.createTempFile("mleap-databricks-runtime-testkit", ".zip")
    Files.delete(modelPath)

    // Save the model
    {
      println("Writing model to...", modelPath)
      implicit val sbc = SparkBundleContext.defaultContext.withDataset(model.transform(sampleData))
      val bf = BundleFile(new File(modelPath.toString))
      model.writeBundle.save(bf).get
      bf.close()
    }

    // Load the model
    {
      val bf = BundleFile(new File(modelPath.toString))
      bf.loadMleapBundle().get
      bf.close()
    }
  }
}

Source File: MulticlassLogisticRegressionWithElasticNetExample.scala From drizzle-spark with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.classification.LogisticRegression
// $example off$
import org.apache.spark.sql.SparkSession

object MulticlassLogisticRegressionWithElasticNetExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MulticlassLogisticRegressionWithElasticNetExample")
      .getOrCreate()

    // $example on$
    // Load training data
    val training = spark
      .read
      .format("libsvm")
      .load("data/mllib/sample_multiclass_classification_data.txt")

    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for multinomial logistic regression
    println(s"Coefficients: \n${lrModel.coefficientMatrix}")
    println(s"Intercepts: ${lrModel.interceptVector}")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println

Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame


object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect/totalPoints
    println("Accuracy of LogisticRegression is: ", accuracy)

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LR.xls")
    holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1).saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/Actual.xls")

    savePredictions(holdout, dataFrame, rm, "/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LogisticRegression.csv")
  }

  def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = {
    println("Mean Squared Error:", regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError)

    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}

Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame


object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect/totalPoints
    println("Accuracy of LogisticRegression is: ", accuracy)

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/LR.xls")
    holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/Actual.xls")

    savePredictions(holdout, dataFrame, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/LogisticRegression.csv")
  }

  def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = {
    println("Mean Squared Error:", regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError)

    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}

Source File: TextClassificationPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

package org.textclassifier

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
import org.utils.StandaloneSpark



object TextClassificationPipeline {

  def main(args: Array[String]): Unit = {
    val spark = StandaloneSpark.getSparkInstance()

    // Prepare training documents from a list of (id, text, label) tuples.
    val training = spark.createDataFrame(Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    )).toDF("id", "text", "label")

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training)

    // Now we can optionally save the fitted pipeline to disk
    model.write.overwrite().save("/tmp/spark-logistic-regression-model")

    // We can also save this unfit pipeline to disk
    pipeline.write.overwrite().save("/tmp/unfit-lr-model")

    // And load it back in during production
    val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model")

    // Prepare test documents, which are unlabeled (id, text) tuples.
    val test = spark.createDataFrame(Seq(
      (4L, "spark i j k"),
      (5L, "l m n"),
      (6L, "spark hadoop spark"),
      (7L, "apache hadoop")
    )).toDF("id", "text")

    // Make predictions on test documents.
    model.transform(test)
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }
  }

}

Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame


object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect/totalPoints
    println("Accuracy of LogisticRegression is: ", accuracy)
  }

  def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = {
    println("Mean Squared Error:", regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError)

    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}

Source File: LogisticRegressionWorkload.scala From spark-bench with Apache License 2.0

5 votes

package com.ibm.sparktc.sparkbench.workload.ml

import com.ibm.sparktc.sparkbench.utils.GeneralFunctions._
import com.ibm.sparktc.sparkbench.utils.SaveModes
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator => BCE}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

// ¯\_(ツ)_/¯
// the logic for this workload came from:
// https://github.com/szilard/benchm-ml/blob/master/1-linear/5-spark.txt
// ¯\_(ツ)_/¯

case class LogisticRegressionResult(
                                     name: String,
                                     appid: String,
                                     start_time: Long,
                                     input: String,
                                     train_count: Long,
                                     train_time: Long,
                                     test_file: String,
                                     test_count: Long,
                                     test_time: Long,
                                     load_time: Long,
                                     count_time: Long,
                                     total_runtime: Long,
                                     area_under_roc: Double
                                   )

object LogisticRegressionWorkload extends WorkloadDefaults {
  val name = "lr-bml"
  def apply(m: Map[String, Any]) = new LogisticRegressionWorkload(
    input = Some(getOrThrow(m, "input").asInstanceOf[String]),
    output = getOrDefault[Option[String]](m, "workloadresultsoutputdir", None),
    saveMode = getOrDefault[String](m, "save-mode", SaveModes.error),
    testFile = getOrThrow(m, "testfile").asInstanceOf[String],
    numPartitions = getOrDefault[Int](m, "numpartitions", 32),
    cacheEnabled = getOrDefault[Boolean](m, "cacheenabled", true)
  )

}

case class LogisticRegressionWorkload(
                                       input: Option[String],
                                       output: Option[String],
                                       saveMode: String,
                                       testFile: String,
                                       numPartitions: Int,
                                       cacheEnabled: Boolean
  ) extends Workload {

  private[ml] def load(filename: String)(implicit spark: SparkSession): DataFrame = {
    import spark.implicits._
    spark.sparkContext.textFile(filename)
      .map { line =>
        val vv = line.split(',').map(_.toDouble)
        val label = vv(0)
        val features = Vectors.dense(vv.slice(1, vv.length)).toSparse
        (label, features)
      }.toDF("label", "features")
  }

  private[ml] def ld(fn: String)(implicit spark: SparkSession) = time {
    val ds = load(fn)(spark).repartition(numPartitions)
    if (cacheEnabled) ds.cache
    ds
  }

  override def doWorkload(df: Option[DataFrame], spark: SparkSession): DataFrame = {
    val startTime = System.currentTimeMillis
    val (ltrainTime, d_train) = ld(s"${input.get}")(spark)
    val (ltestTime, d_test) = ld(s"$testFile")(spark)
    val (countTime, (trainCount, testCount)) = time { (d_train.count(), d_test.count()) }
    val (trainTime, model) = time(new LogisticRegression().setTol(1e-4).fit(d_train))
    val (testTime, areaUnderROC) = time(new BCE().setMetricName("areaUnderROC").evaluate(model.transform(d_test)))

    val loadTime = ltrainTime + ltestTime

    //spark.createDataFrame(Seq(SleepResult("sleep", timestamp, t)))

    spark.createDataFrame(Seq(LogisticRegressionResult(
      name = "lr-bml",
      appid = spark.sparkContext.applicationId,
      startTime,
      input.get,
      train_count = trainCount,
      trainTime,
      testFile,
      test_count = testCount,
      testTime,
      loadTime,
      countTime,
      loadTime + trainTime + testTime,
      areaUnderROC
    )))
  }
}

Source File: MulticlassLogisticRegressionWithElasticNetExample.scala From sparkoscope with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.classification.LogisticRegression
// $example off$
import org.apache.spark.sql.SparkSession

object MulticlassLogisticRegressionWithElasticNetExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MulticlassLogisticRegressionWithElasticNetExample")
      .getOrCreate()

    // $example on$
    // Load training data
    val training = spark
      .read
      .format("libsvm")
      .load("data/mllib/sample_multiclass_classification_data.txt")

    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for multinomial logistic regression
    println(s"Coefficients: \n${lrModel.coefficientMatrix}")
    println(s"Intercepts: ${lrModel.interceptVector}")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println

Source File: ClassifiersImpl.scala From spark_training with Apache License 2.0

5 votes

package com.malaska.spark.training.machinelearning.common

import org.apache.spark.ml.classification.{DecisionTreeClassifier, GBTClassifier, LogisticRegression, NaiveBayes}
import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, RegressionEvaluator}
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.sql._

object ClassifiersImpl {
  def logisticRegression(trainingLabeledPointDf: DataFrame,
                         testPercentage:Double): Unit = {
    val mlr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    val splits = trainingLabeledPointDf.randomSplit(Array(testPercentage, 1-testPercentage))

    val model = mlr.fit(splits(0))

    val trainTransformed = model.transform(splits(1))

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(trainTransformed)
    println("Test set accuracy of logisticRegression = " + accuracy)

    //println(model)
  }

  def gbtClassifer(trainingLabeledPointDf: DataFrame,
                   testPercentage:Double): Unit = {
    val gbt = new GBTClassifier()

    val splits = trainingLabeledPointDf.randomSplit(Array(testPercentage, 1-testPercentage))

    val model = gbt.fit(splits(0))

    val trainTransformed = model.transform(splits(1))

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(trainTransformed)
    println("Test set accuracy of gbtClassifier = " + accuracy)

    //println(model)
    //println(model.toDebugString)
  }

  def randomForestRegressor(trainingLabeledPointDf: DataFrame,
                            impurity:String,
                            maxDepth:Int,
                            maxBins:Int,
                            testPercentage:Double): Unit = {
    val rf = new RandomForestRegressor()

    rf.setImpurity(impurity)
    rf.setMaxDepth(maxDepth)
    rf.setMaxBins(maxBins)

    val splits = trainingLabeledPointDf.randomSplit(Array(testPercentage, 1-testPercentage))

    val model = rf.fit(splits(0))
    val trainTransformed = model.transform(splits(1))

    

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(trainTransformed)
    println("Test set accuracy of NaiveBayer = " + accuracy)
  }
}

Source File: TrainNewsClassWithLRDemo.scala From CkoocNLP with Apache License 2.0

5 votes

package applications.mining

import config.paramconf.ClassParams
import functions.Preprocessor
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession


object TrainNewsClassWithLRDemo extends Serializable {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("train news with LR Demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/train")
    val filePath = args(0)

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    val preprocessor = new Preprocessor
    val pipeline = preprocessor.preprocess(data)

    //LR模型训练
    val params = new ClassParams
    val logisticRegression = new LogisticRegression()
      .setTol(params.converTol)
      .setMaxIter(params.maxIteration)
      .setRegParam(params.regParam)
      .setElasticNetParam(params.elasticNetParam)
      .setLabelCol("indexedLabel")
      .setFeaturesCol("features")

    val indexModel = pipeline.getStages(1).asInstanceOf[StringIndexerModel]
    //索引标签化
    val labelConverter = new IndexToString()
      .setLabels(indexModel.labels)
      .setInputCol(logisticRegression.getPredictionCol)
      .setOutputCol("predictedLabel")

    val stages = pipeline.getStages ++ Array(logisticRegression, labelConverter)
    pipeline.setStages(stages)

    val model = pipeline.fit(data)
    model.write.overwrite().save(params.LRModelPath)

    data.unpersist()
    spark.stop()
  }
}

org.apache.spark.ml.classification.LogisticRegression Scala Examples