org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS Scala Examples
The following examples show how to use org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS.
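Before diving into the project-specific examples, the following minimal sketch shows the core API flow the examples share: build an RDD[LabeledPoint], train with LogisticRegressionWithLBFGS, and call predict on new feature vectors. The tiny in-memory dataset and the object name MinimalLBFGSExample are made up for illustration and do not come from any of the projects below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// Hypothetical minimal example; names and data are illustrative only.
object MinimalLBFGSExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("MinimalLBFGSExample").setMaster("local[*]"))

    // Tiny made-up training set: label 1.0 roughly when the single feature exceeds 0.5.
    val training = sc.parallelize(Seq(
      LabeledPoint(0.0, Vectors.dense(0.1)),
      LabeledPoint(0.0, Vectors.dense(0.3)),
      LabeledPoint(1.0, Vectors.dense(0.7)),
      LabeledPoint(1.0, Vectors.dense(0.9))))

    // Train a binary logistic regression model with the L-BFGS optimizer.
    val model = new LogisticRegressionWithLBFGS()
      .setNumClasses(2)
      .run(training)

    // Predict the class of an unseen point; calling model.clearThreshold() first
    // would make predict return raw probabilities instead of 0/1 labels.
    println(model.predict(Vectors.dense(0.8)))

    sc.stop()
  }
}

Most of the examples below follow this same pattern, differing mainly in how the data is loaded (LIBSVM files, object files, or in-memory arrays) and which evaluation metrics are computed afterwards.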
Example 1
Source File: BinaryClassification.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.optimization.{L1Updater, SquaredL2Updater}
import org.apache.spark.mllib.util.MLUtils

// (The object declaration, the Params case class, and most of the OptionParser setup
// are elided in this listing; it resumes inside the parser's usage note.)

          spark-examples-*.jar \
          |  --algorithm LR --regType L2 --regParam 1.0 \
          |  data/mllib/sample_binary_classification_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"BinaryClassification with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val model = params.algorithm match {
      case LR =>
        val algorithm = new LogisticRegressionWithLBFGS()
        algorithm.optimizer
          .setNumIterations(params.numIterations)
          .setUpdater(updater)
          .setRegParam(params.regParam)
        algorithm.run(training).clearThreshold()
      case SVM =>
        val algorithm = new SVMWithSGD()
        algorithm.optimizer
          .setNumIterations(params.numIterations)
          .setStepSize(params.stepSize)
          .setUpdater(updater)
          .setRegParam(params.regParam)
        algorithm.run(training).clearThreshold()
    }

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    val metrics = new BinaryClassificationMetrics(predictionAndLabel)

    println(s"Test areaUnderPR = ${metrics.areaUnderPR()}.")
    println(s"Test areaUnderROC = ${metrics.areaUnderROC()}.")

    sc.stop()
  }
}
// scalastyle:on println
Example 2
Source File: LRAccuracyTest.scala From SparseML with Apache License 2.0
package MLlib

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel, SparseLogisticRegressionWithLBFGS}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkContext, SparkConf}

object LRAccuracyTest {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName(s"LogisticRegressionTest with $args").setMaster("local")
    val sc = new SparkContext(conf)
    Logger.getRootLogger.setLevel(Level.WARN)

    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").map(
      l => LabeledPoint(l.label, l.features.toSparse))

    // Split data into training (60%) and test (40%).
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Run training algorithm to build the model
    val model = new SparseLogisticRegressionWithLBFGS()
      .setNumClasses(5)
      .run(training)

    // Compute raw scores on the test set.
    val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }

    // Get evaluation metrics.
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val precision = metrics.precision
    println("Precision = " + precision)
  }
}
Example 3
Source File: LogisticRegressionExample.scala From spark-tutorial with Apache License 2.0
package se.uu.farmbio.tutorial

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils

object LogisticRegressionExample {

  def main(args: Array[String]) = {

    // Start the Spark context
    val conf = new SparkConf()
      .setAppName("LogisticRegression")
      .setMaster("local[*]")
    val sc = new SparkContext(conf)

    // Load pubchem.svm
    val data = MLUtils.loadLibSVMFile(sc, "pubchem.svm")

    // Split the data into training and test sets
    val splits = data.randomSplit(Array(0.8, 0.2), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Train the model using logistic regression with L-BFGS
    val lbfgs = new LogisticRegressionWithLBFGS()
    val model = lbfgs.run(training)
    model.clearThreshold() // so predict() returns probabilities instead of 0/1 labels

    // Compute the probability of belonging to the positive class for each test example
    val probAndLabels = test.map { testExample =>
      val probability = model.predict(testExample.features)
      (probability, testExample.label)
    }

    // Compute the area under the ROC curve using Spark's BinaryClassificationMetrics class
    val metrics = new BinaryClassificationMetrics(probAndLabels)
    val auROC = metrics.areaUnderROC()
    println("Area under ROC = " + auROC) // print the area under the ROC

    // Stop the Spark context
    sc.stop
  }
}
Example 4
Source File: SVMWithSGDDemo.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

// (The object declaration, data loading, and the SVMWithSGD portion of this demo are
// elided in this listing; the excerpt below shows the LogisticRegressionWithLBFGS part.)

    // Logistic regression that optimizes the loss with L-BFGS and supports multiclass
    // classification (BFGS is a rank-two quasi-Newton method)
    val modelBFGS = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training)

    // Compute raw scores on the test set.
    val predictionAndLabels = test.map {
      // A LabeledPoint is a local vector, dense or sparse, associated with a label
      case LabeledPoint(label, features) =>
        val prediction = modelBFGS.predict(features)
        (prediction, label)
    }

    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)
  }
}
Example 5
Source File: LogisticRegressionWithLBFGSDeom.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

// (The object declaration and data loading are elided in this listing; the excerpt below
// shows training and evaluation of the L-BFGS logistic regression model.)

    // Logistic regression that optimizes the loss with L-BFGS and supports multiclass
    // classification (BFGS is a rank-two quasi-Newton method)
    val modelBFGS = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training)

    // Compute raw scores on the test set.
    val predictionAndLabels = test.map {
      // A LabeledPoint is a local vector, dense or sparse, associated with a label
      case LabeledPoint(label, features) =>
        val prediction = modelBFGS.predict(features)
        (prediction, label)
    }

    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)
  }
}
Example 6
Source File: LogisticRegressionWithLBFGSExample.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

// (The object declaration and SparkContext setup are elided in this listing.)

    // Each LabeledPoint is a local vector, dense or sparse, associated with a label
    val points = Array(
      LabeledPoint(0.0, Vectors.dense(0.245)),
      LabeledPoint(0.0, Vectors.dense(0.247)),
      LabeledPoint(1.0, Vectors.dense(0.285)),
      LabeledPoint(1.0, Vectors.dense(0.299)),
      LabeledPoint(1.0, Vectors.dense(0.327)),
      LabeledPoint(1.0, Vectors.dense(0.347)),
      LabeledPoint(0.0, Vectors.dense(0.356)),
      LabeledPoint(1.0, Vectors.dense(0.36)),
      LabeledPoint(0.0, Vectors.dense(0.363)),
      LabeledPoint(1.0, Vectors.dense(0.364)),
      LabeledPoint(0.0, Vectors.dense(0.398)),
      LabeledPoint(1.0, Vectors.dense(0.4)),
      LabeledPoint(0.0, Vectors.dense(0.409)),
      LabeledPoint(1.0, Vectors.dense(0.421)),
      LabeledPoint(0.0, Vectors.dense(0.432)),
      LabeledPoint(1.0, Vectors.dense(0.473)),
      LabeledPoint(1.0, Vectors.dense(0.509)),
      LabeledPoint(1.0, Vectors.dense(0.529)),
      LabeledPoint(0.0, Vectors.dense(0.561)),
      LabeledPoint(0.0, Vectors.dense(0.569)),
      LabeledPoint(1.0, Vectors.dense(0.594)),
      LabeledPoint(1.0, Vectors.dense(0.638)),
      LabeledPoint(1.0, Vectors.dense(0.656)),
      LabeledPoint(1.0, Vectors.dense(0.816)),
      LabeledPoint(1.0, Vectors.dense(0.853)),
      LabeledPoint(1.0, Vectors.dense(0.938)),
      LabeledPoint(1.0, Vectors.dense(1.036)),
      LabeledPoint(1.0, Vectors.dense(1.045)))

    // Create an RDD from the data above
    val spiderRDD = sc.parallelize(points)

    // Train the model on the data (fitting an intercept is meaningful when all predictors can be 0)
    // Logistic regression that optimizes the loss with L-BFGS and supports multiclass classification
    val lr = new LogisticRegressionWithLBFGS().setIntercept(true)
    val model = lr.run(spiderRDD)

    // Predict the status of a spider of size 0.938
    val predict = model.predict(Vectors.dense(0.938))
  }
}
Example 7
Source File: MultilayerPerceptronClassifierSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.classification

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.LogisticRegressionSuite._
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.Row

class MultilayerPerceptronClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("XOR function learning as binary classification problem with two outputs.") {
    val dataFrame = sqlContext.createDataFrame(Seq(
        (Vectors.dense(0.0, 0.0), 0.0),
        (Vectors.dense(0.0, 1.0), 1.0),
        (Vectors.dense(1.0, 0.0), 1.0),
        (Vectors.dense(1.0, 1.0), 0.0))
      ).toDF("features", "label")
    val layers = Array[Int](2, 5, 2)
    val trainer = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(1)
      .setSeed(11L)
      .setMaxIter(100)
    val model = trainer.fit(dataFrame)
    val result = model.transform(dataFrame)
    val predictionAndLabels = result.select("prediction", "label").collect()
    predictionAndLabels.foreach { case Row(p: Double, l: Double) =>
      assert(p == l)
    }
  }

  // TODO: implement a more rigorous test
  test("3 class classification with 2 hidden layers") {
    val nPoints = 1000

    // The following coefficients are taken from OneVsRestSuite.scala
    // they represent 3-class iris dataset
    val coefficients = Array(
      -0.57997, 0.912083, -0.371077, -0.819866, 2.688191,
      -0.16624, -0.84355, -0.048509, -0.301789, 4.170682)

    val xMean = Array(5.843, 3.057, 3.758, 1.199)
    val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
    // the input seed is somewhat magic, to make this test pass
    val rdd = sc.parallelize(generateMultinomialLogisticInput(
      coefficients, xMean, xVariance, true, nPoints, 1), 2)
    val dataFrame = sqlContext.createDataFrame(rdd).toDF("label", "features")
    val numClasses = 3
    val numIterations = 100
    val layers = Array[Int](4, 5, 4, numClasses)
    val trainer = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(1)
      .setSeed(11L) // currently this seed is ignored
      .setMaxIter(numIterations)
    val model = trainer.fit(dataFrame)
    val numFeatures = dataFrame.select("features").first().getAs[Vector](0).size
    assert(model.numFeatures === numFeatures)
    val mlpPredictionAndLabels = model.transform(dataFrame).select("prediction", "label")
      .map { case Row(p: Double, l: Double) => (p, l) }
    // train multinomial logistic regression
    val lr = new LogisticRegressionWithLBFGS()
      .setIntercept(true)
      .setNumClasses(numClasses)
    lr.optimizer.setRegParam(0.0)
      .setNumIterations(numIterations)
    val lrModel = lr.run(rdd)
    val lrPredictionAndLabels = lrModel.predict(rdd.map(_.features)).zip(rdd.map(_.label))
    // MLP's predictions should not differ a lot from LR's.
    val lrMetrics = new MulticlassMetrics(lrPredictionAndLabels)
    val mlpMetrics = new MulticlassMetrics(mlpPredictionAndLabels)
    assert(mlpMetrics.confusionMatrix ~== lrMetrics.confusionMatrix absTol 100)
  }
}
Example 8
Source File: LogisticRegression.scala From Swallow with Apache License 2.0
// scalastyle:off println
package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

object LogisticRegression {

  def main(args: Array[String]): Unit = {
    var inputPath = ""

    if (args.length == 1) {
      inputPath = args(0)
    }

    val conf = new SparkConf().setAppName("LogisticRegressionWithLBFGS")
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    // $example on$
    // Load training data: an RDD[LabeledPoint] read back from a Hadoop object file
    val data: RDD[LabeledPoint] = sc.objectFile(inputPath)

    // Split data into training (60%) and test (40%).
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Run training algorithm to build the model
    val model = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training)

    // Compute raw scores on the test set.
    val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }

    val accuracy = predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / predictionAndLabels.count()
    println(s"Accuracy = $accuracy")

    sc.stop()
  }
}
// scalastyle:on println
Example 9
Source File: MyBinaryClassification.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter4

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils

object MyBinaryClassification {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("myBinaryClassification")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    // Load training data in LIBSVM format
    // https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
    val data = MLUtils.loadLibSVMFile(spark.sparkContext, "../data/sparkml2/chapter4/myBinaryClassificationData.txt")

    // Split data into training (60%) and test (40%)
    val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    training.cache()

    // Run training algorithm to build the model
    val model = new LogisticRegressionWithLBFGS()
      .setNumClasses(2)
      .run(training)

    // Clear the prediction threshold so the model will return probabilities
    model.clearThreshold

    // Compute raw scores on the test set
    val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }

    // Instantiate metrics object
    val metrics = new BinaryClassificationMetrics(predictionAndLabels)

    // Precision by threshold
    val precision = metrics.precisionByThreshold
    precision.foreach { case (t, p) =>
      println(s"Threshold: $t, Precision: $p")
    }

    // Recall by threshold
    val recall = metrics.recallByThreshold
    recall.foreach { case (t, r) =>
      println(s"Threshold: $t, Recall: $r")
    }

    // Precision-recall curve
    val PRC = metrics.pr

    // F-measure by threshold (beta = 1 by default)
    val f1Score = metrics.fMeasureByThreshold
    f1Score.foreach { case (t, f) =>
      println(s"Threshold: $t, F-score: $f, Beta = 1")
    }

    // F-measure by threshold with beta = 0.5
    val beta = 0.5
    val fScore = metrics.fMeasureByThreshold(beta)
    fScore.foreach { case (t, f) =>
      println(s"Threshold: $t, F-score: $f, Beta = 0.5")
    }

    // Area under the precision-recall curve
    val auPRC = metrics.areaUnderPR
    println("Area under precision-recall curve = " + auPRC)

    val thresholds = precision.map(_._1)

    // ROC curve and area under ROC
    val roc = metrics.roc
    val auROC = metrics.areaUnderROC
    println("Area under ROC = " + auROC)

    spark.stop()
  }
}