org.apache.spark.mllib.classification.SVMWithSGD Scala Examples
The following examples show how to use org.apache.spark.mllib.classification.SVMWithSGD.
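Before the examples, here is a minimal sketch of the typical train/predict cycle with SVMWithSGD. The object name, the local master setting, and the data path are illustrative assumptions; the API calls themselves (MLUtils.loadLibSVMFile, SVMWithSGD.train, SVMModel.predict) are the same ones the examples below use.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.util.MLUtils

object SVMQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SVMQuickStart").setMaster("local[2]"))

    // Load labeled points in LIBSVM format (path is an assumption for illustration)
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

    // Train a linear SVM with 100 iterations of SGD; the returned SVMModel
    // predicts 0/1 class labels using the default threshold of 0.0
    val model: SVMModel = SVMWithSGD.train(data, 100)

    val firstPrediction = model.predict(data.first().features)
    println("Prediction for first point: " + firstPrediction)

    sc.stop()
  }
}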
Example 1
Source File: BinaryClassification.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.optimization.{L1Updater, SquaredL2Updater}
import org.apache.spark.mllib.util.MLUtils

object BinaryClassification {

  object Algorithm extends Enumeration {
    type Algorithm = Value
    val SVM, LR = Value
  }

  object RegType extends Enumeration {
    type RegType = Value
    val L1, L2 = Value
  }

  import Algorithm._
  import RegType._

  case class Params(
      input: String = null,
      numIterations: Int = 100,
      stepSize: Double = 1.0,
      algorithm: Algorithm = LR,
      regType: RegType = L2,
      regParam: Double = 0.01) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("BinaryClassification") {
      head("BinaryClassification: an example app for binary classification.")
      opt[Int]("numIterations")
        .text("number of iterations")
        .action((x, c) => c.copy(numIterations = x))
      opt[Double]("stepSize")
        .text(s"initial step size (ignored by logistic regression), default: ${defaultParams.stepSize}")
        .action((x, c) => c.copy(stepSize = x))
      opt[String]("algorithm")
        .text(s"algorithm (${Algorithm.values.mkString(",")}), default: ${defaultParams.algorithm}")
        .action((x, c) => c.copy(algorithm = Algorithm.withName(x)))
      opt[String]("regType")
        .text(s"regularization type (${RegType.values.mkString(",")}), default: ${defaultParams.regType}")
        .action((x, c) => c.copy(regType = RegType.withName(x)))
      opt[Double]("regParam")
        .text(s"regularization parameter, default: ${defaultParams.regParam}")
        .action((x, c) => c.copy(regParam = x))
      arg[String]("<input>")
        .required()
        .text("input paths to labeled examples in LIBSVM format")
        .action((x, c) => c.copy(input = x))
      note(
        """
          |For example, the following command runs this app on a synthetic dataset:
          |
          | bin/spark-submit --class org.apache.spark.examples.mllib.BinaryClassification \
          |  examples/target/scala-*/spark-examples-*.jar \
          |  --algorithm LR --regType L2 --regParam 1.0 \
          |  data/mllib/sample_binary_classification_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"BinaryClassification with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    examples.unpersist(blocking = false)

    // Choose the regularization updater: L1 or squared-L2
    val updater = params.regType match {
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    // Train either logistic regression (L-BFGS) or a linear SVM (SGD),
    // clearing the threshold so the model outputs raw scores
    val model = params.algorithm match {
      case LR =>
        val algorithm = new LogisticRegressionWithLBFGS()
        algorithm.optimizer
          .setNumIterations(params.numIterations)
          .setUpdater(updater)
          .setRegParam(params.regParam)
        algorithm.run(training).clearThreshold()
      case SVM =>
        val algorithm = new SVMWithSGD()
        algorithm.optimizer
          .setNumIterations(params.numIterations)
          .setStepSize(params.stepSize)
          .setUpdater(updater)
          .setRegParam(params.regParam)
        algorithm.run(training).clearThreshold()
    }

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    val metrics = new BinaryClassificationMetrics(predictionAndLabel)

    println(s"Test areaUnderPR = ${metrics.areaUnderPR()}.")
    println(s"Test areaUnderROC = ${metrics.areaUnderROC()}.")

    sc.stop()
  }
}
// scalastyle:on println
Example 2
Source File: PipeClassificationSvm.scala From sddf with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

class PipeClassificationSvm(numIterations: Int = 100) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("numIterations", numIterations))

  def trainModelAndClassify(
      trainingData: RDD[LabeledPoint],
      symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {
    // Train a linear SVM on the labeled similarity vectors
    val model = SVMWithSGD.train(trainingData, numIterations)
    log.debug("Classification Model:" + model)

    // Classify each pair by its similarity vector;
    // missing values are marked as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}

object PipeClassificationSvm {
  def apply(numIterations: Int = 100) = {
    new PipeClassificationSvm(numIterations)
  }
}
Example 3
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv")
      .map(line => line.split("\t"))

    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      // The label is the last column; features are columns 4 to n-2,
      // with missing values ("?") replaced by 0.0
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Keep the default threshold (0.0) so predict() returns 0/1 class labels
    // rather than raw margins; the equality check below relies on this

    val svmTotalCorrect = data.map { point =>
      if (svmModel.predict(point.features) == point.label) 1 else 0
    }.sum()

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()
    println(svmAccuracy)
  }
}
Example 4
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv")
      .map(line => line.split("\t"))

    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      // The label is the last column; features are columns 4 to n-2,
      // with missing values ("?") replaced by 0.0
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Keep the default threshold (0.0) so predict() returns 0/1 class labels
    // rather than raw margins; the equality check below relies on this

    val svmTotalCorrect = data.map { point =>
      if (svmModel.predict(point.features) == point.label) 1 else 0
    }.sum()

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()
    println(svmAccuracy)
  }
}
Example 5
Source File: SVMWithSGDDemo.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils

    // Logistic regression with the loss function optimized by L-BFGS;
    // supports multiclass classification (BFGS is a rank-2 quasi-Newton method)
    val modelBFGS = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training)

    // Compute raw scores on the test set.
    val predictionAndLabels = test.map {
      // A LabeledPoint pairs a label with a local feature vector (dense or sparse)
      case LabeledPoint(label, features) =>
        val prediction = modelBFGS.predict(features)
        (prediction, label)
    }

    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)
  }
}
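Both this excerpt and the next begin mid-method, so `sc`, `training`, and `test` are never defined in what is shown. A hypothetical preamble consistent with the surrounding code (the dataset path, the split ratio, and the seed are all assumptions, not part of the original files) might look like this:

// Hypothetical setup for the two excerpts above and below: assumes an existing
// SparkContext `sc`, loads a multiclass LIBSVM dataset, and splits it 60/40
val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")
val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L)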
Example 6
Source File: LogisticRegressionWithLBFGSDeom.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils

    // Logistic regression with the loss function optimized by L-BFGS;
    // supports multiclass classification (BFGS is a rank-2 quasi-Newton method)
    val modelBFGS = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training)

    // Compute raw scores on the test set.
    val predictionAndLabels = test.map {
      // A LabeledPoint pairs a label with a local feature vector (dense or sparse)
      case LabeledPoint(label, features) =>
        val prediction = modelBFGS.predict(features)
        (prediction, label)
    }

    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)
  }
}
Example 7
Source File: SVMWithSGDExample.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.util.MLUtils

object SVMWithSGDExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SVMWithSGDExample").setMaster("local[4]")
    val sc = new SparkContext(conf)

    // Load the data as an RDD of LabeledPoint
    val svmData = MLUtils.loadLibSVMFile(sc, "../data/mllib/sample_libsvm_data.txt")
    // Count the number of records
    svmData.count

    // Split the dataset in half: one half for training, the other for testing
    val trainingAndTest = svmData.randomSplit(Array(0.5, 0.5))
    val trainingData = trainingAndTest(0)
    val testData = trainingAndTest(1)

    // Train the algorithm for 100 iterations to build the model
    // (SGD = stochastic gradient descent)
    val model = SVMWithSGD.train(trainingData, 100)

    // Use the model to predict a label for any data; here, the first test point
    val label = model.predict(testData.first().features)

    // Create tuples whose first element is the predicted label
    // and whose second element is the actual label
    val predictionsAndLabels = testData.map(r => (model.predict(r.features), r.label))

    // Count how many records have a predicted label that does not match the actual label
    val misclassified = predictionsAndLabels.filter(p => p._1 != p._2).count
    println("Misclassified records: " + misclassified)
  }
}
Example 8
Source File: SVMWithSGDExample.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

object SVMWithSGDExample {

  case class Params(
      numIterations: Int = 100,
      stepSize: Double = 1.0,
      regParam: Double = 0.01,
      dataPath: String = null)

  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SVM") {
      head("SVM: an example of SVM for classification.")
      opt[Int]("numIterations")
        .text(s"numIterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[Double]("stepSize")
        .text(s"stepSize, default: ${defaultParams.stepSize}")
        .action((x, c) => c.copy(stepSize = x))
      opt[Double]("regParam")
        .text(s"regParam, default: ${defaultParams.regParam}")
        .action((x, c) => c.copy(regParam = x))
      arg[String]("<dataPath>")
        .required()
        .text("data path of SVM")
        .action((x, c) => c.copy(dataPath = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"SVM with $params")
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    val dataPath = params.dataPath
    val numIterations = params.numIterations
    val stepSize = params.stepSize
    val regParam = params.regParam

    val data: RDD[LabeledPoint] = sc.objectFile(dataPath)

    // Split data into training (60%) and test (40%).
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Run training algorithm to build the model
    val model = SVMWithSGD.train(training, numIterations, stepSize, regParam)

    // Clear the default threshold so predict() returns raw scores
    model.clearThreshold()

    // Compute raw scores on the test set.
    val scoreAndLabels = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }

    // Get evaluation metrics.
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val auROC = metrics.areaUnderROC()

    println("Area under ROC = " + auROC)

    sc.stop()
  }
}
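None of the examples persist the trained model. Since Spark 1.3, SVMModel supports save and load, so a model such as the one trained in the last example could be stored and reloaded; a short sketch (the path is illustrative, and `model`/`sc` refer to the values in the run() method above):

// Persist the trained SVMModel and load it back later (path is illustrative)
model.save(sc, "target/tmp/svmWithSGDModel")
val sameModel = SVMModel.load(sc, "target/tmp/svmWithSGDModel")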