org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD Scala Examples
The following examples show how to use org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD.
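All of the examples follow the same pattern: build a StreamingContext, turn each input stream into a DStream of LabeledPoints, initialize the model with a zero weight vector sized to the number of features, then call trainOn on the training stream and predictOnValues on the test stream. The sketch below summarizes that pattern in minimal form; the directory paths, batch interval, and feature count are illustrative placeholders and are not taken from any of the examples that follow.

// Minimal usage sketch; paths, batch interval, and feature count are placeholders.
import org.apache.spark.SparkConf
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingLogisticRegressionSketch {
  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local[2]").setAppName("StreamingLogisticRegressionSketch")
    val ssc = new StreamingContext(conf, Seconds(10)) // placeholder batch interval

    // Each line must be in LabeledPoint.parse format, e.g. "(1.0,[0.5,1.2,3.4])"
    val trainingData = ssc.textFileStream("/tmp/train").map(LabeledPoint.parse)
    val testData = ssc.textFileStream("/tmp/test").map(LabeledPoint.parse)

    // The initial weight vector must match the feature dimension (3 here, purely illustrative)
    val model = new StreamingLogisticRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(3))

    // Update the model on every training batch, then emit (label, prediction) pairs for the test stream
    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}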
Example 1
Source File: StreamingLogisticRegression.scala From AI with Apache License 2.0
// scalastyle:off println
package com.bigchange.mllib

import com.bigchange.util.{FileUtil, TimeUtil}
import org.apache.spark.SparkConf
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingLogisticRegression {

  def main(args: Array[String]) {

    if (args.length != 4) {
      System.err.println(
        "Usage: StreamingLogisticRegression <trainingDir> <testDir> <batchDuration> <numFeatures>")
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingLogisticRegression")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingLogisticRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(args(3).toInt))

    model.trainOn(trainingData)
    // model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()
    model.predictOnValues(testData.map(lp => (lp.label, lp.features)))
      .map(x => x._1 + "\t" + x._2)
      .foreachRDD(rdd => {
        val value = rdd.collect()
        FileUtil.normalFileWriter("F:\\datatest\\ai\\StreamingLogisticRegression\\" + TimeUtil.getCurrentHour, value)
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 2
Source File: L9-9LogisticRegression.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object LogisticRegressionApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: LogisticRegressionApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) != "0")

    val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble,
      f(5).toDouble, f(6).toDouble))

    val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0)
      .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))
    val test = walkingOrRunning.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0))
    val train = walkingOrRunning.transformWith(test,
      (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache()

    val model = new StreamingLogisticRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(4))
      .setStepSize(0.0001)
      .setNumIterations(1)

    model.trainOn(train)
    model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd =>
      println("MSE: %f".format(rdd.map(v => math.pow((v._1 - v._2), 2)).mean())))

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 3
Source File: StreamingLogisticRegression.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingLogisticRegression {

  def main(args: Array[String]) {

    if (args.length != 4) {
      System.err.println(
        "Usage: StreamingLogisticRegression <trainingDir> <testDir> <batchDuration> <numFeatures>")
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingLogisticRegression")
    // Batch interval
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse)
    // A LabeledPoint is a local vector, dense or sparse, associated with a label
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    // SGD-based gradient descent; supports binary classification only
    val model = new StreamingLogisticRegressionWithSGD()
      // Initial weights; the default is the zero vector
      .setInitialWeights(Vectors.zeros(args(3).toInt))

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 4
Source File: LogisticStreaming.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter13

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.Queue

object LogisticStreaming {

  def main(args: Array[String]) {

    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getRootLogger.setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Logistic Streaming App")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    import spark.implicits._

    val ssc = new StreamingContext(spark.sparkContext, Seconds(2))

    val rawDF = spark.read
      .text("../data/sparkml2/chapter13/pima-indians-diabetes.data").as[String]

    val buf = rawDF.rdd.map(value => {
      val data = value.split(",")
      (data.init.toSeq, data.last)
    })

    val lps = buf.map { case (feature: Seq[String], label: String) =>
      val featureVector = feature.map(_.toDouble).toArray[Double]
      LabeledPoint(label.toDouble, Vectors.dense(featureVector))
    }

    val trainQueue = new Queue[RDD[LabeledPoint]]()
    val testQueue = new Queue[RDD[LabeledPoint]]()

    val trainingStream = ssc.queueStream(trainQueue)
    val testStream = ssc.queueStream(testQueue)

    val numFeatures = 8
    val model = new StreamingLogisticRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(numFeatures))
      .setNumIterations(15)
      .setStepSize(0.5)
      .setMiniBatchFraction(0.25)

    model.trainOn(trainingStream)
    val result = model.predictOnValues(testStream.map(lp => (lp.label, lp.features)))

    result.map { case (label: Double, prediction: Double) => (label, prediction) }.print()

    ssc.start()

    val Array(trainData, test) = lps.randomSplit(Array(.80, .20))

    trainQueue += trainData
    Thread.sleep(4000)

    val testGroups = test.randomSplit(Array(.50, .50))
    testGroups.foreach(group => {
      testQueue += group
      Thread.sleep(2000)
    })

    ssc.stop()
  }
}