org.apache.spark.mllib.clustering.StreamingKMeans Scala Examples
The following examples show how to use org.apache.spark.mllib.clustering.StreamingKMeans.
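All of the examples follow the same basic pattern: build a StreamingKMeans model, train it continuously on one DStream, and predict on another. Here is a minimal sketch of that pattern, assuming placeholder directories and parameter values rather than anything taken from the projects below:

import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingKMeansSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("StreamingKMeansSketch")
    val ssc = new StreamingContext(conf, Seconds(5))

    // Training lines must parse with Vectors.parse, e.g. [1.0,2.0,3.0]
    val training = ssc.textFileStream("/tmp/train").map(Vectors.parse)
    // Test lines must parse with LabeledPoint.parse, e.g. (1.0,[1.0,2.0,3.0])
    val test = ssc.textFileStream("/tmp/test").map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setK(2)                  // number of clusters
      .setDecayFactor(1.0)      // 1.0 weights all batches equally; 0.0 uses only the newest batch
      .setRandomCenters(3, 0.0) // data dimensionality and the weight of the random initial centers

    model.trainOn(training)                                                // update centers every batch
    model.predictOnValues(test.map(lp => (lp.label, lp.features))).print() // (label, clusterId) pairs

    ssc.start()
    ssc.awaitTermination()
  }
}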
Example 1
Source File: StreamingKMeansExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}
// $example off$

object StreamingKMeansExample {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      System.exit(1)
    }

    // $example on$
    val conf = new SparkConf().setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setK(args(3).toInt)
      .setDecayFactor(1.0)
      .setRandomCenters(args(4).toInt, 0.0)

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
    // $example off$
  }
}
// scalastyle:on println
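This is the canonical example that ships with the Spark distribution. Assuming a standard Spark layout, it can be launched with something like ./bin/run-example mllib.StreamingKMeansExample trainingDir testDir 5 3 5 (the run-example script prefixes the class name with org.apache.spark.examples; the argument values here are illustrative). As new text files land in trainingDir the cluster centers update, and files dropped into testDir are assigned to clusters.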
Example 2
Source File: L9-10KMeans.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object KMeansClusteringApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: KMeansClusteringApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) != "0")

    val orientationStream = substream
      .map(f => Seq(1, 4, 5, 6, 10, 11, 12, 20, 21, 22, 26, 27, 28, 36, 37, 38, 42, 43, 44).map(i => f(i)).toArray)
      .map(arr => arr.map(_.toDouble))
      .filter(f => f(0) == 1.0 || f(0) == 2.0 || f(0) == 3.0)
      .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length))))

    val test = orientationStream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0))
    val train = orientationStream.transformWith(test,
      (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache()

    val model = new StreamingKMeans()
      .setK(3)
      .setDecayFactor(0)
      .setRandomCenters(18, 0.0)

    model.trainOn(train.map(v => v.features))
    val prediction = model.predictOnValues(test.map(v => (v.label, v.features)))

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 3
Source File: StreamingKMeansExample.scala From sparkoscope with Apache License 2.0
The code is identical, line for line, to Example 1 above.
Example 4
Source File: HandsOnKMeanStreaming.scala From Hands-On-Data-Analysis-with-Scala with MIT License
package handson.example

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.mllib.clustering.StreamingKMeans

object HandsOnKMeanStreaming {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("HandsOnKMeanStreaming")
    val ssc = new StreamingContext(conf, Seconds(10))

    val model = new StreamingKMeans().
      setK(4).                 // number of clusters is 4
      setDecayFactor(1.0).     // decay factor (the forgetfulness of the previous centroids)
      setRandomCenters(3, 0.0) // 3 dimensions and 0 weight

    import org.apache.spark.mllib.linalg.Vectors
    val trainingData = ssc.textFileStream("file:/tmp/k-means-train-data").map(Vectors.parse).cache()
    trainingData.print()

    import org.apache.spark.mllib.regression.LabeledPoint
    val testData = ssc.textFileStream("file:/tmp/k-means-test-data").map(LabeledPoint.parse)

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTerminationOrTimeout(1000 * 60 * 3) // wait for the computation to terminate (3 minutes)
  }
}
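For this example to produce output, the two /tmp directories above need text files in the formats that Vectors.parse and LabeledPoint.parse expect, and textFileStream only notices files that appear (ideally moved in atomically) after the stream has started. A small generator sketch follows; the file names and data values are made up for illustration:

import java.io.PrintWriter
import java.nio.file.{Files, Paths}

object GenerateKMeansData {
  def main(args: Array[String]): Unit = {
    Files.createDirectories(Paths.get("/tmp/k-means-train-data"))
    Files.createDirectories(Paths.get("/tmp/k-means-test-data"))
    val rnd = new scala.util.Random(42)

    // Training lines: dense vectors such as [0.12,0.34,0.56]
    val train = new PrintWriter("/tmp/k-means-train-data/batch-1.txt")
    for (_ <- 1 to 100) {
      train.println(Seq.fill(3)(rnd.nextDouble()).mkString("[", ",", "]"))
    }
    train.close()

    // Test lines: labeled points such as (1.0,[0.12,0.34,0.56])
    val test = new PrintWriter("/tmp/k-means-test-data/batch-1.txt")
    for (i <- 1 to 20) {
      val label = (i % 4).toDouble // the model above uses k = 4
      val features = Seq.fill(3)(rnd.nextDouble()).mkString("[", ",", "]")
      test.println(s"($label,$features)")
    }
    test.close()
  }
}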
Example 5
Source File: StreamingKMeansExample.scala From multi-tenancy-spark with Apache License 2.0
The code is identical, line for line, to Example 1 above.
Example 6
Source File: StreamingKMeansExample.scala From iolap with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingKMeansExample {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setK(args(3).toInt)
      .setDecayFactor(1.0)
      .setRandomCenters(args(4).toInt, 0.0)

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 7
Source File: StreamingKMeansExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingKMeansExample {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingKMeansExample")
    // batch interval
    val ssc = new StreamingContext(conf, Seconds(3.toLong))
    // file stream over the training directory, parsing each line into a vector
    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    // test directory
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      // number of clusters
      .setK(args(3).toInt)
      // set the decay factor directly
      .setDecayFactor(1.0)
      // number of random centers
      .setRandomCenters(args(4).toInt, 0.0)

    // train the clustering model on the data set
    model.trainOn(trainingData)
    // predict the cluster membership of new data points
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 8
Source File: StreamingKMeansExample.scala From Spark-2.3.1 with Apache License 2.0
The code is identical, line for line, to Example 1 above.
Example 9
Source File: StreamingKMeansExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingKMeansExample {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setK(args(3).toInt)
      .setDecayFactor(1.0)
      .setRandomCenters(args(4).toInt, 0.0)

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 10
Source File: MyStreamingKMeans.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter8

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

object MyStreamingKMeans {

  def main(args: Array[String]) {

    val trainingDir = "../data/sparkml2/chapter8/trainingDir"
    val testDir = "../data/sparkml2/chapter8/testDir"
    val batchDuration = 10
    val numClusters = 2
    val numDimensions = 3

    Logger.getLogger("org").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("myStreamingKMeans")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val ssc = new StreamingContext(spark.sparkContext, Seconds(batchDuration.toLong))

    val trainingData = ssc.textFileStream(trainingDir).map(Vectors.parse)
    val testData = ssc.textFileStream(testDir).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setK(numClusters)
      .setDecayFactor(1.0)
      .setRandomCenters(numDimensions, 0.0)

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 11
Source File: KMeansStreaming.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter13

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.Queue

object KMeansStreaming {

  def main(args: Array[String]) {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("KMean Streaming App")
      .config("spark.sql.warehouse.dir", ".")
      .config("spark.executor.memory", "2g")
      .getOrCreate()

    val ssc = new StreamingContext(spark.sparkContext, Seconds(1))

    Logger.getRootLogger.setLevel(Level.WARN)

    val irisData = IrisData.readFromFile(spark.sparkContext)
    val lookup = IrisData.buildLabelLookup(irisData)

    val trainQueue = new Queue[RDD[LabeledPoint]]()
    val testQueue = new Queue[RDD[LabeledPoint]]()

    val trainingStream = ssc.queueStream(trainQueue)
    val testStream = ssc.queueStream(testQueue)

    val model = new StreamingKMeans().setK(3)
      .setDecayFactor(1.0)
      .setRandomCenters(4, 0.0)

    model.trainOn(trainingStream.map(lp => lp.features))
    val values = model.predictOnValues(testStream.map(lp => (lp.label, lp.features)))
    values.foreachRDD(n => n.foreach(v => {
      println(v._2, v._1, lookup(v._1.toLong))
    }))

    ssc.start()

    val irisLabelPoints = irisData.map(record => IrisData.toLabelPoints(record))
    val Array(trainData, test) = irisLabelPoints.randomSplit(Array(.80, .20))

    trainQueue += irisLabelPoints
    Thread.sleep(2000)

    val testGroups = test.randomSplit(Array(.25, .25, .25, .25))
    testGroups.foreach(group => {
      testQueue += group
      println("-" * 25)
      Thread.sleep(1000)
    })

    ssc.stop()
  }
}
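The IrisData helper referenced above is not part of this listing. Below is a hypothetical stand-in with the three methods the example calls; the file path, column layout, and label encoding are all guesses for illustration, not the cookbook's actual code:

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

object IrisData {
  // Hypothetical: assumes a headerless CSV with lines like "5.1,3.5,1.4,0.2,Iris-setosa"
  def readFromFile(sc: SparkContext): RDD[(String, Array[Double])] =
    sc.textFile("../data/iris.csv") // path is a guess
      .map(_.split(","))
      .map(cols => (cols(4), cols.take(4).map(_.toDouble)))

  // Numeric label -> species name, used when printing predictions
  def buildLabelLookup(data: RDD[(String, Array[Double])]): Map[Long, String] =
    data.map(_._1).distinct().collect().sorted.zipWithIndex
      .map { case (name, idx) => (idx.toLong, name) }.toMap

  // Species name -> numeric label, matching the sorted order used in the lookup above
  def toLabelPoints(record: (String, Array[Double])): LabeledPoint = {
    val labelOf = Map("Iris-setosa" -> 0.0, "Iris-versicolor" -> 1.0, "Iris-virginica" -> 2.0)
    LabeledPoint(labelOf(record._1), Vectors.dense(record._2))
  }
}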