org.apache.spark.mllib.clustering.KMeans Scala Examples
The following examples show how to use org.apache.spark.mllib.clustering.KMeans.
You can vote up the examples you like or vote down the ones you don't like,
and you can follow the links above each example to go to the original project or source file.
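Before working through the project-specific listings below, here is a minimal, self-contained sketch of the RDD-based KMeans workflow that most of them share: parse the input into an RDD[Vector], cache it, train with KMeans.train, then inspect the cost and the cluster assignments. The input path, k, and iteration count are placeholder values chosen for illustration, not taken from any particular example.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

object KMeansQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("KMeansQuickStart"))

    // Parse whitespace-separated numeric lines into dense vectors and cache them,
    // because k-means makes several passes over the data.
    val data = sc.textFile("data/kmeans_input.txt")
      .map(line => Vectors.dense(line.split(' ').map(_.toDouble)))
      .cache()

    // Train with k = 2 clusters and at most 20 iterations (placeholder values).
    val model: KMeansModel = KMeans.train(data, 2, 20)

    // Within Set Sum of Squared Errors: lower means tighter clusters.
    println(s"WSSSE = ${model.computeCost(data)}")

    // Assign an existing point to its nearest cluster center.
    println(s"first point belongs to cluster ${model.predict(data.first())}")

    sc.stop()
  }
}

The examples below follow this shape, differing mainly in where the vectors come from (text files, Kudu tables, Hive queries, MNIST CSVs) and in whether they call KMeans.train directly or use the builder-style new KMeans().setK(...).run(...) API.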
Example 1
Source File: DenseKMeans.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      input: String = null,
      k: Int = -1,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DenseKMeans") {
      head("DenseKMeans: an example k-means app for dense data.")
      opt[Int]('k', "k")
        .required()
        .text(s"number of clusters, required")
        .action((x, c) => c.copy(k = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[String]("initMode")
        .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
          s"default: ${defaultParams.initializationMode}")
        .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
      arg[String]("<input>")
        .text("input paths to examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    val cost = model.computeCost(examples)

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println
Example 2
Source File: MnistExample.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.{KMeans, ScalableKMeans, SparseKMeans}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SparkSession

object MnistExample {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)

    val spark = SparkSession.builder.appName("svm").master("local[8]").getOrCreate()

    val trainRDD = spark.sparkContext.textFile("data/mnist/mnist_train.csv", 8)
      .map(line => line.split(","))
      .map(arr => arr.map(_.toDouble))
      .map(arr => Vectors.dense(arr.slice(1, 785)))

    val model = new KMeans()
      .setK(10)
      .setInitializationMode("random")
      .setMaxIterations(10)
      .run(trainRDD)

    println("final clusters:")
    println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n"))
  }
}
Example 3
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.etl.machinelearning.kudu

import com.cloudera.sa.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MlLibOnKudu {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<taxiTable> " +
        "<numOfCenters> " +
        "<numOfIterations> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val taxiTable = args(2)
    val numOfCenters = args(3).toInt
    val numOfIterations = args(4).toInt

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val sqlContext = new SQLContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> taxiTable,
      "kudu.master" -> kuduMaster)

    sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load.
      registerTempTable("ny_taxi_trip_tmp")

    //Vector
    val vectorRDD: RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => {
      val taxiTrip = NyTaxiYellowTripBuilder.build(r)
      generateVectorOnly(taxiTrip)
    })

    println("--Running KMeans")
    val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations)
    println(" > vector centers:")
    clusters.clusterCenters.foreach(v => println(" >> " + v))

    println("--Running corr")
    val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson")
    println(" > corr: " + correlMatrix.toString)

    println("--Running colStats")
    val colStats = Statistics.colStats(vectorRDD)
    println(" > max: " + colStats.max)
    println(" > count: " + colStats.count)
    println(" > mean: " + colStats.mean)
    println(" > min: " + colStats.min)
    println(" > normL1: " + colStats.normL1)
    println(" > normL2: " + colStats.normL2)
    println(" > numNonZeros: " + colStats.numNonzeros)
    println(" > variance: " + colStats.variance)

    //Labeled Points
  }
  // generateVectorOnly(...) and the labeled-point section referenced above are defined later in
  // the original source file; that part of the file is not included in this excerpt.
}
Example 4
Source File: KmeansModelSaveToOss.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.mllib

import org.apache.spark.mllib.clustering.KMeans._
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SparkSession

object KmeansModelSaveToOss {
  val modelOssDir = "oss://bucket/kmeans-model"

  def main(args: Array[String]) {
    //1. train and save the model
    val spark = SparkSession
      .builder()
      .config("spark.hadoop.fs.oss.credentials.provider",
        "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider")
      .config("spark.hadoop.fs.oss.ststoken.roleArn",
        "acs:ram::****:role/aliyunodpsdefaultrole")
      .config("spark.hadoop.fs.oss.endpoint",
        "oss-cn-hangzhou-zmf.aliyuncs.com")
      .appName("KmeansModelSaveToOss")
      .getOrCreate()

    val sc = spark.sparkContext

    val points = Seq(
      Vectors.dense(0.0, 0.0),
      Vectors.dense(0.0, 0.1),
      Vectors.dense(0.1, 0.0),
      Vectors.dense(9.0, 0.0),
      Vectors.dense(9.0, 0.2),
      Vectors.dense(9.2, 0.0)
    )
    val rdd = sc.parallelize(points, 3)
    val initMode = K_MEANS_PARALLEL
    val model = KMeans.train(rdd, k = 2, maxIterations = 2, runs = 1, initMode)

    val predictResult1 = rdd.map(feature =>
      "cluster id: " + model.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect

    println("modelOssDir=" + modelOssDir)
    model.save(sc, modelOssDir)

    //2. predict from the oss model
    val modelLoadOss = KMeansModel.load(sc, modelOssDir)
    val predictResult2 = rdd.map(feature =>
      "cluster id: " + modelLoadOss.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect

    assert(predictResult1.size == predictResult2.size)
    predictResult2.foreach(result2 => assert(predictResult1.contains(result2)))
  }
}
Example 5
Source File: KmeansModelSaveToOss.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.mllib

import org.apache.spark.mllib.clustering.KMeans._
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}

object KmeansModelSaveToOss {
  val modelOssDir = "oss://bucket/kmeans-model"

  def main(args: Array[String]) {
    //1. train and save the model
    val conf = new SparkConf().setAppName("KmeansModelSaveToOss")
    conf.set("spark.hadoop.fs.oss.credentials.provider",
      "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider")
    conf.set("spark.hadoop.fs.oss.ststoken.roleArn",
      "acs:ram::****:role/aliyunodpsdefaultrole")
    conf.set("spark.hadoop.fs.oss.endpoint",
      "oss-cn-hangzhou-zmf.aliyuncs.com")
    val sc = new SparkContext(conf)

    val points = Seq(
      Vectors.dense(0.0, 0.0),
      Vectors.dense(0.0, 0.1),
      Vectors.dense(0.1, 0.0),
      Vectors.dense(9.0, 0.0),
      Vectors.dense(9.0, 0.2),
      Vectors.dense(9.2, 0.0)
    )
    val rdd = sc.parallelize(points, 3)
    val initMode = K_MEANS_PARALLEL
    val model = KMeans.train(rdd, k = 2, maxIterations = 2, runs = 1, initMode)

    val predictResult1 = rdd.map(feature =>
      "cluster id: " + model.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect

    println("modelOssDir=" + modelOssDir)
    model.save(sc, modelOssDir)

    //2. predict from the oss model
    val modelLoadOss = KMeansModel.load(sc, modelOssDir)
    val predictResult2 = rdd.map(feature =>
      "cluster id: " + modelLoadOss.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect

    assert(predictResult1.size == predictResult2.size)
    predictResult2.foreach(result2 => assert(predictResult1.contains(result2)))
  }
}
Example 6
Source File: DenseKMeans.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml

import org.apache.hadoop.io.LongWritable
import org.apache.log4j.{Level, Logger}
import org.apache.mahout.math.VectorWritable
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}
import scopt.OptionParser

object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import com.intel.hibench.sparkbench.ml.DenseKMeans.InitializationMode._

  case class Params(
      input: String = null,
      k: Int = -1,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel)

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DenseKMeans") {
      head("DenseKMeans: an example k-means app for dense data.")
      opt[Int]('k', "k")
        .required()
        .text(s"number of clusters, required")
        .action((x, c) => c.copy(k = x))
      opt[Int]("numIterations")
        .text(s"number of iterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[String]("initMode")
        .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
          s"default: ${defaultParams.initializationMode}")
        .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
      arg[String]("<input>")
        .text("input paths to examples")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    // Logger.getRootLogger.setLevel(Level.WARN)

    val data = sc.sequenceFile[LongWritable, VectorWritable](params.input)

    val examples = data.map { case (k, v) =>
      var vector: Array[Double] = new Array[Double](v.get().size)
      for (i <- 0 until v.get().size) vector(i) = v.get().get(i)
      Vectors.dense(vector)
    }.cache()

    // val examples = sc.textFile(params.input).map { line =>
    //   Vectors.dense(line.split(' ').map(_.toDouble))
    // }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    val cost = model.computeCost(examples)

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
Example 7
Source File: DenseKMeans.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams).map { params => run(params) }.getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 8
Source File: KMeansExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
// $example off$

object KMeansExample {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("KMeansExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/kmeans_data.txt")
    val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()

    // Cluster the data into two classes using KMeans
    val numClusters = 2
    val numIterations = 20
    val clusters = KMeans.train(parsedData, numClusters, numIterations)

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    val WSSSE = clusters.computeCost(parsedData)
    println(s"Within Set Sum of Squared Errors = $WSSSE")

    // Save and load model
    clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
    val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 9
Source File: DenseKMeans.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 10
Source File: KMeansExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

object KMeansExample {

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KMeansClustering")
    val sc = new SparkContext(sparkConf)

    // Load the Saratoga data set into an RDD
    val data = sc.textFile("../data/mllib/saratoga.csv")
    // Convert the data into an RDD of dense vectors
    val parsedData = data.map(line => Vectors.dense(line.split(',').map(_.toDouble)))
    // Train the model with 4 clusters and 5 iterations
    val kmmodel = KMeans.train(parsedData, 4, 5)
    // Collect parsedData into a local array
    val houses = parsedData.collect
    // Predict the cluster of the first element; KMeans numbers clusters starting from 0
    val prediction1 = kmmodel.predict(houses(0))
    // Predict which cluster houses(18) belongs to: lot size 876, price 66.5
    val prediction2 = kmmodel.predict(houses(18))
    // Predict which cluster houses(35) belongs to: lot size 15750, price 112
    val prediction3 = kmmodel.predict(houses(35))
    // Predict which cluster houses(6) belongs to: lot size 38768, price 272
    val prediction4 = kmmodel.predict(houses(6))
    // Predict which cluster houses(15) belongs to: lot size 69696, price 275
    val prediction5 = kmmodel.predict(houses(15))
  }
}
Example 11
Source File: KMeansClustering_IBM.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

object KMeansClustering_IBM {

  def main(args: Array[String]) {
    // NOTE: the opening of this file is truncated in the excerpt. The set-up below (object name,
    // SparkContext creation, the args(...) layout <trainFile> <testFile> <k>, and the 30/1
    // iteration/run values for the first training call) is a minimal reconstruction inferred
    // from the remaining code so that it compiles; treat it as an assumption.
    val sc = new SparkContext(new SparkConf().setAppName("Spark MLlib K-means clustering"))

    val parsedTrainingData = sc.textFile(args(0))
      .filter(!isColumnNameLine(_))
      .map(line => {
        Vectors.dense(line.split(",").map(_.trim).filter(!"".equals(_)).map(_.toDouble))
      }).cache()

    val numClusters = args(2).toInt
    val clusters: KMeansModel = KMeans.train(parsedTrainingData, numClusters, 30, 1)

    val parsedTestData = sc.textFile(args(1)).map(line => {
      Vectors.dense(line.split(",").map(_.trim).filter(!"".equals(_)).map(_.toDouble))
    })

    parsedTestData.collect().foreach(testDataLine => {
      // Determine which cluster each test record belongs to
      val predictedClusterIndex: Int = clusters.predict(testDataLine)
      println("Test sample: " + testDataLine.toString + " belongs to cluster " + predictedClusterIndex)
    })

    println("Spark MLlib K-means clustering test finished.")

    // Evaluate the KMeans model: how to choose K
    val ks: Array[Int] = Array(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 50, 80, 100)
    ks.foreach(cluster => {
      // parsedTrainingData holds the training data
      val model: KMeansModel = KMeans.train(parsedTrainingData, cluster, 30, 1)
      // KMeansModel provides a computeCost method, which evaluates the clustering by summing
      // the squared distances from every point to its nearest center.
      // Use it to gauge how well the samples are clustered for each K.
      val ssd = model.computeCost(parsedTrainingData)
      // model.predict(point)
      println("sum of squared distances of points to their nearest center when k=" + cluster + " -> " + ssd)
    })
  }

  // Filter out the header line
  private def isColumnNameLine(line: String): Boolean = {
    if (line != null && line.contains("Channel")) true
    else false
  }
}
Example 12
Source File: DenseKMeans.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams).map { params => run(params) }.getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } }
Example 13
Source File: KMeansExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object KMeansExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("KMeansExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/kmeans_data.txt") val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using KMeans val numClusters = 2 val numIterations = 20 val clusters = KMeans.train(parsedData, numClusters, numIterations) // Evaluate clustering by computing Within Set Sum of Squared Errors val WSSSE = clusters.computeCost(parsedData) println("Within Set Sum of Squared Errors = " + WSSSE) // Save and load model clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel") val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel") // $example off$ sc.stop() } } // scalastyle:on println
Example 14
Source File: DenseKMeans.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 15
Source File: KMeanTest.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector}

import scala.util.Random

//spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9
//guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15

object ScalableKMeanTest {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}")
    val sc = new SparkContext(conf)

    val k = args(0).toInt
    val dimension = args(1).toInt
    val recordNum = args(2).toInt
    val sparsity = args(3).toDouble
    val iterations = args(4).toInt
    val means = args(5)
    val parNumber = args(6).toInt

    val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => {
      val ran = new Random()
      val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray
      val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray
      val vec: Vector = new SparseVector(dimension, indexArr, valueArr)
      vec
    }).cache()

    println(args.mkString(", "))
    println(data.count() + " records generated")

    val st = System.nanoTime()

    val model = if (means == "my") {
      println("running scalable kmeans")
      val model = new ScalableKMeans()
        .setK(k)
        .setInitializationMode("random")
        .setMaxIterations(iterations)
        .run(data)
      model
    } else {
      println("running mllib kmeans")
      val model = new KMeans()
        .setK(k)
        .setInitializationMode("random")
        .setMaxIterations(iterations)
        .run(data)
      model
    }

    println((System.nanoTime() - st) / 1e9 + " seconds cost")
    println("final clusters: " + model.clusterCenters.length)
    println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n"))

    sc.stop()
  }
}
Example 16
Source File: KMeansExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object KMeansExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("KMeansExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/kmeans_data.txt") val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using KMeans val numClusters = 2 val numIterations = 20 val clusters = KMeans.train(parsedData, numClusters, numIterations) // Evaluate clustering by computing Within Set Sum of Squared Errors val WSSSE = clusters.computeCost(parsedData) println("Within Set Sum of Squared Errors = " + WSSSE) // Save and load model clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel") val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel") // $example off$ sc.stop() } } // scalastyle:on println
Example 17
Source File: DenseKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 18
Source File: StreamingKMeansSuite.scala From spark-structured-streaming-ml with Apache License 2.0 | 5 votes |
package com.highperformancespark.examples.structuredstreaming import com.holdenkarau.spark.testing.DataFrameSuiteBase import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.ml.linalg._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.execution.streaming.MemoryStream import org.scalatest.FunSuite import org.apache.log4j.{Level, Logger} case class TestRow(features: Vector) class StreamingKMeansSuite extends FunSuite with DataFrameSuiteBase { override def beforeAll(): Unit = { super.beforeAll() Logger.getLogger("org").setLevel(Level.OFF) } test("streaming model with one center should converge to true center") { import spark.implicits._ val k = 1 val dim = 5 val clusterSpread = 0.1 val seed = 63 // TODO: this test is very flaky. The centers do not converge for some // (most?) random seeds val (batches, trueCenters) = StreamingKMeansSuite.generateBatches(100, 80, k, dim, clusterSpread, seed) val inputStream = MemoryStream[TestRow] val ds = inputStream.toDS() val skm = new StreamingKMeans().setK(k).setRandomCenters(dim, 0.01) val query = skm.evilTrain(ds.toDF()) val streamingModels = batches.map { batch => inputStream.addData(batch) query.processAllAvailable() skm.getModel } // TODO: use spark's testing suite streamingModels.last.centers.zip(trueCenters).foreach { case (center, trueCenter) => val centers = center.toArray.mkString(",") val trueCenters = trueCenter.toArray.mkString(",") println(s"${centers} | ${trueCenters}") assert(center.toArray.zip(trueCenter.toArray).forall( x => math.abs(x._1 - x._2) < 0.1)) } query.stop() } def compareBatchAndStreaming( batchModel: KMeansModel, streamingModel: StreamingKMeansModel, validationData: DataFrame): Unit = { assert(batchModel.clusterCenters === streamingModel.centers) // TODO: implement prediction comparison } } object StreamingKMeansSuite { def generateBatches( numPoints: Int, numBatches: Int, k: Int, d: Int, r: Double, seed: Int, initCenters: Array[Vector] = null): (IndexedSeq[IndexedSeq[TestRow]], Array[Vector]) = { val rand = scala.util.Random rand.setSeed(seed) val centers = initCenters match { case null => Array.fill(k)(Vectors.dense(Array.fill(d)(rand.nextGaussian()))) case _ => initCenters } val data = (0 until numBatches).map { i => (0 until numPoints).map { idx => val center = centers(idx % k) val vec = Vectors.dense( Array.tabulate(d)(x => center(x) + rand.nextGaussian() * r)) TestRow(vec) } } (data, centers) } }
Example 19
Source File: BasicSparkSQLExamples.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.basic import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object BasicSparkSQLExamples { def main(args:Array[String]): Unit = { if (args.length == 0) { println("<kuduMaster> <tablename> <runLocal>") } Logger.getRootLogger.setLevel(Level.ERROR) val kuduMaster = args(0) val tableName = args(1) val runLocal = args(2).equals("l") println("starting") var sc:SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") sc = new SparkContext(sparkConfig) } try { println("Setting up Tables") val sqlContext = new SQLContext(sc) sqlContext.load("org.kududb.spark", Map("kudu.table" -> tableName, "kudu.master" -> kuduMaster)).registerTempTable(tableName) println("Query 1: SELECT count(*) FROM " + tableName) val startTimeQ1 = System.currentTimeMillis() sqlContext.sql("SELECT count(*) FROM " + tableName).take(10).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1)) println("Query 2: SELECT key_id, col_1 FROM " + tableName + " limit 100") val startTimeQ2 = System.currentTimeMillis() sqlContext.sql("SELECT key_id, col_1 FROM " + tableName + " limit 100 ").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2)) val q3 = "select key_id from " + tableName + " a join (SELECT max(col_1) col_max FROM " + tableName + ") b on (a.col_1 = b.col_max)" println("Query 3: " + q3) val startTimeQ3 = System.currentTimeMillis() sqlContext.sql(q3).take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3)) println("Query 5 + MLLIB: SELECT key_id, col_1, col_2 FROM " + tableName ) val startTimeQ5 = System.currentTimeMillis() val resultDf = sqlContext.sql("SELECT key_id, col_1, col_2 FROM " + tableName + " limit 1000") val parsedData = resultDf.map(r => { val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble) Vectors.dense(array) }) val clusters = KMeans.train(parsedData, 3, 4) clusters.clusterCenters.foreach(v => println(" Vector Center:" + v)) //TODO add Mllib here println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5)) } finally { sc.stop() } } }
Example 20
Source File: GamerSparkSQLExample.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.gamer.aggregates

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object GamerSparkSQLExample {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("{kudumaster} {runLocal}")
      return
    }

    Logger.getRootLogger.setLevel(Level.ERROR)

    val kuduMaster = args(0)
    val runLocal = args(1).equals("l")

    println("Loading Spark Context")
    var sc: SparkContext = null

    if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      sc = new SparkContext(sparkConfig)
    }
    println("Loading Spark Context: Finished")

    println("Setting up Tables")
    val sqlContext = new SQLContext(sc)
    sqlContext.load("org.kududb.spark",
      Map("kudu.table" -> "gamer", "kudu.master" -> kuduMaster)).registerTempTable("gamer")

    println("Query 1: SELECT count(*) FROM gamer")
    val startTimeQ1 = System.currentTimeMillis()
    sqlContext.sql("SELECT count(*) FROM gamer").take(10).foreach(r => {
      println(" - (" + r + ")")
    })
    println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1))

    println("Query 2: SELECT * FROM gamer limit 100")
    val startTimeQ2 = System.currentTimeMillis()
    sqlContext.sql("SELECT * FROM gamer limit 100").take(100).foreach(r => {
      println(" - (" + r + ")")
    })
    println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2))

    println("Query 3: SELECT * FROM gamer order by last_time_played desc limit 100")
    val startTimeQ3 = System.currentTimeMillis()
    sqlContext.sql("SELECT * FROM gamer order by last_time_played desc limit 100").take(100).foreach(r => {
      println(" - (" + r + ")")
    })
    println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3))

    println("Query 4: SELECT max(games_played), max(oks), max(damage_given) FROM gamer")
    val startTimeQ4 = System.currentTimeMillis()
    sqlContext.sql("SELECT max(games_played), max(oks), max(damage_given) FROM gamer").take(100).foreach(r => {
      println(" - (" + r + ")")
    })
    println("Finish Query 4: " + (System.currentTimeMillis() - startTimeQ4))

    println("Query 5 + MLLIB: SELECT gamer_id, oks, games_won, games_played FROM gamer")
    val startTimeQ5 = System.currentTimeMillis()
    val resultDf = sqlContext.sql("SELECT gamer_id, oks, games_won, games_played FROM gamer")

    val parsedData = resultDf.map(r => {
      val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble, r.getInt(3).toDouble)
      Vectors.dense(array)
    })

    val dataCount = parsedData.count()

    if (dataCount > 0) {
      val clusters = KMeans.train(parsedData, 3, 5)
      clusters.clusterCenters.foreach(v => println(" Vector Center:" + v))
    }
    //TODO add Mllib here

    println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5))
  }
}
Example 21
Source File: value_model.scala From Spark_Personas with MIT License | 5 votes |
val input_df = hiveContext.sql("select t.lenovo_id,t.monetary,cast(t.frequency as int) as frequency,t.recency from model_input_rfm_t t")
val row_nums = input_df.count.toInt   // total number of rows
val row_partition = row_nums / 5      // quintile partition point
val row_partition6 = row_nums / 6     // sextile partition point
val input_sort_monetary = input_df.sort($"monetary".desc).collect()
val input_sort_frequency = input_df.sort($"frequency".desc).collect() //wrong
val input_sort_recency = input_df.sort($"recency".desc).collect()

// monetary partition points
val monetary_1 = input_sort_monetary(row_partition * 1).get(1).asInstanceOf[Number].intValue
val monetary_2 = input_sort_monetary(row_partition * 2).get(1).asInstanceOf[Number].intValue
val monetary_3 = input_sort_monetary(row_partition * 3).get(1).asInstanceOf[Number].intValue
val monetary_4 = input_sort_monetary(row_partition * 4).get(1).asInstanceOf[Number].intValue

// frequency partition points
val frequency_1 = input_sort_frequency(row_partition * 1).get(2).asInstanceOf[Integer].toInt
val frequency_2 = input_sort_frequency(row_partition * 2).get(2).asInstanceOf[Integer].toInt
val frequency_3 = input_sort_frequency(row_partition * 3).get(2).asInstanceOf[Integer].toInt
val frequency_4 = input_sort_frequency(row_partition * 4).get(2).asInstanceOf[Integer].toInt

// recency partition points
val result = input_sort_recency(row_partition6 * 1).get(3).asInstanceOf[String].toString
val recency_1 = result.substring(0,4)+result.substring(5,7)+result.substring(8,10)
val result = input_sort_recency(row_partition6 * 2).get(3).asInstanceOf[String].toString
val recency_2 = result.substring(0,4)+result.substring(5,7)+result.substring(8,10)
val result = input_sort_recency(row_partition6 * 3).get(3).asInstanceOf[String].toString
val recency_3 = result.substring(0,4)+result.substring(5,7)+result.substring(8,10)
val result = input_sort_recency(row_partition6 * 4).get(3).asInstanceOf[String].toString
val recency_4 = result.substring(0,4)+result.substring(5,7)+result.substring(8,10)
val result = input_sort_recency(row_partition6 * 5).get(3).asInstanceOf[String].toString
val recency_5 = result.substring(0,4)+result.substring(5,7)+result.substring(8,10)

val io_monetary = hiveContext.sql("(select t1.lenovo_id, t1.frequency, t1.monetary, t1.recency, t1.points, t1.flag, t1.cluster from model_output_rfm_t t1 where 1=0 ) union all (select t2.lenovo_id, t2.frequency, t2.monetary, t2.recency,(case when t2.monetary > "+monetary_1+ " then 5 when t2.monetary >"+monetary_2+" then 4 when t2.monetary > "+monetary_3+" then 3 when t2.monetary >"+monetary_4 + " then 2 else 1 end) as points, ' ', ' ' from model_input_rfm_t t2)")
io_monetary.registerTempTable("temporary_monetary")   // temporary table for monetary scores

val io_frequency = hiveContext.sql("(select t1.lenovo_id, t1.frequency, t1.monetary, t1.recency, t1.points, t1.flag, t1.cluster from model_output_rfm_t t1 where 1=0 ) union all (select t2.lenovo_id, t2.frequency, t2.monetary, t2.recency,(case when t2.frequency> "+frequency_1 + " then (50+t3.points) when t2.frequency>"+frequency_2 +" then (40+t3.points) when t2.frequency> "+frequency_3 +" then (30+t3.points) when t2.frequency>"+frequency_4 + " then (20+t3.points) else (10+t3.points) end) as points, ' ', ' ' from model_input_rfm_t t2,temporary_monetary t3 where t2.lenovo_id = t3.lenovo_id)")
io_frequency.registerTempTable("temporary_frequency") // temporary table for frequency scores

// normalization
val result = hiveContext.sql("select max(cast(frequency as int)) from model_input_rfm_t")   // maximum frequency
val max_frequency = result.collect()(0).get(0).asInstanceOf[Integer].toInt
val result = hiveContext.sql("select min(cast(frequency as int)) from temporary_frequency") // minimum frequency
val min_frequency = result.collect()(0).get(0).asInstanceOf[Integer].toInt
val region_frequency = max_frequency - min_frequency

val result = hiveContext.sql("select max(unix_timestamp(concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2)),'yyyyMMdd')) from temporary_frequency t2")
val max_recency = result.collect()(0).get(0).asInstanceOf[Long]   // latest recency timestamp
val result = hiveContext.sql("select min(unix_timestamp(concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2)),'yyyyMMdd')) from temporary_frequency t2")
val min_recency = result.collect()(0).get(0).asInstanceOf[Long]   // earliest recency timestamp
val region_recency = max_recency - min_recency                    // recency range

val result = hiveContext.sql("select max(monetary) from model_input_rfm_t")
val max_monetary = result.collect()(0).get(0).asInstanceOf[Float] // maximum monetary value
//val result = hiveContext.sql("select min(monetary) from model_input_rfm_t")
//val min_monetary = result.collect()(0).get(0).asInstanceOf[Float] // minimum monetary value
val min_monetary = 0
val region_monetary = max_monetary - min_monetary                 // monetary range

val io_recency = hiveContext.sql("(select t1.lenovo_id, t1.frequency, t1.monetary, t1.recency, t1.points, t1.flag, t1.cluster from model_output_rfm_t t1 where 1=0 ) union all (select t2.lenovo_id, ((t2.frequency - "+min_frequency+")/" + region_frequency + ") as frequency, ((t2.monetary - "+min_monetary+") /" + region_monetary+") as monetary, ((unix_timestamp(t2.recency,'yyyy-MM-dd')- "+min_recency+") / " + region_recency + ") as recency,(case when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))> "+recency_1+ " then (600+t3.points) when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))>"+recency_2+" then (500+t3.points) when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))> "+recency_3+" then (400+t3.points) when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))>"+recency_4+ " then (300+t3.points) when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))>"+recency_5+ " then (200+t3.points) else (100+t3.points) end) as points, ' ', ' ' from model_input_rfm_t t2,temporary_frequency t3 where t2.lenovo_id = t3.lenovo_id)")
io_recency.registerTempTable("temporary_recency")     // temporary table for recency scores

// clustering
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}

// Convert the DataFrame to an RDD directly with io_recency.rdd
val parsedData = io_recency.rdd.map(s => Vectors.dense(s.get(1).asInstanceOf[String].toDouble, s.get(2).asInstanceOf[Double], s.get(3).asInstanceOf[String].toDouble)) //.cache()
val numClusters = 8
val numIterations = 20
val model = KMeans.train(parsedData, numClusters, numIterations)
model.clusterCenters.foreach(println)
val WSSSE = model.computeCost(parsedData)
println("Within Set Sum of Squared Errors = " + WSSSE)

val insertData = io_recency.rdd.map(s => Vectors.dense(s.get(0).asInstanceOf[String].toLong, s.get(1).asInstanceOf[String].toDouble, s.get(2).asInstanceOf[Double], s.get(3).asInstanceOf[String].toDouble, s.get(4).asInstanceOf[Integer].toInt, ' ', model.predict(Vectors.dense(s.get(1).asInstanceOf[String].toDouble, s.get(2).asInstanceOf[Double], s.get(3).asInstanceOf[String].toDouble)))) //.cache()

import spark.implicits._
case class Cluster(lenovo_id: Long, frequency: Double, monetary: Double, recency: Double, points: Double, flag: Double, cluster: Double)
val rdd_df = insertData.map(attributes => Cluster(attributes(0).toLong, attributes(1).toDouble, attributes(2).toDouble, attributes(3).toDouble, attributes(4).toDouble, attributes(5).toDouble, attributes(6).toDouble)).toDF()
rdd_df.registerTempTable("temporary_cluster")
hiveContext.sql("insert overwrite table userfigure_local.model_output_rfm_t partition (l_day='2016-10-01') select * from temporary_cluster")

val io_cluster = hiveContext.sql("(select t1.lenovo_id, t1.frequency, t1.monetary, t1.recency, t1.points, t1.flag, t1.cluster from model_output_rfm_t t1 where 1=0 ) union all (select t2.lenovo_id, t2.frequency, t2.monetary, t2.recency, t2.points, t2.flag, t2.cluster from temporary_cluster t2)")
hiveContext.sql("insert into model_output_rfm_t partition(l_day='2016-10-01') select * from table1")
Example 22
Source File: KmeansConfig.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.spark.mllib

import org.apache.spark.mllib.clustering.KMeans

private[spark] object KmeansConfig {
  private val MAX_NUM_CLUSTERS = 500
  private val MAX_NUM_ITERS = 250
  private val MAX_NUM_RUNS = 500

  private def check(K: Int, maxNumIters: Int, numRuns: Int): Unit = {
    require(K > 0 && K < MAX_NUM_CLUSTERS, s"Number of clusters K $K is out of range")
    require(
      maxNumIters > 0 && maxNumIters < MAX_NUM_ITERS,
      s"Maximum number of iterations $maxNumIters is out of range"
    )
    require(
      numRuns > 0 && numRuns < MAX_NUM_RUNS,
      s"Maximum number of runs for K-means $numRuns is out of range"
    )
  }
}

// -------------------------------------- EOF ---------------------------------------------------
Example 23
Source File: KMeans.scala From spark-tda with Apache License 2.0 | 5 votes |
import java.io.{File, PrintWriter}

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.sql.functions._

def computeKMeans(
    pathToTextFile: String,
    quantity: Int,
    iteration: Int) {
  case class Point(x: Double, y: Double)

  def save(f: File)(func: PrintWriter => Unit) {
    val p = new PrintWriter(f)
    try {
      func(p)
    } finally {
      p.close()
    }
  }

  val filename = pathToTextFile.split("\\.")(0)
  val outputFilename = s"$filename-KMEANS-k${quantity}-i${iteration}.tsv"

  val points = sc
    .textFile(pathToTextFile)
    .map { line => line.trim.split("\\s+") }
    .map { row => Point(row(0).toDouble, row(1).toDouble) }

  val features = points
    .map { p => Vectors.dense(p.x, p.y) }
  features.cache()

  val kmeans = KMeans.train(features, quantity, iteration)

  val predictions = features
    .map { f => (f(0), f(1), kmeans.predict(f) + 1) }
    .collect

  save(new File(outputFilename)) {
    println(s"OUTPUT TO: ${outputFilename}")
    f => predictions.foreach {
      case (x, y, ccid) => f.println(s"${x}\t${y}\t${ccid}")
    }
  }
}
Example 24
Source File: get_labels_from_VT_signatures.scala From gsoc_relationship with Apache License 2.0 | 5 votes |
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.clustering.KMeans
import PreProcessingConfig._

case class VT_sample_label_rdd_class(sha256: String, label: Array[Double])

def OnehotEncode(number: Double): Array[Double] = {
  var Listnew = Array.iterate(0.0, kmeans_cluster_number)(a => 0.0)
  Listnew(number.toInt) = 1
  return Listnew
}

val VT_sample_signatures_final_array_rdd = spark.read.format("parquet").load(VT_sample_signatures_final_array_file).rdd
  .map(row => new VT_sample_signatures_final_array_rdd_class(row(0).toString, row(1).asInstanceOf[Seq[Double]].toArray))
val VT_sample_signatures_with_sha_rddvector = VT_sample_signatures_final_array_rdd.map(x => (x.sha256, Vectors.dense(x.array_results)))
val VT_sample_signatures_rddvector = VT_sample_signatures_with_sha_rddvector.map(x => x._2)
val KMeans_Model = KMeans.train(VT_sample_signatures_rddvector, kmeans_cluster_number, 30, 2)
val VT_sample_signatures_label_with_sha_rdd = VT_sample_signatures_with_sha_rddvector.map(x => (x._1, KMeans_Model.predict(x._2)))
val VT_sample_label_rdd = VT_sample_signatures_label_with_sha_rdd.map(x => new VT_sample_label_rdd_class(x._1, OnehotEncode(x._2.toDouble)))

VT_sample_label_rdd.toDF().write.format("parquet").save(VT_sample_label_file)
Example 25
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.etl.machinelearning.kudu

import com.hadooparchitecturebook.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MlLibOnKudu {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<taxiTable> " +
        "<numOfCenters> " +
        "<numOfIterations> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val taxiTable = args(2)
    val numOfCenters = args(3).toInt
    val numOfIterations = args(4).toInt

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val sqlContext = new SQLContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> taxiTable,
      "kudu.master" -> kuduMaster)

    sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load.
      registerTempTable("ny_taxi_trip_tmp")

    //Vector
    val vectorRDD: RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => {
      val taxiTrip = NyTaxiYellowTripBuilder.build(r)
      generateVectorOnly(taxiTrip)
    })

    println("--Running KMeans")
    val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations)
    println(" > vector centers:")
    clusters.clusterCenters.foreach(v => println(" >> " + v))

    println("--Running corr")
    val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson")
    println(" > corr: " + correlMatrix.toString)

    println("--Running colStats")
    val colStats = Statistics.colStats(vectorRDD)
    println(" > max: " + colStats.max)
    println(" > count: " + colStats.count)
    println(" > mean: " + colStats.mean)
    println(" > min: " + colStats.min)
    println(" > normL1: " + colStats.normL1)
    println(" > normL2: " + colStats.normL2)
    println(" > numNonZeros: " + colStats.numNonzeros)
    println(" > variance: " + colStats.variance)

    //Labeled Points
  }
  // generateVectorOnly(...) and the labeled-point section referenced above are defined later in
  // the original source file; that part of the file is not included in this excerpt.
}
Example 26
Source File: DenseKMeans.scala From AI with Apache License 2.0 | 5 votes |
// scalastyle:off println
package com.bigchange.mllib

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      var input: String = null,
      k: Int = 2,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()
    defaultParams.input = args(0)
    run(defaultParams)
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()

    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    // Return the K-means cost (sum of squared distances of points to their nearest center) for this model
    val cost = model.computeCost(examples)

    // Retrieve the k cluster centers
    val centerPoint = model.clusterCenters
    val one = centerPoint(0)
    val two = centerPoint(1)
    println(s"centerPoint=$one,$two.")
    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println
Example 27
Source File: Main.scala From didactic-computing-machine with GNU Affero General Public License v3.0 | 5 votes |
package example

import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}

object Main extends App {

  val conf = new SparkConf()
    .setAppName("K means cluster")
    .setMaster("local")

  val sc = SparkContext.getOrCreate(conf)

  val data = sc.parallelize(
    Vector(
      Vector(-4.0, -1.0, -4.0),
      Vector(2.0, 0.0, 0.0),
      Vector(1.0, -2.0, 4.0),
      Vector(-3.0, -4.0, -1.0),
      Vector(2.0, -4.0, 0.0),
      Vector(2.0, 1.0, -5),
      Vector(3.0, -3.0, 0.0),
      Vector(-1.0, -1.0, 1.0)
    ).map(t => Vectors.dense(t.toArray)))

  val numOfClusters = 3
  val numOfIterations = 100

  val clusters = KMeans.train(data, numOfClusters, numOfIterations)

  println("Cluster centers")
  clusters.clusterCenters.foreach(println)

  println("Squared Errors")
  println(clusters.computeCost(data))

  println("Predictions")
  println(clusters.predict(Vectors.dense(0.0, 0.0, 0.0)))
  println(clusters.predict(Vectors.dense(-3.0, -2.0, 1.5)))
}
Example 28
Source File: KMeansExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object KMeansExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("KMeansExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/kmeans_data.txt") val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using KMeans val numClusters = 2 val numIterations = 20 val clusters = KMeans.train(parsedData, numClusters, numIterations) // Evaluate clustering by computing Within Set Sum of Squared Errors val WSSSE = clusters.computeCost(parsedData) println("Within Set Sum of Squared Errors = " + WSSSE) // Save and load model clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel") val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel") // $example off$ sc.stop() } } // scalastyle:on println