org.apache.spark.ml.clustering.KMeans Scala Examples
The following examples show how to use org.apache.spark.ml.clustering.KMeans.
Example 1
Source File: KMeansExample.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.examples.ml

// scalastyle:off println
// $example on$
import org.apache.spark.ml.clustering.KMeans
// $example off$
import org.apache.spark.sql.SparkSession

object KMeansExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName(s"${this.getClass.getSimpleName}")
      .getOrCreate()

    // $example on$
    // Loads data.
    val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    // Trains a k-means model.
    val kmeans = new KMeans().setK(2).setSeed(1L)
    val model = kmeans.fit(dataset)

    // Evaluate clustering by computing Within Set Sum of Squared Errors.
    val WSSSE = model.computeCost(dataset)
    println(s"Within Set Sum of Squared Errors = $WSSSE")

    // Shows the result.
    println("Cluster Centers: ")
    model.clusterCenters.foreach(println)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
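Note that in Spark 2.4 and later, KMeansModel.computeCost is deprecated in favor of ClusteringEvaluator, which scores cluster assignments with the silhouette metric. A minimal sketch of that alternative, assuming the same model and dataset as above:

import org.apache.spark.ml.evaluation.ClusteringEvaluator

// Assign each point to a cluster; transform() appends a "prediction" column.
val predictions = model.transform(dataset)

// Silhouette ranges from -1 to 1; values near 1 mean tight, well-separated clusters.
val evaluator = new ClusteringEvaluator()
val silhouette = evaluator.evaluate(predictions)
println(s"Silhouette with squared euclidean distance = $silhouette")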
Example 2
Source File: KmeansTraining.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.ml_clustering

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.sql.SparkSession

class KmeansTraining extends ConfigurableStop {

  val authorEmail: String = "[email protected]"
  val description: String = "Kmeans clustering"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var training_data_path: String = _
  var model_save_path: String = _
  var maxIter: String = _
  var k: Int = _
  var minTol: String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()

    // Load data stored in LIBSVM format as a DataFrame.
    val data = spark.read.format("libsvm").load(training_data_path)

    // Param for the maximum number of iterations (>= 0).
    var maxIterValue: Int = 50
    if (maxIter != "") {
      maxIterValue = maxIter.toInt
    }

    // Param for the convergence tolerance for iterative algorithms (>= 0).
    var minTolValue: Double = 1E-6
    if (minTol != "") {
      minTolValue = minTol.toDouble
    }

    // Cluster with the k-means algorithm.
    val model = new KMeans()
      .setMaxIter(maxIterValue)
      .setTol(minTolValue)
      .setK(k)
      .fit(data)

    // Persist the fitted model.
    model.save(model_save_path)

    import spark.implicits._
    val dfOut = Seq(model_save_path).toDF
    dfOut.show()
    out.write(dfOut)
  }

  def initialize(ctx: ProcessContext): Unit = {}

  def setProperties(map: Map[String, Any]): Unit = {
    training_data_path = MapUtil.get(map, key = "training_data_path").asInstanceOf[String]
    model_save_path = MapUtil.get(map, key = "model_save_path").asInstanceOf[String]
    maxIter = MapUtil.get(map, key = "maxIter").asInstanceOf[String]
    minTol = MapUtil.get(map, key = "minTol").asInstanceOf[String]
    k = Integer.parseInt(MapUtil.get(map, key = "k").toString)
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val training_data_path = new PropertyDescriptor().name("training_data_path").displayName("TRAINING_DATA_PATH").defaultValue("").required(true)
    val model_save_path = new PropertyDescriptor().name("model_save_path").displayName("MODEL_SAVE_PATH").description("").defaultValue("").required(true)
    val maxIter = new PropertyDescriptor().name("maxIter").displayName("MAX_ITER").description("Param for maximum number of iterations (>= 0).").defaultValue("").required(false)
    val minTol = new PropertyDescriptor().name("minTol").displayName("MIN_TOL").description("Param for the convergence tolerance for iterative algorithms (>= 0).").defaultValue("").required(false)
    val k = new PropertyDescriptor().name("k").displayName("K").description("The number of clusters.").defaultValue("").required(true)
    descriptor = training_data_path :: descriptor
    descriptor = model_save_path :: descriptor
    descriptor = maxIter :: descriptor
    descriptor = minTol :: descriptor
    descriptor = k :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/ml_clustering/KmeansTraining.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MLGroup.toString)
  }
}
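This stop writes out only the model path, so a downstream stop would reload the persisted model to make predictions. A minimal sketch of that read-back step (the paths here are hypothetical examples, not part of the piflow project):

import org.apache.spark.ml.clustering.KMeansModel

// Reload the model persisted by KmeansTraining ("/tmp/kmeans-model" is illustrative).
val model = KMeansModel.load("/tmp/kmeans-model")

// Score a new LIBSVM-format DataFrame; transform() appends a "prediction" column.
val predictions = model.transform(spark.read.format("libsvm").load("/tmp/new_data.txt"))
predictions.show()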
Example 3
Source File: KMeansParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class KMeansParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "fico_score_group_fnl")

  // Index the categorical column, assemble the feature vector, then cluster.
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer()
      .setInputCol("fico_score_group_fnl")
      .setOutputCol("fico_index"),
    new VectorAssembler()
      .setInputCols(Array("fico_index", "dti"))
      .setOutputCol("features"),
    new KMeans()
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "initMode", "initSteps", "maxIter", "tol", "k", "seed")
}
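Outside the parity test, the same fitted pipeline can be persisted and reused with Spark's own serialization. A minimal sketch under that assumption (the save path is illustrative; the cast is needed because the field is typed as Transformer):

import org.apache.spark.ml.PipelineModel

// Persist the fitted preprocessing + clustering pipeline.
sparkTransformer.asInstanceOf[PipelineModel].write.overwrite().save("/tmp/kmeans-pipeline")

// Reload it and score new rows with the identical stages.
val reloaded = PipelineModel.load("/tmp/kmeans-pipeline")
reloaded.transform(dataset).select("features", "prediction").show(5)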
Example 4
Source File: GMMClustering.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.gmm

// scalastyle:off println
// $example on$
import org.apache.spark.SparkConf
import org.apache.spark.ml.clustering.{GaussianMixture, KMeans}
// $example off$
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SparkSession

object GMMClustering {

  def main(args: Array[String]): Unit = {
    val spConfig = (new SparkConf).setMaster("local[1]").setAppName("SparkApp")
      .set("spark.driver.allowMultipleContexts", "true")

    val spark = SparkSession
      .builder()
      .appName("Spark SQL Example")
      .config(spConfig)
      .getOrCreate()

    val datasetUsers = spark.read.format("libsvm").load(
      "./data/movie_lens_libsvm/movie_lens_users_libsvm/part-00000")
    datasetUsers.show(3)

    val gmmUsers = new GaussianMixture().setK(5).setSeed(1L)
    val modelUsers = gmmUsers.fit(datasetUsers)

    for (i <- 0 until modelUsers.gaussians.length) {
      println("Users : weight=%f\ncov=%s\nmean=\n%s\n" format
        (modelUsers.weights(i), modelUsers.gaussians(i).cov, modelUsers.gaussians(i).mean))
    }

    val dataSetItems = spark.read.format("libsvm").load(
      "./data/movie_lens_libsvm/movie_lens_items_libsvm/part-00000")

    val gmmItems = new GaussianMixture().setK(5).setSeed(1L)
    val modelItems = gmmItems.fit(dataSetItems)

    // Print the fitted item model (the original printed modelUsers here by mistake).
    for (i <- 0 until modelItems.gaussians.length) {
      println("Items : weight=%f\ncov=%s\nmean=\n%s\n" format
        (modelItems.weights(i), modelItems.gaussians(i).cov, modelItems.gaussians(i).mean))
    }

    spark.stop()
  }

  def loadInLibSVMFormat(line: String, noOfFeatures: Int): LabeledPoint = {
    val items = line.split(' ')
    val label = items.head.toDouble
    val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
      val indexAndValue = item.split(':')
      val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based.
      val value = indexAndValue(1).toDouble
      (index, value)
    }.unzip

    // Check that the (now 0-based) indices are in strictly ascending order.
    var previous = -1
    var i = 0
    val indicesLength = indices.length
    while (i < indicesLength) {
      val current = indices(i)
      require(current > previous, "indices should be one-based and in ascending order")
      previous = current
      i += 1
    }

    import org.apache.spark.mllib.linalg.Vectors
    val d = noOfFeatures
    LabeledPoint(label, Vectors.sparse(d, indices, values))
  }
}
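Beyond the per-component parameters printed above, a fitted GaussianMixtureModel also exposes soft assignments per row. A minimal sketch, assuming the modelUsers and datasetUsers values from this example (the column names are the ones Spark ML adds by default):

// transform() appends "prediction" (the most likely component) and
// "probability" (per-component membership probabilities) columns.
val userAssignments = modelUsers.transform(datasetUsers)
userAssignments.select("prediction", "probability").show(5, truncate = false)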
Example 5
Source File: BisectingKMeansMetrics.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.kmeans

import org.apache.spark.SparkConf
import org.apache.spark.ml.clustering.{BisectingKMeans, KMeans}
import org.apache.spark.sql.{DataFrame, SparkSession}

object BisectingKMeansMetrics {
  case class RatingX(userId: Int, movieId: Int, rating: Float, timestamp: Long)

  val DATA_PATH = "../../../data/ml-100k"
  val PATH_MOVIES = DATA_PATH + "/u.item"

  def main(args: Array[String]): Unit = {
    val spConfig = (new SparkConf).setMaster("local[1]").setAppName("SparkApp")
      .set("spark.driver.allowMultipleContexts", "true")

    val spark = SparkSession
      .builder()
      .appName("Spark SQL Example")
      .config(spConfig)
      .getOrCreate()

    val datasetUsers = spark.read.format("libsvm").load(
      "./data/movie_lens_libsvm/movie_lens_users_libsvm/part-00000")
    datasetUsers.show(3)

    // Compute WSSSE for each iteration count, with k fixed at 5.
    val k = 5
    val itr = Array(1, 10, 20, 50, 75, 100)
    val result = new Array[String](itr.length)
    for (i <- 0 until itr.length) {
      val w = calculateWSSSE(spark, datasetUsers, itr(i), k)
      result(i) = itr(i) + "," + w
    }
    println("----------Users----------")
    for (j <- 0 until itr.length) {
      println(result(j))
    }
    println("-------------------------")

    val datasetItems = spark.read.format("libsvm").load(
      "./data/movie_lens_libsvm/movie_lens_items_libsvm/part-00000")

    val resultItems = new Array[String](itr.length)
    for (i <- 0 until itr.length) {
      val w = calculateWSSSE(spark, datasetItems, itr(i), k)
      resultItems(i) = itr(i) + "," + w
    }
    println("----------Items----------")
    for (j <- 0 until itr.length) {
      println(resultItems(j))
    }
    println("-------------------------")

    spark.stop()
  }

  def calculateWSSSE(spark: SparkSession, dataset: DataFrame, iterations: Int, k: Int): Double = {
    val bKMeans = new BisectingKMeans()
    bKMeans.setMaxIter(iterations)
    // The original passed k to setMinDivisibleClusterSize, which controls when a
    // cluster may be split further, not the number of clusters; setK matches the
    // apparent intent of the k parameter.
    bKMeans.setK(k)
    val model = bKMeans.fit(dataset)
    model.computeCost(dataset)
  }
}
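For reference, minDivisibleClusterSize is a legitimate BisectingKMeans knob, just a different one from k. A minimal sketch using it deliberately, assuming the datasetUsers DataFrame above (the values are illustrative):

// A cluster is only split further if it holds at least this many points
// (values >= 1.0) or at least this fraction of all points (values < 1.0).
val bisecting = new BisectingKMeans()
  .setK(5)
  .setMaxIter(20)
  .setMinDivisibleClusterSize(10.0)
val bisectingModel = bisecting.fit(datasetUsers)
println(s"WSSSE = ${bisectingModel.computeCost(datasetUsers)}")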
Example 6
Source File: MovieLensKMeansMetrics.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.kmeans

import org.apache.spark.SparkConf
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.sql.{DataFrame, SparkSession}

object MovieLensKMeansMetrics {
  case class RatingX(userId: Int, movieId: Int, rating: Float, timestamp: Long)

  val DATA_PATH = "../../../data/ml-100k"
  val PATH_MOVIES = DATA_PATH + "/u.item"

  def main(args: Array[String]): Unit = {
    val spConfig = (new SparkConf).setMaster("local[1]").setAppName("SparkApp")
      .set("spark.driver.allowMultipleContexts", "true")

    val spark = SparkSession
      .builder()
      .appName("Spark SQL Example")
      .config(spConfig)
      .getOrCreate()

    val datasetUsers = spark.read.format("libsvm").load(
      "./data/movie_lens_libsvm/movie_lens_users_libsvm/part-00000")
    datasetUsers.show(3)

    // Compute WSSSE for each iteration count, with k fixed at 5 and a fixed seed.
    val k = 5
    val itr = Array(1, 10, 20, 50, 75, 100)
    val result = new Array[String](itr.length)
    for (i <- 0 until itr.length) {
      val w = calculateWSSSE(spark, datasetUsers, itr(i), k, 1L)
      result(i) = itr(i) + "," + w
    }
    println("----------Users----------")
    for (j <- 0 until itr.length) {
      println(result(j))
    }
    println("-------------------------")

    val datasetItems = spark.read.format("libsvm").load(
      "./data/movie_lens_libsvm/movie_lens_items_libsvm/part-00000")

    val resultItems = new Array[String](itr.length)
    for (i <- 0 until itr.length) {
      val w = calculateWSSSE(spark, datasetItems, itr(i), k, 1L)
      resultItems(i) = itr(i) + "," + w
    }
    println("----------Items----------")
    for (j <- 0 until itr.length) {
      println(resultItems(j))
    }
    println("-------------------------")

    spark.stop()
  }

  def calculateWSSSE(spark: SparkSession, dataset: DataFrame, iterations: Int, k: Int, seed: Long): Double = {
    val kmeans = new KMeans().setK(k).setSeed(seed).setMaxIter(iterations)
    val model = kmeans.fit(dataset)
    model.computeCost(dataset)
  }
}
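The same helper can also sweep the cluster count instead of the iteration budget, which is the usual elbow-method workflow. A minimal sketch, assuming the datasetUsers DataFrame above (the candidate values are illustrative):

// Fix the iteration budget and seed, vary k; WSSSE should fall as k grows,
// and the "elbow" in the curve suggests a reasonable cluster count.
val candidateKs = Array(2, 3, 5, 8, 13)
for (kCandidate <- candidateKs) {
  val wssse = calculateWSSSE(spark, datasetUsers, 20, kCandidate, 1L)
  println(s"k=$kCandidate, WSSSE=$wssse")
}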
Example 7
Source File: KMeansExample.scala From sparkoscope with Apache License 2.0
This file is identical to Example 1 (KMeansExample.scala from drizzle-spark); see that listing above for the formatted source.
Example 8
Source File: KMeansExample.scala From multi-tenancy-spark with Apache License 2.0
This file is also identical to Example 1 (KMeansExample.scala from drizzle-spark); see that listing above for the formatted source.
Example 9
Source File: KMeansExample.scala From BigDatalog with Apache License 2.0
package org.apache.spark.examples.ml

// scalastyle:off println
import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
// $example off$
import org.apache.spark.sql.{DataFrame, SQLContext}

object KMeansExample {

  def main(args: Array[String]): Unit = {
    // Creates a Spark context and a SQL context
    val conf = new SparkConf().setAppName(s"${this.getClass.getSimpleName}")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    // Creates a DataFrame
    val dataset: DataFrame = sqlContext.createDataFrame(Seq(
      (1, Vectors.dense(0.0, 0.0, 0.0)),
      (2, Vectors.dense(0.1, 0.1, 0.1)),
      (3, Vectors.dense(0.2, 0.2, 0.2)),
      (4, Vectors.dense(9.0, 9.0, 9.0)),
      (5, Vectors.dense(9.1, 9.1, 9.1)),
      (6, Vectors.dense(9.2, 9.2, 9.2))
    )).toDF("id", "features")

    // Trains a k-means model
    val kmeans = new KMeans()
      .setK(2)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
    val model = kmeans.fit(dataset)

    // Shows the result
    println("Final Centers: ")
    model.clusterCenters.foreach(println)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
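This listing predates SparkSession: it builds a SQLContext and uses org.apache.spark.mllib.linalg.Vectors. A minimal sketch of the same setup on Spark 2.x+ (an assumption on my part, not part of the BigDatalog project):

import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder.appName("KMeansExample").getOrCreate()
import spark.implicits._

// Same six points; the DataFrame-based API expects ml.linalg vectors.
val dataset = Seq(
  (1, Vectors.dense(0.0, 0.0, 0.0)),
  (2, Vectors.dense(0.1, 0.1, 0.1)),
  (3, Vectors.dense(0.2, 0.2, 0.2)),
  (4, Vectors.dense(9.0, 9.0, 9.0)),
  (5, Vectors.dense(9.1, 9.1, 9.1)),
  (6, Vectors.dense(9.2, 9.2, 9.2))
).toDF("id", "features")

val model = new KMeans().setK(2).fit(dataset)
model.clusterCenters.foreach(println)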
Example 10
Source File: IrisKMeansClustering.scala From spark-spec with MIT License
package com.github.mrpowers.spark.spec.ml.clustering

import com.github.mrpowers.spark.spec.Config
import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.DataFrame

object IrisKMeansClustering extends SparkSessionWrapper {

  val irisDF = spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(Config.get("irisData"))

  val Array(trainingDF, testDF) = irisDF.randomSplit(Array(0.7, 0.3), seed = 12345)

  def withVectorizedFeatures(
    // The original listed "SepalLengthCm" twice and omitted "SepalWidthCm".
    featureColNames: Array[String] = Array("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"),
    outputColName: String = "features"
  )(df: DataFrame): DataFrame = {
    val assembler: VectorAssembler = new VectorAssembler()
      .setInputCols(featureColNames)
      .setOutputCol(outputColName)
    assembler.transform(df)
  }

  def model(df: DataFrame = trainingDF): KMeansModel = {
    val trainFeatures: DataFrame = df
      .transform(withVectorizedFeatures())

    new KMeans()
      .setK(3) // Three clusters, matching the three iris species.
      .setSeed(2L)
      .fit(trainFeatures)
  }

  def persistModel(): Unit = {
    model().save("./tmp/iris_kMeans_model/")
  }
}
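The held-out split can then be scored with the fitted model. A minimal sketch, assuming the object above:

// Vectorize the held-out rows with the same feature pipeline, then assign
// each flower to a cluster; transform() appends a "prediction" column.
val testFeatures = IrisKMeansClustering.testDF
  .transform(IrisKMeansClustering.withVectorizedFeatures())
val predictions = IrisKMeansClustering.model().transform(testFeatures)
predictions.select("features", "prediction").show(5)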