org.apache.spark.ml.clustering.KMeansModel Scala Examples
The following examples show how to use org.apache.spark.ml.clustering.KMeansModel.
Each example comes from an open-source project; the source file, project name, and license are noted above the code.
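Before the project examples, here is a minimal, self-contained sketch of how a KMeansModel is typically obtained and applied with the standard Spark ML API. The input path and the k/seed values are placeholders, not taken from any of the examples below.

import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.sql.SparkSession

object KMeansModelSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("kmeans-sketch").getOrCreate()

    // placeholder path; libsvm files already provide a "features" column
    val dataset = spark.read.format("libsvm").load("data/sample_kmeans_data.txt")

    // fitting a KMeans estimator produces a KMeansModel
    val model: KMeansModel = new KMeans().setK(2).setSeed(1L).fit(dataset)

    // the model is a Transformer: it appends a prediction column
    model.transform(dataset).show()

    // the learned cluster centers are available directly
    model.clusterCenters.foreach(println)
  }
}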
Example 1
Source File: KmeansPrediction.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.ml_clustering

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.clustering.KMeansModel
import org.apache.spark.sql.SparkSession

class KmeansPrediction extends ConfigurableStop {

  val authorEmail: String = "[email protected]"
  val description: String = "Use an existing KmeansModel to predict"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var test_data_path: String = _
  var model_path: String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()

    // load data stored in libsvm format as a dataframe
    val data = spark.read.format("libsvm").load(test_data_path)
    // data.show()

    // load model
    val model = KMeansModel.load(model_path)

    val predictions = model.transform(data)
    predictions.show()

    out.write(predictions)
  }

  def initialize(ctx: ProcessContext): Unit = {}

  def setProperties(map: Map[String, Any]): Unit = {
    test_data_path = MapUtil.get(map, key = "test_data_path").asInstanceOf[String]
    model_path = MapUtil.get(map, key = "model_path").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val test_data_path = new PropertyDescriptor()
      .name("test_data_path")
      .displayName("TEST_DATA_PATH")
      .defaultValue("")
      .required(true)
    val model_path = new PropertyDescriptor()
      .name("model_path")
      .displayName("MODEL_PATH")
      .defaultValue("")
      .required(true)
    descriptor = test_data_path :: descriptor
    descriptor = model_path :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/ml_clustering/KmeansPrediction.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MLGroup.toString)
  }
}
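Outside of the piflow pipeline machinery, the stop above boils down to two Spark calls: KMeansModel.load and transform. A minimal standalone sketch, where both paths are placeholders (in the stop they arrive via setProperties):

import org.apache.spark.ml.clustering.KMeansModel
import org.apache.spark.sql.SparkSession

object KmeansPredictionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("kmeans-prediction-sketch").getOrCreate()

    // placeholder paths for the libsvm test data and the saved model directory
    val testData = spark.read.format("libsvm").load("/tmp/kmeans/test.libsvm")
    val model = KMeansModel.load("/tmp/kmeans/model")

    // append the prediction column and display the result
    model.transform(testData).show()
  }
}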
Example 2
Source File: LocalKMeansModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.clustering

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.clustering.KMeansModel
import org.apache.spark.mllib.clustering.{KMeansModel => OldKMeansModel}
import org.apache.spark.mllib.linalg.{Vector => MLlibVec}

import scala.reflect.runtime.universe

class LocalKMeansModel(override val sparkTransformer: KMeansModel)
  extends LocalTransformer[KMeansModel] {

  // The underlying mllib model is not publicly accessible on KMeansModel,
  // so it is extracted via runtime reflection.
  lazy val parent: OldKMeansModel = {
    val mirror = universe.runtimeMirror(sparkTransformer.getClass.getClassLoader)
    val parentTerm = universe.typeOf[KMeansModel].decl(universe.TermName("parentModel")).asTerm
    mirror.reflect(sparkTransformer).reflectField(parentTerm).get.asInstanceOf[OldKMeansModel]
  }

  override def transform(localData: LocalData): LocalData = {
    import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._

    localData.column(sparkTransformer.getFeaturesCol) match {
      case Some(column) =>
        val newColumn = LocalDataColumn(
          sparkTransformer.getPredictionCol,
          column.data.mapToMlLibVectors.map(x => parent.predict(x))
        )
        localData.withColumn(newColumn)
      case None => localData
    }
  }
}

object LocalKMeansModel
  extends SimpleModelLoader[KMeansModel]
  with TypedTransformerConverter[KMeansModel] {

  override def build(metadata: Metadata, data: LocalData): KMeansModel = {
    val mapRows = data.toMapList
    val centers = mapRows map { row =>
      val vec = DataUtils.constructVector(row("clusterCenter").asInstanceOf[Map[String, Any]])
      org.apache.spark.mllib.linalg.Vectors.fromML(vec)
    }

    // Both constructors are non-public, so the model is rebuilt reflectively
    // from the stored cluster centers and parameters.
    val parentConstructor = classOf[OldKMeansModel].getDeclaredConstructor(classOf[Array[MLlibVec]])
    parentConstructor.setAccessible(true)
    val mlk = parentConstructor.newInstance(centers.toArray)

    val constructor = classOf[KMeansModel].getDeclaredConstructor(classOf[String], classOf[OldKMeansModel])
    constructor.setAccessible(true)
    var inst = constructor
      .newInstance(metadata.uid, mlk)
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])

    inst = inst.set(inst.k, metadata.paramMap("k").asInstanceOf[Number].intValue())
    inst = inst.set(inst.initMode, metadata.paramMap("initMode").asInstanceOf[String])
    inst = inst.set(inst.maxIter, metadata.paramMap("maxIter").asInstanceOf[Number].intValue())
    inst = inst.set(inst.initSteps, metadata.paramMap("initSteps").asInstanceOf[Number].intValue())
    inst = inst.set(inst.seed, metadata.paramMap("seed").toString.toLong)
    inst = inst.set(inst.tol, metadata.paramMap("tol").asInstanceOf[Double])
    inst
  }

  override implicit def toLocal(transformer: KMeansModel) = new LocalKMeansModel(transformer)
}
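The point of this wrapper is to score rows without a SparkSession. A minimal usage sketch, assuming a LocalData instance that already holds the features column (the import paths are inferred from the wildcard import above):

import io.hydrosphere.spark_ml_serving.clustering.LocalKMeansModel
import io.hydrosphere.spark_ml_serving.common.LocalData
import org.apache.spark.ml.clustering.KMeansModel

object LocalScoringSketch {
  // sparkModel is a fitted KMeansModel; localData is assumed to carry the features column
  def scoreLocally(sparkModel: KMeansModel, localData: LocalData): LocalData = {
    val local = new LocalKMeansModel(sparkModel)
    // predicts with the reflected mllib model, no SparkSession required
    local.transform(localData)
  }
}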
Example 3
Source File: KMeansOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.clustering

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.clustering.KMeansModel
import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
import org.apache.spark.mllib.clustering
import org.apache.spark.mllib.linalg.Vectors

class KMeansOp extends SimpleSparkOp[KMeansModel] {

  override val Model: OpModel[SparkBundleContext, KMeansModel] =
    new OpModel[SparkBundleContext, KMeansModel] {
      override val klazz: Class[KMeansModel] = classOf[KMeansModel]

      override def opName: String = Bundle.BuiltinOps.clustering.k_means

      override def store(model: Model, obj: KMeansModel)
                        (implicit context: BundleContext[SparkBundleContext]): Model = {
        model
          .withValue("cluster_centers",
            Value.tensorList(obj.clusterCenters.map(cc => Tensor.denseVector(cc.toArray))))
          .withValue("num_features", Value.long(obj.clusterCenters.head.size))
      }

      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): KMeansModel = {
        val clusterCenters = model.value("cluster_centers")
          .getTensorList[Double].toArray
          .map(t => Vectors.dense(t.toArray))
        val mllibModel = new clustering.KMeansModel(clusterCenters)

        new KMeansModel(uid = "", parentModel = mllibModel)
      }
    }

  override def sparkLoad(uid: String, shape: NodeShape, model: KMeansModel): KMeansModel = {
    val clusterCenters = model.clusterCenters.map {
      case DenseVector(values) => Vectors.dense(values)
      case SparseVector(size, indices, values) => Vectors.sparse(size, indices, values)
    }
    new KMeansModel(uid = uid, parentModel = new clustering.KMeansModel(clusterCenters))
  }

  override def sparkInputs(obj: KMeansModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: KMeansModel): Seq[SimpleParamSpec] = {
    Seq("prediction" -> obj.predictionCol)
  }
}
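Both load paths above rebuild a KMeansModel purely from its cluster centers: wrap the centers in an mllib KMeansModel, then pass that as the parent of the ml KMeansModel. A minimal sketch of the same pattern; the centers and uid are placeholders, and the code must live in a package under org.apache.spark.ml because the KMeansModel constructor is not public (which is presumably why KMeansOp itself is declared there):

package org.apache.spark.ml.bundle.example // assumed placement to reach the package-private constructor

import org.apache.spark.ml.clustering.KMeansModel
import org.apache.spark.mllib.clustering.{KMeansModel => MLlibKMeansModel}
import org.apache.spark.mllib.linalg.Vectors

object RebuildKMeansModelSketch {
  // illustrative centers; load() above deserializes them from the "cluster_centers" value
  val centers = Array(Vectors.dense(0.0, 0.0), Vectors.dense(1.0, 1.0))

  // wrap the centers in an mllib model, then in the ml model, mirroring load()/sparkLoad()
  val model = new KMeansModel(uid = "kmeans_example", parentModel = new MLlibKMeansModel(centers))
}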
Example 4
Source File: IrisKMeansClustering.scala From spark-spec with MIT License
package com.github.mrpowers.spark.spec.ml.clustering

import com.github.mrpowers.spark.spec.Config
import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.DataFrame

object IrisKMeansClustering extends SparkSessionWrapper {

  val irisDF = spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(Config.get("irisData"))

  val Array(trainingDF, testDF) = irisDF.randomSplit(Array(0.7, 0.3), seed = 12345)

  def withVectorizedFeatures(
    // note: the original listed "SepalLengthCm" twice; "SepalWidthCm" is used here instead
    featureColNames: Array[String] = Array("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"),
    outputColName: String = "features"
  )(df: DataFrame): DataFrame = {
    val assembler: VectorAssembler = new VectorAssembler()
      .setInputCols(featureColNames)
      .setOutputCol(outputColName)
    assembler.transform(df)
  }

  def model(df: DataFrame = trainingDF): KMeansModel = {
    val trainFeatures: DataFrame = df
      .transform(withVectorizedFeatures())

    new KMeans()
      .setK(3) // number of clusters
      .setSeed(2L)
      .fit(trainFeatures)
  }

  def persistModel(): Unit = {
    model().save("./tmp/iris_kMeans_model/")
  }
}
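The persisted model can later be reloaded and applied to the held-out split. A short sketch that reuses the object's own helpers and assumes it runs with the same SparkSession context and the path written by persistModel():

import org.apache.spark.ml.clustering.KMeansModel

object ReloadIrisModelSketch {
  def main(args: Array[String]): Unit = {
    // reload the model written by IrisKMeansClustering.persistModel()
    val reloaded = KMeansModel.load("./tmp/iris_kMeans_model/")

    // vectorize the held-out split with the same feature columns and score it
    val testFeatures = IrisKMeansClustering.testDF
      .transform(IrisKMeansClustering.withVectorizedFeatures())
    reloaded.transform(testFeatures).show()
  }
}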
Example 5
Source File: KMeans.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.clustering

import com.ibm.aardpfark.pfa.dsl.StringExpr
import com.ibm.aardpfark.pfa.document.{Cell, PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.dsl._
import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.pfa.types.WithSchema
import com.ibm.aardpfark.spark.ml.PFAModel
import com.sksamuel.avro4s.{AvroNamespace, AvroSchema}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.spark.ml.clustering.KMeansModel

@AvroNamespace("com.ibm.aardpfark.exec.spark.ml.clustering")
case class Cluster(id: Int, center: Seq[Double])

@AvroNamespace("com.ibm.aardpfark.exec.spark.ml.clustering")
case class KMeansModelData(clusters: Seq[Cluster]) extends WithSchema {
  override def schema: Schema = AvroSchema[this.type]
}

class PFAKMeansModel(override val sparkTransformer: KMeansModel)
  extends PFAModel[KMeansModelData] {

  private val inputCol = sparkTransformer.getFeaturesCol
  private val outputCol = sparkTransformer.getPredictionCol
  private val inputExpr = StringExpr(s"input.${inputCol}")

  override def inputSchema = {
    SchemaBuilder.record(withUid(inputBaseName)).fields()
      .name(inputCol).`type`().array().items().doubleType().noDefault()
      .endRecord()
  }

  override def outputSchema =
    SchemaBuilder.record(withUid(outputBaseName)).fields()
      .name(outputCol).`type`().intType().noDefault()
      .endRecord()

  override def cell = {
    val clusters = sparkTransformer.clusterCenters.zipWithIndex.map { case (v, i) =>
      Cluster(i, v.toArray)
    }
    Cell(KMeansModelData(clusters))
  }

  override def action: PFAExpression = {
    val closest = model.cluster.closest(inputExpr, modelCell.ref("clusters"))
    NewRecord(outputSchema, Map(outputCol -> Attr(closest, "id")))
  }

  override def pfa: PFADocument = {
    PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withCell(modelCell)
      .withAction(action)
      .pfa
  }
}
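Exporting a fitted model with this class reduces to wrapping it and asking for the PFA document. A minimal sketch, assuming a fitted KMeansModel is already available; how the resulting PFADocument is serialized to JSON depends on aardpfark's API and is not shown here:

import com.ibm.aardpfark.pfa.document.PFADocument
import com.ibm.aardpfark.spark.ml.clustering.PFAKMeansModel
import org.apache.spark.ml.clustering.KMeansModel

object PfaExportSketch {
  // sparkModel is assumed to be a KMeansModel produced by KMeans.fit
  def toPfaDocument(sparkModel: KMeansModel): PFADocument =
    new PFAKMeansModel(sparkModel).pfa
}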