org.apache.spark.ml.clustering.KMeansModel Scala Examples

The following examples show how to use org.apache.spark.ml.clustering.KMeansModel. Each example notes its original project and source file above the code.
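For orientation, here is a minimal, self-contained sketch of the standard Spark ML workflow the examples below build on: fit a KMeans estimator, score a DataFrame with the resulting KMeansModel, then persist and reload the fitted model. The input path and parameter values are placeholders.

import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("kmeans-sketch").getOrCreate()

// Any DataFrame with a Vector-typed "features" column works; LIBSVM is one convenient source.
val data = spark.read.format("libsvm").load("data/sample_kmeans_data.txt")

// Fit a k-means model and score the same data.
val model: KMeansModel = new KMeans().setK(2).setSeed(1L).fit(data)
val predictions = model.transform(data)   // adds a "prediction" column
predictions.show()

// Persist the fitted model and load it back, as several examples below do.
model.write.overwrite().save("/tmp/kmeans_model")
val reloaded = KMeansModel.load("/tmp/kmeans_model")
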
Example 1
Source File: KmeansPrediction.scala    From piflow    with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.ml_clustering

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.clustering.KMeansModel
import org.apache.spark.sql.SparkSession

class KmeansPrediction extends ConfigurableStop {
  val authorEmail: String = "[email protected]"
  val description: String = "Use an existing KmeansModel to predict"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)
  var test_data_path: String = _
  var model_path: String = _


  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    // load the test data stored in LIBSVM format as a DataFrame
    val data = spark.read.format("libsvm").load(test_data_path)

    // load the previously trained KMeansModel
    val model = KMeansModel.load(model_path)

    // score the data and pass the predictions downstream
    val predictions = model.transform(data)
    predictions.show()
    out.write(predictions)

  }

  def initialize(ctx: ProcessContext): Unit = {

  }


  def setProperties(map: Map[String, Any]): Unit = {
    test_data_path = MapUtil.get(map, key = "test_data_path").asInstanceOf[String]
    model_path = MapUtil.get(map, key = "model_path").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val test_data_path = new PropertyDescriptor().name("test_data_path").displayName("TEST_DATA_PATH").defaultValue("").required(true)
    val model_path = new PropertyDescriptor().name("model_path").displayName("MODEL_PATH").defaultValue("").required(true)
    descriptor = test_data_path :: descriptor
    descriptor = model_path :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/ml_clustering/KmeansPrediction.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MLGroup.toString)
  }

} 
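For context, a minimal sketch of configuring this stop outside a full piflow flow; the paths are placeholders, and in a real flow the piflow runtime supplies the JobContext and invokes perform.

val stop = new KmeansPrediction()
stop.setProperties(Map(
  "test_data_path" -> "hdfs://host/path/test.libsvm",   // placeholder path to LIBSVM test data
  "model_path"     -> "hdfs://host/path/kmeans_model"   // placeholder path to a saved KMeansModel
))
// stop.perform(in, out, pec) is then called by the engine with the flow's streams and context.
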
Example 2
Source File: LocalKMeansModel.scala    From spark-ml-serving    with Apache License 2.0
package io.hydrosphere.spark_ml_serving.clustering

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.clustering.KMeansModel
import org.apache.spark.mllib.clustering.{KMeansModel => OldKMeansModel}
import org.apache.spark.mllib.linalg.{Vector => MLlibVec}

import scala.reflect.runtime.universe

class LocalKMeansModel(override val sparkTransformer: KMeansModel)
  extends LocalTransformer[KMeansModel] {
  // Spark's KMeansModel keeps its mllib parent model private, so it is pulled out
  // via reflection to allow prediction without a SparkSession.
  lazy val parent: OldKMeansModel = {
    val mirror     = universe.runtimeMirror(sparkTransformer.getClass.getClassLoader)
    val parentTerm = universe.typeOf[KMeansModel].decl(universe.TermName("parentModel")).asTerm
    mirror.reflect(sparkTransformer).reflectField(parentTerm).get.asInstanceOf[OldKMeansModel]
  }

  override def transform(localData: LocalData): LocalData = {
    import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._

    localData.column(sparkTransformer.getFeaturesCol) match {
      case Some(column) =>
        val newColumn = LocalDataColumn(
          sparkTransformer.getPredictionCol,
          column.data.mapToMlLibVectors.map(x => parent.predict(x))
        )
        localData.withColumn(newColumn)
      case None => localData
    }
  }
}

object LocalKMeansModel
  extends SimpleModelLoader[KMeansModel]
  with TypedTransformerConverter[KMeansModel] {

  override def build(metadata: Metadata, data: LocalData): KMeansModel = {
    // Reconstruct the mllib cluster centers that were persisted with the model
    val mapRows = data.toMapList
    val centers = mapRows map { row =>
      val vec = DataUtils.constructVector(row("clusterCenter").asInstanceOf[Map[String, Any]])
      org.apache.spark.mllib.linalg.Vectors.fromML(vec)
    }
    val parentConstructor = classOf[OldKMeansModel].getDeclaredConstructor(classOf[Array[MLlibVec]])
    parentConstructor.setAccessible(true)
    val mlk = parentConstructor.newInstance(centers.toArray)

    // The ml.KMeansModel constructor is not public, so it is invoked reflectively as well
    val constructor =
      classOf[KMeansModel].getDeclaredConstructor(classOf[String], classOf[OldKMeansModel])
    constructor.setAccessible(true)
    var inst = constructor
      .newInstance(metadata.uid, mlk)
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])

    inst = inst.set(inst.k, metadata.paramMap("k").asInstanceOf[Number].intValue())
    inst = inst.set(inst.initMode, metadata.paramMap("initMode").asInstanceOf[String])
    inst = inst.set(inst.maxIter, metadata.paramMap("maxIter").asInstanceOf[Number].intValue())
    inst = inst.set(inst.initSteps, metadata.paramMap("initSteps").asInstanceOf[Number].intValue())
    inst = inst.set(inst.seed, metadata.paramMap("seed").toString.toLong)
    inst = inst.set(inst.tol, metadata.paramMap("tol").asInstanceOf[Double])
    inst
  }
  override implicit def toLocal(transformer: KMeansModel): LocalKMeansModel =
    new LocalKMeansModel(transformer)
} 
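A short sketch of what the converter provides, assuming a KMeansModel has already been trained and saved with Spark; loading still needs Spark, but subsequent prediction does not.

val sparkModel = KMeansModel.load("/path/to/kmeans_model")   // placeholder path
val localModel = LocalKMeansModel.toLocal(sparkModel)
// localModel.transform(localData) then scores a LocalData batch without a SparkSession,
// using the reflected mllib parent model shown above.
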
Example 3
Source File: KMeansOp.scala    From mleap    with Apache License 2.0
package org.apache.spark.ml.bundle.ops.clustering

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.clustering.KMeansModel
import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
import org.apache.spark.mllib.clustering
import org.apache.spark.mllib.linalg.Vectors


class KMeansOp extends SimpleSparkOp[KMeansModel] {
  override val Model: OpModel[SparkBundleContext, KMeansModel] = new OpModel[SparkBundleContext, KMeansModel] {
    override val klazz: Class[KMeansModel] = classOf[KMeansModel]

    override def opName: String = Bundle.BuiltinOps.clustering.k_means

    override def store(model: Model, obj: KMeansModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      // Persist the cluster centers as dense tensors along with the feature dimension
      model.withValue("cluster_centers", Value.tensorList(obj.clusterCenters.map(cc => Tensor.denseVector(cc.toArray)))).
        withValue("num_features", Value.long(obj.clusterCenters.head.size))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): KMeansModel = {
      val clusterCenters = model.value("cluster_centers").
        getTensorList[Double].toArray.
        map(t => Vectors.dense(t.toArray))
      val mllibModel = new clustering.KMeansModel(clusterCenters)

      new KMeansModel(uid = "", parentModel = mllibModel)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: KMeansModel): KMeansModel = {
    val clusterCenters = model.clusterCenters.map {
      case DenseVector(values) => Vectors.dense(values)
      case SparseVector(size, indices, values) => Vectors.sparse(size, indices, values)
    }
    new KMeansModel(uid = uid, parentModel = new clustering.KMeansModel(clusterCenters))
  }

  override def sparkInputs(obj: KMeansModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: KMeansModel): Seq[SimpleParamSpec] = {
    Seq("prediction" -> obj.predictionCol)
  }
} 
Example 4
Source File: IrisKMeansClustering.scala    From spark-spec    with MIT License
package com.github.mrpowers.spark.spec.ml.clustering

import com.github.mrpowers.spark.spec.Config
import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.DataFrame

object IrisKMeansClustering
  extends SparkSessionWrapper {

  val irisDF = spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(Config.get("irisData"))

  val Array(trainingDF, testDF) = irisDF.randomSplit(Array(0.7, 0.3), seed = 12345)

  def withVectorizedFeatures(
    featureColNames: Array[String] = Array("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"),
    outputColName: String = "features"
  )(df: DataFrame): DataFrame = {
    val assembler: VectorAssembler = new VectorAssembler()
      .setInputCols(featureColNames)
      .setOutputCol(outputColName)
    assembler.transform(df)
  }

  def model(df: DataFrame = trainingDF): KMeansModel = {
    val trainFeatures: DataFrame = df
      .transform(withVectorizedFeatures())

    new KMeans()
      .setK(3) // # of clusters
      .setSeed(2L)
      .fit(trainFeatures)
  }

  def persistModel(): Unit = {
    model().save("./tmp/iris_kMeans_model/")
  }

} 
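A brief sketch of exercising the object above, assuming the iris CSV referenced by Config is available; the held-out split is vectorized and scored with the fitted model.

val kMeansModel = IrisKMeansClustering.model()
val scoredTest = kMeansModel.transform(
  IrisKMeansClustering.testDF.transform(IrisKMeansClustering.withVectorizedFeatures())
)
scoredTest.select("features", "prediction").show(5)
IrisKMeansClustering.persistModel()   // writes the model to ./tmp/iris_kMeans_model/
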
Example 5
Source File: KMeans.scala    From aardpfark    with Apache License 2.0
package com.ibm.aardpfark.spark.ml.clustering

import com.ibm.aardpfark.pfa.dsl.StringExpr
import com.ibm.aardpfark.pfa.document.{Cell, PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.dsl._
import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.pfa.types.WithSchema
import com.ibm.aardpfark.spark.ml.PFAModel
import com.sksamuel.avro4s.{AvroNamespace, AvroSchema}
import org.apache.avro.{Schema, SchemaBuilder}

import org.apache.spark.ml.clustering.KMeansModel

@AvroNamespace("com.ibm.aardpfark.exec.spark.ml.clustering")
case class Cluster(id: Int, center: Seq[Double])

@AvroNamespace("com.ibm.aardpfark.exec.spark.ml.clustering")
case class KMeansModelData(clusters: Seq[Cluster]) extends WithSchema {
  override def schema: Schema = AvroSchema[this.type]
}

class PFAKMeansModel(override val sparkTransformer: KMeansModel) extends PFAModel[KMeansModelData] {

  private val inputCol = sparkTransformer.getFeaturesCol
  private val outputCol = sparkTransformer.getPredictionCol
  private val inputExpr = StringExpr(s"input.${inputCol}")

  override def inputSchema = {
    SchemaBuilder.record(withUid(inputBaseName)).fields()
      .name(inputCol).`type`().array().items().doubleType().noDefault()
      .endRecord()
  }

  override def outputSchema = SchemaBuilder.record(withUid(outputBaseName)).fields()
    .name(outputCol).`type`().intType().noDefault()
    .endRecord()

  override def cell = {
    val clusters = sparkTransformer.clusterCenters.zipWithIndex.map { case (v, i) =>
      Cluster(i, v.toArray)
    }
    Cell(KMeansModelData(clusters))
  }

  override def action: PFAExpression = {
    val closest = model.cluster.closest(inputExpr, modelCell.ref("clusters"))
    NewRecord(outputSchema, Map(outputCol -> Attr(closest, "id")))
  }

  override def pfa: PFADocument = {
    PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withCell(modelCell)
      .withAction(action)
      .pfa
  }

}
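A brief sketch of generating the PFA document from a fitted model (fittedKMeansModel is assumed to be a fitted org.apache.spark.ml.clustering.KMeansModel in scope); serializing the resulting PFADocument to PFA JSON is left to the library's own serializers.

val pfaModel = new PFAKMeansModel(fittedKMeansModel)
val doc: PFADocument = pfaModel.pfa   // PFA document with the cluster centers embedded as a model cell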