org.apache.spark.mllib.clustering.KMeansModel Scala Example

Source File: PMMLModelExportFactory.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

private[mllib] object PMMLModelExportFactory {

  /**
   * Factory method: picks the [[PMMLModelExport]] implementation that matches the runtime
   * type of the given model.
   *
   * SVM models fall back to a threshold of 0.0 and binary logistic regression models to 0.5
   * when the model has no threshold set.
   *
   * @param model a trained model; the supported types are the ones matched below
   * @return the exporter wrapping `model`
   * @throws IllegalArgumentException if `model` is `null`, is a logistic regression model with
   *                                  a number of classes other than 2, or is of any other type
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = {
    model match {
      case kmeans: KMeansModel =>
        new KMeansPMMLModelExport(kmeans)
      case linear: LinearRegressionModel =>
        new GeneralizedLinearPMMLModelExport(linear, "linear regression")
      case ridge: RidgeRegressionModel =>
        new GeneralizedLinearPMMLModelExport(ridge, "ridge regression")
      case lasso: LassoModel =>
        new GeneralizedLinearPMMLModelExport(lasso, "lasso regression")
      case svm: SVMModel =>
        new BinaryClassificationPMMLModelExport(
          svm, "linear SVM", RegressionNormalizationMethodType.NONE,
          svm.getThreshold.getOrElse(0.0))
      case logistic: LogisticRegressionModel =>
        if (logistic.numClasses == 2) {
          new BinaryClassificationPMMLModelExport(
            logistic, "logistic regression", RegressionNormalizationMethodType.LOGIT,
            logistic.getThreshold.getOrElse(0.5))
        } else {
          throw new IllegalArgumentException(
            "PMML Export not supported for Multinomial Logistic Regression")
        }
      case null =>
        // Type patterns never match null, so without this case a null model would reach the
        // catch-all below and fail with a NullPointerException from model.getClass.
        throw new IllegalArgumentException("PMML Export not supported for a null model")
      case _ =>
        throw new IllegalArgumentException(
          "PMML Export not supported for model: " + model.getClass.getName)
    }
  }

}

Source File: PMMLModelExportFactorySuite.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
import org.apache.spark.mllib.util.LinearDataGenerator

/**
 * Checks that [[PMMLModelExportFactory]] returns the expected exporter type for each supported
 * model and rejects unsupported ones with an IllegalArgumentException.
 */
class PMMLModelExportFactorySuite extends SparkFunSuite {

  test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") {
    // Only the model's type matters here; the center values are arbitrary.
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))

    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))

    assert(exporter.isInstanceOf[KMeansPMMLModelExport])
  }

  test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a "
    + "LinearRegressionModel, RidgeRegressionModel or LassoModel") {
    // The first generated point supplies each model's two constructor arguments.
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val pointFeatures = generated(0).features
    val pointLabel = generated(0).label

    def assertGeneralizedLinearExport(model: Any): Unit = {
      val exporter = PMMLModelExportFactory.createPMMLModelExport(model)
      assert(exporter.isInstanceOf[GeneralizedLinearPMMLModelExport])
    }

    assertGeneralizedLinearExport(new LinearRegressionModel(pointFeatures, pointLabel))
    assertGeneralizedLinearExport(new RidgeRegressionModel(pointFeatures, pointLabel))
    assertGeneralizedLinearExport(new LassoModel(pointFeatures, pointLabel))
  }

  test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport "
    + "when passing a LogisticRegressionModel or SVMModel") {
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val pointFeatures = generated(0).features
    val pointLabel = generated(0).label

    def assertBinaryClassificationExport(model: Any): Unit = {
      val exporter = PMMLModelExportFactory.createPMMLModelExport(model)
      assert(exporter.isInstanceOf[BinaryClassificationPMMLModelExport])
    }

    assertBinaryClassificationExport(new LogisticRegressionModel(pointFeatures, pointLabel))
    assertBinaryClassificationExport(new SVMModel(pointFeatures, pointLabel))
  }

  test("PMMLModelExportFactory throw IllegalArgumentException "
    + "when passing a Multinomial Logistic Regression") {
    // numClasses = 3 makes the model non-binary, which the factory refuses to export.
    val threeClassModel = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4),
      intercept = 1.0,
      numFeatures = 2,
      numClasses = 3)

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(threeClassModel)
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") {
    // A plain Object matches none of the model types the factory handles.
    val unsupported = new Object

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(unsupported)
    }
  }
}

Source File: KMeansClustering_IBM.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.examples.mllib
import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.mllib.clustering.{ KMeans, KMeansModel }
import org.apache.spark.mllib.linalg.Vectors

        Vectors.dense(line.split(",").map(_.trim).filter(!"".equals(_)).map(_.toDouble))
      })
    parsedTestData.collect().foreach(testDataLine => {
      //计算测试数据分别属于那个簇类
      val predictedClusterIndex: Int = clusters.predict(testDataLine)
      println("测试样本: " + testDataLine.toString + " 属于聚类 " +
        predictedClusterIndex)
    })
    println("Spark MLlib K-means clustering test finished.")
    //评估KMeans模型 如何选择K值
    val ks: Array[Int] = Array(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 50, 80, 100)
    ks.foreach(cluster => {
      //parsedTrainingData训练模型数据
      val model: KMeansModel = KMeans.train(parsedTrainingData, cluster, 30, 1)
      //KMeansModel 类里提供了 computeCost 方法,该方法通过计算所有数据点到其最近的中心点的平方和来评估聚类的效果。  
      //统计聚类错误的样本比例
      val ssd = model.computeCost(parsedTrainingData)
      //model.predict(point)
      println("sum of squared distances of points to their nearest center when k=" + cluster + " -> " + ssd)
    })
  }
  /**
   * Tells whether a line is the header (column-name) line, so it can be filtered out.
   *
   * @param line a line of input; may be null
   * @return true when `line` is non-null and contains the text "Channel", false otherwise
   */
  private def isColumnNameLine(line: String): Boolean =
    Option(line).exists(_.contains("Channel"))
}

Source File: KMeansExample.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

object KMeansExample {

  /**
   * Trains a k-means model on the vectors parsed from saratoga.csv and predicts the cluster
   * of a few of the parsed rows.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KMeansClustering")
    val sc = new SparkContext(sparkConf)
    try {
      // Load saratoga.csv into an RDD of lines
      val data = sc.textFile("../data/mllib/saratoga.csv")
      // Convert each comma-separated line into a dense vector
      val parsedData = data.map(line => Vectors.dense(line.split(',').map(_.toDouble)))
      // Train the model with 4 clusters and 5 iterations
      val kmmodel = KMeans.train(parsedData, 4, 5)
      // Collect the parsed data to the driver as a local array
      val houses = parsedData.collect()
      // Predict the cluster of the first element; KMeans numbers cluster IDs from 0
      val prediction1 = kmmodel.predict(houses(0))
      // Predict the cluster of houses(18) (lot size 876, price 66.5 per the original note)
      val prediction2 = kmmodel.predict(houses(18))
      // Predict the cluster of houses(35) (lot size 15750, price 112 per the original note)
      val prediction3 = kmmodel.predict(houses(35))
      // Predict the cluster of houses(6) (lot size 38768, price 272 per the original note)
      val prediction4 = kmmodel.predict(houses(6))
      // Predict the cluster of houses(15) (lot size 69696, price 275 per the original note)
      val prediction5 = kmmodel.predict(houses(15))
    } finally {
      // Release the context even when loading, training or prediction fails.
      sc.stop()
    }
  }

}

Source File: KMeansPMMLModelExport.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel


  /**
   * Describes the given k-means model in the `pmml` object that is in scope.
   *
   * The header description is always set. When the model has at least one center, one
   * continuous double field named `field_<i>` is declared per component of the first center,
   * and one cluster named `cluster_<i>` is added per center, holding its coordinates as a
   * space-separated string. With no centers, nothing beyond the header description is set.
   *
   * @param model the k-means model whose cluster centers are exported
   */
  private def populateKMeansPMML(model: KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    val centers = model.clusterCenters
    if (centers.length > 0) {
      // Every field definition is derived from the dimension of the first center.
      val dimension = centers(0).size
      val dataDictionary = new DataDictionary
      val miningSchema = new MiningSchema
      val distanceMeasure = new ComparisonMeasure()
        .withKind(ComparisonMeasure.Kind.DISTANCE)
        .withMeasure(new SquaredEuclidean())
      val clusteringModel = new ClusteringModel()
        .withModelName("k-means")
        .withMiningSchema(miningSchema)
        .withComparisonMeasure(distanceMeasure)
        .withFunctionName(MiningFunctionType.CLUSTERING)
        .withModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .withNumberOfClusters(centers.length)

      (0 until dimension).foreach { index =>
        val field = FieldName.create("field_" + index)
        dataDictionary.withDataFields(new DataField(field, OpType.CONTINUOUS, DataType.DOUBLE))
        val miningField = new MiningField(field).withUsageType(FieldUsageType.ACTIVE)
        miningSchema.withMiningFields(miningField)
        val clusteringField =
          new ClusteringField(field).withCompareFunction(CompareFunctionType.ABS_DIFF)
        clusteringModel.withClusteringFields(clusteringField)
      }

      dataDictionary.withNumberOfFields(dataDictionary.getDataFields.size)

      centers.indices.foreach { index =>
        val centroid = new org.dmg.pmml.Array()
          .withType(Array.Type.REAL)
          .withN(dimension)
          .withValue(centers(index).toArray.mkString(" "))
        // Only the centroids are available, not the number of points per cluster, so the
        // cluster size is left unset.
        val cluster = new Cluster().withName("cluster_" + index).withArray(centroid)
        clusteringModel.withClusters(cluster)
      }

      pmml.setDataDictionary(dataDictionary)
      pmml.withModels(clusteringModel)
    }
  }
}

Source File: PMMLModelExportFactory.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

private[mllib] object PMMLModelExportFactory {

  /**
   * Factory method: picks the [[PMMLModelExport]] implementation that matches the runtime
   * type of the given model.
   *
   * SVM models fall back to a threshold of 0.0 and binary logistic regression models to 0.5
   * when the model has no threshold set.
   *
   * @param model a trained model; the supported types are the ones matched below
   * @return the exporter wrapping `model`
   * @throws IllegalArgumentException if `model` is `null`, is a logistic regression model with
   *                                  a number of classes other than 2, or is of any other type
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = {
    model match {
      case kmeans: KMeansModel =>
        new KMeansPMMLModelExport(kmeans)
      case linear: LinearRegressionModel =>
        new GeneralizedLinearPMMLModelExport(linear, "linear regression")
      case ridge: RidgeRegressionModel =>
        new GeneralizedLinearPMMLModelExport(ridge, "ridge regression")
      case lasso: LassoModel =>
        new GeneralizedLinearPMMLModelExport(lasso, "lasso regression")
      case svm: SVMModel =>
        new BinaryClassificationPMMLModelExport(
          svm, "linear SVM", RegressionNormalizationMethodType.NONE,
          svm.getThreshold.getOrElse(0.0))
      case logistic: LogisticRegressionModel =>
        if (logistic.numClasses == 2) {
          new BinaryClassificationPMMLModelExport(
            logistic, "logistic regression", RegressionNormalizationMethodType.LOGIT,
            logistic.getThreshold.getOrElse(0.5))
        } else {
          throw new IllegalArgumentException(
            "PMML Export not supported for Multinomial Logistic Regression")
        }
      case null =>
        // Type patterns never match null, so without this case a null model would reach the
        // catch-all below and fail with a NullPointerException from model.getClass.
        throw new IllegalArgumentException("PMML Export not supported for a null model")
      case _ =>
        throw new IllegalArgumentException(
          "PMML Export not supported for model: " + model.getClass.getName)
    }
  }

}

Source File: KMeansPMMLModelExportSuite.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/** Checks the PMML document produced when a KMeansModel is exported through the factory. */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    // Three cluster centers, each with three components.
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))

    // The exporter must expose a PMML document carrying the k-means description.
    assert(exporter.isInstanceOf[PMMLModelExport])
    val document = exporter.asInstanceOf[PMMLModelExport].getPmml
    assert(document.getHeader.getDescription === "k-means clustering")

    // The data dictionary declares one field per component of a center vector.
    val expectedFieldCount = centers(0).size
    assert(document.getDataDictionary.getNumberOfFields === expectedFieldCount)

    // The first attached model must be a clustering model (the cast fails otherwise) and
    // must report as many clusters as the Spark model has centers.
    val exportedModel = document.getModels.get(0).asInstanceOf[ClusteringModel]
    assert(exportedModel.getNumberOfClusters === centers.length)
  }

}

Source File: PMMLModelExportFactorySuite.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
import org.apache.spark.mllib.util.LinearDataGenerator

    val multiclassLogisticRegressionModel = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0,
      //numClasses 分类数
      numFeatures = 2, numClasses = 3)

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(multiclassLogisticRegressionModel)
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") {
    // A plain Object is not one of the model types passed to the factory elsewhere in this
    // suite, so the call is expected to be rejected.
    val unsupported: AnyRef = new Object

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(unsupported)
    }
  }
}

Source File: KMeansExample.scala From Spark-2.3.1 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
// $example off$

object KMeansExample {

  /**
   * Clusters the points in data/mllib/kmeans_data.txt with k-means, prints the within-set sum
   * of squared errors, then saves the model and loads it back.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("KMeansExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/kmeans_data.txt")
    val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()

    // Cluster the data into two classes using KMeans
    val numClusters = 2
    val numIterations = 20
    val clusters = KMeans.train(parsedData, numClusters, numIterations)

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    val WSSSE = clusters.computeCost(parsedData)
    println(s"Within Set Sum of Squared Errors = $WSSSE")

    // Save and load model
    clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
    val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: KMeansPMMLModelExport.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel


  /**
   * Describes the given k-means model in the `pmml` object that is in scope.
   *
   * The header description is always set. When the model has at least one center, one
   * continuous double field named `field_<i>` is declared per component of the first center,
   * and one cluster named `cluster_<i>` is added per center, holding its coordinates as a
   * space-separated string. With no centers, nothing beyond the header description is set.
   *
   * @param model the k-means model whose cluster centers are exported
   */
  private def populateKMeansPMML(model: KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    val centers = model.clusterCenters
    if (centers.length > 0) {
      // Every field definition is derived from the dimension of the first center.
      val dimension = centers(0).size
      val dataDictionary = new DataDictionary
      val miningSchema = new MiningSchema
      val distanceMeasure = new ComparisonMeasure()
        .setKind(ComparisonMeasure.Kind.DISTANCE)
        .setMeasure(new SquaredEuclidean())
      val clusteringModel = new ClusteringModel()
        .setModelName("k-means")
        .setMiningSchema(miningSchema)
        .setComparisonMeasure(distanceMeasure)
        .setFunctionName(MiningFunctionType.CLUSTERING)
        .setModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .setNumberOfClusters(centers.length)

      (0 until dimension).foreach { index =>
        val field = FieldName.create("field_" + index)
        dataDictionary.addDataFields(new DataField(field, OpType.CONTINUOUS, DataType.DOUBLE))
        val miningField = new MiningField(field).setUsageType(FieldUsageType.ACTIVE)
        miningSchema.addMiningFields(miningField)
        val clusteringField =
          new ClusteringField(field).setCompareFunction(CompareFunctionType.ABS_DIFF)
        clusteringModel.addClusteringFields(clusteringField)
      }

      dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size)

      centers.indices.foreach { index =>
        val centroid = new org.dmg.pmml.Array()
          .setType(Array.Type.REAL)
          .setN(dimension)
          .setValue(centers(index).toArray.mkString(" "))
        // Only the centroids are available, not the number of points per cluster, so the
        // cluster size is left unset.
        val cluster = new Cluster().setName("cluster_" + index).setArray(centroid)
        clusteringModel.addClusters(cluster)
      }

      pmml.setDataDictionary(dataDictionary)
      pmml.addModels(clusteringModel)
    }
  }
}

Source File: KMeansPMMLModelExportSuite.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/** Checks the PMML document produced when a KMeansModel is exported through the factory. */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    // Three cluster centers, each with three components.
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))

    // The exporter must expose a PMML document carrying the k-means description.
    assert(exporter.isInstanceOf[PMMLModelExport])
    val document = exporter.asInstanceOf[PMMLModelExport].getPmml
    assert(document.getHeader.getDescription === "k-means clustering")

    // The data dictionary declares one field per component of a center vector.
    val expectedFieldCount = centers(0).size
    assert(document.getDataDictionary.getNumberOfFields === expectedFieldCount)

    // The first attached model must be a clustering model (the cast fails otherwise) and
    // must report as many clusters as the Spark model has centers.
    val exportedModel = document.getModels.get(0).asInstanceOf[ClusteringModel]
    assert(exportedModel.getNumberOfClusters === centers.length)
  }

}

Source File: KMeansPMMLModelExportSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/** Checks the PMML document produced when a KMeansModel is exported through the factory. */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    // Three cluster centers, each with three components.
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))

    // The exporter must expose a PMML document carrying the k-means description.
    assert(exporter.isInstanceOf[PMMLModelExport])
    val document = exporter.asInstanceOf[PMMLModelExport].getPmml
    assert(document.getHeader.getDescription === "k-means clustering")

    // The data dictionary declares one field per component of a center vector.
    val expectedFieldCount = centers(0).size
    assert(document.getDataDictionary.getNumberOfFields === expectedFieldCount)

    // The first attached model must be a clustering model (the cast fails otherwise) and
    // must report as many clusters as the Spark model has centers.
    val exportedModel = document.getModels.get(0).asInstanceOf[ClusteringModel]
    assert(exportedModel.getNumberOfClusters === centers.length)
  }

}

Source File: PMMLModelExportFactorySuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
import org.apache.spark.mllib.util.LinearDataGenerator

/**
 * Checks that [[PMMLModelExportFactory]] returns the expected exporter type for each supported
 * model and rejects unsupported ones with an IllegalArgumentException.
 */
class PMMLModelExportFactorySuite extends SparkFunSuite {

  test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") {
    // Only the model's type matters here; the center values are arbitrary.
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))

    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))

    assert(exporter.isInstanceOf[KMeansPMMLModelExport])
  }

  test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a "
    + "LinearRegressionModel, RidgeRegressionModel or LassoModel") {
    // The first generated point supplies each model's two constructor arguments.
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val pointFeatures = generated(0).features
    val pointLabel = generated(0).label

    def assertGeneralizedLinearExport(model: Any): Unit = {
      val exporter = PMMLModelExportFactory.createPMMLModelExport(model)
      assert(exporter.isInstanceOf[GeneralizedLinearPMMLModelExport])
    }

    assertGeneralizedLinearExport(new LinearRegressionModel(pointFeatures, pointLabel))
    assertGeneralizedLinearExport(new RidgeRegressionModel(pointFeatures, pointLabel))
    assertGeneralizedLinearExport(new LassoModel(pointFeatures, pointLabel))
  }

  test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport "
    + "when passing a LogisticRegressionModel or SVMModel") {
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val pointFeatures = generated(0).features
    val pointLabel = generated(0).label

    def assertBinaryClassificationExport(model: Any): Unit = {
      val exporter = PMMLModelExportFactory.createPMMLModelExport(model)
      assert(exporter.isInstanceOf[BinaryClassificationPMMLModelExport])
    }

    assertBinaryClassificationExport(new LogisticRegressionModel(pointFeatures, pointLabel))
    assertBinaryClassificationExport(new SVMModel(pointFeatures, pointLabel))
  }

  test("PMMLModelExportFactory throw IllegalArgumentException "
    + "when passing a Multinomial Logistic Regression") {
    // numClasses = 3 makes the model non-binary, which the factory refuses to export.
    val threeClassModel = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4),
      intercept = 1.0,
      numFeatures = 2,
      numClasses = 3)

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(threeClassModel)
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") {
    // A plain Object matches none of the model types the factory handles.
    val unsupported = new Object

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(unsupported)
    }
  }
}

Source File: KMeansPMMLModelExport.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel


  /**
   * Describes the given k-means model in the `pmml` object that is in scope.
   *
   * The header description is always set. When the model has at least one center, one
   * continuous double field named `field_<i>` is declared per component of the first center,
   * and one cluster named `cluster_<i>` is added per center, holding its coordinates as a
   * space-separated string. With no centers, nothing beyond the header description is set.
   *
   * @param model the k-means model whose cluster centers are exported
   */
  private def populateKMeansPMML(model: KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    val centers = model.clusterCenters
    if (centers.length > 0) {
      // Every field definition is derived from the dimension of the first center.
      val dimension = centers(0).size
      val dataDictionary = new DataDictionary
      val miningSchema = new MiningSchema
      val distanceMeasure = new ComparisonMeasure()
        .withKind(ComparisonMeasure.Kind.DISTANCE)
        .withMeasure(new SquaredEuclidean())
      val clusteringModel = new ClusteringModel()
        .withModelName("k-means")
        .withMiningSchema(miningSchema)
        .withComparisonMeasure(distanceMeasure)
        .withFunctionName(MiningFunctionType.CLUSTERING)
        .withModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .withNumberOfClusters(centers.length)

      (0 until dimension).foreach { index =>
        val field = FieldName.create("field_" + index)
        dataDictionary.withDataFields(new DataField(field, OpType.CONTINUOUS, DataType.DOUBLE))
        val miningField = new MiningField(field).withUsageType(FieldUsageType.ACTIVE)
        miningSchema.withMiningFields(miningField)
        val clusteringField =
          new ClusteringField(field).withCompareFunction(CompareFunctionType.ABS_DIFF)
        clusteringModel.withClusteringFields(clusteringField)
      }

      dataDictionary.withNumberOfFields(dataDictionary.getDataFields.size)

      centers.indices.foreach { index =>
        val centroid = new org.dmg.pmml.Array()
          .withType(Array.Type.REAL)
          .withN(dimension)
          .withValue(centers(index).toArray.mkString(" "))
        // Only the centroids are available, not the number of points per cluster, so the
        // cluster size is left unset.
        val cluster = new Cluster().withName("cluster_" + index).withArray(centroid)
        clusteringModel.withClusters(cluster)
      }

      pmml.setDataDictionary(dataDictionary)
      pmml.withModels(clusteringModel)
    }
  }
}

Source File: PMMLModelExportFactory.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

private[mllib] object PMMLModelExportFactory {

  /**
   * Factory method: picks the [[PMMLModelExport]] implementation that matches the runtime
   * type of the given model.
   *
   * SVM models fall back to a threshold of 0.0 and binary logistic regression models to 0.5
   * when the model has no threshold set.
   *
   * @param model a trained model; the supported types are the ones matched below
   * @return the exporter wrapping `model`
   * @throws IllegalArgumentException if `model` is `null`, is a logistic regression model with
   *                                  a number of classes other than 2, or is of any other type
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = {
    model match {
      case kmeans: KMeansModel =>
        new KMeansPMMLModelExport(kmeans)
      case linear: LinearRegressionModel =>
        new GeneralizedLinearPMMLModelExport(linear, "linear regression")
      case ridge: RidgeRegressionModel =>
        new GeneralizedLinearPMMLModelExport(ridge, "ridge regression")
      case lasso: LassoModel =>
        new GeneralizedLinearPMMLModelExport(lasso, "lasso regression")
      case svm: SVMModel =>
        new BinaryClassificationPMMLModelExport(
          svm, "linear SVM", RegressionNormalizationMethodType.NONE,
          svm.getThreshold.getOrElse(0.0))
      case logistic: LogisticRegressionModel =>
        if (logistic.numClasses == 2) {
          new BinaryClassificationPMMLModelExport(
            logistic, "logistic regression", RegressionNormalizationMethodType.LOGIT,
            logistic.getThreshold.getOrElse(0.5))
        } else {
          throw new IllegalArgumentException(
            "PMML Export not supported for Multinomial Logistic Regression")
        }
      case null =>
        // Type patterns never match null, so without this case a null model would reach the
        // catch-all below and fail with a NullPointerException from model.getClass.
        throw new IllegalArgumentException("PMML Export not supported for a null model")
      case _ =>
        throw new IllegalArgumentException(
          "PMML Export not supported for model: " + model.getClass.getName)
    }
  }

}

Source File: KMeansPMMLModelExportSuite.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/** Checks the PMML document produced when a KMeansModel is exported through the factory. */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    // Three cluster centers, each with three components.
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))

    // The exporter must expose a PMML document carrying the k-means description.
    assert(exporter.isInstanceOf[PMMLModelExport])
    val document = exporter.asInstanceOf[PMMLModelExport].getPmml
    assert(document.getHeader.getDescription === "k-means clustering")

    // The data dictionary declares one field per component of a center vector.
    val expectedFieldCount = centers(0).size
    assert(document.getDataDictionary.getNumberOfFields === expectedFieldCount)

    // The first attached model must be a clustering model (the cast fails otherwise) and
    // must report as many clusters as the Spark model has centers.
    val exportedModel = document.getModels.get(0).asInstanceOf[ClusteringModel]
    assert(exportedModel.getNumberOfClusters === centers.length)
  }

}

Source File: PMMLModelExportFactorySuite.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
import org.apache.spark.mllib.util.LinearDataGenerator

/** Verifies which exporter type the factory picks for each kind of model. */
class PMMLModelExportFactorySuite extends SparkFunSuite {

  test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") {
    val centroids = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))

    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centroids))

    assert(exporter.isInstanceOf[KMeansPMMLModelExport])
  }

  test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a "
    + "LinearRegressionModel, RidgeRegressionModel or LassoModel") {
    // A single generated point supplies the weights (features) and intercept (label).
    val points = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val sample = points(0)

    val regressionModels = Seq(
      new LinearRegressionModel(sample.features, sample.label),
      new RidgeRegressionModel(sample.features, sample.label),
      new LassoModel(sample.features, sample.label))

    regressionModels.foreach { regressionModel =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(regressionModel)
      assert(exporter.isInstanceOf[GeneralizedLinearPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport "
    + "when passing a LogisticRegressionModel or SVMModel") {
    val points = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val sample = points(0)

    val classifiers = Seq(
      new LogisticRegressionModel(sample.features, sample.label),
      new SVMModel(sample.features, sample.label))

    classifiers.foreach { classifier =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(classifier)
      assert(exporter.isInstanceOf[BinaryClassificationPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException "
    + "when passing a Multinomial Logistic Regression") {
    // Three classes make this a multinomial model, which the factory rejects.
    val threeClassModel = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0,
      numFeatures = 2, numClasses = 3)

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(threeClassModel)
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") {
    val notAModel = new Object

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(notAModel)
    }
  }
}

Source File: KmeansModelSaveToOss.scala From MaxCompute-Spark with Apache License 2.0

5 votes

package com.aliyun.odps.spark.examples.mllib

import org.apache.spark.mllib.clustering.KMeans._
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Trains a small k-means model, saves it under [[modelOssDir]], loads it back and checks that the
 * reloaded model predicts the same cluster for every training point.
 */
object KmeansModelSaveToOss {
  /** Location the model is written to and read back from. */
  val modelOssDir = "oss://bucket/kmeans-model"

  /**
   * Entry point.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {

    // OSS access settings handed to the Hadoop file system layer.
    val conf = new SparkConf().setAppName("KmeansModelSaveToOss")
    conf.set("spark.hadoop.fs.oss.credentials.provider", "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider")
    conf.set("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole")
    conf.set("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com")

    val sc = new SparkContext(conf)
    // Stop the context even when training, saving or one of the assertions fails.
    try {
      // 1. train and save the model
      val points = Seq(
        Vectors.dense(0.0, 0.0),
        Vectors.dense(0.0, 0.1),
        Vectors.dense(0.1, 0.0),
        Vectors.dense(9.0, 0.0),
        Vectors.dense(9.0, 0.2),
        Vectors.dense(9.2, 0.0)
      )
      val rdd = sc.parallelize(points, 3)
      val initMode = K_MEANS_PARALLEL
      val model = KMeans.train(rdd, k = 2, maxIterations = 2, runs = 1, initMode)
      val predictResult1 = rdd.map(feature => "cluster id: " + model.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect()
      println("modelOssDir=" + modelOssDir)
      model.save(sc, modelOssDir)

      // 2. predict from the model loaded back from OSS
      val modelLoadOss = KMeansModel.load(sc, modelOssDir)
      val predictResult2 = rdd.map(feature => "cluster id: " + modelLoadOss.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect()

      // Both models must produce the same set of prediction strings.
      assert(predictResult1.size == predictResult2.size)
      predictResult2.foreach(result2 => assert(predictResult1.contains(result2)))
    } finally {
      sc.stop()
    }
  }
}

Source File: KmeansModelSaveToOss.scala From MaxCompute-Spark with Apache License 2.0

5 votes

package com.aliyun.odps.spark.examples.mllib

import org.apache.spark.mllib.clustering.KMeans._
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SparkSession

/**
 * Trains a small k-means model, saves it under [[modelOssDir]], loads it back and checks that the
 * reloaded model predicts the same cluster for every training point.
 */
object KmeansModelSaveToOss {
  /** Location the model is written to and read back from. */
  val modelOssDir = "oss://bucket/kmeans-model"

  /**
   * Entry point.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {

    // OSS access settings are passed through the session configuration.
    val spark = SparkSession
      .builder()
      .config("spark.hadoop.fs.oss.credentials.provider", "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider")
      .config("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole")
      .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com")
      .appName("KmeansModelSaveToOss")
      .getOrCreate()

    // Stop the session even when training, saving or one of the assertions fails.
    try {
      // 1. train and save the model
      val sc = spark.sparkContext
      val points = Seq(
        Vectors.dense(0.0, 0.0),
        Vectors.dense(0.0, 0.1),
        Vectors.dense(0.1, 0.0),
        Vectors.dense(9.0, 0.0),
        Vectors.dense(9.0, 0.2),
        Vectors.dense(9.2, 0.0)
      )
      val rdd = sc.parallelize(points, 3)
      val initMode = K_MEANS_PARALLEL
      val model = KMeans.train(rdd, k = 2, maxIterations = 2, runs = 1, initMode)
      val predictResult1 = rdd.map(feature => "cluster id: " + model.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect()
      println("modelOssDir=" + modelOssDir)
      model.save(sc, modelOssDir)

      // 2. predict from the model loaded back from OSS
      val modelLoadOss = KMeansModel.load(sc, modelOssDir)
      val predictResult2 = rdd.map(feature => "cluster id: " + modelLoadOss.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect()

      // Both models must produce the same set of prediction strings.
      assert(predictResult1.size == predictResult2.size)
      predictResult2.foreach(result2 => assert(predictResult1.contains(result2)))
    } finally {
      spark.stop()
    }
  }
}

Source File: PMMLModelExportFactory.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

/** Chooses the PMML exporter that matches the runtime type of a model. */
private[mllib] object PMMLModelExportFactory {

  /**
   * Builds the PMML exporter for the given model.
   *
   * K-means models map to [[KMeansPMMLModelExport]]; linear, ridge and lasso regression models map
   * to [[GeneralizedLinearPMMLModelExport]]; SVM and binary logistic regression models map to
   * [[BinaryClassificationPMMLModelExport]], using the model's threshold or, when none is set,
   * 0.0 for SVM and 0.5 for logistic regression.
   *
   * @param model the model to export
   * @return the exporter wrapping `model`
   * @throws IllegalArgumentException if `model` is `null`, is a logistic regression model with a
   *                                  number of classes other than 2, or is of an unsupported type
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = {
    model match {
      // Type patterns never match null; without this case the fallback below would fail with a
      // NullPointerException on model.getClass instead of the documented exception.
      case null =>
        throw new IllegalArgumentException("PMML Export not supported for model: null")
      case kmeans: KMeansModel =>
        new KMeansPMMLModelExport(kmeans)
      case linear: LinearRegressionModel =>
        new GeneralizedLinearPMMLModelExport(linear, "linear regression")
      case ridge: RidgeRegressionModel =>
        new GeneralizedLinearPMMLModelExport(ridge, "ridge regression")
      case lasso: LassoModel =>
        new GeneralizedLinearPMMLModelExport(lasso, "lasso regression")
      case svm: SVMModel =>
        new BinaryClassificationPMMLModelExport(
          svm, "linear SVM", RegressionNormalizationMethodType.NONE,
          svm.getThreshold.getOrElse(0.0))
      case logistic: LogisticRegressionModel =>
        if (logistic.numClasses == 2) {
          new BinaryClassificationPMMLModelExport(
            logistic, "logistic regression", RegressionNormalizationMethodType.LOGIT,
            logistic.getThreshold.getOrElse(0.5))
        } else {
          throw new IllegalArgumentException(
            "PMML Export not supported for Multinomial Logistic Regression")
        }
      case _ =>
        throw new IllegalArgumentException(
          "PMML Export not supported for model: " + model.getClass.getName)
    }
  }

}

Source File: PMMLModelExportFactory.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

/** Chooses the PMML exporter that matches the runtime type of a model. */
private[mllib] object PMMLModelExportFactory {

  /**
   * Builds the PMML exporter for the given model.
   *
   * K-means models map to [[KMeansPMMLModelExport]]; linear, ridge and lasso regression models map
   * to [[GeneralizedLinearPMMLModelExport]]; SVM and binary logistic regression models map to
   * [[BinaryClassificationPMMLModelExport]], using the model's threshold or, when none is set,
   * 0.0 for SVM and 0.5 for logistic regression.
   *
   * @param model the model to export
   * @return the exporter wrapping `model`
   * @throws IllegalArgumentException if `model` is `null`, is a logistic regression model with a
   *                                  number of classes other than 2, or is of an unsupported type
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = {
    model match {
      // Type patterns never match null; without this case the fallback below would fail with a
      // NullPointerException on model.getClass instead of the documented exception.
      case null =>
        throw new IllegalArgumentException("PMML Export not supported for model: null")
      case kmeans: KMeansModel =>
        new KMeansPMMLModelExport(kmeans)
      case linear: LinearRegressionModel =>
        new GeneralizedLinearPMMLModelExport(linear, "linear regression")
      case ridge: RidgeRegressionModel =>
        new GeneralizedLinearPMMLModelExport(ridge, "ridge regression")
      case lasso: LassoModel =>
        new GeneralizedLinearPMMLModelExport(lasso, "lasso regression")
      case svm: SVMModel =>
        new BinaryClassificationPMMLModelExport(
          svm, "linear SVM", RegressionNormalizationMethodType.NONE,
          svm.getThreshold.getOrElse(0.0))
      case logistic: LogisticRegressionModel =>
        if (logistic.numClasses == 2) {
          new BinaryClassificationPMMLModelExport(
            logistic, "logistic regression", RegressionNormalizationMethodType.LOGIT,
            logistic.getThreshold.getOrElse(0.5))
        } else {
          throw new IllegalArgumentException(
            "PMML Export not supported for Multinomial Logistic Regression")
        }
      case _ =>
        throw new IllegalArgumentException(
          "PMML Export not supported for model: " + model.getClass.getName)
    }
  }

}

Source File: KMeansPMMLModelExportSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/** Checks the PMML document produced for a k-means model obtained through the export factory. */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    // Three 3-dimensional centroids.
    val centroids = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centroids))

    // The factory result must be a PMMLModelExport before its document is inspected.
    assert(exporter.isInstanceOf[PMMLModelExport])
    val document = exporter.asInstanceOf[PMMLModelExport].getPmml

    assert(document.getHeader.getDescription === "k-means clustering")

    // The data dictionary is expected to hold one field per centroid dimension.
    val expectedFieldCount = centroids(0).size
    assert(document.getDataDictionary.getNumberOfFields === expectedFieldCount)

    // The first attached model has to be a clustering model (the cast fails otherwise) and must
    // report as many clusters as the Spark model has centroids.
    val exportedModel = document.getModels.get(0).asInstanceOf[ClusteringModel]
    assert(exportedModel.getNumberOfClusters === centroids.length)
  }

}

Source File: PMMLModelExportFactorySuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
import org.apache.spark.mllib.util.LinearDataGenerator

/** Verifies which exporter type the factory picks for each kind of model. */
class PMMLModelExportFactorySuite extends SparkFunSuite {

  test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") {
    val centroids = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))

    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centroids))

    assert(exporter.isInstanceOf[KMeansPMMLModelExport])
  }

  test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a "
    + "LinearRegressionModel, RidgeRegressionModel or LassoModel") {
    // A single generated point supplies the weights (features) and intercept (label).
    val points = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val sample = points(0)

    val regressionModels = Seq(
      new LinearRegressionModel(sample.features, sample.label),
      new RidgeRegressionModel(sample.features, sample.label),
      new LassoModel(sample.features, sample.label))

    regressionModels.foreach { regressionModel =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(regressionModel)
      assert(exporter.isInstanceOf[GeneralizedLinearPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport "
    + "when passing a LogisticRegressionModel or SVMModel") {
    val points = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val sample = points(0)

    val classifiers = Seq(
      new LogisticRegressionModel(sample.features, sample.label),
      new SVMModel(sample.features, sample.label))

    classifiers.foreach { classifier =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(classifier)
      assert(exporter.isInstanceOf[BinaryClassificationPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException "
    + "when passing a Multinomial Logistic Regression") {
    // Three classes make this a multinomial model, which the factory rejects.
    val threeClassModel = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0,
      numFeatures = 2, numClasses = 3)

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(threeClassModel)
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") {
    val notAModel = new Object

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(notAModel)
    }
  }
}

Source File: StreamingKMeansSuite.scala From spark-structured-streaming-ml with Apache License 2.0

5 votes

package com.highperformancespark.examples.structuredstreaming

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.scalatest.FunSuite
import org.apache.log4j.{Level, Logger}

/**
 * Row type fed into the in-memory test stream.
 *
 * @param features the point's coordinates, as an `org.apache.spark.ml.linalg.Vector`
 */
case class TestRow(features: Vector)

/** Tests for the streaming k-means estimator driven through an in-memory stream. */
class StreamingKMeansSuite extends FunSuite with DataFrameSuiteBase {

  /** Turns off logging for every logger under "org" once the shared session is set up. */
  override def beforeAll(): Unit = {
    super.beforeAll()
    Logger.getLogger("org").setLevel(Level.OFF)
  }

  test("streaming model with one center should converge to true center") {
    import spark.implicits._
    val k = 1
    val dim = 5
    val clusterSpread = 0.1
    val seed = 63
    // TODO: this test is very flaky. The centers do not converge for some
    // (most?) random seeds
    val (batches, trueCenters) =
      StreamingKMeansSuite.generateBatches(100, 80, k, dim, clusterSpread, seed)
    val inputStream = MemoryStream[TestRow]
    val ds = inputStream.toDS()
    val skm = new StreamingKMeans().setK(k).setRandomCenters(dim, 0.01)
    val query = skm.evilTrain(ds.toDF())
    // Stop the streaming query even when an assertion below fails, so it does not leak into
    // later tests.
    try {
      // Feed one batch at a time and capture the model after each batch has been processed.
      val streamingModels = batches.map { batch =>
        inputStream.addData(batch)
        query.processAllAvailable()
        skm.getModel
      }
      // TODO: use spark's testing suite
      streamingModels.last.centers.zip(trueCenters).foreach {
        case (center, trueCenter) =>
          // Distinct names avoid shadowing the outer trueCenters.
          val centerText = center.toArray.mkString(",")
          val trueCenterText = trueCenter.toArray.mkString(",")
          println(s"${centerText} | ${trueCenterText}")
          // Every coordinate must be within 0.1 of the true center.
          assert(center.toArray.zip(trueCenter.toArray).forall(
            x => math.abs(x._1 - x._2) < 0.1))
      }
    } finally {
      query.stop()
    }
  }

  /**
   * Asserts that a batch-trained model and a streaming model have equal cluster centers.
   *
   * @param batchModel the model trained in batch mode
   * @param streamingModel the model trained on the stream
   * @param validationData currently unused; reserved for the prediction comparison below
   */
  def compareBatchAndStreaming(
      batchModel: KMeansModel,
      streamingModel: StreamingKMeansModel,
      validationData: DataFrame): Unit = {
    assert(batchModel.clusterCenters === streamingModel.centers)
    // TODO: implement prediction comparison
  }

}

/** Data generation helpers for [[StreamingKMeansSuite]]. */
object StreamingKMeansSuite {

  /**
   * Generates batches of points scattered around a set of centers.
   *
   * Point `idx` of every batch is placed around `centers(idx % k)`, each coordinate being the
   * center's coordinate plus Gaussian noise scaled by `r`.
   *
   * @param numPoints number of points per batch
   * @param numBatches number of batches
   * @param k number of centers cycled through; also the number of centers generated when
   *          `initCenters` is `null`
   * @param d dimension of every point
   * @param r scale applied to the Gaussian noise
   * @param seed seed of the random generator
   * @param initCenters centers to use; when `null`, `k` centers are drawn from a standard
   *                    Gaussian
   * @return the generated batches together with the centers used
   */
  def generateBatches(
      numPoints: Int,
      numBatches: Int,
      k: Int,
      d: Int,
      r: Double,
      seed: Int,
      initCenters: Array[Vector] = null):
      (IndexedSeq[IndexedSeq[TestRow]], Array[Vector]) = {
    // A private generator yields the same sequence for a given seed without reseeding the
    // process-wide scala.util.Random singleton that other code may share.
    val rand = new scala.util.Random(seed)
    // Option(null) is None, so random centers are only drawn when none were supplied.
    val centers: Array[Vector] = Option(initCenters).getOrElse(
      Array.fill(k)(Vectors.dense(Array.fill(d)(rand.nextGaussian()))))
    val data = (0 until numBatches).map { _ =>
      (0 until numPoints).map { idx =>
        val center = centers(idx % k)
        val vec = Vectors.dense(
          Array.tabulate(d)(x => center(x) + rand.nextGaussian() * r))
        TestRow(vec)
      }
    }
    (data, centers)
  }
}

Source File: KMeansExample.scala From sparkoscope with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
// $example off$

/**
 * Example that clusters the points of a text file with k-means, prints the clustering cost and
 * saves and reloads the model.
 */
object KMeansExample {

  /**
   * Entry point.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("KMeansExample")
    val sc = new SparkContext(conf)

    // NOTE(review): the "$example on$"/"$example off$" markers look like they delimit the snippet
    // extracted for documentation; keep them and the lines between them intact.
    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/kmeans_data.txt")
    val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()

    // Cluster the data into two classes using KMeans
    val numClusters = 2
    val numIterations = 20
    val clusters = KMeans.train(parsedData, numClusters, numIterations)

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    val WSSSE = clusters.computeCost(parsedData)
    println("Within Set Sum of Squared Errors = " + WSSSE)

    // Save and load model
    clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
    val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: KMeansPMMLModelExport.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel


  /**
   * Fills the `pmml` document with a center-based clustering model describing `model`.
   *
   * The header description is always set. The data dictionary and the clustering model are only
   * attached when the model has at least one cluster center; the first center's size determines
   * the number of fields.
   *
   * @param model the k-means model whose centers are exported
   */
  private def populateKMeansPMML(model: KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    val centers = model.clusterCenters
    if (centers.length > 0) {
      val dimension = centers(0).size
      val dictionary = new DataDictionary
      val schema = new MiningSchema
      val distance = new ComparisonMeasure()
        .setKind(ComparisonMeasure.Kind.DISTANCE)
        .setMeasure(new SquaredEuclidean())
      val kmeansPmml = new ClusteringModel()
        .setModelName("k-means")
        .setMiningSchema(schema)
        .setComparisonMeasure(distance)
        .setFunctionName(MiningFunctionType.CLUSTERING)
        .setModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .setNumberOfClusters(centers.length)

      // Register one continuous double field per dimension in the dictionary, the mining schema
      // and the clustering model.
      (0 until dimension).foreach { idx =>
        val field = FieldName.create("field_" + idx)
        dictionary.addDataFields(new DataField(field, OpType.CONTINUOUS, DataType.DOUBLE))
        schema.addMiningFields(new MiningField(field).setUsageType(FieldUsageType.ACTIVE))
        kmeansPmml.addClusteringFields(
          new ClusteringField(field).setCompareFunction(CompareFunctionType.ABS_DIFF))
      }

      dictionary.setNumberOfFields(dictionary.getDataFields.size)

      // Each center becomes a named cluster whose coordinates are a space-separated REAL array.
      // Only the centroids are known, so no cluster size is set.
      centers.indices.foreach { idx =>
        val coordinates = new org.dmg.pmml.Array()
          .setType(Array.Type.REAL)
          .setN(dimension)
          .setValue(centers(idx).toArray.mkString(" "))
        kmeansPmml.addClusters(new Cluster().setName("cluster_" + idx).setArray(coordinates))
      }

      pmml.setDataDictionary(dictionary)
      pmml.addModels(kmeansPmml)
    }
  }
}

Source File: PMMLModelExportFactory.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

/** Chooses the PMML exporter that matches the runtime type of a model. */
private[mllib] object PMMLModelExportFactory {

  /**
   * Builds the PMML exporter for the given model.
   *
   * K-means models map to [[KMeansPMMLModelExport]]; linear, ridge and lasso regression models map
   * to [[GeneralizedLinearPMMLModelExport]]; SVM and binary logistic regression models map to
   * [[BinaryClassificationPMMLModelExport]], using the model's threshold or, when none is set,
   * 0.0 for SVM and 0.5 for logistic regression.
   *
   * @param model the model to export
   * @return the exporter wrapping `model`
   * @throws IllegalArgumentException if `model` is `null`, is a logistic regression model with a
   *                                  number of classes other than 2, or is of an unsupported type
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = {
    model match {
      // Type patterns never match null; without this case the fallback below would fail with a
      // NullPointerException on model.getClass instead of the documented exception.
      case null =>
        throw new IllegalArgumentException("PMML Export not supported for model: null")
      case kmeans: KMeansModel =>
        new KMeansPMMLModelExport(kmeans)
      case linear: LinearRegressionModel =>
        new GeneralizedLinearPMMLModelExport(linear, "linear regression")
      case ridge: RidgeRegressionModel =>
        new GeneralizedLinearPMMLModelExport(ridge, "ridge regression")
      case lasso: LassoModel =>
        new GeneralizedLinearPMMLModelExport(lasso, "lasso regression")
      case svm: SVMModel =>
        new BinaryClassificationPMMLModelExport(
          svm, "linear SVM", RegressionNormalizationMethodType.NONE,
          svm.getThreshold.getOrElse(0.0))
      case logistic: LogisticRegressionModel =>
        if (logistic.numClasses == 2) {
          new BinaryClassificationPMMLModelExport(
            logistic, "logistic regression", RegressionNormalizationMethodType.LOGIT,
            logistic.getThreshold.getOrElse(0.5))
        } else {
          throw new IllegalArgumentException(
            "PMML Export not supported for Multinomial Logistic Regression")
        }
      case _ =>
        throw new IllegalArgumentException(
          "PMML Export not supported for model: " + model.getClass.getName)
    }
  }

}

Source File: KMeansPMMLModelExportSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/** Checks the PMML document produced for a k-means model obtained through the export factory. */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    // Three 3-dimensional centroids.
    val centroids = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centroids))

    // The factory result must be a PMMLModelExport before its document is inspected.
    assert(exporter.isInstanceOf[PMMLModelExport])
    val document = exporter.asInstanceOf[PMMLModelExport].getPmml

    assert(document.getHeader.getDescription === "k-means clustering")

    // The data dictionary is expected to hold one field per centroid dimension.
    val expectedFieldCount = centroids(0).size
    assert(document.getDataDictionary.getNumberOfFields === expectedFieldCount)

    // The first attached model has to be a clustering model (the cast fails otherwise) and must
    // report as many clusters as the Spark model has centroids.
    val exportedModel = document.getModels.get(0).asInstanceOf[ClusteringModel]
    assert(exportedModel.getNumberOfClusters === centroids.length)
  }

}

Source File: PMMLModelExportFactorySuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
import org.apache.spark.mllib.util.LinearDataGenerator

/** Verifies which exporter type the factory picks for each kind of model. */
class PMMLModelExportFactorySuite extends SparkFunSuite {

  test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") {
    val centroids = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))

    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centroids))

    assert(exporter.isInstanceOf[KMeansPMMLModelExport])
  }

  test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a "
    + "LinearRegressionModel, RidgeRegressionModel or LassoModel") {
    // A single generated point supplies the weights (features) and intercept (label).
    val points = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val sample = points(0)

    val regressionModels = Seq(
      new LinearRegressionModel(sample.features, sample.label),
      new RidgeRegressionModel(sample.features, sample.label),
      new LassoModel(sample.features, sample.label))

    regressionModels.foreach { regressionModel =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(regressionModel)
      assert(exporter.isInstanceOf[GeneralizedLinearPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport "
    + "when passing a LogisticRegressionModel or SVMModel") {
    val points = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val sample = points(0)

    val classifiers = Seq(
      new LogisticRegressionModel(sample.features, sample.label),
      new SVMModel(sample.features, sample.label))

    classifiers.foreach { classifier =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(classifier)
      assert(exporter.isInstanceOf[BinaryClassificationPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException "
    + "when passing a Multinomial Logistic Regression") {
    // Three classes make this a multinomial model, which the factory rejects.
    val threeClassModel = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0,
      numFeatures = 2, numClasses = 3)

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(threeClassModel)
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") {
    val notAModel = new Object

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(notAModel)
    }
  }
}

Source File: KMeansExample.scala From multi-tenancy-spark with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
// $example off$

/**
 * Example that clusters the points of a text file with k-means, prints the clustering cost and
 * saves and reloads the model.
 */
object KMeansExample {

  /**
   * Entry point.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("KMeansExample")
    val sc = new SparkContext(conf)

    // NOTE(review): the "$example on$"/"$example off$" markers look like they delimit the snippet
    // extracted for documentation; keep them and the lines between them intact.
    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/kmeans_data.txt")
    val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()

    // Cluster the data into two classes using KMeans
    val numClusters = 2
    val numIterations = 20
    val clusters = KMeans.train(parsedData, numClusters, numIterations)

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    val WSSSE = clusters.computeCost(parsedData)
    println("Within Set Sum of Squared Errors = " + WSSSE)

    // Save and load model
    clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
    val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: KMeansPMMLModelExport.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel


  /**
   * Fills the `pmml` document with a center-based clustering model describing `model`.
   *
   * The header description is always set. The data dictionary and the clustering model are only
   * attached when the model has at least one cluster center; the first center's size determines
   * the number of fields.
   *
   * @param model the k-means model whose centers are exported
   */
  private def populateKMeansPMML(model: KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    val centers = model.clusterCenters
    if (centers.length > 0) {
      val dimension = centers(0).size
      val dictionary = new DataDictionary
      val schema = new MiningSchema
      val distance = new ComparisonMeasure()
        .setKind(ComparisonMeasure.Kind.DISTANCE)
        .setMeasure(new SquaredEuclidean())
      val kmeansPmml = new ClusteringModel()
        .setModelName("k-means")
        .setMiningSchema(schema)
        .setComparisonMeasure(distance)
        .setFunctionName(MiningFunctionType.CLUSTERING)
        .setModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .setNumberOfClusters(centers.length)

      // Register one continuous double field per dimension in the dictionary, the mining schema
      // and the clustering model.
      (0 until dimension).foreach { idx =>
        val field = FieldName.create("field_" + idx)
        dictionary.addDataFields(new DataField(field, OpType.CONTINUOUS, DataType.DOUBLE))
        schema.addMiningFields(new MiningField(field).setUsageType(FieldUsageType.ACTIVE))
        kmeansPmml.addClusteringFields(
          new ClusteringField(field).setCompareFunction(CompareFunctionType.ABS_DIFF))
      }

      dictionary.setNumberOfFields(dictionary.getDataFields.size)

      // Each center becomes a named cluster whose coordinates are a space-separated REAL array.
      // Only the centroids are known, so no cluster size is set.
      centers.indices.foreach { idx =>
        val coordinates = new org.dmg.pmml.Array()
          .setType(Array.Type.REAL)
          .setN(dimension)
          .setValue(centers(idx).toArray.mkString(" "))
        kmeansPmml.addClusters(new Cluster().setName("cluster_" + idx).setArray(coordinates))
      }

      pmml.setDataDictionary(dictionary)
      pmml.addModels(kmeansPmml)
    }
  }
}

Source File: PMMLModelExportFactory.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

/**
 * Factory that picks the PMML exporter matching the runtime type of a trained model.
 */
private[mllib] object PMMLModelExportFactory {

  /**
   * Wraps `model` in the [[PMMLModelExport]] implementation that knows how to export it.
   *
   * K-means models get a clustering exporter, linear/ridge/lasso regression models get a
   * generalized-linear exporter, and SVM / two-class logistic regression models get a
   * binary-classification exporter.
   *
   * @param model the trained model to export
   * @return the exporter built around `model`
   * @throws IllegalArgumentException if `model` is a logistic regression with a number of
   *                                  classes other than 2, or any type not matched below
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = model match {
    case km: KMeansModel =>
      new KMeansPMMLModelExport(km)

    case linearModel: LinearRegressionModel =>
      new GeneralizedLinearPMMLModelExport(linearModel, "linear regression")

    case ridgeModel: RidgeRegressionModel =>
      new GeneralizedLinearPMMLModelExport(ridgeModel, "ridge regression")

    case lassoModel: LassoModel =>
      new GeneralizedLinearPMMLModelExport(lassoModel, "lasso regression")

    case svmModel: SVMModel =>
      // When the model carries no threshold, 0.0 is used.
      val threshold = svmModel.getThreshold.getOrElse(0.0)
      new BinaryClassificationPMMLModelExport(
        svmModel, "linear SVM", RegressionNormalizationMethodType.NONE, threshold)

    case binaryLogistic: LogisticRegressionModel if binaryLogistic.numClasses == 2 =>
      // When the model carries no threshold, 0.5 is used.
      val threshold = binaryLogistic.getThreshold.getOrElse(0.5)
      new BinaryClassificationPMMLModelExport(
        binaryLogistic, "logistic regression", RegressionNormalizationMethodType.LOGIT,
        threshold)

    case _: LogisticRegressionModel =>
      // Reached only when numClasses != 2 (the guarded case above handles exactly 2).
      throw new IllegalArgumentException(
        "PMML Export not supported for Multinomial Logistic Regression")

    case unsupported =>
      throw new IllegalArgumentException(
        s"PMML Export not supported for model: ${unsupported.getClass.getName}")
  }

}

Source File: KMeansPMMLModelExportSuite.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/** Checks the PMML document generated for a [[KMeansModel]]. */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    // Three 3-dimensional centroids.
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))

    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))

    // The factory result must be a PMML exporter exposing its document.
    assert(exporter.isInstanceOf[PMMLModelExport])
    val document = exporter.asInstanceOf[PMMLModelExport].getPmml

    assert(document.getHeader.getDescription === "k-means clustering")

    // One data-dictionary field is expected per centroid dimension.
    val expectedFieldCount = centers.head.size
    assert(document.getDataDictionary.getNumberOfFields === expectedFieldCount)

    // The first attached model must be a clustering model that reports as many clusters
    // as the Spark model has centroids.
    val exportedModel = document.getModels.get(0).asInstanceOf[ClusteringModel]
    assert(exportedModel.getNumberOfClusters === centers.length)
  }

}

Source File: PMMLModelExportFactorySuite.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
import org.apache.spark.mllib.util.LinearDataGenerator

/** Verifies which exporter `PMMLModelExportFactory` selects for each kind of model. */
class PMMLModelExportFactorySuite extends SparkFunSuite {

  test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") {
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))

    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))

    assert(exporter.isInstanceOf[KMeansPMMLModelExport])
  }

  test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a " +
    "LinearRegressionModel, RidgeRegressionModel or LassoModel") {
    // The features and label of one generated point are reused as constructor arguments
    // for every regression model below.
    val point = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17).head

    val regressionModels = Seq[Any](
      new LinearRegressionModel(point.features, point.label),
      new RidgeRegressionModel(point.features, point.label),
      new LassoModel(point.features, point.label))

    regressionModels.foreach { candidate =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(candidate)
      assert(exporter.isInstanceOf[GeneralizedLinearPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport " +
    "when passing a LogisticRegressionModel or SVMModel") {
    val point = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17).head

    val classifiers = Seq[Any](
      new LogisticRegressionModel(point.features, point.label),
      new SVMModel(point.features, point.label))

    classifiers.foreach { candidate =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(candidate)
      assert(exporter.isInstanceOf[BinaryClassificationPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException " +
    "when passing a Multinomial Logistic Regression") {
    // numClasses = 3 makes this a non-binary logistic regression model.
    val threeClassModel = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0,
      numFeatures = 2, numClasses = 3)

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(threeClassModel)
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") {
    // A plain Object is not one of the model types the factory matches on.
    val notAModel = new Object

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(notAModel)
    }
  }
}

Source File: KMeansPMMLModelExport.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel


  /**
   * Fills `pmml` (defined outside this method) with a clustering model describing `model`.
   *
   * The header description is always set. If the model has at least one centroid, one
   * continuous double field named `field_<i>` is registered per dimension of the first
   * centroid, one `cluster_<j>` entry is added per centroid, and the data dictionary and
   * clustering model are attached to `pmml`. With no centroids nothing else is added.
   *
   * @param model k-means model whose centroids are exported
   */
  private def populateKMeansPMML(model : KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    val centroids = model.clusterCenters
    if (centroids.nonEmpty) {
      // The first centroid's size is used as the dimension for all fields and cluster arrays.
      val dimension = centroids.head.size

      val dictionary = new DataDictionary
      val schema = new MiningSchema
      val distance = new ComparisonMeasure()
        .withKind(ComparisonMeasure.Kind.DISTANCE)
        .withMeasure(new SquaredEuclidean())
      val kMeans = new ClusteringModel()
        .withModelName("k-means")
        .withMiningSchema(schema)
        .withComparisonMeasure(distance)
        .withFunctionName(MiningFunctionType.CLUSTERING)
        .withModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .withNumberOfClusters(centroids.length)

      // Register each dimension in the data dictionary, mining schema and clustering fields.
      (0 until dimension).foreach { idx =>
        val field = FieldName.create(s"field_$idx")
        dictionary.withDataFields(new DataField(field, OpType.CONTINUOUS, DataType.DOUBLE))
        schema.withMiningFields(new MiningField(field).withUsageType(FieldUsageType.ACTIVE))
        kMeans.withClusteringFields(
          new ClusteringField(field).withCompareFunction(CompareFunctionType.ABS_DIFF))
      }

      dictionary.withNumberOfFields(dictionary.getDataFields.size)

      for ((center, idx) <- centroids.zipWithIndex) {
        // Coordinates are written as a space-separated list of reals.
        val coordinates = new org.dmg.pmml.Array()
          .withType(Array.Type.REAL)
          .withN(dimension)
          .withValue(center.toArray.mkString(" "))
        // Only the centroids are available here, not the per-cluster sizes, so no size is set.
        kMeans.withClusters(new Cluster().withName(s"cluster_$idx").withArray(coordinates))
      }

      pmml.setDataDictionary(dictionary)
      pmml.withModels(kMeans)
    }
  }
}

Source File: KMeansExample.scala From drizzle-spark with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
// $example off$

/**
 * Example application: clusters the vectors in `data/mllib/kmeans_data.txt` with k-means,
 * prints the resulting cost, and saves then reloads the trained model.
 */
object KMeansExample {

  /**
   * Runs the example end to end on a freshly created SparkContext.
   *
   * @param args command-line arguments; not read by this example
   */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("KMeansExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data: each line holds space-separated doubles forming one vector
    val data = sc.textFile("data/mllib/kmeans_data.txt")
    val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()

    // Cluster the data into two classes using KMeans
    val numClusters = 2
    val numIterations = 20
    val clusters = KMeans.train(parsedData, numClusters, numIterations)

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    val WSSSE = clusters.computeCost(parsedData)
    println(s"Within Set Sum of Squared Errors = $WSSSE")

    // Save and load model
    clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
    val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: KMeansPMMLModelExport.scala From drizzle-spark with Apache License 2.0

4 votes

package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel


  /**
   * Fills `pmml` (defined outside this method) with a clustering model describing `model`.
   *
   * The header description is always set. If the model has at least one centroid, one
   * continuous double field named `field_<i>` is registered per dimension of the first
   * centroid, one `cluster_<j>` entry is added per centroid, and the data dictionary and
   * clustering model are attached to `pmml`. With no centroids nothing else is added.
   *
   * @param model k-means model whose centroids are exported
   */
  private def populateKMeansPMML(model: KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    val centroids = model.clusterCenters
    if (centroids.nonEmpty) {
      // The first centroid's size is used as the dimension for all fields and cluster arrays.
      val dimension = centroids.head.size

      val dictionary = new DataDictionary
      val schema = new MiningSchema
      val distance = new ComparisonMeasure()
        .setKind(ComparisonMeasure.Kind.DISTANCE)
        .setMeasure(new SquaredEuclidean())
      val kMeans = new ClusteringModel()
        .setModelName("k-means")
        .setMiningSchema(schema)
        .setComparisonMeasure(distance)
        .setFunctionName(MiningFunctionType.CLUSTERING)
        .setModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .setNumberOfClusters(centroids.length)

      // Register each dimension in the data dictionary, mining schema and clustering fields.
      (0 until dimension).foreach { idx =>
        val field = FieldName.create(s"field_$idx")
        dictionary.addDataFields(new DataField(field, OpType.CONTINUOUS, DataType.DOUBLE))
        schema.addMiningFields(new MiningField(field).setUsageType(FieldUsageType.ACTIVE))
        kMeans.addClusteringFields(
          new ClusteringField(field).setCompareFunction(CompareFunctionType.ABS_DIFF))
      }

      dictionary.setNumberOfFields(dictionary.getDataFields.size)

      centroids.zipWithIndex.foreach { case (center, idx) =>
        // Coordinates are written as a space-separated list of reals.
        val coordinates = new org.dmg.pmml.Array()
          .setType(Array.Type.REAL)
          .setN(dimension)
          .setValue(center.toArray.mkString(" "))
        // Only the centroids are available here, not the per-cluster sizes, so no size is set.
        kMeans.addClusters(new Cluster().setName(s"cluster_$idx").setArray(coordinates))
      }

      pmml.setDataDictionary(dictionary)
      pmml.addModels(kMeans)
    }
  }
}

org.apache.spark.mllib.clustering.KMeansModel Scala Examples