org.apache.spark.mllib.clustering.KMeansModel Scala Examples
The following examples show how to use org.apache.spark.mllib.clustering.KMeansModel.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: PMMLModelExportFactory.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

/**
 * Chooses the PMML exporter that matches the runtime type of a model.
 */
private[mllib] object PMMLModelExportFactory {

  /**
   * Wraps `model` in the exporter that corresponds to its runtime type.
   *
   * @param model the model to export
   * @return the exporter built for `model`
   * @throws IllegalArgumentException if `model` is a logistic regression model whose
   *                                  `numClasses` is not 2, or is of any type not listed below
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = model match {
    case km: KMeansModel =>
      new KMeansPMMLModelExport(km)

    case lr: LinearRegressionModel =>
      new GeneralizedLinearPMMLModelExport(lr, "linear regression")

    case rr: RidgeRegressionModel =>
      new GeneralizedLinearPMMLModelExport(rr, "ridge regression")

    case ls: LassoModel =>
      new GeneralizedLinearPMMLModelExport(ls, "lasso regression")

    case svm: SVMModel =>
      // 0.0 is used when the SVM model carries no threshold of its own.
      val threshold = svm.getThreshold.getOrElse(0.0)
      new BinaryClassificationPMMLModelExport(
        svm, "linear SVM", RegressionNormalizationMethodType.NONE, threshold)

    case binary: LogisticRegressionModel if binary.numClasses == 2 =>
      // 0.5 is used when the logistic model carries no threshold of its own.
      val threshold = binary.getThreshold.getOrElse(0.5)
      new BinaryClassificationPMMLModelExport(
        binary, "logistic regression", RegressionNormalizationMethodType.LOGIT, threshold)

    case _: LogisticRegressionModel =>
      throw new IllegalArgumentException(
        "PMML Export not supported for Multinomial Logistic Regression")

    case other =>
      throw new IllegalArgumentException(
        s"PMML Export not supported for model: ${other.getClass.getName}")
  }
}
Example 2
Source File: PMMLModelExportFactorySuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
import org.apache.spark.mllib.util.LinearDataGenerator

/**
 * Verifies which exporter `PMMLModelExportFactory.createPMMLModelExport` returns for each
 * model type, and that it rejects the models it does not handle.
 */
class PMMLModelExportFactorySuite extends SparkFunSuite {

  test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") {
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))
    assert(exporter.isInstanceOf[KMeansPMMLModelExport])
  }

  test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a " +
    "LinearRegressionModel, RidgeRegressionModel or LassoModel") {
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val point = generated(0)
    // The same weights/intercept pair is reused for every regression flavour.
    val regressors = Seq[Any](
      new LinearRegressionModel(point.features, point.label),
      new RidgeRegressionModel(point.features, point.label),
      new LassoModel(point.features, point.label))
    regressors.foreach { regressor =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(regressor)
      assert(exporter.isInstanceOf[GeneralizedLinearPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport " +
    "when passing a LogisticRegressionModel or SVMModel") {
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val point = generated(0)
    val classifiers = Seq[Any](
      new LogisticRegressionModel(point.features, point.label),
      new SVMModel(point.features, point.label))
    classifiers.foreach { classifier =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(classifier)
      assert(exporter.isInstanceOf[BinaryClassificationPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException " +
    "when passing a Multinomial Logistic Regression") {
    // Three classes, so the factory's binary-only branch does not apply.
    val multinomial = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4),
      intercept = 1.0,
      numFeatures = 2,
      numClasses = 3)
    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(multinomial)
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") {
    val notAModel = new Object
    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(notAModel)
    }
  }
}
Example 3
Source File: KMeansClustering_IBM.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.spark.{ SparkContext, SparkConf } import org.apache.spark.mllib.clustering.{ KMeans, KMeansModel } import org.apache.spark.mllib.linalg.Vectors Vectors.dense(line.split(",").map(_.trim).filter(!"".equals(_)).map(_.toDouble)) }) parsedTestData.collect().foreach(testDataLine => { //计算测试数据分别属于那个簇类 val predictedClusterIndex: Int = clusters.predict(testDataLine) println("测试样本: " + testDataLine.toString + " 属于聚类 " + predictedClusterIndex) }) println("Spark MLlib K-means clustering test finished.") //评估KMeans模型 如何选择K值 val ks: Array[Int] = Array(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 50, 80, 100) ks.foreach(cluster => { //parsedTrainingData训练模型数据 val model: KMeansModel = KMeans.train(parsedTrainingData, cluster, 30, 1) //KMeansModel 类里提供了 computeCost 方法,该方法通过计算所有数据点到其最近的中心点的平方和来评估聚类的效果。 //统计聚类错误的样本比例 val ssd = model.computeCost(parsedTrainingData) //model.predict(point) println("sum of squared distances of points to their nearest center when k=" + cluster + " -> " + ssd) }) } //过滤标题行 private def isColumnNameLine(line: String): Boolean = { if (line != null && line.contains("Channel")) true else false } }
Example 4
Source File: KMeansExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/**
 * Trains a k-means model on a CSV file and predicts the cluster of a few of its rows.
 */
object KMeansExample {

  /**
   * Runs the example on a local two-thread master.
   *
   * The SparkContext is stopped in a `finally` block so it is released even when loading,
   * training or prediction fails.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KMeansClustering")
    val sc = new SparkContext(sparkConf)
    try {
      // Load the saratoga file into an RDD of lines.
      val data = sc.textFile("../data/mllib/saratoga.csv")
      // Convert every comma-separated line into a dense vector.
      val parsedData = data.map(line => Vectors.dense(line.split(',').map(_.toDouble)))
      // Train the model with 4 clusters and 5 iterations.
      val kmmodel = KMeans.train(parsedData, 4, 5)
      // Collect the parsed vectors into a local array.
      val houses = parsedData.collect
      // Predict the cluster of the first element; k-means numbers cluster ids from 0.
      val prediction1 = kmmodel.predict(houses(0))
      // Original author's note: houses(18) has lot size 876 and price 66.5.
      val prediction2 = kmmodel.predict(houses(18))
      // Original author's note: houses(35) has lot size 15750 and price 112.
      val prediction3 = kmmodel.predict(houses(35))
      // Original author's note: houses(6) has lot size 38768 and price 272.
      val prediction4 = kmmodel.predict(houses(6))
      // Original author's note: houses(15) has lot size 69696 and price 275.
      val prediction5 = kmmodel.predict(houses(15))
    } finally {
      sc.stop()
    }
  }
}
Example 5
Source File: KMeansPMMLModelExport.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel

// NOTE(review): the declaration of the enclosing class is missing from this excerpt; only
// this method and the class's closing brace are present.

  /**
   * Writes a center-based clustering model describing `model` into `pmml`.
   *
   * The header description is always set; everything else is written only when the model
   * has at least one cluster center.
   *
   * @param model the k-means model whose cluster centers are exported
   */
  private def populateKMeansPMML(model : KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    val centers = model.clusterCenters
    if (centers.nonEmpty) {
      // The first center fixes the number of fields declared for every cluster.
      val dimension = centers(0).size
      val dataDictionary = new DataDictionary
      val miningSchema = new MiningSchema
      val distance = new ComparisonMeasure()
        .withKind(ComparisonMeasure.Kind.DISTANCE)
        .withMeasure(new SquaredEuclidean())
      val clusteringModel = new ClusteringModel()
        .withModelName("k-means")
        .withMiningSchema(miningSchema)
        .withComparisonMeasure(distance)
        .withFunctionName(MiningFunctionType.CLUSTERING)
        .withModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .withNumberOfClusters(centers.length)

      // Register one field per coordinate in the dictionary, the schema and the model.
      (0 until dimension).foreach { idx =>
        val field = FieldName.create("field_" + idx)
        dataDictionary.withDataFields(new DataField(field, OpType.CONTINUOUS, DataType.DOUBLE))
        miningSchema.withMiningFields(
          new MiningField(field).withUsageType(FieldUsageType.ACTIVE))
        clusteringModel.withClusteringFields(
          new ClusteringField(field).withCompareFunction(CompareFunctionType.ABS_DIFF))
      }
      dataDictionary.withNumberOfFields(dataDictionary.getDataFields.size)

      centers.zipWithIndex.foreach { case (center, idx) =>
        val coordinates = new org.dmg.pmml.Array()
          .withType(Array.Type.REAL)
          .withN(dimension)
          .withValue(center.toArray.mkString(" "))
        // we don't have the size of the single cluster but only the centroids (withValue)
        // .withSize(value)
        val cluster = new Cluster()
          .withName("cluster_" + idx)
          .withArray(coordinates)
        clusteringModel.withClusters(cluster)
      }

      pmml.setDataDictionary(dataDictionary)
      pmml.withModels(clusteringModel)
    }
  }
}
Example 6
Source File: PMMLModelExportFactory.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

/**
 * Maps a model instance to the PMML exporter able to serialize it.
 */
private[mllib] object PMMLModelExportFactory {

  /**
   * Selects and instantiates the exporter for `model` based on its runtime type.
   *
   * @param model the model to export
   * @return the exporter wrapping `model`
   * @throws IllegalArgumentException when `model` is a logistic regression model with a
   *                                  `numClasses` other than 2, or has no matching case
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = model match {
    case clustering: KMeansModel =>
      new KMeansPMMLModelExport(clustering)
    case plain: LinearRegressionModel =>
      new GeneralizedLinearPMMLModelExport(plain, "linear regression")
    case ridgeModel: RidgeRegressionModel =>
      new GeneralizedLinearPMMLModelExport(ridgeModel, "ridge regression")
    case lassoModel: LassoModel =>
      new GeneralizedLinearPMMLModelExport(lassoModel, "lasso regression")
    case svmModel: SVMModel =>
      new BinaryClassificationPMMLModelExport(
        svmModel,
        "linear SVM",
        RegressionNormalizationMethodType.NONE,
        svmModel.getThreshold.getOrElse(0.0))
    case logit: LogisticRegressionModel =>
      // Reject anything that is not a two-class model before building the exporter.
      if (logit.numClasses != 2) {
        throw new IllegalArgumentException(
          "PMML Export not supported for Multinomial Logistic Regression")
      } else {
        new BinaryClassificationPMMLModelExport(
          logit,
          "logistic regression",
          RegressionNormalizationMethodType.LOGIT,
          logit.getThreshold.getOrElse(0.5))
      }
    case _ =>
      val typeName = model.getClass.getName
      throw new IllegalArgumentException(s"PMML Export not supported for model: $typeName")
  }
}
Example 7
Source File: KMeansPMMLModelExportSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/**
 * Checks the PMML document produced for a three-center k-means model.
 */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    // Cluster center points: three centers with three coordinates each.
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))

    // The factory result must be a PMML exporter carrying the expected description.
    assert(exporter.isInstanceOf[PMMLModelExport])
    val document = exporter.asInstanceOf[PMMLModelExport].getPmml
    assert(document.getHeader.getDescription === "k-means clustering")

    // The data dictionary declares as many fields as a single center has coordinates.
    val expectedFieldCount = centers(0).size
    assert(document.getDataDictionary.getNumberOfFields === expectedFieldCount)

    // The first attached model must be a clustering model with one cluster per center.
    val exportedModel = document.getModels.get(0).asInstanceOf[ClusteringModel]
    assert(exportedModel.getNumberOfClusters === centers.length)
  }
}
Example 8
Source File: PMMLModelExportFactorySuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel} import org.apache.spark.mllib.clustering.KMeansModel import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel} import org.apache.spark.mllib.util.LinearDataGenerator val multiclassLogisticRegressionModel = new LogisticRegressionModel( weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0, //numClasses 分类数 numFeatures = 2, numClasses = 3) intercept[IllegalArgumentException] { PMMLModelExportFactory.createPMMLModelExport(multiclassLogisticRegressionModel) } } test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") { val invalidModel = new Object intercept[IllegalArgumentException] { PMMLModelExportFactory.createPMMLModelExport(invalidModel) } } }
Example 9
Source File: KMeansExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
// $example off$

/**
 * Clusters the vectors in a text file with k-means, prints the clustering cost, then saves
 * the model and loads it back.
 */
object KMeansExample {

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("KMeansExample"))

    // $example on$
    // Read the input and turn each space-separated line into a cached dense vector
    val lines = sc.textFile("data/mllib/kmeans_data.txt")
    val vectors = lines.map { line =>
      val values = line.split(' ').map(_.toDouble)
      Vectors.dense(values)
    }.cache()

    // Train k-means with two clusters and twenty iterations
    val k = 2
    val iterations = 20
    val model = KMeans.train(vectors, k, iterations)

    // Report the Within Set Sum of Squared Errors of the trained model
    val cost = model.computeCost(vectors)
    println(s"Within Set Sum of Squared Errors = $cost")

    // Persist the model, then read it back from the same location
    val modelPath = "target/org/apache/spark/KMeansExample/KMeansModel"
    model.save(sc, modelPath)
    val sameModel = KMeansModel.load(sc, modelPath)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 10
Source File: KMeansPMMLModelExport.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel

// NOTE(review): the declaration of the enclosing class is missing from this excerpt; only
// this method and the class's closing brace are present.

  /**
   * Writes a center-based clustering model describing `model` into `pmml`.
   *
   * The header description is always set; everything else is written only when the model
   * has at least one cluster center.
   *
   * @param model the k-means model whose cluster centers are exported
   */
  private def populateKMeansPMML(model: KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    val centers = model.clusterCenters
    if (centers.nonEmpty) {
      // The first center fixes the number of fields declared for every cluster.
      val dimension = centers(0).size
      val dataDictionary = new DataDictionary
      val miningSchema = new MiningSchema
      val distance = new ComparisonMeasure()
        .setKind(ComparisonMeasure.Kind.DISTANCE)
        .setMeasure(new SquaredEuclidean())
      val clusteringModel = new ClusteringModel()
        .setModelName("k-means")
        .setMiningSchema(miningSchema)
        .setComparisonMeasure(distance)
        .setFunctionName(MiningFunctionType.CLUSTERING)
        .setModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .setNumberOfClusters(centers.length)

      // Register one field per coordinate in the dictionary, the schema and the model.
      (0 until dimension).foreach { idx =>
        val field = FieldName.create("field_" + idx)
        dataDictionary.addDataFields(new DataField(field, OpType.CONTINUOUS, DataType.DOUBLE))
        miningSchema.addMiningFields(
          new MiningField(field).setUsageType(FieldUsageType.ACTIVE))
        clusteringModel.addClusteringFields(
          new ClusteringField(field).setCompareFunction(CompareFunctionType.ABS_DIFF))
      }
      dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size)

      centers.zipWithIndex.foreach { case (center, idx) =>
        val coordinates = new org.dmg.pmml.Array()
          .setType(Array.Type.REAL)
          .setN(dimension)
          .setValue(center.toArray.mkString(" "))
        // we don't have the size of the single cluster but only the centroids (withValue)
        // .withSize(value)
        val cluster = new Cluster()
          .setName("cluster_" + idx)
          .setArray(coordinates)
        clusteringModel.addClusters(cluster)
      }

      pmml.setDataDictionary(dataDictionary)
      pmml.addModels(clusteringModel)
    }
  }
}
Example 11
Source File: KMeansPMMLModelExportSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/**
 * Checks the PMML document produced for a three-center k-means model.
 */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))

    // The factory result must be a PMML exporter carrying the expected description.
    assert(exporter.isInstanceOf[PMMLModelExport])
    val document = exporter.asInstanceOf[PMMLModelExport].getPmml
    assert(document.getHeader.getDescription === "k-means clustering")

    // The data dictionary declares as many fields as a single center has coordinates.
    val expectedFieldCount = centers(0).size
    assert(document.getDataDictionary.getNumberOfFields === expectedFieldCount)

    // The first attached model must be a clustering model with one cluster per center.
    val exportedModel = document.getModels.get(0).asInstanceOf[ClusteringModel]
    assert(exportedModel.getNumberOfClusters === centers.length)
  }
}
Example 12
Source File: KMeansPMMLModelExportSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/**
 * Exercises the k-means PMML export end to end through `PMMLModelExportFactory`.
 */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    val centroids = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val model = new KMeansModel(centroids)
    val export = PMMLModelExportFactory.createPMMLModelExport(model)

    // Type and header description of the generated document.
    assert(export.isInstanceOf[PMMLModelExport])
    val generated = export.asInstanceOf[PMMLModelExport].getPmml
    assert(generated.getHeader.getDescription === "k-means clustering")

    // Field count equals the size of one centroid vector.
    val vectorSize = centroids(0).size
    assert(generated.getDataDictionary.getNumberOfFields === vectorSize)

    // The document holds a clustering model whose cluster count equals the centroid count.
    val firstModel = generated.getModels.get(0)
    val clustering = firstModel.asInstanceOf[ClusteringModel]
    assert(clustering.getNumberOfClusters === centroids.length)
  }
}
Example 13
Source File: PMMLModelExportFactorySuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
import org.apache.spark.mllib.util.LinearDataGenerator

/**
 * Verifies which exporter `PMMLModelExportFactory.createPMMLModelExport` returns for each
 * model type, and that it rejects the models it does not handle.
 */
class PMMLModelExportFactorySuite extends SparkFunSuite {

  test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") {
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))
    assert(exporter.isInstanceOf[KMeansPMMLModelExport])
  }

  test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a " +
    "LinearRegressionModel, RidgeRegressionModel or LassoModel") {
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val point = generated(0)
    // The same weights/intercept pair is reused for every regression flavour.
    val regressors = Seq[Any](
      new LinearRegressionModel(point.features, point.label),
      new RidgeRegressionModel(point.features, point.label),
      new LassoModel(point.features, point.label))
    regressors.foreach { regressor =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(regressor)
      assert(exporter.isInstanceOf[GeneralizedLinearPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport " +
    "when passing a LogisticRegressionModel or SVMModel") {
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val point = generated(0)
    val classifiers = Seq[Any](
      new LogisticRegressionModel(point.features, point.label),
      new SVMModel(point.features, point.label))
    classifiers.foreach { classifier =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(classifier)
      assert(exporter.isInstanceOf[BinaryClassificationPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException " +
    "when passing a Multinomial Logistic Regression") {
    // Three classes, so the factory's binary-only branch does not apply.
    val multinomial = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4),
      intercept = 1.0,
      numFeatures = 2,
      numClasses = 3)
    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(multinomial)
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") {
    val notAModel = new Object
    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(notAModel)
    }
  }
}
Example 14
Source File: KMeansPMMLModelExport.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel

// NOTE(review): the declaration of the enclosing class is missing from this excerpt; only
// this method and the class's closing brace are present.

  /**
   * Writes a center-based clustering model describing `model` into `pmml`.
   *
   * The header description is always set; everything else is written only when the model
   * has at least one cluster center.
   *
   * @param model the k-means model whose cluster centers are exported
   */
  private def populateKMeansPMML(model : KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    val centers = model.clusterCenters
    if (centers.nonEmpty) {
      // The first center fixes the number of fields declared for every cluster.
      val dimension = centers(0).size
      val dataDictionary = new DataDictionary
      val miningSchema = new MiningSchema
      val distance = new ComparisonMeasure()
        .withKind(ComparisonMeasure.Kind.DISTANCE)
        .withMeasure(new SquaredEuclidean())
      val clusteringModel = new ClusteringModel()
        .withModelName("k-means")
        .withMiningSchema(miningSchema)
        .withComparisonMeasure(distance)
        .withFunctionName(MiningFunctionType.CLUSTERING)
        .withModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .withNumberOfClusters(centers.length)

      // Register one field per coordinate in the dictionary, the schema and the model.
      (0 until dimension).foreach { idx =>
        val field = FieldName.create("field_" + idx)
        dataDictionary.withDataFields(new DataField(field, OpType.CONTINUOUS, DataType.DOUBLE))
        miningSchema.withMiningFields(
          new MiningField(field).withUsageType(FieldUsageType.ACTIVE))
        clusteringModel.withClusteringFields(
          new ClusteringField(field).withCompareFunction(CompareFunctionType.ABS_DIFF))
      }
      dataDictionary.withNumberOfFields(dataDictionary.getDataFields.size)

      centers.zipWithIndex.foreach { case (center, idx) =>
        val coordinates = new org.dmg.pmml.Array()
          .withType(Array.Type.REAL)
          .withN(dimension)
          .withValue(center.toArray.mkString(" "))
        // we don't have the size of the single cluster but only the centroids (withValue)
        // .withSize(value)
        val cluster = new Cluster()
          .withName("cluster_" + idx)
          .withArray(coordinates)
        clusteringModel.withClusters(cluster)
      }

      pmml.setDataDictionary(dataDictionary)
      pmml.withModels(clusteringModel)
    }
  }
}
Example 15
Source File: PMMLModelExportFactory.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

/**
 * Chooses the PMML exporter that matches the runtime type of a model.
 */
private[mllib] object PMMLModelExportFactory {

  /**
   * Wraps `model` in the exporter that corresponds to its runtime type.
   *
   * @param model the model to export
   * @return the exporter built for `model`
   * @throws IllegalArgumentException if `model` is a logistic regression model whose
   *                                  `numClasses` is not 2, or is of any type not listed below
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = model match {
    case km: KMeansModel =>
      new KMeansPMMLModelExport(km)

    case lr: LinearRegressionModel =>
      new GeneralizedLinearPMMLModelExport(lr, "linear regression")

    case rr: RidgeRegressionModel =>
      new GeneralizedLinearPMMLModelExport(rr, "ridge regression")

    case ls: LassoModel =>
      new GeneralizedLinearPMMLModelExport(ls, "lasso regression")

    case svm: SVMModel =>
      // 0.0 is used when the SVM model carries no threshold of its own.
      val threshold = svm.getThreshold.getOrElse(0.0)
      new BinaryClassificationPMMLModelExport(
        svm, "linear SVM", RegressionNormalizationMethodType.NONE, threshold)

    case binary: LogisticRegressionModel if binary.numClasses == 2 =>
      // 0.5 is used when the logistic model carries no threshold of its own.
      val threshold = binary.getThreshold.getOrElse(0.5)
      new BinaryClassificationPMMLModelExport(
        binary, "logistic regression", RegressionNormalizationMethodType.LOGIT, threshold)

    case _: LogisticRegressionModel =>
      throw new IllegalArgumentException(
        "PMML Export not supported for Multinomial Logistic Regression")

    case other =>
      throw new IllegalArgumentException(
        s"PMML Export not supported for model: ${other.getClass.getName}")
  }
}
Example 16
Source File: KMeansPMMLModelExportSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/**
 * Exercises the k-means PMML export end to end through `PMMLModelExportFactory`.
 */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    val centroids = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val model = new KMeansModel(centroids)
    val export = PMMLModelExportFactory.createPMMLModelExport(model)

    // Type and header description of the generated document.
    assert(export.isInstanceOf[PMMLModelExport])
    val generated = export.asInstanceOf[PMMLModelExport].getPmml
    assert(generated.getHeader.getDescription === "k-means clustering")

    // Field count equals the size of one centroid vector.
    val vectorSize = centroids(0).size
    assert(generated.getDataDictionary.getNumberOfFields === vectorSize)

    // The document holds a clustering model whose cluster count equals the centroid count.
    val firstModel = generated.getModels.get(0)
    val clustering = firstModel.asInstanceOf[ClusteringModel]
    assert(clustering.getNumberOfClusters === centroids.length)
  }
}
Example 17
Source File: PMMLModelExportFactorySuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
import org.apache.spark.mllib.util.LinearDataGenerator

/**
 * Verifies which exporter `PMMLModelExportFactory.createPMMLModelExport` returns for each
 * model type, and that it rejects the models it does not handle.
 */
class PMMLModelExportFactorySuite extends SparkFunSuite {

  test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") {
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exporter = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))
    assert(exporter.isInstanceOf[KMeansPMMLModelExport])
  }

  test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a " +
    "LinearRegressionModel, RidgeRegressionModel or LassoModel") {
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val point = generated(0)
    // The same weights/intercept pair is reused for every regression flavour.
    val regressors = Seq[Any](
      new LinearRegressionModel(point.features, point.label),
      new RidgeRegressionModel(point.features, point.label),
      new LassoModel(point.features, point.label))
    regressors.foreach { regressor =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(regressor)
      assert(exporter.isInstanceOf[GeneralizedLinearPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport " +
    "when passing a LogisticRegressionModel or SVMModel") {
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val point = generated(0)
    val classifiers = Seq[Any](
      new LogisticRegressionModel(point.features, point.label),
      new SVMModel(point.features, point.label))
    classifiers.foreach { classifier =>
      val exporter = PMMLModelExportFactory.createPMMLModelExport(classifier)
      assert(exporter.isInstanceOf[BinaryClassificationPMMLModelExport])
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException " +
    "when passing a Multinomial Logistic Regression") {
    // Three classes, so the factory's binary-only branch does not apply.
    val multinomial = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4),
      intercept = 1.0,
      numFeatures = 2,
      numClasses = 3)
    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(multinomial)
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") {
    val notAModel = new Object
    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(notAModel)
    }
  }
}
Example 18
Source File: KmeansModelSaveToOss.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.mllib

import org.apache.spark.mllib.clustering.KMeans._
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Trains a small k-means model, saves it under [[KmeansModelSaveToOss.modelOssDir]], loads it
 * back and asserts that the reloaded model produces the same per-point predictions as the
 * in-memory model.
 */
object KmeansModelSaveToOss {
  // NOTE(review): "bucket" (and the masked role ARN below) look like placeholders — confirm
  // and replace with real values before running.
  val modelOssDir = "oss://bucket/kmeans-model"

  /**
   * Entry point.
   *
   * @param args command-line arguments (not read)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("KmeansModelSaveToOss")
    conf.set("spark.hadoop.fs.oss.credentials.provider",
      "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider")
    conf.set("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole")
    conf.set("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com")
    val sc = new SparkContext(conf)

    // Stop the context even when training, saving or one of the assertions fails.
    try {
      val points = Seq(
        Vectors.dense(0.0, 0.0),
        Vectors.dense(0.0, 0.1),
        Vectors.dense(0.1, 0.0),
        Vectors.dense(9.0, 0.0),
        Vectors.dense(9.0, 0.2),
        Vectors.dense(9.2, 0.0)
      )
      val rdd = sc.parallelize(points, 3)

      // Renders one "cluster id / feature" line per input point for the given model.
      def describePredictions(m: KMeansModel): Array[String] =
        rdd.map { feature =>
          "cluster id: " + m.predict(feature) + " feature:" + feature.toArray.mkString(",")
        }.collect()

      // 1. train and save the model
      val initMode = K_MEANS_PARALLEL
      val model = KMeans.train(rdd, k = 2, maxIterations = 2, runs = 1, initMode)
      val predictResult1 = describePredictions(model)
      println("modelOssDir=" + modelOssDir)
      model.save(sc, modelOssDir)

      // 2. predict from the oss model
      val modelLoadOss = KMeansModel.load(sc, modelOssDir)
      val predictResult2 = describePredictions(modelLoadOss)

      assert(predictResult1.size == predictResult2.size)
      predictResult2.foreach(result2 => assert(predictResult1.contains(result2)))
    } finally {
      sc.stop()
    }
  }
}
Example 19
Source File: KmeansModelSaveToOss.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.mllib

import org.apache.spark.mllib.clustering.KMeans._
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SparkSession

/**
 * Trains a small k-means model, saves it under [[KmeansModelSaveToOss.modelOssDir]], loads it
 * back and asserts that the reloaded model produces the same per-point predictions as the
 * in-memory model.
 */
object KmeansModelSaveToOss {
  // NOTE(review): "bucket" (and the masked role ARN below) look like placeholders — confirm
  // and replace with real values before running.
  val modelOssDir = "oss://bucket/kmeans-model"

  /**
   * Entry point.
   *
   * @param args command-line arguments (not read)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .config("spark.hadoop.fs.oss.credentials.provider",
        "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider")
      .config("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole")
      .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com")
      .appName("KmeansModelSaveToOss")
      .getOrCreate()

    // Stop the session even when training, saving or one of the assertions fails.
    try {
      val sc = spark.sparkContext
      val points = Seq(
        Vectors.dense(0.0, 0.0),
        Vectors.dense(0.0, 0.1),
        Vectors.dense(0.1, 0.0),
        Vectors.dense(9.0, 0.0),
        Vectors.dense(9.0, 0.2),
        Vectors.dense(9.2, 0.0)
      )
      val rdd = sc.parallelize(points, 3)

      // Renders one "cluster id / feature" line per input point for the given model.
      def describePredictions(m: KMeansModel): Array[String] =
        rdd.map { feature =>
          "cluster id: " + m.predict(feature) + " feature:" + feature.toArray.mkString(",")
        }.collect()

      // 1. train and save the model
      val initMode = K_MEANS_PARALLEL
      val model = KMeans.train(rdd, k = 2, maxIterations = 2, runs = 1, initMode)
      val predictResult1 = describePredictions(model)
      println("modelOssDir=" + modelOssDir)
      model.save(sc, modelOssDir)

      // 2. predict from the oss model
      val modelLoadOss = KMeansModel.load(sc, modelOssDir)
      val predictResult2 = describePredictions(modelLoadOss)

      assert(predictResult1.size == predictResult2.size)
      predictResult2.foreach(result2 => assert(predictResult1.contains(result2)))
    } finally {
      spark.stop()
    }
  }
}
Example 20
Source File: PMMLModelExportFactory.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

/** Chooses the PMML exporter that matches the runtime type of a model. */
private[mllib] object PMMLModelExportFactory {

  /**
   * Builds the exporter for `model`.
   *
   * @param model the model to export; dispatch is on its runtime type
   * @return the exporter wrapping `model`
   * @throws IllegalArgumentException for a logistic regression model whose `numClasses` is
   *                                  not 2, or for any model type not listed below
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = model match {
    case km: KMeansModel =>
      new KMeansPMMLModelExport(km)

    case linearModel: LinearRegressionModel =>
      new GeneralizedLinearPMMLModelExport(linearModel, "linear regression")

    case ridgeModel: RidgeRegressionModel =>
      new GeneralizedLinearPMMLModelExport(ridgeModel, "ridge regression")

    case lassoModel: LassoModel =>
      new GeneralizedLinearPMMLModelExport(lassoModel, "lasso regression")

    case svmModel: SVMModel =>
      // Falls back to 0.0 when the model has no threshold set.
      val threshold = svmModel.getThreshold.getOrElse(0.0)
      new BinaryClassificationPMMLModelExport(
        svmModel, "linear SVM", RegressionNormalizationMethodType.NONE, threshold)

    case binaryModel: LogisticRegressionModel if binaryModel.numClasses == 2 =>
      // Falls back to 0.5 when the model has no threshold set.
      val threshold = binaryModel.getThreshold.getOrElse(0.5)
      new BinaryClassificationPMMLModelExport(
        binaryModel, "logistic regression", RegressionNormalizationMethodType.LOGIT, threshold)

    case _: LogisticRegressionModel =>
      throw new IllegalArgumentException(
        "PMML Export not supported for Multinomial Logistic Regression")

    case unsupported =>
      throw new IllegalArgumentException(
        "PMML Export not supported for model: " + unsupported.getClass.getName)
  }
}
Example 21
Source File: PMMLModelExportFactory.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

/** Chooses the PMML exporter that matches the runtime type of a model. */
private[mllib] object PMMLModelExportFactory {

  /**
   * Builds the exporter for `model`.
   *
   * @param model the model to export; dispatch is on its runtime type
   * @return the exporter wrapping `model`
   * @throws IllegalArgumentException for a logistic regression model whose `numClasses` is
   *                                  not 2, or for any model type not listed below
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = model match {
    case km: KMeansModel =>
      new KMeansPMMLModelExport(km)

    case linearModel: LinearRegressionModel =>
      new GeneralizedLinearPMMLModelExport(linearModel, "linear regression")

    case ridgeModel: RidgeRegressionModel =>
      new GeneralizedLinearPMMLModelExport(ridgeModel, "ridge regression")

    case lassoModel: LassoModel =>
      new GeneralizedLinearPMMLModelExport(lassoModel, "lasso regression")

    case svmModel: SVMModel =>
      // Falls back to 0.0 when the model has no threshold set.
      val threshold = svmModel.getThreshold.getOrElse(0.0)
      new BinaryClassificationPMMLModelExport(
        svmModel, "linear SVM", RegressionNormalizationMethodType.NONE, threshold)

    case binaryModel: LogisticRegressionModel if binaryModel.numClasses == 2 =>
      // Falls back to 0.5 when the model has no threshold set.
      val threshold = binaryModel.getThreshold.getOrElse(0.5)
      new BinaryClassificationPMMLModelExport(
        binaryModel, "logistic regression", RegressionNormalizationMethodType.LOGIT, threshold)

    case _: LogisticRegressionModel =>
      throw new IllegalArgumentException(
        "PMML Export not supported for Multinomial Logistic Regression")

    case unsupported =>
      throw new IllegalArgumentException(
        "PMML Export not supported for model: " + unsupported.getClass.getName)
  }
}
Example 22
Source File: KMeansPMMLModelExportSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/** Verifies the PMML document produced when a `KMeansModel` is exported. */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exported = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))

    // The factory result must be a PMML exporter before its document is inspected.
    assert(exported.isInstanceOf[PMMLModelExport])
    val document = exported.asInstanceOf[PMMLModelExport].getPmml
    assert(document.getHeader.getDescription === "k-means clustering")

    // The data dictionary declares one field per dimension of a cluster center.
    assert(document.getDataDictionary.getNumberOfFields === centers(0).size)

    // The first attached model must be a clustering model that reports as many clusters as
    // the Spark model has centers.
    val clustering = document.getModels.get(0).asInstanceOf[ClusteringModel]
    assert(clustering.getNumberOfClusters === centers.length)
  }
}
Example 23
Source File: PMMLModelExportFactorySuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
import org.apache.spark.mllib.util.LinearDataGenerator

/**
 * Checks which exporter class `PMMLModelExportFactory.createPMMLModelExport` returns for each
 * model type, and that it throws `IllegalArgumentException` for models it does not handle.
 */
class PMMLModelExportFactorySuite extends SparkFunSuite {

  test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") {
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exported = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))
    assert(exported.isInstanceOf[KMeansPMMLModelExport])
  }

  test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a " +
    "LinearRegressionModel, RidgeRegressionModel or LassoModel") {
    // The features and label of the single generated point are the constructor arguments
    // of every model built below.
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val point = generated(0)

    val linearExport = PMMLModelExportFactory.createPMMLModelExport(
      new LinearRegressionModel(point.features, point.label))
    assert(linearExport.isInstanceOf[GeneralizedLinearPMMLModelExport])

    val ridgeExport = PMMLModelExportFactory.createPMMLModelExport(
      new RidgeRegressionModel(point.features, point.label))
    assert(ridgeExport.isInstanceOf[GeneralizedLinearPMMLModelExport])

    val lassoExport = PMMLModelExportFactory.createPMMLModelExport(
      new LassoModel(point.features, point.label))
    assert(lassoExport.isInstanceOf[GeneralizedLinearPMMLModelExport])
  }

  test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport " +
    "when passing a LogisticRegressionModel or SVMModel") {
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val point = generated(0)

    val logisticExport = PMMLModelExportFactory.createPMMLModelExport(
      new LogisticRegressionModel(point.features, point.label))
    assert(logisticExport.isInstanceOf[BinaryClassificationPMMLModelExport])

    val svmExport = PMMLModelExportFactory.createPMMLModelExport(
      new SVMModel(point.features, point.label))
    assert(svmExport.isInstanceOf[BinaryClassificationPMMLModelExport])
  }

  test("PMMLModelExportFactory throw IllegalArgumentException " +
    "when passing a Multinomial Logistic Regression") {
    // numClasses = 3 makes this a multinomial model, which the factory rejects.
    val threeClassModel = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4),
      intercept = 1.0,
      numFeatures = 2,
      numClasses = 3)
    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(threeClassModel)
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") {
    val unsupported = new Object
    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(unsupported)
    }
  }
}
Example 24
Source File: StreamingKMeansSuite.scala From spark-structured-streaming-ml with Apache License 2.0 | 5 votes |
package com.highperformancespark.examples.structuredstreaming

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.scalatest.FunSuite
import org.apache.log4j.{Level, Logger}

/** Row type pushed into the memory stream; wraps a single feature vector. */
case class TestRow(features: Vector)

/** Tests that feed generated batches through a `StreamingKMeans` query. */
class StreamingKMeansSuite extends FunSuite with DataFrameSuiteBase {

  override def beforeAll(): Unit = {
    super.beforeAll()
    // Turn off logging for every logger under "org".
    Logger.getLogger("org").setLevel(Level.OFF)
  }

  test("streaming model with one center should converge to true center") {
    import spark.implicits._
    val k = 1
    val dim = 5
    val clusterSpread = 0.1
    val seed = 63
    // TODO: this test is very flaky. The centers do not converge for some
    // (most?) random seeds
    val (batches, trueCenters) =
      StreamingKMeansSuite.generateBatches(100, 80, k, dim, clusterSpread, seed)
    val inputStream = MemoryStream[TestRow]
    val ds = inputStream.toDS()
    val skm = new StreamingKMeans().setK(k).setRandomCenters(dim, 0.01)
    val query = skm.evilTrain(ds.toDF())

    // Stop the streaming query even if an assertion below fails.
    try {
      // Snapshot the model after each batch has been fully processed.
      val streamingModels = batches.map { batch =>
        inputStream.addData(batch)
        query.processAllAvailable()
        skm.getModel
      }
      // TODO: use spark's testing suite
      streamingModels.last.centers.zip(trueCenters).foreach {
        case (center, trueCenter) =>
          val estimated = center.toArray.mkString(",")
          val expected = trueCenter.toArray.mkString(",")
          println(s"${estimated} | ${expected}")
          // Every coordinate must be within 0.1 of the true center.
          assert(center.toArray.zip(trueCenter.toArray).forall {
            case (got, want) => math.abs(got - want) < 0.1
          })
      }
    } finally {
      query.stop()
    }
  }

  /**
   * Asserts that a batch-trained model and a streaming model have equal centers.
   *
   * @param batchModel     model whose `clusterCenters` are the reference
   * @param streamingModel model whose `centers` are compared against the reference
   * @param validationData currently unused; reserved for the prediction comparison below
   */
  def compareBatchAndStreaming(
      batchModel: KMeansModel,
      streamingModel: StreamingKMeansModel,
      validationData: DataFrame): Unit = {
    assert(batchModel.clusterCenters === streamingModel.centers)
    // TODO: implement prediction comparison
  }
}

object StreamingKMeansSuite {

  /**
   * Generates `numBatches` batches of `numPoints` rows each, scattered around `k` centers.
   *
   * Point `idx` of every batch is drawn around center `idx % k`; each coordinate is the
   * center's coordinate plus a Gaussian sample scaled by `r`.
   *
   * @param numPoints   number of rows in each batch
   * @param numBatches  number of batches
   * @param k           number of centers
   * @param d           dimension of every center and point
   * @param r           scale applied to the Gaussian noise
   * @param seed        seed of the random generator used for centers and noise
   * @param initCenters centers to use; when `null`, `k` Gaussian centers are generated
   * @return the batches and the centers they were generated around
   */
  def generateBatches(
      numPoints: Int,
      numBatches: Int,
      k: Int,
      d: Int,
      r: Double,
      seed: Int,
      initCenters: Array[Vector] = null): (IndexedSeq[IndexedSeq[TestRow]], Array[Vector]) = {
    // A dedicated generator: reseeding the shared scala.util.Random singleton would
    // disturb every other user of it in the same JVM.
    val rand = new scala.util.Random(seed)
    // Option(...) maps the null default to None; centers are only drawn in that case.
    val centers = Option(initCenters).getOrElse(
      Array.fill(k)(Vectors.dense(Array.fill(d)(rand.nextGaussian()))))
    val data = (0 until numBatches).map { _ =>
      (0 until numPoints).map { idx =>
        val center = centers(idx % k)
        val vec = Vectors.dense(
          Array.tabulate(d)(x => center(x) + rand.nextGaussian() * r))
        TestRow(vec)
      }
    }
    (data, centers)
  }
}
Example 25
Source File: KMeansExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
// $example off$

/** Clusters the points of a text file with k-means, prints the cost and round-trips the model. */
object KMeansExample {

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("KMeansExample"))

    // $example on$
    // Load and parse the data: each line holds space-separated doubles
    val lines = sc.textFile("data/mllib/kmeans_data.txt")
    val parsedData = lines.map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    // Cluster the data into two classes using KMeans
    val numClusters = 2
    val numIterations = 20
    val clusters = KMeans.train(parsedData, numClusters, numIterations)

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    val wssse = clusters.computeCost(parsedData)
    println(s"Within Set Sum of Squared Errors = $wssse")

    // Save and load model
    val modelPath = "target/org/apache/spark/KMeansExample/KMeansModel"
    clusters.save(sc, modelPath)
    val sameModel = KMeansModel.load(sc, modelPath)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 26
Source File: KMeansPMMLModelExport.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel

/**
 * PMML exporter for a [[KMeansModel]]. The inherited `pmml` document is filled in when the
 * exporter is constructed.
 *
 * @param model the k-means model to export
 */
private[mllib] class KMeansPMMLModelExport(model: KMeansModel) extends PMMLModelExport {

  populateKMeansPMML(model)

  /**
   * Writes `model` into the `pmml` document: header description, one data/mining/clustering
   * field per dimension, and one cluster entry per center. Only the header description is
   * set when the model has no centers.
   */
  private def populateKMeansPMML(model: KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    if (model.clusterCenters.length > 0) {
      // The first center determines the number of fields.
      val clusterCenter = model.clusterCenters(0)
      val fields = new SArray[FieldName](clusterCenter.size)
      val dataDictionary = new DataDictionary
      val miningSchema = new MiningSchema
      val comparisonMeasure = new ComparisonMeasure()
        .setKind(ComparisonMeasure.Kind.DISTANCE)
        .setMeasure(new SquaredEuclidean())
      val clusteringModel = new ClusteringModel()
        .setModelName("k-means")
        .setMiningSchema(miningSchema)
        .setComparisonMeasure(comparisonMeasure)
        .setFunctionName(MiningFunctionType.CLUSTERING)
        .setModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .setNumberOfClusters(model.clusterCenters.length)

      // Register "field_<i>" in the data dictionary, mining schema and clustering model.
      for (i <- 0 until clusterCenter.size) {
        fields(i) = FieldName.create("field_" + i)
        dataDictionary.addDataFields(new DataField(fields(i), OpType.CONTINUOUS, DataType.DOUBLE))
        miningSchema
          .addMiningFields(new MiningField(fields(i))
          .setUsageType(FieldUsageType.ACTIVE))
        clusteringModel.addClusteringFields(
          new ClusteringField(fields(i)).setCompareFunction(CompareFunctionType.ABS_DIFF))
      }

      dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size)

      // One "cluster_<i>" entry per center, its coordinates joined by single spaces.
      for (i <- model.clusterCenters.indices) {
        val cluster = new Cluster()
          .setName("cluster_" + i)
          .setArray(new org.dmg.pmml.Array()
            .setType(Array.Type.REAL)
            .setN(clusterCenter.size)
            .setValue(model.clusterCenters(i).toArray.mkString(" ")))
        // we don't have the size of the single cluster but only the centroids (withValue)
        // .withSize(value)
        clusteringModel.addClusters(cluster)
      }

      pmml.setDataDictionary(dataDictionary)
      pmml.addModels(clusteringModel)
    }
  }
}
Example 27
Source File: PMMLModelExportFactory.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

/** Chooses the PMML exporter that matches the runtime type of a model. */
private[mllib] object PMMLModelExportFactory {

  /**
   * Builds the exporter for `model`.
   *
   * @param model the model to export; dispatch is on its runtime type
   * @return the exporter wrapping `model`
   * @throws IllegalArgumentException for a logistic regression model whose `numClasses` is
   *                                  not 2, or for any model type not listed below
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = model match {
    case km: KMeansModel =>
      new KMeansPMMLModelExport(km)

    case linearModel: LinearRegressionModel =>
      new GeneralizedLinearPMMLModelExport(linearModel, "linear regression")

    case ridgeModel: RidgeRegressionModel =>
      new GeneralizedLinearPMMLModelExport(ridgeModel, "ridge regression")

    case lassoModel: LassoModel =>
      new GeneralizedLinearPMMLModelExport(lassoModel, "lasso regression")

    case svmModel: SVMModel =>
      // Falls back to 0.0 when the model has no threshold set.
      val threshold = svmModel.getThreshold.getOrElse(0.0)
      new BinaryClassificationPMMLModelExport(
        svmModel, "linear SVM", RegressionNormalizationMethodType.NONE, threshold)

    case binaryModel: LogisticRegressionModel if binaryModel.numClasses == 2 =>
      // Falls back to 0.5 when the model has no threshold set.
      val threshold = binaryModel.getThreshold.getOrElse(0.5)
      new BinaryClassificationPMMLModelExport(
        binaryModel, "logistic regression", RegressionNormalizationMethodType.LOGIT, threshold)

    case _: LogisticRegressionModel =>
      throw new IllegalArgumentException(
        "PMML Export not supported for Multinomial Logistic Regression")

    case unsupported =>
      throw new IllegalArgumentException(
        "PMML Export not supported for model: " + unsupported.getClass.getName)
  }
}
Example 28
Source File: KMeansPMMLModelExportSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/** Verifies the PMML document produced when a `KMeansModel` is exported. */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exported = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))

    // The factory result must be a PMML exporter before its document is inspected.
    assert(exported.isInstanceOf[PMMLModelExport])
    val document = exported.asInstanceOf[PMMLModelExport].getPmml
    assert(document.getHeader.getDescription === "k-means clustering")

    // The data dictionary declares one field per dimension of a cluster center.
    assert(document.getDataDictionary.getNumberOfFields === centers(0).size)

    // The first attached model must be a clustering model that reports as many clusters as
    // the Spark model has centers.
    val clustering = document.getModels.get(0).asInstanceOf[ClusteringModel]
    assert(clustering.getNumberOfClusters === centers.length)
  }
}
Example 29
Source File: PMMLModelExportFactorySuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
import org.apache.spark.mllib.util.LinearDataGenerator

/**
 * Checks which exporter class `PMMLModelExportFactory.createPMMLModelExport` returns for each
 * model type, and that it throws `IllegalArgumentException` for models it does not handle.
 */
class PMMLModelExportFactorySuite extends SparkFunSuite {

  test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") {
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exported = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))
    assert(exported.isInstanceOf[KMeansPMMLModelExport])
  }

  test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a " +
    "LinearRegressionModel, RidgeRegressionModel or LassoModel") {
    // The features and label of the single generated point are the constructor arguments
    // of every model built below.
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val point = generated(0)

    val linearExport = PMMLModelExportFactory.createPMMLModelExport(
      new LinearRegressionModel(point.features, point.label))
    assert(linearExport.isInstanceOf[GeneralizedLinearPMMLModelExport])

    val ridgeExport = PMMLModelExportFactory.createPMMLModelExport(
      new RidgeRegressionModel(point.features, point.label))
    assert(ridgeExport.isInstanceOf[GeneralizedLinearPMMLModelExport])

    val lassoExport = PMMLModelExportFactory.createPMMLModelExport(
      new LassoModel(point.features, point.label))
    assert(lassoExport.isInstanceOf[GeneralizedLinearPMMLModelExport])
  }

  test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport " +
    "when passing a LogisticRegressionModel or SVMModel") {
    val generated = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val point = generated(0)

    val logisticExport = PMMLModelExportFactory.createPMMLModelExport(
      new LogisticRegressionModel(point.features, point.label))
    assert(logisticExport.isInstanceOf[BinaryClassificationPMMLModelExport])

    val svmExport = PMMLModelExportFactory.createPMMLModelExport(
      new SVMModel(point.features, point.label))
    assert(svmExport.isInstanceOf[BinaryClassificationPMMLModelExport])
  }

  test("PMMLModelExportFactory throw IllegalArgumentException " +
    "when passing a Multinomial Logistic Regression") {
    // numClasses = 3 makes this a multinomial model, which the factory rejects.
    val threeClassModel = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4),
      intercept = 1.0,
      numFeatures = 2,
      numClasses = 3)
    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(threeClassModel)
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") {
    val unsupported = new Object
    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(unsupported)
    }
  }
}
Example 30
Source File: KMeansExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
// $example off$

/** Clusters the points of a text file with k-means, prints the cost and round-trips the model. */
object KMeansExample {

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("KMeansExample"))

    // $example on$
    // Load and parse the data: each line holds space-separated doubles
    val lines = sc.textFile("data/mllib/kmeans_data.txt")
    val parsedData = lines.map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    // Cluster the data into two classes using KMeans
    val numClusters = 2
    val numIterations = 20
    val clusters = KMeans.train(parsedData, numClusters, numIterations)

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    val wssse = clusters.computeCost(parsedData)
    println(s"Within Set Sum of Squared Errors = $wssse")

    // Save and load model
    val modelPath = "target/org/apache/spark/KMeansExample/KMeansModel"
    clusters.save(sc, modelPath)
    val sameModel = KMeansModel.load(sc, modelPath)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 31
Source File: KMeansPMMLModelExport.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel

/**
 * PMML exporter for a [[KMeansModel]]. The inherited `pmml` document is filled in when the
 * exporter is constructed.
 *
 * @param model the k-means model to export
 */
private[mllib] class KMeansPMMLModelExport(model: KMeansModel) extends PMMLModelExport {

  populateKMeansPMML(model)

  /**
   * Writes `model` into the `pmml` document: header description, one data/mining/clustering
   * field per dimension, and one cluster entry per center. Only the header description is
   * set when the model has no centers.
   */
  private def populateKMeansPMML(model: KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    if (model.clusterCenters.length > 0) {
      // The first center determines the number of fields.
      val clusterCenter = model.clusterCenters(0)
      val fields = new SArray[FieldName](clusterCenter.size)
      val dataDictionary = new DataDictionary
      val miningSchema = new MiningSchema
      val comparisonMeasure = new ComparisonMeasure()
        .setKind(ComparisonMeasure.Kind.DISTANCE)
        .setMeasure(new SquaredEuclidean())
      val clusteringModel = new ClusteringModel()
        .setModelName("k-means")
        .setMiningSchema(miningSchema)
        .setComparisonMeasure(comparisonMeasure)
        .setFunctionName(MiningFunctionType.CLUSTERING)
        .setModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .setNumberOfClusters(model.clusterCenters.length)

      // Register "field_<i>" in the data dictionary, mining schema and clustering model.
      for (i <- 0 until clusterCenter.size) {
        fields(i) = FieldName.create("field_" + i)
        dataDictionary.addDataFields(new DataField(fields(i), OpType.CONTINUOUS, DataType.DOUBLE))
        miningSchema
          .addMiningFields(new MiningField(fields(i))
          .setUsageType(FieldUsageType.ACTIVE))
        clusteringModel.addClusteringFields(
          new ClusteringField(fields(i)).setCompareFunction(CompareFunctionType.ABS_DIFF))
      }

      dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size)

      // One "cluster_<i>" entry per center, its coordinates joined by single spaces.
      for (i <- model.clusterCenters.indices) {
        val cluster = new Cluster()
          .setName("cluster_" + i)
          .setArray(new org.dmg.pmml.Array()
            .setType(Array.Type.REAL)
            .setN(clusterCenter.size)
            .setValue(model.clusterCenters(i).toArray.mkString(" ")))
        // we don't have the size of the single cluster but only the centroids (withValue)
        // .withSize(value)
        clusteringModel.addClusters(cluster)
      }

      pmml.setDataDictionary(dataDictionary)
      pmml.addModels(clusteringModel)
    }
  }
}
Example 32
Source File: PMMLModelExportFactory.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.RegressionNormalizationMethodType

import org.apache.spark.mllib.classification.LogisticRegressionModel
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.regression.LassoModel
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

/** Chooses the PMML exporter that matches the runtime type of a model. */
private[mllib] object PMMLModelExportFactory {

  /**
   * Builds the exporter for `model`.
   *
   * @param model the model to export; dispatch is on its runtime type
   * @return the exporter wrapping `model`
   * @throws IllegalArgumentException for a logistic regression model whose `numClasses` is
   *                                  not 2, or for any model type not listed below
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = model match {
    case km: KMeansModel =>
      new KMeansPMMLModelExport(km)

    case linearModel: LinearRegressionModel =>
      new GeneralizedLinearPMMLModelExport(linearModel, "linear regression")

    case ridgeModel: RidgeRegressionModel =>
      new GeneralizedLinearPMMLModelExport(ridgeModel, "ridge regression")

    case lassoModel: LassoModel =>
      new GeneralizedLinearPMMLModelExport(lassoModel, "lasso regression")

    case svmModel: SVMModel =>
      // Falls back to 0.0 when the model has no threshold set.
      val threshold = svmModel.getThreshold.getOrElse(0.0)
      new BinaryClassificationPMMLModelExport(
        svmModel, "linear SVM", RegressionNormalizationMethodType.NONE, threshold)

    case binaryModel: LogisticRegressionModel if binaryModel.numClasses == 2 =>
      // Falls back to 0.5 when the model has no threshold set.
      val threshold = binaryModel.getThreshold.getOrElse(0.5)
      new BinaryClassificationPMMLModelExport(
        binaryModel, "logistic regression", RegressionNormalizationMethodType.LOGIT, threshold)

    case _: LogisticRegressionModel =>
      throw new IllegalArgumentException(
        "PMML Export not supported for Multinomial Logistic Regression")

    case unsupported =>
      throw new IllegalArgumentException(
        "PMML Export not supported for model: " + unsupported.getClass.getName)
  }
}
Example 33
Source File: KMeansPMMLModelExportSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.dmg.pmml.ClusteringModel

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

/** Verifies the PMML document produced when a `KMeansModel` is exported. */
class KMeansPMMLModelExportSuite extends SparkFunSuite {

  test("KMeansPMMLModelExport generate PMML format") {
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))
    val exported = PMMLModelExportFactory.createPMMLModelExport(new KMeansModel(centers))

    // The factory result must be a PMML exporter before its document is inspected.
    assert(exported.isInstanceOf[PMMLModelExport])
    val document = exported.asInstanceOf[PMMLModelExport].getPmml
    assert(document.getHeader.getDescription === "k-means clustering")

    // The data dictionary declares one field per dimension of a cluster center.
    assert(document.getDataDictionary.getNumberOfFields === centers(0).size)

    // The first attached model must be a clustering model that reports as many clusters as
    // the Spark model has centers.
    val clustering = document.getModels.get(0).asInstanceOf[ClusteringModel]
    assert(clustering.getNumberOfClusters === centers.length)
  }
}
Example 34
Source File: PMMLModelExportFactorySuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel}
import org.apache.spark.mllib.util.LinearDataGenerator

/** Verifies which exporter [[PMMLModelExportFactory]] selects for each kind of model. */
class PMMLModelExportFactorySuite extends SparkFunSuite {

  /** Shorthand for the factory call under test. */
  private def exporterFor(model: Any): PMMLModelExport =
    PMMLModelExportFactory.createPMMLModelExport(model)

  /** First generated point; its features and label are reused as model weights and intercept. */
  private def generatedPoint =
    LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17).head

  test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") {
    val centers = Array(
      Vectors.dense(1.0, 2.0, 6.0),
      Vectors.dense(1.0, 3.0, 0.0),
      Vectors.dense(1.0, 4.0, 6.0))

    assert(exporterFor(new KMeansModel(centers)).isInstanceOf[KMeansPMMLModelExport])
  }

  test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a " +
    "LinearRegressionModel, RidgeRegressionModel or LassoModel") {
    val point = generatedPoint

    val linear = new LinearRegressionModel(point.features, point.label)
    assert(exporterFor(linear).isInstanceOf[GeneralizedLinearPMMLModelExport])

    val ridge = new RidgeRegressionModel(point.features, point.label)
    assert(exporterFor(ridge).isInstanceOf[GeneralizedLinearPMMLModelExport])

    val lasso = new LassoModel(point.features, point.label)
    assert(exporterFor(lasso).isInstanceOf[GeneralizedLinearPMMLModelExport])
  }

  test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport " +
    "when passing a LogisticRegressionModel or SVMModel") {
    val point = generatedPoint

    val logistic = new LogisticRegressionModel(point.features, point.label)
    assert(exporterFor(logistic).isInstanceOf[BinaryClassificationPMMLModelExport])

    val svm = new SVMModel(point.features, point.label)
    assert(exporterFor(svm).isInstanceOf[BinaryClassificationPMMLModelExport])
  }

  test("PMMLModelExportFactory throw IllegalArgumentException " +
    "when passing a Multinomial Logistic Regression") {
    // Three classes: the factory only accepts logistic models with exactly two.
    val threeClassModel = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4),
      intercept = 1.0,
      numFeatures = 2,
      numClasses = 3)

    intercept[IllegalArgumentException] {
      exporterFor(threeClassModel)
    }
  }

  test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") {
    val notAModel = new Object

    intercept[IllegalArgumentException] {
      exporterFor(notAModel)
    }
  }
}
Example 35
Source File: KMeansPMMLModelExport.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel

  /**
   * Describes `model` as a center-based PMML clustering model and attaches it to `pmml`.
   *
   * The header description is always set. The data dictionary and the clustering model are
   * only added when the model has at least one cluster center.
   *
   * @param model the k-means model whose cluster centers are exported
   */
  private def populateKMeansPMML(model : KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    val centers = model.clusterCenters
    if (centers.nonEmpty) {
      // Every centroid is described with the dimensionality of the first one.
      val dimension = centers.head.size
      val fieldNames = SArray.tabulate(dimension)(i => FieldName.create("field_" + i))

      val dataDictionary = new DataDictionary
      val miningSchema = new MiningSchema
      val distance = new ComparisonMeasure()
        .withKind(ComparisonMeasure.Kind.DISTANCE)
        .withMeasure(new SquaredEuclidean())
      val clusteringModel = new ClusteringModel()
        .withModelName("k-means")
        .withMiningSchema(miningSchema)
        .withComparisonMeasure(distance)
        .withFunctionName(MiningFunctionType.CLUSTERING)
        .withModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .withNumberOfClusters(centers.length)

      // Register each coordinate as a continuous double field in all three structures.
      fieldNames.foreach { name =>
        dataDictionary.withDataFields(new DataField(name, OpType.CONTINUOUS, DataType.DOUBLE))
        miningSchema.withMiningFields(
          new MiningField(name).withUsageType(FieldUsageType.ACTIVE))
        clusteringModel.withClusteringFields(
          new ClusteringField(name).withCompareFunction(CompareFunctionType.ABS_DIFF))
      }
      dataDictionary.withNumberOfFields(dataDictionary.getDataFields.size)

      centers.zipWithIndex.foreach { case (center, i) =>
        // Coordinates are written as one space-separated string.
        val coordinates = new org.dmg.pmml.Array()
          .withType(Array.Type.REAL)
          .withN(dimension)
          .withValue(center.toArray.mkString(" "))
        // Only the centroids are available, not the size of each cluster, so no
        // `withSize(value)` call is made on the cluster.
        val cluster = new Cluster()
          .withName("cluster_" + i)
          .withArray(coordinates)
        clusteringModel.withClusters(cluster)
      }

      pmml.setDataDictionary(dataDictionary)
      pmml.withModels(clusteringModel)
    }
  }
} // closes the enclosing class, whose declaration is not visible in this excerpt
Example 36
Source File: KMeansExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
// $example off$

/**
 * Trains a k-means model on `data/mllib/kmeans_data.txt`, prints its cost, then saves the
 * model and loads it back.
 *
 * NOTE(review): the `$example on$` / `$example off$` comments look like markers consumed by
 * external tooling; they are kept exactly where they were - confirm before moving them.
 */
object KMeansExample {

  /**
   * Program entry point.
   *
   * @param args command-line arguments; not read by this example
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("KMeansExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/kmeans_data.txt")
    val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()

    // Cluster the data into two classes using KMeans
    val numClusters = 2
    val numIterations = 20
    val clusters = KMeans.train(parsedData, numClusters, numIterations)

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    val WSSSE = clusters.computeCost(parsedData)
    println(s"Within Set Sum of Squared Errors = $WSSSE")

    // Save and load model
    clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
    val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 37
Source File: KMeansPMMLModelExport.scala From drizzle-spark with Apache License 2.0 | 4 votes |
package org.apache.spark.mllib.pmml.export

import scala.{Array => SArray}

import org.dmg.pmml._

import org.apache.spark.mllib.clustering.KMeansModel

  /**
   * Describes `model` as a center-based PMML clustering model and attaches it to `pmml`.
   *
   * The header description is always set. The data dictionary and the clustering model are
   * only added when the model has at least one cluster center.
   *
   * @param model the k-means model whose cluster centers are exported
   */
  private def populateKMeansPMML(model: KMeansModel): Unit = {
    pmml.getHeader.setDescription("k-means clustering")

    val centers = model.clusterCenters
    if (centers.nonEmpty) {
      // Every centroid is described with the dimensionality of the first one.
      val dimension = centers.head.size
      val fieldNames = SArray.tabulate(dimension)(i => FieldName.create("field_" + i))

      val dataDictionary = new DataDictionary
      val miningSchema = new MiningSchema
      val distance = new ComparisonMeasure()
        .setKind(ComparisonMeasure.Kind.DISTANCE)
        .setMeasure(new SquaredEuclidean())
      val clusteringModel = new ClusteringModel()
        .setModelName("k-means")
        .setMiningSchema(miningSchema)
        .setComparisonMeasure(distance)
        .setFunctionName(MiningFunctionType.CLUSTERING)
        .setModelClass(ClusteringModel.ModelClass.CENTER_BASED)
        .setNumberOfClusters(centers.length)

      // Register each coordinate as a continuous double field in all three structures.
      fieldNames.foreach { name =>
        dataDictionary.addDataFields(new DataField(name, OpType.CONTINUOUS, DataType.DOUBLE))
        miningSchema.addMiningFields(
          new MiningField(name).setUsageType(FieldUsageType.ACTIVE))
        clusteringModel.addClusteringFields(
          new ClusteringField(name).setCompareFunction(CompareFunctionType.ABS_DIFF))
      }
      dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size)

      centers.zipWithIndex.foreach { case (center, i) =>
        // Coordinates are written as one space-separated string.
        val coordinates = new org.dmg.pmml.Array()
          .setType(Array.Type.REAL)
          .setN(dimension)
          .setValue(center.toArray.mkString(" "))
        // Only the centroids are available, not the size of each cluster, so no size is
        // set on the cluster.
        val cluster = new Cluster()
          .setName("cluster_" + i)
          .setArray(coordinates)
        clusteringModel.addClusters(cluster)
      }

      pmml.setDataDictionary(dataDictionary)
      pmml.addModels(clusteringModel)
    }
  }
} // closes the enclosing class, whose declaration is not visible in this excerpt