org.apache.spark.ml.classification.DecisionTreeClassificationModel Scala Examples
The following examples show how to use org.apache.spark.ml.classification.DecisionTreeClassificationModel.
Each example notes its source file, the project it comes from, and that project's license.
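Before the examples, a minimal, hypothetical sketch (not taken from any of the projects below) of how such a model is usually produced and reused: fit a DecisionTreeClassifier, score with transform, and persist with save/load. The file paths, app name, and master setting are placeholders.

import org.apache.spark.ml.classification.{DecisionTreeClassifier, DecisionTreeClassificationModel}
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("dtc-sketch").master("local[*]").getOrCreate()

// Placeholder path: any libsvm-formatted file with "label"/"features" columns works.
val data = spark.read.format("libsvm").load("data/sample_libsvm_data.txt")

// Fitting the classifier yields a DecisionTreeClassificationModel.
val model: DecisionTreeClassificationModel =
  new DecisionTreeClassifier()
    .setLabelCol("label")
    .setFeaturesCol("features")
    .fit(data)

// Score, then persist and reload the fitted model.
model.transform(data).select("prediction", "probability").show(5)
model.write.overwrite().save("/tmp/dtc-model")
val reloaded = DecisionTreeClassificationModel.load("/tmp/dtc-model")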
Example 1
Source File: DecisionTreePrediction.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.ml_classification

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.sql.SparkSession

class DecisionTreePrediction extends ConfigurableStop {
  val authorEmail: String = "[email protected]"
  val description: String = "Use an existing decision tree model to predict."
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var test_data_path: String = _
  var model_path: String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()

    // load data stored in libsvm format as a dataframe
    val data = spark.read.format("libsvm").load(test_data_path)
    // data.show()

    // load model
    val model = DecisionTreeClassificationModel.load(model_path)
    val predictions = model.transform(data)
    predictions.show()
    out.write(predictions)
  }

  def initialize(ctx: ProcessContext): Unit = {}

  def setProperties(map: Map[String, Any]): Unit = {
    test_data_path = MapUtil.get(map, key = "test_data_path").asInstanceOf[String]
    model_path = MapUtil.get(map, key = "model_path").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val test_data_path = new PropertyDescriptor().name("test_data_path").displayName("TEST_DATA_PATH").defaultValue("").required(true)
    val model_path = new PropertyDescriptor().name("model_path").displayName("MODEL_PATH").defaultValue("").required(true)
    descriptor = test_data_path :: descriptor
    descriptor = model_path :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/ml_classification/DecisionTreePrediction.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MLGroup.toString)
  }
}
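For context, the stop above depends on only two string properties. A hypothetical driver snippet (real PiFlow jobs set these through the flow configuration; the paths are placeholders) would wire it up like this:

// Hypothetical wiring sketch: the property names mirror the PropertyDescriptors above.
val stop = new DecisionTreePrediction()
stop.setProperties(Map(
  "test_data_path" -> "/tmp/dtc-test.libsvm", // libsvm file to score (placeholder)
  "model_path"     -> "/tmp/dtc-model"        // directory written by DecisionTreeClassificationModel.save (placeholder)
))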
Example 2
Source File: LocalDecisionTreeClassificationModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.classification

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.tree.Node

class LocalDecisionTreeClassificationModel(
  override val sparkTransformer: DecisionTreeClassificationModel
) extends LocalProbabilisticClassificationModel[DecisionTreeClassificationModel] {}

object LocalDecisionTreeClassificationModel
  extends SimpleModelLoader[DecisionTreeClassificationModel]
  with TypedTransformerConverter[DecisionTreeClassificationModel] {

  override implicit def toLocal(
    sparkTransformer: DecisionTreeClassificationModel
  ): LocalDecisionTreeClassificationModel = {
    new LocalDecisionTreeClassificationModel(sparkTransformer)
  }

  override def build(metadata: Metadata, data: LocalData): DecisionTreeClassificationModel = {
    createTree(metadata, data)
  }

  def createTree(metadata: Metadata, data: LocalData): DecisionTreeClassificationModel = {
    val ctor = classOf[DecisionTreeClassificationModel].getDeclaredConstructor(
      classOf[String],
      classOf[Node],
      classOf[Int],
      classOf[Int]
    )
    ctor.setAccessible(true)
    val inst = ctor.newInstance(
      metadata.uid,
      DataUtils.createNode(0, metadata, data),
      metadata.numFeatures.get.asInstanceOf[java.lang.Integer],
      metadata.numClasses.get.asInstanceOf[java.lang.Integer]
    )
    inst
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])
      .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String])
      .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String])
    inst
      .set(inst.seed, metadata.paramMap("seed").toString.toLong)
      .set(inst.cacheNodeIds, metadata.paramMap("cacheNodeIds").toString.toBoolean)
      .set(inst.maxDepth, metadata.paramMap("maxDepth").toString.toInt)
      .set(inst.labelCol, metadata.paramMap("labelCol").toString)
      .set(inst.minInfoGain, metadata.paramMap("minInfoGain").toString.toDouble)
      .set(inst.checkpointInterval, metadata.paramMap("checkpointInterval").toString.toInt)
      .set(inst.minInstancesPerNode, metadata.paramMap("minInstancesPerNode").toString.toInt)
      .set(inst.maxMemoryInMB, metadata.paramMap("maxMemoryInMB").toString.toInt)
      .set(inst.maxBins, metadata.paramMap("maxBins").toString.toInt)
      .set(inst.impurity, metadata.paramMap("impurity").toString)
  }
}
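The notable trick here is building the model through its private[ml] constructor via reflection. Stripped of the spark-ml-serving plumbing, the same call looks like the hypothetical sketch below (assumes trained is an already-fitted DecisionTreeClassificationModel in scope; the uid string is made up):

import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.tree.Node

// The constructor is private[ml], so open it up before instantiating.
val ctor = classOf[DecisionTreeClassificationModel].getDeclaredConstructor(
  classOf[String], classOf[Node], classOf[Int], classOf[Int])
ctor.setAccessible(true)
val rebuilt = ctor.newInstance(
  "dtc_rebuilt",               // hypothetical uid
  trained.rootNode,            // reuse the tree of an existing model
  Int.box(trained.numFeatures),
  Int.box(trained.numClasses))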
Example 3
Source File: RandomForestClassifierOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.bundle.serializer.ModelSerializer
import ml.combust.bundle.dsl._
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.bundle.tree.decision.SparkNodeWrapper
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, RandomForestClassificationModel}

class RandomForestClassifierOp extends SimpleSparkOp[RandomForestClassificationModel] {
  implicit val nodeWrapper = SparkNodeWrapper

  override val Model: OpModel[SparkBundleContext, RandomForestClassificationModel] =
    new OpModel[SparkBundleContext, RandomForestClassificationModel] {
      override val klazz: Class[RandomForestClassificationModel] = classOf[RandomForestClassificationModel]

      override def opName: String = Bundle.BuiltinOps.classification.random_forest_classifier

      override def store(model: Model, obj: RandomForestClassificationModel)
                        (implicit context: BundleContext[SparkBundleContext]): Model = {
        var i = 0
        val trees = obj.trees.map { tree =>
          val name = s"tree$i"
          ModelSerializer(context.bundleContext(name)).write(tree).get
          i = i + 1
          name
        }
        val thresholds = if (obj.isSet(obj.thresholds)) {
          Some(obj.getThresholds)
        } else None
        model.withValue("num_features", Value.long(obj.numFeatures)).
          withValue("num_classes", Value.long(obj.numClasses)).
          withValue("tree_weights", Value.doubleList(obj.treeWeights)).
          withValue("trees", Value.stringList(trees)).
          withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList))
      }

      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): RandomForestClassificationModel = {
        val numFeatures = model.value("num_features").getLong.toInt
        val numClasses = model.value("num_classes").getLong.toInt
        val treeWeights = model.value("tree_weights").getDoubleList

        // TODO: get rid of this when Spark supports setting tree weights
        for (weight <- treeWeights) {
          require(weight == 1.0, "tree weights must be 1.0 for Spark")
        }

        val models = model.value("trees").getStringList.map { tree =>
          ModelSerializer(context.bundleContext(tree)).read().get.asInstanceOf[DecisionTreeClassificationModel]
        }.toArray

        val m = new RandomForestClassificationModel(uid = "",
          numFeatures = numFeatures,
          numClasses = numClasses,
          _trees = models)

        model.getValue("thresholds").
          map(t => m.setThresholds(t.getDoubleList.toArray)).
          getOrElse(m)
      }
    }

  override def sparkLoad(uid: String, shape: NodeShape, model: RandomForestClassificationModel): RandomForestClassificationModel = {
    val r = new RandomForestClassificationModel(uid = uid,
      _trees = model.trees,
      numFeatures = model.numFeatures,
      numClasses = model.numClasses)
    if (model.isDefined(model.thresholds)) { r.setThresholds(model.getThresholds) }
    r
  }

  override def sparkInputs(obj: RandomForestClassificationModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: RandomForestClassificationModel): Seq[SimpleParamSpec] = {
    Seq("raw_prediction" -> obj.rawPredictionCol,
      "probability" -> obj.probabilityCol,
      "prediction" -> obj.predictionCol)
  }
}
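Everything store serializes here comes from the public surface of a fitted forest. A hypothetical inspection (assuming rf is an already-fitted RandomForestClassificationModel in scope) would be:

import org.apache.spark.ml.classification.RandomForestClassificationModel

// Assumes rf: RandomForestClassificationModel has already been fitted.
rf.trees.zipWithIndex.foreach { case (tree, i) =>
  println(s"tree$i: ${tree.numNodes} nodes, depth ${tree.depth}")
}
println(rf.treeWeights.mkString(", "))  // Spark currently fixes these at 1.0, hence the require above
println(s"numFeatures=${rf.numFeatures}, numClasses=${rf.numClasses}")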
Example 4
Source File: DecisionTreeClassifierOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.OpModel
import org.apache.spark.ml.tree
import ml.combust.bundle.dsl._
import ml.combust.bundle.tree.decision.TreeSerializer
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.bundle.tree.decision.SparkNodeWrapper
import org.apache.spark.ml.classification.DecisionTreeClassificationModel

class DecisionTreeClassifierOp extends SimpleSparkOp[DecisionTreeClassificationModel] {
  implicit val nodeWrapper = SparkNodeWrapper

  override val Model: OpModel[SparkBundleContext, DecisionTreeClassificationModel] =
    new OpModel[SparkBundleContext, DecisionTreeClassificationModel] {
      override val klazz: Class[DecisionTreeClassificationModel] = classOf[DecisionTreeClassificationModel]

      override def opName: String = Bundle.BuiltinOps.classification.decision_tree_classifier

      override def store(model: Model, obj: DecisionTreeClassificationModel)
                        (implicit context: BundleContext[SparkBundleContext]): Model = {
        TreeSerializer[tree.Node](context.file("tree"), withImpurities = true).write(obj.rootNode)
        val thresholds = if (obj.isSet(obj.thresholds)) {
          Some(obj.getThresholds)
        } else None
        model.withValue("num_features", Value.long(obj.numFeatures)).
          withValue("num_classes", Value.long(obj.numClasses)).
          withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList))
      }

      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): DecisionTreeClassificationModel = {
        val rootNode = TreeSerializer[tree.Node](context.file("tree"), withImpurities = true).read().get
        val dt = new DecisionTreeClassificationModel(uid = "",
          rootNode = rootNode,
          numClasses = model.value("num_classes").getLong.toInt,
          numFeatures = model.value("num_features").getLong.toInt)

        model.getValue("thresholds").
          map(t => dt.setThresholds(t.getDoubleList.toArray)).
          getOrElse(dt)
      }
    }

  override def sparkLoad(uid: String, shape: NodeShape, model: DecisionTreeClassificationModel): DecisionTreeClassificationModel = {
    val r = new DecisionTreeClassificationModel(uid = uid,
      rootNode = model.rootNode,
      numFeatures = model.numFeatures,
      numClasses = model.numClasses)
    if (model.isDefined(model.thresholds)) { r.setThresholds(model.getThresholds) }
    r
  }

  override def sparkInputs(obj: DecisionTreeClassificationModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: DecisionTreeClassificationModel): Seq[SimpleParamSpec] = {
    Seq("raw_prediction" -> obj.rawPredictionCol,
      "probability" -> obj.probabilityCol,
      "prediction" -> obj.predictionCol)
  }
}
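The serializer above walks obj.rootNode; the same tree structure can be inspected directly through Spark's public API. A hypothetical sketch, assuming model is an already-fitted DecisionTreeClassificationModel in scope:

import org.apache.spark.ml.tree.{InternalNode, LeafNode, Node}

// Assumes model: DecisionTreeClassificationModel has already been fitted.
println(s"${model.numNodes} nodes, depth ${model.depth}")
println(model.toDebugString)  // textual if/else view of the tree

// Collect all leaves by walking the same node structure the serializer writes out.
def leaves(node: Node): Seq[LeafNode] = node match {
  case l: LeafNode     => Seq(l)
  case n: InternalNode => leaves(n.leftChild) ++ leaves(n.rightChild)
}
leaves(model.rootNode).foreach(l => println(s"leaf -> ${l.prediction} (impurity ${l.impurity})"))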
Example 5
Source File: OpDecisionTreeClassifier.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.classification

import com.salesforce.op.UID
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.CheckIsResponseValues
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel}
import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier, OpDecisionTreeClassifierParams}
import scala.reflect.runtime.universe.TypeTag

class OpDecisionTreeClassificationModel
(
  sparkModel: DecisionTreeClassificationModel,
  uid: String = UID[OpDecisionTreeClassificationModel],
  operationName: String = classOf[DecisionTreeClassifier].getSimpleName
)(
  implicit tti1: TypeTag[RealNN],
  tti2: TypeTag[OPVector],
  tto: TypeTag[Prediction],
  ttov: TypeTag[Prediction#Value]
) extends OpProbabilisticClassifierModel[DecisionTreeClassificationModel](
  sparkModel = sparkModel, uid = uid, operationName = operationName
) {
  @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw")
  @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability")
  @transient lazy val probability2predictionMirror = reflectMethod(getSparkMlStage().get, "probability2prediction")
}
Example 6
Source File: OpDecisionTreeClassifierTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.classification

import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.PredictionEquality
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel}
import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder}
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class OpDecisionTreeClassifierTest extends OpEstimatorSpec[Prediction,
  OpPredictorWrapperModel[DecisionTreeClassificationModel],
  OpPredictorWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel]] with PredictionEquality {

  override def specName: String = Spec[OpDecisionTreeClassifier]

  val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features",
    Seq[(RealNN, OPVector)](
      1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector,
      0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector,
      0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector,
      1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector,
      1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector,
      0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector,
      1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector,
      0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector
    )
  )
  val feature1 = rawFeature1.copy(isResponse = true)
  val estimator = new OpDecisionTreeClassifier().setInput(feature1, feature2)

  val expectedResult = Seq(
    Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)),
    Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)),
    Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)),
    Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)),
    Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)),
    Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)),
    Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)),
    Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0))
  )

  it should "allow the user to set the desired spark parameters" in {
    estimator
      .setMaxDepth(6)
      .setMaxBins(2)
      .setMinInstancesPerNode(2)
      .setMinInfoGain(0.1)
    estimator.fit(inputData)

    estimator.predictor.getMaxDepth shouldBe 6
    estimator.predictor.getMaxBins shouldBe 2
    estimator.predictor.getMinInstancesPerNode shouldBe 2
    estimator.predictor.getMinInfoGain shouldBe 0.1
  }
}
Example 7
Source File: RandomForestSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.impl.TreeTests
import org.apache.spark.ml.tree.{ContinuousSplit, DecisionTreeModel, LeafNode, Node}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.tree.impurity.GiniCalculator
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.collection.OpenHashMap

// Note: the original excerpt starts inside the feature-importance test; the enclosing
// class and test declarations below are reconstructed so the braces balance.
class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext {

  import RandomForestSuite.mapToVec

  test("computeFeatureImportance, featureImportances") {
    val leftImp = new GiniCalculator(Array(3.0, 2.0, 1.0))
    val left = new LeafNode(0.0, leftImp.calculate(), leftImp)

    val rightImp = new GiniCalculator(Array(1.0, 2.0, 5.0))
    val right = new LeafNode(2.0, rightImp.calculate(), rightImp)

    val parent = TreeTests.buildParentNode(left, right, new ContinuousSplit(0, 0.5))
    val parentImp = parent.impurityStats

    val left2Imp = new GiniCalculator(Array(1.0, 6.0, 1.0))
    val left2 = new LeafNode(0.0, left2Imp.calculate(), left2Imp)

    val grandParent = TreeTests.buildParentNode(left2, parent, new ContinuousSplit(1, 1.0))
    val grandImp = grandParent.impurityStats

    // Test feature importance computed at different subtrees.
    def testNode(node: Node, expected: Map[Int, Double]): Unit = {
      val map = new OpenHashMap[Int, Double]()
      RandomForest.computeFeatureImportance(node, map)
      assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01)
    }

    // Leaf node
    testNode(left, Map.empty[Int, Double])

    // Internal node with 2 leaf children
    val feature0importance = parentImp.calculate() * parentImp.count -
      (leftImp.calculate() * leftImp.count + rightImp.calculate() * rightImp.count)
    testNode(parent, Map(0 -> feature0importance))

    // Full tree
    val feature1importance = grandImp.calculate() * grandImp.count -
      (left2Imp.calculate() * left2Imp.count + parentImp.calculate() * parentImp.count)
    testNode(grandParent, Map(0 -> feature0importance, 1 -> feature1importance))

    // Forest consisting of (full tree) + (internal node with 2 leafs)
    val trees = Array(parent, grandParent).map { root =>
      new DecisionTreeClassificationModel(root, numFeatures = 2, numClasses = 3)
        .asInstanceOf[DecisionTreeModel]
    }
    val importances: Vector = RandomForest.featureImportances(trees, 2)
    val tree2norm = feature0importance + feature1importance
    val expected = Vectors.dense((1.0 + feature0importance / tree2norm) / 2.0,
      (feature1importance / tree2norm) / 2.0)
    assert(importances ~== expected relTol 0.01)
  }

  test("normalizeMapValues") {
    val map = new OpenHashMap[Int, Double]()
    map(0) = 1.0
    map(2) = 2.0
    RandomForest.normalizeMapValues(map)
    val expected = Map(0 -> 1.0 / 3.0, 2 -> 2.0 / 3.0)
    assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01)
  }
}

private object RandomForestSuite {
  def mapToVec(map: Map[Int, Double]): Vector = {
    val size = (map.keys.toSeq :+ 0).max + 1
    val (indices, values) = map.toSeq.sortBy(_._1).unzip
    Vectors.sparse(size, indices.toArray, values.toArray)
  }
}
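The suite above exercises the internal importance helpers directly; on a fitted model the same impurity-gain importances are exposed publicly as featureImportances. A hypothetical sketch on a toy DataFrame (column names and values are made up):

import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("fi-sketch").getOrCreate()

// Tiny toy dataset: the label depends only on the first feature.
val df = spark.createDataFrame(Seq(
  (0.0, Vectors.dense(0.0, 5.0)),
  (0.0, Vectors.dense(0.2, 1.0)),
  (1.0, Vectors.dense(1.0, 4.0)),
  (1.0, Vectors.dense(0.9, 2.0))
)).toDF("label", "features")

val model = new DecisionTreeClassifier().fit(df)
// Impurity-gain-based importances, normalized to sum to 1.0.
println(model.featureImportances)  // expect nearly all weight on feature 0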
Example 8
Source File: RandomForestClassificationModelToMleap.scala From mleap with Apache License 2.0
package org.apache.spark.ml.mleap.converter.runtime.classification

import com.truecar.mleap.core.classification.RandomForestClassification
import com.truecar.mleap.runtime.transformer
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, RandomForestClassificationModel}
import org.apache.spark.ml.mleap.converter.runtime.TransformerToMleap

object RandomForestClassificationModelToMleap
  extends TransformerToMleap[RandomForestClassificationModel, transformer.RandomForestClassificationModel] {

  override def toMleap(t: RandomForestClassificationModel): transformer.RandomForestClassificationModel = {
    val trees = t.trees.asInstanceOf[Array[DecisionTreeClassificationModel]]
      .map(tree => DecisionTreeClassificationModelToMleap(tree).toMleap)
    val model = RandomForestClassification(trees, t.numFeatures, t.numClasses)

    transformer.RandomForestClassificationModel(t.getFeaturesCol, t.getPredictionCol, model)
  }
}
Example 9
Source File: MleapSparkSupport.scala From mleap with Apache License 2.0
package com.truecar.mleap.spark

import com.truecar.mleap.core.linalg
import com.truecar.mleap.runtime.transformer.{Transformer => MleapTransformer}
import com.truecar.mleap.runtime.{types, Row => MleapRow}
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.mleap.converter._
import org.apache.spark.ml.mleap.converter.runtime.{BaseTransformerConverter, TransformerToMleap}
import org.apache.spark.ml.mleap.converter.runtime.classification.DecisionTreeClassificationModelToMleap
import org.apache.spark.ml.mleap.converter.runtime.regression.DecisionTreeRegressionModelToMleap
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.tree._
import org.apache.spark.ml.Transformer
import org.apache.spark.mllib.linalg._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, SQLContext}

trait MleapSparkSupport extends BaseTransformerConverter {
  import scala.language.implicitConversions

  implicit def transformerToMleapLifted[T <: Transformer]
  (t: T)
  (implicit transformerToMleap: TransformerToMleap[T, _ <: MleapTransformer]): MleapTransformer = {
    transformerToMleap.toMleapLifted(t)
  }

  implicit def mleapTransformerWrapper[T <: MleapTransformer](t: T): MleapTransformerWrapper[T] = {
    MleapTransformerWrapper(t)
  }

  implicit def vectorToSpark(vector: linalg.Vector): VectorToSpark = VectorToSpark(vector)
  implicit def vectorToMleap(vector: Vector): VectorToMleap = VectorToMleap(vector)
  implicit def dataFrameToMleap(dataset: DataFrame): DataFrameToMleap = DataFrameToMleap(dataset)
  implicit def decisionTreeRegressionModelToMleap(tree: DecisionTreeRegressionModel): DecisionTreeRegressionModelToMleap = DecisionTreeRegressionModelToMleap(tree)
  implicit def decisionTreeClassificationModelToMleap(tree: DecisionTreeClassificationModel): DecisionTreeClassificationModelToMleap = DecisionTreeClassificationModelToMleap(tree)
  implicit def nodeToMleap(node: Node): NodeToMleap = NodeToMleap(node)
  implicit def splitToMleap(split: Split): SplitToMleap = SplitToMleap(split)
  implicit def structTypeToMleap(schema: StructType): StructTypeToMleap = StructTypeToMleap(schema)

  implicit def rowToSpark(row: MleapRow): RowToSpark = RowToSpark(row)
  implicit def structTypeToSpark(schema: types.StructType): StructTypeToSpark = StructTypeToSpark(schema)

  implicit def leapFrameToSpark[T: LeapFrameToSpark](frame: T): LeapFrameToSparkWrapper[T] = {
    LeapFrameToSparkWrapper(frame)
  }

  implicit def leapFrameToSparkConvert[T: LeapFrameToSpark](frame: T)
                                      (implicit sqlContext: SQLContext): DataFrame = {
    implicitly[LeapFrameToSpark[T]].toSpark(frame)
  }

  implicit def dataFrameToLeapFrame(dataFrame: DataFrame): SparkLeapFrame = dataFrame.toMleap
}

object MleapSparkSupport extends MleapSparkSupport