org.apache.spark.ml.classification.DecisionTreeClassifier Scala Examples
The following examples show how to use org.apache.spark.ml.classification.DecisionTreeClassifier.
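Before the project-specific examples below, here is a minimal, self-contained sketch of the basic fit/transform/evaluate flow for DecisionTreeClassifier. The dataset path, session settings, and split seed are illustrative assumptions, not taken from any of the projects listed here:

import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.SparkSession

// Local session and a libsvm-formatted sample file (both assumed for illustration).
val spark = SparkSession.builder.master("local[*]").appName("DecisionTreeClassifierSketch").getOrCreate()
val data = spark.read.format("libsvm").load("data/sample_libsvm_data.txt")
val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed = 42L)

// Configure and fit the classifier on the training split.
val dt = new DecisionTreeClassifier()
  .setLabelCol("label")
  .setFeaturesCol("features")
  .setMaxDepth(5)
val model = dt.fit(train)

// Score the held-out split and report accuracy.
val predictions = model.transform(test)
val accuracy = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")
  .evaluate(predictions)
println(s"Test accuracy: $accuracy")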
Example 1
Source File: OptimizedDecisionTreeIntegrationSuite.scala From oraf with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.classification.{DecisionTreeClassifier, OptimizedDecisionTreeClassifier}
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{DecisionTreeRegressor, OptimizedDecisionTreeRegressor}
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame

// Class declaration reconstructed from the file name and trailing brace; the helpers
// setParams and medDepthTreeSettings are defined elsewhere in the original suite.
class OptimizedDecisionTreeIntegrationSuite extends SparkFunSuite with MLlibTestSparkContext {

  private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val oldTree = setParams(new DecisionTreeRegressor(), testParams)
    val newTree = setParams(new OptimizedDecisionTreeRegressor(), testParams)
    val newModel = newTree.fit(train)
    val oldModel = oldTree.fit(train)
    OptimizedTreeTests.checkEqual(oldModel, newModel)
  }

  private def testClassifierEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val oldTree = setParams(new DecisionTreeClassifier(), testParams)
    val newTree = setParams(new OptimizedDecisionTreeClassifier(), testParams)
    val newModel = newTree.fit(train)
    val model = oldTree.fit(train)
    OptimizedTreeTests.checkEqual(model, newModel)
  }

  test("Local & distributed training produce the same tree on a toy dataset") {
    val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
    testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree with two feature values") {
    val data = sc.parallelize(Range(0, 8).map(x => {
      if (x > 3) {
        Instance(x, 1.0, Vectors.dense(0.0))
      } else {
        Instance(x, 1.0, Vectors.dense(1.0))
      }}))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
    testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
    val data = sc.parallelize(Range(0, 10).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce the same tree on a larger toy dataset") {
    val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce same tree on a dataset of categorical features") {
    val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
    // Create a map of categorical feature index to arity; each feature has arity nclasses
    val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
    // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
    // categorical features
    val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a dataset of continuous features") {
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._
    // Use maxDepth = 5 and default params
    val params = medDepthTreeSettings
    val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
      nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
      .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
      .toDF().cache()
    testEquivalence(data, params)
  }

  test("Local & distributed training produce the same tree on a dataset of constant features") {
    // Generate constant, continuous data
    val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }
}
Example 2
Source File: DecisionTreeClassifierParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class DecisionTreeClassifierParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new StringIndexer().
      setInputCol("approved").
      setOutputCol("label"),
    new DecisionTreeClassifier().
      setThresholds(Array(0.4)).
      setFeaturesCol("features").
      setLabelCol("label"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
Example 3
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)
    stages += vectorAssembler
    stages += dt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 4
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)
    stages += vectorAssembler
    stages += dt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/DT.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/DecisionTree.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
                      regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 5
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)
    stages += vectorAssembler
    stages += dt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 6
Source File: ClassifiersImpl.scala From spark_training with Apache License 2.0
package com.malaska.spark.training.machinelearning.common

import org.apache.spark.ml.classification.{DecisionTreeClassifier, GBTClassifier, LogisticRegression, NaiveBayes}
import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, RegressionEvaluator}
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.sql._

object ClassifiersImpl {
  def logisticRegression(trainingLabeledPointDf: DataFrame, testPercentage: Double): Unit = {
    val mlr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    val splits = trainingLabeledPointDf.randomSplit(Array(testPercentage, 1 - testPercentage))

    val model = mlr.fit(splits(0))
    val trainTransformed = model.transform(splits(1))

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(trainTransformed)
    println("Test set accuracy of logisticRegression = " + accuracy)
    //println(model)
  }

  def gbtClassifer(trainingLabeledPointDf: DataFrame, testPercentage: Double): Unit = {
    val gbt = new GBTClassifier()

    val splits = trainingLabeledPointDf.randomSplit(Array(testPercentage, 1 - testPercentage))

    val model = gbt.fit(splits(0))
    val trainTransformed = model.transform(splits(1))

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(trainTransformed)
    println("Test set accuracy of gbtClassifier = " + accuracy)
    //println(model)
    //println(model.toDebugString)
  }

  def randomForestRegressor(trainingLabeledPointDf: DataFrame, impurity: String,
                            maxDepth: Int, maxBins: Int, testPercentage: Double): Unit = {
    val rf = new RandomForestRegressor()
    rf.setImpurity(impurity)
    rf.setMaxDepth(maxDepth)
    rf.setMaxBins(maxBins)

    val splits = trainingLabeledPointDf.randomSplit(Array(testPercentage, 1 - testPercentage))

    val model = rf.fit(splits(0))
    val trainTransformed = model.transform(splits(1))

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(trainTransformed)
    println("Test set accuracy of randomForestRegressor = " + accuracy)
  }
}
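Note that the snippet above imports DecisionTreeClassifier but none of the methods shown actually uses it. For completeness, here is a hedged sketch of what a decision-tree method in the same style could look like; the method name decisionTree and its body are assumptions for illustration, not taken from the original file:

  def decisionTree(trainingLabeledPointDf: DataFrame, testPercentage: Double): Unit = {
    val dt = new DecisionTreeClassifier()

    // Same split-and-evaluate pattern as the other methods in this object.
    val splits = trainingLabeledPointDf.randomSplit(Array(testPercentage, 1 - testPercentage))

    val model = dt.fit(splits(0))
    val trainTransformed = model.transform(splits(1))

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(trainTransformed)
    println("Test set accuracy of decisionTree = " + accuracy)
  }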
Example 7
Source File: TrainNewsClassWithDTDemo.scala From CkoocNLP with Apache License 2.0
package applications.mining

import config.paramconf.ClassParams
import functions.Preprocessor
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession

object TrainNewsClassWithDTDemo {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("train news with DT Demo")
      .getOrCreate()

    val args = Array("ckooc-ml/data/classnews/train")
    val filePath = args(0)

    import spark.implicits._
    val data = spark.sparkContext.textFile(filePath).flatMap { line =>
      val tokens: Array[String] = line.split("\u00ef")
      if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
    }.toDF("label", "title", "time", "content")
    data.persist()

    val preprocessor = new Preprocessor
    val pipeline = preprocessor.preprocess(data)

    // Train the decision tree model
    val params = new ClassParams
    val dtClassifier = new DecisionTreeClassifier()
      .setMinInfoGain(params.minInfoGain)
      .setMaxDepth(params.maxDepth) // Spark currently supports a maximum tree depth of 30
      .setLabelCol("indexedLabel")
      .setFeaturesCol("features")

    val indexModel = pipeline.getStages(1).asInstanceOf[StringIndexerModel]
    // Convert indexed predictions back to the original label strings
    val labelConverter = new IndexToString()
      .setLabels(indexModel.labels)
      .setInputCol(dtClassifier.getPredictionCol)
      .setOutputCol("predictedLabel")

    val stages = pipeline.getStages ++ Array(dtClassifier, labelConverter)
    pipeline.setStages(stages)

    val model = pipeline.fit(data)
    model.write.overwrite().save(params.DTModelPath)

    data.unpersist()
    spark.stop()
  }
}
Example 8
Source File: OpDecisionTreeClassifier.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.classification

import com.salesforce.op.UID
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.CheckIsResponseValues
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel}
import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier, OpDecisionTreeClassifierParams}

import scala.reflect.runtime.universe.TypeTag

class OpDecisionTreeClassificationModel
(
  sparkModel: DecisionTreeClassificationModel,
  uid: String = UID[OpDecisionTreeClassificationModel],
  operationName: String = classOf[DecisionTreeClassifier].getSimpleName
)(
  implicit tti1: TypeTag[RealNN],
  tti2: TypeTag[OPVector],
  tto: TypeTag[Prediction],
  ttov: TypeTag[Prediction#Value]
) extends OpProbabilisticClassifierModel[DecisionTreeClassificationModel](
  sparkModel = sparkModel, uid = uid, operationName = operationName
) {
  @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw")
  @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability")
  @transient lazy val probability2predictionMirror = reflectMethod(getSparkMlStage().get, "probability2prediction")
}
Example 9
Source File: OpDecisionTreeClassifierTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.classification

import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.PredictionEquality
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel}
import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder}
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class OpDecisionTreeClassifierTest extends OpEstimatorSpec[Prediction,
  OpPredictorWrapperModel[DecisionTreeClassificationModel],
  OpPredictorWrapper[DecisionTreeClassifier, DecisionTreeClassificationModel]] with PredictionEquality {

  override def specName: String = Spec[OpDecisionTreeClassifier]

  val (inputData, rawFeature1, feature2) = TestFeatureBuilder("label", "features",
    Seq[(RealNN, OPVector)](
      1.0.toRealNN -> Vectors.dense(12.0, 4.3, 1.3).toOPVector,
      0.0.toRealNN -> Vectors.dense(0.0, 0.3, 0.1).toOPVector,
      0.0.toRealNN -> Vectors.dense(1.0, 3.9, 4.3).toOPVector,
      1.0.toRealNN -> Vectors.dense(10.0, 1.3, 0.9).toOPVector,
      1.0.toRealNN -> Vectors.dense(15.0, 4.7, 1.3).toOPVector,
      0.0.toRealNN -> Vectors.dense(0.5, 0.9, 10.1).toOPVector,
      1.0.toRealNN -> Vectors.dense(11.5, 2.3, 1.3).toOPVector,
      0.0.toRealNN -> Vectors.dense(0.1, 3.3, 0.1).toOPVector
    )
  )
  val feature1 = rawFeature1.copy(isResponse = true)
  val estimator = new OpDecisionTreeClassifier().setInput(feature1, feature2)

  val expectedResult = Seq(
    Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)),
    Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)),
    Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)),
    Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)),
    Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)),
    Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0)),
    Prediction(1.0, Array(0.0, 4.0), Array(0.0, 1.0)),
    Prediction(0.0, Array(4.0, 0.0), Array(1.0, 0.0))
  )

  it should "allow the user to set the desired spark parameters" in {
    estimator
      .setMaxDepth(6)
      .setMaxBins(2)
      .setMinInstancesPerNode(2)
      .setMinInfoGain(0.1)
    estimator.fit(inputData)

    estimator.predictor.getMaxDepth shouldBe 6
    estimator.predictor.getMaxBins shouldBe 2
    estimator.predictor.getMinInstancesPerNode shouldBe 2
    estimator.predictor.getMinInfoGain shouldBe 0.1
  }
}
Example 10
Source File: DecisionTreeClassifierSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.classification

import com.ibm.aardpfark.pfa.ClassifierResult
import org.apache.spark.ml.classification.DecisionTreeClassifier

class DecisionTreeClassifierSuite extends SparkClassifierPFASuiteBase[ClassifierResult] {

  val inputPath = "data/sample_multiclass_classification_data.txt"
  val data = spark.read.format("libsvm").load(inputPath)
  val dt = new DecisionTreeClassifier()
    .setMaxDepth(5)
  override val sparkTransformer = dt.fit(data)

  val result = sparkTransformer.transform(data)
  override val input = withColumnAsArray(result, dt.getFeaturesCol).toJSON.collect()
  override val expectedOutput = result.select(dt.getPredictionCol)
    .toJSON.collect()

  // TODO: test with raw prediction and probability
}
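The TODO in the suite above mentions raw prediction and probability. As a hedged sketch (not part of the original suite), the fitted DecisionTreeClassificationModel also emits rawPrediction and probability columns that could be inspected alongside the prediction, for example:

  // Illustrative only: select the standard output columns of the fitted model.
  val withRawAndProb = sparkTransformer.transform(data)
    .select(dt.getPredictionCol, dt.getRawPredictionCol, dt.getProbabilityCol)
  withRawAndProb.show(5, truncate = false)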