org.apache.spark.ml.regression.DecisionTreeRegressor Scala Examples
The following examples show how to use org.apache.spark.ml.regression.DecisionTreeRegressor.
The original project and source file for each example are noted in the line above it.
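Before the project-specific suites below, here is a minimal self-contained sketch (not taken from any of the projects) of fitting a DecisionTreeRegressor on a toy DataFrame; the "label" and "features" column names are Spark ML's defaults, and the object/app name is illustrative:

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.sql.SparkSession

object DecisionTreeRegressorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("dtr-sketch").getOrCreate()
    import spark.implicits._

    // Tiny training set: the label is a deterministic function of the single feature,
    // so a shallow tree can fit it exactly.
    val train = Seq(
      (0.0, Vectors.dense(0.0)),
      (1.0, Vectors.dense(1.0)),
      (2.0, Vectors.dense(2.0)),
      (3.0, Vectors.dense(3.0))
    ).toDF("label", "features")

    val dt = new DecisionTreeRegressor()
      .setLabelCol("label")
      .setFeaturesCol("features")
      .setMaxDepth(3)

    val model = dt.fit(train)
    model.transform(train).select("features", "label", "prediction").show()
    spark.stop()
  }
}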
Example 1
Source File: LocalTreeIntegrationSuite.scala From oraf with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame

class LocalTreeIntegrationSuite extends SparkFunSuite with MLlibTestSparkContext {

  // Note: setParams and medDepthTreeSettings are suite helpers omitted from this excerpt.

  private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val distribTree = setParams(new DecisionTreeRegressor(), testParams)
    val localTree = setParams(new LocalDecisionTreeRegressor(), testParams)
    val localModel = localTree.fit(train)
    val model = distribTree.fit(train)
    OptimizedTreeTests.checkEqual(model, localModel)
  }

  test("Local & distributed training produce the same tree on a toy dataset") {
    val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
    val data = sc.parallelize(Range(0, 16).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce the same tree on a larger toy dataset") {
    val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce same tree on a dataset of categorical features") {
    val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
    // Create a map of categorical feature index to arity; each feature has arity nclasses
    val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
    // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
    // categorical features
    val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a dataset of continuous features") {
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._
    // Use maxDepth = 5 and default params
    val params = medDepthTreeSettings
    val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
      nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
      .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
      .toDF().cache()
    testEquivalence(data, params)
  }

  test("Local & distributed training produce the same tree on a dataset of constant features") {
    // Generate constant, continuous data
    val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }
}
Example 2
Source File: OptimizedDecisionTreeIntegrationSuite.scala From oraf with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.classification.{DecisionTreeClassifier, OptimizedDecisionTreeClassifier}
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{DecisionTreeRegressor, OptimizedDecisionTreeRegressor}
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame

class OptimizedDecisionTreeIntegrationSuite extends SparkFunSuite with MLlibTestSparkContext {

  // Note: setParams and medDepthTreeSettings are suite helpers omitted from this excerpt.

  private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val oldTree = setParams(new DecisionTreeRegressor(), testParams)
    val newTree = setParams(new OptimizedDecisionTreeRegressor(), testParams)
    val newModel = newTree.fit(train)
    val oldModel = oldTree.fit(train)
    OptimizedTreeTests.checkEqual(oldModel, newModel)
  }

  private def testClassifierEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val oldTree = setParams(new DecisionTreeClassifier(), testParams)
    val newTree = setParams(new OptimizedDecisionTreeClassifier(), testParams)
    val newModel = newTree.fit(train)
    val model = oldTree.fit(train)
    OptimizedTreeTests.checkEqual(model, newModel)
  }

  test("Local & distributed training produce the same tree on a toy dataset") {
    val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
    testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree with two feature values") {
    val data = sc.parallelize(Range(0, 8).map { x =>
      if (x > 3) {
        Instance(x, 1.0, Vectors.dense(0.0))
      } else {
        Instance(x, 1.0, Vectors.dense(1.0))
      }
    })
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
    testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
    val data = sc.parallelize(Range(0, 10).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce the same tree on a larger toy dataset") {
    val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce same tree on a dataset of categorical features") {
    val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
    // Create a map of categorical feature index to arity; each feature has arity nclasses
    val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
    // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
    // categorical features
    val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a dataset of continuous features") {
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._
    // Use maxDepth = 5 and default params
    val params = medDepthTreeSettings
    val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
      nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
      .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
      .toDF().cache()
    testEquivalence(data, params)
  }

  test("Local & distributed training produce the same tree on a dataset of constant features") {
    // Generate constant, continuous data
    val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }
}
Example 3
Source File: CrossValidatorParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.validation

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.{DecisionTreeRegressor, RandomForestRegressor}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.DataFrame

class CrossValidatorParitySpec extends SparkParityBase {
  override val dataset: DataFrame =
    baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = {
    val regressor = new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction")

    val paramGrid = new ParamGridBuilder()
      .addGrid(regressor.numTrees, Array(2, 3, 4))
      .build()

    new Pipeline().setStages(Array(
      new StringIndexer().
        setInputCol("fico_score_group_fnl").
        setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new CrossValidator().
        setEvaluator(new RegressionEvaluator().
          setLabelCol("loan_amount").
          setPredictionCol("prediction")).
        setEstimator(regressor).
        setEstimatorParamMaps(paramGrid))).fit(dataset)
  }

  override val ignoreSerializationTest = true
}
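Note that although this spec imports DecisionTreeRegressor, the grid it actually tunes belongs to a RandomForestRegressor. Below is a hedged sketch (not part of the spec) of the analogous setup for a single decision tree, where maxDepth replaces numTrees as the tuned parameter; variable names are illustrative and the column names mirror the spec above:

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

val dt = new DecisionTreeRegressor()
  .setFeaturesCol("features")
  .setLabelCol("loan_amount")
  .setPredictionCol("prediction")

// Tune tree depth instead of forest size; maxDepth is the natural knob for a single tree.
val dtGrid = new ParamGridBuilder()
  .addGrid(dt.maxDepth, Array(3, 5, 7))
  .build()

val cv = new CrossValidator()
  .setEstimator(dt)
  .setEstimatorParamMaps(dtGrid)
  .setEvaluator(new RegressionEvaluator()
    .setLabelCol("loan_amount")
    .setPredictionCol("prediction"))
  .setNumFolds(3)

// cv.fit(trainingDF) would then pick the maxDepth with the best cross-validated RMSE.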
Example 4
Source File: DecisionTreeRegressionParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._

class DecisionTreeRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame =
    baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new DecisionTreeRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
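As a follow-up sketch (not part of the spec): once the pipeline above is fitted, the trained tree could be pulled out of the resulting PipelineModel for inspection. Here `sparkTransformer` refers to the value defined in the spec, and the last stage is assumed to be the fitted DecisionTreeRegressionModel:

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.regression.DecisionTreeRegressionModel

val pipelineModel = sparkTransformer.asInstanceOf[PipelineModel]
val treeModel = pipelineModel.stages.last.asInstanceOf[DecisionTreeRegressionModel]

println(s"Depth: ${treeModel.depth}, nodes: ${treeModel.numNodes}")
println(treeModel.toDebugString)       // full if/else structure of the learned tree
println(treeModel.featureImportances)  // one weight per assembled feature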
Example 5
Source File: TypedEncoderInstancesTests.scala From frameless with Apache License 2.0
package frameless
package ml

import org.scalacheck.Prop._
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.regression.DecisionTreeRegressor
import Generators._

import scala.util.Random

class TypedEncoderInstancesTests extends FramelessMlSuite {

  test("Vector encoding is injective using collect()") {
    val prop = forAll { vector: Vector =>
      TypedDataset.create(Seq(vector)).collect().run() == Seq(vector)
    }
    check(prop)
  }

  test("Matrix encoding is injective using collect()") {
    val prop = forAll { matrix: Matrix =>
      TypedDataset.create(Seq(matrix)).collect().run() == Seq(matrix)
    }
    check(prop)
  }

  test("Vector is encoded as VectorUDT and thus can be run in a Spark ML model") {
    case class Input(features: Vector, label: Double)

    val prop = forAll { trainingData: Matrix =>
      (trainingData.numRows >= 1) ==> {
        val inputs = trainingData.rowIter.toVector.map(vector => Input(vector, 0D))
        val inputsDS = TypedDataset.create(inputs)

        val model = new DecisionTreeRegressor()

        // this line would throw a runtime exception if Vector was not encoded as VectorUDT
        val trainedModel = model.fit(inputsDS.dataset)

        val randomInput = inputs(Random.nextInt(inputs.length))
        val randomInputDS = TypedDataset.create(Seq(randomInput))

        val prediction = trainedModel.transform(randomInputDS.dataset)
          .select("prediction")
          .head
          .getAs[Double](0)

        prediction == 0D
      }
    }

    check(prop, MinSize(1))
  }
}
Example 6
Source File: OpDecisionTreeRegressor.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.regression

import com.salesforce.op.UID
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.CheckIsResponseValues
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper}
import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTreeRegressor, OpDecisionTreeRegressorParams}

import scala.reflect.runtime.universe.TypeTag

class OpDecisionTreeRegressionModel
(
  sparkModel: DecisionTreeRegressionModel,
  uid: String = UID[OpDecisionTreeRegressionModel],
  operationName: String = classOf[DecisionTreeRegressor].getSimpleName
)(
  implicit tti1: TypeTag[RealNN],
  tti2: TypeTag[OPVector],
  tto: TypeTag[Prediction],
  ttov: TypeTag[Prediction#Value]
) extends OpPredictionModel[DecisionTreeRegressionModel](
  sparkModel = sparkModel, uid = uid, operationName = operationName
)
Example 7
Source File: OpDecisionTreeRegressorTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.regression

import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.PredictionEquality
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel}
import com.salesforce.op.test._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTreeRegressor}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class OpDecisionTreeRegressorTest extends OpEstimatorSpec[Prediction,
  OpPredictorWrapperModel[DecisionTreeRegressionModel],
  OpPredictorWrapper[DecisionTreeRegressor, DecisionTreeRegressionModel]] with PredictionEquality {

  override def specName: String = Spec[OpDecisionTreeRegressor]

  val (inputData, rawLabel, features) = TestFeatureBuilder(
    Seq[(RealNN, OPVector)](
      (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector),
      (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector),
      (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector),
      (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector),
      (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector)
    )
  )
  val label = rawLabel.copy(isResponse = true)
  val estimator = new OpDecisionTreeRegressor().setInput(label, features)

  val expectedResult = Seq(
    Prediction(10.0),
    Prediction(20.0),
    Prediction(30.0),
    Prediction(40.0),
    Prediction(50.0)
  )

  it should "allow the user to set the desired spark parameters" in {
    estimator
      .setMaxDepth(6)
      .setMaxBins(2)
      .setMinInstancesPerNode(2)
      .setMinInfoGain(0.1)
    estimator.fit(inputData)

    estimator.predictor.getMaxDepth shouldBe 6
    estimator.predictor.getMaxBins shouldBe 2
    estimator.predictor.getMinInstancesPerNode shouldBe 2
    estimator.predictor.getMinInfoGain shouldBe 0.1
  }
}
Example 8
Source File: SparkXGBoostRegressorSuite.scala From sparkxgboost with Apache License 2.0
package rotationsymmetry.sxgboost

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.loss.SquareLoss
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext
import rotationsymmetry.sxgboost.utils.TestingUtils._

class SparkXGBoostRegressorSuite extends FunSuite with TestData with MLlibTestSparkContext {

  test("Compare with DecisionTree using simple data") {
    val data = sqlContext.createDataFrame(sc.parallelize(simpleData, 2))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sparkXGBoostRegressor = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(1)
      .setNumTrees(1)
    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostRegressor))
    val sXGBoostModel = sparkXGBoostPipeline.fit(data)

    val dt = new DecisionTreeRegressor()
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(1)
    val dtPipeLine = new Pipeline()
      .setStages(Array(featureIndexer, dt))
    val dtModel = dtPipeLine.fit(data)

    val evaluator = new RegressionEvaluator()
    val sXGBoostrmse = evaluator.evaluate(sXGBoostModel.transform(data))
    val dtrmse = evaluator.evaluate(dtModel.transform(data))

    assert(sXGBoostrmse ~== dtrmse relTol 1e-5)
  }

  test("Compare with DecisionTree using random data") {
    val data = sqlContext.createDataFrame(randomLabelPointRDD(sc, 40, 10, 2, 999))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sparkXGBoostRegressor = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
      .setNumTrees(1)
    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostRegressor))
    val sXGBoostModel = sparkXGBoostPipeline.fit(data)

    val dt = new DecisionTreeRegressor()
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
    val dtPipeLine = new Pipeline()
      .setStages(Array(featureIndexer, dt))
    val dtModel = dtPipeLine.fit(data)

    val evaluator = new RegressionEvaluator()
    val sXGBoostrmse = evaluator.evaluate(sXGBoostModel.transform(data))
    val dtrmse = evaluator.evaluate(dtModel.transform(data))

    assert(sXGBoostrmse ~== dtrmse relTol 1e-5)
  }
}