org.apache.spark.ml.regression.DecisionTreeRegressor Scala Examples

The following examples show how to use org.apache.spark.ml.regression.DecisionTreeRegressor in Scala. Each example is drawn from an open-source project; the source file, originating project, and license are noted above the code.
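Before the project-specific examples, here is a minimal, self-contained sketch of training and applying a DecisionTreeRegressor on a toy DataFrame. It is not taken from any of the projects below: the object name, toy data, and maxDepth value are illustrative assumptions, and "label", "features", and "prediction" are Spark's default column names.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.sql.SparkSession

object DecisionTreeRegressorExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("dtr-example").getOrCreate()
    import spark.implicits._

    // Illustrative toy data: the label equals the single feature value.
    val train = Seq(
      (1.0, Vectors.dense(1.0)),
      (2.0, Vectors.dense(2.0)),
      (3.0, Vectors.dense(3.0)),
      (4.0, Vectors.dense(4.0))
    ).toDF("label", "features")

    // The column names below are Spark's defaults; they are set explicitly for clarity.
    val dtr = new DecisionTreeRegressor()
      .setLabelCol("label")
      .setFeaturesCol("features")
      .setMaxDepth(3)

    val model = dtr.fit(train)
    model.transform(train).select("features", "label", "prediction").show()

    spark.stop()
  }
}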
Example 1
Source File: LocalTreeIntegrationSuite.scala    From oraf    with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame


class LocalTreeIntegrationSuite extends SparkFunSuite with MLlibTestSparkContext {

  // The suite's `setParams` helper (which applies a param map to an estimator) and the
  // `medDepthTreeSettings` param map used below are defined elsewhere in the full
  // source file and are omitted from this excerpt.

  private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val distribTree = setParams(new DecisionTreeRegressor(), testParams)
    val localTree = setParams(new LocalDecisionTreeRegressor(), testParams)
    val localModel = localTree.fit(train)
    val model = distribTree.fit(train)
    OptimizedTreeTests.checkEqual(model, localModel)
  }


  test("Local & distributed training produce the same tree on a toy dataset") {
    val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
    val data = sc.parallelize(Range(0, 16).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce the same tree on a larger toy dataset") {
    val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce same tree on a dataset of categorical features") {
    val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
    // Create a map of categorical feature index to arity; each feature has arity nclasses
    val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
    // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
    // categorical features
    val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a dataset of continuous features") {
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._
    // Use maxDepth = 5 and default params
    val params = medDepthTreeSettings
    val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
      nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
      .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
      .toDF().cache()
    testEquivalence(data, params)
  }

  test("Local & distributed training produce the same tree on a dataset of constant features") {
    // Generate constant, continuous data
    val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

} 
Example 2
Source File: OptimizedDecisionTreeIntegrationSuite.scala    From oraf    with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.classification.{DecisionTreeClassifier, OptimizedDecisionTreeClassifier}
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{DecisionTreeRegressor, OptimizedDecisionTreeRegressor}
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame



class OptimizedDecisionTreeIntegrationSuite extends SparkFunSuite with MLlibTestSparkContext {

  // As in the previous example, the `setParams` helper and the `medDepthTreeSettings`
  // param map are defined elsewhere in the full source file and omitted here.

  private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val oldTree = setParams(new DecisionTreeRegressor(), testParams)
    val newTree = setParams(new OptimizedDecisionTreeRegressor(), testParams)
    val newModel = newTree.fit(train)
    val oldModel = oldTree.fit(train)
    OptimizedTreeTests.checkEqual(oldModel, newModel)
  }

  private def testClassifierEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val oldTree = setParams(new DecisionTreeClassifier(), testParams)
    val newTree = setParams(new OptimizedDecisionTreeClassifier(), testParams)
    val newModel = newTree.fit(train)
    val model = oldTree.fit(train)
    OptimizedTreeTests.checkEqual(model, newModel)
  }

  test("Local & distributed training produce the same tree on a toy dataset") {
    val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
    testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree with two feature values") {
    val data = sc.parallelize(Range(0, 8).map { x =>
      if (x > 3) {
        Instance(x, 1.0, Vectors.dense(0.0))
      } else {
        Instance(x, 1.0, Vectors.dense(1.0))
      }
    })
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
    testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
    val data = sc.parallelize(Range(0, 10).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce the same tree on a larger toy dataset") {
    val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce same tree on a dataset of categorical features") {
    val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
    // Create a map of categorical feature index to arity; each feature has arity nclasses
    val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
    // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
    // categorical features
    val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a dataset of continuous features") {
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._
    // Use maxDepth = 5 and default params
    val params = medDepthTreeSettings
    val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
      nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
      .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
      .toDF().cache()
    testEquivalence(data, params)
  }

  test("Local & distributed training produce the same tree on a dataset of constant features") {
    // Generate constant, continuous data
    val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

} 
Example 3
Source File: CrossValidatorParitySpec.scala    From mleap    with Apache License 2.0
package org.apache.spark.ml.parity.validation

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.{DecisionTreeRegressor, RandomForestRegressor}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.DataFrame

class CrossValidatorParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
  override val sparkTransformer: Transformer = {
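    // Index the categorical fico column, assemble the feature vector, then
    // cross-validate the random forest over the numTrees grid built below.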
    val regressor = new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction")
    val paramGrid = new ParamGridBuilder()
      .addGrid(regressor.numTrees, Array(2, 3, 4))
      .build()

    new Pipeline().setStages(Array(new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new CrossValidator().
        setEvaluator(new RegressionEvaluator().
          setLabelCol("loan_amount").
          setPredictionCol("prediction")).
        setEstimator(regressor).
        setEstimatorParamMaps(paramGrid))).fit(dataset)
  }

  override val ignoreSerializationTest = true
} 
Example 4
Source File: DecisionTreeRegressionParitySpec.scala    From mleap    with Apache License 2.0
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._


class DecisionTreeRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
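  // Pipeline: index the categorical fico column, assemble the feature vector, then
  // fit the decision tree regressor against the loan amount.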
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer().
    setInputCol("fico_score_group_fnl").
    setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new DecisionTreeRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
} 
Example 5
Source File: TypedEncoderInstancesTests.scala    From frameless    with Apache License 2.0
package frameless
package ml

import org.scalacheck.Prop._
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.regression.DecisionTreeRegressor
import Generators._
import scala.util.Random

class TypedEncoderInstancesTests extends FramelessMlSuite {

  test("Vector encoding is injective using collect()") {
    val prop = forAll { vector: Vector =>
      TypedDataset.create(Seq(vector)).collect().run() == Seq(vector)
    }
    check(prop)
  }

  test("Matrix encoding is injective using collect()") {
    val prop = forAll { matrix: Matrix =>
      TypedDataset.create(Seq(matrix)).collect().run() == Seq(matrix)
    }
    check(prop)
  }

  test("Vector is encoded as VectorUDT and thus can be run in a Spark ML model") {
    case class Input(features: Vector, label: Double)

    val prop = forAll { trainingData: Matrix =>
      (trainingData.numRows >= 1) ==> {
        val inputs = trainingData.rowIter.toVector.map(vector => Input(vector, 0D))
        val inputsDS = TypedDataset.create(inputs)

        val model = new DecisionTreeRegressor()

        // this line would throw a runtime exception if Vector was not encoded as VectorUDT
        val trainedModel = model.fit(inputsDS.dataset)

        val randomInput = inputs(Random.nextInt(inputs.length))
        val randomInputDS = TypedDataset.create(Seq(randomInput))

        val prediction = trainedModel.transform(randomInputDS.dataset)
          .select("prediction")
          .head
          .getAs[Double](0)

        prediction == 0D
      }

    }

    check(prop, MinSize(1))
  }

} 
Example 6
Source File: OpDecisionTreeRegressor.scala    From TransmogrifAI    with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.regression

import com.salesforce.op.UID
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.CheckIsResponseValues
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper}
import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTreeRegressor, OpDecisionTreeRegressorParams}

import scala.reflect.runtime.universe.TypeTag


// Note: the OpDecisionTreeRegressor estimator defined in this source file is omitted
// from this excerpt; only the wrapped model class is shown (see Example 7 for usage).
class OpDecisionTreeRegressionModel
(
  sparkModel: DecisionTreeRegressionModel,
  uid: String = UID[OpDecisionTreeRegressionModel],
  operationName: String = classOf[DecisionTreeRegressor].getSimpleName
)(
  implicit tti1: TypeTag[RealNN],
  tti2: TypeTag[OPVector],
  tto: TypeTag[Prediction],
  ttov: TypeTag[Prediction#Value]
) extends OpPredictionModel[DecisionTreeRegressionModel](
  sparkModel = sparkModel, uid = uid, operationName = operationName
) 
Example 7
Source File: OpDecisionTreeRegressorTest.scala    From TransmogrifAI    with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.regression

import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.PredictionEquality
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel}
import com.salesforce.op.test._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTreeRegressor}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class OpDecisionTreeRegressorTest extends OpEstimatorSpec[Prediction,
  OpPredictorWrapperModel[DecisionTreeRegressionModel],
  OpPredictorWrapper[DecisionTreeRegressor, DecisionTreeRegressionModel]] with PredictionEquality {

  override def specName: String = Spec[OpDecisionTreeRegressor]

  val (inputData, rawLabel, features) = TestFeatureBuilder(
    Seq[(RealNN, OPVector)](
      (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector),
      (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector),
      (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector),
      (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector),
      (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector)
    )
  )
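  // Mark the label as a response feature, as OP estimators require.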
  val label = rawLabel.copy(isResponse = true)
  val estimator = new OpDecisionTreeRegressor().setInput(label, features)

  val expectedResult = Seq(
    Prediction(10.0),
    Prediction(20.0),
    Prediction(30.0),
    Prediction(40.0),
    Prediction(50.0)
  )

  it should "allow the user to set the desired spark parameters" in {
    estimator
      .setMaxDepth(6)
      .setMaxBins(2)
      .setMinInstancesPerNode(2)
      .setMinInfoGain(0.1)
    estimator.fit(inputData)

    estimator.predictor.getMaxDepth shouldBe 6
    estimator.predictor.getMaxBins shouldBe 2
    estimator.predictor.getMinInstancesPerNode shouldBe 2
    estimator.predictor.getMinInfoGain shouldBe 0.1

  }
} 
Example 8
Source File: SparkXGBoostRegressorSuite.scala    From sparkxgboost    with Apache License 2.0
package rotationsymmetry.sxgboost

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.loss.SquareLoss
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext
import rotationsymmetry.sxgboost.utils.TestingUtils._


class SparkXGBoostRegressorSuite extends FunSuite with TestData with MLlibTestSparkContext {
  test("Compare with DecisionTree using simple data") {

    val data = sqlContext.createDataFrame(sc.parallelize(simpleData, 2))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sparkXGBoostRegressor = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(1)
      .setNumTrees(1)
    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostRegressor))
    val sXGBoostModel = sparkXGBoostPipeline.fit(data)

    val dt = new DecisionTreeRegressor()
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(1)
    val dtPipeLine = new Pipeline()
      .setStages(Array(featureIndexer, dt))
    val dtModel = dtPipeLine.fit(data)

    val evaluator = new RegressionEvaluator()
    val sXGBoostrmse = evaluator.evaluate(sXGBoostModel.transform(data))
    val dtrmse = evaluator.evaluate(dtModel.transform(data))

    assert(sXGBoostrmse ~== dtrmse relTol 1e-5)
  }

  test("Compare with DecisionTree using random data") {

    val data = sqlContext.createDataFrame(randomLabelPointRDD(sc, 40, 10, 2, 999))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sparkXGBoostRegressor = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
      .setNumTrees(1)
    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostRegressor))
    val sXGBoostModel = sparkXGBoostPipeline.fit(data)

    val dt = new DecisionTreeRegressor()
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
    val dtPipeLine = new Pipeline()
      .setStages(Array(featureIndexer, dt))
    val dtModel = dtPipeLine.fit(data)

    val evaluator = new RegressionEvaluator()
    val sXGBoostrmse = evaluator.evaluate(sXGBoostModel.transform(data))
    val dtrmse = evaluator.evaluate(dtModel.transform(data))

    assert(sXGBoostrmse ~== dtrmse relTol 1e-5)
  }
}