org.apache.spark.ml.regression.DecisionTreeRegressor Scala Examples
The following examples show how to use org.apache.spark.ml.regression.DecisionTreeRegressor.
The original project and source file for each example are noted in the line above it.
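Before the project-specific suites below, here is a minimal self-contained sketch (not taken from any of the projects) of fitting a DecisionTreeRegressor on a toy DataFrame; the "label" and "features" column names are Spark ML's defaults, and the object/app name is illustrative:

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.sql.SparkSession

object DecisionTreeRegressorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("dtr-sketch").getOrCreate()
    import spark.implicits._

    // Tiny training set: the label is a deterministic function of the single feature,
    // so a shallow tree can fit it exactly.
    val train = Seq(
      (0.0, Vectors.dense(0.0)),
      (1.0, Vectors.dense(1.0)),
      (2.0, Vectors.dense(2.0)),
      (3.0, Vectors.dense(3.0))
    ).toDF("label", "features")

    val dt = new DecisionTreeRegressor()
      .setLabelCol("label")
      .setFeaturesCol("features")
      .setMaxDepth(3)

    val model = dt.fit(train)
    model.transform(train).select("features", "label", "prediction").show()
    spark.stop()
  }
}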
Example 1
Source File: LocalTreeIntegrationSuite.scala From oraf with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame

class LocalTreeIntegrationSuite extends SparkFunSuite with MLlibTestSparkContext {

  // Note: setParams and medDepthTreeSettings are suite helpers omitted from this excerpt.

  private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val distribTree = setParams(new DecisionTreeRegressor(), testParams)
    val localTree = setParams(new LocalDecisionTreeRegressor(), testParams)
    val localModel = localTree.fit(train)
    val model = distribTree.fit(train)
    OptimizedTreeTests.checkEqual(model, localModel)
  }

  test("Local & distributed training produce the same tree on a toy dataset") {
    val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
    val data = sc.parallelize(Range(0, 16).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce the same tree on a larger toy dataset") {
    val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce same tree on a dataset of categorical features") {
    val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
    // Create a map of categorical feature index to arity; each feature has arity nclasses
    val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
    // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
    // categorical features
    val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a dataset of continuous features") {
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._
    // Use maxDepth = 5 and default params
    val params = medDepthTreeSettings
    val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
      nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
      .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
      .toDF().cache()
    testEquivalence(data, params)
  }

  test("Local & distributed training produce the same tree on a dataset of constant features") {
    // Generate constant, continuous data
    val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }
}
Example 2
Source File: OptimizedDecisionTreeIntegrationSuite.scala From oraf with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.classification.{DecisionTreeClassifier, OptimizedDecisionTreeClassifier}
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{DecisionTreeRegressor, OptimizedDecisionTreeRegressor}
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame

class OptimizedDecisionTreeIntegrationSuite extends SparkFunSuite with MLlibTestSparkContext {

  // Note: setParams and medDepthTreeSettings are suite helpers omitted from this excerpt.

  private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val oldTree = setParams(new DecisionTreeRegressor(), testParams)
    val newTree = setParams(new OptimizedDecisionTreeRegressor(), testParams)
    val newModel = newTree.fit(train)
    val oldModel = oldTree.fit(train)
    OptimizedTreeTests.checkEqual(oldModel, newModel)
  }

  private def testClassifierEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val oldTree = setParams(new DecisionTreeClassifier(), testParams)
    val newTree = setParams(new OptimizedDecisionTreeClassifier(), testParams)
    val newModel = newTree.fit(train)
    val model = oldTree.fit(train)
    OptimizedTreeTests.checkEqual(model, newModel)
  }

  test("Local & distributed training produce the same tree on a toy dataset") {
    val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
    testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree with two feature values") {
    val data = sc.parallelize(Range(0, 8).map { x =>
      if (x > 3) {
        Instance(x, 1.0, Vectors.dense(0.0))
      } else {
        Instance(x, 1.0, Vectors.dense(1.0))
      }
    })
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
    testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
    val data = sc.parallelize(Range(0, 10).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce the same tree on a larger toy dataset") {
    val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce same tree on a dataset of categorical features") {
    val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
    // Create a map of categorical feature index to arity; each feature has arity nclasses
    val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
    // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
    // categorical features
    val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a dataset of continuous features") {
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._
    // Use maxDepth = 5 and default params
    val params = medDepthTreeSettings
    val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
      nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
      .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
      .toDF().cache()
    testEquivalence(data, params)
  }

  test("Local & distributed training produce the same tree on a dataset of constant features") {
    // Generate constant, continuous data
    val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }
}
Example 3
Source File: CrossValidatorParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.validation

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.{DecisionTreeRegressor, RandomForestRegressor}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.DataFrame

class CrossValidatorParitySpec extends SparkParityBase {
  override val dataset: DataFrame =
    baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = {
    val regressor = new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction")

    val paramGrid = new ParamGridBuilder()
      .addGrid(regressor.numTrees, Array(2, 3, 4))
      .build()

    new Pipeline().setStages(Array(
      new StringIndexer().
        setInputCol("fico_score_group_fnl").
        setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new CrossValidator().
        setEvaluator(new RegressionEvaluator().
          setLabelCol("loan_amount").
          setPredictionCol("prediction")).
        setEstimator(regressor).
        setEstimatorParamMaps(paramGrid))).fit(dataset)
  }

  override val ignoreSerializationTest = true
}
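Note that although this spec imports DecisionTreeRegressor, the grid it actually tunes belongs to a RandomForestRegressor. Below is a hedged sketch (not part of the spec) of the analogous setup for a single decision tree, where maxDepth replaces numTrees as the tuned parameter; variable names are illustrative and the column names mirror the spec above:

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

val dt = new DecisionTreeRegressor()
  .setFeaturesCol("features")
  .setLabelCol("loan_amount")
  .setPredictionCol("prediction")

// Tune tree depth instead of forest size; maxDepth is the natural knob for a single tree.
val dtGrid = new ParamGridBuilder()
  .addGrid(dt.maxDepth, Array(3, 5, 7))
  .build()

val cv = new CrossValidator()
  .setEstimator(dt)
  .setEstimatorParamMaps(dtGrid)
  .setEvaluator(new RegressionEvaluator()
    .setLabelCol("loan_amount")
    .setPredictionCol("prediction"))
  .setNumFolds(3)

// cv.fit(trainingDF) would then pick the maxDepth with the best cross-validated RMSE.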
Example 4
Source File: DecisionTreeRegressionParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._

class DecisionTreeRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame =
    baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new DecisionTreeRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
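As a follow-up sketch (not part of the spec): once the pipeline above is fitted, the trained tree could be pulled out of the resulting PipelineModel for inspection. Here `sparkTransformer` refers to the value defined in the spec, and the last stage is assumed to be the fitted DecisionTreeRegressionModel:

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.regression.DecisionTreeRegressionModel

val pipelineModel = sparkTransformer.asInstanceOf[PipelineModel]
val treeModel = pipelineModel.stages.last.asInstanceOf[DecisionTreeRegressionModel]

println(s"Depth: ${treeModel.depth}, nodes: ${treeModel.numNodes}")
println(treeModel.toDebugString)       // full if/else structure of the learned tree
println(treeModel.featureImportances)  // one weight per assembled feature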
Example 5
Source File: TypedEncoderInstancesTests.scala From frameless with Apache License 2.0
package frameless
package ml

import org.scalacheck.Prop._
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.regression.DecisionTreeRegressor
import Generators._

import scala.util.Random

class TypedEncoderInstancesTests extends FramelessMlSuite {

  test("Vector encoding is injective using collect()") {
    val prop = forAll { vector: Vector =>
      TypedDataset.create(Seq(vector)).collect().run() == Seq(vector)
    }
    check(prop)
  }

  test("Matrix encoding is injective using collect()") {
    val prop = forAll { matrix: Matrix =>
      TypedDataset.create(Seq(matrix)).collect().run() == Seq(matrix)
    }
    check(prop)
  }

  test("Vector is encoded as VectorUDT and thus can be run in a Spark ML model") {
    case class Input(features: Vector, label: Double)

    val prop = forAll { trainingData: Matrix =>
      (trainingData.numRows >= 1) ==> {
        val inputs = trainingData.rowIter.toVector.map(vector => Input(vector, 0D))
        val inputsDS = TypedDataset.create(inputs)

        val model = new DecisionTreeRegressor()

        // this line would throw a runtime exception if Vector was not encoded as VectorUDT
        val trainedModel = model.fit(inputsDS.dataset)

        val randomInput = inputs(Random.nextInt(inputs.length))
        val randomInputDS = TypedDataset.create(Seq(randomInput))

        val prediction = trainedModel.transform(randomInputDS.dataset)
          .select("prediction")
          .head
          .getAs[Double](0)

        prediction == 0D
      }
    }

    check(prop, MinSize(1))
  }
}
Example 6
Source File: OpDecisionTreeRegressor.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.regression

import com.salesforce.op.UID
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.CheckIsResponseValues
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper}
import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTreeRegressor, OpDecisionTreeRegressorParams}

import scala.reflect.runtime.universe.TypeTag

class OpDecisionTreeRegressionModel
(
  sparkModel: DecisionTreeRegressionModel,
  uid: String = UID[OpDecisionTreeRegressionModel],
  operationName: String = classOf[DecisionTreeRegressor].getSimpleName
)(
  implicit tti1: TypeTag[RealNN],
  tti2: TypeTag[OPVector],
  tto: TypeTag[Prediction],
  ttov: TypeTag[Prediction#Value]
) extends OpPredictionModel[DecisionTreeRegressionModel](
  sparkModel = sparkModel, uid = uid, operationName = operationName
)
Example 7
Source File: OpDecisionTreeRegressorTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.regression

import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.PredictionEquality
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel}
import com.salesforce.op.test._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTreeRegressor}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class OpDecisionTreeRegressorTest extends OpEstimatorSpec[Prediction,
  OpPredictorWrapperModel[DecisionTreeRegressionModel],
  OpPredictorWrapper[DecisionTreeRegressor, DecisionTreeRegressionModel]] with PredictionEquality {

  override def specName: String = Spec[OpDecisionTreeRegressor]

  val (inputData, rawLabel, features) = TestFeatureBuilder(
    Seq[(RealNN, OPVector)](
      (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector),
      (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector),
      (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector),
      (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector),
      (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector)
    )
  )
  val label = rawLabel.copy(isResponse = true)
  val estimator = new OpDecisionTreeRegressor().setInput(label, features)

  val expectedResult = Seq(
    Prediction(10.0),
    Prediction(20.0),
    Prediction(30.0),
    Prediction(40.0),
    Prediction(50.0)
  )

  it should "allow the user to set the desired spark parameters" in {
    estimator
      .setMaxDepth(6)
      .setMaxBins(2)
      .setMinInstancesPerNode(2)
      .setMinInfoGain(0.1)
    estimator.fit(inputData)

    estimator.predictor.getMaxDepth shouldBe 6
    estimator.predictor.getMaxBins shouldBe 2
    estimator.predictor.getMinInstancesPerNode shouldBe 2
    estimator.predictor.getMinInfoGain shouldBe 0.1
  }
}
Example 8
Source File: SparkXGBoostRegressorSuite.scala From sparkxgboost with Apache License 2.0
package rotationsymmetry.sxgboost

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.loss.SquareLoss
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext
import rotationsymmetry.sxgboost.utils.TestingUtils._

class SparkXGBoostRegressorSuite extends FunSuite with TestData with MLlibTestSparkContext {

  test("Compare with DecisionTree using simple data") {
    val data = sqlContext.createDataFrame(sc.parallelize(simpleData, 2))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sparkXGBoostRegressor = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(1)
      .setNumTrees(1)
    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostRegressor))
    val sXGBoostModel = sparkXGBoostPipeline.fit(data)

    val dt = new DecisionTreeRegressor()
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(1)
    val dtPipeLine = new Pipeline()
      .setStages(Array(featureIndexer, dt))
    val dtModel = dtPipeLine.fit(data)

    val evaluator = new RegressionEvaluator()
    val sXGBoostrmse = evaluator.evaluate(sXGBoostModel.transform(data))
    val dtrmse = evaluator.evaluate(dtModel.transform(data))

    assert(sXGBoostrmse ~== dtrmse relTol 1e-5)
  }

  test("Compare with DecisionTree using random data") {
    val data = sqlContext.createDataFrame(randomLabelPointRDD(sc, 40, 10, 2, 999))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sparkXGBoostRegressor = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
      .setNumTrees(1)
    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostRegressor))
    val sXGBoostModel = sparkXGBoostPipeline.fit(data)

    val dt = new DecisionTreeRegressor()
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
    val dtPipeLine = new Pipeline()
      .setStages(Array(featureIndexer, dt))
    val dtModel = dtPipeLine.fit(data)

    val evaluator = new RegressionEvaluator()
    val sXGBoostrmse = evaluator.evaluate(sXGBoostModel.transform(data))
    val dtrmse = evaluator.evaluate(dtModel.transform(data))

    assert(sXGBoostrmse ~== dtrmse relTol 1e-5)
  }
}