org.apache.spark.mllib.tree.model.GradientBoostedTreesModel Scala Examples

The following examples show how to use org.apache.spark.mllib.tree.model.GradientBoostedTreesModel in Scala. The originating project and license are noted above each example where available.
Example 1
Source File: GradientBoostingTree.scala (from Swallow, Apache License 2.0)
// scalastyle:off println
package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint

object GradientBoostingTree {
  def main(args: Array[String]): Unit = {
    var inputPath = ""
    var numIterations: Int = 3
    val numClasses: Int = 2
    val maxDepth: Int = 5

    // Expected arguments: <inputPath> <numIterations>
    if (args.length == 2) {
      inputPath = args(0)
      numIterations = args(1).toInt
    }

    val conf = new SparkConf()
        .setAppName("GradientBoostingTree")
    val sc = new SparkContext(conf)

    // Load the data file (an RDD[LabeledPoint] saved with saveAsObjectFile).
    // Alternatively: MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    val data: RDD[LabeledPoint] = sc.objectFile(inputPath)

    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a GradientBoostedTrees model.
    // The defaultParams for Classification use LogLoss by default.
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = numIterations
    boostingStrategy.treeStrategy.numClasses = numClasses
    boostingStrategy.treeStrategy.maxDepth = maxDepth
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    // Evaluate model on test instances and compute test error
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testErr = labelAndPreds.filter { case (label, pred) => label != pred }.count().toDouble / testData.count()
    println(s"Test Error = $testErr")

    sc.stop()
  }
}
// scalastyle:on println
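Neither HiBench example persists the trained model, but GradientBoostedTreesModel supports save and load directly. A minimal sketch of a round trip; the output path "myGradientBoostingModel" is a placeholder, not part of the original benchmark:

import org.apache.spark.SparkContext
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel

object GradientBoostedTreesPersistence {
  // Save a trained model and load it back. Both calls are part of the
  // public MLlib API (Saveable and Loader).
  def roundTrip(sc: SparkContext, model: GradientBoostedTreesModel): GradientBoostedTreesModel = {
    model.save(sc, "myGradientBoostingModel") // placeholder path
    GradientBoostedTreesModel.load(sc, "myGradientBoostingModel")
  }
}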
Example 2
Source File: GradientBoostedTree.scala (from Swallow, Apache License 2.0)
package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint

import scopt.OptionParser

object GradientBoostedTree {

  case class Params(
    numClasses: Int = 2,
    maxDepth: Int = 30,
    maxBins: Int = 32,
    numIterations: Int = 20,
    learningRate: Double = 0.1,
    dataPath: String = null
  )

  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("GBT") {
      head("GBT: an example of Gradient Boosted Tree for classification")
      opt[Int]("numClasses")
        .text(s"numClasses, default: ${defaultParams.numClasses}")
        .action((x, c) => c.copy(numClasses = x))
      opt[Int]("maxDepth")
        .text(s"maxDepth, default: ${defaultParams.maxDepth}")
        .action((x, c) => c.copy(maxDepth = x))
      opt[Int]("maxBins")
        .text(s"maxBins, default: ${defaultParams.maxBins}")
        .action((x, c) => c.copy(maxBins = x))
      opt[Int]("numIterations")
        .text(s"numIterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[Double]("learningRate")
        .text(s"learningRate, default: ${defaultParams.learningRate}")
        .action((x, c) => c.copy(learningRate = x))
      arg[String]("<dataPath>")
        .required()
        .text("data path for Gradient Boosted Tree")
        .action((x, c) => c.copy(dataPath = x))
    }
    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"Gradient Boosted Tree with $params")
    val sc = new SparkContext(conf)

    val dataPath = params.dataPath
    val numClasses = params.numClasses
    val maxDepth = params.maxDepth
    val maxBins = params.maxBins
    val numIterations = params.numIterations
    val learningRate = params.learningRate

    // Load the data file (an RDD[LabeledPoint] saved with saveAsObjectFile).
    val data: RDD[LabeledPoint] = sc.objectFile(dataPath)

    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a GradientBoostedTrees model.
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = numIterations
    boostingStrategy.learningRate = learningRate
    boostingStrategy.treeStrategy.numClasses = numClasses
    boostingStrategy.treeStrategy.maxDepth = maxDepth
    boostingStrategy.treeStrategy.maxBins = maxBins
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    // Evaluate model on test instances and compute test error
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testErr = labelAndPreds.filter { case (label, pred) => label != pred }.count().toDouble / testData.count()
    println(s"Test Error = $testErr")

    sc.stop()
  }
} 
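Both HiBench examples run a fixed numIterations. MLlib can instead stop boosting early once a held-out validation set stops improving. A minimal sketch using runWithValidation; the 70/30 split mirrors the examples above, while the iteration cap and tolerance are illustrative assumptions:

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD

object EarlyStoppingGBT {
  def trainWithValidation(data: RDD[LabeledPoint]): GradientBoostedTreesModel = {
    val Array(train, validation) = data.randomSplit(Array(0.7, 0.3))
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = 100 // upper bound; training may stop earlier
    boostingStrategy.validationTol = 0.001 // minimum relative improvement (illustrative value)
    // runWithValidation evaluates each new tree on the validation set and stops
    // once the validation error no longer improves by more than validationTol.
    new GradientBoostedTrees(boostingStrategy).runWithValidation(train, validation)
  }
}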
Example 3
package spark.ml.cookbook.chapter10

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.sql.SparkSession


object MyGradientBoostingClassification {
  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MyGradientBoostedTreesClassification")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Drop the leading sample ID column.
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // Map the class label from {2 = benign, 4 = malignant} to {0, 1}.
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }


    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    println("Training Data count:"+trainingData.count())
    println("Test Data Count:"+testData.count())


    val algo = "Classification"
    val numIterations = 3
    val numClasses = 2
    val maxDepth   = 5
    val maxBins  = 32
    val categoricalFeatureInfo = Map[Int,Int]()


    val boostingStrategy = BoostingStrategy.defaultParams(algo)

    boostingStrategy.setNumIterations(numIterations)
    boostingStrategy.treeStrategy.setNumClasses(numClasses)
    boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
    boostingStrategy.treeStrategy.setMaxBins(maxBins)
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeatureInfo

    evaluate(trainingData, testData, boostingStrategy)
    println("===================")

    spark.stop()
  }

  def evaluate(
      trainingData: RDD[LabeledPoint],
      testData: RDD[LabeledPoint],
      boostingStrategy: BoostingStrategy): Unit = {

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    val metrics = getMetrics(model, testData)
    println("Confusion Matrix :")
    println(metrics.confusionMatrix)
    println("Model Accuracy: "+metrics.precision)
    println("Model Error: "+ (1-metrics.precision))
//    (0 until boostingStrategy.treeStrategy.getNumClasses()).map(
//      category => (metrics.precision(category), metrics.recall(category))
//    ).foreach(println)
//    println("My Classification GBT model:\n" + model.toDebugString)
  }

  def getMetrics(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionsAndLabels = data.map(example =>
      (model.predict(example.features), example.label)
    )
    new MulticlassMetrics(predictionsAndLabels)
  }
} 
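With numClasses fixed at 2, the same predictions can also be scored with BinaryClassificationMetrics. A minimal sketch computing the area under the ROC curve; note that model.predict returns a hard 0/1 class here, so the curve reduces to a single operating point:

import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD

object GBTBinaryMetrics {
  def areaUnderROC(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): Double = {
    // (score, label) pairs, following the shape of getMetrics above.
    val scoreAndLabels = data.map(p => (model.predict(p.features), p.label))
    new BinaryClassificationMetrics(scoreAndLabels).areaUnderROC()
  }
}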
Example 4
package spark.ml.cookbook.chapter10

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.sql.SparkSession



object MyGradientBoostingRegression {
  def main(args: Array[String]): Unit = {


    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MyGradientBoostedTreesRegression")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Drop the leading sample ID column.
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // Map the class label from {2 = benign, 4 = malignant} to {0, 1};
        // the regression target is therefore a 0/1 value.
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))


    println("Training Data count:"+trainingData.count())
    println("Test Data Count:"+testData.count())

    val algo = "Regression"
    val numIterations = 3
    val maxDepth   = 5
    val maxBins  = 32
    val categoricalFeatureInfo = Map[Int,Int]()


    val boostingStrategy = BoostingStrategy.defaultParams(algo)
    boostingStrategy.setNumIterations(numIterations)
    boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
    boostingStrategy.treeStrategy.setMaxBins(maxBins)
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeatureInfo

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    val metrics = getMetrics(model, testData)

    println("Test Mean Squared Error = " + metrics.meanSquaredError)
    println("My regression GBT model:\n" + model.toDebugString)
    spark.stop()
  }
  def getMetrics(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): RegressionMetrics = {
    val predictionsAndLabels = data.map(example =>
      (model.predict(example.features), example.label)
    )
    new RegressionMetrics(predictionsAndLabels)
  }
}
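Beyond toDebugString, a trained GradientBoostedTreesModel exposes its ensemble directly through the trees and treeWeights members inherited from TreeEnsembleModel. A minimal sketch of inspecting it:

import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel

object GBTInspection {
  def summarize(model: GradientBoostedTreesModel): Unit = {
    println(s"algo = ${model.algo}, trees = ${model.numTrees}, total nodes = ${model.totalNumNodes}")
    // Each boosting stage contributes one decision tree and one weight.
    model.trees.zip(model.treeWeights).zipWithIndex.foreach { case ((tree, weight), i) =>
      println(s"tree $i: weight = $weight, depth = ${tree.depth}, nodes = ${tree.numNodes}")
    }
  }
}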