org.apache.spark.mllib.tree.model.GradientBoostedTreesModel Scala Examples
The following examples show how to use org.apache.spark.mllib.tree.model.GradientBoostedTreesModel.
Example 1
Source File: GradientBoostingTree.scala, from the Swallow project (Apache License 2.0)
// scalastyle:off println
package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint

object GradientBoostingTree {

  def main(args: Array[String]): Unit = {
    var inputPath = ""
    var numIterations: Int = 3
    val numClasses: Int = 2
    val maxDepth: Int = 5

    if (args.length == 2) {
      inputPath = args(0)
      numIterations = args(1).toInt
    }

    val conf = new SparkConf()
      .setAppName("GradientBoostingTree")
    val sc = new SparkContext(conf)

    // Load and parse the data file.
    //val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    val data: RDD[LabeledPoint] = sc.objectFile(inputPath)

    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a GradientBoostedTrees model.
    // The defaultParams for Classification use LogLoss by default.
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = numIterations
    boostingStrategy.treeStrategy.numClasses = numClasses
    boostingStrategy.treeStrategy.maxDepth = maxDepth
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    // Evaluate model on test instances and compute test error
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Test Error = " + testErr)

    sc.stop()
  }
}
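The example imports GradientBoostedTreesModel but never persists the trained ensemble. As a minimal sketch of the built-in save/load support (assuming the sc and model values from the code above; the output path is only a placeholder), the model can be written to disk and reloaded later:

// Minimal sketch: persisting and reloading the trained model.
// Assumes `sc` and `model` from the example above; the path is a placeholder.
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel

model.save(sc, "target/tmp/myGradientBoostingClassificationModel")
val sameModel = GradientBoostedTreesModel.load(sc, "target/tmp/myGradientBoostingClassificationModel")

The reloaded value is a full GradientBoostedTreesModel, so predict can be called on it exactly as on the freshly trained one.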
Example 2
Source File: GradientBoostedTree.scala, from the Swallow project (Apache License 2.0)
package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint

import scopt.OptionParser

object GradientBoostedTree {

  case class Params(
      numClasses: Int = 2,
      maxDepth: Int = 30,
      maxBins: Int = 32,
      numIterations: Int = 20,
      learningRate: Double = 0.1,
      dataPath: String = null)

  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("GBT") {
      head("GBT: an example of Gradient Boosted Tree for classification")
      opt[Int]("numClasses")
        .text(s"numClasses, default: ${defaultParams.numClasses}")
        .action((x, c) => c.copy(numClasses = x))
      opt[Int]("maxDepth")
        .text(s"maxDepth, default: ${defaultParams.maxDepth}")
        .action((x, c) => c.copy(maxDepth = x))
      opt[Int]("maxBins")
        .text(s"maxBins, default: ${defaultParams.maxBins}")
        .action((x, c) => c.copy(maxBins = x))
      opt[Int]("numIterations")
        .text(s"numIterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[Double]("learningRate")
        .text(s"learningRate, default: ${defaultParams.learningRate}")
        .action((x, c) => c.copy(learningRate = x))
      arg[String]("<dataPath>")
        .required()
        .text("data path for Gradient Boosted Tree")
        .action((x, c) => c.copy(dataPath = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"Gradient Boosted Tree with $params")
    val sc = new SparkContext(conf)

    val dataPath = params.dataPath
    val numClasses = params.numClasses
    val maxDepth = params.maxDepth
    val maxBins = params.maxBins
    val numIterations = params.numIterations
    val learningRate = params.learningRate

    // Load data file.
    val data: RDD[LabeledPoint] = sc.objectFile(dataPath)

    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a GradientBoostedTrees model.
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = numIterations
    boostingStrategy.learningRate = learningRate
    boostingStrategy.treeStrategy.numClasses = numClasses
    boostingStrategy.treeStrategy.maxDepth = maxDepth
    boostingStrategy.treeStrategy.maxBins = maxBins
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    // Evaluate model on test instances and compute test error
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Test Error = " + testErr)

    sc.stop()
  }
}
Example 3
Source File: MyGradientBoostingClassification.scala, from the Apache-Spark-2x-Machine-Learning-Cookbook project (MIT License)
package spark.ml.cookbook.chapter10

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.sql.SparkSession

object MyGradientBoostingClassification {
  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MyGradientBoostedTreesClassification")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Drop the sample ID (first column); the rest is features plus the class label.
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // Map the Wisconsin class label (2 = benign, 4 = malignant) to 0.0 / 1.0.
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    println("Training Data count: " + trainingData.count())
    println("Test Data Count: " + testData.count())

    val algo = "Classification"
    val numIterations = 3
    val numClasses = 2
    val maxDepth = 5
    val maxBins = 32
    val categoricalFeatureInfo = Map[Int, Int]()

    val boostingStrategy = BoostingStrategy.defaultParams(algo)
    boostingStrategy.setNumIterations(numIterations)
    boostingStrategy.treeStrategy.setNumClasses(numClasses)
    boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
    boostingStrategy.treeStrategy.setMaxBins(maxBins)
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeatureInfo

    evaluate(trainingData, testData, boostingStrategy)
    println("===================")
    spark.stop()
  }

  def evaluate(
      trainingData: RDD[LabeledPoint],
      testData: RDD[LabeledPoint],
      boostingStrategy: BoostingStrategy): Unit = {

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    val metrics = getMetrics(model, testData)
    println("Confusion Matrix :")
    println(metrics.confusionMatrix)
    println("Model Accuracy: " + metrics.precision)
    println("Model Error: " + (1 - metrics.precision))
    // (0 until boostingStrategy.treeStrategy.getNumClasses()).map(
    //   category => (metrics.precision(category), metrics.recall(category))
    // ).foreach(println)
    // println("My Classification GBT model:\n" + model.toDebugString)
  }

  def getMetrics(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionsAndLabels = data.map(example =>
      (model.predict(example.features), example.label)
    )
    new MulticlassMetrics(predictionsAndLabels)
  }
}
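The cookbook example boosts for a fixed number of iterations. MLlib also supports validation-based early stopping through runWithValidation on the GradientBoostedTrees class. A minimal sketch, assuming the trainingData, testData and boostingStrategy values defined above (purely for illustration the test split doubles as the validation set; a real run should hold out a separate validation set):

// Minimal sketch: boosting with validation-based early stopping.
// Assumes trainingData, testData and boostingStrategy from the example above.
import org.apache.spark.mllib.tree.GradientBoostedTrees

val validatedModel = new GradientBoostedTrees(boostingStrategy)
  .runWithValidation(trainingData, testData)
println("Trees kept after validation-based stopping: " + validatedModel.numTrees)

Early stopping may keep fewer trees than boostingStrategy.numIterations when the validation error stops improving.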
Example 4
Source File: MyGradientBoostingRegression.scala, from the Apache-Spark-2x-Machine-Learning-Cookbook project (MIT License)
package spark.ml.cookbook.chapter10

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.sql.SparkSession

object MyGradientBoostingRegression {
  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MyGradientBoostedTreesRegression")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Drop the sample ID (first column); the rest is features plus the class label.
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // Map the Wisconsin class label (2 = benign, 4 = malignant) to 0.0 / 1.0.
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    println("Training Data count: " + trainingData.count())
    println("Test Data Count: " + testData.count())

    val algo = "Regression"
    val numIterations = 3
    val maxDepth = 5
    val maxBins = 32
    val categoricalFeatureInfo = Map[Int, Int]()

    val boostingStrategy = BoostingStrategy.defaultParams(algo)
    boostingStrategy.setNumIterations(numIterations)
    boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
    boostingStrategy.treeStrategy.setMaxBins(maxBins)
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeatureInfo

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    val metrics = getMetrics(model, testData)
    println("Test Mean Squared Error = " + metrics.meanSquaredError)
    println("My regression GBT model:\n" + model.toDebugString)

    spark.stop()
  }

  def getMetrics(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): RegressionMetrics = {
    val predictionsAndLabels = data.map(example =>
      (model.predict(example.features), example.label)
    )
    new RegressionMetrics(predictionsAndLabels)
  }
}
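Besides toDebugString, a trained GradientBoostedTreesModel exposes its ensemble structure directly through the trees, treeWeights, numTrees and totalNumNodes members. A short sketch, assuming the model value from the regression example above:

// Minimal sketch: inspecting the trained ensemble.
// Assumes `model` from the example above.
println("Number of trees: " + model.numTrees)
println("Total number of nodes: " + model.totalNumNodes)
println("Per-tree weights: " + model.treeWeights.mkString(", "))
println("Depth of the first tree: " + model.trees.head.depth)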