org.apache.spark.mllib.tree.configuration.BoostingStrategy Scala Examples
The following examples show how to use org.apache.spark.mllib.tree.configuration.BoostingStrategy.
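Before the project-specific examples, here is a minimal sketch of the pattern most of them share: build a BoostingStrategy from defaultParams, tune a few fields, and train. It assumes an existing SparkContext and an RDD[LabeledPoint] named data; the parameter values are illustrative, not recommendations.

    import org.apache.spark.mllib.tree.GradientBoostedTrees
    import org.apache.spark.mllib.tree.configuration.BoostingStrategy

    // "Classification" and "Regression" are the two supported default parameter sets
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = 10
    boostingStrategy.treeStrategy.maxDepth = 5
    // An empty map means all features are treated as continuous
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()

    val model = GradientBoostedTrees.train(data, boostingStrategy)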
Example 1
Source File: GradientBoostedTreesSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.mllib.tree.{GradientBoostedTreesSuite => OldGBTSuite}
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Variance
import org.apache.spark.mllib.tree.loss.{AbsoluteError, LogLoss, SquaredError}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  import testImplicits._

  test("runWithValidation stops early and performs better on a validation dataset") {
    // Set numIterations large enough so that it stops early.
    val numIterations = 20
    val trainRdd = sc.parallelize(OldGBTSuite.trainData, 2).map(_.asML)
    val validateRdd = sc.parallelize(OldGBTSuite.validateData, 2).map(_.asML)
    val trainDF = trainRdd.toDF()
    val validateDF = validateRdd.toDF()

    val algos = Array(Regression, Regression, Classification)
    val losses = Array(SquaredError, AbsoluteError, LogLoss)
    algos.zip(losses).foreach { case (algo, loss) =>
      val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2,
        categoricalFeaturesInfo = Map.empty)
      val boostingStrategy =
        new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0)
      val (validateTrees, validateTreeWeights) = GradientBoostedTrees
        .runWithValidation(trainRdd, validateRdd, boostingStrategy, 42L)
      val numTrees = validateTrees.length
      assert(numTrees !== numIterations)

      // Test that it performs better on the validation dataset.
      val (trees, treeWeights) = GradientBoostedTrees.run(trainRdd, boostingStrategy, 42L)
      val (errorWithoutValidation, errorWithValidation) = {
        if (algo == Classification) {
          val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
          (GradientBoostedTrees.computeError(remappedRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(remappedRdd, validateTrees,
              validateTreeWeights, loss))
        } else {
          (GradientBoostedTrees.computeError(validateRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(validateRdd, validateTrees,
              validateTreeWeights, loss))
        }
      }
      assert(errorWithValidation <= errorWithoutValidation)

      // Test that results from evaluateEachIteration comply with runWithValidation.
      // Note that convergenceTol is set to 0.0
      val evaluationArray = GradientBoostedTrees
        .evaluateEachIteration(validateRdd, trees, treeWeights, loss, algo)
      assert(evaluationArray.length === numIterations)
      assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1))
      var i = 1
      while (i < numTrees) {
        assert(evaluationArray(i) <= evaluationArray(i - 1))
        i += 1
      }
    }
  }
}
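The suite above drives the private org.apache.spark.ml.tree.impl helper directly. Outside Spark's own tests, the same early-stopping behaviour is reachable through the public mllib class of the same name. A minimal sketch, assuming trainRdd and validateRdd are RDD[org.apache.spark.mllib.regression.LabeledPoint]:

    import org.apache.spark.mllib.tree.GradientBoostedTrees
    import org.apache.spark.mllib.tree.configuration.BoostingStrategy

    val boostingStrategy = BoostingStrategy.defaultParams("Regression")
    boostingStrategy.numIterations = 20
    // Stop boosting once the validation-set error improves by less than validationTol
    boostingStrategy.validationTol = 0.001
    val model = new GradientBoostedTrees(boostingStrategy)
      .runWithValidation(trainRdd, validateRdd)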
Example 2
package se.uu.farmbio.cp.alg

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.loss.LogLoss
import org.apache.spark.rdd.RDD

import se.uu.farmbio.cp.UnderlyingAlgorithm

// Define a GBTs UnderlyingAlgorithm
private object GBT {
  def trainingProcedure(
    input: RDD[LabeledPoint],
    numIterations: Int): (Vector => Double) = {

    // Configuration
    val boostingStrategy = BoostingStrategy.defaultParams("Regression")
    boostingStrategy.numIterations = numIterations
    boostingStrategy.treeStrategy.maxDepth = 5
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()
    boostingStrategy.loss = LogLoss

    // Training: MLlib's LogLoss expects labels in {-1, +1}, so remap the {0, 1} labels first
    val remappedInput = input.map(x => new LabeledPoint((x.label * 2) - 1, x.features))
    val model = new GradientBoostedTrees(boostingStrategy)
      .run(input = remappedInput)

    model.predict
  }
}

class GBT(
  private val input: RDD[LabeledPoint],
  private val numIterations: Int)
  extends UnderlyingAlgorithm(GBT.trainingProcedure(input, numIterations)) {

  // Higher raw scores indicate the positive class, so negate them for positive samples
  override def nonConformityMeasure(newSample: LabeledPoint) = {
    val score = predictor(newSample.features)
    if (newSample.label == 1.0) {
      -score
    } else {
      score
    }
  }
}
Example 3
Source File: GradientBoostedTreesUtil.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.gradientboosted

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object GradientBoostedTreesUtil {

  def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = {
    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val numData = recordsArray._2
    println(numData.toString)
    records.cache()
    print("Mapping of first categorical feature column: " + get_mapping(records, 2))

    // Build one category-to-index mapping per categorical column (columns 2 to 9)
    var list = new ListBuffer[Map[String, Long]]()
    for (i <- 2 to 9) {
      list += get_mapping(records, i)
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach(m => catLen += m.size)
    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    val data = records.map(r =>
      LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    val splits = data.randomSplit(Array(0.8, 0.2), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)
    (training, test)
  }

  def get_mapping(rdd: RDD[Array[String]], idx: Int): Map[String, Long] = {
    rdd.map(fields => fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def evaluate(train: RDD[LabeledPoint], test: RDD[LabeledPoint],
               iterations: Int, maxDepth: Int, maxBins: Int): Double = {
    val boostingStrategy = BoostingStrategy.defaultParams("Regression")
    boostingStrategy.setNumIterations(iterations)
    boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
    boostingStrategy.treeStrategy.setMaxBins(maxBins)

    val model = GradientBoostedTrees.train(train, boostingStrategy)
    // Root mean squared log error over the held-out test set
    val true_vs_predicted = test.map(p => (p.label, model.predict(p.features)))
    math.sqrt(true_vs_predicted.map { case (t, p) => Util.squaredLogError(t, p) }.mean())
  }
}
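A hypothetical caller, sketching how the two helpers above combine into a small grid search over the number of boosting iterations (the other settings are held at the values the book uses elsewhere):

    val (train, test) = GradientBoostedTreesUtil.getTrainTestData()
    // RMSLE for a few iteration counts; lower is better
    Seq(1, 5, 10, 20).foreach { iterations =>
      val rmsle = GradientBoostedTreesUtil.evaluate(train, test, iterations, maxDepth = 5, maxBins = 32)
      println(s"iterations=$iterations -> RMSLE=$rmsle")
    }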
Example 4
Source File: GradientBoostedTreesApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.gradientboosted

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object GradientBoostedTreesApp {

  def get_mapping(rdd: RDD[Array[String]], idx: Int): Map[String, Long] = {
    rdd.map(fields => fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    val sc = Util.sc
    // Read the raw CSV data and split each line into an array of fields
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()
    val records = rawData.map(line => line.split(","))
    records.cache()

    // Build one category-to-index mapping per categorical column (columns 2 to 9)
    var list = new ListBuffer[Map[String, Long]]()
    for (i <- 2 to 9) {
      list += get_mapping(records, i)
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach(m => catLen += m.size)
    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features: " + catLen)
    println("Feature vector length for numerical features: " + numLen)
    println("Total feature vector length: " + totalLen)

    val data = records.map(r =>
      LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    val first_point = data.first()
    println("Gradient Boosted Trees Model feature vector: " + first_point.features.toString)
    println("Gradient Boosted Trees Model feature vector length: " + first_point.features.size)

    val boostingStrategy = BoostingStrategy.defaultParams("Regression")
    boostingStrategy.setNumIterations(3) // Note: use more iterations in practice.
    boostingStrategy.treeStrategy.setMaxDepth(5)

    val model = GradientBoostedTrees.train(data, boostingStrategy)
    val true_vs_predicted = data.map(p => (p.label, model.predict(p.features)))
    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for (i <- 0 until 5) {
      println(s"True vs Predicted $i: " + true_vs_predicted_take5(i))
    }

    val save = true
    if (save) {
      val true_vs_predicted_csv = data.map(p => p.label + "," + model.predict(p.features))
      val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
      val date = format.format(new java.util.Date())
      true_vs_predicted_csv.saveAsTextFile("./output/gradient_boosted_trees_" + date + ".csv")
    }

    val mse = true_vs_predicted.map { case (t, p) => Util.squaredError(t, p) }.mean()
    val mae = true_vs_predicted.map { case (t, p) => Util.absError(t, p) }.mean()
    val rmsle = math.sqrt(true_vs_predicted.map { case (t, p) => Util.squaredLogError(t, p) }.mean())
    println("Gradient Boosted Trees - Mean Squared Error: " + mse)
    println("Gradient Boosted Trees - Mean Absolute Error: " + mae)
    println("Gradient Boosted Trees - Root Mean Squared Log Error: " + rmsle)
  }
}
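Note that the app above reports MSE, MAE, and RMSLE on the same data it was trained on, which flatters the model. Holding out a test split is a small change; a sketch reusing the app's data RDD, boostingStrategy, and Util helpers:

    // Hold out 20% of the data for an honest error estimate
    val Array(trainData, testData) = data.randomSplit(Array(0.8, 0.2), seed = 11L)
    val heldOutModel = GradientBoostedTrees.train(trainData, boostingStrategy)
    val testRmsle = math.sqrt(testData
      .map(p => Util.squaredLogError(p.label, heldOutModel.predict(p.features)))
      .mean())
    println("Held-out RMSLE: " + testRmsle)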
Example 5
Source File: GradientBoostedTreesSuite.scala From sparkoscope with Apache License 2.0
The code is identical to Example 1 (this fork carries the same upstream test suite), so it is not repeated here.
Example 6
Source File: GradientBoostedTreesSuite.scala From multi-tenancy-spark with Apache License 2.0
The code is identical to Example 1, so it is not repeated here.
Example 7
Source File: GradientBoostedTreesExample.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.util.MLUtils

object GradientBoostedTreesExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("GradientBoostedTreesExample")
    val sc = new SparkContext(conf)

    // Load the data
    val data = MLUtils.loadLibSVMFile(sc, "../data/mllib/rf_libsvm_data.txt")
    // Randomly split the data into a training set and a test set
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Create a classification boosting strategy and set the number of iterations to 3
    // (regression is supported as well)
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = 3

    // Gradient-boosted trees combine many decision trees to reduce noise and avoid overfitting
    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    // Evaluate the model on the test instances and compute the test error
    val testErr = testData.map { point =>
      val prediction = model.predict(point.features)
      if (point.label == prediction) 0.0 else 1.0
    }.mean()

    // Inspect the model
    println("Test Error = " + testErr)
    println("Learned GBT model:\n" + model.toDebugString)

    sc.stop()
  }
}
Example 8
Source File: GradientBoostedTreesSuite.scala From Spark-2.3.1 with Apache License 2.0
The code is identical to Example 1 except that the Spark 2.3.1 API threads a featureSubsetStrategy argument (here "all") through the training calls:

    val (validateTrees, validateTreeWeights) = GradientBoostedTrees
      .runWithValidation(trainRdd, validateRdd, boostingStrategy, 42L, "all")
    ...
    val (trees, treeWeights) = GradientBoostedTrees.run(trainRdd, boostingStrategy, 42L, "all")
Example 9
Source File: GradientBoostingTree.scala From Swallow with Apache License 2.0
// scalastyle:off println
package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint

object GradientBoostingTree {

  def main(args: Array[String]): Unit = {
    var inputPath = ""
    var numIterations: Int = 3
    val numClasses: Int = 2
    val maxDepth: Int = 5

    if (args.length == 2) {
      inputPath = args(0)
      numIterations = args(1).toInt
    }

    val conf = new SparkConf().setAppName("GradientBoostingTree")
    val sc = new SparkContext(conf)

    // Load and parse the data file.
    // val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    val data: RDD[LabeledPoint] = sc.objectFile(inputPath)

    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a GradientBoostedTrees model.
    // The defaultParams for Classification use LogLoss by default.
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = numIterations
    boostingStrategy.treeStrategy.numClasses = numClasses
    boostingStrategy.treeStrategy.maxDepth = maxDepth
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    // Evaluate model on test instances and compute test error
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Test Error = " + testErr)

    sc.stop()
  }
}
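The GradientBoostedTreesModel import above is only needed if the trained model is persisted. A sketch of the save/load round trip, with a hypothetical HDFS path:

    // Persist the trained model and load it back (the path is hypothetical)
    model.save(sc, "hdfs:///models/gbtModel")
    val sameModel = GradientBoostedTreesModel.load(sc, "hdfs:///models/gbtModel")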
Example 10
Source File: GradientBoostedTree.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import scopt.OptionParser

object GradientBoostedTree {

  case class Params(
      numClasses: Int = 2,
      maxDepth: Int = 30,
      maxBins: Int = 32,
      numIterations: Int = 20,
      learningRate: Double = 0.1,
      dataPath: String = null)

  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("GBT") {
      head("GBT: an example of Gradient Boosted Tree for classification")
      opt[Int]("numClasses")
        .text(s"numClasses, default: ${defaultParams.numClasses}")
        .action((x, c) => c.copy(numClasses = x))
      opt[Int]("maxDepth")
        .text(s"maxDepth, default: ${defaultParams.maxDepth}")
        .action((x, c) => c.copy(maxDepth = x))
      opt[Int]("maxBins")
        .text(s"maxBins, default: ${defaultParams.maxBins}")
        .action((x, c) => c.copy(maxBins = x))
      opt[Int]("numIterations")
        .text(s"numIterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[Double]("learningRate")
        .text(s"learningRate, default: ${defaultParams.learningRate}")
        .action((x, c) => c.copy(learningRate = x))
      arg[String]("<dataPath>")
        .required()
        .text("data path for Gradient Boosted Tree")
        .action((x, c) => c.copy(dataPath = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"Gradient Boosted Tree with $params")
    val sc = new SparkContext(conf)

    val dataPath = params.dataPath
    val numClasses = params.numClasses
    val maxDepth = params.maxDepth
    val maxBins = params.maxBins
    val numIterations = params.numIterations
    val learningRate = params.learningRate

    // Load data file.
    val data: RDD[LabeledPoint] = sc.objectFile(dataPath)

    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a GradientBoostedTrees model.
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = numIterations
    boostingStrategy.learningRate = learningRate
    boostingStrategy.treeStrategy.numClasses = numClasses
    boostingStrategy.treeStrategy.maxDepth = maxDepth
    boostingStrategy.treeStrategy.maxBins = maxBins
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    // Evaluate model on test instances and compute test error
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Test Error = " + testErr)

    sc.stop()
  }
}
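For reference, the scopt parser can be bypassed in tests by constructing Params directly; a sketch with hypothetical values and data path:

    // Hypothetical direct invocation, skipping the CLI parser
    run(Params(numIterations = 50, learningRate = 0.05, dataPath = "hdfs:///data/labeled-points"))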
Example 11
Source File: MyGradientBoostingClassification.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter10

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.sql.SparkSession

object MyGradientBoostingClassification {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MyGradientBoostedTreesClassification")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Drop the sample ID column; the remaining columns (except the last) are features
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // Raw labels are 2 (benign) and 4 (malignant); map them to 0.0 and 1.0
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))
    println("Training Data count: " + trainingData.count())
    println("Test Data Count: " + testData.count())

    val algo = "Classification"
    val numIterations = 3
    val numClasses = 2
    val maxDepth = 5
    val maxBins = 32
    val categoricalFeatureInfo = Map[Int, Int]()

    val boostingStrategy = BoostingStrategy.defaultParams(algo)
    boostingStrategy.setNumIterations(numIterations)
    boostingStrategy.treeStrategy.setNumClasses(numClasses)
    boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
    boostingStrategy.treeStrategy.setMaxBins(maxBins)
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeatureInfo

    evaluate(trainingData, testData, boostingStrategy)
    println("===================")
    spark.stop()
  }

  def evaluate(
      trainingData: RDD[LabeledPoint],
      testData: RDD[LabeledPoint],
      boostingStrategy: BoostingStrategy): Unit = {
    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)
    val metrics = getMetrics(model, testData)
    println("Confusion Matrix :")
    println(metrics.confusionMatrix)
    println("Model Accuracy: " + metrics.precision)
    println("Model Error: " + (1 - metrics.precision))
    // (0 until boostingStrategy.treeStrategy.getNumClasses()).map(
    //   category => (metrics.precision(category), metrics.recall(category))
    // ).foreach(println)
    // println("My Classification GBT model:\n" + model.toDebugString)
  }

  def getMetrics(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionsAndLabels = data.map(example =>
      (model.predict(example.features), example.label))
    new MulticlassMetrics(predictionsAndLabels)
  }
}
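One caveat on the metrics: the no-argument MulticlassMetrics.precision used above is deprecated in Spark 2.x in favour of accuracy, and per-class figures take a label argument. A sketch of the replacement calls on the metrics object returned by getMetrics:

    val metrics = getMetrics(model, testData)
    println("Model Accuracy: " + metrics.accuracy) // replaces the deprecated no-arg precision
    // Per-class precision and recall for the two labels, 0.0 and 1.0
    Seq(0.0, 1.0).foreach { label =>
      println(s"label=$label precision=${metrics.precision(label)} recall=${metrics.recall(label)}")
    }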
Example 12
Source File: MyGradientBoostingRegression.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter10

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.sql.SparkSession

object MyGradientBoostingRegression {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MyGradientBoostedTreesRegression")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Drop the sample ID column; the remaining columns (except the last) are features
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // Raw labels are 2 (benign) and 4 (malignant); map them to 0.0 and 1.0
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))
    println("Training Data count: " + trainingData.count())
    println("Test Data Count: " + testData.count())

    val algo = "Regression"
    val numIterations = 3
    val maxDepth = 5
    val maxBins = 32
    val categoricalFeatureInfo = Map[Int, Int]()

    val boostingStrategy = BoostingStrategy.defaultParams(algo)
    boostingStrategy.setNumIterations(numIterations)
    boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
    boostingStrategy.treeStrategy.setMaxBins(maxBins)
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeatureInfo

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)
    val metrics = getMetrics(model, testData)

    println("Test Mean Squared Error = " + metrics.meanSquaredError)
    println("My regression GBT model:\n" + model.toDebugString)

    spark.stop()
  }

  def getMetrics(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): RegressionMetrics = {
    val predictionsAndLabels = data.map(example =>
      (model.predict(example.features), example.label))
    new RegressionMetrics(predictionsAndLabels)
  }
}
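RegressionMetrics exposes more than meanSquaredError; a short sketch of the other summary statistics available on the object returned by getMetrics:

    val metrics = getMetrics(model, testData)
    println("RMSE = " + metrics.rootMeanSquaredError)
    println("MAE  = " + metrics.meanAbsoluteError)
    println("R2   = " + metrics.r2)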