org.apache.spark.mllib.tree.configuration.Strategy Scala Examples
The following examples show how to use org.apache.spark.mllib.tree.configuration.Strategy.
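Strategy bundles the parameters that control how a single decision tree is grown: the algorithm (classification or regression), the impurity measure, the maximum depth, the bin count, and the categorical-feature map. As a minimal sketch (the parameter values here are illustrative, not taken from the examples below), a Strategy can be built from the library defaults and then tuned field by field:

import org.apache.spark.mllib.tree.configuration.{Algo, Strategy}
import org.apache.spark.mllib.tree.impurity.Gini

// Start from the library defaults for classification...
val strategy = Strategy.defaultStrategy(Algo.Classification)
// ...then override individual settings; Strategy exposes its fields as mutable vars.
strategy.maxDepth = 5      // illustrative value
strategy.numClasses = 3    // the defaults assume a binary problem
strategy.impurity = Gini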
Example 1
Source File: GradientBoostedTreesSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.mllib.tree.{GradientBoostedTreesSuite => OldGBTSuite}
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Variance
import org.apache.spark.mllib.tree.loss.{AbsoluteError, LogLoss, SquaredError}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  import testImplicits._

  test("runWithValidation stops early and performs better on a validation dataset") {
    // Set numIterations large enough so that it stops early.
    val numIterations = 20
    val trainRdd = sc.parallelize(OldGBTSuite.trainData, 2).map(_.asML)
    val validateRdd = sc.parallelize(OldGBTSuite.validateData, 2).map(_.asML)
    val trainDF = trainRdd.toDF()
    val validateDF = validateRdd.toDF()

    val algos = Array(Regression, Regression, Classification)
    val losses = Array(SquaredError, AbsoluteError, LogLoss)
    algos.zip(losses).foreach { case (algo, loss) =>
      val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2,
        categoricalFeaturesInfo = Map.empty)
      val boostingStrategy =
        new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0)
      val (validateTrees, validateTreeWeights) = GradientBoostedTrees
        .runWithValidation(trainRdd, validateRdd, boostingStrategy, 42L)
      val numTrees = validateTrees.length
      assert(numTrees !== numIterations)

      // Test that it performs better on the validation dataset.
      val (trees, treeWeights) = GradientBoostedTrees.run(trainRdd, boostingStrategy, 42L)
      val (errorWithoutValidation, errorWithValidation) = {
        if (algo == Classification) {
          val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
          (GradientBoostedTrees.computeError(remappedRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(remappedRdd, validateTrees,
              validateTreeWeights, loss))
        } else {
          (GradientBoostedTrees.computeError(validateRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(validateRdd, validateTrees,
              validateTreeWeights, loss))
        }
      }
      assert(errorWithValidation <= errorWithoutValidation)

      // Test that results from evaluateEachIteration comply with runWithValidation.
      // Note that convergenceTol is set to 0.0
      val evaluationArray = GradientBoostedTrees
        .evaluateEachIteration(validateRdd, trees, treeWeights, loss, algo)
      assert(evaluationArray.length === numIterations)
      assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1))
      var i = 1
      while (i < numTrees) {
        assert(evaluationArray(i) <= evaluationArray(i - 1))
        i += 1
      }
    }
  }
}
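This suite drives the internal org.apache.spark.ml.tree.impl.GradientBoostedTrees helper directly. Outside of Spark's own tests, the same Strategy/BoostingStrategy pairing is reached through the public MLlib entry point; a minimal sketch, assuming an RDD[LabeledPoint] named data is already in scope:

import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy

// Default boosting parameters wrap a default tree Strategy.
val boostingStrategy = BoostingStrategy.defaultParams("Classification")
boostingStrategy.numIterations = 10        // illustrative value
boostingStrategy.treeStrategy.maxDepth = 3 // tune the underlying Strategy directly
// `data` is an assumed RDD[org.apache.spark.mllib.regression.LabeledPoint].
val model = GradientBoostedTrees.train(data, boostingStrategy)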
Example 2
Source File: MLLibRandomForest.scala From reforest with Apache License 2.0
package reforest.example

import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.configuration.{Algo, QuantileStrategy, Strategy}
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.mllib.util.MLUtils
import reforest.rf.feature.RFStrategyFeatureSQRT
import reforest.rf.parameter._
import reforest.util.CCUtil

import scala.util.Random

object MLLibRandomForest {
  def main(args: Array[String]): Unit = {
    val property = RFParameterBuilder.apply
      .addParameter(RFParameterType.Dataset, "data/sample-covtype.libsvm")
      .addParameter(RFParameterType.NumFeatures, 54)
      .addParameter(RFParameterType.NumClasses, 10)
      .addParameter(RFParameterType.NumTrees, 100)
      .addParameter(RFParameterType.Depth, Array(10))
      .addParameter(RFParameterType.BinNumber, Array(8))
      .addParameter(RFParameterType.SparkMaster, "local[4]")
      .addParameter(RFParameterType.SparkCoresMax, 4)
      .addParameter(RFParameterType.SparkPartition, 4 * 4)
      .addParameter(RFParameterType.SparkExecutorMemory, "4096m")
      .addParameter(RFParameterType.SparkExecutorInstances, 1)
      .build

    val sc = CCUtil.getSparkContext(property)
    sc.setLogLevel("error")

    val timeStart = System.currentTimeMillis()

    val data = MLUtils.loadLibSVMFile(sc, property.dataset, property.numFeatures,
      property.sparkCoresMax * 2)
    val splits = data.randomSplit(Array(0.6, 0.2, 0.2), 0)
    val (trainingData, testData) = (splits(0), splits(2))

    // Train a RandomForest model.
    // val categoricalFeaturesInfo = Array.tabulate(200)(i => (i, 5)).toMap
    val categoricalFeaturesInfo = Map[Int, Int]()
    val featureSubsetStrategy = "sqrt"
    val impurity = "entropy"

    val s = new Strategy(Algo.Classification, Entropy, property.getMaxDepth,
      property.numClasses, property.getMaxBinNumber, QuantileStrategy.Sort,
      categoricalFeaturesInfo, 1)
    val model = RandomForest.trainClassifier(trainingData, s, property.getMaxNumTrees,
      featureSubsetStrategy, Random.nextInt())
    val timeEnd = System.currentTimeMillis()

    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }

    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Time: " + (timeEnd - timeStart))
    println("Test Error = " + testErr)
    if (property.outputTree) {
      println("Learned classification forest model:\n" + model.toDebugString)
    }
  }
}
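For reference, the positional arguments in the new Strategy(...) call above are algo, impurity, maxDepth, numClasses, maxBins, quantileCalculationStrategy, and categoricalFeaturesInfo, with the trailing 1 setting minInstancesPerNode; the remaining Strategy parameters keep their defaults.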
Example 3
Source File: MLLibRandomForestFromFile.scala From reforest with Apache License 2.0
package reforest.example

import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.configuration.{Algo, QuantileStrategy, Strategy}
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.mllib.util.MLUtils
import reforest.rf.feature.RFStrategyFeatureSQRT
import reforest.rf.parameter._
import reforest.util.{CCUtil, CCUtilIO}

import scala.util.Random

object MLLibRandomForestFromFile {
  def main(args: Array[String]): Unit = {
    val property = RFParameterFromFile(args(0)).applyAppName("MLLib")

    val sc = CCUtil.getSparkContext(property)
    sc.setLogLevel("error")

    val timeStart = System.currentTimeMillis()

    val data = MLUtils.loadLibSVMFile(sc, property.dataset, property.numFeatures,
      property.sparkCoresMax * 2)
    val splits = data.randomSplit(Array(0.7, 0.3), 0)
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a RandomForest model.
    // val categoricalFeaturesInfo = Array.tabulate(200)(i => (i, 5)).toMap
    val categoricalFeaturesInfo = Map[Int, Int]()
    val featureSubsetStrategy = "sqrt"
    val impurity = "entropy"

    val s = new Strategy(Algo.Classification, Entropy, property.getMaxDepth,
      property.numClasses, property.getMaxBinNumber, QuantileStrategy.Sort,
      categoricalFeaturesInfo, 1)
    val model = RandomForest.trainClassifier(trainingData, s, property.getMaxNumTrees,
      featureSubsetStrategy, Random.nextInt())
    val timeEnd = System.currentTimeMillis()

    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }

    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    CCUtilIO.logACCURACY(property, (1 - testErr), (timeEnd - timeStart))
    println("Time: " + (timeEnd - timeStart))
    println("Test Error = " + testErr)
    if (property.outputTree) {
      println("Learned classification forest model:\n" + model.toDebugString)
    }
  }
}
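Note that both ReForeSt examples pass an explicit seed (0) to randomSplit, so the train/test partition, and with it the reported test error, is reproducible across runs; the forest itself is seeded with Random.nextInt(), so the trained trees can still differ between runs.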
Example 4
Source File: RandomForestClassifierExample.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.tree.configuration.Strategy

object RandomForestClassifierExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RandomForestClassifierExample")
    val sc = new SparkContext(conf)
    // Load the data.
    val data = MLUtils.loadLibSVMFile(sc, "../data/mllib/rf_libsvm_data.txt")
    // Randomly split the data into two parts: one for training, one for testing.
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))
    // Create a classification tree strategy (random forests also support regression).
    val treeStrategy = Strategy.defaultStrategy("Classification")
    // Train the model.
    val model = RandomForest.trainClassifier(trainingData, treeStrategy, numTrees = 3,
      featureSubsetStrategy = "auto", seed = 12345)
    // Evaluate the model on the test instances and compute the test error
    // (the fraction of mispredicted labels).
    val testErr = testData.map { point =>
      val prediction = model.predict(point.features)
      if (point.label != prediction) 1.0 else 0.0
    }.mean()
    // Inspect the model.
    println("Test Error = " + testErr)
    println("Learned Random Forest:\n" + model.toDebugString)
  }
}
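Strategy.defaultStrategy("Classification") configures a binary problem (numClasses = 2) with Gini impurity; for multiclass data, override fields such as treeStrategy.numClasses before training.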
Example 5
Source File: GradientBoostedTreesSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.mllib.tree.{GradientBoostedTreesSuite => OldGBTSuite}
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Variance
import org.apache.spark.mllib.tree.loss.{AbsoluteError, LogLoss, SquaredError}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  import testImplicits._

  test("runWithValidation stops early and performs better on a validation dataset") {
    // Set numIterations large enough so that it stops early.
    val numIterations = 20
    val trainRdd = sc.parallelize(OldGBTSuite.trainData, 2).map(_.asML)
    val validateRdd = sc.parallelize(OldGBTSuite.validateData, 2).map(_.asML)
    val trainDF = trainRdd.toDF()
    val validateDF = validateRdd.toDF()

    val algos = Array(Regression, Regression, Classification)
    val losses = Array(SquaredError, AbsoluteError, LogLoss)
    algos.zip(losses).foreach { case (algo, loss) =>
      val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2,
        categoricalFeaturesInfo = Map.empty)
      val boostingStrategy =
        new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0)
      val (validateTrees, validateTreeWeights) = GradientBoostedTrees
        .runWithValidation(trainRdd, validateRdd, boostingStrategy, 42L, "all")
      val numTrees = validateTrees.length
      assert(numTrees !== numIterations)

      // Test that it performs better on the validation dataset.
      val (trees, treeWeights) = GradientBoostedTrees.run(trainRdd, boostingStrategy, 42L, "all")
      val (errorWithoutValidation, errorWithValidation) = {
        if (algo == Classification) {
          val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
          (GradientBoostedTrees.computeError(remappedRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(remappedRdd, validateTrees,
              validateTreeWeights, loss))
        } else {
          (GradientBoostedTrees.computeError(validateRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(validateRdd, validateTrees,
              validateTreeWeights, loss))
        }
      }
      assert(errorWithValidation <= errorWithoutValidation)

      // Test that results from evaluateEachIteration comply with runWithValidation.
      // Note that convergenceTol is set to 0.0
      val evaluationArray = GradientBoostedTrees
        .evaluateEachIteration(validateRdd, trees, treeWeights, loss, algo)
      assert(evaluationArray.length === numIterations)
      assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1))
      var i = 1
      while (i < numTrees) {
        assert(evaluationArray(i) <= evaluationArray(i - 1))
        i += 1
      }
    }
  }
}
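Unlike Example 1, this Spark 2.3.1 variant of the suite passes an explicit featureSubsetStrategy argument ("all") to run and runWithValidation, so every feature is considered as a split candidate at each node.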