org.apache.spark.mllib.tree.configuration.Algo Scala Examples
The following examples show how to use org.apache.spark.mllib.tree.configuration.Algo.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: MLLibRandomForest.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.example

import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.configuration.{Algo, QuantileStrategy, Strategy}
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.mllib.util.MLUtils
import reforest.rf.feature.RFStrategyFeatureSQRT
import reforest.rf.parameter._
import reforest.util.CCUtil

import scala.util.Random

/**
 * Trains a plain Spark MLlib random forest on the sample covtype dataset with a
 * hard-coded configuration, then prints wall-clock training time and test error.
 * Serves as a baseline against the ReForeSt implementation in this project.
 */
object MLLibRandomForest {
  def main(args: Array[String]): Unit = {
    // Experiment configuration: dataset location, forest shape, and Spark resources.
    val property = RFParameterBuilder.apply
      .addParameter(RFParameterType.Dataset, "data/sample-covtype.libsvm")
      .addParameter(RFParameterType.NumFeatures, 54)
      .addParameter(RFParameterType.NumClasses, 10)
      .addParameter(RFParameterType.NumTrees, 100)
      .addParameter(RFParameterType.Depth, Array(10))
      .addParameter(RFParameterType.BinNumber, Array(8))
      .addParameter(RFParameterType.SparkMaster, "local[4]")
      .addParameter(RFParameterType.SparkCoresMax, 4)
      .addParameter(RFParameterType.SparkPartition, 4 * 4)
      .addParameter(RFParameterType.SparkExecutorMemory, "4096m")
      .addParameter(RFParameterType.SparkExecutorInstances, 1)
      .build

    val sc = CCUtil.getSparkContext(property)
    sc.setLogLevel("error")

    val timeStart = System.currentTimeMillis()
    val data = MLUtils.loadLibSVMFile(sc, property.dataset, property.numFeatures, property.sparkCoresMax * 2)

    // 60/20/20 split with a fixed seed (0); the middle (validation) slice is unused here.
    val splits = data.randomSplit(Array(0.6, 0.2, 0.2), 0)
    val (trainingData, testData) = (splits(0), splits(2))

    // Train a RandomForest model.
    // val categoricalFeaturesInfo = Array.tabulate(200)(i => (i, 5)).toMap
    val categoricalFeaturesInfo = Map[Int, Int]()
    val featureSubsetStrategy = "sqrt"

    // Entropy impurity is passed directly to Strategy (a redundant unused
    // `impurity` string local was removed).
    val s = new Strategy(Algo.Classification, Entropy, property.getMaxDepth, property.numClasses,
      property.getMaxBinNumber, QuantileStrategy.Sort, categoricalFeaturesInfo, 1)
    val model = RandomForest.trainClassifier(trainingData, s, property.getMaxNumTrees, featureSubsetStrategy, Random.nextInt())
    val timeEnd = System.currentTimeMillis()

    // Evaluate on the held-out test slice: fraction of misclassified points.
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()

    println("Time: " + (timeEnd - timeStart))
    println("Test Error = " + testErr)
    if (property.outputTree) {
      println("Learned classification forest model:\n" + model.toDebugString)
    }
  }
}
Example 2
Source File: MLLibRandomForestFromFile.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.example

import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.configuration.{Algo, QuantileStrategy, Strategy}
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.mllib.util.MLUtils
import reforest.rf.feature.RFStrategyFeatureSQRT
import reforest.rf.parameter._
import reforest.util.{CCUtil, CCUtilIO}

import scala.util.Random

/**
 * Same MLlib random-forest baseline as [[MLLibRandomForest]], but the experiment
 * configuration is read from a properties file given as the first CLI argument.
 * Logs accuracy and elapsed time via CCUtilIO in addition to printing them.
 */
object MLLibRandomForestFromFile {
  def main(args: Array[String]): Unit = {
    // args(0): path to the RFParameter configuration file.
    val property = RFParameterFromFile(args(0)).applyAppName("MLLib")

    val sc = CCUtil.getSparkContext(property)
    sc.setLogLevel("error")

    val timeStart = System.currentTimeMillis()
    val data = MLUtils.loadLibSVMFile(sc, property.dataset, property.numFeatures, property.sparkCoresMax * 2)

    // 70/30 train/test split with a fixed seed (0).
    val splits = data.randomSplit(Array(0.7, 0.3), 0)
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a RandomForest model.
    // val categoricalFeaturesInfo = Array.tabulate(200)(i => (i, 5)).toMap
    val categoricalFeaturesInfo = Map[Int, Int]()
    val featureSubsetStrategy = "sqrt"

    // Entropy impurity is passed directly to Strategy (a redundant unused
    // `impurity` string local was removed).
    val s = new Strategy(Algo.Classification, Entropy, property.getMaxDepth, property.numClasses,
      property.getMaxBinNumber, QuantileStrategy.Sort, categoricalFeaturesInfo, 1)
    val model = RandomForest.trainClassifier(trainingData, s, property.getMaxNumTrees, featureSubsetStrategy, Random.nextInt())
    val timeEnd = System.currentTimeMillis()

    // Evaluate on the held-out test slice: fraction of misclassified points.
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()

    CCUtilIO.logACCURACY(property, (1 - testErr), (timeEnd - timeStart))
    println("Time: " + (timeEnd - timeStart))
    println("Test Error = " + testErr)
    if (property.outputTree) {
      println("Learned classification forest model:\n" + model.toDebugString)
    }
  }
}
Example 3
Source File: GradientBoostedTreesExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Algo}
import org.apache.spark.mllib.util.MLUtils

/**
 * Gradient-boosted trees classification example: loads a LibSVM dataset, trains
 * a boosted ensemble for 3 iterations, and prints the test error and the model.
 *
 * NOTE(review): the scraped original lost the enclosing object/main wrapper and
 * the SparkContext construction (it used `sc` undefined and had dangling `}`);
 * both are reconstructed here so the file compiles.
 */
object GradientBoostedTreesExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("GradientBoostedTreesExample").setMaster("local[*]")
    val sc = new SparkContext(conf)

    // Load the data in LibSVM format.
    val data = MLUtils.loadLibSVMFile(sc, "../data/mllib/rf_libsvm_data.txt")

    // Randomly split into 70% training and 30% test sets.
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Classification boosting strategy with 3 iterations (regression is also supported).
    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = 3

    // Gradient-boosted decision trees: an ensemble of trees that reduces noise
    // and mitigates overfitting.
    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

    // Fraction of misclassified test points. The original counted MATCHES
    // (accuracy) yet printed it as "Test Error"; fixed to count mismatches.
    val testErr = testData.map { point =>
      val prediction = model.predict(point.features)
      if (point.label != prediction) 1.0 else 0.0
    }.mean()

    // Inspect the model ("\n" repaired from the garbled ":n" in the scrape).
    println("Test Error = " + testErr)
    println("Learned Random Forest:\n" + model.toDebugString)

    sc.stop()
  }
}