org.apache.spark.mllib.tree.RandomForest Scala Examples
The following examples show how to use org.apache.spark.mllib.tree.RandomForest.
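Before the full examples, here is a minimal sketch of the core API these examples build on. It assumes a local Spark context, a LIBSVM-formatted file at the hypothetical path data/sample_libsvm_data.txt, and illustrative hyperparameter values; it is not taken from any of the projects below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.util.MLUtils

object RandomForestMinimal {
  def main(args: Array[String]): Unit = {
    // Assumption: run locally; adjust the master and app name for a real cluster.
    val sc = new SparkContext(new SparkConf().setAppName("RandomForestMinimal").setMaster("local[*]"))

    // Load LIBSVM data and split it into training and test sets.
    val data = MLUtils.loadLibSVMFile(sc, "data/sample_libsvm_data.txt")
    val Array(training, test) = data.randomSplit(Array(0.7, 0.3), seed = 42L)

    // Train a binary classifier; an empty categoricalFeaturesInfo map treats all features as continuous.
    val model = RandomForest.trainClassifier(
      training,
      numClasses = 2,
      categoricalFeaturesInfo = Map[Int, Int](),
      numTrees = 10,
      featureSubsetStrategy = "auto",
      impurity = "gini",
      maxDepth = 5,
      maxBins = 32)

    // Compute the test error as the fraction of misclassified points.
    val testErr = test.map(p => if (model.predict(p.features) != p.label) 1.0 else 0.0).mean()
    println(s"Test Error = $testErr")

    sc.stop()
  }
}

The examples below follow the same pattern (load, split, train, evaluate), varying the data source, the Strategy configuration, and the evaluation metrics.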
Example 1
Source File: MLLibRandomForest.scala From reforest with Apache License 2.0
package reforest.example

import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.configuration.{Algo, QuantileStrategy, Strategy}
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.mllib.util.MLUtils
import reforest.rf.feature.RFStrategyFeatureSQRT
import reforest.rf.parameter._
import reforest.util.CCUtil

import scala.util.Random

object MLLibRandomForest {
  def main(args: Array[String]): Unit = {
    val property = RFParameterBuilder.apply
      .addParameter(RFParameterType.Dataset, "data/sample-covtype.libsvm")
      .addParameter(RFParameterType.NumFeatures, 54)
      .addParameter(RFParameterType.NumClasses, 10)
      .addParameter(RFParameterType.NumTrees, 100)
      .addParameter(RFParameterType.Depth, Array(10))
      .addParameter(RFParameterType.BinNumber, Array(8))
      .addParameter(RFParameterType.SparkMaster, "local[4]")
      .addParameter(RFParameterType.SparkCoresMax, 4)
      .addParameter(RFParameterType.SparkPartition, 4 * 4)
      .addParameter(RFParameterType.SparkExecutorMemory, "4096m")
      .addParameter(RFParameterType.SparkExecutorInstances, 1)
      .build

    val sc = CCUtil.getSparkContext(property)
    sc.setLogLevel("error")

    val timeStart = System.currentTimeMillis()
    val data = MLUtils.loadLibSVMFile(sc, property.dataset, property.numFeatures, property.sparkCoresMax * 2)
    val splits = data.randomSplit(Array(0.6, 0.2, 0.2), 0)
    val (trainingData, testData) = (splits(0), splits(2))

    // Train a RandomForest model.
    // val categoricalFeaturesInfo = Array.tabulate(200)(i => (i, 5)).toMap
    val categoricalFeaturesInfo = Map[Int, Int]()
    val featureSubsetStrategy = "sqrt"
    val impurity = "entropy"

    val s = new Strategy(Algo.Classification, Entropy, property.getMaxDepth, property.numClasses,
      property.getMaxBinNumber, QuantileStrategy.Sort, categoricalFeaturesInfo, 1)
    val model = RandomForest.trainClassifier(trainingData, s, property.getMaxNumTrees,
      featureSubsetStrategy, Random.nextInt())
    val timeEnd = System.currentTimeMillis()

    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }

    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Time: " + (timeEnd - timeStart))
    println("Test Error = " + testErr)
    if (property.outputTree) {
      println("Learned classification forest model:\n" + model.toDebugString)
    }
  }
}
Example 2
Source File: MLLibRandomForestFromFile.scala From reforest with Apache License 2.0
package reforest.example

import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.configuration.{Algo, QuantileStrategy, Strategy}
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.mllib.util.MLUtils
import reforest.rf.feature.RFStrategyFeatureSQRT
import reforest.rf.parameter._
import reforest.util.{CCUtil, CCUtilIO}

import scala.util.Random

object MLLibRandomForestFromFile {
  def main(args: Array[String]): Unit = {
    val property = RFParameterFromFile(args(0)).applyAppName("MLLib")

    val sc = CCUtil.getSparkContext(property)
    sc.setLogLevel("error")

    val timeStart = System.currentTimeMillis()
    val data = MLUtils.loadLibSVMFile(sc, property.dataset, property.numFeatures, property.sparkCoresMax * 2)
    val splits = data.randomSplit(Array(0.7, 0.3), 0)
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a RandomForest model.
    // val categoricalFeaturesInfo = Array.tabulate(200)(i => (i, 5)).toMap
    val categoricalFeaturesInfo = Map[Int, Int]()
    val featureSubsetStrategy = "sqrt"
    val impurity = "entropy"

    val s = new Strategy(Algo.Classification, Entropy, property.getMaxDepth, property.numClasses,
      property.getMaxBinNumber, QuantileStrategy.Sort, categoricalFeaturesInfo, 1)
    val model = RandomForest.trainClassifier(trainingData, s, property.getMaxNumTrees,
      featureSubsetStrategy, Random.nextInt())
    val timeEnd = System.currentTimeMillis()

    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }

    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    CCUtilIO.logACCURACY(property, (1 - testErr), (timeEnd - timeStart))
    println("Time: " + (timeEnd - timeStart))
    println("Test Error = " + testErr)
    if (property.outputTree) {
      println("Learned classification forest model:\n" + model.toDebugString)
    }
  }
}
Example 3
Source File: RandomForestClassifierExample.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.tree.configuration.Strategy

object RandomForestClassifierExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RandomForestClassifierExample")
    val sc = new SparkContext(conf)

    // Load the data
    val data = MLUtils.loadLibSVMFile(sc, "../data/mllib/rf_libsvm_data.txt")
    // Randomly split the data into a training set and a test set
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Create a classification tree strategy (random forests also support regression)
    val treeStrategy = Strategy.defaultStrategy("Classification")
    // Train the model
    val model = RandomForest.trainClassifier(trainingData, treeStrategy, numTrees = 3,
      featureSubsetStrategy = "auto", seed = 12345)

    // Evaluate the model on test instances and compute the test error
    // (fraction of misclassified points)
    val testErr = testData.map { point =>
      val prediction = model.predict(point.features)
      if (point.label == prediction) 0.0 else 1.0
    }.mean()

    // Inspect the model
    println("Test Error = " + testErr)
    println("Learned Random Forest:\n" + model.toDebugString)
  }
}
Example 4
Source File: RandomForestDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter11.SparkMachineLearning

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.evaluation.MulticlassMetrics

object RandomForestDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAExample") //.setMaster("local[*]")
    val sc = new SparkContext(conf)

    val filePath = args(0)
    val data = MLUtils.loadLibSVMFile(sc, filePath)
    val splits = data.randomSplit(Array(0.75, 0.25), seed = 12345L)
    val training = splits(0).cache()
    val test = splits(1)

    // Train a RandomForest model. An empty categoricalFeaturesInfo indicates
    // that all features are continuous.
    val numClasses = 10
    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 50 // Use more in practice.
    val featureSubsetStrategy = "auto" // Let the algorithm choose.
    val impurity = "gini"
    val maxDepth = 30
    val maxBins = 32

    val model = RandomForest.trainClassifier(training, numClasses, categoricalFeaturesInfo,
      numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

    // Evaluate the model on test instances and compute the test error
    val labelAndPreds = test.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }

    val metrics = new MulticlassMetrics(labelAndPreds)

    // Confusion matrix
    println("Confusion matrix:")
    println(metrics.confusionMatrix)

    // Overall statistics
    val accuracy = metrics.accuracy
    println("Summary Statistics")
    println(s"Accuracy = $accuracy")

    // Precision by label
    val labels = metrics.labels
    labels.foreach { l =>
      println(s"Precision($l) = " + metrics.precision(l))
    }
    // Recall by label
    labels.foreach { l =>
      println(s"Recall($l) = " + metrics.recall(l))
    }
    // False positive rate by label
    labels.foreach { l =>
      println(s"FPR($l) = " + metrics.falsePositiveRate(l))
    }
    // F-measure by label
    labels.foreach { l =>
      println(s"F1-Score($l) = " + metrics.fMeasure(l))
    }
    // Weighted stats
    println(s"Weighted precision: ${metrics.weightedPrecision}")
    println(s"Weighted recall: ${metrics.weightedRecall}")
    println(s"Weighted F1 score: ${metrics.weightedFMeasure}")
    println(s"Weighted false positive rate: ${metrics.weightedFalsePositiveRate}")

    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / test.count()
    println("Accuracy = " + (1 - testErr) * 100 + " %")
    //println("Learned classification forest model:\n" + model.toDebugString)
  }
}
Example 5
Source File: RandomForestClassification.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml

import com.intel.hibench.sparkbench.common.IOCommon
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import scopt.OptionParser

object RandomForestClassification {

  case class Params(
      inputPath: String = null,
      numTrees: Int = 3,
      numClasses: Int = 2,
      featureSubsetStrategy: String = "auto",
      impurity: String = "gini",
      maxDepth: Int = 4,
      maxBins: Int = 32)

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("RF") {
      head("RF: an example app.")
      opt[Int]("numTrees")
        .text(s"numTrees, default: ${defaultParams.numTrees}")
        .action((x, c) => c.copy(numTrees = x))
      opt[Int]("numClasses")
        .text(s"numClasses, default: ${defaultParams.numClasses}")
        .action((x, c) => c.copy(numClasses = x))
      opt[Int]("maxDepth")
        .text(s"maxDepth, default: ${defaultParams.maxDepth}")
        .action((x, c) => c.copy(maxDepth = x))
      opt[Int]("maxBins")
        .text(s"maxBins, default: ${defaultParams.maxBins}")
        .action((x, c) => c.copy(maxBins = x))
      opt[String]("featureSubsetStrategy")
        .text(s"featureSubsetStrategy, default: ${defaultParams.featureSubsetStrategy}")
        .action((x, c) => c.copy(featureSubsetStrategy = x))
      opt[String]("impurity")
        .text(s"impurity, default: ${defaultParams.impurity}")
        .action((x, c) => c.copy(impurity = x))
      arg[String]("<inputPath>")
        .required()
        .text("Input path of dataset")
        .action((x, c) => c.copy(inputPath = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"RFC with $params")
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data file.
    val data: RDD[LabeledPoint] = sc.objectFile(params.inputPath)

    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a RandomForest model.
    // An empty categoricalFeaturesInfo indicates that all features are continuous.
    val categoricalFeaturesInfo = Map[Int, Int]()

    val model = RandomForest.trainClassifier(trainingData, params.numClasses, categoricalFeaturesInfo,
      params.numTrees, params.featureSubsetStrategy, params.impurity, params.maxDepth, params.maxBins)

    // Evaluate the model on test instances and compute the test error
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Test Error = " + testErr)

    sc.stop()
  }
}
Example 6
Source File: MyRandomForestClassification.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter10

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.sql.SparkSession

object MyRandomForestClassification {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MyRandomForestClassification")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    println("Training Data count:" + trainingData.count())
    println("Test Data Count:" + testData.count())

    val numClasses = 2
    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 3 // Use more in practice.
    val featureSubsetStrategy = "auto" // Let the algorithm choose.
    // val impurity = "gini"
    val maxDepth = 4
    val maxBins = 32

    evaluate(trainingData, testData, numClasses, categoricalFeaturesInfo, numTrees,
      featureSubsetStrategy, "gini", maxDepth, maxBins)
    evaluate(trainingData, testData, numClasses, categoricalFeaturesInfo, numTrees,
      featureSubsetStrategy, "entropy", maxDepth, maxBins)
    println("=============")
    spark.stop()
  }

  def evaluate(
      trainingData: RDD[LabeledPoint],
      testData: RDD[LabeledPoint],
      numClasses: Int,
      categoricalFeaturesInfo: Map[Int, Int],
      numTrees: Int,
      featureSubsetStrategy: String,
      impurity: String,
      maxDepth: Int,
      maxBins: Int): Unit = {

    val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
      numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)
    val metrics = getMetrics(model, testData)

    println("Using Impurity :" + impurity)
    println("Confusion Matrix :")
    println(metrics.confusionMatrix)
    println("Model Accuracy: " + metrics.precision)
    println("Model Error: " + (1 - metrics.precision))
    // (0 until numClasses).map(
    //   category => (metrics.precision(category), metrics.recall(category))
    // ).foreach(println)
    println("My Random Forest Model:\n" + model.toDebugString)
  }

  def getMetrics(model: RandomForestModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionsAndLabels = data.map(example =>
      (model.predict(example.features), example.label)
    )
    new MulticlassMetrics(predictionsAndLabels)
  }
}
Example 7
Source File: MyRandomForestRegression.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter10

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.sql.SparkSession

object MyRandomForestRegression {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MyRandomForestRegression")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    println("Training Data count:" + trainingData.count())
    println("Test Data Count:" + testData.count())

    val numClasses = 2
    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 3 // Use more in practice.
    val featureSubsetStrategy = "auto" // Let the algorithm choose.
    val impurity = "variance"
    val maxDepth = 4
    val maxBins = 32

    val model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo,
      numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

    val metrics = getMetrics(model, testData)
    println("Test Mean Squared Error = " + metrics.meanSquaredError)
    println("My Random Forest model:\n" + model.toDebugString)

    spark.stop()
  }

  def getMetrics(model: RandomForestModel, data: RDD[LabeledPoint]): RegressionMetrics = {
    val predictionsAndLabels = data.map(example =>
      (model.predict(example.features), example.label)
    )
    new RegressionMetrics(predictionsAndLabels)
  }
}