org.apache.spark.mllib.tree.RandomForest Scala Examples

The following examples show how to use org.apache.spark.mllib.tree.RandomForest. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Source File: MLLibRandomForest.scala    From reforest   with Apache License 2.0 5 votes vote down vote up
package reforest.example

import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.configuration.{Algo, QuantileStrategy, Strategy}
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.mllib.util.MLUtils
import reforest.rf.feature.RFStrategyFeatureSQRT
import reforest.rf.parameter._
import reforest.util.CCUtil

import scala.util.Random

/**
 * Trains an MLlib random-forest classifier on a LibSVM file and prints the
 * elapsed time and the test error.
 *
 * All settings (dataset path, forest size, Spark resources) are fixed in the
 * `RFParameterBuilder` chain below; `args` is not read.
 */
object MLLibRandomForest {
  def main(args: Array[String]): Unit = {

    val property = RFParameterBuilder.apply
      .addParameter(RFParameterType.Dataset, "data/sample-covtype.libsvm")
      .addParameter(RFParameterType.NumFeatures, 54)
      .addParameter(RFParameterType.NumClasses, 10)
      .addParameter(RFParameterType.NumTrees, 100)
      .addParameter(RFParameterType.Depth, Array(10))
      .addParameter(RFParameterType.BinNumber, Array(8))
      .addParameter(RFParameterType.SparkMaster, "local[4]")
      .addParameter(RFParameterType.SparkCoresMax, 4)
      .addParameter(RFParameterType.SparkPartition, 4*4)
      .addParameter(RFParameterType.SparkExecutorMemory, "4096m")
      .addParameter(RFParameterType.SparkExecutorInstances, 1)

    val sc = CCUtil.getSparkContext(property)

    // The reported time runs from just before the load call to just after training.
    val timeStart = System.currentTimeMillis()
    val data = MLUtils.loadLibSVMFile(sc, property.dataset, property.numFeatures, property.sparkCoresMax * 2)

    // Three-way split with a fixed seed; only the first (training) and the
    // last (test) parts are used, the middle part is ignored.
    val splits = data.randomSplit(Array(0.6, 0.2, 0.2), 0)
    val (trainingData, testData) = (splits(0), splits(2))

    // Train a RandomForest model.
    //    val categoricalFeaturesInfo = Array.tabulate(200)(i => (i, 5)).toMap
    // An empty map means no feature is declared categorical.
    val categoricalFeaturesInfo = Map[Int, Int]()
    val featureSubsetStrategy = "sqrt"

    val s = new Strategy(
      Algo.Classification,
      Entropy,
      property.getMaxDepth,
      property.numClasses,
      property.getMaxBinNumber,
      QuantileStrategy.Sort,
      categoricalFeaturesInfo,
      1)

    // The seed is drawn at random, so the trained forest differs between runs.
    val model = RandomForest.trainClassifier(trainingData, s, property.getMaxNumTrees, featureSubsetStrategy, Random.nextInt())
    val timeEnd = System.currentTimeMillis()

    // Pair every test label with the forest's prediction for that point.
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }

    // Test error = fraction of test points whose prediction differs from the label.
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Time: "+(timeEnd-timeStart))
    println("Test Error = " + testErr)
    if (property.outputTree) {
      println("Learned classification forest model:\n" + model.toDebugString)
    }
  }
}
Example 2
Source File: MLLibRandomForestFromFile.scala    From reforest   with Apache License 2.0 5 votes vote down vote up
package reforest.example

import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.configuration.{Algo, QuantileStrategy, Strategy}
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.mllib.util.MLUtils
import reforest.rf.feature.RFStrategyFeatureSQRT
import reforest.rf.parameter._
import reforest.util.{CCUtil, CCUtilIO}

import scala.util.Random

/**
 * Trains an MLlib random-forest classifier using settings read from the
 * parameter file named by `args(0)`, then logs accuracy and elapsed time and
 * prints the test error.
 */
object MLLibRandomForestFromFile {
  def main(args: Array[String]): Unit = {

    // args(0): path of the parameter file handed to RFParameterFromFile.
    val property = RFParameterFromFile(args(0)).applyAppName("MLLib")

    val sc = CCUtil.getSparkContext(property)

    // The reported time runs from just before the load call to just after training.
    val timeStart = System.currentTimeMillis()
    val data = MLUtils.loadLibSVMFile(sc, property.dataset, property.numFeatures, property.sparkCoresMax * 2)

    // 70/30 train/test split with a fixed seed.
    val splits = data.randomSplit(Array(0.7, 0.3), 0)
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a RandomForest model.
    //    val categoricalFeaturesInfo = Array.tabulate(200)(i => (i, 5)).toMap
    // An empty map means no feature is declared categorical.
    val categoricalFeaturesInfo = Map[Int, Int]()
    val featureSubsetStrategy = "sqrt"

    val s = new Strategy(
      Algo.Classification,
      Entropy,
      property.getMaxDepth,
      property.numClasses,
      property.getMaxBinNumber,
      QuantileStrategy.Sort,
      categoricalFeaturesInfo,
      1)

    // The seed is drawn at random, so the trained forest differs between runs.
    val model = RandomForest.trainClassifier(trainingData, s, property.getMaxNumTrees, featureSubsetStrategy, Random.nextInt())
    val timeEnd = System.currentTimeMillis()

    // Pair every test label with the forest's prediction for that point.
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }

    // Test error = fraction of test points whose prediction differs from the label.
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    CCUtilIO.logACCURACY(property, (1-testErr), (timeEnd-timeStart))
    println("Time: "+(timeEnd-timeStart))
    println("Test Error = " + testErr)
    if (property.outputTree) {
      println("Learned classification forest model:\n" + model.toDebugString)
    }
  }
}
Example 3
Source File: RandomForestClassifierExample.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.tree.configuration.Strategy

    // Load the data.
    // NOTE(review): `sc` is not defined in this excerpt; presumably a
    // SparkContext created by an enclosing main method — confirm.
    val data = MLUtils.loadLibSVMFile(sc, "../data/mllib/rf_libsvm_data.txt")
    // Randomly split the data into two parts: one for training, one for testing.
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))
    val treeStrategy = Strategy.defaultStrategy("Classification")
    val model = RandomForest.trainClassifier(trainingData, treeStrategy, numTrees = 3,
      featureSubsetStrategy = "auto", seed = 12345)
    // Score each test point 1.0 when the prediction differs from the label and
    // 0.0 otherwise, so the mean is the misclassification rate that the
    // "Test Error" label below promises.
    val testErr = testData.map { point =>
      val prediction = model.predict(point.features)
      if (point.label != prediction) 1.0 else 0.0
    }.mean() // average over all test points
    println("Test Error = " + testErr)
    println("Learned Random Forest:\n" + model.toDebugString)
Example 4
Source File: RandomForestDemo.scala    From Scala-and-Spark-for-Big-Data-Analytics   with MIT License 5 votes vote down vote up
package com.chapter11.SparkMachineLearning

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.evaluation.MulticlassMetrics

/**
 * Trains an MLlib random-forest classifier on the LibSVM file given as
 * `args(0)` and prints multiclass evaluation metrics for the held-out split.
 */
object RandomForestDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAExample")//.setMaster("local[*]")
    val sc = new SparkContext(conf)
    val filePath = args(0)

    val data = MLUtils.loadLibSVMFile(sc, filePath)

    // 75/25 train/test split with a fixed seed; the training part is cached
    // because forest training reads it repeatedly.
    val splits = data.randomSplit(Array(0.75, 0.25), seed = 12345L)
    val training = splits(0).cache()
    val test = splits(1)

    // Train a RandomForest model; an empty categoricalFeaturesInfo indicates all features are continuous.
    val numClasses = 10
    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 50 // Use more in practice.
    val featureSubsetStrategy = "auto" // Let the algorithm choose.
    val impurity = "gini"
    val maxDepth = 30
    val maxBins = 32

    // The argument list is rebuilt from the settings declared just above.
    val model = RandomForest.trainClassifier(training, numClasses, categoricalFeaturesInfo,
      numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

    // Evaluate model on test instances and compute test error
    val labelAndPreds = test.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val metrics = new MulticlassMetrics(labelAndPreds)

    // Confusion matrix
    println("Confusion matrix:")
    println(metrics.confusionMatrix)

    // Overall Statistics
    val accuracy = metrics.accuracy
    println("Summary Statistics")
    println(s"Accuracy = $accuracy")

    // Precision by label
    val labels = metrics.labels
    labels.foreach { l =>
      println(s"Precision($l) = " + metrics.precision(l))
    }

    // Recall by label
    labels.foreach { l =>
      println(s"Recall($l) = " + metrics.recall(l))
    }

    // False positive rate by label
    labels.foreach { l =>
      println(s"FPR($l) = " + metrics.falsePositiveRate(l))
    }

    // F-measure by label
    labels.foreach { l =>
      println(s"F1-Score($l) = " + metrics.fMeasure(l))
    }

    // Weighted stats
    println(s"Weighted precision: ${metrics.weightedPrecision}")
    println(s"Weighted recall: ${metrics.weightedRecall}")
    println(s"Weighted F1 score: ${metrics.weightedFMeasure}")
    println(s"Weighted false positive rate: ${metrics.weightedFalsePositiveRate}")

    // Accuracy recomputed directly from the (label, prediction) pairs.
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / test.count()
    println("Accuracy = " + (1-testErr) * 100 + " %")
    //println("Learned classification forest model:\n" + model.toDebugString)
  }
}
Example 5
Source File: RandomForestClassification.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import scopt.OptionParser

/**
 * Command-line application that trains an MLlib random-forest classifier on
 * an object file of `LabeledPoint`s and prints the test error.
 */
object RandomForestClassification {

  /**
   * Run-time settings. Every field has a usable default except `inputPath`,
   * which stays null until the command line supplies it.
   */
  case class Params(
    inputPath: String = null,
    numTrees: Int = 3,
    numClasses: Int = 2,
    featureSubsetStrategy: String = "auto",
    impurity: String = "gini",
    maxDepth: Int = 4,
    maxBins: Int = 32)

  /** Parses `args` into [[Params]] and runs training; exits with status 1 when parsing fails. */
  def main(args: Array[String]): Unit = {
    val defaultParams = Params()
    val parser = new OptionParser[Params]("RF") {
      head("RF: an example app.")
      // NOTE(review): the opt/arg declarations were missing from this listing.
      // Their names are reconstructed from the help texts and the Params
      // fields — confirm against the original project.
      opt[Int]("numTrees")
        .text(s"numTrees, default: ${defaultParams.numTrees}")
        .action((x, c) => c.copy(numTrees = x))
      opt[Int]("numClasses")
        .text(s"numClasses, default: ${defaultParams.numClasses}")
        .action((x, c) => c.copy(numClasses = x))
      opt[Int]("maxDepth")
        .text(s"maxDepth, default: ${defaultParams.maxDepth}")
        .action((x, c) => c.copy(maxDepth = x))
      opt[Int]("maxBins")
        .text(s"maxBins, default: ${defaultParams.maxBins}")
        .action((x, c) => c.copy(maxBins = x))
      opt[String]("featureSubsetStrategy")
        .text(s"featureSubsetStrategy, default: ${defaultParams.featureSubsetStrategy}")
        .action((x, c) => c.copy(featureSubsetStrategy = x))
      opt[String]("impurity")
        .text(s"impurity (gini or entropy), default: ${defaultParams.impurity}")
        .action((x, c) => c.copy(impurity = x))
      arg[String]("<inputPath>")
        .text("Input path of dataset")
        .action((x, c) => c.copy(inputPath = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  /**
   * Loads the dataset, trains the forest and prints the test error.
   *
   * @param params settings controlling the input path and the forest
   */
  def run(params: Params): Unit = {
    // NOTE(review): the empty key/value pair looks like extraction residue — confirm.
    val conf = new SparkConf().setAppName(s"RFC with $params")
                              .set("spark.shuffle.compress", "false")
                              .set("", "")
                              .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data file.
    val data: RDD[LabeledPoint] = sc.objectFile(params.inputPath)

    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Train a RandomForest model.
    // Empty categoricalFeaturesInfo indicates all features are continuous.

    val categoricalFeaturesInfo = Map[Int, Int]()

    val model = RandomForest.trainClassifier(trainingData, params.numClasses, categoricalFeaturesInfo,
      params.numTrees, params.featureSubsetStrategy, params.impurity, params.maxDepth, params.maxBins)

    // Evaluate model on test instances and compute test error
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Test Error = " + testErr)
  }
}

Example 6

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.sql.SparkSession

/**
 * Trains MLlib random-forest classifiers on comma-separated text data, once
 * with "gini" and once with "entropy" impurity, and prints metrics for each.
 */
object MyRandomForestClassification {
  def main(args: Array[String]): Unit = {

    // NOTE(review): the builder chain was truncated in this listing. The app
    // name is reconstructed, and no master is set here, so it has to come
    // from the launcher (e.g. spark-submit) — confirm.
    val spark = SparkSession
      .builder()
      .appName("MyRandomForestClassification")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/")
    // Drop blank lines, '#' lines and rows containing '?'.
    // NOTE(review): the receiver of `.filter` was missing from this listing;
    // `rawData.map(_.trim)` is a reconstruction — confirm.
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Skip the first column; the last column is the raw class value.
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // Rescale the raw class value x to x / 2 - 1 (e.g. 2 -> 0.0, 4 -> 1.0).
        val label = values.last / 2 -1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    println("Training Data count:"+trainingData.count())
    println("Test Data Count:"+testData.count())

    val numClasses = 2
    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 3 // Use more in practice.
    val featureSubsetStrategy = "auto" // Let the algorithm choose.
//    val impurity = "gini"
    val maxDepth = 4
    val maxBins = 32

    evaluate(trainingData, testData, numClasses,categoricalFeaturesInfo,numTrees,
      featureSubsetStrategy, "gini", maxDepth, maxBins)
    evaluate(trainingData, testData, numClasses,categoricalFeaturesInfo,numTrees,
      featureSubsetStrategy, "entropy", maxDepth, maxBins)
  }

  /**
   * Trains one forest with the given settings and prints its confusion
   * matrix, accuracy, error and debug string.
   *
   * @param trainingData            points used to fit the forest
   * @param testData                points used to compute the metrics
   * @param numClasses              number of target classes
   * @param categoricalFeaturesInfo feature index -> arity; empty when all features are continuous
   * @param numTrees                number of trees in the forest
   * @param featureSubsetStrategy   strategy name passed through to MLlib
   * @param impurity                impurity name passed through to MLlib
   * @param maxDepth                maximum tree depth
   * @param maxBins                 maximum number of bins
   */
  def evaluate(
      trainingData: RDD[LabeledPoint],
      testData: RDD[LabeledPoint],
      numClasses: Int,
      categoricalFeaturesInfo: Map[Int, Int],
      numTrees: Int,
      featureSubsetStrategy: String,
      impurity: String,
      maxDepth: Int,
      maxBins: Int): Unit = {

    val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
       numTrees, featureSubsetStrategy,impurity,  maxDepth, maxBins)
    val metrics = getMetrics(model, testData)
    println("Using Impurity :"+ impurity)
    println("Confusion Matrix :")
    println(metrics.confusionMatrix)
    // `accuracy` replaces the deprecated zero-argument `precision`.
    println("Model Accuracy: "+metrics.accuracy)
    println("Model Error: "+ (1-metrics.accuracy))
//    (0 until numClasses).map(
//      category => (metrics.precision(category), metrics.recall(category))
//    ).foreach(println)
    println("My Random Forest Model:\n" + model.toDebugString)
  }

  /**
   * Builds multiclass metrics from the model's predictions on `data`.
   *
   * @param model trained forest
   * @param data  labeled points to score
   * @return metrics over (prediction, label) pairs
   */
  def getMetrics(model: RandomForestModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionsAndLabels = data.map { example =>
      (model.predict(example.features), example.label)
    }
    new MulticlassMetrics(predictionsAndLabels)
  }
}
Example 7

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.sql.SparkSession

/**
 * Trains an MLlib random-forest regressor on comma-separated text data and
 * prints the test mean squared error and the learned model.
 */
object MyRandomForestRegression {
  def main(args: Array[String]): Unit = {

    // NOTE(review): the builder chain was truncated in this listing. The app
    // name is reconstructed, and no master is set here, so it has to come
    // from the launcher (e.g. spark-submit) — confirm.
    val spark = SparkSession
      .builder()
      .appName("MyRandomForestRegression")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/")
    // Drop blank lines, '#' lines and rows containing '?'.
    // NOTE(review): the receiver of `.filter` was missing from this listing;
    // `rawData.map(_.trim)` is a reconstruction — confirm.
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        // Skip the first column; the last column is the raw target value.
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        // Rescale the raw target value x to x / 2 - 1.
        val label = values.last / 2 -1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    println("Training Data count:"+trainingData.count())
    println("Test Data Count:"+testData.count())

    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 3 // Use more in practice.
    val featureSubsetStrategy = "auto" // Let the algorithm choose.
    val impurity = "variance"
    val maxDepth = 4
    val maxBins = 32

    val model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo,
      numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

    val metrics = getMetrics(model, testData)

    println("Test Mean Squared Error = " + metrics.meanSquaredError)
    println("My Random Forest model:\n" + model.toDebugString)
  }

  /**
   * Builds regression metrics from the model's predictions on `data`.
   *
   * @param model trained forest
   * @param data  labeled points to score
   * @return metrics over (prediction, label) pairs
   */
  def getMetrics(model: RandomForestModel, data: RDD[LabeledPoint]): RegressionMetrics = {
    val predictionsAndLabels = data.map { example =>
      (model.predict(example.features), example.label)
    }
    new RegressionMetrics(predictionsAndLabels)
  }
}
// scalastyle:on println