org.apache.spark.mllib.tree.DecisionTree Scala Examples

The following examples show how to use org.apache.spark.mllib.tree.DecisionTree. Each example is drawn from an open-source project; where available, the source file, project, and license are noted above the code.
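Before the project examples, here is a minimal, self-contained sketch of the basic call pattern, assuming a LIBSVM-format data file (the object name and path below are illustrative, not taken from any of the projects):

import org.apache.spark.SparkContext
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.util.MLUtils

object DecisionTreeMinimal {
  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "DecisionTreeMinimal")
    // load labeled data in LIBSVM format (placeholder path)
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    // an empty categoricalFeaturesInfo map treats every feature as continuous
    val model = DecisionTree.trainClassifier(data, numClasses = 2,
      categoricalFeaturesInfo = Map[Int, Int](), impurity = "gini",
      maxDepth = 5, maxBins = 32)
    // predict the label of the first feature vector
    println(model.predict(data.first().features))
    sc.stop()
  }
}

The same pattern recurs throughout the examples below: trainClassifier for classification, and trainRegressor (with impurity "variance") for regression.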
Example 1
Source File: PipeClassificationDecisionTree.scala    From sddf    with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable
import de.unihamburg.vsis.sddf.Parameterized
import org.apache.spark.mllib.classification.ClassificationModel

class PipeClassificationDecisionTree(
    impurity: String = "gini",
    maxDepth: Int = 5,
    maxBins: Int = 32)
  extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("impurity", impurity), ("maxDepth", maxDepth), ("maxBins", maxBins))

  def trainModelAndClassify(
    trainingData: RDD[LabeledPoint],
    symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {
    val model = DecisionTree.trainClassifier(trainingData, numClasses = 2,
      categoricalFeaturesInfo = Map[Int, Int](), impurity, maxDepth, maxBins)

    log.debug("Decision Tree Model:" + model)
    log.debug("Decision Tree:" + model.toDebugString)

    // Missing values are marked as Not Equal (0).
    // Predict a class for each pair's similarity feature vector.
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }

}

object PipeClassificationDecisionTree {
  def apply(
    impurity: String = "gini",
    maxDepth: Int = 5,
    maxBins: Int = 32) = {
    new PipeClassificationDecisionTree(impurity, maxDepth, maxBins)
  }
} 
Example 2
package org.sparksamples.decisiontree

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object DecisionTreeUtil {

  def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = {
    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2

    println(numData.toString())
    records.cache()
    print("Mapping of first categorical feature column: " +  Util.get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = Util.get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach(m => catLen += m.size)

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    val data = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    }
    val data_dt = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r)))
    }

    val splits = data_dt.randomSplit(Array(0.8, 0.2), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)
    (training, test)
  }

  def evaluate(train: RDD[LabeledPoint], test: RDD[LabeledPoint],
               categoricalFeaturesInfo: scala.Predef.Map[Int, Int],
               maxDepth: Int, maxBins: Int): Double = {
    val impurity = "variance"
    val decisionTreeModel = DecisionTree.trainRegressor(train, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins)

    val true_vs_predicted = test.map(p => (p.label, decisionTreeModel.predict(p.features)))
    val rmsle = Math.sqrt(true_vs_predicted.map { case (t, p) => Util.squaredLogError(t, p) }.mean())
    rmsle
  }

} 
Example 3
package org.sparksamples.decisiontree

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object DecisionTreeCategoricalFeaturesApp{

  def get_mapping(rdd: RDD[Array[String]], idx: Int): Map[String, Long] = {
    rdd.map(fields => fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    val save = true
    val sc = Util.sc

    // read the raw hourly bike-sharing CSV and split each line into an array of fields
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()

    val records = rawData.map(line => line.split(","))
    val first = records.first()

    println(numData.toInt)
    records.cache()
    print("Mapping of first categorical feature column: " +  get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach(m => catLen += m.size)

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features:"+ catLen)
    println("Feature vector length for numerical features:" + numLen)
    println("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    }
    val data_dt = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r)))
    }
    val first_point = data_dt.first()
    println("Decision Tree feature vector:" + first_point.features.toString)
    println("Decision Tree feature vector length: " + first_point.features.size)


    def getCatFeatures(): scala.Predef.Map[Int, Int] = {
      var d = scala.Predef.Map[Int, Int]()
      for (a <- 2 until 10) {
        d += (a - 2 -> (get_mapping(records, a).size + 1))
      }
      d
    }
    val cat_features = getCatFeatures()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32
    val decisionTreeModel = DecisionTree.trainRegressor(data_dt, cat_features,
      impurity, maxDepth, maxBins)

    val preds = decisionTreeModel.predict(data_dt.map( p=> p.features))
    val actual = data.map( p=> p.label)
    val true_vs_predicted_dt = actual.zip(preds)
    val true_vs_predicted_csv = data.map(p => p.label + " ,"  + decisionTreeModel.predict(p.features))

    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    if (save){
      true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_categorical_" + date + ".csv")
    }

    print("Decision Tree depth: " + decisionTreeModel.depth)
    print("Decision Tree number of nodes: " + decisionTreeModel.numNodes)
    Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree Categorical Features")
  }

} 
Example 4
package org.sparksamples.decisiontree

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object DecisionTreeWithLog{

  def get_mapping(rdd: RDD[Array[String]], idx: Int): Map[String, Long] = {
    rdd.map(fields => fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    val save = false
    val sc = Util.sc

    // read the raw hourly bike-sharing CSV and split each line into an array of fields
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()

    val records = rawData.map(line => line.split(","))
    val first = records.first()

    println(numData.toInt)
    records.cache()
    print("Mapping of first categorical feature column: " +  get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach(m => catLen += m.size)

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features:"+ catLen)
    println("Feature vector length for numerical features:" + numLen)
    println("Total feature vector length: " + totalLen)


    val data_dt = {
      records.map(r => LabeledPoint(Math.log(Util.extractLabel(r)), Util.extract_features_dt(r)))
    }
    val first_point = data_dt.first()
    println("Decision Tree feature vector:" + first_point.features.toString)
    println("Decision Tree feature vector length: " + first_point.features.size)

    val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32

    val decisionTreeModel = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins )

    val preds = decisionTreeModel.predict(data_dt.map(p => p.features))
    // invert the log transform on both predictions and labels before comparing
    val preds_2 = preds.map(p => Math.exp(p))
    val actual = data_dt.map(p => Math.exp(p.label))
    val true_vs_predicted_dt = actual.zip(preds_2)

    if(save){
      val true_vs_predicted_csv = data_dt.map(p => p.label + " ,"  + decisionTreeModel.predict(p.features))
      val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
      val date = format.format(new java.util.Date())
      true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_" + date + ".csv")
    }

    print("Decision Tree depth: " + decisionTreeModel.depth)
    print("Decision Tree number of nodes: " + decisionTreeModel.numNodes)

    Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree With Log")
    Util.sc.stop()
  }

} 
Example 5
Source File: DecisionTreeExample.scala    From spark1.52    with Apache License 2.0
package org.apache.spark.examples.mllib
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object DecisionTreeExample {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("DecisionTreeExample")
    val sc = new SparkContext(sparkConf)
    // load the file
    val data = sc.textFile("../data/mllib/tennis.csv")
    // parse the data and load it into LabeledPoints
    val parsedData = data.map { line =>
      val parts = line.split(',').map(_.toDouble)
      // a LabeledPoint pairs a label with a local vector (dense or sparse)
      LabeledPoint(parts(0), Vectors.dense(parts.tail))
    }
    // train the algorithm on this data
    val model = DecisionTree.train(parsedData, Classification, Entropy, 3)
    // create a vector representing "no rain, strong wind, low temperature"
    val v = Vectors.dense(0.0, 1.0, 0.0)
    // predict whether to play tennis
    println(model.predict(v))
    sc.stop()
  }
}
Example 6
Source File: DecisionTreeTest.scala    From spark1.52    with Apache License 2.0
package org.apache.spark.examples.mllib
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Gini

object DecisionTreeTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("DecisionTreeTest")
    val sc = new SparkContext(sparkConf)
    val data = sc.textFile("../data/mllib/sample_tree_data.csv")
    val parsedData = data.map { line =>
      val parts = line.split(',').map(_.toDouble)
      // a LabeledPoint pairs a label with a local vector (dense or sparse)
      LabeledPoint(parts(0), Vectors.dense(parts.tail))
    }

    val maxDepth = 5 // maximum tree depth; a stopping criterion to help prevent overfitting
    val model = DecisionTree.train(parsedData, Classification, Gini, maxDepth)

    val labelAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val trainErr = labelAndPreds.filter(r => r._1 != r._2).count().toDouble / parsedData.count
    println("Training Error = " + trainErr)
  }
} 
Example 7
package spark.ml.cookbook.chapter4


import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.sql.SparkSession

object MyRegressionMetrics {
  def main(args: Array[String]): Unit = {


    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("myRegressionMetrics")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter4/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        val slicedValues = values.slice(1, values.size) // drop the sample id column
        val featureVector = Vectors.dense(slicedValues.init)
        val label = values.last / 2 - 1 // map class labels 2/4 to 0.0/1.0
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    val categoricalFeaturesInfo = Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32

    val model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo, impurity,
      maxDepth, maxBins)
    // Instantiate metrics object

    val predictionsAndLabels = testData.map(example =>
      (model.predict(example.features), example.label)
    )

    val metrics = new RegressionMetrics(predictionsAndLabels)

    // Squared error
    println(s"MSE = ${metrics.meanSquaredError}")
    println(s"RMSE = ${metrics.rootMeanSquaredError}")

    // R-squared
    println(s"R-squared = ${metrics.r2}")

    // Mean absolute error
    println(s"MAE = ${metrics.meanAbsoluteError}")

    // Explained variance
    println(s"Explained variance = ${metrics.explainedVariance}")

    spark.stop()
  }
} 
Example 8
package spark.ml.cookbook.chapter10

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession


object MyDecisionTreeRegression {

  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MyDecisionTreeRegression")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        val slicedValues = values.slice(1, values.size) // drop the sample id column
        val featureVector = Vectors.dense(slicedValues.init)
        val label = values.last / 2 - 1 // map class labels 2/4 to 0.0/1.0
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    val categoricalFeaturesInfo = Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32

    val model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo, impurity,
      maxDepth, maxBins)

    val metrics = getMetrics(model, testData)

    println("Test Mean Squared Error = " + metrics.meanSquaredError)
    println("My regression tree model:\n" + model.toDebugString)
    spark.stop()

  }
  def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): RegressionMetrics = {
    val predictionsAndLabels = data.map(example =>
      (model.predict(example.features), example.label)
    )
    new RegressionMetrics(predictionsAndLabels)
  }
} 
Example 9
package spark.ml.cookbook.chapter10


import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object MyDecisionTreeClassification {

  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MyDecisionTreeClassification")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        val slicedValues = values.slice(1, values.size) // drop the sample id column
        val featureVector = Vectors.dense(slicedValues.init)
        val label = values.last / 2 - 1 // map class labels 2/4 to 0.0/1.0
        LabeledPoint(label, featureVector)
      }
    println(rawData.count())
    println(data.count())


    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))


    val numClasses = 2
    val categoricalFeaturesInfo = Map[Int, Int]()
    val maxDepth = 5
    val maxBins = 32

    evaluate(trainingData, testData, numClasses, categoricalFeaturesInfo,
              "gini", maxDepth, maxBins)

    evaluate(trainingData, testData, numClasses, categoricalFeaturesInfo,
              "entropy", maxDepth, maxBins)

    spark.stop()

  }
  def evaluate(
      trainingData: RDD[LabeledPoint],
      testData: RDD[LabeledPoint],
      numClasses: Int,
      categoricalFeaturesInfo: Map[Int, Int],
      impurity: String,
      maxDepth: Int,
      maxBins: Int): Unit = {

    val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins)
    val metrics = getMetrics(model, testData)
    println("Using Impurity :"+ impurity)
    println("Confusion Matrix :")
    println(metrics.confusionMatrix)
    println("Decision Tree Accuracy: "+metrics.precision)
    println("Decision Tree Error: "+ (1-metrics.precision))
    (0 until numClasses).map(
      category => (metrics.precision(category), metrics.recall(category))
    ).foreach(println)
  }

  def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionsAndLabels = data.map(example =>
      (model.predict(example.features), example.label)
    )
    new MulticlassMetrics(predictionsAndLabels)
  }


}