org.apache.spark.mllib.tree.DecisionTree Scala Examples
The following examples show how to use org.apache.spark.mllib.tree.DecisionTree.
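As a quick orientation before the project examples, here is a minimal sketch of the core DecisionTree.trainClassifier call that most of them build on. The tiny inline dataset and the MinimalDecisionTree object name are illustrative only, not taken from any of the projects below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree

object MinimalDecisionTree {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("MinimalDecisionTree"))
    // A toy two-class dataset: each LabeledPoint is a label plus two numeric features
    val data = sc.parallelize(Seq(
      LabeledPoint(0.0, Vectors.dense(1.0, 0.0)),
      LabeledPoint(1.0, Vectors.dense(0.0, 1.0)),
      LabeledPoint(0.0, Vectors.dense(1.0, 0.1)),
      LabeledPoint(1.0, Vectors.dense(0.1, 1.0))))
    // numClasses = 2, no categorical features, Gini impurity, maxDepth 5, 32 bins
    val model = DecisionTree.trainClassifier(data, 2, Map[Int, Int](), "gini", 5, 32)
    println(model.toDebugString)
    sc.stop()
  }
}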
Example 1
Source File: PipeClassificationDecisionTree.scala From sddf with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable
import de.unihamburg.vsis.sddf.Parameterized
import org.apache.spark.mllib.classification.ClassificationModel

class PipeClassificationDecisionTree(
    impurity: String = "gini",
    maxDepth: Int = 5,
    maxBins: Int = 32)
  extends AbstractPipeClassification {

  val paramMap: Map[String, Any] =
    Map(("impurity", impurity), ("maxDepth", maxDepth), ("maxBins", maxBins))

  def trainModelAndClassify(
      trainingData: RDD[LabeledPoint],
      symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {
    val model = DecisionTree.trainClassifier(trainingData, numClasses = 2,
      categoricalFeaturesInfo = Map[Int, Int](), impurity, maxDepth, maxBins)

    log.debug("Decision Tree Model: " + model)
    log.debug("Decision Tree: " + model.toDebugString)

    // Marking missing values as not equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}

object PipeClassificationDecisionTree {
  def apply(
      impurity: String = "gini",
      maxDepth: Int = 5,
      maxBins: Int = 32) = {
    new PipeClassificationDecisionTree(impurity, maxDepth, maxBins)
  }
}
Example 2
Source File: DecisionTreeUtil.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.decisiontree

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object DecisionTreeUtil {

  def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = {
    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2

    println(numData.toString())
    records.cache()
    print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2))

    var list = new ListBuffer[Map[String, Long]]()
    for (i <- 2 to 9) {
      val m = Util.get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach(m => (catLen += m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    val data = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    }
    val data_dt = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r)))
    }

    val splits = data_dt.randomSplit(Array(0.8, 0.2), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)
    return (training, test)
  }

  def evaluate(
      train: RDD[LabeledPoint],
      test: RDD[LabeledPoint],
      categoricalFeaturesInfo: scala.Predef.Map[Int, Int],
      maxDepth: Int,
      maxBins: Int): Double = {
    val impurity = "variance"
    val decisionTreeModel = DecisionTree.trainRegressor(train, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins)

    val true_vs_predicted = test.map(p => (p.label, decisionTreeModel.predict(p.features)))
    val rmsle = Math.sqrt(true_vs_predicted.map { case (t, p) => Util.squaredLogError(t, p) }.mean())
    return rmsle
  }
}
Example 3
Source File: DecisionTreeCategoricalFeaturesApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.decisiontree

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object DecisionTreeCategoricalFeaturesApp {

  def get_mapping(rdd: RDD[Array[String]], idx: Int): Map[String, Long] = {
    return rdd.map(fields => fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    val save = true
    //val sc = new SparkContext("local[2]", "First Spark App")
    val sc = Util.sc
    // we take the raw data in CSV format and convert it into a set of records
    // of the form (user, product, price)
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()
    val records = rawData.map(line => line.split(","))
    val first = records.first()

    println(numData.toInt)
    records.cache()
    print("Mapping of first categorical feature column: " + get_mapping(records, 2))

    var list = new ListBuffer[Map[String, Long]]()
    for (i <- 2 to 9) {
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach(m => (catLen += m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features: " + catLen)
    println("Feature vector length for numerical features: " + numLen)
    println("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    }
    val data_dt = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r)))
    }

    val first_point = data_dt.first()
    println("Decision Tree feature vector: " + first_point.features.toString)
    println("Decision Tree feature vector length: " + first_point.features.size)

    def getCatFeatures(): scala.Predef.Map[Int, Int] = {
      var d = scala.Predef.Map[Int, Int]()
      for (a <- 2 until 10) {
        d += (a - 2 -> (get_mapping(records, a).size + 1))
        //d.put(a-2, get_mapping(records, a).size + 1)
      }
      return d
    }
    val cat_features = getCatFeatures()
    //dict([(i - 2, len(get_mapping(records, i)) + 1) for i in range(2,10)])

    //val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32

    val decisionTreeModel = DecisionTree.trainRegressor(data_dt, cat_features,
      impurity, maxDepth, maxBins)
    //val decisionTreeModel = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo,
    //  impurity, maxDepth, maxBins)

    val preds = decisionTreeModel.predict(data_dt.map(p => p.features))
    val actual = data.map(p => p.label)
    val true_vs_predicted_dt = actual.zip(preds)
    val true_vs_predicted_csv = data.map(p => p.label + " ," + decisionTreeModel.predict(p.features))

    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    if (save) {
      true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_categorical_" + date + ".csv")
    }

    print("Decision Tree depth: " + decisionTreeModel.depth)
    print("Decision Tree number of nodes: " + decisionTreeModel.numNodes)

    Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree Categorical Features")
  }
}
Example 4
Source File: DecisionTreeWithLog.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.decisiontree

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object DecisionTreeWithLog {

  def get_mapping(rdd: RDD[Array[String]], idx: Int): Map[String, Long] = {
    return rdd.map(fields => fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    val save = false
    val sc = Util.sc
    // we take the raw data in CSV format and convert it into a set of records
    // of the form (user, product, price)
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()
    val records = rawData.map(line => line.split(","))
    val first = records.first()

    println(numData.toInt)
    records.cache()
    print("Mapping of first categorical feature column: " + get_mapping(records, 2))

    var list = new ListBuffer[Map[String, Long]]()
    for (i <- 2 to 9) {
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach(m => (catLen += m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features: " + catLen)
    println("Feature vector length for numerical features: " + numLen)
    println("Total feature vector length: " + totalLen)

    // Train on log-transformed labels
    val data_dt = {
      records.map(r => LabeledPoint(Math.log(Util.extractLabel(r)), Util.extract_features_dt(r)))
    }
    val first_point = data_dt.first()
    println("Decision Tree feature vector: " + first_point.features.toString)
    println("Decision Tree feature vector length: " + first_point.features.size)

    val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32

    val decisionTreeModel = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins)

    val preds = decisionTreeModel.predict(data_dt.map(p => p.features))
    // Map predictions and labels back to the original scale before comparing them
    val preds_2 = preds.map(p => Math.exp(p))
    val actual = data_dt.map(p => Math.exp(p.label))
    val true_vs_predicted_dt = actual.zip(preds_2)

    if (save) {
      val true_vs_predicted_csv = data_dt.map(p => p.label + " ," + decisionTreeModel.predict(p.features))
      val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
      val date = format.format(new java.util.Date())
      true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_" + date + ".csv")
    }

    print("Decision Tree depth: " + decisionTreeModel.depth)
    print("Decision Tree number of nodes: " + decisionTreeModel.numNodes)

    Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree With Log")
    Util.sc.stop()
  }
}
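A note on the design choice here: this example trains on log-transformed labels (Math.log) and exponentiates the predictions back with Math.exp before evaluating, a common way to reduce the skew of count-valued targets when fitting a squared-error regressor.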
Example 5
Source File: DecisionTreeExample.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object DecisionTreeExample {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("DecisionTreeExample")
    val sc = new SparkContext(sparkConf)

    // Load the file
    val data = sc.textFile("../data/mllib/tennis.csv")
    // Parse the data and load it into LabeledPoints
    val parsedData = data.map { line =>
      val parts = line.split(',').map(_.toDouble)
      // A LabeledPoint is a local vector, dense or sparse, associated with a label
      LabeledPoint(parts(0), Vectors.dense(parts.tail))
    }
    // Train the algorithm on this data
    val model = DecisionTree.train(parsedData, Classification, Entropy, 3)
    // Create a vector representing no rain, high wind, and low temperature
    val v = Vectors.dense(0.0, 1.0, 0.0)
    // Predict whether tennis will be played
    println(model.predict(v))
  }
}
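The LabeledPoint comment in this example notes that the wrapped vector can be dense or sparse. As a small illustrative aside (not part of the original source), the same three-element vector can be constructed either way:

import org.apache.spark.mllib.linalg.Vectors

// Dense: every component stored explicitly
val dense = Vectors.dense(0.0, 1.0, 0.0)
// Sparse: vector size plus the indices and values of the non-zero entries
val sparse = Vectors.sparse(3, Array(1), Array(1.0))
// Both represent the same vector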
Example 6
Source File: DecisionTreeTest.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Gini

object DecisionTreeTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("DecisionTreeTest")
    val sc = new SparkContext(sparkConf)

    val data = sc.textFile("../data/mllib/sample_tree_data.csv")
    val parsedData = data.map { line =>
      val parts = line.split(',').map(_.toDouble)
      // A LabeledPoint is a local vector, dense or sparse, associated with a label
      LabeledPoint(parts(0), Vectors.dense(parts.tail))
    }

    // Maximum tree depth, a stopping condition that guards against overfitting
    val maxDepth = 5
    val model = DecisionTree.train(parsedData, Classification, Gini, maxDepth)

    val labelAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val trainErr = labelAndPreds.filter(r => r._1 != r._2).count().toDouble / parsedData.count()
    println("Training Error = " + trainErr)
  }
}
Example 7
Source File: MyRegressionMetrics.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter4

import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.sql.SparkSession

object MyRegressionMetrics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("myRegressionMetrics")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter4/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    val categoricalFeaturesInfo = Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32

    val model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins)

    // Instantiate metrics object
    val predictionsAndLabels = testData.map(example =>
      (model.predict(example.features), example.label)
    )
    val metrics = new RegressionMetrics(predictionsAndLabels)

    // Squared error
    println(s"MSE = ${metrics.meanSquaredError}")
    println(s"RMSE = ${metrics.rootMeanSquaredError}")
    // R-squared
    println(s"R-squared = ${metrics.r2}")
    // Mean absolute error
    println(s"MAE = ${metrics.meanAbsoluteError}")
    // Explained variance
    println(s"Explained variance = ${metrics.explainedVariance}")

    spark.stop()
  }
}
Example 8
Source File: MyDecisionTreeRegression.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter10

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object MyDecisionTreeRegression {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MyDecisionTreeRegression")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    val categoricalFeaturesInfo = Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32

    val model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins)

    val metrics = getMetrics(model, testData)
    println("Test Mean Squared Error = " + metrics.meanSquaredError)
    println("My regression tree model:\n" + model.toDebugString)

    spark.stop()
  }

  def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): RegressionMetrics = {
    val predictionsAndLabels = data.map(example =>
      (model.predict(example.features), example.label)
    )
    new RegressionMetrics(predictionsAndLabels)
  }
}
Example 9
Source File: MyDecisionTreeClassification.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter10

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object MyDecisionTreeClassification {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MyDecisionTreeClassification")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data")
    val data = rawData.map(_.trim)
      .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        val slicedValues = values.slice(1, values.size)
        val featureVector = Vectors.dense(slicedValues.init)
        val label = values.last / 2 - 1
        LabeledPoint(label, featureVector)
      }

    println(rawData.count())
    println(data.count())

    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    val numClasses = 2
    val categoricalFeaturesInfo = Map[Int, Int]()
    val maxDepth = 5
    val maxBins = 32

    evaluate(trainingData, testData, numClasses, categoricalFeaturesInfo, "gini", maxDepth, maxBins)
    evaluate(trainingData, testData, numClasses, categoricalFeaturesInfo, "entropy", maxDepth, maxBins)

    spark.stop()
  }

  def evaluate(
      trainingData: RDD[LabeledPoint],
      testData: RDD[LabeledPoint],
      numClasses: Int,
      categoricalFeaturesInfo: Map[Int, Int],
      impurity: String,
      maxDepth: Int,
      maxBins: Int): Unit = {
    val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins)
    val metrics = getMetrics(model, testData)

    println("Using Impurity: " + impurity)
    println("Confusion Matrix:")
    println(metrics.confusionMatrix)
    println("Decision Tree Accuracy: " + metrics.precision)
    println("Decision Tree Error: " + (1 - metrics.precision))
    (0 until numClasses).map(category =>
      (metrics.precision(category), metrics.recall(category))
    ).foreach(println)
  }

  def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionsAndLabels = data.map(example =>
      (model.predict(example.features), example.label)
    )
    new MulticlassMetrics(predictionsAndLabels)
  }
}
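A note on the metrics calls in this example: the no-argument MulticlassMetrics.precision used for the accuracy printouts is deprecated in Spark 2.x in favor of metrics.accuracy, which reports the same overall value; the per-class precision(category) and recall(category) calls are unaffected.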