org.apache.spark.mllib.evaluation.MulticlassMetrics Scala Examples
The following examples show how to use org.apache.spark.mllib.evaluation.MulticlassMetrics.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: SparkIntroduction.scala From reactive-machine-learning-systems with MIT License | 6 votes |
package com.reactivemachinelearning import org.apache.spark.sql.SparkSession import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} import org.apache.spark.mllib.linalg.Vectors object SparkIntroduction { def main(args: Array[String]) { // handle args // setup val session = SparkSession.builder.appName("Simple ModelExample").getOrCreate() import session.implicits._ // Load and parse the train and test data val inputBasePath = "example_data" val outputBasePath = "." val trainingDataPath = inputBasePath + "/training.txt" val testingDataPath = inputBasePath + "/testing.txt" val currentOutputPath = outputBasePath + System.currentTimeMillis() val trainingData = session.read.textFile(trainingDataPath) val trainingParsed = trainingData.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val testingData = session.read.textFile(testingDataPath) val testingParsed = testingData.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // Building the model val numIterations = 100 val model = LinearRegressionWithSGD.train(trainingParsed.rdd, numIterations) // Evaluate model on testing examples val predictionsAndLabels = testingParsed.map { case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } // Report performance statistics val metrics = new MulticlassMetrics(predictionsAndLabels.rdd) val precision = metrics.precision val recall = metrics.recall println(s"Precision: $precision Recall: $recall") // Save model model.save(session.sparkContext, currentOutputPath) } }
Example 2
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 6 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object GradientBoostedTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val gbt = new GBTClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxIter(10) stages += vectorAssembler stages += gbt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 3
Source File: MyDecisionTreeClassification.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter10 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.DecisionTreeModel import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object MyDecisionTreeClassification { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val spark = SparkSession .builder .master("local[*]") .appName("MyDecisionTreeClassification") .config("spark.sql.warehouse.dir", ".") .getOrCreate() val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data") val data = rawData.map(_.trim) .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1)) .map { line => val values = line.split(',').map(_.toDouble) val slicedValues = values.slice(1, values.size) val featureVector = Vectors.dense(slicedValues.init) val label = values.last / 2 -1 LabeledPoint(label, featureVector) } println(rawData.count()) println(data.count()) val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) val numClasses = 2 val categoricalFeaturesInfo = Map[Int, Int]() val maxDepth = 5 val maxBins = 32 evaluate(trainingData, testData, numClasses, categoricalFeaturesInfo, "gini", maxDepth, maxBins) evaluate(trainingData, testData, numClasses, categoricalFeaturesInfo, "entropy", maxDepth, maxBins) spark.stop() } def evaluate( trainingData: RDD[LabeledPoint], testData: RDD[LabeledPoint], numClasses: Int, categoricalFeaturesInfo: Map[Int,Int], impurity: String, maxDepth: Int, maxBins:Int ) :Unit = { val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins) val metrics = getMetrics(model, testData) println("Using Impurity :"+ impurity) println("Confusion Matrix :") println(metrics.confusionMatrix) println("Decision Tree Accuracy: "+metrics.precision) println("Decision Tree Error: "+ (1-metrics.precision)) (0 until numClasses).map( category => (metrics.precision(category), metrics.recall(category)) ).foreach(println) } def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): MulticlassMetrics = { val predictionsAndLabels = data.map(example => (model.predict(example.features), example.label) ) new MulticlassMetrics(predictionsAndLabels) } }
Example 4
Source File: MulticlassClassificationEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 5
Source File: MyRandomForestClassification.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter10 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.RandomForestModel import org.apache.spark.rdd.RDD import org.apache.spark.mllib.tree.RandomForest import org.apache.spark.sql.SparkSession object MyRandomForestClassification { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val spark = SparkSession .builder .master("local[*]") .appName("MyRandomForestClassification") .config("spark.sql.warehouse.dir", ".") .getOrCreate() val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data") val data = rawData.map(_.trim) .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1)) .map { line => val values = line.split(',').map(_.toDouble) val slicedValues = values.slice(1, values.size) val featureVector = Vectors.dense(slicedValues.init) val label = values.last / 2 -1 LabeledPoint(label, featureVector) } val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) println("Training Data count:"+trainingData.count()) println("Test Data Count:"+testData.count()) val numClasses = 2 val categoricalFeaturesInfo = Map[Int, Int]() val numTrees = 3 // Use more in practice. val featureSubsetStrategy = "auto" // Let the algorithm choose. // val impurity = "gini" val maxDepth = 4 val maxBins = 32 evaluate(trainingData, testData, numClasses,categoricalFeaturesInfo,numTrees, featureSubsetStrategy, "gini", maxDepth, maxBins) evaluate(trainingData, testData, numClasses,categoricalFeaturesInfo,numTrees, featureSubsetStrategy, "entropy", maxDepth, maxBins) println("=============") spark.stop() } def evaluate( trainingData: RDD[LabeledPoint], testData: RDD[LabeledPoint], numClasses: Int, categoricalFeaturesInfo: Map[Int,Int], numTrees: Int, featureSubsetStrategy: String, impurity: String, maxDepth: Int, maxBins:Int ) :Unit = { val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy,impurity, maxDepth, maxBins) val metrics = getMetrics(model, testData) println("Using Impurity :"+ impurity) println("Confusion Matrix :") println(metrics.confusionMatrix) println("Model Accuracy: "+metrics.precision) println("Model Error: "+ (1-metrics.precision)) // (0 until numClasses).map( // category => (metrics.precision(category), metrics.recall(category)) // ).foreach(println) println("My Random Forest Model:\n" + model.toDebugString) } def getMetrics(model: RandomForestModel, data: RDD[LabeledPoint]): MulticlassMetrics = { val predictionsAndLabels = data.map(example => (model.predict(example.features), example.label) ) new MulticlassMetrics(predictionsAndLabels) } }
Example 6
Source File: MyGradientBoostingClassification.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter10 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel import org.apache.spark.rdd.RDD import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.sql.SparkSession object MyGradientBoostingClassification { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val spark = SparkSession .builder .master("local[*]") .appName("MyGradientBoostedTreesClassification") .config("spark.sql.warehouse.dir", ".") .getOrCreate() val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data") val data = rawData.map(_.trim) .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1)) .map { line => val values = line.split(',').map(_.toDouble) val slicedValues = values.slice(1, values.size) val featureVector = Vectors.dense(slicedValues.init) val label = values.last / 2 -1 LabeledPoint(label, featureVector) } val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) println("Training Data count:"+trainingData.count()) println("Test Data Count:"+testData.count()) val algo = "Classification" val numIterations = 3 val numClasses = 2 val maxDepth = 5 val maxBins = 32 val categoricalFeatureInfo = Map[Int,Int]() val boostingStrategy = BoostingStrategy.defaultParams(algo) boostingStrategy.setNumIterations(numIterations) boostingStrategy.treeStrategy.setNumClasses(numClasses) boostingStrategy.treeStrategy.setMaxDepth(maxDepth) boostingStrategy.treeStrategy.setMaxBins(maxBins) boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeatureInfo evaluate(trainingData, testData, boostingStrategy) println("===================") spark.stop() } def evaluate( trainingData: RDD[LabeledPoint], testData: RDD[LabeledPoint], boostingStrategy : BoostingStrategy ) :Unit = { val model = GradientBoostedTrees.train(trainingData, boostingStrategy) val metrics = getMetrics(model, testData) println("Confusion Matrix :") println(metrics.confusionMatrix) println("Model Accuracy: "+metrics.precision) println("Model Error: "+ (1-metrics.precision)) // (0 until boostingStrategy.treeStrategy.getNumClasses()).map( // category => (metrics.precision(category), metrics.recall(category)) // ).foreach(println) // println("My Classification GBT model:\n" + model.toDebugString) } def getMetrics(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): MulticlassMetrics = { val predictionsAndLabels = data.map(example => (model.predict(example.features), example.label) ) new MulticlassMetrics(predictionsAndLabels) } }
Example 7
Source File: RandomForestDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter11.SparkMachineLearning import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.tree.RandomForest import org.apache.spark.mllib.tree.model.RandomForestModel import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.evaluation.MulticlassMetrics object RandomForestDemo { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAExample")//.setMaster("local[*]") val sc = new SparkContext(conf) val filePath = args(0) val data = MLUtils.loadLibSVMFile(sc, filePath) val splits = data.randomSplit(Array(0.75, 0.25), seed = 12345L) val training = splits(0).cache() val test = splits(1) // Train a RandomForest mode with an empty categoricalFeaturesInfo indicates all features are continuous. val numClasses = 10 val categoricalFeaturesInfo = Map[Int, Int]() val numTrees = 50 // Use more in practice. val featureSubsetStrategy = "auto" // Let the algorithm choose. val impurity = "gini" val maxDepth = 30 val maxBins = 32 val model = RandomForest.trainClassifier(training, numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins) // Evaluate model on test instances and compute test error val labelAndPreds = test.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val metrics = new MulticlassMetrics(labelAndPreds) // Confusion matrix println("Confusion matrix:") println(metrics.confusionMatrix) // Overall Statistics val accuracy = metrics.accuracy println("Summary Statistics") println(s"Accuracy = $accuracy") // Precision by label val labels = metrics.labels labels.foreach { l => println(s"Precision($l) = " + metrics.precision(l)) } // Recall by label labels.foreach { l => println(s"Recall($l) = " + metrics.recall(l)) } // False positive rate by label labels.foreach { l => println(s"FPR($l) = " + metrics.falsePositiveRate(l)) } // F-measure by label labels.foreach { l => println(s"F1-Score($l) = " + metrics.fMeasure(l)) } // Weighted stats println(s"Weighted precision: ${metrics.weightedPrecision}") println(s"Weighted recall: ${metrics.weightedRecall}") println(s"Weighted F1 score: ${metrics.weightedFMeasure}") println(s"Weighted false positive rate: ${metrics.weightedFalsePositiveRate}") val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / test.count() println("Accuracy = " + (1-testErr) * 100 + " %") //println("Learned classification forest model:\n" + model.toDebugString) } }
Example 8
Source File: MultilayerPerceptronClassifierSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.classification.LogisticRegressionSuite._ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class MultilayerPerceptronClassifierSuite extends SparkFunSuite with MLlibTestSparkContext { test("XOR function learning as binary classification problem with two outputs.") { val dataFrame = sqlContext.createDataFrame(Seq( (Vectors.dense(0.0, 0.0), 0.0), (Vectors.dense(0.0, 1.0), 1.0), (Vectors.dense(1.0, 0.0), 1.0), (Vectors.dense(1.0, 1.0), 0.0)) ).toDF("features", "label") val layers = Array[Int](2, 5, 2) val trainer = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(1) .setSeed(11L) .setMaxIter(100) val model = trainer.fit(dataFrame) val result = model.transform(dataFrame) val predictionAndLabels = result.select("prediction", "label").collect() predictionAndLabels.foreach { case Row(p: Double, l: Double) => assert(p == l) } } // TODO: implement a more rigorous test test("3 class classification with 2 hidden layers") { val nPoints = 1000 // The following coefficients are taken from OneVsRestSuite.scala // they represent 3-class iris dataset val coefficients = Array( -0.57997, 0.912083, -0.371077, -0.819866, 2.688191, -0.16624, -0.84355, -0.048509, -0.301789, 4.170682) val xMean = Array(5.843, 3.057, 3.758, 1.199) val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) // the input seed is somewhat magic, to make this test pass val rdd = sc.parallelize(generateMultinomialLogisticInput( coefficients, xMean, xVariance, true, nPoints, 1), 2) val dataFrame = sqlContext.createDataFrame(rdd).toDF("label", "features") val numClasses = 3 val numIterations = 100 val layers = Array[Int](4, 5, 4, numClasses) val trainer = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(1) .setSeed(11L) // currently this seed is ignored .setMaxIter(numIterations) val model = trainer.fit(dataFrame) val numFeatures = dataFrame.select("features").first().getAs[Vector](0).size assert(model.numFeatures === numFeatures) val mlpPredictionAndLabels = model.transform(dataFrame).select("prediction", "label") .map { case Row(p: Double, l: Double) => (p, l) } // train multinomial logistic regression val lr = new LogisticRegressionWithLBFGS() .setIntercept(true) .setNumClasses(numClasses) lr.optimizer.setRegParam(0.0) .setNumIterations(numIterations) val lrModel = lr.run(rdd) val lrPredictionAndLabels = lrModel.predict(rdd.map(_.features)).zip(rdd.map(_.label)) // MLP's predictions should not differ a lot from LR's. val lrMetrics = new MulticlassMetrics(lrPredictionAndLabels) val mlpMetrics = new MulticlassMetrics(mlpPredictionAndLabels) assert(mlpMetrics.confusionMatrix ~== lrMetrics.confusionMatrix absTol 100) } }
Example 9
Source File: MulticlassClassificationEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, SchemaUtils, Identifiable} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("1.5.0") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) val predictionAndLabels = dataset.select($(predictionCol), $(labelCol)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "precision" => metrics.precision case "recall" => metrics.recall case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall } metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "f1" => true case "precision" => true case "recall" => true case "weightedPrecision" => true case "weightedRecall" => true } @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 10
Source File: MulticlassClassificationEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 11
Source File: MulticlassClassificationEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{SchemaUtils, Identifiable} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.types.DoubleType def setLabelCol(value: String): this.type = set(labelCol, value) //F1-Measure是根据准确率Precision和召回率Recall二者给出的一个综合的评价指标 setDefault(metricName -> "f1") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) val predictionAndLabels = dataset.select($(predictionCol), $(labelCol)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { //F1-Measure是根据准确率Precision和召回率Recall二者给出的一个综合的评价指标 case "f1" => metrics.weightedFMeasure case "precision" => metrics.precision//准确率 case "recall" => metrics.recall//召回率 case "weightedPrecision" => metrics.weightedPrecision//加权准确率 case "weightedRecall" => metrics.weightedRecall//加权召回率 } metric } override def isLargerBetter: Boolean = $(metricName) match { case "f1" => true//F1-Measure是根据准确率Precision和召回率Recall二者给出的一个综合的评价指标 case "precision" => true//准确率 case "recall" => true//召回率 case "weightedPrecision" => true//加权准确率 case "weightedRecall" => true//加权召回率 } override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) }
Example 12
Source File: LogisticRegressionWithLBFGSDeom.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.SVMWithSGD import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.util.MLUtils import org.apache.log4j.Level import org.apache.log4j.Logger import org.apache.spark.SparkConf import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS //逻辑回归,基于lbfgs优化损失函数,支持多分类(BFGS是逆秩2拟牛顿法) val modelBFGS = new LogisticRegressionWithLBFGS() .setNumClasses(10) .run(training) //在测试数据上计算原始分数 // Compute raw scores on the test set. val predictionAndLabels = test.map { //LabeledPoint标记点是局部向量,向量可以是密集型或者稀疏型,每个向量会关联了一个标签(label) case LabeledPoint(label, features) => val prediction = modelBFGS.predict(features) (prediction, label) } //获取评估指标 // Get evaluation metrics. val metricsBFGS = new MulticlassMetrics(predictionAndLabels) val precision = metricsBFGS.precision println("Precision = " + precision) } }
Example 13
Source File: SVMWithSGDDemo.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.SVMWithSGD import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.util.MLUtils import org.apache.log4j.Level import org.apache.log4j.Logger import org.apache.spark.SparkConf import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS //逻辑回归,基于lbfgs优化损失函数,支持多分类,(BFGS是逆秩2拟牛顿法) val modelBFGS = new LogisticRegressionWithLBFGS() .setNumClasses(10) .run(training) //在测试数据上计算原始分数 // Compute raw scores on the test set. val predictionAndLabels = test.map { //LabeledPoint标记点是局部向量,向量可以是密集型或者稀疏型,每个向量会关联了一个标签(label) case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } //获取评估指标 // Get evaluation metrics. val metricsBFGS = new MulticlassMetrics(predictionAndLabels) val precision = metricsBFGS.precision println("Precision = " + precision) } }
Example 14
Source File: MulticlassClassificationEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 15
Source File: LRAccuracyTest.scala From SparseML with Apache License 2.0 | 5 votes |
package MLlib import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel, SparseLogisticRegressionWithLBFGS} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkContext, SparkConf} object LRAccuracyTest { def main(args: Array[String]) { val conf = new SparkConf().setAppName(s"LogisticRegressionTest with $args").setMaster("local") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").map( l => LabeledPoint(l.label, l.features.toSparse)) // Split data into training (60%) and test (40%). val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) // Run training algorithm to build the model val model = new SparseLogisticRegressionWithLBFGS() .setNumClasses(5) .run(training) // Compute raw scores on the test set. val predictionAndLabels = test.map { case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } // Get evaluation metrics. val metrics = new MulticlassMetrics(predictionAndLabels) val precision = metrics.precision println("Precision = " + precision) } }
Example 16
Source File: MulticlassClassificationEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 17
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.feature.{HashingTF, IDF} import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.{SparseVector => SV} import org.apache.spark.mllib.util.MLUtils //import org.apache.spark.ml.feature.HashingTF //import org.apache.spark.ml.feature.IDF object DocumentClassification { def main(args: Array[String]) { val sc = new SparkContext("local[2]", "First Spark App") val path = "../data/20news-bydate-train/*" val rdd = sc.wholeTextFiles(path) val text = rdd.map { case (file, text) => text } val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head } val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap val dim = math.pow(2, 18).toInt val hashingTF = new HashingTF(dim) var tokens = text.map(doc => TFIDFExtraction.tokenize(doc)) val tf = hashingTF.transform(tokens) tf.cache val v = tf.first.asInstanceOf[SV] val idf = new IDF().fit(tf) val tfidf = idf.transform(tf) val zipped = newsgroups.zip(tfidf) println(zipped.first()) val train = zipped.map { case (topic, vector) => { LabeledPoint(newsgroupsMap(topic), vector) } } //TODO uncomment to generate libsvm format MLUtils.saveAsLibSVMFile(train,"./output/20news-by-date-train-libsvm") train.cache val model = NaiveBayes.train(train, lambda = 0.1) val testPath = "../data/20news-bydate-test/*" val testRDD = sc.wholeTextFiles(testPath) val testLabels = testRDD.map { case (file, text) => val topic = file.split("/").takeRight(2).head newsgroupsMap(topic) } val testTf = testRDD.map { case (file, text) => hashingTF.transform(TFIDFExtraction.tokenize(text)) } val testTfIdf = idf.transform(testTf) val zippedTest = testLabels.zip(testTfIdf) val test = zippedTest.map { case (topic, vector) => { println(topic) println(vector) LabeledPoint(topic, vector) } } //TODO uncomment to generate libsvm format MLUtils.saveAsLibSVMFile(test,"./output/20news-by-date-test-libsvm") val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() println(accuracy) // Updated Dec 2016 by Rajdeep //0.7928836962294211 val metrics = new MulticlassMetrics(predictionAndLabel) println(metrics.accuracy) println(metrics.weightedFalsePositiveRate) println(metrics.weightedPrecision) println(metrics.weightedFMeasure) println(metrics.weightedRecall) //0.7822644376431702 val rawTokens = rdd.map { case (file, text) => text.split(" ") } val rawTF = rawTokens.map(doc => hashingTF.transform(doc)) val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) } val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1) val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) } val rawZippedTest = testLabels.zip(rawTestTF) val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) } val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label)) val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count() println(rawAccuracy) // 0.7661975570897503 val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel) println(rawMetrics.weightedFMeasure) // older value 0.7628947184990661 // dec 2016 : 0.7653320418573546 sc.stop() } }
Example 18
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.feature.{HashingTF, IDF} import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.{ SparseVector => SV } object DocumentClassification { def main(args: Array[String]) { val sc = new SparkContext("local[2]", "First Spark App") val path = "../data/20news-bydate-train/*" val rdd = sc.wholeTextFiles(path) val text = rdd.map { case (file, text) => text } val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head } val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap val dim = math.pow(2, 18).toInt val hashingTF = new HashingTF(dim) var tokens = text.map(doc => TFIDFExtraction.tokenize(doc)) val tf = hashingTF.transform(tokens) tf.cache val v = tf.first.asInstanceOf[SV] val idf = new IDF().fit(tf) val tfidf = idf.transform(tf) val zipped = newsgroups.zip(tfidf) val train = zipped.map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) } train.cache val model = NaiveBayes.train(train, lambda = 0.1) val testPath = "../data/20news-bydate-test/*" val testRDD = sc.wholeTextFiles(testPath) val testLabels = testRDD.map { case (file, text) => val topic = file.split("/").takeRight(2).head newsgroupsMap(topic) } val testTf = testRDD.map { case (file, text) => hashingTF.transform(TFIDFExtraction.tokenize(text)) } val testTfIdf = idf.transform(testTf) val zippedTest = testLabels.zip(testTfIdf) val test = zippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) } val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() println(accuracy) // Updated Dec 2016 by Rajdeep //0.7928836962294211 val metrics = new MulticlassMetrics(predictionAndLabel) println(metrics.weightedFMeasure) //0.7822644376431702 val rawTokens = rdd.map { case (file, text) => text.split(" ") } val rawTF = rawTokens.map(doc => hashingTF.transform(doc)) val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) } val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1) val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) } val rawZippedTest = testLabels.zip(rawTestTF) val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) } val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label)) val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count() println(rawAccuracy) // 0.7661975570897503 val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel) println(rawMetrics.weightedFMeasure) // older value 0.7628947184990661 // dec 2016 : 0.7653320418573546 sc.stop() } }
Example 19
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object GradientBoostedTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val gbt = new GBTClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxIter(10) stages += vectorAssembler stages += gbt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 20
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object NaiveBayesPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val nb = new NaiveBayes() stages += vectorAssembler stages += nb val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/NB.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/NaiveBayes.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 21
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object GradientBoostedTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val gbt = new GBTClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxIter(10) stages += vectorAssembler stages += gbt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 22
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object DecisionTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val dt = new DecisionTreeClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += dt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/DT.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/DecisionTree.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 23
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.RandomForestClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object RandomForestPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val rf = new RandomForestClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setNumTrees(20) .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += rf val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/RF.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/RandomForest.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }