org.apache.spark.ml.regression.RandomForestRegressor Scala Examples
The following examples show how to use org.apache.spark.ml.regression.RandomForestRegressor.
Each example notes its original project and source file in the header above the code.
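The examples below come from real projects and therefore carry project-specific scaffolding. As a point of reference, here is a minimal, self-contained sketch of the basic fit/transform cycle first; the application name, column names, and the tiny synthetic DataFrame are illustrative choices, not taken from any of the projects below.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
import org.apache.spark.sql.SparkSession

object RandomForestRegressorQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("RandomForestRegressorQuickStart")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // Tiny synthetic dataset: (label, features).
    val training = Seq(
      (1.0, Vectors.dense(0.0, 1.1, 0.1)),
      (2.0, Vectors.dense(2.0, 1.0, -1.0)),
      (3.0, Vectors.dense(2.0, 1.3, 1.0)),
      (4.0, Vectors.dense(0.0, 1.2, -0.5))
    ).toDF("label", "features")

    // Configure the estimator; all setters used here are standard Spark ML params.
    val rf = new RandomForestRegressor()
      .setLabelCol("label")
      .setFeaturesCol("features")
      .setNumTrees(10)
      .setMaxDepth(5)

    // fit returns a RandomForestRegressionModel; transform appends a "prediction" column.
    val model: RandomForestRegressionModel = rf.fit(training)
    model.transform(training).select("label", "prediction").show()

    spark.stop()
  }
}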
Example 1
Source File: HasMaxBinsParam.scala From seahorse with Apache License 2.0

package ai.deepsense.deeplang.doperables.spark.wrappers.params.common

import scala.language.reflectiveCalls

import org.apache.spark.ml
import org.apache.spark.ml.regression.RandomForestRegressor

import ai.deepsense.deeplang.params.Params
import ai.deepsense.deeplang.params.validators.RangeValidator
import ai.deepsense.deeplang.params.wrappers.spark.{IntParamWrapper, LongParamWrapper}

trait HasMaxBinsParam extends Params {

  val maxBins = new IntParamWrapper[ml.param.Params { val maxBins: ml.param.IntParam }](
    name = "max bins",
    description = Some("The maximum number of bins used for discretizing continuous features " +
      "and for choosing how to split on features at each node. " +
      "More bins give higher granularity. " +
      "Must be >= 2 and >= number of categories in any categorical feature."),
    sparkParamGetter = _.maxBins,
    RangeValidator(2.0, Int.MaxValue, step = Some(1.0)))

  setDefault(maxBins, 32.0)
}
Example 2
Source File: HasMinInstancePerNodeParam.scala From seahorse-workflow-executor with Apache License 2.0

package io.deepsense.deeplang.doperables.spark.wrappers.params.common

import scala.language.reflectiveCalls

import org.apache.spark.ml
import org.apache.spark.ml.regression.RandomForestRegressor

import io.deepsense.deeplang.params.Params
import io.deepsense.deeplang.params.validators.RangeValidator
import io.deepsense.deeplang.params.wrappers.spark.{IntParamWrapper, DoubleParamWrapper}

trait HasMinInstancePerNodeParam extends Params {

  val minInstancesPerNode =
    new IntParamWrapper[ml.param.Params { val minInstancesPerNode: ml.param.IntParam }](
      name = "min instances per node",
      description = Some("The minimum number of instances each child must have after split. " +
        "If a split causes the left or right child to have fewer instances than the parameter's " +
        "value, the split will be discarded as invalid."),
      sparkParamGetter = _.minInstancesPerNode,
      RangeValidator(1.0, Int.MaxValue, step = Some(1.0)))

  setDefault(minInstancesPerNode, 1.0)
}
Example 3
Source File: HasMaxBinsParam.scala From seahorse-workflow-executor with Apache License 2.0

package io.deepsense.deeplang.doperables.spark.wrappers.params.common

import scala.language.reflectiveCalls

import org.apache.spark.ml
import org.apache.spark.ml.regression.RandomForestRegressor

import io.deepsense.deeplang.params.Params
import io.deepsense.deeplang.params.validators.RangeValidator
import io.deepsense.deeplang.params.wrappers.spark.{IntParamWrapper, LongParamWrapper}

trait HasMaxBinsParam extends Params {

  val maxBins = new IntParamWrapper[ml.param.Params { val maxBins: ml.param.IntParam }](
    name = "max bins",
    description = Some("The maximum number of bins used for discretizing continuous features " +
      "and for choosing how to split on features at each node. " +
      "More bins give higher granularity. " +
      "Must be >= 2 and >= number of categories in any categorical feature."),
    sparkParamGetter = _.maxBins,
    RangeValidator(2.0, Int.MaxValue, step = Some(1.0)))

  setDefault(maxBins, 32.0)
}
Example 4
Source File: RandomForestRegressionSuite.scala From aardpfark with Apache License 2.0

package com.ibm.aardpfark.spark.ml.regression

import com.ibm.aardpfark.pfa.PredictorResult
import org.apache.spark.ml.regression.RandomForestRegressor

class RandomForestRegressionSuite extends SparkRegressorPFASuiteBase[PredictorResult] {

  val data = spark.read.format("libsvm").load(inputPath)
  val dt = new RandomForestRegressor()
    .setMaxDepth(5)
    .setNumTrees(3)
  override val sparkTransformer = dt.fit(data)

  val result = sparkTransformer.transform(data)
  override val input = withColumnAsArray(result, dt.getFeaturesCol).toJSON.collect()
  override val expectedOutput = result.select(dt.getPredictionCol).toJSON.collect()
}
Example 5
Source File: RandomForestRegressorExample.scala From spark1.52 with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
// $example off$
import org.apache.spark.sql.Row
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    predictions.select("prediction", "label", "features").show(5)

    // Select (prediction, true label) and compute test error.
    val evaluator = new RegressionEvaluator()
      .setLabelCol("label")
      // Name of the column that stores the algorithm's predictions; defaults to "prediction".
      .setPredictionCol("prediction")
      // RMSE (root mean squared error) indicates how dispersed the samples are.
      .setMetricName("rmse")
    val rmse = evaluator.evaluate(predictions)
    // Root Mean Squared Error (RMSE) on test data = 0.09854713827168428
    println("Root Mean Squared Error (RMSE) on test data = " + rmse)

    val rfModel = model.stages(1).asInstanceOf[RandomForestRegressionModel]
    println("Learned regression forest model:\n" + rfModel.toDebugString)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
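The snippet above begins at the evaluation step, so `predictions` and `model` are defined in a part of the example that is not shown. A minimal sketch of that omitted setup is given below; it assumes the sample_libsvm_data.txt dataset shipped with Spark and the built-in libsvm data source, and is an illustration rather than the quoted project's exact code.

    // Assumed setup for the evaluation code above (not part of the quoted snippet).
    val conf = new SparkConf().setAppName("RandomForestRegressorExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Load the data as a DataFrame; the "libsvm" data source is available in Spark 1.6+.
    val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    // Index features with few distinct values as categorical.
    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(4)
      .fit(data)

    // Hold out 30% of the data for testing.
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

    // Train a random forest on the indexed features.
    val rf = new RandomForestRegressor()
      .setLabelCol("label")
      .setFeaturesCol("indexedFeatures")

    // Chain the indexer and the forest in a Pipeline, then fit and predict.
    val pipeline = new Pipeline().setStages(Array(featureIndexer, rf))
    val model = pipeline.fit(trainingData)
    val predictions = model.transform(testData)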
Example 6
Source File: OpRandomForestRegressorTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License

package com.salesforce.op.stages.impl.regression

import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.PredictionEquality
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel}
import com.salesforce.op.test._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class OpRandomForestRegressorTest extends OpEstimatorSpec[Prediction,
  OpPredictorWrapperModel[RandomForestRegressionModel],
  OpPredictorWrapper[RandomForestRegressor, RandomForestRegressionModel]] with PredictionEquality {

  override def specName: String = Spec[OpRandomForestRegressor]

  val (inputData, rawLabel, features) = TestFeatureBuilder(
    Seq[(RealNN, OPVector)](
      (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector),
      (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector),
      (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector),
      (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector),
      (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector)
    )
  )
  val label = rawLabel.copy(isResponse = true)
  val estimator = new OpRandomForestRegressor().setInput(label, features)

  val expectedResult = Seq(
    Prediction(20.0),
    Prediction(23.5),
    Prediction(31.5),
    Prediction(35.5),
    Prediction(37.0)
  )

  it should "allow the user to set the desired spark parameters" in {
    estimator
      .setMaxDepth(7)
      .setMaxBins(3)
      .setMinInstancesPerNode(2)
      .setMinInfoGain(0.1)
      .setSeed(42L)
    estimator.fit(inputData)

    estimator.predictor.getMaxDepth shouldBe 7
    estimator.predictor.getMaxBins shouldBe 3
    estimator.predictor.getMinInstancesPerNode shouldBe 2
    estimator.predictor.getMinInfoGain shouldBe 0.1
    estimator.predictor.getSeed shouldBe 42L
  }
}
Example 7
Source File: OpRandomForestRegressor.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License

package com.salesforce.op.stages.impl.regression

import com.salesforce.op.UID
import com.salesforce.op.features.types.{OPVector, Prediction, RealNN}
import com.salesforce.op.stages.impl.CheckIsResponseValues
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictionModel, OpPredictorWrapper}
import org.apache.spark.ml.regression.{OpRandomForestRegressorParams, RandomForestRegressionModel, RandomForestRegressor}

import scala.reflect.runtime.universe.TypeTag

class OpRandomForestRegressionModel
(
  sparkModel: RandomForestRegressionModel,
  uid: String = UID[OpRandomForestRegressionModel],
  operationName: String = classOf[RandomForestRegressor].getSimpleName
)(
  implicit tti1: TypeTag[RealNN],
  tti2: TypeTag[OPVector],
  tto: TypeTag[Prediction],
  ttov: TypeTag[Prediction#Value]
) extends OpPredictionModel[RandomForestRegressionModel](
  sparkModel = sparkModel, uid = uid, operationName = operationName
)
Example 8
Source File: TypedRandomForestRegressor.scala From frameless with Apache License 2.0

package frameless
package ml
package regression

import frameless.ml.internals.TreesInputsChecker
import frameless.ml.params.trees.FeatureSubsetStrategy
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}

final class TypedRandomForestRegressor[Inputs] private[ml](
  rf: RandomForestRegressor,
  labelCol: String,
  featuresCol: String
) extends TypedEstimator[Inputs, TypedRandomForestRegressor.Outputs, RandomForestRegressionModel] {

  val estimator: RandomForestRegressor =
    rf
      .setLabelCol(labelCol)
      .setFeaturesCol(featuresCol)
      .setPredictionCol(AppendTransformer.tempColumnName)

  def setNumTrees(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setNumTrees(value))

  def setMaxDepth(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setMaxDepth(value))

  def setMinInfoGain(value: Double): TypedRandomForestRegressor[Inputs] = copy(rf.setMinInfoGain(value))

  def setMinInstancesPerNode(value: Int): TypedRandomForestRegressor[Inputs] =
    copy(rf.setMinInstancesPerNode(value))

  def setMaxMemoryInMB(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setMaxMemoryInMB(value))

  def setSubsamplingRate(value: Double): TypedRandomForestRegressor[Inputs] =
    copy(rf.setSubsamplingRate(value))

  def setFeatureSubsetStrategy(value: FeatureSubsetStrategy): TypedRandomForestRegressor[Inputs] =
    copy(rf.setFeatureSubsetStrategy(value.sparkValue))

  def setMaxBins(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setMaxBins(value))

  private def copy(newRf: RandomForestRegressor): TypedRandomForestRegressor[Inputs] =
    new TypedRandomForestRegressor[Inputs](newRf, labelCol, featuresCol)
}

object TypedRandomForestRegressor {
  case class Outputs(prediction: Double)

  def apply[Inputs](implicit inputsChecker: TreesInputsChecker[Inputs])
    : TypedRandomForestRegressor[Inputs] = {
    new TypedRandomForestRegressor(new RandomForestRegressor(), inputsChecker.labelCol, inputsChecker.featuresCol)
  }
}
Example 9
Source File: HasMinInstancePerNodeParam.scala From seahorse with Apache License 2.0

package ai.deepsense.deeplang.doperables.spark.wrappers.params.common

import scala.language.reflectiveCalls

import org.apache.spark.ml
import org.apache.spark.ml.regression.RandomForestRegressor

import ai.deepsense.deeplang.params.Params
import ai.deepsense.deeplang.params.validators.RangeValidator
import ai.deepsense.deeplang.params.wrappers.spark.{IntParamWrapper, DoubleParamWrapper}

trait HasMinInstancePerNodeParam extends Params {

  val minInstancesPerNode =
    new IntParamWrapper[ml.param.Params { val minInstancesPerNode: ml.param.IntParam }](
      name = "min instances per node",
      description = Some("The minimum number of instances each child must have after split. " +
        "If a split causes the left or right child to have fewer instances than the parameter's " +
        "value, the split will be discarded as invalid."),
      sparkParamGetter = _.minInstancesPerNode,
      RangeValidator(1.0, Int.MaxValue, step = Some(1.0)))

  setDefault(minInstancesPerNode, 1.0)
}
Example 10
Source File: RandomForestModelReuse.scala From Scala-Machine-Learning-Projects with MIT License

package com.packt.ScalaML

import org.apache.spark.ml.regression.{RandomForestRegressor, RandomForestRegressionModel}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.mllib.evaluation.RegressionMetrics

object RandomForestModelReuse {
  def main(args: Array[String]) {
    val spark = SparkSessionCreate.createSession()
    import spark.implicits._

    // Load the workflow back
    val cvModel = CrossValidatorModel.load("model/RF_model/")

    // *****************************************
    println("Run prediction over test dataset")
    // *****************************************
    // Predicts and saves file ready for Kaggle!
    //if(!params.outputFile.isEmpty){
    cvModel.transform(Preproessing.testData)
      .select("id", "prediction")
      .withColumnRenamed("prediction", "loss")
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save("output/result_RF_reuse.csv")

    spark.stop()
  }
}
Example 11
Source File: ClassifiersImpl.scala From spark_training with Apache License 2.0

package com.malaska.spark.training.machinelearning.common

import org.apache.spark.ml.classification.{DecisionTreeClassifier, GBTClassifier, LogisticRegression, NaiveBayes}
import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, RegressionEvaluator}
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.sql._

object ClassifiersImpl {
  def logisticRegression(trainingLabeledPointDf: DataFrame,
                         testPercentage: Double): Unit = {
    val mlr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    val splits = trainingLabeledPointDf.randomSplit(Array(testPercentage, 1 - testPercentage))

    val model = mlr.fit(splits(0))
    val trainTransformed = model.transform(splits(1))

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(trainTransformed)
    println("Test set accuracy of logisticRegression = " + accuracy)
    //println(model)
  }

  def gbtClassifer(trainingLabeledPointDf: DataFrame,
                   testPercentage: Double): Unit = {
    val gbt = new GBTClassifier()

    val splits = trainingLabeledPointDf.randomSplit(Array(testPercentage, 1 - testPercentage))

    val model = gbt.fit(splits(0))
    val trainTransformed = model.transform(splits(1))

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(trainTransformed)
    println("Test set accuracy of gbtClassifier = " + accuracy)
    //println(model)
    //println(model.toDebugString)
  }

  def randomForestRegressor(trainingLabeledPointDf: DataFrame,
                            impurity: String,
                            maxDepth: Int,
                            maxBins: Int,
                            testPercentage: Double): Unit = {
    val rf = new RandomForestRegressor()
    rf.setImpurity(impurity)
    rf.setMaxDepth(maxDepth)
    rf.setMaxBins(maxBins)

    val splits = trainingLabeledPointDf.randomSplit(Array(testPercentage, 1 - testPercentage))

    val model = rf.fit(splits(0))
    val trainTransformed = model.transform(splits(1))

    // Note: this evaluates the regressor's continuous predictions with a classification
    // accuracy metric; the imported RegressionEvaluator would be the more appropriate choice.
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(trainTransformed)
    println("Test set accuracy of randomForestRegressor = " + accuracy)
  }
}
Example 12
Source File: L9-15MLPipeline.scala From prosparkstreaming with Apache License 2.0

package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.ml.param.ParamMap

object MLPipelineApp {

  case class Activity(label: Double,
    accelXHand: Double, accelYHand: Double, accelZHand: Double,
    accelXChest: Double, accelYChest: Double, accelZChest: Double,
    accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: MLPipelineApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) == "4" || f(1) == "5")
      .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38)))
      .map(f => f.map(v => v.toDouble))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty) {
          val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3),
            x(4), x(5), x(6), x(7), x(8), x(9))).toDF()
          val split = accelerometer.randomSplit(Array(0.3, 0.7))
          val test = split(0)
          val train = split(1)

          val assembler = new VectorAssembler()
            .setInputCols(Array(
              "accelXHand", "accelYHand", "accelZHand",
              "accelXChest", "accelYChest", "accelZChest",
              "accelXAnkle", "accelYAnkle", "accelZAnkle"))
            .setOutputCol("vectors")
          val normalizer = new Normalizer()
            .setInputCol(assembler.getOutputCol)
            .setOutputCol("features")
          val regressor = new RandomForestRegressor()

          val pipeline = new Pipeline()
            .setStages(Array(assembler, normalizer, regressor))
          val pMap = ParamMap(normalizer.p -> 1.0)
          val model = pipeline.fit(train, pMap)
          val prediction = model.transform(test)
          prediction.show()
        }
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 13
Source File: L9-17MLCrossValidation.scala From prosparkstreaming with Apache License 2.0

package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object MLCrossValidationApp {

  case class Activity(label: Double,
    accelXHand: Double, accelYHand: Double, accelZHand: Double,
    accelXChest: Double, accelYChest: Double, accelZChest: Double,
    accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: MLCrossValidationApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) == "4" || f(1) == "5")
      .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38)))
      .map(f => f.map(v => v.toDouble))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty) {
          val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3),
            x(4), x(5), x(6), x(7), x(8), x(9))).toDF()
          val split = accelerometer.randomSplit(Array(0.3, 0.7))
          val test = split(0)
          val train = split(1)

          val assembler = new VectorAssembler()
            .setInputCols(Array(
              "accelXHand", "accelYHand", "accelZHand",
              "accelXChest", "accelYChest", "accelZChest",
              "accelXAnkle", "accelYAnkle", "accelZAnkle"))
            .setOutputCol("vectors")
          val normalizer = new Normalizer()
            .setInputCol(assembler.getOutputCol)
            .setOutputCol("features")
          val regressor = new RandomForestRegressor()

          val pipeline = new Pipeline()
            .setStages(Array(assembler, normalizer, regressor))

          val validator = new CrossValidator()
            .setEstimator(pipeline)
            .setEvaluator(new RegressionEvaluator)
          val pGrid = new ParamGridBuilder()
            .addGrid(normalizer.p, Array(1.0, 5.0, 10.0))
            .addGrid(regressor.numTrees, Array(10, 50, 100))
            .build()
          validator.setEstimatorParamMaps(pGrid)
          validator.setNumFolds(5)

          val bestModel = validator.fit(train)
          val prediction = bestModel.transform(test)
          prediction.show()
        }
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 14
Source File: RandomForestRegressionParitySpec.scala From mleap with Apache License 2.0

package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._

class RandomForestRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
    new VectorAssembler().
      setInputCols(Array("fico_index", "dti")).
      setOutputCol("features"),
    new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
Example 15
Source File: TrainValidationSplitParitySpec.scala From mleap with Apache License 2.0

package org.apache.spark.ml.parity.validation

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.sql.DataFrame

class TrainValidationSplitParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = {
    val regressor = new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction")

    val paramGrid = new ParamGridBuilder()
      .addGrid(regressor.numTrees, Array(2, 3, 4))
      .build()

    new Pipeline().setStages(Array(
      new StringIndexer().
        setInputCol("fico_score_group_fnl").
        setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new TrainValidationSplit().
        setEvaluator(new RegressionEvaluator().
          setLabelCol("loan_amount").
          setPredictionCol("prediction")).
        setEstimator(regressor).
        setEstimatorParamMaps(paramGrid))).fit(dataset)
  }

  override val ignoreSerializationTest = true
}
Example 16
Source File: CrossValidatorParitySpec.scala From mleap with Apache License 2.0

package org.apache.spark.ml.parity.validation

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.{DecisionTreeRegressor, RandomForestRegressor}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.DataFrame

class CrossValidatorParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = {
    val regressor = new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction")

    val paramGrid = new ParamGridBuilder()
      .addGrid(regressor.numTrees, Array(2, 3, 4))
      .build()

    new Pipeline().setStages(Array(
      new StringIndexer().
        setInputCol("fico_score_group_fnl").
        setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new CrossValidator().
        setEvaluator(new RegressionEvaluator().
          setLabelCol("loan_amount").
          setPredictionCol("prediction")).
        setEstimator(regressor).
        setEstimatorParamMaps(paramGrid))).fit(dataset)
  }

  override val ignoreSerializationTest = true
}
Example 17
Source File: RandomForestModelReuse.scala From Scala-Machine-Learning-Projects with MIT License

package com.packt.ScalaML.ChrunPrediction

import org.apache.spark.ml.regression.{RandomForestRegressor, RandomForestRegressionModel}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

object RandomForestModelReuse {
  def main(args: Array[String]) {
    val spark = SparkSessionCreate.createSession("ChurnPredictionRandomForestWithModelReuse")
    import spark.implicits._

    // Load the workflow back
    val cvModel = CrossValidatorModel.load("model/RF_model_churn/")

    val predictions = cvModel.transform(Preprocessing.testSet)
    predictions.show(10)

    val result = predictions.select("label", "prediction", "probability")
    val resutDF = result.withColumnRenamed("prediction", "Predicted_label")
    resutDF.show(10)

    val evaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("prediction")

    val accuracy = evaluator.evaluate(predictions)
    println("Accuracy: " + accuracy)
    evaluator.explainParams()

    val predictionAndLabels = predictions
      .select("prediction", "label")
      .rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))

    val metrics = new BinaryClassificationMetrics(predictionAndLabels)

    val areaUnderPR = metrics.areaUnderPR
    println("Area under the precision-recall curve: " + areaUnderPR)

    val areaUnderROC = metrics.areaUnderROC
    println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)

    val lp = predictions.select("label", "prediction")
    val counttotal = predictions.count()
    val correct = lp.filter($"label" === $"prediction").count()
    val wrong = lp.filter(not($"label" === $"prediction")).count()
    val ratioWrong = wrong.toDouble / counttotal.toDouble
    val ratioCorrect = correct.toDouble / counttotal.toDouble

    val truep = lp.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val truen = lp.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val falsep = lp.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble
    val falsen = lp.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble

    println("Total Count: " + counttotal)
    println("Correct: " + correct)
    println("Wrong: " + wrong)
    println("Ratio wrong: " + ratioWrong)
    println("Ratio correct: " + ratioCorrect)
    println("Ratio true positive: " + truep)
    println("Ratio false positive: " + falsep)
    println("Ratio true negative: " + truen)
    println("Ratio false negative: " + falsen)

    spark.stop()
  }
}