org.apache.spark.ml.evaluation.RegressionEvaluator Scala Examples
The following examples show how to use org.apache.spark.ml.evaluation.RegressionEvaluator.
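All of the examples below score predictions the same way: RegressionEvaluator compares a numeric label column against a prediction column and returns a single metric, one of "rmse" (the default), "mse", "r2", or "mae". As a quick orientation, here is a minimal, self-contained sketch of that contract. The object name and the toy data are illustrative only, not taken from any of the projects listed here.

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.SparkSession

object RegressionEvaluatorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("RegressionEvaluatorSketch")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // A toy (label, prediction) DataFrame standing in for real model output.
    val predictions = Seq((3.0, 2.5), (1.0, 0.8), (2.0, 2.2)).toDF("label", "prediction")

    // The evaluator only reads the two configured columns; extra columns are ignored.
    val evaluator = new RegressionEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("rmse")
    println(s"RMSE = ${evaluator.evaluate(predictions)}")

    // isLargerBetter tells CrossValidator/TrainValidationSplit whether to maximize
    // or minimize the metric: false for rmse/mse/mae, true for r2.
    println(s"Larger is better? ${evaluator.isLargerBetter}")

    spark.stop()
  }
}

Because isLargerBetter is false for "rmse", the tuning examples below can pass a bare new RegressionEvaluator to CrossValidator or TrainValidationSplit, and the best model is the one that minimizes RMSE.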
Example 1
Source File: ALSExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
// $example off$
import org.apache.spark.sql.SparkSession

object ALSExample {

  // $example on$
  case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long)

  def parseRating(str: String): Rating = {
    val fields = str.split("::")
    assert(fields.size == 4)
    Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
  }
  // $example off$

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("ALSExample")
      .getOrCreate()
    import spark.implicits._

    // $example on$
    val ratings = spark.read.textFile("data/mllib/als/sample_movielens_ratings.txt")
      .map(parseRating)
      .toDF()
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))

    // Build the recommendation model using ALS on the training data
    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")
    val model = als.fit(training)

    // Evaluate the model by computing the RMSE on the test data
    val predictions = model.transform(test)
    val evaluator = new RegressionEvaluator()
      .setMetricName("rmse")
      .setLabelCol("rating")
      .setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions)
    println(s"Root-mean-square error = $rmse")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 2
Source File: TrainValidationSplitExample.scala From BigDatalog with Apache License 2.0
package org.apache.spark.examples.ml

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object TrainValidationSplitExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TrainValidationSplitExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Prepare training and test data.
    val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
    val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)

    val lr = new LinearRegression()

    // We use a ParamGridBuilder to construct a grid of parameters to search over.
    // TrainValidationSplit will try all combinations of values and determine best model using
    // the evaluator.
    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept, Array(true, false))
      .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
      .build()

    // In this case the estimator is simply the linear regression.
    // A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(lr)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)

    // 80% of the data will be used for training and the remaining 20% for validation.
    trainValidationSplit.setTrainRatio(0.8)

    // Run train validation split, and choose the best set of parameters.
    val model = trainValidationSplit.fit(training)

    // Make predictions on test data. model is the model with combination of parameters
    // that performed best.
    model.transform(test)
      .select("features", "label", "prediction")
      .show()

    sc.stop()
  }
}
Example 3
Source File: RandomForestRegressorExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
// $example off$
import org.apache.spark.sql.Row
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

object RandomForestRegressorExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RandomForestRegressorExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    // Load and parse the data file, converting it to a DataFrame.
    val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    // Automatically identify categorical features, and index them.
    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(4)
      .fit(data)

    // Split the data into training and test sets (30% held out for testing).
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

    // Train a RandomForest model.
    val rf = new RandomForestRegressor()
      .setLabelCol("label")
      .setFeaturesCol("indexedFeatures")

    // Chain indexer and forest in a Pipeline.
    val pipeline = new Pipeline()
      .setStages(Array(featureIndexer, rf))

    // Train model. This also runs the indexer.
    val model = pipeline.fit(trainingData)

    // Make predictions.
    val predictions = model.transform(testData)

    // Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    // Select (prediction, true label) and compute test error.
    val evaluator = new RegressionEvaluator()
      .setLabelCol("label")
      // Name of the column holding the algorithm's predictions; defaults to "prediction".
      .setPredictionCol("prediction")
      // RMSE (root mean squared error) measures how dispersed the samples are.
      .setMetricName("rmse")
    val rmse = evaluator.evaluate(predictions)
    // Root Mean Squared Error (RMSE) on test data = 0.09854713827168428
    println("Root Mean Squared Error (RMSE) on test data = " + rmse)

    val rfModel = model.stages(1).asInstanceOf[RandomForestRegressionModel]
    println("Learned regression forest model:\n" + rfModel.toDebugString)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 4
Source File: GradientBoostedTreeRegressorExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

object GradientBoostedTreeRegressorExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("GradientBoostedTreeRegressorExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    // Load and parse the data file, converting it to a DataFrame.
    val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    // Automatically identify categorical features, and index them.
    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(4)
      .fit(data)

    // Split the data into training and test sets (30% held out for testing).
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

    // Train a GBT model.
    val gbt = new GBTRegressor()
      .setLabelCol("label")
      .setFeaturesCol("indexedFeatures")
      .setMaxIter(10)

    // Chain indexer and GBT in a Pipeline.
    val pipeline = new Pipeline()
      .setStages(Array(featureIndexer, gbt))

    // Train model. This also runs the indexer.
    val model = pipeline.fit(trainingData)

    // Make predictions.
    val predictions = model.transform(testData)

    // Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    // Select (prediction, true label) and compute test error.
    val evaluator = new RegressionEvaluator()
      .setLabelCol("label")           // label column name
      .setPredictionCol("prediction") // prediction column name
      .setMetricName("rmse")          // RMSE measures how dispersed the samples are
    val rmse = evaluator.evaluate(predictions)
    println("Root Mean Squared Error (RMSE) on test data = " + rmse)

    val gbtModel = model.stages(1).asInstanceOf[GBTRegressionModel]
    println("Learned regression GBT model:\n" + gbtModel.toDebugString)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 5
Source File: SparkXGBoostRegressorSuite.scala From sparkxgboost with Apache License 2.0
package rotationsymmetry.sxgboost

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.loss.SquareLoss
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext
import rotationsymmetry.sxgboost.utils.TestingUtils._

class SparkXGBoostRegressorSuite extends FunSuite with TestData with MLlibTestSparkContext {

  test("Compare with DecisionTree using simple data") {
    val data = sqlContext.createDataFrame(sc.parallelize(simpleData, 2))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sparkXGBoostRegressor = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(1)
      .setNumTrees(1)
    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostRegressor))
    val sXGBoostModel = sparkXGBoostPipeline.fit(data)

    val dt = new DecisionTreeRegressor()
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(1)
    val dtPipeLine = new Pipeline()
      .setStages(Array(featureIndexer, dt))
    val dtModel = dtPipeLine.fit(data)

    val evaluator = new RegressionEvaluator()
    val sXGBoostrmse = evaluator.evaluate(sXGBoostModel.transform(data))
    val dtrmse = evaluator.evaluate(dtModel.transform(data))
    assert(sXGBoostrmse ~== dtrmse relTol 1e-5)
  }

  test("Compare with DecisionTree using random data") {
    val data = sqlContext.createDataFrame(randomLabelPointRDD(sc, 40, 10, 2, 999))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sparkXGBoostRegressor = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
      .setNumTrees(1)
    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostRegressor))
    val sXGBoostModel = sparkXGBoostPipeline.fit(data)

    val dt = new DecisionTreeRegressor()
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
    val dtPipeLine = new Pipeline()
      .setStages(Array(featureIndexer, dt))
    val dtModel = dtPipeLine.fit(data)

    val evaluator = new RegressionEvaluator()
    val sXGBoostrmse = evaluator.evaluate(sXGBoostModel.transform(data))
    val dtrmse = evaluator.evaluate(dtModel.transform(data))
    assert(sXGBoostrmse ~== dtrmse relTol 1e-5)
  }
}
Example 6
Source File: SeedSuite.scala From sparkxgboost with Apache License 2.0
package rotationsymmetry.sxgboost

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.loss.SquareLoss
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext
import rotationsymmetry.sxgboost.utils.TestingUtils._

class SeedSuite extends FunSuite with MLlibTestSparkContext with TestData {

  test("Different runs with the same seed return the same result") {
    val data = sqlContext.createDataFrame(randomLabelPointRDD(sc, 100, 10, 3, 999))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sXGBoost1 = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
      .setNumTrees(1)
      .setSampleRatio(0.5)
      .setFeatureSampleRatio(0.5)
      .setSeed(999)
    val sXGBoostModel1 = sXGBoost1.fit(featureIndexer.transform(data))

    val sXGBoost2 = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
      .setNumTrees(1)
      .setSampleRatio(0.5)
      .setFeatureSampleRatio(0.5)
      .setSeed(999)
    val sXGBoostModel2 = sXGBoost2.fit(featureIndexer.transform(data))

    val sXGBoost3 = new SparkXGBoostRegressor(new SquareLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(5)
      .setNumTrees(1)
      .setSampleRatio(0.5)
      .setFeatureSampleRatio(0.5)
      .setSeed(998)
    val sXGBoostModel3 = sXGBoost3.fit(featureIndexer.transform(data))

    val evaluator = new RegressionEvaluator()
    val rmse1 = evaluator.evaluate(sXGBoostModel1.transform(featureIndexer.transform(data)))
    val rmse2 = evaluator.evaluate(sXGBoostModel2.transform(featureIndexer.transform(data)))
    val rmse3 = evaluator.evaluate(sXGBoostModel3.transform(featureIndexer.transform(data)))

    // Identical seeds must reproduce the same model; a different seed should not.
    assert(rmse1 === rmse2)
    assert(rmse1 !~= rmse3 relTol 1e-3)
  }
}
Example 7
Source File: ALSExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
// $example off$
import org.apache.spark.sql.SparkSession

object ALSExample {

  // $example on$
  case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long)

  def parseRating(str: String): Rating = {
    val fields = str.split("::")
    assert(fields.size == 4)
    Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
  }
  // $example off$

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("ALSExample")
      .getOrCreate()
    import spark.implicits._

    // $example on$
    val ratings = spark.read.textFile("data/mllib/als/sample_movielens_ratings.txt")
      .map(parseRating)
      .toDF()
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))

    // Build the recommendation model using ALS on the training data
    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")
    val model = als.fit(training)

    // Evaluate the model by computing the RMSE on the test data
    val predictions = model.transform(test)
    val evaluator = new RegressionEvaluator()
      .setMetricName("rmse")
      .setLabelCol("rating")
      .setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions)
    println(s"Root-mean-square error = $rmse")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 8
Source File: ClassifiersImpl.scala From spark_training with Apache License 2.0
package com.malaska.spark.training.machinelearning.common

import org.apache.spark.ml.classification.{DecisionTreeClassifier, GBTClassifier, LogisticRegression, NaiveBayes}
import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, RegressionEvaluator}
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.sql._

object ClassifiersImpl {

  def logisticRegression(trainingLabeledPointDf: DataFrame, testPercentage: Double): Unit = {
    val mlr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    val splits = trainingLabeledPointDf.randomSplit(Array(testPercentage, 1 - testPercentage))

    val model = mlr.fit(splits(0))
    val trainTransformed = model.transform(splits(1))

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(trainTransformed)
    println("Test set accuracy of logisticRegression = " + accuracy)
  }

  def gbtClassifer(trainingLabeledPointDf: DataFrame, testPercentage: Double): Unit = {
    val gbt = new GBTClassifier()

    val splits = trainingLabeledPointDf.randomSplit(Array(testPercentage, 1 - testPercentage))

    val model = gbt.fit(splits(0))
    val trainTransformed = model.transform(splits(1))

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(trainTransformed)
    println("Test set accuracy of gbtClassifier = " + accuracy)
  }

  def randomForestRegressor(trainingLabeledPointDf: DataFrame, impurity: String,
      maxDepth: Int, maxBins: Int, testPercentage: Double): Unit = {
    val rf = new RandomForestRegressor()
    rf.setImpurity(impurity)
    rf.setMaxDepth(maxDepth)
    rf.setMaxBins(maxBins)

    val splits = trainingLabeledPointDf.randomSplit(Array(testPercentage, 1 - testPercentage))

    val model = rf.fit(splits(0))
    val trainTransformed = model.transform(splits(1))

    // A regressor produces continuous predictions, so it is scored here with
    // RegressionEvaluator (RMSE) rather than classification accuracy.
    val evaluator = new RegressionEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("rmse")
    val rmse = evaluator.evaluate(trainTransformed)
    println("Test set RMSE of randomForestRegressor = " + rmse)
  }
}
Example 9
Source File: LinearRegression.scala From spark-sql-perf with Apache License 2.0
package com.databricks.spark.sql.perf.mllib.regression

import org.apache.spark.ml
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator

object LinearRegression extends BenchmarkAlgorithm
  with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {

  override protected def initialData(ctx: MLBenchContext) = {
    import ctx.params._
    DataGenerator.generateContinuousFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      numFeatures)
  }

  override protected def trueModel(ctx: MLBenchContext): Transformer = {
    val rng = ctx.newGenerator()
    val coefficients =
      Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
    // Small intercept to prevent some skew in the data.
    val intercept = 0.01 * (2 * rng.nextDouble - 1)
    ModelBuilderSSP.newLinearRegressionModel(coefficients, intercept)
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new ml.regression.LinearRegression()
      .setSolver("l-bfgs")
      .setRegParam(regParam)
      .setMaxIter(maxIter)
      .setTol(tol)
  }

  override protected def evaluator(ctx: MLBenchContext): Evaluator =
    new RegressionEvaluator()
}
Example 10
Source File: GLMRegression.scala From spark-sql-perf with Apache License 2.0
package com.databricks.spark.sql.perf.mllib.regression

import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator

object GLMRegression extends BenchmarkAlgorithm
  with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {

  override protected def initialData(ctx: MLBenchContext) = {
    import ctx.params._
    DataGenerator.generateContinuousFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      numFeatures)
  }

  override protected def trueModel(ctx: MLBenchContext): Transformer = {
    import ctx.params._
    val rng = ctx.newGenerator()
    val coefficients =
      Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
    // Small intercept to prevent some skew in the data.
    val intercept = 0.01 * (2 * rng.nextDouble - 1)
    val m = ModelBuilderSSP.newGLR(coefficients, intercept)
    m.set(m.link, link.get)
    m.set(m.family, family.get)
    m
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new GeneralizedLinearRegression()
      .setLink(link)
      .setFamily(family)
      .setRegParam(regParam)
      .setMaxIter(maxIter)
      .setTol(tol)
  }

  override protected def evaluator(ctx: MLBenchContext): Evaluator =
    new RegressionEvaluator()
}
Example 11
Source File: ALS.scala From spark-sql-perf with Apache License 2.0
package com.databricks.spark.sql.perf.mllib.recommendation

import org.apache.spark.ml
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql._

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, ScoringWithEvaluator}

object ALS extends BenchmarkAlgorithm with ScoringWithEvaluator {

  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    DataGenerator.generateRatings(
      ctx.sqlContext,
      numUsers,
      numItems,
      numExamples,
      numTestExamples,
      implicitPrefs = false,
      numPartitions,
      ctx.seed())._1
  }

  override def testDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    DataGenerator.generateRatings(
      ctx.sqlContext,
      numUsers,
      numItems,
      numExamples,
      numTestExamples,
      implicitPrefs = false,
      numPartitions,
      ctx.seed())._2
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new ml.recommendation.ALS()
      .setSeed(ctx.seed())
      .setRegParam(regParam)
      .setNumBlocks(numPartitions)
      .setRank(rank)
      .setMaxIter(maxIter)
  }

  override protected def evaluator(ctx: MLBenchContext): Evaluator =
    new RegressionEvaluator().setLabelCol("rating")
}
Example 12
Source File: ALSExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
// $example off$
import org.apache.spark.sql.SparkSession

object ALSExample {

  // $example on$
  case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long)

  def parseRating(str: String): Rating = {
    val fields = str.split("::")
    assert(fields.size == 4)
    Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
  }
  // $example off$

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("ALSExample")
      .getOrCreate()
    import spark.implicits._

    // $example on$
    val ratings = spark.read.textFile("data/mllib/als/sample_movielens_ratings.txt")
      .map(parseRating)
      .toDF()
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))

    // Build the recommendation model using ALS on the training data
    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")
    val model = als.fit(training)

    // Evaluate the model by computing the RMSE on the test data
    val predictions = model.transform(test)
    val evaluator = new RegressionEvaluator()
      .setMetricName("rmse")
      .setLabelCol("rating")
      .setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions)
    println(s"Root-mean-square error = $rmse")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 13
Source File: ALSModeling.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package com.spark.recommendation

import com.spark.recommendation.FeatureExtraction.{Rating, parseRating}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.{Row, DataFrame, DataFrameWriter}

object ALSModeling {

  def createALSModel() {
    val ratings = FeatureExtraction.getFeatures()
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))
    println(training.first())

    // Build the recommendation model using ALS on the training data
    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")
    val model = als.fit(training)

    println(model.userFactors.count())
    println(model.itemFactors.count())

    val predictions = model.transform(test)
    // printSchema() prints directly and returns Unit, so call it on its own
    // rather than wrapping it in println(...).
    predictions.printSchema()

    val evaluator = new RegressionEvaluator()
      .setMetricName("rmse")
      .setLabelCol("rating")
      .setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions)
    println(s"Root-mean-square error = $rmse")
  }

  def main(args: Array[String]) {
    createALSModel()
  }
}
Example 14
Source File: ALSGenericExample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
// scalastyle:off println
package org.sparksamples.als

import org.apache.spark.SparkConf
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS

object ALSGenericExample {

  val SPARK_PATH = "/home/ubuntu/work/spark-2.0.0-bin-hadoop2.7/"

  case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long)

  def parseRating(str: String): Rating = {
    val fields = str.split("::")
    assert(fields.size == 4)
    Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
  }

  def main(args: Array[String]) {
    import org.apache.spark.sql.SparkSession
    val spConfig = (new SparkConf).setMaster("local[1]").setAppName("SparkApp")
      .set("spark.driver.allowMultipleContexts", "true")
    val spark = SparkSession
      .builder()
      .appName("Spark SQL Example")
      .config(spConfig)
      .getOrCreate()
    import spark.implicits._

    // Create an RDD of Rating objects from a text file, then convert it to a DataFrame
    val ratings = spark.sparkContext
      .textFile(SPARK_PATH + "data/mllib/als/sample_movielens_ratings.txt")
      .map(_.split("::"))
      .map(lineSplit => Rating(lineSplit(0).toInt, lineSplit(1).toInt,
        lineSplit(2).toFloat, lineSplit(3).toLong))
      .toDF()
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))

    // Build the recommendation model using ALS on the training data
    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")
    val model = als.fit(training)

    // Evaluate the model by computing the RMSE on the test data
    val predictions = model.transform(test)
    val evaluator = new RegressionEvaluator()
      .setMetricName("rmse")
      .setLabelCol("rating")
      .setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions)
    println(s"Root-mean-square error = $rmse")

    spark.stop()
  }
}
Example 15
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame

object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x =>
      if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect / totalPoints
    println("Accuracy of LogisticRegression is: ", accuracy)
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
      regressionMetrics: RegressionMetrics, filePath: String) = {
    println("Mean Squared Error:", regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError)

    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 16
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame

object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x =>
      if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect / totalPoints
    println("Accuracy of LogisticRegression is: ", accuracy)

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/LR.xls")
    holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/Actual.xls")
    savePredictions(holdout, dataFrame, rm,
      "/home/ubuntu/work/ml-resources/spark-ml/results/LogisticRegression.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
      regressionMetrics: RegressionMetrics, filePath: String) = {
    println("Mean Squared Error:", regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError)

    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 17
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame

object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x =>
      if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect / totalPoints
    println("Accuracy of LogisticRegression is: ", accuracy)

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LR.xls")
    holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/Actual.xls")
    savePredictions(holdout, dataFrame, rm,
      "/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LogisticRegression.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
      regressionMetrics: RegressionMetrics, filePath: String) = {
    println("Mean Squared Error:", regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError)

    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 18
Source File: GPExample.scala From spark-gp with Apache License 2.0
package org.apache.spark.ml.regression.examples

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.GaussianProcessRegression
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.{DataFrame, SparkSession}

trait GPExample {
  def name: String

  val spark = SparkSession.builder().appName(name).master("local[4]").getOrCreate()

  def cv(gp: GaussianProcessRegression, instances: DataFrame, expectedRMSE: Double) = {
    val cv = new CrossValidator()
      .setEstimator(gp)
      .setEvaluator(new RegressionEvaluator())
      .setEstimatorParamMaps(new ParamGridBuilder().build())
      .setNumFolds(10)

    val rmse = cv.fit(instances).avgMetrics.head
    println("RMSE: " + rmse)
    assert(rmse < expectedRMSE)
  }
}
Example 19
Source File: L9-17MLCrossValidation.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object MLCrossValidationApp {

  case class Activity(label: Double,
    accelXHand: Double, accelYHand: Double, accelZHand: Double,
    accelXChest: Double, accelYChest: Double, accelZChest: Double,
    accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: MLCrossValidationApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) == "4" || f(1) == "5")
      .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38)))
      .map(f => f.map(v => v.toDouble))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty) {
          val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4),
            x(5), x(6), x(7), x(8), x(9))).toDF()
          val split = accelerometer.randomSplit(Array(0.3, 0.7))
          val test = split(0)
          val train = split(1)

          val assembler = new VectorAssembler()
            .setInputCols(Array(
              "accelXHand", "accelYHand", "accelZHand",
              "accelXChest", "accelYChest", "accelZChest",
              "accelXAnkle", "accelYAnkle", "accelZAnkle"))
            .setOutputCol("vectors")
          val normalizer = new Normalizer()
            .setInputCol(assembler.getOutputCol)
            .setOutputCol("features")
          val regressor = new RandomForestRegressor()
          val pipeline = new Pipeline()
            .setStages(Array(assembler, normalizer, regressor))

          val validator = new CrossValidator()
            .setEstimator(pipeline)
            .setEvaluator(new RegressionEvaluator)
          val pGrid = new ParamGridBuilder()
            .addGrid(normalizer.p, Array(1.0, 5.0, 10.0))
            .addGrid(regressor.numTrees, Array(10, 50, 100))
            .build()
          validator.setEstimatorParamMaps(pGrid)
          validator.setNumFolds(5)

          val bestModel = validator.fit(train)
          val prediction = bestModel.transform(test)
          prediction.show()
        }
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 20
Source File: TrainValidationSplitParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.validation

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.sql.DataFrame

class TrainValidationSplitParitySpec extends SparkParityBase {
  override val dataset: DataFrame =
    baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = {
    val regressor = new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction")

    val paramGrid = new ParamGridBuilder()
      .addGrid(regressor.numTrees, Array(2, 3, 4))
      .build()

    new Pipeline().setStages(Array(
      new StringIndexer().
        setInputCol("fico_score_group_fnl").
        setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new TrainValidationSplit().
        setEvaluator(new RegressionEvaluator().
          setLabelCol("loan_amount").
          setPredictionCol("prediction")).
        setEstimator(regressor).
        setEstimatorParamMaps(paramGrid))).fit(dataset)
  }

  override val ignoreSerializationTest = true
}
Example 21
Source File: CrossValidatorParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.validation

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.{DecisionTreeRegressor, RandomForestRegressor}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.DataFrame

class CrossValidatorParitySpec extends SparkParityBase {
  override val dataset: DataFrame =
    baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = {
    val regressor = new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction")

    val paramGrid = new ParamGridBuilder()
      .addGrid(regressor.numTrees, Array(2, 3, 4))
      .build()

    new Pipeline().setStages(Array(
      new StringIndexer().
        setInputCol("fico_score_group_fnl").
        setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new CrossValidator().
        setEvaluator(new RegressionEvaluator().
          setLabelCol("loan_amount").
          setPredictionCol("prediction")).
        setEstimator(regressor).
        setEstimatorParamMaps(paramGrid))).fit(dataset)
  }

  override val ignoreSerializationTest = true
}