org.apache.spark.ml.tuning.TrainValidationSplit Scala Examples
The following examples show how to use org.apache.spark.ml.tuning.TrainValidationSplit, Spark ML's model-selection utility that evaluates each candidate ParamMap on a single train/validation split and refits the estimator with the best one.
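Before the project examples, here is a minimal hedged sketch of that core pattern; the object name, app name, input path, and grid values are placeholders, not code taken from any project below.

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.sql.SparkSession

object TrainValidationSplitSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("tvs-sketch").getOrCreate()

    // Placeholder input: any DataFrame with "features" and "label" columns.
    val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    val lr = new LinearRegression()

    // Every combination of the listed values becomes one candidate ParamMap.
    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .build()

    // One 80/20 split: cheaper than CrossValidator, but a noisier metric estimate.
    val tvs = new TrainValidationSplit()
      .setEstimator(lr)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setTrainRatio(0.8)

    val model = tvs.fit(data) // TrainValidationSplitModel, wraps the best fit
    model.transform(data).select("features", "label", "prediction").show()

    spark.stop()
  }
}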
Example 1
Source File: TrainValidationSplitParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.validation

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.sql.DataFrame

class TrainValidationSplitParitySpec extends SparkParityBase {
  override val dataset: DataFrame =
    baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = {
    val regressor = new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction")

    val paramGrid = new ParamGridBuilder()
      .addGrid(regressor.numTrees, Array(2, 3, 4))
      .build()

    new Pipeline().setStages(Array(
      new StringIndexer().
        setInputCol("fico_score_group_fnl").
        setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new TrainValidationSplit().
        setEvaluator(new RegressionEvaluator().
          setLabelCol("loan_amount").
          setPredictionCol("prediction")).
        setEstimator(regressor).
        setEstimatorParamMaps(paramGrid))).fit(dataset)
  }

  override val ignoreSerializationTest = true
}
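When TrainValidationSplit is used as a pipeline stage, as above, the fitted PipelineModel carries a TrainValidationSplitModel in place of that stage; a hedged sketch of digging the winning forest out of it (the helper name and the fixed stage index are assumptions based on the three-stage pipeline above):

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.regression.RandomForestRegressionModel
import org.apache.spark.ml.tuning.TrainValidationSplitModel

// Hypothetical helper: stage 2 is the TrainValidationSplit in the
// three-stage pipeline above (StringIndexer, VectorAssembler, split).
def bestForest(fitted: PipelineModel): RandomForestRegressionModel =
  fitted.stages(2).asInstanceOf[TrainValidationSplitModel]
    .bestModel.asInstanceOf[RandomForestRegressionModel]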
Example 2
Source File: MNIST.scala From spark-gp with Apache License 2.0
package org.apache.spark.ml.classification.examples

import org.apache.spark.ml.classification.GaussianProcessClassifier
import org.apache.spark.ml.commons.kernel.RBFKernel
import org.apache.spark.ml.commons.util.Scaling
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object MNIST extends App with Scaling {
  val name = "MNIST"
  val spark = SparkSession.builder().appName(name).master(s"local[${args(0)}]").getOrCreate()
  val path = args(1)
  val parallelism = args(0).toInt * 4
  val forExpert = args(2).toInt
  val activeSet = args(3).toInt

  import spark.sqlContext.implicits._

  // Column _c0 holds the digit label; the remaining CSV columns are pixel features.
  val dataset = (scale _ andThen labels201 _)(spark.read.format("csv").load(path).rdd.map(row => {
    val features = Vectors.dense((1 until row.length).map("_c" + _)
      .map(row.getAs[String]).map(_.toDouble).toArray)
    val label = row.getAs[String]("_c0").toDouble
    LabeledPoint(label, features)
  }).cache()).toDF.repartition(parallelism).cache()

  val gp = new GaussianProcessClassifier()
    .setDatasetSizeForExpert(forExpert)
    .setActiveSetSize(activeSet)
    .setKernel(() => new RBFKernel(10))
    .setTol(1e-3)

  val cv = new TrainValidationSplit()
    .setEstimator(gp)
    .setEvaluator(new MulticlassClassificationEvaluator().setMetricName("accuracy"))
    .setEstimatorParamMaps(new ParamGridBuilder().build())
    .setTrainRatio(0.8)

  println("Accuracy: " + cv.fit(dataset).validationMetrics.toList)

  // Remap arbitrary label values to contiguous indices 0..k-1.
  def labels201(data: RDD[LabeledPoint]): RDD[LabeledPoint] = {
    val old2new = data.map(_.label).distinct().collect().zipWithIndex.toMap
    data.map(lp => LabeledPoint(old2new(lp.label), lp.features))
  }
}
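One detail worth noting: new ParamGridBuilder().build() yields a single empty ParamMap, so the split above tunes nothing and simply reports held-out accuracy for one fixed GaussianProcessClassifier. With a non-trivial grid, validationMetrics lines up index-for-index with the param maps; a small hedged sketch of printing that pairing (the helper name is made up):

import org.apache.spark.ml.tuning.TrainValidationSplitModel

// Hypothetical helper: validationMetrics(i) is the held-out score for
// getEstimatorParamMaps(i), so zipping the two shows each grid point's result.
def reportMetrics(model: TrainValidationSplitModel): Unit =
  model.getEstimatorParamMaps.zip(model.validationMetrics).foreach {
    case (params, metric) => println(s"$params -> $metric")
  }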
Example 3
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame

object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x =>
      if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect / totalPoints
    println("Accuracy of LogisticRegression is: " + accuracy)

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LR.xls")
    holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/Actual.xls")
    savePredictions(holdout, dataFrame, rm,
      "/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LogisticRegression.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
                      regressionMetrics: RegressionMetrics, filePath: String) = {
    println("Mean Squared Error: " + regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error: " + regressionMetrics.rootMeanSquaredError)
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
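A quirk of this example (and its two variants below) is that a RegressionEvaluator drives model selection for a classifier, with accuracy recomputed by hand afterwards. A hedged alternative sketch that lets the split select on accuracy directly, assuming the default "label" and "prediction" columns used above; the helper name is made up:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.TrainValidationSplit

// Hypothetical helper: same estimator and grid as the example above, but
// the evaluator scores accuracy, so selection and the reported metric agree.
def accuracyValidator(pipeline: Pipeline, paramGrid: Array[ParamMap]): TrainValidationSplit =
  new TrainValidationSplit()
    .setEstimator(pipeline)
    .setEvaluator(new MulticlassClassificationEvaluator().setMetricName("accuracy"))
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.8)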
Example 4
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame

object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x =>
      if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect / totalPoints
    println("Accuracy of LogisticRegression is: " + accuracy)

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/LR.xls")
    holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/Actual.xls")
    savePredictions(holdout, dataFrame, rm,
      "/home/ubuntu/work/ml-resources/spark-ml/results/LogisticRegression.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
                      regressionMetrics: RegressionMetrics, filePath: String) = {
    println("Mean Squared Error: " + regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error: " + regressionMetrics.rootMeanSquaredError)
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 5
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.DataFrame

object LogisticRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val lr = new LogisticRegression()

    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0))
      .build()

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr))

    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)
      // 80% of the data will be used for training and the remaining 20% for validation.
      .setTrainRatio(0.8)

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    //val model = trainValidationSplit.fit(training)
    val model = trainValidationSplit.fit(dataFrame)

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val totalPoints = dataFrame.count()
    val lrTotalCorrect = holdout.rdd.map(x =>
      if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum()
    val accuracy = lrTotalCorrect / totalPoints
    println("Accuracy of LogisticRegression is: " + accuracy)
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
                      regressionMetrics: RegressionMetrics, filePath: String) = {
    println("Mean Squared Error: " + regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error: " + regressionMetrics.rootMeanSquaredError)
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 6
Source File: ACMEModel.scala From cdsw-simple-serving with Apache License 2.0
// Don't execute these lines in the workbench -- skip to "Start workbench session"
package acme

import com.cloudera.datascience.cdsw.acme.ACMEData
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import scala.util.Random

object ACMEModel {
  def buildModel(): PipelineModel = {
    // Start workbench session

    // Read and cache training data prepared from acme-dataeng:
    val training = ACMEData.readData()
    training.cache()
    training.show()

    // Build a logistic regression model,
    val assembler = new VectorAssembler().
      setInputCols(training.columns.filter(_ != "Occupancy")).
      setOutputCol("featureVec")
    val lr = new LogisticRegression().
      setFeaturesCol("featureVec").
      setLabelCol("Occupancy").
      setRawPredictionCol("rawPrediction")
    val pipeline = new Pipeline().setStages(Array(assembler, lr))

    // and tune that model:
    val paramGrid = new ParamGridBuilder().
      addGrid(lr.regParam, Seq(0.00001, 0.001, 0.1)).
      addGrid(lr.elasticNetParam, Seq(1.0)).
      build()
    val eval = new BinaryClassificationEvaluator().
      setLabelCol("Occupancy").
      setRawPredictionCol("rawPrediction")
    val validator = new TrainValidationSplit().
      setSeed(Random.nextLong()).
      setEstimator(pipeline).
      setEvaluator(eval).
      setEstimatorParamMaps(paramGrid).
      setTrainRatio(0.9)

    val validatorModel = validator.fit(training)
    val pipelineModel = validatorModel.bestModel.asInstanceOf[PipelineModel]
    val lrModel = pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]

    // Logistic regression model parameters:
    training.columns.zip(lrModel.coefficients.toArray).foreach(println)
    // Model hyperparameters:
    lrModel.getElasticNetParam
    lrModel.getRegParam
    // Validation metric (accuracy):
    validatorModel.validationMetrics.max

    pipelineModel
    // End workbench session
  }
}
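Given the serving theme of this project, a natural follow-on is persisting the selected PipelineModel so a separate process can score with it. A minimal hedged sketch, assuming pipelineModel from the example above; the path is a placeholder:

import org.apache.spark.ml.PipelineModel

// Placeholder path; any writable local/HDFS location works.
pipelineModel.write.overwrite().save("/tmp/acme-occupancy-model")

// A scoring process can then reload the full pipeline:
val reloaded = PipelineModel.load("/tmp/acme-occupancy-model")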
Example 7
Source File: TrainValidationSplitExample.scala From BigDatalog with Apache License 2.0
package org.apache.spark.examples.ml

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object TrainValidationSplitExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TrainValidationSplitExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Prepare training and test data.
    val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
    val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)

    val lr = new LinearRegression()

    // We use a ParamGridBuilder to construct a grid of parameters to search over.
    // TrainValidationSplit will try all combinations of values and determine best model using
    // the evaluator.
    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept, Array(true, false))
      .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
      .build()

    // In this case the estimator is simply the linear regression.
    // A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(lr)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)

    // 80% of the data will be used for training and the remaining 20% for validation.
    trainValidationSplit.setTrainRatio(0.8)

    // Run train validation split, and choose the best set of parameters.
    val model = trainValidationSplit.fit(training)

    // Make predictions on test data. model is the model with combination of parameters
    // that performed best.
    model.transform(test)
      .select("features", "label", "prediction")
      .show()

    sc.stop()
  }
}
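To see which grid point actually won, the fitted TrainValidationSplitModel exposes bestModel; a hedged sketch of reading the chosen hyperparameters back out (the helper name is made up):

import org.apache.spark.ml.regression.LinearRegressionModel
import org.apache.spark.ml.tuning.TrainValidationSplitModel

// Hypothetical helper: `model` is the TrainValidationSplitModel fit above;
// bestModel is the LinearRegressionModel refit with the winning ParamMap.
def describeBest(model: TrainValidationSplitModel): Unit = {
  val best = model.bestModel.asInstanceOf[LinearRegressionModel]
  println(s"regParam=${best.getRegParam}, " +
    s"elasticNetParam=${best.getElasticNetParam}, " +
    s"fitIntercept=${best.getFitIntercept}")
}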