org.apache.spark.ml.tuning.CrossValidator Scala Examples
The following examples show how to use org.apache.spark.ml.tuning.CrossValidator.
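Before the project-specific examples, here is a minimal, self-contained sketch of the usual CrossValidator workflow (pipeline estimator, parameter grid, evaluator, fold count, fit). The tiny in-memory dataset and the column names are illustrative assumptions only; they are not taken from any of the projects that follow.

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}
import org.apache.spark.sql.SparkSession

object MinimalCrossValidation extends App {
  val spark = SparkSession.builder.appName("MinimalCrossValidation").master("local[*]").getOrCreate()

  // Toy, linearly separable data: (x, label) pairs -- purely illustrative.
  val df = spark.createDataFrame(Seq(
    (1.0, 0.0), (1.5, 0.0), (2.0, 0.0), (2.5, 0.0), (3.0, 0.0), (3.5, 0.0),
    (7.0, 1.0), (7.5, 1.0), (8.0, 1.0), (8.5, 1.0), (9.0, 1.0), (9.5, 1.0)
  )).toDF("x", "label")

  val assembler = new VectorAssembler().setInputCols(Array("x")).setOutputCol("features")
  val lr = new LogisticRegression().setMaxIter(10)
  val pipeline = new Pipeline().setStages(Array(assembler, lr))

  // Grid of candidate hyperparameters to search over.
  val grid = new ParamGridBuilder()
    .addGrid(lr.regParam, Array(0.0, 0.1))
    .build()

  // k-fold cross validation over the grid, scored with area under ROC by default.
  val cv = new CrossValidator()
    .setEstimator(pipeline)
    .setEvaluator(new BinaryClassificationEvaluator())
    .setEstimatorParamMaps(grid)
    .setNumFolds(3)

  val model: CrossValidatorModel = cv.fit(df)
  println(model.avgMetrics.toSeq) // average metric per parameter map
  spark.stop()
}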
Example 1
Source File: ModelPersistence.scala From reactive-machine-learning-systems with MIT License
package com.reactivemachinelearning

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{QuantileDiscretizer, VectorAssembler}
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}
import org.apache.spark.sql.SparkSession

object ModelPersistence extends App {

  val session = SparkSession.builder.appName("ModelPersistence").getOrCreate()

  val data = Seq(
    (0, 18.0, 0), (1, 20.0, 0), (2, 8.0, 1), (3, 5.0, 1), (4, 2.0, 0),
    (5, 21.0, 0), (6, 7.0, 1), (7, 18.0, 0), (8, 3.0, 1), (9, 22.0, 0),
    (10, 8.0, 1), (11, 2.0, 0), (12, 5.0, 1), (13, 4.0, 1), (14, 1.0, 0),
    (15, 11.0, 0), (16, 7.0, 1), (17, 15.0, 0), (18, 3.0, 1), (19, 20.0, 0))

  val instances = session.createDataFrame(data)
    .toDF("id", "seeds", "label")

  val discretizer = new QuantileDiscretizer()
    .setInputCol("seeds")
    .setOutputCol("discretized")
    .setNumBuckets(3)

  val assembler = new VectorAssembler()
    .setInputCols(Array("discretized"))
    .setOutputCol("features")

  val classifier = new LogisticRegression()
    .setMaxIter(5)

  val pipeline = new Pipeline()
    .setStages(Array(discretizer, assembler, classifier))

  val paramMaps = new ParamGridBuilder()
    .addGrid(classifier.regParam, Array(0.0, 0.1))
    .build()

  val evaluator = new BinaryClassificationEvaluator()

  val crossValidator = new CrossValidator()
    .setEstimator(pipeline)
    .setEvaluator(evaluator)
    .setNumFolds(2)
    .setEstimatorParamMaps(paramMaps)

  val model = crossValidator.fit(instances)

  model.write.overwrite().save("my-model")

  val persistedModel = CrossValidatorModel.load("./my-model")
  println(s"UID: ${persistedModel.uid}")
}
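Once the model has been reloaded, CrossValidatorModel also exposes the winning pipeline and the per-grid-point metrics directly. A short follow-on sketch, assuming the persistedModel value from the example above:

// Continuing the example: persistedModel is the reloaded CrossValidatorModel.
import org.apache.spark.ml.PipelineModel

val bestPipeline = persistedModel.bestModel.asInstanceOf[PipelineModel]
bestPipeline.stages.foreach(println)                  // fitted discretizer, assembler and logistic regression
println(persistedModel.avgMetrics.toSeq)              // mean evaluator score for each ParamMap
println(persistedModel.getEstimatorParamMaps.length)  // number of grid points evaluated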
Example 2
Source File: RandomForestModelReuse.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML

import org.apache.spark.ml.regression.{RandomForestRegressor, RandomForestRegressionModel}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.mllib.evaluation.RegressionMetrics

object RandomForestModelReuse {
  def main(args: Array[String]) {
    val spark = SparkSessionCreate.createSession()
    import spark.implicits._

    // Load the workflow back
    val cvModel = CrossValidatorModel.load("model/RF_model/")

    // *****************************************
    println("Run prediction over test dataset")
    // *****************************************
    // Predicts and saves file ready for Kaggle!
    //if(!params.outputFile.isEmpty){
    cvModel.transform(Preproessing.testData)
      .select("id", "prediction")
      .withColumnRenamed("prediction", "loss")
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save("output/result_RF_reuse.csv")

    spark.stop()
  }
}
Example 3
Source File: ChurnPredictionSVM.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.max
import org.apache.spark.ml.classification.{LinearSVC, LinearSVCModel}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

object ChurnPredictionSVM {
  def main(args: Array[String]) {
    val spark: SparkSession = SparkSessionCreate.createSession("ChurnPredictionSVM")
    import spark.implicits._

    val numFolds = 10
    val MaxIter: Seq[Int] = Seq(1000)
    val RegParam: Seq[Double] = Seq(0.10) // L2 regularization parameter, set to 0.10
    val Tol: Seq[Double] = Seq(1e-4)
    val ElasticNetParam: Seq[Double] = Seq(0.00001) // Combination of L1 and L2

    val svm = new LinearSVC()

    // Chain the indexers, the assembler and the linear SVM in a Pipeline.
    val pipeline = new Pipeline()
      .setStages(Array(PipelineConstruction.ipindexer,
        PipelineConstruction.labelindexer,
        PipelineConstruction.assembler,
        svm))

    // Search through the SVM's hyperparameters for the best model
    val paramGrid = new ParamGridBuilder()
      .addGrid(svm.maxIter, MaxIter)
      .addGrid(svm.regParam, RegParam)
      .addGrid(svm.tol, Tol)
      .build()

    val evaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("prediction")

    // Set up 10-fold cross validation
    val crossval = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(numFolds)

    val cvModel = crossval.fit(Preprocessing.trainDF)

    val predictions = cvModel.transform(Preprocessing.testSet)
    val selectPrediction = predictions.select("label", "features", "rawPrediction", "prediction")
    selectPrediction.show(10)

    val accuracy = evaluator.evaluate(predictions)
    println("Classification accuracy: " + accuracy)

    // Compute other performance metrics
    val predictionAndLabels = predictions
      .select("prediction", "label")
      .rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))

    val metrics = new BinaryClassificationMetrics(predictionAndLabels)
    val areaUnderPR = metrics.areaUnderPR
    println("Area under the precision-recall curve: " + areaUnderPR)
    val areaUnderROC = metrics.areaUnderROC
    println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)

    val lp = predictions.select("label", "prediction")
    val counttotal = predictions.count()
    val correct = lp.filter($"label" === $"prediction").count()
    val wrong = lp.filter(not($"label" === $"prediction")).count()
    val ratioWrong = wrong.toDouble / counttotal.toDouble
    val ratioCorrect = correct.toDouble / counttotal.toDouble
    val truep = lp.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val truen = lp.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val falsep = lp.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble
    val falsen = lp.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble

    println("Total Count: " + counttotal)
    println("Correct: " + correct)
    println("Wrong: " + wrong)
    println("Ratio wrong: " + ratioWrong)
    println("Ratio correct: " + ratioCorrect)
    println("Ratio true positive: " + truep)
    println("Ratio false positive: " + falsep)
    println("Ratio true negative: " + truen)
    println("Ratio false negative: " + falsen)
  }
}
Example 4
Source File: Describe.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.max
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation

object Describe {

  case class CustomerAccount(state_code: String, account_length: Integer, area_code: String,
    international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
    total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
    total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
    total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
    total_international_mins: Double, total_international_calls: Double, total_international_charge: Double,
    total_international_num_calls: Double, churn: String)

  val schema = StructType(Array(
    StructField("state_code", StringType, true),
    StructField("account_length", IntegerType, true),
    StructField("area_code", StringType, true),
    StructField("international_plan", StringType, true),
    StructField("voice_mail_plan", StringType, true),
    StructField("num_voice_mail", DoubleType, true),
    StructField("total_day_mins", DoubleType, true),
    StructField("total_day_calls", DoubleType, true),
    StructField("total_day_charge", DoubleType, true),
    StructField("total_evening_mins", DoubleType, true),
    StructField("total_evening_calls", DoubleType, true),
    StructField("total_evening_charge", DoubleType, true),
    StructField("total_night_mins", DoubleType, true),
    StructField("total_night_calls", DoubleType, true),
    StructField("total_night_charge", DoubleType, true),
    StructField("total_international_mins", DoubleType, true),
    StructField("total_international_calls", DoubleType, true),
    StructField("total_international_charge", DoubleType, true),
    StructField("total_international_num_calls", DoubleType, true),
    StructField("churn", StringType, true)))

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Describe")
      .getOrCreate()

    spark.conf.set("spark.debug.maxToStringFields", 10000)
    val DEFAULT_MAX_TO_STRING_FIELDS = 2500
    if (SparkEnv.get != null) {
      SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS)
    } else {
      DEFAULT_MAX_TO_STRING_FIELDS
    }
    import spark.implicits._

    val trainSet: Dataset[CustomerAccount] = spark.read
      .option("inferSchema", "false")
      .format("com.databricks.spark.csv")
      .schema(schema)
      .load("data/churn-bigml-80.csv")
      .as[CustomerAccount]

    val statsDF = trainSet.describe()
    statsDF.show()

    trainSet.createOrReplaceTempView("UserAccount")
    spark.catalog.cacheTable("UserAccount")

    spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()
    spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()
    trainSet.groupBy("churn").count.show()
    spark.sqlContext.sql("SELECT churn, SUM(total_international_num_calls) FROM UserAccount GROUP BY churn").show()
  }
}
Example 5
Source File: ChurnPredictionLR.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

object ChurnPredictionLR {
  def main(args: Array[String]) {
    val spark: SparkSession = SparkSessionCreate.createSession("ChurnPredictionLogisticRegression")
    import spark.implicits._

    val numFolds = 10
    val MaxIter: Seq[Int] = Seq(100)
    val RegParam: Seq[Double] = Seq(1.0) // L2 regularization parameter
    val Tol: Seq[Double] = Seq(1e-8)
    val ElasticNetParam: Seq[Double] = Seq(1.0) // Elastic net mixing parameter (combination of L1 and L2)

    val lr = new LogisticRegression()
      .setLabelCol("label")
      .setFeaturesCol("features")

    // Chain the indexers, the assembler and the logistic regression in a Pipeline.
    val pipeline = new Pipeline()
      .setStages(Array(PipelineConstruction.ipindexer,
        PipelineConstruction.labelindexer,
        PipelineConstruction.assembler,
        lr))

    // Search through the logistic regression's hyperparameters for the best model
    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.maxIter, MaxIter)
      .addGrid(lr.regParam, RegParam)
      .addGrid(lr.tol, Tol)
      .addGrid(lr.elasticNetParam, ElasticNetParam)
      .build()

    val evaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("prediction")

    // Set up 10-fold cross validation
    val crossval = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(numFolds)

    val cvModel = crossval.fit(Preprocessing.trainDF)

    val predictions = cvModel.transform(Preprocessing.testSet)
    val result = predictions.select("label", "prediction", "probability")
    val resultDF = result.withColumnRenamed("prediction", "Predicted_label")
    resultDF.show(10)

    val accuracy = evaluator.evaluate(predictions)
    println("Classification accuracy: " + accuracy)

    // Compute other performance metrics
    val predictionAndLabels = predictions
      .select("prediction", "label")
      .rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))

    val metrics = new BinaryClassificationMetrics(predictionAndLabels)
    val areaUnderPR = metrics.areaUnderPR
    println("Area under the precision-recall curve: " + areaUnderPR)
    val areaUnderROC = metrics.areaUnderROC
    println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)

    val lp = predictions.select("label", "prediction")
    val counttotal = predictions.count()
    val correct = lp.filter($"label" === $"prediction").count()
    val wrong = lp.filter(not($"label" === $"prediction")).count()
    val ratioWrong = wrong.toDouble / counttotal.toDouble
    val ratioCorrect = correct.toDouble / counttotal.toDouble
    val truep = lp.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val truen = lp.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val falsep = lp.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble
    val falsen = lp.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble

    println("Total Count: " + counttotal)
    println("Correct: " + correct)
    println("Wrong: " + wrong)
    println("Ratio wrong: " + ratioWrong)
    println("Ratio correct: " + ratioCorrect)
    println("Ratio true positive: " + truep)
    println("Ratio false positive: " + falsep)
    println("Ratio true negative: " + truen)
    println("Ratio false negative: " + falsen)
  }
}
Example 6
Source File: RandomForestModelReuse.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark.ml.regression.{RandomForestRegressor, RandomForestRegressionModel}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

object RandomForestModelReuse {
  def main(args: Array[String]) {
    val spark = SparkSessionCreate.createSession("ChurnPredictionRandomForestWithModelReuse")
    import spark.implicits._

    // Load the workflow back
    val cvModel = CrossValidatorModel.load("model/RF_model_churn/")

    val predictions = cvModel.transform(Preprocessing.testSet)
    predictions.show(10)

    val result = predictions.select("label", "prediction", "probability")
    val resultDF = result.withColumnRenamed("prediction", "Predicted_label")
    resultDF.show(10)

    val evaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("prediction")

    val accuracy = evaluator.evaluate(predictions)
    println("Accuracy: " + accuracy)
    evaluator.explainParams()

    val predictionAndLabels = predictions
      .select("prediction", "label")
      .rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))

    val metrics = new BinaryClassificationMetrics(predictionAndLabels)
    val areaUnderPR = metrics.areaUnderPR
    println("Area under the precision-recall curve: " + areaUnderPR)
    val areaUnderROC = metrics.areaUnderROC
    println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)

    val lp = predictions.select("label", "prediction")
    val counttotal = predictions.count()
    val correct = lp.filter($"label" === $"prediction").count()
    val wrong = lp.filter(not($"label" === $"prediction")).count()
    val ratioWrong = wrong.toDouble / counttotal.toDouble
    val ratioCorrect = correct.toDouble / counttotal.toDouble
    val truep = lp.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val truen = lp.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val falsep = lp.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble
    val falsen = lp.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble

    println("Total Count: " + counttotal)
    println("Correct: " + correct)
    println("Wrong: " + wrong)
    println("Ratio wrong: " + ratioWrong)
    println("Ratio correct: " + ratioCorrect)
    println("Ratio true positive: " + truep)
    println("Ratio false positive: " + falsep)
    println("Ratio true negative: " + truen)
    println("Ratio false negative: " + falsen)

    spark.stop()
  }
}
Example 7
Source File: CrossValidatorParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.validation

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.{DecisionTreeRegressor, RandomForestRegressor}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.DataFrame

class CrossValidatorParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = {
    val regressor = new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction")

    val paramGrid = new ParamGridBuilder()
      .addGrid(regressor.numTrees, Array(2, 3, 4))
      .build()

    new Pipeline().setStages(Array(
      new StringIndexer().
        setInputCol("fico_score_group_fnl").
        setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new CrossValidator().
        setEvaluator(new RegressionEvaluator().
          setLabelCol("loan_amount").
          setPredictionCol("prediction")).
        setEstimator(regressor).
        setEstimatorParamMaps(paramGrid))).fit(dataset)
  }

  override val ignoreSerializationTest = true
}
Example 8
Source File: L9-17MLCrossValidation.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object MLCrossValidationApp {

  case class Activity(label: Double,
    accelXHand: Double, accelYHand: Double, accelZHand: Double,
    accelXChest: Double, accelYChest: Double, accelZChest: Double,
    accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: MLCrossValidationApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) == "4" || f(1) == "5")
      .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38)))
      .map(f => f.map(v => v.toDouble))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty) {
          val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4),
            x(5), x(6), x(7), x(8), x(9))).toDF()
          val split = accelerometer.randomSplit(Array(0.3, 0.7))
          val test = split(0)
          val train = split(1)

          val assembler = new VectorAssembler()
            .setInputCols(Array(
              "accelXHand", "accelYHand", "accelZHand",
              "accelXChest", "accelYChest", "accelZChest",
              "accelXAnkle", "accelYAnkle", "accelZAnkle"))
            .setOutputCol("vectors")
          val normalizer = new Normalizer()
            .setInputCol(assembler.getOutputCol)
            .setOutputCol("features")
          val regressor = new RandomForestRegressor()

          val pipeline = new Pipeline()
            .setStages(Array(assembler, normalizer, regressor))

          val validator = new CrossValidator()
            .setEstimator(pipeline)
            .setEvaluator(new RegressionEvaluator)
          val pGrid = new ParamGridBuilder()
            .addGrid(normalizer.p, Array(1.0, 5.0, 10.0))
            .addGrid(regressor.numTrees, Array(10, 50, 100))
            .build()
          validator.setEstimatorParamMaps(pGrid)
          validator.setNumFolds(5)

          val bestModel = validator.fit(train)
          val prediction = bestModel.transform(test)
          prediction.show()
        }
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 9
Source File: CrossValidation.scala From Scala-for-Machine-Learning-Second-Edition with MIT License
package org.scalaml.spark.mlpipeline

import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel}
import org.apache.spark.ml.{Model, Pipeline, PipelineStage}
import org.apache.spark.sql._

// NOTE: the enclosing type declaration was lost when this snippet was extracted.
// The trait header below is an assumed reconstruction that only supplies the
// `estimator` and `numFolds` members used by the methods; the declaration in the
// book's original source may differ.
trait CrossValidation {
  protected[this] val estimator: PipelineStage
  protected[this] val numFolds: Int

  @throws(classOf[IllegalArgumentException])
  protected def apply(
    trainDf: DataFrame,
    stages: Array[PipelineStage],
    grid: Array[ParamMap]
  ): CrossValidatorModel = {
    require(stages.size > 0, "Cannot cross-validate pipeline without stages")
    require(grid.size > 0, "Cannot cross-validate with undefined grid")

    val pipeline = new Pipeline().setStages(stages ++ Array[PipelineStage](estimator))
    new CrossValidator()
      .setEstimator(pipeline)
      .setEstimatorParamMaps(grid)
      .setEvaluator(new BinaryClassificationEvaluator)
      .setNumFolds(numFolds)
      .fit(trainDf)
  }

  protected def evaluate(
    trainDf: DataFrame,
    stages: Array[PipelineStage],
    grid: Array[ParamMap]
  ): Evaluator = this(trainDf, stages, grid).getEvaluator
}
Example 10
Source File: MNISTCrossValidation.scala From spark-knn with Apache License 2.0
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.KNNClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.log4j

object MNISTCrossValidation {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    // read in raw label and features
    val dataset = MLUtils.loadLibSVMFile(sc, "data/mnist/mnist.bz2")
      .toDF() //.limit(10000)

    // split training and testing
    val Array(train, test) = dataset.randomSplit(Array(0.7, 0.3), seed = 1234L).map(_.cache())

    // create PCA matrix to reduce feature dimensions
    val pca = new PCA()
      .setInputCol("features")
      .setK(50)
      .setOutputCol("pcaFeatures")

    val knn = new KNNClassifier()
      .setTopTreeSize(50)
      .setFeaturesCol("pcaFeatures")
      .setPredictionCol("prediction")
      .setK(1)

    val pipeline = new Pipeline()
      .setStages(Array(pca, knn))

    val paramGrid = new ParamGridBuilder()
      // .addGrid(knn.k, 1 to 20)
      .addGrid(pca.k, 10 to 100 by 10)
      .build()

    val cv = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(5)

    val cvModel = cv.fit(train)

    val insample = validate(cvModel.transform(train))
    val outofsample = validate(cvModel.transform(test))

    // reference accuracy: in-sample 95% out-of-sample 94%
    logger.info(s"In-sample: $insample, Out-of-sample: $outofsample")
    logger.info(s"Cross-validated: ${cvModel.avgMetrics.toSeq}")
  }

  private[this] def validate(results: DataFrame): Double = {
    results
      .selectExpr("SUM(CASE WHEN label = prediction THEN 1.0 ELSE 0.0 END) / COUNT(1)")
      .collect()
      .head
      .getDecimal(0)
      .doubleValue()
  }
}
Example 11
Source File: Iris.scala From spark-gp with Apache License 2.0
package org.apache.spark.ml.classification.examples

import org.apache.spark.ml.classification.{GaussianProcessClassifier, OneVsRest}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.SparkSession

object Iris extends App {
  val name = "Iris"
  val spark = SparkSession.builder().appName(name).master("local[4]").getOrCreate()

  import spark.sqlContext.implicits._

  val name2indx = Map("Iris-versicolor" -> 0, "Iris-setosa" -> 1, "Iris-virginica" -> 2)

  val dataset = spark.read.format("csv").load("data/iris.csv").rdd.map(row => {
    val features = Vectors.dense(Array("_c0", "_c1", "_c2", "_c3")
      .map(col => row.getAs[String](col).toDouble))
    val label = name2indx(row.getAs[String]("_c4"))
    LabeledPoint(label, features)
  }).toDF

  val gp = new GaussianProcessClassifier().setDatasetSizeForExpert(20).setActiveSetSize(30)
  val ovr = new OneVsRest().setClassifier(gp)

  val cv = new CrossValidator()
    .setEstimator(ovr)
    .setEvaluator(new MulticlassClassificationEvaluator().setMetricName("accuracy"))
    .setEstimatorParamMaps(new ParamGridBuilder().build())
    .setNumFolds(10)

  println("Accuracy: " + cv.fit(dataset).avgMetrics.toList)
}
Example 12
Source File: GPExample.scala From spark-gp with Apache License 2.0
package org.apache.spark.ml.regression.examples

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.GaussianProcessRegression
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.{DataFrame, SparkSession}

trait GPExample {
  def name: String

  val spark = SparkSession.builder().appName(name).master("local[4]").getOrCreate()

  def cv(gp: GaussianProcessRegression, instances: DataFrame, expectedRMSE: Double) = {
    val cv = new CrossValidator()
      .setEstimator(gp)
      .setEvaluator(new RegressionEvaluator())
      .setEstimatorParamMaps(new ParamGridBuilder().build())
      .setNumFolds(10)

    val rmse = cv.fit(instances).avgMetrics.head
    println("RMSE: " + rmse)
    assert(rmse < expectedRMSE)
  }
}
Example 13
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

import breeze.linalg._
import breeze.plot._
import org.jfree.chart.axis.NumberTickUnit

object ROC extends App {

  val conf = new SparkConf().setAppName("ROC")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  import sqlContext._
  import sqlContext.implicits._

  val transformedTest = sqlContext.read.parquet("transformedTest.parquet")

  val labelScores = transformedTest.select("probability", "label").map {
    case Row(probability: Vector, label: Double) => (probability(1), label)
  }

  val bm = new BinaryClassificationMetrics(labelScores, 300)
  val roc = bm.roc.collect
  roc.foreach { println }

  val falsePositives = roc.map { _._1 }
  val truePositives = roc.map { _._2 }

  val f = Figure()
  val p = f.subplot(0)
  p += plot(falsePositives, truePositives)
  p.xlabel = "false positives"
  p.ylabel = "true positives"
  p.xlim = (0.0, 0.1)
  p.xaxis.setTickUnit(new NumberTickUnit(0.01))
  p.yaxis.setTickUnit(new NumberTickUnit(0.1))
  f.refresh
  f.saveas("roc.png")
}
Example 14
Source File: LogisticRegressionDemo.scala From s4ds with Apache License 2.0
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.SaveMode

case class LabelledDocument(fileName: String, text: String, category: String)

object LogisticRegressionDemo extends App {

  val conf = new SparkConf().setAppName("LrTest")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  import sqlContext._
  import sqlContext.implicits._

  val spamText = sc.wholeTextFiles("spam/*")
  val hamText = sc.wholeTextFiles("ham/*")

  val spamDocuments = spamText.map {
    case (fileName, text) => LabelledDocument(fileName, text, "spam")
  }
  val hamDocuments = hamText.map {
    case (fileName, text) => LabelledDocument(fileName, text, "ham")
  }

  val documentsDF = spamDocuments.union(hamDocuments).toDF
  documentsDF.persist

  val Array(trainDF, testDF) = documentsDF.randomSplit(Array(0.7, 0.3))

  val indexer = new StringIndexer().setInputCol("category").setOutputCol("label")
  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
  val hasher = new HashingTF().setInputCol("words").setOutputCol("features")
  val lr = new LogisticRegression().setMaxIter(50).setRegParam(0.0)

  val pipeline = new Pipeline().setStages(Array(indexer, tokenizer, hasher, lr))

  val model = pipeline.fit(trainDF)

  val transformedTrain = model.transform(trainDF)
  transformedTrain.persist
  val transformedTest = model.transform(testDF)
  transformedTest.persist

  println("in sample misclassified:", transformedTrain.filter($"prediction" !== $"label").count,
    " / ", transformedTrain.count)
  println("out sample misclassified:", transformedTest.filter($"prediction" !== $"label").count,
    " / ", transformedTest.count)

  transformedTrain.select("fileName", "label", "prediction", "probability")
    .write.mode(SaveMode.Overwrite).parquet("transformedTrain.parquet")
  transformedTest.select("fileName", "label", "prediction", "probability")
    .write.mode(SaveMode.Overwrite).parquet("transformedTest.parquet")
}
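This demo fits the pipeline directly and never actually uses the CrossValidator it imports. If cross-validated tuning were wanted, it could be wired in over the same pipeline; a sketch under assumed grid values (the names pipeline, hasher, lr and trainDF come from the example above, everything else is illustrative):

// Hypothetical tuning step for the pipeline above; the grid values are assumptions.
val paramGrid = new ParamGridBuilder()
  .addGrid(hasher.numFeatures, Array(1 << 16, 1 << 18))
  .addGrid(lr.regParam, Array(0.0, 0.01))
  .build()

val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(new BinaryClassificationEvaluator())  // scores on the default rawPrediction column
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(3)

val cvModel = cv.fit(trainDF)
println(cvModel.avgMetrics.toSeq)  // mean metric per grid point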
Example 15
Source File: NaiveBayes.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter12.NaiveBayes

import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

object NaiveBayesExample {
  def main(args: Array[String]): Unit = {
    // Create the Spark session
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    // Load the data stored in LIBSVM format as a DataFrame.
    val data = spark.read.format("libsvm").load("C:/Users/rezkar/Downloads/spark-2.1.0-bin-hadoop2.7/data/sample.data")

    // Split the data into training and test sets (25% held out for testing)
    val Array(trainingData, validationData) = data.randomSplit(Array(0.75, 0.25), seed = 12345L)

    // Train a NaiveBayes model.
    val nb = new NaiveBayes().setSmoothing(0.00001)
    val model = nb.fit(trainingData)

    // Select example rows to display.
    val predictions = model.transform(validationData)
    predictions.show()

    // Obtain evaluators over the (prediction, true label) pairs and compute classification
    // performance metrics such as accuracy, precision, recall and F1 measure.
    val evaluator = new BinaryClassificationEvaluator().setLabelCol("label").setMetricName("areaUnderROC")
    val evaluator1 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("accuracy")
    val evaluator2 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("weightedPrecision")
    val evaluator3 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("weightedRecall")
    val evaluator4 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("f1")

    // Compute the classification accuracy, precision, recall, F1 measure and error on the test data.
    val areaUnderROC = evaluator.evaluate(predictions)
    val accuracy = evaluator1.evaluate(predictions)
    val precision = evaluator2.evaluate(predictions)
    val recall = evaluator3.evaluate(predictions)
    val f1 = evaluator4.evaluate(predictions)

    // Print the performance metrics
    println("areaUnderROC = " + areaUnderROC)
    println("Accuracy = " + accuracy)
    println("Precision = " + precision)
    println("Recall = " + recall)
    println("F1 = " + f1)
    println(s"Test Error = ${1 - accuracy}")

    data.show(20)

    spark.stop()
  }
}