org.apache.spark.ml.evaluation.Evaluator Scala Examples
The following examples show how to use org.apache.spark.ml.evaluation.Evaluator.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: CrossValidation.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.spark.mlpipeline

import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel}
import org.apache.spark.ml.{Model, Pipeline, PipelineStage}
import org.apache.spark.sql._

  // NOTE(review): the declaration of the enclosing type (which must provide `estimator` and
  // `numFolds`) is not visible in this excerpt; only its members and closing brace are.

  /**
   * Fits a cross-validated pipeline on the training data.
   *
   * The pipeline is made of the given `stages` followed by `estimator`; every parameter map in
   * `grid` is scored with a [[BinaryClassificationEvaluator]] over `numFolds` folds.
   *
   * @param trainDf training data the cross-validator is fitted on
   * @param stages  pipeline stages placed before the estimator; must not be empty
   * @param grid    parameter maps to explore; must not be empty
   * @return the fitted cross-validator model
   * @throws IllegalArgumentException if `stages` or `grid` is empty
   */
  @throws(classOf[IllegalArgumentException])
  protected def apply(
    trainDf: DataFrame,
    stages: Array[PipelineStage],
    grid: Array[ParamMap]
  ): CrossValidatorModel = {
    require(stages.nonEmpty, "Cannot cross-validate pipeline without stages")
    require(grid.nonEmpty, "Cannot cross-validate with undefined grid")

    val allStages = stages ++ Array[PipelineStage](estimator)
    val pipeline = new Pipeline().setStages(allStages)
    val validator = new CrossValidator()
      .setEstimator(pipeline)
      .setEstimatorParamMaps(grid)
      .setEvaluator(new BinaryClassificationEvaluator())
      .setNumFolds(numFolds)

    validator.fit(trainDf)
  }

  /**
   * Returns the evaluator attached to the cross-validated model.
   *
   * Note that this runs the complete fit performed by `apply` on every call before reading the
   * evaluator from the resulting model.
   *
   * @param trainDf training data the cross-validator is fitted on
   * @param stages  pipeline stages placed before the estimator; must not be empty
   * @param grid    parameter maps to explore; must not be empty
   * @return the evaluator of the fitted cross-validator model
   */
  protected def evaluate(
    trainDf: DataFrame,
    stages: Array[PipelineStage],
    grid: Array[ParamMap]
  ): Evaluator = {
    val fitted = apply(trainDf, stages, grid)
    fitted.getEvaluator
  }
}
Example 2
Source File: ALS.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.recommendation

import org.apache.spark.ml
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql._

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, ScoringWithEvaluator}

/** Benchmark definition for `ml.recommendation.ALS`, scored with a [[RegressionEvaluator]]. */
object ALS extends BenchmarkAlgorithm with ScoringWithEvaluator {

  /**
   * Generates the ratings data for the benchmark.
   *
   * Kept in a single place so the training and test data sets are always requested with the
   * exact same arguments. Each call invokes the generator again.
   *
   * @param ctx benchmark context supplying the SQL context, parameters and seed
   * @return the generator's result; its first element is used as the training set and its
   *         second element as the test set
   */
  private def generatedRatings(ctx: MLBenchContext) = {
    import ctx.params._
    DataGenerator.generateRatings(
      ctx.sqlContext,
      numUsers,
      numItems,
      numExamples,
      numTestExamples,
      implicitPrefs = false,
      numPartitions,
      ctx.seed())
  }

  /** Training split: first element of the generated ratings. */
  override def trainingDataSet(ctx: MLBenchContext): DataFrame =
    generatedRatings(ctx)._1

  /** Test split: second element of the generated ratings. */
  override def testDataSet(ctx: MLBenchContext): DataFrame =
    generatedRatings(ctx)._2

  /**
   * Builds the ALS estimator configured from the benchmark parameters
   * (seed, regularization, number of blocks, rank and maximum iterations).
   */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new ml.recommendation.ALS()
      .setSeed(ctx.seed())
      .setRegParam(regParam)
      .setNumBlocks(numPartitions)
      .setRank(rank)
      .setMaxIter(maxIter)
  }

  /** Regression evaluator reading its labels from the "rating" column. */
  override protected def evaluator(ctx: MLBenchContext): Evaluator =
    new RegressionEvaluator().setLabelCol("rating")
}
Example 3
Source File: NaiveBayes.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.classification

import org.apache.spark.ml
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator

/**
 * Benchmark definition for `ml.classification.NaiveBayes`, scored with a
 * [[MulticlassClassificationEvaluator]].
 */
object NaiveBayes extends BenchmarkAlgorithm
  with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {

  /**
   * Generates the unlabeled input data: `numFeatures` features, each with a randomly chosen
   * arity.
   */
  override protected def initialData(ctx: MLBenchContext) = {
    import ctx.params._
    val rng = ctx.newGenerator()
    // Upper bound used when drawing the arity of a feature in generated training/test data
    // for NaiveBayes models. With the expression below the drawn arities stay strictly below
    // this bound.
    val maxFeatureArity = 20
    // All features for Naive Bayes must be categorical, i.e. have arity >= 2
    val featureArity = 0.until(numFeatures).map(_ => 2 + rng.nextInt(maxFeatureArity - 2)).toArray
    DataGenerator.generateMixedFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      featureArity)
  }

  /**
   * Builds the Naive Bayes model that is used as ground truth.
   *
   *  - pi = log of class priors, whose dimension is C (number of classes)
   *  - theta = log of class conditional probabilities, whose dimension is C (number of classes)
   *    by D (number of features)
   *
   * @param ctx benchmark context supplying the parameters and the random generator
   * @return the model created from `pi` and `theta`
   * @throws IllegalArgumentException if there are more classes than features, because class
   *                                  `i` puts its dominant probability on feature `i`
   */
  override protected def trueModel(ctx: MLBenchContext): Transformer = {
    import ctx.params._
    val classCount: Int = numClasses
    val featureCount: Int = numFeatures
    // Fail fast with a readable message instead of an ArrayIndexOutOfBoundsException below.
    require(
      classCount <= featureCount,
      s"NaiveBayes benchmark needs numClasses ($classCount) <= numFeatures ($featureCount)")

    val rng = ctx.newGenerator()
    // The small offset keeps every unnormalized prior strictly positive before taking logs.
    val unnormalizedProbs = Array.fill(classCount)(rng.nextDouble() + 1e-5)
    val logProbSum = math.log(unnormalizedProbs.sum)
    val piArray = unnormalizedProbs.map(prob => math.log(prob) - logProbSum)

    // For class i, set the class-conditional probability of feature i to 0.7, and split up the
    // remaining probability mass across the other features
    val currClassProb = 0.7
    // Identical for every class, so it is computed once outside the loop.
    val baseProbMass = (1 - currClassProb) / (featureCount - 1)
    val thetaArray = Array.tabulate(classCount) { i =>
      val probs = Array.fill[Double](featureCount)(baseProbMass)
      probs(i) = currClassProb
      probs.map(math.log)
    }

    // Initialize new Naive Bayes model
    val pi = Vectors.dense(piArray)
    val theta = new DenseMatrix(classCount, featureCount, thetaArray.flatten, true)
    ModelBuilderSSP.newNaiveBayesModel(pi, theta)
  }

  /** Builds the Naive Bayes estimator with the smoothing taken from the benchmark parameters. */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new ml.classification.NaiveBayes()
      .setSmoothing(smoothing)
  }

  /** Multiclass evaluator with its default settings. */
  override protected def evaluator(ctx: MLBenchContext): Evaluator =
    new MulticlassClassificationEvaluator()
}
Example 4
Source File: LinearSVC.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.classification

import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vectors

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator

/**
 * Benchmark definition for `ml.classification.LinearSVC`, scored with a
 * [[MulticlassClassificationEvaluator]].
 */
object LinearSVC extends BenchmarkAlgorithm
  with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {

  /** Generates the unlabeled input data made of `numFeatures` continuous features. */
  override protected def initialData(ctx: MLBenchContext) = {
    import ctx.params._
    DataGenerator.generateContinuousFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      numFeatures)
  }

  /**
   * Builds the linear SVC model used as ground truth, with randomly drawn coefficients and a
   * small random intercept.
   */
  override protected def trueModel(ctx: MLBenchContext): Transformer = {
    val rng = ctx.newGenerator()
    // Maps one draw d to 2 * d - 1 (presumably d is in [0, 1) -- confirm the generator type).
    def signedDraw(): Double = 2 * rng.nextDouble() - 1

    val dimension: Int = ctx.params.numFeatures
    val coefficients = Vectors.dense(Array.fill[Double](dimension)(signedDraw()))
    // Small intercept to prevent some skew in the data.
    val intercept = 0.01 * signedDraw()
    ModelBuilderSSP.newLinearSVCModel(coefficients, intercept)
  }

  /**
   * Builds the LinearSVC estimator configured from the benchmark parameters
   * (tolerance, maximum iterations and regularization).
   */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    val svc = new ml.classification.LinearSVC()
    svc
      .setTol(tol)
      .setMaxIter(maxIter)
      .setRegParam(regParam)
  }

  /** Multiclass evaluator with its default settings. */
  override protected def evaluator(ctx: MLBenchContext): Evaluator = {
    new MulticlassClassificationEvaluator()
  }
}
Example 5
Source File: LogisticRegression.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.classification

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator

import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.{Estimator, ModelBuilderSSP, PipelineStage, Transformer}
import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vectors

/**
 * Benchmark definition for `ml.classification.LogisticRegression`, scored with a
 * [[MulticlassClassificationEvaluator]].
 */
object LogisticRegression extends BenchmarkAlgorithm
  with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {

  /** Generates the unlabeled input data made of `numFeatures` continuous features. */
  override protected def initialData(ctx: MLBenchContext) = {
    import ctx.params._
    DataGenerator.generateContinuousFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      numFeatures)
  }

  /**
   * Builds the logistic regression model used as ground truth, with randomly drawn
   * coefficients and a small random intercept.
   */
  override protected def trueModel(ctx: MLBenchContext): Transformer = {
    val rng = ctx.newGenerator()
    // Maps one draw d to 2 * d - 1 (presumably d is in [0, 1) -- confirm the generator type).
    def signedDraw(): Double = 2 * rng.nextDouble() - 1

    val dimension: Int = ctx.params.numFeatures
    val coefficients = Vectors.dense(Array.fill[Double](dimension)(signedDraw()))
    // Small intercept to prevent some skew in the data.
    val intercept = 0.01 * signedDraw()
    ModelBuilderSSP.newLogisticRegressionModel(coefficients, intercept)
  }

  /**
   * Builds the LogisticRegression estimator configured from the benchmark parameters
   * (tolerance, maximum iterations and regularization).
   */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    val lr = new ml.classification.LogisticRegression()
    lr
      .setTol(tol)
      .setMaxIter(maxIter)
      .setRegParam(regParam)
  }

  /** Multiclass evaluator with its default settings. */
  override protected def evaluator(ctx: MLBenchContext): Evaluator = {
    new MulticlassClassificationEvaluator()
  }
}
Example 6
Source File: GLMRegression.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.regression

import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator

/**
 * Benchmark definition for [[GeneralizedLinearRegression]], scored with a
 * [[RegressionEvaluator]].
 */
object GLMRegression extends BenchmarkAlgorithm with TestFromTraining with
  TrainingSetFromTransformer with ScoringWithEvaluator {

  /** Generates the unlabeled input data made of `numFeatures` continuous features. */
  override protected def initialData(ctx: MLBenchContext) = {
    import ctx.params._
    DataGenerator.generateContinuousFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      numFeatures)
  }

  /**
   * Builds the generalized linear regression model used as ground truth: randomly drawn
   * coefficients, a small random intercept, and the link and family taken from the benchmark
   * parameters.
   */
  override protected def trueModel(ctx: MLBenchContext): Transformer = {
    import ctx.params._
    val rng = ctx.newGenerator()
    // Maps one draw d to 2 * d - 1 (presumably d is in [0, 1) -- confirm the generator type).
    def signedDraw(): Double = 2 * rng.nextDouble() - 1

    val dimension: Int = ctx.params.numFeatures
    val coefficients = Vectors.dense(Array.fill[Double](dimension)(signedDraw()))
    // Small intercept to prevent some skew in the data.
    val intercept = 0.01 * signedDraw()

    val model = ModelBuilderSSP.newGLR(coefficients, intercept)
    // Both parameters are read with `.get`, so they must be defined in the benchmark params.
    model.set(model.link, link.get)
    model.set(model.family, family.get)
    model
  }

  /**
   * Builds the GeneralizedLinearRegression estimator configured from the benchmark parameters
   * (link, family, regularization, maximum iterations and tolerance).
   */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    val glr = new GeneralizedLinearRegression()
    glr
      .setLink(link)
      .setFamily(family)
      .setRegParam(regParam)
      .setMaxIter(maxIter)
      .setTol(tol)
  }

  /** Regression evaluator with its default settings. */
  override protected def evaluator(ctx: MLBenchContext): Evaluator = {
    new RegressionEvaluator()
  }
}
Example 7
Source File: LinearRegression.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.regression

import org.apache.spark.ml
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator

/**
 * Benchmark definition for `ml.regression.LinearRegression`, scored with a
 * [[RegressionEvaluator]].
 */
object LinearRegression extends BenchmarkAlgorithm with TestFromTraining with
  TrainingSetFromTransformer with ScoringWithEvaluator {

  /** Generates the unlabeled input data made of `numFeatures` continuous features. */
  override protected def initialData(ctx: MLBenchContext) = {
    import ctx.params._
    DataGenerator.generateContinuousFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      numFeatures)
  }

  /**
   * Builds the linear regression model used as ground truth, with randomly drawn coefficients
   * and a small random intercept.
   */
  override protected def trueModel(ctx: MLBenchContext): Transformer = {
    val rng = ctx.newGenerator()
    // Maps one draw d to 2 * d - 1 (presumably d is in [0, 1) -- confirm the generator type).
    def signedDraw(): Double = 2 * rng.nextDouble() - 1

    val dimension: Int = ctx.params.numFeatures
    val coefficients = Vectors.dense(Array.fill[Double](dimension)(signedDraw()))
    // Small intercept to prevent some skew in the data.
    val intercept = 0.01 * signedDraw()
    ModelBuilderSSP.newLinearRegressionModel(coefficients, intercept)
  }

  /**
   * Builds the LinearRegression estimator using the "l-bfgs" solver and the regularization,
   * maximum iterations and tolerance from the benchmark parameters.
   */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    val regression = new ml.regression.LinearRegression()
    regression
      .setSolver("l-bfgs")
      .setRegParam(regParam)
      .setMaxIter(maxIter)
      .setTol(tol)
  }

  /** Regression evaluator with its default settings. */
  override protected def evaluator(ctx: MLBenchContext): Evaluator = {
    new RegressionEvaluator()
  }
}