org.apache.spark.ml.PipelineStage Scala Examples
The following examples show how to use org.apache.spark.ml.PipelineStage.
The project and license of the original source file are noted above each example.
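As a quick orientation before the examples: in Spark ML, both Transformers and Estimators extend PipelineStage, so heterogeneous stages can be collected into a single Array[PipelineStage] and handed to Pipeline.setStages. The following minimal sketch is illustrative only and is not taken from any of the projects below; the column names are placeholders.

import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}

// Estimators (StringIndexer, LogisticRegression) and Transformers (VectorAssembler)
// are all PipelineStages, so they can be mixed in one array.
val labelIndexer = new StringIndexer()
  .setInputCol("category")                 // placeholder column names
  .setOutputCol("label")
val assembler = new VectorAssembler()
  .setInputCols(Array("f1", "f2", "f3"))
  .setOutputCol("features")
val lr = new LogisticRegression()
  .setFeaturesCol("features")
  .setLabelCol("label")

val stages: Array[PipelineStage] = Array(labelIndexer, assembler, lr)
val pipeline = new Pipeline().setStages(stages)
// val model = pipeline.fit(trainingDf)    // trainingDf: a DataFrame with the columns above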
Example 1
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 6 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.RandomForestClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object RandomForestPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val rf = new RandomForestClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setNumTrees(20) .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += rf val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
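Note that the train/test split in this example is computed but not used: the commented-out lines show that the model is fit and evaluated on the full dataFrame, so the printed "Test set accuracy" is really accuracy on the training data. The same pattern appears in the other pipeline examples from this project below. A small variant, reusing only the names already defined in the example, would fit on the 90% split and score the held-out 10%:

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

// pipeline, training and test come from the example above.
val heldOutModel = pipeline.fit(training)
val heldOut = heldOutModel.transform(test).select("prediction", "label")
val heldOutAccuracy = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")
  .evaluate(heldOut)
println(s"Held-out accuracy = $heldOutAccuracy")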
Example 2
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 6 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object GradientBoostedTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val gbt = new GBTClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxIter(10) stages += vectorAssembler stages += gbt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 3
Source File: RandomForestClassification.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.classification import org.apache.spark.ml.{Estimator, PipelineStage} import org.apache.spark.ml.classification.RandomForestClassifier import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ object RandomForestClassification extends BenchmarkAlgorithm with TreeOrForestClassifier { override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ // TODO: subsamplingRate, featureSubsetStrategy // TODO: cacheNodeIds, checkpoint? new RandomForestClassifier() .setMaxDepth(depth) .setNumTrees(maxIter) .setSeed(ctx.seed()) } }
Example 4
Source File: MinHashLSH.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import org.apache.spark.ml import org.apache.spark.ml.PipelineStage import org.apache.spark.sql._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} object MinHashLSH extends BenchmarkAlgorithm with TestFromTraining { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ val df = DataGenerator.generateMixedFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, Array.fill(numFeatures)(2) ) df } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.feature.MinHashLSH() .setInputCol("features") .setNumHashTables(numHashTables) } }
Example 5
Source File: VectorSlicer.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import org.apache.spark.ml import org.apache.spark.ml.PipelineStage import org.apache.spark.sql._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} object VectorSlicer extends BenchmarkAlgorithm with TestFromTraining { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numFeatures ) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ val indices = (0 until numFeatures by 2).toArray new ml.feature.VectorSlicer() .setInputCol("features") .setIndices(indices) } }
Example 6
Source File: VectorAssembler.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import org.apache.spark.ml import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.PipelineStage import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} object VectorAssembler extends BenchmarkAlgorithm with TestFromTraining { private def getInputCols(numInputCols: Int): Array[String] = { Array.tabulate(numInputCols)(i => s"c${i}") } override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ require(numInputCols.get <= numFeatures.get, s"numInputCols (${numInputCols}) cannot be greater than numFeatures (${numFeatures}).") val df = DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numFeatures) val slice = udf { (v: Vector, numSlices: Int) => val data = v.toArray val n = data.length.toLong (0 until numSlices).map { i => val start = ((i * n) / numSlices).toInt val end = ((i + 1) * n / numSlices).toInt Vectors.dense(data.slice(start, end)) } } val inputCols = getInputCols(numInputCols.get) df.select(slice(col("features"), lit(numInputCols.get)).as("slices")) .select((0 until numInputCols.get).map(i => col("slices")(i).as(inputCols(i))): _*) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ val inputCols = getInputCols(numInputCols.get) new ml.feature.VectorAssembler() .setInputCols(inputCols) } }
Example 7
Source File: QuantileDiscretizer.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import org.apache.spark.ml import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.PipelineStage import org.apache.spark.sql._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} object QuantileDiscretizer extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ import ctx.sqlContext.implicits._ DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, 1 ).rdd.map { case Row(vec: Vector) => vec(0) // extract the single generated double value for each row }.toDF(inputCol) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.feature.QuantileDiscretizer() .setInputCol(inputCol) .setOutputCol(outputCol) .setNumBuckets(bucketizerNumBuckets) .setRelativeError(relativeError) } }
Example 8
Source File: Word2Vec.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import scala.util.Random import org.apache.spark.ml import org.apache.spark.ml.{PipelineStage, Transformer} import org.apache.spark.ml.feature.Word2VecModel import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, split} import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object Word2Vec extends BenchmarkAlgorithm with TestFromTraining { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ val df = DataGenerator.generateDoc( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, vocabSize, docLength, "text" ) df.select(split(col("text"), " ").as("text")) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { new ml.feature.Word2Vec().setInputCol("text") } override def testAdditionalMethods( ctx: MLBenchContext, model: Transformer): Map[String, () => _] = { import ctx.params._ val rng = new Random(ctx.seed()) val word2vecModel = model.asInstanceOf[Word2VecModel] val testWord = Vectors.dense(Array.fill(word2vecModel.getVectorSize)(rng.nextGaussian())) Map("findSynonyms" -> (() => { word2vecModel.findSynonyms(testWord, numSynonymsToFind) })) } }
Example 9
Source File: GaussianMixture.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.clustering import org.apache.spark.ml import org.apache.spark.ml.PipelineStage import org.apache.spark.sql.DataFrame import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} import com.databricks.spark.sql.perf.mllib.data.DataGenerator object GaussianMixture extends BenchmarkAlgorithm with TestFromTraining { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ DataGenerator.generateGaussianMixtureData(ctx.sqlContext, numCenters = k, numExamples = numExamples, seed = ctx.seed(), numPartitions = numPartitions, numFeatures = numFeatures) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.clustering.GaussianMixture() .setK(k) .setSeed(randomSeed.toLong) .setMaxIter(maxIter) .setTol(tol) } // TODO(?) add a scoring method here. }
Example 10
Source File: KMeans.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.clustering import org.apache.spark.ml import org.apache.spark.ml.{PipelineStage} import org.apache.spark.sql._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} object KMeans extends BenchmarkAlgorithm with TestFromTraining { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ DataGenerator.generateGaussianMixtureData(ctx.sqlContext, k, numExamples, ctx.seed(), numPartitions, numFeatures) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.clustering.KMeans() .setK(k) .setSeed(randomSeed.toLong) .setMaxIter(maxIter) .setTol(tol) } // TODO(?) add a scoring method here. }
Example 11
Source File: LDA.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.clustering import scala.collection.mutable.{HashMap => MHashMap} import org.apache.commons.math3.random.Well19937c import org.apache.spark.ml.{Estimator, PipelineStage} import org.apache.spark.ml import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.ml.linalg.{Vector, Vectors} import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ object LDA extends BenchmarkAlgorithm with TestFromTraining { // The LDA model is package private, no need to expose it. override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ val rdd = ctx.sqlContext.sparkContext.parallelize( 0L until numExamples, numPartitions ) val seed: Int = randomSeed val docLen = docLength.get val numVocab = vocabSize.get val data: RDD[(Long, Vector)] = rdd.mapPartitionsWithIndex { (idx, partition) => val rng = new Well19937c(seed ^ idx) partition.map { docIndex => var currentSize = 0 val entries = MHashMap[Int, Int]() while (currentSize < docLen) { val index = rng.nextInt(numVocab) entries(index) = entries.getOrElse(index, 0) + 1 currentSize += 1 } val iter = entries.toSeq.map(v => (v._1, v._2.toDouble)) (docIndex, Vectors.sparse(numVocab, iter)) } } ctx.sqlContext.createDataFrame(data).toDF("docIndex", "features") } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.clustering.LDA() .setK(k) .setSeed(randomSeed.toLong) .setMaxIter(maxIter) .setOptimizer(optimizer) } // TODO(?) add a scoring method here. }
Example 12
Source File: ALS.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.recommendation import org.apache.spark.ml import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator} import org.apache.spark.ml.PipelineStage import org.apache.spark.sql._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, ScoringWithEvaluator} object ALS extends BenchmarkAlgorithm with ScoringWithEvaluator { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ DataGenerator.generateRatings( ctx.sqlContext, numUsers, numItems, numExamples, numTestExamples, implicitPrefs = false, numPartitions, ctx.seed())._1 } override def testDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ DataGenerator.generateRatings( ctx.sqlContext, numUsers, numItems, numExamples, numTestExamples, implicitPrefs = false, numPartitions, ctx.seed())._2 } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.recommendation.ALS() .setSeed(ctx.seed()) .setRegParam(regParam) .setNumBlocks(numPartitions) .setRank(rank) .setMaxIter(maxIter) } override protected def evaluator(ctx: MLBenchContext): Evaluator = { new RegressionEvaluator().setLabelCol("rating") } }
Example 13
Source File: FPGrowth.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.fpm import org.apache.spark.ml import org.apache.spark.ml.{PipelineStage, Transformer} import org.apache.spark.ml.fpm.FPGrowthModel import org.apache.spark.sql.DataFrame import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object FPGrowth extends BenchmarkAlgorithm with TestFromTraining { def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ DataGenerator.generateItemSet( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numItems, itemSetSize) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { new ml.fpm.FPGrowth() .setItemsCol("items") } override def testAdditionalMethods( ctx: MLBenchContext, model: Transformer): Map[String, () => _] = { val fpModel = model.asInstanceOf[FPGrowthModel] Map("associationRules" -> (() => { fpModel.associationRules.count() })) } }
Example 14
Source File: NaiveBayes.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.classification import org.apache.spark.ml import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer} import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator} import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object NaiveBayes extends BenchmarkAlgorithm with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ val rng = ctx.newGenerator() // Max possible arity of a feature in generated training/test data for NaiveBayes models val maxFeatureArity = 20 // All features for Naive Bayes must be categorical, i.e. have arity >= 2 val featureArity = 0.until(numFeatures).map(_ => 2 + rng.nextInt(maxFeatureArity - 2)).toArray DataGenerator.generateMixedFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, featureArity) } override protected def trueModel(ctx: MLBenchContext): Transformer = { import ctx.params._ val rng = ctx.newGenerator() // pi = log of class priors, whose dimension is C (number of classes) // theta = log of class conditional probabilities, whose dimension is C (number of classes) // by D (number of features) val unnormalizedProbs = 0.until(numClasses).map(_ => rng.nextDouble() + 1e-5).toArray val logProbSum = math.log(unnormalizedProbs.sum) val piArray = unnormalizedProbs.map(prob => math.log(prob) - logProbSum) // For class i, set the class-conditional probability of feature i to 0.7, and split up the // remaining probability mass across the other features val currClassProb = 0.7 val thetaArray = Array.tabulate(numClasses) { i: Int => val baseProbMass = (1 - currClassProb) / (numFeatures - 1) val probs = Array.fill[Double](numFeatures)(baseProbMass) probs(i) = currClassProb probs }.map(_.map(math.log)) // Initialize new Naive Bayes model val pi = Vectors.dense(piArray) val theta = new DenseMatrix(numClasses, numFeatures, thetaArray.flatten, true) ModelBuilderSSP.newNaiveBayesModel(pi, theta) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.classification.NaiveBayes() .setSmoothing(smoothing) } override protected def evaluator(ctx: MLBenchContext): Evaluator = new MulticlassClassificationEvaluator() }
Example 15
Source File: LinearSVC.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.classification import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator} import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer} import org.apache.spark.ml import org.apache.spark.ml.linalg.Vectors import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object LinearSVC extends BenchmarkAlgorithm with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numFeatures) } override protected def trueModel(ctx: MLBenchContext): Transformer = { val rng = ctx.newGenerator() val coefficients = Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) // Small intercept to prevent some skew in the data. val intercept = 0.01 * (2 * rng.nextDouble - 1) ModelBuilderSSP.newLinearSVCModel(coefficients, intercept) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.classification.LinearSVC() .setTol(tol) .setMaxIter(maxIter) .setRegParam(regParam) } override protected def evaluator(ctx: MLBenchContext): Evaluator = new MulticlassClassificationEvaluator() }
Example 16
Source File: GBTClassification.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.classification import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ object GBTClassification extends BenchmarkAlgorithm with TreeOrForestClassifier { import TreeOrForestEstimator.getFeatureArity override protected def trueModel(ctx: MLBenchContext): Transformer = { import ctx.params._ // We add +1 to the depth to make it more likely that many iterations of boosting are needed // to model the true tree. ModelBuilderSSP.newDecisionTreeClassificationModel(depth + 1, numClasses, getFeatureArity(ctx), ctx.seed()) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ // TODO: subsamplingRate, featureSubsetStrategy // TODO: cacheNodeIds, checkpoint? new GBTClassifier() .setMaxDepth(depth) .setMaxIter(maxIter) .setSeed(ctx.seed()) } }
Example 17
Source File: BucketedRandomProjectionLSH.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import org.apache.spark.ml import org.apache.spark.ml.PipelineStage import org.apache.spark.sql._ import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object BucketedRandomProjectionLSH extends BenchmarkAlgorithm with TestFromTraining { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ val df = DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numFeatures ) df } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.feature.BucketedRandomProjectionLSH() .setInputCol("features") .setNumHashTables(numHashTables) } }
Example 18
Source File: LogisticRegression.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.classification import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator} import org.apache.spark.ml.{Estimator, ModelBuilderSSP, PipelineStage, Transformer} import org.apache.spark.ml import org.apache.spark.ml.linalg.Vectors object LogisticRegression extends BenchmarkAlgorithm with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numFeatures) } override protected def trueModel(ctx: MLBenchContext): Transformer = { val rng = ctx.newGenerator() val coefficients = Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) // Small intercept to prevent some skew in the data. val intercept = 0.01 * (2 * rng.nextDouble - 1) ModelBuilderSSP.newLogisticRegressionModel(coefficients, intercept) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.classification.LogisticRegression() .setTol(tol) .setMaxIter(maxIter) .setRegParam(regParam) } override protected def evaluator(ctx: MLBenchContext): Evaluator = new MulticlassClassificationEvaluator() }
Example 19
Source File: GLMRegression.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.regression import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.GeneralizedLinearRegression import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object GLMRegression extends BenchmarkAlgorithm with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numFeatures) } override protected def trueModel(ctx: MLBenchContext): Transformer = { import ctx.params._ val rng = ctx.newGenerator() val coefficients = Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) // Small intercept to prevent some skew in the data. val intercept = 0.01 * (2 * rng.nextDouble - 1) val m = ModelBuilderSSP.newGLR(coefficients, intercept) m.set(m.link, link.get) m.set(m.family, family.get) m } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new GeneralizedLinearRegression() .setLink(link) .setFamily(family) .setRegParam(regParam) .setMaxIter(maxIter) .setTol(tol) } override protected def evaluator(ctx: MLBenchContext): Evaluator = new RegressionEvaluator() }
Example 20
Source File: LinearRegression.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.regression import org.apache.spark.ml import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator object LinearRegression extends BenchmarkAlgorithm with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numFeatures) } override protected def trueModel(ctx: MLBenchContext): Transformer = { val rng = ctx.newGenerator() val coefficients = Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) // Small intercept to prevent some skew in the data. val intercept = 0.01 * (2 * rng.nextDouble - 1) ModelBuilderSSP.newLinearRegressionModel(coefficients, intercept) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.regression.LinearRegression() .setSolver("l-bfgs") .setRegParam(regParam) .setMaxIter(maxIter) .setTol(tol) } override protected def evaluator(ctx: MLBenchContext): Evaluator = new RegressionEvaluator() }
Example 21
Source File: RecursivePipeline.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import org.apache.spark.internal.Logging import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter} import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset} import scala.collection.mutable.ListBuffer class RecursivePipeline(override val uid: String, baseStages: Array[PipelineStage]) extends Pipeline { def this() = this(Identifiable.randomUID("RECURSIVE_PIPELINE"), Array.empty) def this(uid: String) = this(uid, Array.empty) def this(pipeline: Pipeline) = this(pipeline.uid, pipeline.getStages) this.setStages(baseStages) override def fit(dataset: Dataset[_]): PipelineModel = { transformSchema(dataset.schema, logging = true) val theStages = $(stages) var indexOfLastEstimator = -1 theStages.view.zipWithIndex.foreach { case (stage, index) => stage match { case _: Estimator[_] => indexOfLastEstimator = index case _ => } } var curDataset = dataset val transformers = ListBuffer.empty[Transformer] theStages.view.zipWithIndex.foreach { case (stage, index) => if (index <= indexOfLastEstimator) { val transformer = stage match { case estimator: HasRecursiveFit[_] => estimator.recursiveFit(curDataset, new Pipeline(uid).setStages(transformers.toArray).fit(dataset)) case estimator: Estimator[_] => estimator.fit(curDataset) case t: Transformer => t case _ => throw new IllegalArgumentException( s"Does not support stage $stage of type ${stage.getClass}") } if (index < indexOfLastEstimator) { curDataset = transformer.transform(curDataset) } transformers += transformer } else { transformers += stage.asInstanceOf[Transformer] } } createPipeline(dataset, transformers.toArray) } } class RecursivePipelineModel(override val uid: String, innerPipeline: PipelineModel) extends Model[RecursivePipelineModel] with MLWritable with Logging { def this(pipeline: PipelineModel) = this(pipeline.uid, pipeline) // drops right at most because is itself included private def createRecursiveAnnotators(dataset: Dataset[_]): PipelineModel = new Pipeline(uid).setStages(innerPipeline.stages.dropRight(1)).fit(dataset) override def copy(extra: ParamMap): RecursivePipelineModel = { new RecursivePipelineModel(uid, innerPipeline.copy(extra)) } override def write: MLWriter = { innerPipeline.write } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) innerPipeline.stages.foldLeft(dataset.toDF)((cur, transformer) => transformer match { case t: HasRecursiveTransform[_] => t.recursiveTransform(cur, createRecursiveAnnotators(dataset)) case t: AnnotatorModel[_] if t.getLazyAnnotator => cur case t: Transformer => t.transform(cur) }) } override def transformSchema(schema: StructType): StructType = { innerPipeline.transformSchema(schema) } }
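RecursivePipeline is a drop-in replacement for Pipeline: it accepts the same Array[PipelineStage], and the auxiliary constructors above let it wrap an existing Pipeline. The sketch below uses ordinary Spark ML stages for illustration; stages implementing HasRecursiveFit or HasRecursiveTransform (spark-nlp annotators) would additionally receive the partially built PipelineModel during fitting, which is the point of the class.

import com.johnsnowlabs.nlp.RecursivePipeline
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}

// Placeholder stages; any PipelineStage is accepted because RecursivePipeline extends Pipeline.
val indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex")
val assembler = new VectorAssembler().setInputCols(Array("categoryIndex")).setOutputCol("features")
val stages: Array[PipelineStage] = Array(indexer, assembler)

val recursive = new RecursivePipeline().setStages(stages)
// val model = recursive.fit(df)                                          // df: a DataFrame with a "category" column
// val wrapped = new RecursivePipeline(new Pipeline().setStages(stages))  // wrap an existing Pipeline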
Example 22
Source File: RegressionUtils.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package org.apache.spark.injections import org.apache.spark.ml.PipelineStage import org.apache.spark.ml.regression.Regressor object RegressionUtils { def isRegressor(stage: PipelineStage): Boolean = { stage match { case _: Regressor[_, _, _] => true case _ => false } } }
Example 23
Source File: EvaluationUtils.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.automl import com.microsoft.ml.spark.core.metrics.MetricConstants import com.microsoft.ml.spark.core.schema.SchemaConstants import com.microsoft.ml.spark.train.{TrainClassifier, TrainRegressor, TrainedClassifierModel, TrainedRegressorModel} import org.apache.spark.injections.RegressionUtils import org.apache.spark.ml.classification.{ClassificationModel, Classifier} import org.apache.spark.ml.{PipelineStage, Transformer} import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.regression._ object EvaluationUtils { val ModelTypeUnsupportedErr = "Model type not supported for evaluation" // Find type of trained models def getModelType(model: PipelineStage): String = { model match { case _: TrainRegressor => SchemaConstants.RegressionKind case _: TrainClassifier => SchemaConstants.ClassificationKind case _: Classifier[_, _, _] => SchemaConstants.ClassificationKind case regressor: PipelineStage if RegressionUtils.isRegressor(regressor) => SchemaConstants.RegressionKind case _: DecisionTreeRegressor => SchemaConstants.RegressionKind case _: GBTRegressor => SchemaConstants.RegressionKind case _: RandomForestRegressor => SchemaConstants.RegressionKind case _: TrainedRegressorModel => SchemaConstants.RegressionKind case _: TrainedClassifierModel => SchemaConstants.ClassificationKind case evm: BestModel => getModelType(evm.getBestModel) case _: ClassificationModel[_, _] => SchemaConstants.ClassificationKind case _: RegressionModel[_, _] => SchemaConstants.RegressionKind case _ => throw new Exception(ModelTypeUnsupportedErr) } } def getMetricWithOperator(model: PipelineStage, evaluationMetric: String): (String, Ordering[Double]) = { val modelType = getModelType(model) getMetricWithOperator(modelType, evaluationMetric) } def getMetricWithOperator(modelType: String, evaluationMetric: String): (String, Ordering[Double]) = { val chooseHighest = Ordering.Double val chooseLowest = Ordering.Double.reverse val (evaluationMetricColumnName, operator): (String, Ordering[Double]) = modelType match { case SchemaConstants.RegressionKind => evaluationMetric match { case MetricConstants.MseSparkMetric => (MetricConstants.MseColumnName, chooseLowest) case MetricConstants.RmseSparkMetric => (MetricConstants.RmseColumnName, chooseLowest) case MetricConstants.R2SparkMetric => (MetricConstants.R2ColumnName, chooseHighest) case MetricConstants.MaeSparkMetric => (MetricConstants.MaeColumnName, chooseLowest) case _ => throw new Exception("Metric is not supported for regressors") } case SchemaConstants.ClassificationKind => evaluationMetric match { case MetricConstants.AucSparkMetric => (MetricConstants.AucColumnName, chooseHighest) case MetricConstants.PrecisionSparkMetric => (MetricConstants.PrecisionColumnName, chooseHighest) case MetricConstants.RecallSparkMetric => (MetricConstants.RecallColumnName, chooseHighest) case MetricConstants.AccuracySparkMetric => (MetricConstants.AccuracyColumnName, chooseHighest) case _ => throw new Exception("Metric is not supported for classifiers") } case _ => throw new Exception("Model type not supported for evaluation") } (evaluationMetricColumnName, operator) } def getModelParams(model: Transformer): ParamMap = { model match { case reg: TrainedRegressorModel => reg.getParamMap case cls: TrainedClassifierModel => cls.getParamMap case evm: BestModel => getModelParams(evm.getBestModel) case _ => 
throw new Exception("Model type not supported for evaluation") } } def modelParamsToString(model: Transformer): String = getModelParams(model).toSeq.map(pv => s"${pv.param.name}: ${pv.value}").sorted.mkString(", ") }
Example 24
Source File: SparkStageParam.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams import org.apache.hadoop.fs.Path import org.apache.spark.ml.PipelineStage import org.apache.spark.ml.param.{Param, ParamPair, Params} import org.apache.spark.ml.util.{Identifiable, MLReader, MLWritable} import org.apache.spark.util.SparkUtils import org.json4s.JsonAST.{JObject, JValue} import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods.{compact, parse, render} import org.json4s.{DefaultFormats, Formats, JString} class SparkStageParam[S <: PipelineStage with Params] ( parent: String, name: String, doc: String, isValid: Option[S] => Boolean ) extends Param[Option[S]](parent, name, doc, isValid) { import SparkStageParam._ var savePath: Option[String] = None override def jsonDecode(jsonStr: String): Option[S] = { val json = parse(jsonStr) val uid = (json \ "uid").extractOpt[String] val path = (json \ "path").extractOpt[String] path -> uid match { case (None, _) | (_, None) | (_, Some(NoUID)) => savePath = None None case (Some(p), Some(stageUid)) => savePath = Option(p) val stagePath = new Path(p, stageUid).toString val className = (json \ "className").extract[String] val cls = SparkUtils.classForName(className) val stage = cls.getMethod("read").invoke(null).asInstanceOf[MLReader[PipelineStage]].load(stagePath) Option(stage).map(_.asInstanceOf[S]) } } } object SparkStageParam { implicit val formats: Formats = DefaultFormats val NoClass = "" val NoUID = "" def updateParamsMetadataWithPath(jValue: JValue, path: String): JValue = jValue match { case JObject(pairs) => JObject( pairs.map { case (SparkWrapperParams.SparkStageParamName, j) => SparkWrapperParams.SparkStageParamName -> j.merge(JObject("path" -> JString(path))) case param => param } ) case j => throw new IllegalArgumentException(s"Cannot recognize JSON Spark params metadata: $j") } }
Example 25
Source File: RichParamMap.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.utils.spark import com.salesforce.op.features.TransientFeature import org.apache.spark.ml.PipelineStage import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.types.StructType object RichParamMap { implicit class RichParamMap(val params: ParamMap) extends AnyVal { def getAsMap(): Map[String, Any] = { val mapped = params.toSeq.map(pp => pp.param.name -> pp.value).toMap mapped.map { case (k, v: Array[_]) => if (v.headOption.exists(_.isInstanceOf[TransientFeature])) { k -> v.map(_.asInstanceOf[TransientFeature].toJsonString()).toList } else k -> v.toList case (k, v: StructType) => k -> v.toString() case (k, v: PipelineStage) => k -> v.getClass.getName case (k, Some(v: PipelineStage)) => k -> v.getClass.getName case (k, v) => k -> v } } } }
Example 26
Source File: OneHotEncoderDemo2.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter11.SparkMachineLearning import org.apache.spark.sql.SparkSession import org.apache.spark.ml.feature.{ OneHotEncoder, StringIndexer } import org.apache.spark.sql.types._ import org.apache.spark.sql._ import org.apache.spark.sql.functions.year import org.apache.spark.ml.{ Pipeline, PipelineStage } import org.apache.spark.ml.classification.{ LogisticRegression, LogisticRegressionModel } import org.apache.spark.ml.feature.StringIndexer import org.apache.spark.sql.{ DataFrame, SparkSession } import scala.collection.mutable import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator object OneHotEncoderDemo2 { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/") .appName(s"OneVsRestExample") .getOrCreate() val df = spark.createDataFrame( Seq((0, "Jason", "Germany"), (1, "David", "France"), (2, "Martin", "Spain"), (3, "Jason", "USA"), (4, "Daiel", "UK"), (5, "Moahmed", "Bangladesh"), (6, "David", "Ireland"), (7, "Jason", "Netherlands"))).toDF("id", "name", "address") df.show(false) val indexer = new StringIndexer() .setInputCol("name") .setOutputCol("categoryIndex") .fit(df) val indexed = indexer.transform(df) val encoder = new OneHotEncoder() .setInputCol("categoryIndex") .setOutputCol("categoryVec") val encoded = encoder.transform(indexed) encoded.show() spark.stop() } }
Example 27
Source File: StringIndexerDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter11.SparkMachineLearning import org.apache.spark.sql.SparkSession import org.apache.spark.ml.feature.{ OneHotEncoder, StringIndexer } import org.apache.spark.sql.types._ import org.apache.spark.sql._ import org.apache.spark.sql.functions.year import org.apache.spark.ml.{ Pipeline, PipelineStage } import org.apache.spark.ml.classification.{ LogisticRegression, LogisticRegressionModel } import org.apache.spark.ml.feature.StringIndexer import org.apache.spark.sql.{ DataFrame, SparkSession } import scala.collection.mutable import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.sql._ import org.apache.spark.sql.SQLContext object StringIndexerDemo { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/") .appName(s"OneVsRestExample") .getOrCreate() val df = spark.createDataFrame( Seq((0, "Jason", "Germany"), (1, "David", "France"), (2, "Martin", "Spain"), (3, "Jason", "USA"), (4, "Daiel", "UK"), (5, "Moahmed", "Bangladesh"), (6, "David", "Ireland"), (7, "Jason", "Netherlands"))).toDF("id", "name", "address") df.show(false) val indexer = new StringIndexer() .setInputCol("name") .setOutputCol("label") .fit(df) val indexed = indexer.transform(df) indexed.show(false) spark.stop() } }
Example 28
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object DecisionTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val dt = new DecisionTreeClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += dt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/DT.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/DecisionTree.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 29
Source File: TransformerWrapper.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature import com.tencent.angel.spark.automl.feature.InToOutRelation.InToOutRelation import org.apache.spark.ml.PipelineStage abstract class TransformerWrapper { val transformer: PipelineStage var parent: TransformerWrapper val relation: InToOutRelation val hasMultiInputs: Boolean val hasMultiOutputs: Boolean val needAncestorInputs: Boolean private val prefix = "out" val requiredInputCols: Array[String] val requiredOutputCols: Array[String] private var inputCols: Array[String] = _ private var outputCols: Array[String] = _ private var ancestorCols: Array[String] = _ def getTransformer = transformer def setParent(parent: TransformerWrapper): Unit = this.parent = parent def setInputCols(cols: Array[String]): Unit = inputCols = cols def setOutputCols(cols: Array[String]): Unit = outputCols = cols def getInputCols: Array[String] = inputCols def getOutputCols: Array[String] = outputCols def setAncestorCols(cols: Array[String]): Unit = ancestorCols = cols def generateInputCols(): Unit = { //require(ancestorCols.contains(requiredInputCols), "Missing required input cols.") // require(requiredInputCols.forall(ancestorCols.contains), "Missing required input cols.") // if transformer has required input cols, feed required input cols // if transformer needs all input cols, feed all input cols // if transformer has no required input cols, feed the output cols of the parent transformer if (ancestorCols.contains(requiredInputCols)) { setInputCols(requiredInputCols) } else if (needAncestorInputs) { setInputCols(ancestorCols) } else { setInputCols(parent.outputCols) } } def generateOutputCols(): Unit = { relation match { case InToOutRelation.Fixed => setOutputCols(requiredOutputCols) case InToOutRelation.InPlace => setOutputCols(inputCols) case InToOutRelation.OneToOne => setOutputCols(Array(prefix + transformer.getClass.getSimpleName)) case InToOutRelation.MultiToMulti => setOutputCols(inputCols.map(prefix + _)) case InToOutRelation.MultiToOne => setOutputCols(Array(prefix + transformer.getClass.getName.toLowerCase)) case _ => throw new IncompatibleFiledExecption( "wrong relations between input and output of transformer") } } def declareInAndOut(): this.type }
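To make the contract concrete, here is a hypothetical TransformerWrapper implementation that wraps Spark's Tokenizer. Every overridden member corresponds to an abstract member declared above; the concrete choices (column names, the OneToOne relation) are illustrative assumptions rather than code from the automl project.

import com.tencent.angel.spark.automl.feature.{InToOutRelation, TransformerWrapper}
import org.apache.spark.ml.feature.Tokenizer

// Hypothetical wrapper: one String input column in, one token-array output column out.
class TokenizerWrapper extends TransformerWrapper {
  override val transformer: Tokenizer = new Tokenizer()
  override var parent: TransformerWrapper = _
  override val relation = InToOutRelation.OneToOne
  override val hasMultiInputs = false
  override val hasMultiOutputs = false
  override val needAncestorInputs = false
  override val requiredInputCols: Array[String] = Array("sentence")
  override val requiredOutputCols: Array[String] = Array("outTokenizer")

  // Push the columns chosen by PipelineBuilder into the underlying Spark stage.
  override def declareInAndOut(): this.type = {
    transformer.setInputCol(getInputCols(0))
    transformer.setOutputCol(getOutputCols(0))
    this
  }
}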
Example 30
Source File: PipelineBuilder.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature import org.apache.spark.SparkException import org.apache.spark.ml.PipelineStage import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class IncompatibleFiledExecption(msg: String) extends SparkException(msg) {} object PipelineBuilder { def build(transformers: Array[TransformerWrapper]): Array[PipelineStage] = { val stages: ArrayBuffer[PipelineStage] = new ArrayBuffer[PipelineStage]() //val allInputCols: ArrayBuffer[String] = new ArrayBuffer[String]() val allInputCols: mutable.HashSet[String] = new mutable.HashSet[String]() transformers(0).setInputCols(transformers(0).requiredInputCols) transformers(0).setOutputCols(transformers(0).requiredOutputCols) allInputCols ++= transformers(0).getInputCols transformers(0).setAncestorCols(allInputCols.toArray) stages += transformers(0).declareInAndOut().getTransformer (1 until transformers.length).foreach { i => println(s"add $i-th transformer = ${transformers(i).getTransformer.getClass.getSimpleName}") // set parent transformers(i).setParent(transformers(i - 1)) // add new cols allInputCols ++= transformers(i - 1).getOutputCols // set parent cols transformers(i).setAncestorCols(allInputCols.toArray) // generate input cols transformers(i).generateInputCols() // generate output cols transformers(i).generateOutputCols() // add fully configured transformer stages += transformers(i).declareInAndOut().getTransformer } stages.toArray } }
Example 31
Source File: PipelineWrapper.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} import org.apache.spark.sql.{DataFrame, Dataset} class PipelineWrapper() { var pipeline = new Pipeline() var transformers: Array[TransformerWrapper] = Array() def setTransformers(value: Array[TransformerWrapper]): this.type = { transformers = value setStages(PipelineBuilder.build(transformers)) this } def setStages(value: Array[_ <: PipelineStage]): Unit = { pipeline = pipeline.setStages(value) } def fit(dataset: Dataset[_]): PipelineModelWrapper = { new PipelineModelWrapper(pipeline.fit(dataset), transformers) } } class PipelineModelWrapper(val model: PipelineModel, val transformers: Array[TransformerWrapper]) { def transform(dataset: Dataset[_]): DataFrame = { var df = model.transform(dataset) if (transformers.length >= 2) { (0 until transformers.length - 1).foreach { i => val outCols = transformers(i).getOutputCols for (col <- outCols) { df = df.drop(col) } } } df } }
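Tying the automl pieces together, a hedged end-to-end sketch: PipelineWrapper takes an array of TransformerWrapper instances (for example the hypothetical TokenizerWrapper sketched under Example 29), lets PipelineBuilder wire the input/output columns, fits the resulting Pipeline, and drops intermediate output columns on transform. The input DataFrame and its "sentence" column are assumptions for illustration.

import com.tencent.angel.spark.automl.feature.PipelineWrapper
import org.apache.spark.sql.DataFrame

// df is assumed to have a String "sentence" column, matching the hypothetical
// TokenizerWrapper's requiredInputCols.
def tokenizeWithWrapper(df: DataFrame): DataFrame = {
  val wrapper = new PipelineWrapper()
  wrapper.setTransformers(Array(new TokenizerWrapper()))
  val fitted = wrapper.fit(df)   // PipelineModelWrapper
  fitted.transform(df)           // with a single stage there are no intermediate columns to drop
}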
Example 32
Source File: Components.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature.preprocess import org.apache.spark.ml.PipelineStage import org.apache.spark.ml.feature.{StopWordsRemover, Tokenizer} import org.apache.spark.sql.DataFrame import scala.collection.mutable.ArrayBuffer object Components { def sample(data: DataFrame, fraction: Double): DataFrame = { data.sample(false, fraction) } def addSampler(components: ArrayBuffer[PipelineStage], inputCol: String, fraction: Double): Unit = { val sampler = new Sampler(fraction) .setInputCol("features") components += sampler } def addTokenizer(components: ArrayBuffer[PipelineStage], inputCol: String, outputCol: String): Unit = { val tokenizer = new Tokenizer() .setInputCol(inputCol) .setOutputCol(outputCol) components += tokenizer } def addStopWordsRemover(components: ArrayBuffer[PipelineStage], inputCol: String, outputCol: String): Unit = { val remover = new StopWordsRemover() .setInputCol(inputCol) .setOutputCol(outputCol) components += remover } }
Example 33
Source File: FPreprocess.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature.preprocess import com.tencent.angel.spark.automl.AutoConf import com.tencent.angel.spark.automl.feature.DataLoader import com.tencent.angel.spark.automl.utils.ArgsUtil import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.SparkSession import scala.collection.mutable.ArrayBuffer object FPreprocess { def main(args: Array[String]): Unit = { val params = ArgsUtil.parse(args) val master = params.getOrElse("master", "yarn") val deploy = params.getOrElse("deploy-mode", "cluster") val input = params.getOrElse("input", "") val inputSeparator = params.getOrElse(AutoConf.Preprocess.ML_DATA_SPLITOR, AutoConf.Preprocess.DEFAULT_ML_DATA_SPLITOR) val inputFormat = params.getOrElse(AutoConf.Preprocess.ML_DATA_INPUT_FORMAT, AutoConf.Preprocess.DEFAULT_ML_DATA_INPUT_FORMAT) val inputType = params.getOrElse(AutoConf.Preprocess.INPUT_TYPE, AutoConf.Preprocess.DEFAULT_INPUT_TYPE) val sampleRate = params.getOrElse(AutoConf.Preprocess.SAMPLE_RATE, AutoConf.Preprocess.DEFAULT_SAMPLE_RATE).toDouble val imbalanceSampleRate = params.getOrElse(AutoConf.Preprocess.IMBALANCE_SAMPLE, AutoConf.Preprocess.DEFAULT_IMBALANCE_SAMPLE) val hasTokenizer = if (inputFormat.equals("document")) true else false val hasStopWordsRemover = if (inputFormat.equals("document")) true else false val ss = SparkSession .builder .master(master + "-" + deploy) .appName("preprocess") .getOrCreate() var training = DataLoader.load(ss, inputFormat, input, inputSeparator) var components = new ArrayBuffer[PipelineStage] if (sampleRate > 0 & sampleRate < 1.0) Components.addSampler(components, "features", sampleRate) if (hasTokenizer) Components.addTokenizer(components, "sentence", "words") if (hasStopWordsRemover) Components.addStopWordsRemover(components, "words", "filterWords") val pipeline = new Pipeline() .setStages(components.toArray) val model = pipeline.fit(training) ss.stop() } }
Example 34
Source File: GBTLRCtrModel.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.ctrmodel import com.ggstar.features.FeatureEngineering import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.ml.gbtlr.GBTLRClassifier import org.apache.spark.sql.DataFrame class GBTLRCtrModel extends BaseCtrModel { def train(samples:DataFrame) : Unit = { val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples) val featureEngineeringStages:Array[PipelineStage] = FeatureEngineering.preProcessInnerProductSamplesStages() val model = new GBTLRClassifier() .setFeaturesCol("scaledFeatures") .setLabelCol("label") .setGBTMaxIter(10) .setLRMaxIter(100) .setRegParam(0.01) .setElasticNetParam(0.5) val pipelineStages = featureEngineeringStages ++ Array(model) _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samplesWithInnerProduct) } override def transform(samples:DataFrame):DataFrame = { val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples) _pipelineModel.transform(samplesWithInnerProduct) } }
Example 35
Source File: LogisticRegressionCtrModel.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.ctrmodel import com.ggstar.features.FeatureEngineering import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.sql.DataFrame class LogisticRegressionCtrModel extends BaseCtrModel { def train(samples:DataFrame) : Unit = { val featureEngineeringStages:Array[PipelineStage] = FeatureEngineering.preProcessSamplesStages() val model:LogisticRegression = new LogisticRegression() .setMaxIter(20) //max iteration .setRegParam(0.0) //regularization parameter .setElasticNetParam(0.0) //0-L2 regularization 1-L1 regularization .setFeaturesCol("scaledFeatures") .setLabelCol("label") val pipelineStages = featureEngineeringStages ++ Array(model) _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samples) } }
Example 36
Source File: CrossValidation.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.spark.mlpipeline import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel} import org.apache.spark.ml.{Model, Pipeline, PipelineStage} import org.apache.spark.sql._ @throws(classOf[IllegalArgumentException]) protected def apply( trainDf: DataFrame, stages: Array[PipelineStage], grid: Array[ParamMap] ): CrossValidatorModel = { require(stages.size > 0, "Cannot cross-validate pipeline without stages") require(grid.size > 0, "Cannot cross-validate with undefined grid") val pipeline = new Pipeline().setStages(stages ++ Array[PipelineStage](estimator)) new CrossValidator() .setEstimator(pipeline) .setEstimatorParamMaps(grid) .setEvaluator(new BinaryClassificationEvaluator) .setNumFolds(numFolds) .fit(trainDf) } protected def evaluate( trainDf: DataFrame, stages: Array[PipelineStage], grid: Array[ParamMap] ): Evaluator = this(trainDf, stages, grid).getEvaluator }
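The apply method above expects the feature stages and a parameter grid; the estimator and numFolds values it references belong to the enclosing class, which is not shown in this excerpt. A hedged sketch of how such inputs are typically assembled with Spark's ParamGridBuilder (the concrete estimator and columns are placeholders):

import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.ParamGridBuilder

// Placeholder estimator and feature stage standing in for the enclosing class's members.
val estimator = new LogisticRegression().setFeaturesCol("features").setLabelCol("label")
val assembler = new VectorAssembler().setInputCols(Array("f1", "f2")).setOutputCol("features")

val stages: Array[PipelineStage] = Array(assembler)
val grid: Array[ParamMap] = new ParamGridBuilder()
  .addGrid(estimator.regParam, Array(0.01, 0.1))
  .addGrid(estimator.maxIter, Array(50, 100))
  .build()
// stages and grid are exactly what apply(trainDf, stages, grid) above consumes.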
Example 37
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object DecisionTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val dt = new DecisionTreeClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += dt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 38
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val nb = new NaiveBayes()
    stages += vectorAssembler
    stages += nb

    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 39
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)
    stages += vectorAssembler
    stages += rf

    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/RF.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/RandomForest.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
      regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 40
Source File: OneHotEncoder.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature

import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql._

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}

object OneHotEncoder extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer {

  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    import ctx.sqlContext.implicits._

    DataGenerator.generateMixedFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      Array.fill(1)(featureArity.get)
    ).rdd.map { case Row(vec: Vector) =>
      vec(0) // extract the single generated double value for each row
    }.toDF(inputCol)
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    new ml.feature.OneHotEncoder()
      .setInputCol(inputCol)
  }
}
Example 42
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val nb = new NaiveBayes()
    stages += vectorAssembler
    stages += nb

    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/NB.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/NaiveBayes.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
      regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 43
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)
    stages += vectorAssembler
    stages += rf

    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 44
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)
    stages += vectorAssembler
    stages += dt

    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 45
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object GradientBoostedTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val gbt = new GBTClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxIter(10)
    stages += vectorAssembler
    stages += gbt

    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
      regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 46
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val nb = new NaiveBayes()
    stages += vectorAssembler
    stages += nb

    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 47
Source File: Tokenizer.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature

import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql._

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}

object Tokenizer extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer {

  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    import ctx.sqlContext.implicits._

    DataGenerator.generateDoc(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      vocabSize,
      docLength,
      inputCol)
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    new ml.feature.Tokenizer()
      .setInputCol(inputCol)
  }
}
Example 48
Source File: Bucketizer.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature

import scala.util.Random

import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql._

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}

object Bucketizer extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer {

  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    import ctx.sqlContext.implicits._

    val rng = ctx.newGenerator()
    // For a bucketizer, training data consists of a single column of random doubles
    DataGenerator.generateContinuousFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      numFeatures = 1
    ).rdd.map { case Row(vec: Vector) =>
      vec(0) // extract the single generated double value for each row
    }.toDF(inputCol)
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._

    val rng = ctx.newGenerator()
    // Generate an array of (finite) splitting points in [-1, 1) for the Bucketizer
    val splitPoints = 0.until(bucketizerNumBuckets - 1).map { _ =>
      2 * rng.nextDouble() - 1
    }.sorted.toArray
    // Final array of splits contains +/- infinity
    val splits = Array(Double.NegativeInfinity) ++ splitPoints ++ Array(Double.PositiveInfinity)

    new ml.feature.Bucketizer()
      .setSplits(splits)
      .setInputCol(inputCol)
  }
}
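The benchmark above only constructs the Bucketizer stage and never applies it. Below is a small self-contained sketch of what that stage does when applied; the demo object, column names, sample values, and split points are illustrative and not part of spark-sql-perf.

import org.apache.spark.ml.feature.Bucketizer
import org.apache.spark.sql.SparkSession

object BucketizerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("bucketizer-demo")
      .getOrCreate()
    import spark.implicits._

    // Splits must be strictly increasing and cover the full input range.
    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
    val data = Seq(-0.9, -0.2, 0.1, 0.7).toDF("value")

    val bucketizer = new Bucketizer()
      .setSplits(splits)
      .setInputCol("value")
      .setOutputCol("bucket")

    // Each value is mapped to the index of the bucket it falls into (0 to 3 here).
    bucketizer.transform(data).show()
    spark.stop()
  }
}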
Example 49
Source File: HashingTF.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature

import scala.util.Random

import org.apache.spark.ml
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql._
import org.apache.spark.sql.functions.split

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}

object HashingTF extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer {

  // Sample a random sentence of length up to maxLen from the provided array of words
  private def randomSentence(rng: Random, maxLen: Int, dictionary: Array[String]): Array[String] = {
    val length = rng.nextInt(maxLen - 1) + 1
    val dictLength = dictionary.length
    Array.tabulate[String](length)(_ => dictionary(rng.nextInt(dictLength)))
  }

  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._

    // To test HashingTF, we generate arrays of (on average) docLength strings, where
    // each string is selected from a pool of vocabSize strings
    // The expected # of occurrences of each word in our vocabulary is
    // (docLength * numExamples) / vocabSize
    val df = DataGenerator.generateDoc(
      ctx.sqlContext,
      numExamples = numExamples,
      seed = ctx.seed(),
      numPartitions = numPartitions,
      vocabSize = vocabSize,
      avgDocLength = docLength,
      dataColName = inputCol)
    df.withColumn(inputCol, split(df(inputCol), " "))
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._

    new ml.feature.HashingTF()
      .setInputCol(inputCol)
      .setNumFeatures(numFeatures)
  }
}
Example 50
Source File: StringIndexer.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature

import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql._

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}

object StringIndexer extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer {

  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    import ctx.sqlContext.implicits._

    DataGenerator.generateRandString(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      vocabSize,
      inputCol)
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    import ctx.sqlContext.implicits._

    new ml.feature.StringIndexer()
      .setInputCol(inputCol)
      .setHandleInvalid("skip")
  }
}
Example 51
Source File: GenericTestSpec.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving

import io.hydrosphere.spark_ml_serving.common.LocalData
import org.apache.spark.SparkConf
import org.apache.spark.ml.linalg.{Matrix, Vector}
import org.apache.spark.mllib.linalg.{Matrix => OldMatrix, Vector => OldVector}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.scalatest.{BeforeAndAfterAll, FunSpec}

trait GenericTestSpec extends FunSpec with BeforeAndAfterAll {
  val conf = new SparkConf()
    .setMaster("local[2]")
    .setAppName("test")
    .set("spark.ui.enabled", "false")

  val session: SparkSession = SparkSession.builder().config(conf).getOrCreate()

  def modelPath(modelName: String): String = s"./target/test_models/${session.version}/$modelName"

  def test(
    name: String,
    data: => DataFrame,
    steps: => Seq[PipelineStage],
    columns: => Seq[String],
    accuracy: Double = 0.01
  ) = {
    val path = modelPath(name.toLowerCase())
    var validation = LocalData.empty
    var localPipelineModel = Option.empty[LocalPipelineModel]

    it("should train") {
      val pipeline = new Pipeline().setStages(steps.toArray)
      val pipelineModel = pipeline.fit(data)
      validation = LocalData.fromDataFrame(pipelineModel.transform(data))
      pipelineModel.write.overwrite().save(path)
    }

    it("should load local version") {
      localPipelineModel = Some(LocalPipelineModel.load(path))
      assert(localPipelineModel.isDefined)
    }

    it("should transform LocalData") {
      val localData = LocalData.fromDataFrame(data)
      val model = localPipelineModel.get
      val result = model.transform(localData)
      columns.foreach { col =>
        val resCol = result
          .column(col)
          .getOrElse(throw new IllegalArgumentException("Result column is absent"))
        val valCol = validation
          .column(col)
          .getOrElse(throw new IllegalArgumentException("Validation column is absent"))
        resCol.data.zip(valCol.data).foreach {
          // compare element-wise when both sides are sequences of numbers
          case (r: Seq[Number @unchecked], v: Seq[Number @unchecked])
              if r.head.isInstanceOf[Number] && v.head.isInstanceOf[Number] =>
            r.zip(v).foreach {
              case (ri, vi) =>
                assert(ri.doubleValue() - vi.doubleValue() <= accuracy, s"$ri - $vi > $accuracy")
            }
          case (r: Number, v: Number) =>
            assert(r.doubleValue() - v.doubleValue() <= accuracy, s"$r - $v > $accuracy")
          case (r, n) =>
            assert(r === n)
        }

        result.column(col).foreach { resData =>
          resData.data.foreach { resRow =>
            if (resRow.isInstanceOf[Seq[_]]) {
              assert(resRow.isInstanceOf[List[_]], resRow)
            } else if (resRow.isInstanceOf[Vector] || resRow.isInstanceOf[OldVector] ||
                       resRow.isInstanceOf[Matrix] || resRow.isInstanceOf[OldMatrix]) {
              assert(false, s"SparkML type detected. Column: $col, value: $resRow")
            }
          }
        }
      }
    }
  }

  def modelTest(
    data: => DataFrame,
    steps: => Seq[PipelineStage],
    columns: => Seq[String],
    accuracy: Double = 0.01
  ): Unit = {
    lazy val name = steps.map(_.getClass.getSimpleName).foldLeft("") {
      case ("", b) => b
      case (a, b) => a + "-" + b
    }

    describe(name) {
      test(name, data, steps, columns, accuracy)
    }
  }
}
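A hypothetical concrete spec, not part of the library, sketching how modelTest could be wired up with a Tokenizer plus HashingTF pipeline. The class name, column names, sample rows, and the assumption that spark-ml-serving provides local implementations for these two stages are all illustrative.

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

class TokenizerHashingTFSpec extends GenericTestSpec {
  import session.implicits._

  // Tiny illustrative dataset; a real spec would use representative data.
  val df = Seq((0, "spark ml serving test"), (1, "local pipeline model"))
    .toDF("id", "text")

  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
  val hashingTF = new HashingTF()
    .setInputCol("words")
    .setOutputCol("features")
    .setNumFeatures(32)

  // Registers a describe block named "Tokenizer-HashingTF" with the three shared tests.
  modelTest(
    data = df,
    steps = Seq(tokenizer, hashingTF),
    columns = Seq("features")
  )
}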