org.apache.spark.ml.Pipeline Scala Examples
The following examples show how to use org.apache.spark.ml.Pipeline. Each example is taken from an open-source project; the source file, originating project, and license are noted in the heading above it.
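Before the project examples, the minimal sketch below shows the basic Pipeline pattern these examples build on: chain stages, call fit to get a PipelineModel, then transform. The toy DataFrame, column names, and object name here are illustrative assumptions, not taken from any of the projects listed below.

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

object PipelineSketch extends App {
  val spark = SparkSession.builder().master("local[2]").appName("pipeline-sketch").getOrCreate()

  // toy training data: id, text, label (illustrative only)
  val training = spark.createDataFrame(Seq(
    (0L, "a b c d e spark", 1.0),
    (1L, "b d", 0.0),
    (2L, "spark f g h", 1.0),
    (3L, "hadoop mapreduce", 0.0)
  )).toDF("id", "text", "label")

  // a Pipeline of two Transformers and one Estimator: Tokenizer -> HashingTF -> LogisticRegression
  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
  val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features")
  val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.001)
  val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))

  // fit produces a PipelineModel, which is itself a Transformer
  val model = pipeline.fit(training)
  model.transform(training).select("id", "text", "probability", "prediction").show()

  spark.stop()
}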
Example 1
Source File: MultilayerPerceptronClassifierWrapper.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel,
    val labelCount: Long,
    val layers: Array[Int],
    val weights: Array[Double]
  ) extends MLWritable {

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
  }

  override def write: MLWriter =
    new MultilayerPerceptronClassifierWrapper.MultilayerPerceptronClassifierWrapperWriter(this)
}

private[r] object MultilayerPerceptronClassifierWrapper
  extends MLReadable[MultilayerPerceptronClassifierWrapper] {

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val labelCount = (rMetadata \ "labelCount").extract[Long]
      val layers = (rMetadata \ "layers").extract[Array[Int]]
      val weights = (rMetadata \ "weights").extract[Array[Double]]

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline, labelCount, layers, weights)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("labelCount" -> instance.labelCount) ~
        ("layers" -> instance.layers.toSeq) ~
        ("weights" -> instance.weights.toArray.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 2
Source File: MNISTBenchmark.scala From spark-knn with Apache License 2.0
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable

object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val ns = if(args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if(args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if(args.length >= 3) args(2).toInt else 10
    val models = if(args.length >= 4) args(3).split(',') else Array("tree", "naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    //read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset = MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() //force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if(models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if(models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }

    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
}
Example 3
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 4
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object GradientBoostedTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val gbt = new GBTClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxIter(10)

    stages += vectorAssembler
    stages += gbt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
                      regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 5
Source File: ModelPersistence.scala From reactive-machine-learning-systems with MIT License
package com.reactivemachinelearning

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{QuantileDiscretizer, VectorAssembler}
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}
import org.apache.spark.sql.SparkSession

object ModelPersistence extends App {

  val session = SparkSession.builder.appName("ModelPersistence").getOrCreate()

  val data = Seq(
    (0, 18.0, 0),
    (1, 20.0, 0),
    (2, 8.0, 1),
    (3, 5.0, 1),
    (4, 2.0, 0),
    (5, 21.0, 0),
    (6, 7.0, 1),
    (7, 18.0, 0),
    (8, 3.0, 1),
    (9, 22.0, 0),
    (10, 8.0, 1),
    (11, 2.0, 0),
    (12, 5.0, 1),
    (13, 4.0, 1),
    (14, 1.0, 0),
    (15, 11.0, 0),
    (16, 7.0, 1),
    (17, 15.0, 0),
    (18, 3.0, 1),
    (19, 20.0, 0))

  val instances = session.createDataFrame(data)
    .toDF("id", "seeds", "label")

  val discretizer = new QuantileDiscretizer()
    .setInputCol("seeds")
    .setOutputCol("discretized")
    .setNumBuckets(3)

  val assembler = new VectorAssembler()
    .setInputCols(Array("discretized"))
    .setOutputCol("features")

  val classifier = new LogisticRegression()
    .setMaxIter(5)

  val pipeline = new Pipeline()
    .setStages(Array(discretizer, assembler, classifier))

  val paramMaps = new ParamGridBuilder()
    .addGrid(classifier.regParam, Array(0.0, 0.1))
    .build()

  val evaluator = new BinaryClassificationEvaluator()

  val crossValidator = new CrossValidator()
    .setEstimator(pipeline)
    .setEvaluator(evaluator)
    .setNumFolds(2)
    .setEstimatorParamMaps(paramMaps)

  val model = crossValidator.fit(instances)

  model.write.overwrite().save("my-model")

  val persistedModel = CrossValidatorModel.load("./my-model")
  println(s"UID: ${persistedModel.uid}")
}
Example 6
Source File: Main.scala From spark-ml-serving with Apache License 2.0
import io.hydrosphere.spark_ml_serving.LocalPipelineModel
import io.hydrosphere.spark_ml_serving.common.{LocalData, LocalDataColumn}
import org.apache.spark.SparkConf
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.SparkSession

object Train extends App {

  val conf = new SparkConf()
    .setMaster("local[2]")
    .setAppName("example")
    .set("spark.ui.enabled", "false")

  val session: SparkSession = SparkSession.builder().config(conf).getOrCreate()

  val df = session.createDataFrame(Seq(
    (0, Array("a", "b", "c")),
    (1, Array("a", "b", "b", "c", "a"))
  )).toDF("id", "words")

  val cv = new CountVectorizer()
    .setInputCol("words")
    .setOutputCol("features")
    .setVocabSize(3)
    .setMinDF(2)

  val pipeline = new Pipeline().setStages(Array(cv))

  val model = pipeline.fit(df)

  model.write.overwrite().save("../target/test_models/2.0.2/countVectorizer")
}

object Serve extends App {

  import LocalPipelineModel._

  val model = LocalPipelineModel
    .load("../target/test_models/2.0.2/countVectorizer")

  val data = LocalData(List(LocalDataColumn("words", List(
    List("a", "b", "d"),
    List("a", "b", "b", "b")
  ))))

  val result = model.transform(data)

  println(result)
}
Example 7
Source File: GenericTestSpec.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving

import io.hydrosphere.spark_ml_serving.common.LocalData
import org.apache.spark.SparkConf
import org.apache.spark.ml.linalg.{Matrix, Vector}
import org.apache.spark.mllib.linalg.{Matrix => OldMatrix, Vector => OldVector}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.scalatest.{BeforeAndAfterAll, FunSpec}

trait GenericTestSpec extends FunSpec with BeforeAndAfterAll {

  val conf = new SparkConf()
    .setMaster("local[2]")
    .setAppName("test")
    .set("spark.ui.enabled", "false")

  val session: SparkSession = SparkSession.builder().config(conf).getOrCreate()

  def modelPath(modelName: String): String = s"./target/test_models/${session.version}/$modelName"

  def test(
    name: String,
    data: => DataFrame,
    steps: => Seq[PipelineStage],
    columns: => Seq[String],
    accuracy: Double = 0.01
  ) = {
    val path = modelPath(name.toLowerCase())
    var validation = LocalData.empty
    var localPipelineModel = Option.empty[LocalPipelineModel]

    it("should train") {
      val pipeline = new Pipeline().setStages(steps.toArray)
      val pipelineModel = pipeline.fit(data)
      validation = LocalData.fromDataFrame(pipelineModel.transform(data))
      pipelineModel.write.overwrite().save(path)
    }

    it("should load local version") {
      localPipelineModel = Some(LocalPipelineModel.load(path))
      assert(localPipelineModel.isDefined)
    }

    it("should transform LocalData") {
      val localData = LocalData.fromDataFrame(data)
      val model = localPipelineModel.get
      val result = model.transform(localData)
      columns.foreach { col =>
        val resCol = result
          .column(col)
          .getOrElse(throw new IllegalArgumentException("Result column is absent"))
        val valCol = validation
          .column(col)
          .getOrElse(throw new IllegalArgumentException("Validation column is absent"))
        resCol.data.zip(valCol.data).foreach {
          case (r: Seq[Number @unchecked], v: Seq[Number @unchecked])
            if r.head.isInstanceOf[Number] && v.head.isInstanceOf[Number] =>
            r.zip(v).foreach {
              case (ri, vi) =>
                assert(ri.doubleValue() - vi.doubleValue() <= accuracy, s"$ri - $vi > $accuracy")
            }
          case (r: Number, v: Number) =>
            assert(r.doubleValue() - v.doubleValue() <= accuracy, s"$r - $v > $accuracy")
          case (r, n) =>
            assert(r === n)
        }
        result.column(col).foreach { resData =>
          resData.data.foreach { resRow =>
            if (resRow.isInstanceOf[Seq[_]]) {
              assert(resRow.isInstanceOf[List[_]], resRow)
            } else if (resRow.isInstanceOf[Vector] || resRow.isInstanceOf[OldVector] ||
              resRow.isInstanceOf[Matrix] || resRow.isInstanceOf[OldMatrix]) {
              assert(false, s"SparkML type detected. Column: $col, value: $resRow")
            }
          }
        }
      }
    }
  }

  def modelTest(
    data: => DataFrame,
    steps: => Seq[PipelineStage],
    columns: => Seq[String],
    accuracy: Double = 0.01
  ): Unit = {
    lazy val name = steps.map(_.getClass.getSimpleName).foldLeft("") {
      case ("", b) => b
      case (a, b) => a + "-" + b
    }

    describe(name) {
      test(name, data, steps, columns, accuracy)
    }
  }
}
Example 8
Source File: TokenizerSuite.scala From spark-nkp with Apache License 2.0
package com.github.uosdmlab.nkp

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF}
import org.apache.spark.sql.SparkSession
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfter, FunSuite}

class TokenizerSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAfter {

  private var tokenizer: Tokenizer = _

  private val spark: SparkSession =
    SparkSession.builder()
      .master("local[2]")
      .appName("Tokenizer Suite")
      .getOrCreate

  spark.sparkContext.setLogLevel("WARN")

  import spark.implicits._

  override protected def afterAll(): Unit = {
    try {
      spark.stop
    } finally {
      super.afterAll()
    }
  }

  before {
    tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
  }

  private val df = spark.createDataset(
    Seq(
      "아버지가방에들어가신다.",
      "사랑해요 제플린!",
      "스파크는 재밌어",
      "나는야 데이터과학자",
      "데이터야~ 놀자~"
    )
  ).toDF("text")

  test("Default parameters") {
    assert(tokenizer.getFilter sameElements Array.empty[String])
  }

  test("Basic operation") {
    val words = tokenizer.transform(df)
    assert(df.count == words.count)
    assert(words.schema.fieldNames.contains(tokenizer.getOutputCol))
  }

  test("POS filter") {
    val nvTokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("nvWords")
      .setFilter("N", "V")

    val words = tokenizer.transform(df).join(nvTokenizer.transform(df), "text")

    assert(df.count == words.count)
    assert(words.schema.fieldNames.contains(nvTokenizer.getOutputCol))
    assert(words.where(s"SIZE(${tokenizer.getOutputCol}) < SIZE(${nvTokenizer.getOutputCol})").count == 0)
  }

  test("TF-IDF pipeline") {
    tokenizer.setFilter("N")

    val cntVec = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("tf")

    val idf = new IDF()
      .setInputCol("tf")
      .setOutputCol("tfidf")

    val pipe = new Pipeline()
      .setStages(Array(tokenizer, cntVec, idf))

    val pipeModel = pipe.fit(df)

    val result = pipeModel.transform(df)
    assert(result.count == df.count)

    val fields = result.schema.fieldNames
    assert(fields.contains(tokenizer.getOutputCol))
    assert(fields.contains(cntVec.getOutputCol))
    assert(fields.contains(idf.getOutputCol))

    result.show
  }
}
Example 9
Source File: FeatureCrossSelectorExample.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature.examples

import org.apache.spark.SparkConf
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.operator.{VarianceSelector, VectorCartesian}
import org.apache.spark.sql.SparkSession

object FeatureCrossSelectorExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()

    val input = conf.get("spark.input.path", "data/a9a/a9a_123d_train_trans.libsvm")
    val numFeatures = conf.get("spark.num.feature", "123")
    val twoOrderNumFeatures = conf.getInt("spark.two.order.num.feature", 123)
    val threeOrderNumFeatures = conf.getInt("spark.three.order.num.feature", 123)

    val spark = SparkSession.builder().master("local").config(conf).getOrCreate()

    val data = spark.read.format("libsvm")
      .option("numFeatures", numFeatures)
      .load(input)
      .persist()

    val cartesian = new VectorCartesian()
      .setInputCols(Array("features", "features"))
      .setOutputCol("f_f")

    val selector = new VarianceSelector()
      .setFeaturesCol("f_f")
      .setOutputCol("selected_f_f")
      .setNumTopFeatures(twoOrderNumFeatures)

    val cartesian2 = new VectorCartesian()
      .setInputCols(Array("features", "selected_f_f"))
      .setOutputCol("f_f_f")

    val selector2 = new VarianceSelector()
      .setFeaturesCol("f_f_f")
      .setOutputCol("selected_f_f_f")
      .setNumTopFeatures(threeOrderNumFeatures)

    val assembler = new VectorAssembler()
      .setInputCols(Array("features", "selected_f_f", "selected_f_f_f"))
      .setOutputCol("assembled_features")

    val pipeline = new Pipeline()
      .setStages(Array(cartesian, selector, cartesian2, selector2, assembler))

    val crossDF = pipeline.fit(data).transform(data).persist()
    data.unpersist()
    crossDF.drop("f_f", "f_f_f", "selected_f_f", "selected_f_f_f")
    crossDF.show(1)

    val splitDF = crossDF.randomSplit(Array(0.9, 0.1))

    val trainDF = splitDF(0).persist()
    val testDF = splitDF(1).persist()

    val originalLR = new LogisticRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setMaxIter(20)
      .setRegParam(0.01)

    val originalPredictions = originalLR.fit(trainDF).transform(testDF)
    originalPredictions.show(1)
    val originalEvaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("rawPrediction")
      .setMetricName("areaUnderROC")
    val originalAUC = originalEvaluator.evaluate(originalPredictions)
    println(s"original features auc: $originalAUC")

    val crossLR = new LogisticRegression()
      .setFeaturesCol("assembled_features")
      .setLabelCol("label")
      .setMaxIter(20)
      .setRegParam(0.01)

    val crossPredictions = crossLR.fit(trainDF).transform(testDF)
    crossPredictions.show(1)
    val crossEvaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("rawPrediction")
      .setMetricName("areaUnderROC")
    val crossAUC = crossEvaluator.evaluate(crossPredictions)
    println(s"cross features auc: $crossAUC")

    spark.close()
  }
}
Example 10
Source File: PipelineWrapper.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature

import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.sql.{DataFrame, Dataset}

class PipelineWrapper() {

  var pipeline = new Pipeline()

  var transformers: Array[TransformerWrapper] = Array()

  def setTransformers(value: Array[TransformerWrapper]): this.type = {
    transformers = value
    setStages(PipelineBuilder.build(transformers))
    this
  }

  def setStages(value: Array[_ <: PipelineStage]): Unit = {
    pipeline = pipeline.setStages(value)
  }

  def fit(dataset: Dataset[_]): PipelineModelWrapper = {
    new PipelineModelWrapper(pipeline.fit(dataset), transformers)
  }
}

class PipelineModelWrapper(val model: PipelineModel,
                           val transformers: Array[TransformerWrapper]) {

  def transform(dataset: Dataset[_]): DataFrame = {
    var df = model.transform(dataset)
    if (transformers.length >= 2) {
      (0 until transformers.length - 1).foreach { i =>
        val outCols = transformers(i).getOutputCols
        for (col <- outCols) {
          df = df.drop(col)
        }
      }
    }
    df
  }
}
Example 11
Source File: Sampler.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

import scala.util.Random

class Sampler(fraction: Double,
              override val uid: String,
              seed: Int = Random.nextInt)
  extends Transformer {

  def this(fraction: Double) = this(fraction, Identifiable.randomUID("sampler"))

  // input column parameter, required by $(inputCol) and the setInputCol call below
  final val inputCol = new Param[String](this, "inputCol", "input column name")

  final def setInputCol(value: String): this.type = set(inputCol, value)

  final def getOutputCol: String = $(inputCol)

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.sample(false, fraction, seed).toDF
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): Sampler = defaultCopy(extra)
}

object Sampler {

  def main(args: Array[String]): Unit = {
    val ss = SparkSession
      .builder
      .master("local")
      .appName("preprocess")
      .getOrCreate()

    val training = ss.read.format("libsvm")
      .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")

    println(training.count)

    val sampler = new Sampler(0.5)
      .setInputCol("features")

    val pipeline = new Pipeline()
      .setStages(Array(sampler))

    val model = pipeline.fit(training)

    val test = ss.read.format("libsvm")
      .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")

    model.transform(test).select("*")
      .collect()
      .foreach { case Row(label: Double, vector: Vector) =>
        println(s"($label, " +
          s"${vector.toSparse.indices.mkString("[", ",", "]")}, " +
          s"${vector.toSparse.values.mkString("[", ",", "]")})")
      }

    ss.stop()
  }
}
Example 12
Source File: FPreprocess.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import com.tencent.angel.spark.automl.AutoConf
import com.tencent.angel.spark.automl.feature.DataLoader
import com.tencent.angel.spark.automl.utils.ArgsUtil
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ArrayBuffer

object FPreprocess {

  def main(args: Array[String]): Unit = {
    val params = ArgsUtil.parse(args)
    val master = params.getOrElse("master", "yarn")
    val deploy = params.getOrElse("deploy-mode", "cluster")
    val input = params.getOrElse("input", "")
    val inputSeparator = params.getOrElse(AutoConf.Preprocess.ML_DATA_SPLITOR,
      AutoConf.Preprocess.DEFAULT_ML_DATA_SPLITOR)
    val inputFormat = params.getOrElse(AutoConf.Preprocess.ML_DATA_INPUT_FORMAT,
      AutoConf.Preprocess.DEFAULT_ML_DATA_INPUT_FORMAT)
    val inputType = params.getOrElse(AutoConf.Preprocess.INPUT_TYPE,
      AutoConf.Preprocess.DEFAULT_INPUT_TYPE)
    val sampleRate = params.getOrElse(AutoConf.Preprocess.SAMPLE_RATE,
      AutoConf.Preprocess.DEFAULT_SAMPLE_RATE).toDouble
    val imbalanceSampleRate = params.getOrElse(AutoConf.Preprocess.IMBALANCE_SAMPLE,
      AutoConf.Preprocess.DEFAULT_IMBALANCE_SAMPLE)
    val hasTokenizer = if (inputFormat.equals("document")) true else false
    val hasStopWordsRemover = if (inputFormat.equals("document")) true else false

    val ss = SparkSession
      .builder
      .master(master + "-" + deploy)
      .appName("preprocess")
      .getOrCreate()

    var training = DataLoader.load(ss, inputFormat, input, inputSeparator)

    var components = new ArrayBuffer[PipelineStage]

    if (sampleRate > 0 & sampleRate < 1.0)
      Components.addSampler(components, "features", sampleRate)

    if (hasTokenizer)
      Components.addTokenizer(components, "sentence", "words")

    if (hasStopWordsRemover)
      Components.addStopWordsRemover(components, "words", "filterWords")

    val pipeline = new Pipeline()
      .setStages(components.toArray)

    val model = pipeline.fit(training)

    ss.stop()
  }
}
Example 13
Source File: MetadataTest.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.operator.{MetadataTransformUtils, VectorCartesian}
import org.apache.spark.sql.SparkSession
import org.scalatest.{BeforeAndAfter, FunSuite}

class MetadataTest extends FunSuite with BeforeAndAfter {

  var spark: SparkSession = _

  before {
    spark = SparkSession.builder().master("local").getOrCreate()
  }

  after {
    spark.close()
  }

  test("test_vector_cartesian") {
    val data = spark.read.format("libsvm")
      .option("numFeatures", "123")
      .load("data/a9a/a9a_123d_train_trans.libsvm")
      .persist()

    val cartesian = new VectorCartesian()
      .setInputCols(Array("features", "features"))
      .setOutputCol("cartesian_features")

    val assembler = new VectorAssembler()
      .setInputCols(Array("features", "cartesian_features"))
      .setOutputCol("assemble_features")

    val pipeline = new Pipeline()
      .setStages(Array(cartesian, assembler))

    val featureModel = pipeline.fit(data)
    val crossDF = featureModel.transform(data)

    crossDF.schema.fields.foreach { field =>
      println("name: " + field.name)
      println("metadata: " + field.metadata.toString())
    }
  }

  test("test_three_order_cartesian") {
    val data = spark.read.format("libsvm")
      .option("numFeatures", 8)
      .load("data/abalone/abalone_8d_train.libsvm")
      .persist()

    val cartesian = new VectorCartesian()
      .setInputCols(Array("features", "features"))
      .setOutputCol("f_f")

    val cartesian2 = new VectorCartesian()
      .setInputCols(Array("features", "f_f"))
      .setOutputCol("f_f_f")

    val pipeline = new Pipeline()
      .setStages(Array(cartesian, cartesian2))

    val crossDF = pipeline.fit(data).transform(data).persist()

    // first cartesian, the number of dimensions is 64
    println("first cartesian dimension = " + crossDF.select("f_f").schema.fields.last.metadata
      .getStringArray(MetadataTransformUtils.DERIVATION).length)
    println(crossDF.select("f_f").schema.fields.last.metadata
      .getStringArray(MetadataTransformUtils.DERIVATION).mkString(","))
    println()

    // second cartesian, the number of dimensions is 512
    println("second cartesian dimension = " + crossDF.select("f_f_f").schema.fields.last.metadata
      .getStringArray(MetadataTransformUtils.DERIVATION).length)
    println(crossDF.select("f_f_f").schema.fields.last.metadata
      .getStringArray(MetadataTransformUtils.DERIVATION).mkString(","))
  }
}
Example 14
Source File: GBTLRCtrModel.scala From CTRmodel with Apache License 2.0
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.gbtlr.GBTLRClassifier
import org.apache.spark.sql.DataFrame

class GBTLRCtrModel extends BaseCtrModel {

  def train(samples: DataFrame): Unit = {
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)

    val featureEngineeringStages: Array[PipelineStage] =
      FeatureEngineering.preProcessInnerProductSamplesStages()

    val model = new GBTLRClassifier()
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")
      .setGBTMaxIter(10)
      .setLRMaxIter(100)
      .setRegParam(0.01)
      .setElasticNetParam(0.5)

    val pipelineStages = featureEngineeringStages ++ Array(model)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samplesWithInnerProduct)
  }

  override def transform(samples: DataFrame): DataFrame = {
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    _pipelineModel.transform(samplesWithInnerProduct)
  }
}
Example 15
Source File: OuterProductNNCtrModel.scala From CTRmodel with Apache License 2.0
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class OuterProductNNCtrModel extends BaseCtrModel {

  def train(samples: DataFrame): Unit = {
    //calculate outer product between item embedding and user embedding
    val samplesWithOuterProduct = FeatureEngineering.calculateEmbeddingOuterProduct(samples)

    val prePipelineModel = FeatureEngineering.preProcessOuterProductSamples(samplesWithOuterProduct)

    val preparedSamples = prePipelineModel.transform(samplesWithOuterProduct)

    //network architecture, better to keep tuning it until metrics converge
    val layers = Array[Int](preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length,
      preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length / 2, 2)

    val nnModel = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(150)      //max iterations, keep increasing it if loss function or metrics don't converge
      .setStepSize(0.005)   //learning step size, larger size will lead to loss vibration
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = prePipelineModel.stages ++ Array(nnModel)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samplesWithOuterProduct)
  }

  override def transform(samples: DataFrame): DataFrame = {
    val samplesWithOuterProduct = FeatureEngineering.calculateEmbeddingOuterProduct(samples)
    _pipelineModel.transform(samplesWithOuterProduct)
  }
}
Example 16
Source File: InnerProductNNCtrModel.scala From CTRmodel with Apache License 2.0
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{LogisticRegression, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class InnerProductNNCtrModel extends BaseCtrModel {

  def train(samples: DataFrame): Unit = {
    //calculate inner product between item embedding and user embedding
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)

    val prePipelineModel = FeatureEngineering.preProcessInnerProductSamples(samplesWithInnerProduct)

    val preparedSamples = prePipelineModel.transform(samplesWithInnerProduct)

    //network architecture, better to keep tuning it until metrics converge
    val layers = Array[Int](preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length,
      preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length / 2, 2)

    val nnModel = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(150)      //max iterations, keep increasing it if loss function or metrics don't converge
      .setStepSize(0.005)   //learning step size, larger size will lead to loss vibration
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = prePipelineModel.stages ++ Array(nnModel)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samplesWithInnerProduct)
  }

  override def transform(samples: DataFrame): DataFrame = {
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    _pipelineModel.transform(samplesWithInnerProduct)
  }
}
Example 17
Source File: NeuralNetworkCtrModel.scala From CTRmodel with Apache License 2.0
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class NeuralNetworkCtrModel extends BaseCtrModel {

  def train(samples: DataFrame): Unit = {
    val prePipelineModel = FeatureEngineering.preProcessSamples(samples)

    val preparedSamples = prePipelineModel.transform(samples)

    //network architecture, better to keep tuning it until metrics converge
    val layers = Array[Int](preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length,
      preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length / 2, 2)

    val nnModel = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(150)      //max iterations, keep increasing it if loss function or metrics don't converge
      .setStepSize(0.005)   //learning step size, larger size will lead to loss vibration
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = prePipelineModel.stages ++ Array(nnModel)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samples)
  }
}
Example 18
Source File: LogisticRegressionCtrModel.scala From CTRmodel with Apache License 2.0
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.DataFrame

class LogisticRegressionCtrModel extends BaseCtrModel {

  def train(samples: DataFrame): Unit = {
    val featureEngineeringStages: Array[PipelineStage] = FeatureEngineering.preProcessSamplesStages()

    val model: LogisticRegression = new LogisticRegression()
      .setMaxIter(20)           //max iteration
      .setRegParam(0.0)         //regularization parameter
      .setElasticNetParam(0.0)  //0-L2 regularization 1-L1 regularization
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = featureEngineeringStages ++ Array(model)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samples)
  }
}
Example 19
Source File: ChurnPredictionSVM.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.max
import org.apache.spark.ml.classification.{LinearSVC, LinearSVCModel}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

object ChurnPredictionSVM {
  def main(args: Array[String]) {
    val spark: SparkSession = SparkSessionCreate.createSession("ChurnPredictionSVM")
    import spark.implicits._

    val numFolds = 10
    val MaxIter: Seq[Int] = Seq(1000)
    val RegParam: Seq[Double] = Seq(0.10) // L2 regularization param, set 0.10 with L1 regularization
    val Tol: Seq[Double] = Seq(1e-4)
    val ElasticNetParam: Seq[Double] = Seq(0.00001) // Combination of L1 and L2

    val svm = new LinearSVC()

    // Chain indexers and tree in a Pipeline.
    val pipeline = new Pipeline()
      .setStages(Array(PipelineConstruction.ipindexer,
        PipelineConstruction.labelindexer,
        PipelineConstruction.assembler,
        svm))

    // Search through decision tree's maxDepth parameter for best model
    val paramGrid = new ParamGridBuilder()
      .addGrid(svm.maxIter, MaxIter)
      .addGrid(svm.regParam, RegParam)
      .addGrid(svm.tol, Tol)
      .build()

    val evaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("prediction")

    // Set up 10-fold cross validation
    val crossval = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(numFolds)

    val cvModel = crossval.fit(Preprocessing.trainDF)

    val predictions = cvModel.transform(Preprocessing.testSet)
    val selectPrediction = predictions.select("label", "features", "rawPrediction", "prediction")
    selectPrediction.show(10)

    val accuracy = evaluator.evaluate(predictions)
    println("Classification accuracy: " + accuracy)

    // Compute other performance metrics
    val predictionAndLabels = predictions
      .select("prediction", "label")
      .rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))

    val metrics = new BinaryClassificationMetrics(predictionAndLabels)
    val areaUnderPR = metrics.areaUnderPR
    println("Area under the precision-recall curve: " + areaUnderPR)

    val areaUnderROC = metrics.areaUnderROC
    println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)

    val lp = predictions.select("label", "prediction")
    val counttotal = predictions.count()
    val correct = lp.filter($"label" === $"prediction").count()
    val wrong = lp.filter(not($"label" === $"prediction")).count()
    val ratioWrong = wrong.toDouble / counttotal.toDouble
    val ratioCorrect = correct.toDouble / counttotal.toDouble

    val truep = lp.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val truen = lp.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val falsep = lp.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble
    val falsen = lp.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble

    println("Total Count: " + counttotal)
    println("Correct: " + correct)
    println("Wrong: " + wrong)
    println("Ratio wrong: " + ratioWrong)
    println("Ratio correct: " + ratioCorrect)
    println("Ratio true positive: " + truep)
    println("Ratio false positive: " + falsep)
    println("Ratio true negative: " + truen)
    println("Ratio false negative: " + falsen)
  }
}
Example 20
Source File: Describe.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.max
import org.apache.spark.sql.types._
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

object Describe {

  case class CustomerAccount(state_code: String, account_length: Integer, area_code: String,
    international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
    total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
    total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
    total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
    total_international_mins: Double, total_international_calls: Double, total_international_charge: Double,
    total_international_num_calls: Double, churn: String)

  val schema = StructType(Array(
    StructField("state_code", StringType, true),
    StructField("account_length", IntegerType, true),
    StructField("area_code", StringType, true),
    StructField("international_plan", StringType, true),
    StructField("voice_mail_plan", StringType, true),
    StructField("num_voice_mail", DoubleType, true),
    StructField("total_day_mins", DoubleType, true),
    StructField("total_day_calls", DoubleType, true),
    StructField("total_day_charge", DoubleType, true),
    StructField("total_evening_mins", DoubleType, true),
    StructField("total_evening_calls", DoubleType, true),
    StructField("total_evening_charge", DoubleType, true),
    StructField("total_night_mins", DoubleType, true),
    StructField("total_night_calls", DoubleType, true),
    StructField("total_night_charge", DoubleType, true),
    StructField("total_international_mins", DoubleType, true),
    StructField("total_international_calls", DoubleType, true),
    StructField("total_international_charge", DoubleType, true),
    StructField("total_international_num_calls", DoubleType, true),
    StructField("churn", StringType, true)))

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Describe")
      .getOrCreate()

    spark.conf.set("spark.debug.maxToStringFields", 10000)
    val DEFAULT_MAX_TO_STRING_FIELDS = 2500
    if (SparkEnv.get != null) {
      SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS)
    } else {
      DEFAULT_MAX_TO_STRING_FIELDS
    }
    import spark.implicits._

    val trainSet: Dataset[CustomerAccount] = spark.read
      .option("inferSchema", "false")
      .format("com.databricks.spark.csv")
      .schema(schema)
      .load("data/churn-bigml-80.csv")
      .as[CustomerAccount]

    val statsDF = trainSet.describe()
    statsDF.show()

    trainSet.createOrReplaceTempView("UserAccount")
    spark.catalog.cacheTable("UserAccount")

    spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()

    spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()

    trainSet.groupBy("churn").count.show()

    spark.sqlContext.sql("SELECT churn, SUM(total_international_num_calls) FROM UserAccount GROUP BY churn")
  }
}
Example 21
Source File: ChurnPredictionLR.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

object ChurnPredictionLR {
  def main(args: Array[String]) {
    val spark: SparkSession = SparkSessionCreate.createSession("ChurnPredictionLogisticRegression")
    import spark.implicits._

    val numFolds = 10
    val MaxIter: Seq[Int] = Seq(100)
    val RegParam: Seq[Double] = Seq(1.0) // L2 regularization param, set 0.10 with L1 regularization
    val Tol: Seq[Double] = Seq(1e-8)
    val ElasticNetParam: Seq[Double] = Seq(1.0) // Combination of L1 and L2

    val lr = new LogisticRegression()
      .setLabelCol("label")
      .setFeaturesCol("features")

    // Chain indexers and tree in a Pipeline.
    val pipeline = new Pipeline()
      .setStages(Array(PipelineConstruction.ipindexer,
        PipelineConstruction.labelindexer,
        PipelineConstruction.assembler,
        lr))

    // Search through decision tree's maxDepth parameter for best model
    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.maxIter, MaxIter)
      .addGrid(lr.regParam, RegParam)
      .addGrid(lr.tol, Tol)
      .addGrid(lr.elasticNetParam, ElasticNetParam)
      .build()

    val evaluator = new BinaryClassificationEvaluator()
      .setLabelCol("label")
      .setRawPredictionCol("prediction")

    // Set up 10-fold cross validation
    val crossval = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(numFolds)

    val cvModel = crossval.fit(Preprocessing.trainDF)

    val predictions = cvModel.transform(Preprocessing.testSet)
    val result = predictions.select("label", "prediction", "probability")
    val resultDF = result.withColumnRenamed("prediction", "Predicted_label")
    resultDF.show(10)

    val accuracy = evaluator.evaluate(predictions)
    println("Classification accuracy: " + accuracy)

    // Compute other performance metrics
    val predictionAndLabels = predictions
      .select("prediction", "label")
      .rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))

    val metrics = new BinaryClassificationMetrics(predictionAndLabels)
    val areaUnderPR = metrics.areaUnderPR
    println("Area under the precision-recall curve: " + areaUnderPR)

    val areaUnderROC = metrics.areaUnderROC
    println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)

    val lp = predictions.select("label", "prediction")
    val counttotal = predictions.count()
    val correct = lp.filter($"label" === $"prediction").count()
    val wrong = lp.filter(not($"label" === $"prediction")).count()
    val ratioWrong = wrong.toDouble / counttotal.toDouble
    val ratioCorrect = correct.toDouble / counttotal.toDouble

    val truep = lp.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val truen = lp.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / counttotal.toDouble
    val falsep = lp.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble
    val falsen = lp.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / counttotal.toDouble

    println("Total Count: " + counttotal)
    println("Correct: " + correct)
    println("Wrong: " + wrong)
    println("Ratio wrong: " + ratioWrong)
    println("Ratio correct: " + ratioCorrect)
    println("Ratio true positive: " + truep)
    println("Ratio false positive: " + falsep)
    println("Ratio true negative: " + truen)
    println("Ratio false negative: " + falsen)
  }
}
Example 22
Source File: LinearSVCParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.classification.parity

import org.apache.spark.ml.classification.LinearSVCModel
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class LinearSVCParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti")

  override val sparkTransformer: Transformer = new Pipeline()
    .setStages(Array(
      new StringIndexer().
        setInputCol("fico_score_group_fnl").
        setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new LinearSVCModel("linear_svc", Vectors.dense(0.44, 0.77), 0.66).setThreshold(0.5).setFeaturesCol("features")))
    .fit(dataset)

  // The string order type is ignored, because once the transformer is built based on some order type,
  // we need to serialize only the string to index map but not the order in which it has to index.
  // This value we can ignore while we check the transformer values.
  override val unserializedParams: Set[String] = Set("stringOrderType")
}
Example 23
Source File: CrossValidatorParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.validation

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.{DecisionTreeRegressor, RandomForestRegressor}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.DataFrame

class CrossValidatorParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = {
    val regressor = new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction")

    val paramGrid = new ParamGridBuilder()
      .addGrid(regressor.numTrees, Array(2, 3, 4))
      .build()

    new Pipeline().setStages(Array(new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new CrossValidator().
        setEvaluator(new RegressionEvaluator().
          setLabelCol("loan_amount").
          setPredictionCol("prediction")).
        setEstimator(regressor).
        setEstimatorParamMaps(paramGrid))).fit(dataset)
  }

  override val ignoreSerializationTest = true
}
Example 24
Source File: TrainValidationSplitParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.validation

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.sql.DataFrame

class TrainValidationSplitParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = {
    val regressor = new RandomForestRegressor().
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setPredictionCol("prediction")

    val paramGrid = new ParamGridBuilder()
      .addGrid(regressor.numTrees, Array(2, 3, 4))
      .build()

    new Pipeline().setStages(Array(new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
      new VectorAssembler().
        setInputCols(Array("fico_index", "dti")).
        setOutputCol("features"),
      new TrainValidationSplit().
        setEvaluator(new RegressionEvaluator().
          setLabelCol("loan_amount").
          setPredictionCol("prediction")).
        setEstimator(regressor).
        setEstimatorParamMaps(paramGrid))).fit(dataset)
  }

  override val ignoreSerializationTest = true
}
Example 25
Source File: MinMaxScalerWithNonDefaultsParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{MinMaxScaler, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class MinMaxScalerWithNonDefaultsParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler().
    setInputCols(Array("dti", "loan_amount")).
    setOutputCol("features"),
    new MinMaxScaler().
      setInputCol("features").
      setOutputCol("scaled_features").
      setMin(2.0).
      setMax(4.0))).fit(dataset)
}
Example 26
Source File: HashingTermFrequencyParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame
import ml.combust.mleap.spark.SparkSupport._

class HashingTermFrequencyParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new Tokenizer().
    setInputCol("loan_title").
    setOutputCol("loan_title_tokens"),
    new HashingTF().
      setNumFeatures(1 << 17).
      setInputCol("loan_title_tokens").
      setOutputCol("loan_title_tf"))).fit(dataset)
}
Example 27
Source File: BinarizerParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{Binarizer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame

class BinarizerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler().
    setInputCols(Array("dti")).
    setOutputCol("features"),
    new Binarizer().
      setThreshold(0.12).
      setInputCol("dti").
      setOutputCol("thresholded_features_double"),
    new Binarizer().
      setThreshold(0.12).
      setInputCol("features").
      setOutputCol("thresholded_features"))).fit(dataset)
}
Example 28
Source File: PcaParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{PCA, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class PcaParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler().
    setInputCols(Array("dti", "loan_amount")).
    setOutputCol("features"),
    new PCA().
      setInputCol("features").
      setOutputCol("pca_features").
      setK(2))).fit(dataset)

  override val unserializedParams = Set("k")
}
Example 29
Source File: OneHotEncoderParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class OneHotEncoderParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("state")

  override val sparkTransformer: Transformer = new Pipeline()
    .setStages(Array(
      new StringIndexer().setInputCol("state").setOutputCol("state_index"),
      new StringIndexer().setInputCol("state").setOutputCol("state_index2"),
      new OneHotEncoderEstimator()
        .setInputCols(Array("state_index", "state_index2"))
        .setOutputCols(Array("state_oh", "state_oh2"))
    ))
    .fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 30
Source File: PolynomialExpansionParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{PolynomialExpansion, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class PolynomialExpansionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new VectorAssembler().
    setInputCols(Array("dti", "loan_amount")).
    setOutputCol("features"),
    new PolynomialExpansion().
      setInputCol("features").
      setOutputCol("poly").
      setDegree(3))).fit(dataset)
}
Example 31
Source File: NGramsParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{NGram, Tokenizer}
import org.apache.spark.sql.DataFrame

class NGramsParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer().setInputCol("loan_title").setOutputCol("loan_title_tokens"),
    new NGram().setInputCol("loan_title_tokens").setOutputCol("loan_title_ngram").setN(3)
  )).fit(dataset)
}
Example 32
Source File: ReverseStringIndexerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class ReverseStringIndexerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("state")
  override val sparkTransformer: Transformer = {
    val stringIndexer = new StringIndexer().setInputCol("state").setOutputCol("state_index").fit(dataset)
    val reverseStringIndexer = new IndexToString()
      .setInputCol("state_index")
      .setOutputCol("state_reverse")
      .setLabels(stringIndexer.labels)
    new Pipeline().setStages(Array(stringIndexer, reverseStringIndexer)).fit(dataset)
  }

  override val unserializedParams = Set("stringOrderType")
}
Example 33
Source File: CountVectorizerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{CountVectorizer, Tokenizer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class CountVectorizerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer().setInputCol("loan_title").setOutputCol("loan_title_tokens"),
    new CountVectorizer().setInputCol("loan_title_tokens").setOutputCol("loan_title_token_counts").setMinTF(2)
  )).fit(dataset)
}
Example 34
Source File: NormalizerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{Normalizer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class NormalizerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().setInputCols(Array("dti", "loan_amount")).setOutputCol("features"),
    new Normalizer().setP(3d).setInputCol("features").setOutputCol("scaled_features")
  )).fit(dataset)
}
Example 35
Source File: WordToVectorParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{Tokenizer, Word2Vec}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class WordToVectorParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer().setInputCol("loan_title").setOutputCol("loan_title_tokens"),
    new Word2Vec(uid = "words").setInputCol("loan_title_tokens").setOutputCol("loan_title_token_counts")
  )).fit(dataset)

  override val unserializedParams = Set("seed")
}
Example 36
Source File: VectorSlicerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{VectorAssembler, VectorSlicer}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class VectorSlicerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().setInputCols(Array("dti", "loan_amount")).setOutputCol("features"),
    new VectorSlicer().setIndices(Array(1)).setNames(Array("dti")).setInputCol("features").setOutputCol("scaled_features")
  )).fit(dataset)
}
Example 37
Source File: MinMaxScalerPipelineParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, MinMaxScaler, QuantileDiscretizer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class MinMaxScalerPipelineParitySpec extends SparkParityBase {
  private val getKeys: Map[String, Double] => Seq[String] = { input: Map[String, Double] => input.keySet.toSeq }

  val keyUdf = functions.udf(getKeys)

  override val dataset = spark.createDataFrame(Seq(
    (Array("1"), 1.0, Map("a" -> 0.1, "b" -> 0.2, "c" -> 0.3), 1),
    (Array("2"), 10.0, Map("d" -> 0.1, "e" -> 0.2, "c" -> 0.3), 0),
    (Array("3"), 20.0, Map("x" -> 0.1, "a" -> 0.2, "b" -> 0.3), 0),
    (Array("4"), 15.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0),
    (Array("5"), 18.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0),
    (Array("6"), 25.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 1),
    (Array("6"), 5.0, Map("a" -> 0.1, "b" -> 0.2, "d" -> 0.3), 0),
    (Array("7"), 30.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0)))
    .toDF("book_id", "pv", "myInputCol0", "label")
    .withColumn("myInputCol", keyUdf(functions.col("myInputCol0")))
    .drop("myInputCol0")

  override val sparkTransformer = new Pipeline().setStages(Array(
    new CountVectorizer().setInputCol("book_id").setOutputCol("book_id_vec").setMinDF(1).setMinTF(1).setBinary(true),
    new QuantileDiscretizer().setInputCol("pv").setOutputCol("pv_bucket").setNumBuckets(3),
    new CountVectorizer().setInputCol("myInputCol").setOutputCol("myInputCol1_vec").setMinDF(1).setMinTF(1).setBinary(true),
    new VectorAssembler().setInputCols(Array("pv_bucket", "book_id_vec", "myInputCol1_vec")).setOutputCol("vectorFeature"),
    new MinMaxScaler().setInputCol("vectorFeature").setOutputCol("scaledFeatures")
  )).fit(dataset)
}
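Once a pipeline like this is fitted, the range learned by the final MinMaxScaler can be inspected from the last stage. A minimal sketch, assuming the fitted PipelineModel produced by the spec above:

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.feature.MinMaxScalerModel

// Hypothetical inspection helper: `model` is assumed to be the PipelineModel fitted above.
def describeScaler(model: PipelineModel): Unit = {
  val scaler = model.stages.last.asInstanceOf[MinMaxScalerModel]
  // originalMin / originalMax hold the per-feature range observed during fit.
  println(s"min = ${scaler.originalMin}, max = ${scaler.originalMax}")
}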
Example 38
Source File: BucketedRandomProjectionLSHParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{BucketedRandomProjectionLSH, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class BucketedRandomProjectionLSHParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().setInputCols(Array("dti", "loan_amount")).setOutputCol("features"),
    new BucketedRandomProjectionLSH().setInputCol("features").setBucketLength(2).setOutputCol("lsh_features")
  )).fit(dataset)
}
Example 39
Source File: StopWordsRemoverParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{StopWordsRemover, Tokenizer}
import org.apache.spark.sql.DataFrame

class StopWordsRemoverParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer().setInputCol("loan_title").setOutputCol("loan_title_tokens"),
    new StopWordsRemover().setInputCol("loan_title_tokens").setOutputCol("loan_title_stop").setStopWords(Array("loan"))
  )).fit(dataset)
}
Example 40
Source File: DCTParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{DCT, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class DCTParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().setInputCols(Array("dti", "loan_amount")).setOutputCol("features"),
    new DCT(uid = "dct").setInverse(true).setInputCol("features").setOutputCol("filter_features")
  )).fit(dataset)
}
Example 41
Source File: VectorIndexerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, VectorIndexer}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class VectorIndexerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "state")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("state").setOutputCol("state_index"),
    new VectorAssembler().setInputCols(Array("dti", "loan_amount", "state_index")).setOutputCol("features"),
    new VectorIndexer().setInputCol("features").setOutputCol("scaled_features")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 42
Source File: BisectingKMeansParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.clustering.BisectingKMeans
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class BisectingKMeansParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "fico_score_group_fnl")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
    new BisectingKMeans().setFeaturesCol("features").setPredictionCol("prediction")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "k", "maxIter", "seed", "minDivisibleClusterSize")
}
Example 43
Source File: LDAParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.feature.{CountVectorizer, StopWordsRemover, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame
import org.scalatest.Ignore

@Ignore
class LDAParitySpec extends SparkParityBase {
  override val dataset: DataFrame = textDataset.select("text")

  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")

  val remover = new StopWordsRemover()
    .setInputCol(tokenizer.getOutputCol)
    .setOutputCol("words_filtered")

  val cv = new CountVectorizer().setInputCol("words_filtered").setOutputCol("features").setVocabSize(50000)

  val lda = new LDA().setK(5).setMaxIter(2)

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(tokenizer, remover, cv, lda)).fit(dataset)

  override def equalityTest(sparkDataset: DataFrame, mleapDataset: DataFrame): Unit = {
    val sparkPredictionCol = sparkDataset.schema.fieldIndex("topicDistribution")
    val mleapPredictionCol = mleapDataset.schema.fieldIndex("topicDistribution")

    sparkDataset.collect().zip(mleapDataset.collect()).foreach {
      case (sv, mv) =>
        val sparkPrediction = sv.getAs[Vector](sparkPredictionCol)
        val mleapPrediction = mv.getAs[Vector](mleapPredictionCol)

        sparkPrediction.toArray.zip(mleapPrediction.toArray).foreach {
          case (s, m) => assert(Math.abs(m - s) < 0.001)
        }
    }
  }
}
Example 44
Source File: KMeansParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class KMeansParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "fico_score_group_fnl")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
    new KMeans().setFeaturesCol("features").setPredictionCol("prediction")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "initMode", "initSteps", "maxIter", "tol", "k", "seed")
}
Example 45
Source File: GaussianMixtureParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.clustering.GaussianMixture
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class GaussianMixtureParitySpec extends SparkParityBase {
  override val dataset: DataFrame = {
    baseDataset.select("dti", "loan_amount", "fico_score_group_fnl")
  }
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
    new GaussianMixture().setFeaturesCol("features").setPredictionCol("prediction").setProbabilityCol("probability")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "k", "maxIter", "seed", "tol")
}
Example 46
Source File: ALSParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.recommendation

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class ALSParitySpec extends SparkParityBase {
  override val dataset: DataFrame = recommendationDataset
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")
  )).fit(dataset)

  override def equalityTest(sparkDataset: DataFrame, mleapDataset: DataFrame): Unit =
    super.equalityTest(sparkDataset.orderBy("userId", "movieId"), mleapDataset.orderBy("userId", "movieId"))

  // TODO: maybe coldStartStrategy should be serialized
  override val unserializedParams = Set("coldStartStrategy")
}
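Parity aside, the fitted ALS stage can also serve top-N recommendations. A minimal sketch, assuming the fitted PipelineModel above with the ALSModel as its only stage:

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.recommendation.ALSModel
import org.apache.spark.sql.DataFrame

// Hypothetical helper: pull the ALSModel out of the fitted pipeline and ask for the top 10 movies per user.
def topMovies(model: PipelineModel): DataFrame = {
  val als = model.stages.head.asInstanceOf[ALSModel]
  als.recommendForAllUsers(10)
}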
Example 47
Source File: MultiLayerPerceptronClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class MultiLayerPerceptronClassifierParitySpec extends SparkParityBase {
  override val dataset: DataFrame = multiClassClassificationDataset
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new MultilayerPerceptronClassifier(uid = "mlp").
      setThresholds(Array(0.1, 0.2, 0.3)).
      // specify layers for the neural network:
      // input layer of size 4 (features), two intermediate of size 5 and 4
      // and output of size 3 (classes)
      setLayers(Array(4, 5, 4, 3)).
      setFeaturesCol("features").
      setPredictionCol("prediction"))).fit(dataset)
}
Example 48
Source File: DecisionTreeClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class DecisionTreeClassifierParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
    new StringIndexer().setInputCol("approved").setOutputCol("label"),
    new DecisionTreeClassifier().setThresholds(Array(0.4)).setFeaturesCol("features").setLabelCol("label")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
Example 49
Source File: OneVsRestParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame

class OneVsRestParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
    new OneVsRest().setClassifier(new LogisticRegression())
      .setLabelCol("fico_index")
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "classifier", "labelCol")
}
Example 50
Source File: LogisticRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame
import org.apache.spark.ml.linalg.Vectors

class LogisticRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
    new LogisticRegressionModel(uid = "logr", coefficients = Vectors.dense(0.44, 0.77), intercept = 0.66)
      .setThreshold(0.7).setFeaturesCol("features")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
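This spec builds a LogisticRegressionModel directly from fixed coefficients, which appears to rely on constructors that are not part of the public API (hence the org.apache.spark.ml package placement). In ordinary application code the model comes from fitting a LogisticRegression estimator instead; a minimal sketch, under the assumption that the lending-club data also carries a numeric approved label column:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.DataFrame

// Hypothetical training pipeline: the "approved" label column and the input DataFrame are assumptions.
def trainLogisticRegression(df: DataFrame) = new Pipeline().setStages(Array(
  new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
  new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
  new StringIndexer().setInputCol("approved").setOutputCol("label"),
  new LogisticRegression().setFeaturesCol("features").setLabelCol("label").setMaxIter(20)
)).fit(df)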
Example 51
Source File: MultinomialLogisticRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}

class MultinomialLogisticRegressionParitySpec extends SparkParityBase {
  val labels = Seq(0.0, 1.0, 2.0, 0.0, 1.0, 2.0)
  val ages = Seq(15, 30, 40, 50, 15, 80)
  val heights = Seq(175, 190, 155, 160, 170, 180)
  val weights = Seq(67, 100, 57, 56, 56, 88)

  val rows = spark.sparkContext.parallelize(Seq.tabulate(6) { i => Row(labels(i), ages(i), heights(i), weights(i)) })
  val schema = new StructType().add("label", DoubleType, nullable = false)
    .add("age", IntegerType, nullable = false)
    .add("height", IntegerType, nullable = false)
    .add("weight", IntegerType, nullable = false)

  override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema)

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().setInputCols(Array("age", "height", "weight")).setOutputCol("features"),
    new LogisticRegressionModel(uid = "logr",
      coefficientMatrix = Matrices.dense(3, 3, Array(
        -1.3920551604166562, -0.13119545493644366, 1.5232506153530998,
        0.3129112131192873, -0.21959056436528473, -0.09332064875400257,
        -0.24696506013528507, 0.6122879917796569, -0.36532293164437174)),
      interceptVector = Vectors.dense(0.4965574044951358, -2.1486146169780063, 1.6520572124828703),
      numClasses = 3, isMultinomial = true))).fit(dataset)
}
Example 52
Source File: GBTClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._

class GBTClassifierParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
    new StringIndexer().setInputCol("approved").setOutputCol("label"),
    new GBTClassifier()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setThresholds(Array(1.0, 1.0))
      .setProbabilityCol("myProbability")
      .setPredictionCol("myPrediction")
      .setRawPredictionCol("myRawPrediction")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
Example 53
Source File: RandomForestClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class RandomForestClassifierParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
    new StringIndexer().setInputCol("approved").setOutputCol("label"),
    new RandomForestClassifier().setThresholds(Array(0.4)).setFeaturesCol("features").setLabelCol("label")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "seed")
}
Example 54
Source File: NaiveBayesClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class NaiveBayesClassifierParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index")).setOutputCol("features"),
    new StringIndexer().setInputCol("approved").setOutputCol("label"),
    new NaiveBayes(uid = "nb").setModelType("multinomial").setThresholds(Array(0.4)).setFeaturesCol("features").setLabelCol("label")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "smoothing")
}
Example 55
Source File: GBTRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression.GBTRegressor
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._

class GBTRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
    new GBTRegressor().setFeaturesCol("features").setLabelCol("loan_amount").setPredictionCol("prediction")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
Example 56
Source File: AFTSurvivalRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.AFTSurvivalRegression
import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit

class AFTSurvivalRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
    .withColumn("censor", lit(1.0))
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new OneHotEncoderEstimator().setInputCols(Array("fico_index")).setOutputCols(Array("fico")),
    new VectorAssembler().setInputCols(Array("fico", "dti")).setOutputCol("features"),
    new AFTSurvivalRegression()
      .setQuantileProbabilities(Array(0.5))
      .setFeaturesCol("features")
      .setLabelCol("loan_amount")
      .setQuantilesCol("quant")
      .setPredictionCol("prediction")
  )).fit(dataset)

  override val unserializedParams = Set("labelCol", "stringOrderType", "maxIter", "tol")
}
Example 57
Source File: LinearRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class LinearRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new OneHotEncoderEstimator().setInputCols(Array("fico_index")).setOutputCols(Array("fico")),
    new VectorAssembler().setInputCols(Array("fico", "dti")).setOutputCol("features"),
    new LinearRegression().setFeaturesCol("features").setLabelCol("loan_amount").setPredictionCol("prediction")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "elasticNetParam", "maxIter", "tol", "epsilon", "labelCol", "loss", "regParam", "solver")
}
Example 58
Source File: RandomForestRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._

class RandomForestRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
    new RandomForestRegressor().setFeaturesCol("features").setLabelCol("loan_amount").setPredictionCol("prediction")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
Example 59
Source File: DecisionTreeRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._

class DecisionTreeRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
    new DecisionTreeRegressor().setFeaturesCol("features").setLabelCol("loan_amount").setPredictionCol("prediction")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "seed")
}
Example 60
Source File: GeneralizedLinearRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.sql._

class GeneralizedLinearRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new OneHotEncoderEstimator().setInputCols(Array("fico_index")).setOutputCols(Array("fico")),
    new VectorAssembler().setInputCols(Array("fico", "dti")).setOutputCol("features"),
    new GeneralizedLinearRegression()
      .setFamily("gaussian")
      .setLink("log")
      .setFeaturesCol("features")
      .setLabelCol("loan_amount")
      .setPredictionCol("prediction")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "labelCol", "maxIter", "tol", "regParam", "solver", "variancePower")
}
Example 61
Source File: IsotonicRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.IsotonicRegression
import org.apache.spark.sql._

class IsotonicRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
    .sample(withReplacement = true, 0.05)
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().setInputCols(Array("dti")).setOutputCol("features"),
    new IsotonicRegression().setFeaturesCol("dti").setLabelCol("loan_amount").setPredictionCol("prediction")
  )).fit(dataset)

  override val unserializedParams = Set("labelCol")
}
Example 62
Source File: TestXgboost.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.databricks.runtime.testkit

import java.io.File
import java.nio.file.{Files, StandardCopyOption}

import ml.combust.bundle.BundleFile
import org.apache.spark.ml.bundle.SparkBundleContext
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.SparkSession
import com.databricks.spark.avro._
import ml.combust.mleap.spark.SparkSupport._
import ml.combust.mleap.runtime.MleapSupport._
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
import org.apache.spark.ml.Pipeline

class TestXgboost(session: SparkSession) extends Runnable {
  private val xgboostParams: Map[String, Any] = Map(
    "eta" -> 0.3,
    "max_depth" -> 2,
    "objective" -> "binary:logistic",
    "early_stopping_rounds" -> 2,
    "num_round" -> 15,
    "nworkers" -> 2
  )

  override def run(): Unit = {
    val sqlContext = session.sqlContext

    // Create a temporary file and copy the contents of the resource avro to it
    val path = Files.createTempFile("mleap-databricks-runtime-testkit", ".avro")
    Files.copy(getClass.getClassLoader.getResource("datasources/lending_club_sample.avro").openStream(),
      path,
      StandardCopyOption.REPLACE_EXISTING)

    val sampleData = sqlContext.read.avro(path.toString)
    sampleData.show()

    val stringIndexer = new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index")

    val featureAssembler = new VectorAssembler().
      setInputCols(Array(stringIndexer.getOutputCol, "dti", "loan_amount")).
      setOutputCol("features")

    // Note: the estimator is an XGBoostClassifier despite the val name, which mirrors TestSparkMl.
    val logisticRegression = new XGBoostClassifier(xgboostParams).
      setFeaturesCol("features").
      setLabelCol("approved").
      setPredictionCol("prediction")

    val pipeline = new Pipeline().setStages(Array(stringIndexer, featureAssembler, logisticRegression))

    val model = pipeline.fit(sampleData)

    val modelPath = Files.createTempFile("mleap-databricks-runtime-testkit", ".zip")
    Files.delete(modelPath)

    // Save the model as an MLeap bundle
    {
      println("Writing model to...", modelPath)
      implicit val sbc = SparkBundleContext.defaultContext.withDataset(model.transform(sampleData))
      val bf = BundleFile(new File(modelPath.toString))
      model.writeBundle.save(bf).get
      bf.close()
    }

    // Load the model back with the MLeap runtime
    {
      val bf = BundleFile(new File(modelPath.toString))
      bf.loadMleapBundle()
      bf.close()
    }
  }
}
Example 63
Source File: TestSparkMl.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.databricks.runtime.testkit

import java.io.File
import java.nio.file.{Files, StandardCopyOption}

import ml.combust.bundle.BundleFile
import org.apache.spark.ml.bundle.SparkBundleContext
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.SparkSession
import com.databricks.spark.avro._
import ml.combust.mleap.spark.SparkSupport._
import ml.combust.mleap.runtime.MleapSupport._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression

class TestSparkMl(session: SparkSession) extends Runnable {
  override def run(): Unit = {
    val sqlContext = session.sqlContext

    // Create a temporary file and copy the contents of the resource avro to it
    val path = Files.createTempFile("mleap-databricks-runtime-testkit", ".avro")
    Files.copy(getClass.getClassLoader.getResource("datasources/lending_club_sample.avro").openStream(),
      path,
      StandardCopyOption.REPLACE_EXISTING)

    val sampleData = sqlContext.read.avro(path.toString)
    sampleData.show()

    val stringIndexer = new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index")

    val featureAssembler = new VectorAssembler().
      setInputCols(Array(stringIndexer.getOutputCol, "dti", "loan_amount")).
      setOutputCol("features")

    val logisticRegression = new LogisticRegression().
      setFeaturesCol(featureAssembler.getOutputCol).
      setLabelCol("approved").
      setPredictionCol("prediction")

    val pipeline = new Pipeline().setStages(Array(stringIndexer, featureAssembler, logisticRegression))

    val model = pipeline.fit(sampleData)

    val modelPath = Files.createTempFile("mleap-databricks-runtime-testkit", ".zip")
    Files.delete(modelPath)

    // Save the model
    {
      println("Writing model to...", modelPath)
      implicit val sbc = SparkBundleContext.defaultContext.withDataset(model.transform(sampleData))
      val bf = BundleFile(new File(modelPath.toString))
      model.writeBundle.save(bf).get
      bf.close()
    }

    // Load the model
    {
      val bf = BundleFile(new File(modelPath.toString))
      bf.loadMleapBundle().get
      bf.close()
    }
  }
}
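Before exporting a model this way, it can be worth a quick sanity check that the fitted pipeline separates the classes at all. A minimal sketch, assuming the model and sampleData values from the listing and a numeric approved label column:

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.sql.DataFrame

// Hypothetical sanity check: area under ROC of the fitted pipeline on the training sample.
def areaUnderROC(model: PipelineModel, sampleData: DataFrame): Double = {
  val scored = model.transform(sampleData)
  new BinaryClassificationEvaluator()
    .setLabelCol("approved")
    .setRawPredictionCol("rawPrediction")
    .evaluate(scored)
}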
Example 64
Source File: MathUnaryParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature

import ml.combust.mleap.core.feature.MathUnaryModel
import ml.combust.mleap.core.feature.UnaryOperation.Tan
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.mleap.feature.MathUnary
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class MathUnaryParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new MathUnary(uid = "math_unary", model = MathUnaryModel(Tan)).setInputCol("dti").setOutputCol("dti_tan")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 65
Source File: MultinomialLabelerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature

import ml.combust.mleap.core.feature.{MultinomialLabelerModel, ReverseStringIndexerModel}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.mleap.feature.MultinomialLabeler
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class MultinomialLabelerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
    new MultinomialLabeler(uid = "multinomial_labeler",
      model = MultinomialLabelerModel(threshold = 0.1, indexer = ReverseStringIndexerModel(Seq("fico", "dtizy"))))
      .setFeaturesCol("features")
      .setProbabilitiesCol("probabilities")
      .setLabelsCol("labels")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 66
Source File: StringMapParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature

import ml.combust.mleap.core.feature.{HandleInvalid, StringMapModel}
import org.apache.spark.ml.mleap.feature.StringMap
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructType}

class StringMapParitySpec extends SparkParityBase {
  val names = Seq("alice", "andy", "kevin")
  val rows = spark.sparkContext.parallelize(Seq.tabulate(3) { i => Row(names(i)) })
  val schema = new StructType().add("name", StringType, nullable = false)

  override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema)

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringMap(uid = "string_map", model = new StringMapModel(
      Map("alice" -> 0, "andy" -> 1, "kevin" -> 2)
    )).setInputCol("name").setOutputCol("index"),
    new StringMap(uid = "string_map2", model = new StringMapModel(
      // This map is missing the label "kevin". Exception is thrown unless HandleInvalid.Keep is set.
      Map("alice" -> 0, "andy" -> 1),
      handleInvalid = HandleInvalid.Keep, defaultValue = 1.0
    )).setInputCol("name").setOutputCol("index2")
  )).fit(dataset)
}
Example 67
Source File: MathBinaryParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature

import ml.combust.mleap.core.feature.BinaryOperation.Multiply
import ml.combust.mleap.core.feature.MathBinaryModel
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.mleap.feature.MathBinary
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class MathBinaryParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new MathBinary(uid = "math_bin", model = MathBinaryModel(Multiply)).setInputA("fico_index").setInputB("dti").setOutputCol("bin_out")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 68
Source File: SupportVectorMachineParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.classification

import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.mleap.classification.SVMModel
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql._

class SupportVectorMachineParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().setInputCol("fico_score_group_fnl").setOutputCol("fico_index"),
    new VectorAssembler().setInputCols(Array("fico_index", "dti")).setOutputCol("features"),
    new SVMModel(uid = "svm",
      model = new mllib.classification.SVMModel(weights = Vectors.dense(0.53, 0.67), intercept = 0.77))
      .setRawPredictionCol("raw_prediction")
      .setProbabilityCol("probability")
  )).fit(dataset)

  override val unserializedParams = Set("stringOrderType")
}
Example 69
Source File: L9-17MLCrossValidation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object MLCrossValidationApp {

  case class Activity(label: Double,
    accelXHand: Double, accelYHand: Double, accelZHand: Double,
    accelXChest: Double, accelYChest: Double, accelZChest: Double,
    accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: MLCrossValidationApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) == "4" || f(1) == "5")
      .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38)))
      .map(f => f.map(v => v.toDouble))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty) {
          val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF()
          val split = accelerometer.randomSplit(Array(0.3, 0.7))
          val test = split(0)
          val train = split(1)

          val assembler = new VectorAssembler()
            .setInputCols(Array(
              "accelXHand", "accelYHand", "accelZHand",
              "accelXChest", "accelYChest", "accelZChest",
              "accelXAnkle", "accelYAnkle", "accelZAnkle"))
            .setOutputCol("vectors")
          val normalizer = new Normalizer()
            .setInputCol(assembler.getOutputCol)
            .setOutputCol("features")
          val regressor = new RandomForestRegressor()

          val pipeline = new Pipeline()
            .setStages(Array(assembler, normalizer, regressor))

          val validator = new CrossValidator()
            .setEstimator(pipeline)
            .setEvaluator(new RegressionEvaluator)
          val pGrid = new ParamGridBuilder()
            .addGrid(normalizer.p, Array(1.0, 5.0, 10.0))
            .addGrid(regressor.numTrees, Array(10, 50, 100))
            .build()
          validator.setEstimatorParamMaps(pGrid)
          validator.setNumFolds(5)

          val bestModel = validator.fit(train)
          val prediction = bestModel.transform(test)
          prediction.show()
        }
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
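The CrossValidatorModel returned by validator.fit also records the average metric for every point of the grid, which is useful for seeing which normalization exponent and tree count won. A minimal sketch of inspecting it, assuming the fitted model from the listing above; with RegressionEvaluator the metric is RMSE, so lower is better:

import org.apache.spark.ml.tuning.CrossValidatorModel

// Hypothetical inspection of the grid search: pair each ParamMap with its mean RMSE across folds.
def printGridResults(cvModel: CrossValidatorModel): Unit = {
  cvModel.getEstimatorParamMaps.zip(cvModel.avgMetrics)
    .sortBy(_._2)
    .foreach { case (params, rmse) => println(s"$params -> RMSE $rmse") }
}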
Example 70
Source File: L9-15MLPipeline.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.ml.param.ParamMap

object MLPipelineApp {

  case class Activity(label: Double,
    accelXHand: Double, accelYHand: Double, accelZHand: Double,
    accelXChest: Double, accelYChest: Double, accelZChest: Double,
    accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: MLPipelineApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) == "4" || f(1) == "5")
      .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38)))
      .map(f => f.map(v => v.toDouble))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty) {
          val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF()
          val split = accelerometer.randomSplit(Array(0.3, 0.7))
          val test = split(0)
          val train = split(1)

          val assembler = new VectorAssembler()
            .setInputCols(Array(
              "accelXHand", "accelYHand", "accelZHand",
              "accelXChest", "accelYChest", "accelZChest",
              "accelXAnkle", "accelYAnkle", "accelZAnkle"))
            .setOutputCol("vectors")
          val normalizer = new Normalizer()
            .setInputCol(assembler.getOutputCol)
            .setOutputCol("features")
          val regressor = new RandomForestRegressor()

          val pipeline = new Pipeline()
            .setStages(Array(assembler, normalizer, regressor))
          val pMap = ParamMap(normalizer.p -> 1.0)

          val model = pipeline.fit(train, pMap)
          val prediction = model.transform(test)
          prediction.show()
        }
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
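A ParamMap can carry several overrides in a single fit call. A minimal sketch reusing the stage types from the listing above; the 2.0 exponent and 50 trees are illustrative values, not from the book:

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.sql.DataFrame

// Hypothetical helper: fit the same pipeline while overriding two stage params for this call only.
def fitWithOverrides(pipeline: Pipeline, normalizer: Normalizer,
                     regressor: RandomForestRegressor, train: DataFrame): PipelineModel = {
  val pMap = ParamMap(normalizer.p -> 2.0, regressor.numTrees -> 50)
  pipeline.fit(train, pMap)
}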
Example 71
Source File: CrossValidation.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.spark.mlpipeline

import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel}
import org.apache.spark.ml.{Model, Pipeline, PipelineStage}
import org.apache.spark.sql._

  // Note: this excerpt omits the enclosing class declaration, which defines `estimator` and `numFolds`.

  @throws(classOf[IllegalArgumentException])
  protected def apply(
    trainDf: DataFrame,
    stages: Array[PipelineStage],
    grid: Array[ParamMap]
  ): CrossValidatorModel = {
    require(stages.size > 0, "Cannot cross-validate pipeline without stages")
    require(grid.size > 0, "Cannot cross-validate with undefined grid")

    val pipeline = new Pipeline().setStages(stages ++ Array[PipelineStage](estimator))
    new CrossValidator()
      .setEstimator(pipeline)
      .setEstimatorParamMaps(grid)
      .setEvaluator(new BinaryClassificationEvaluator)
      .setNumFolds(numFolds)
      .fit(trainDf)
  }

  protected def evaluate(
    trainDf: DataFrame,
    stages: Array[PipelineStage],
    grid: Array[ParamMap]
  ): Evaluator = this(trainDf, stages, grid).getEvaluator
}
Example 72
Source File: MNIST.scala From spark-knn with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.KNNClassifier
import org.apache.spark.ml.feature.PCA
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.log4j

object MNIST {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    //read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, "data/mnist/mnist.bz2")
      .toDF()
    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset = MLUtils.convertVectorColumnsToML(rawDataset)

    //split training and testing
    val Array(train, test) = dataset
      .randomSplit(Array(0.7, 0.3), seed = 1234L)
      .map(_.cache())

    //create PCA matrix to reduce feature dimensions
    val pca = new PCA()
      .setInputCol("features")
      .setK(50)
      .setOutputCol("pcaFeatures")
    val knn = new KNNClassifier()
      .setTopTreeSize(dataset.count().toInt / 500)
      .setFeaturesCol("pcaFeatures")
      .setPredictionCol("predicted")
      .setK(1)

    val pipeline = new Pipeline()
      .setStages(Array(pca, knn))
      .fit(train)

    val insample = validate(pipeline.transform(train))
    val outofsample = validate(pipeline.transform(test))

    //reference accuracy: in-sample 95% out-of-sample 94%
    logger.info(s"In-sample: $insample, Out-of-sample: $outofsample")
  }

  private[this] def validate(results: DataFrame): Double = {
    results
      .selectExpr("SUM(CASE WHEN label = predicted THEN 1.0 ELSE 0.0 END) / COUNT(1)")
      .collect()
      .head
      .getDecimal(0)
      .doubleValue()
  }
}
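The hand-rolled validate helper computes plain accuracy with a SQL expression; the same number can be obtained with Spark's built-in evaluator. A minimal sketch, assuming the label and predicted column names used above:

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.DataFrame

// Equivalent accuracy computation using the built-in multiclass evaluator.
def accuracy(results: DataFrame): Double =
  new MulticlassClassificationEvaluator()
    .setLabelCol("label")
    .setPredictionCol("predicted")
    .setMetricName("accuracy")
    .evaluate(results)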
Example 73
Source File: MNISTCrossValidation.scala From spark-knn with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.KNNClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.log4j

object MNISTCrossValidation {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    //read in raw label and features
    val dataset = MLUtils.loadLibSVMFile(sc, "data/mnist/mnist.bz2")
      .toDF()
      //.limit(10000)

    //split training and testing
    val Array(train, test) = dataset.randomSplit(Array(0.7, 0.3), seed = 1234L).map(_.cache())

    //create PCA matrix to reduce feature dimensions
    val pca = new PCA()
      .setInputCol("features")
      .setK(50)
      .setOutputCol("pcaFeatures")
    val knn = new KNNClassifier()
      .setTopTreeSize(50)
      .setFeaturesCol("pcaFeatures")
      .setPredictionCol("prediction")
      .setK(1)

    val pipeline = new Pipeline()
      .setStages(Array(pca, knn))

    val paramGrid = new ParamGridBuilder()
      //.addGrid(knn.k, 1 to 20)
      .addGrid(pca.k, 10 to 100 by 10)
      .build()

    val cv = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(5)

    val cvModel = cv.fit(train)

    val insample = validate(cvModel.transform(train))
    val outofsample = validate(cvModel.transform(test))

    //reference accuracy: in-sample 95% out-of-sample 94%
    logger.info(s"In-sample: $insample, Out-of-sample: $outofsample")
    logger.info(s"Cross-validated: ${cvModel.avgMetrics.toSeq}")
  }

  private[this] def validate(results: DataFrame): Double = {
    results
      .selectExpr("SUM(CASE WHEN label = prediction THEN 1.0 ELSE 0.0 END) / COUNT(1)")
      .collect()
      .head
      .getDecimal(0)
      .doubleValue()
  }
}
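After the search, the winning PCA dimensionality can be read back out of cvModel. A minimal sketch, assuming the pipeline layout above with PCA as the first stage:

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.feature.PCAModel
import org.apache.spark.ml.tuning.CrossValidatorModel

// Hypothetical: the best-performing estimator is a PipelineModel whose first stage is the fitted PCA.
def bestPcaK(cvModel: CrossValidatorModel): Int =
  cvModel.bestModel.asInstanceOf[PipelineModel].stages.head.asInstanceOf[PCAModel].getK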
Example 74
Source File: AtlasEntityUtils.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas.types

import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogStorageFormat, CatalogTable}
import com.hortonworks.spark.atlas.{AtlasClientConf, SACAtlasEntityWithDependencies, SACAtlasReferenceable}
import com.hortonworks.spark.atlas.utils.{Logging, SparkUtils}
import org.apache.spark.ml.Pipeline

trait AtlasEntityUtils extends Logging {

  def conf: AtlasClientConf

  def clusterName: String = conf.get(AtlasClientConf.CLUSTER_NAME)

  def sparkDbType: String = metadata.DB_TYPE_STRING

  def sparkDbToEntity(dbDefinition: CatalogDatabase): SACAtlasEntityWithDependencies = {
    internal.sparkDbToEntity(dbDefinition, clusterName, SparkUtils.currUser())
  }

  def sparkDbUniqueAttribute(db: String): String = {
    internal.sparkDbUniqueAttribute(db)
  }

  def sparkStorageFormatType: String = metadata.STORAGEDESC_TYPE_STRING

  def sparkStorageFormatToEntity(
      storageFormat: CatalogStorageFormat,
      db: String,
      table: String): SACAtlasEntityWithDependencies = {
    internal.sparkStorageFormatToEntity(storageFormat, db, table)
  }

  def sparkStorageFormatUniqueAttribute(db: String, table: String): String = {
    internal.sparkStorageFormatUniqueAttribute(db, table)
  }

  def sparkTableType: String = metadata.TABLE_TYPE_STRING

  def tableToEntity(
      tableDefinition: CatalogTable,
      mockDbDefinition: Option[CatalogDatabase] = None): SACAtlasReferenceable = {
    if (SparkUtils.usingRemoteMetastoreService()) {
      external.hiveTableToReference(tableDefinition, clusterName, mockDbDefinition)
    } else {
      internal.sparkTableToEntity(tableDefinition, clusterName, mockDbDefinition)
    }
  }

  def sparkTableToEntity(
      tableDefinition: CatalogTable,
      mockDbDefinition: Option[CatalogDatabase] = None): SACAtlasReferenceable = {
    internal.sparkTableToEntity(tableDefinition, clusterName, mockDbDefinition)
  }

  def sparkTableToEntityForAlterTable(
      tableDefinition: CatalogTable,
      mockDbDefinition: Option[CatalogDatabase] = None): SACAtlasReferenceable = {
    internal.sparkTableToEntityForAlterTable(tableDefinition, clusterName, mockDbDefinition)
  }

  def sparkTableUniqueAttribute(db: String, table: String): String = {
    internal.sparkTableUniqueAttribute(db, table)
  }

  def pipelineUniqueAttribute(pipeline: Pipeline): String = {
    pipeline.uid
  }

  def processType: String = metadata.PROCESS_TYPE_STRING

  def processUniqueAttribute(executionId: Long): String =
    internal.sparkProcessUniqueAttribute(executionId)

  // If there is cycle, return empty output entity list
  def cleanOutput(
      inputs: Seq[SACAtlasReferenceable],
      outputs: Seq[SACAtlasReferenceable]): List[SACAtlasReferenceable] = {
    val qualifiedNames = inputs.map(_.qualifiedName)
    val isCycle = outputs.exists(x => qualifiedNames.contains(x.qualifiedName))
    if (isCycle) {
      logWarn("Detected cycle - same entity observed to both input and output. " +
        "Discarding output entities as Atlas doesn't support cycle.")
      List.empty
    } else {
      outputs.toList
    }
  }
}
Example 75
Source File: MLPipelineTrackerIT.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas.ml

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import org.scalatest.Matchers
import com.hortonworks.spark.atlas._
import com.hortonworks.spark.atlas.types._
import com.hortonworks.spark.atlas.TestUtils._

class MLPipelineTrackerIT extends BaseResourceIT with Matchers with WithHiveSupport {
  private val atlasClient = new RestAtlasClient(atlasClientConf)

  def clusterName: String = atlasClientConf.get(AtlasClientConf.CLUSTER_NAME)

  def getTableEntity(tableName: String): SACAtlasEntityWithDependencies = {
    val dbDefinition = createDB("db1", "hdfs:///test/db/db1")
    val sd = createStorageFormat()
    val schema = new StructType()
      .add("user", StringType, false)
      .add("age", IntegerType, true)
    val tableDefinition = createTable("db1", s"$tableName", schema, sd)
    internal.sparkTableToEntity(tableDefinition, clusterName, Some(dbDefinition))
  }

  // Enable it to run integrated test.
  it("pipeline and pipeline model") {
    val uri = "hdfs://"
    val pipelineDir = "tmp/pipeline"
    val modelDir = "tmp/model"

    val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir)
    val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir)
    atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, modelDirEntity))

    val df = sparkSession.createDataFrame(Seq(
      (1, Vectors.dense(0.0, 1.0, 4.0), 1.0),
      (2, Vectors.dense(1.0, 0.0, 4.0), 2.0),
      (3, Vectors.dense(1.0, 0.0, 5.0), 3.0),
      (4, Vectors.dense(0.0, 0.0, 5.0), 4.0)
    )).toDF("id", "features", "label")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("features_scaled")
      .setMin(0.0)
      .setMax(3.0)
    val pipeline = new Pipeline().setStages(Array(scaler))

    val model = pipeline.fit(df)
    pipeline.write.overwrite().save(pipelineDir)

    val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity)
    atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, pipelineEntity))

    val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity)
    atlasClient.createEntitiesWithDependencies(Seq(modelDirEntity, modelEntity))

    val tableEntities1 = getTableEntity("chris1")
    val tableEntities2 = getTableEntity("chris2")

    atlasClient.createEntitiesWithDependencies(tableEntities1)
    atlasClient.createEntitiesWithDependencies(tableEntities2)
  }
}
Example 76
Source File: MLAtlasEntityUtilsSuite.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas.types import java.io.File import org.apache.atlas.{AtlasClient, AtlasConstants} import org.apache.atlas.model.instance.AtlasEntity import org.apache.commons.io.FileUtils import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.MinMaxScaler import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.types.{IntegerType, StringType, StructType} import org.scalatest.{FunSuite, Matchers} import com.hortonworks.spark.atlas.TestUtils._ import com.hortonworks.spark.atlas.{AtlasUtils, WithHiveSupport} class MLAtlasEntityUtilsSuite extends FunSuite with Matchers with WithHiveSupport { def getTableEntity(tableName: String): AtlasEntity = { val dbDefinition = createDB("db1", "hdfs:///test/db/db1") val sd = createStorageFormat() val schema = new StructType() .add("user", StringType, false) .add("age", IntegerType, true) val tableDefinition = createTable("db1", s"$tableName", schema, sd) val tableEntities = internal.sparkTableToEntity( tableDefinition, AtlasConstants.DEFAULT_CLUSTER_NAME, Some(dbDefinition)) val tableEntity = tableEntities.entity tableEntity } test("pipeline, pipeline model, fit and transform") { val uri = "/" val pipelineDir = "tmp/pipeline" val modelDir = "tmp/model" val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir) pipelineDirEntity.entity.getAttribute("uri") should be (uri) pipelineDirEntity.entity.getAttribute("directory") should be (pipelineDir) pipelineDirEntity.dependencies.length should be (0) val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir) modelDirEntity.entity.getAttribute("uri") should be (uri) modelDirEntity.entity.getAttribute("directory") should be (modelDir) modelDirEntity.dependencies.length should be (0) val df = sparkSession.createDataFrame(Seq( (1, Vectors.dense(0.0, 1.0, 4.0), 1.0), (2, Vectors.dense(1.0, 0.0, 4.0), 2.0), (3, Vectors.dense(1.0, 0.0, 5.0), 3.0), (4, Vectors.dense(0.0, 0.0, 5.0), 4.0) )).toDF("id", "features", "label") val scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("features_scaled") .setMin(0.0) .setMax(3.0) val pipeline = new Pipeline().setStages(Array(scaler)) val model = pipeline.fit(df) pipeline.write.overwrite().save(pipelineDir) val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity) pipelineEntity.entity.getTypeName should be (metadata.ML_PIPELINE_TYPE_STRING) pipelineEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be ( pipeline.uid) pipelineEntity.entity.getAttribute("name") should be (pipeline.uid) pipelineEntity.entity.getRelationshipAttribute("directory") should be ( AtlasUtils.entityToReference(pipelineDirEntity.entity, useGuid = false)) pipelineEntity.dependencies should be (Seq(pipelineDirEntity)) val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity) val modelUid = model.uid.replaceAll("pipeline", "model") modelEntity.entity.getTypeName should be (metadata.ML_MODEL_TYPE_STRING) modelEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be (modelUid) modelEntity.entity.getAttribute("name") should be (modelUid) modelEntity.entity.getRelationshipAttribute("directory") should be ( AtlasUtils.entityToReference(modelDirEntity.entity, useGuid = false)) modelEntity.dependencies should be (Seq(modelDirEntity)) FileUtils.deleteDirectory(new File("tmp")) } }
Example 77
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object DecisionTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val dt = new DecisionTreeClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += dt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
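A hedged usage sketch for the helper above: decisionTreePipeline expects a DataFrame that already contains a numeric label column plus the raw feature columns wired into the VectorAssembler. The local SparkSession, column names and toy rows below are illustrative assumptions, not part of the book's code:

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession
import org.sparksamples.classification.stumbleupon.DecisionTreePipeline

object DecisionTreePipelineUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("dt-usage").getOrCreate()

    // Toy StumbleUpon-like frame: two numeric features and a binary label.
    val df = spark.createDataFrame(Seq(
      (0.2, 1.5, 0.0),
      (0.9, 0.3, 1.0),
      (0.4, 1.1, 0.0),
      (0.8, 0.2, 1.0)
    )).toDF("f1", "f2", "label")

    // The classifier reads the assembler's output column via getOutputCol.
    val assembler = new VectorAssembler()
      .setInputCols(Array("f1", "f2"))
      .setOutputCol("features")

    DecisionTreePipeline.decisionTreePipeline(assembler, df)
    spark.stop()
  }
}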
Example 78
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object NaiveBayesPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val nb = new NaiveBayes() stages += vectorAssembler stages += nb val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 79
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.DataFrame object LogisticRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val lr = new LogisticRegression() val paramGrid = new ParamGridBuilder() .addGrid(lr.regParam, Array(0.1, 0.01)) .addGrid(lr.fitIntercept) .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0)) .build() val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr)) val trainValidationSplit = new TrainValidationSplit() .setEstimator(pipeline) .setEvaluator(new RegressionEvaluator) .setEstimatorParamMaps(paramGrid) // 80% of the data will be used for training and the remaining 20% for validation. .setTrainRatio(0.8) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) //val model = trainValidationSplit.fit(training) val model = trainValidationSplit.fit(dataFrame) //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val totalPoints = dataFrame.count() val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum() val accuracy = lrTotalCorrect/totalPoints println("Accuracy of LogisticRegression is: ", accuracy) holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LR.xls") holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1).saveAsTextFile("/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/Actual.xls") savePredictions(holdout, dataFrame, rm, "/Users/manpreet.singh/Sandbox/codehub/github/machinelearning/spark-ml/Chapter_06/2.0.0/scala-spark-app/src/main/scala/org/sparksamples/classification/results/LogisticRegression.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { println("Mean Squared Error:", regressionMetrics.meanSquaredError) println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError) predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
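One thing to note: the grid search above scores candidate models with a RegressionEvaluator, which compiles because both prediction and label are doubles, but RMSE is an unusual yardstick for a classifier. The sketch below swaps in a BinaryClassificationEvaluator instead; this substitution is mine, not the book's, and it assumes the default rawPrediction/label columns produced by LogisticRegression:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}

object LogisticRegressionTuningSketch {
  def buildTvs(vectorAssembler: VectorAssembler): TrainValidationSplit = {
    val lr = new LogisticRegression()
    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
      .build()

    new TrainValidationSplit()
      .setEstimator(new Pipeline().setStages(Array(vectorAssembler, lr)))
      // areaUnderROC is the evaluator's default metric; it reads rawPrediction and label.
      .setEvaluator(new BinaryClassificationEvaluator().setLabelCol("label"))
      .setEstimatorParamMaps(paramGrid)
      .setTrainRatio(0.8)
  }
}

Fitting and scoring the returned TrainValidationSplit then proceeds exactly as in the original helper.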
Example 80
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.RandomForestClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object RandomForestPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val rf = new RandomForestClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setNumTrees(20) .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += rf val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/RF.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/RandomForest.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 81
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object DecisionTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val dt = new DecisionTreeClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += dt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/DT.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/DecisionTree.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 82
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object GradientBoostedTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val gbt = new GBTClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxIter(10) stages += vectorAssembler stages += gbt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 83
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object NaiveBayesPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val nb = new NaiveBayes() stages += vectorAssembler stages += nb val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/NB.xls") savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/NaiveBayes.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 84
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.DataFrame object LogisticRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val lr = new LogisticRegression() val paramGrid = new ParamGridBuilder() .addGrid(lr.regParam, Array(0.1, 0.01)) .addGrid(lr.fitIntercept) .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0)) .build() val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr)) val trainValidationSplit = new TrainValidationSplit() .setEstimator(pipeline) .setEvaluator(new RegressionEvaluator) .setEstimatorParamMaps(paramGrid) // 80% of the data will be used for training and the remaining 20% for validation. .setTrainRatio(0.8) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) //val model = trainValidationSplit.fit(training) val model = trainValidationSplit.fit(dataFrame) //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val totalPoints = dataFrame.count() val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum() val accuracy = lrTotalCorrect/totalPoints println("Accuracy of LogisticRegression is: ", accuracy) holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/LR.xls") holdout.rdd.map(x => x(1).asInstanceOf[Double]).repartition(1).saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/Actual.xls") savePredictions(holdout, dataFrame, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/LogisticRegression.csv") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { println("Mean Squared Error:", regressionMetrics.meanSquaredError) println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError) predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 85
Source File: TextClassificationPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.textclassifier import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.Row import org.utils.StandaloneSpark object TextClassificationPipeline { def main(args: Array[String]): Unit = { val spark = StandaloneSpark.getSparkInstance() // Prepare training documents from a list of (id, text, label) tuples. val training = spark.createDataFrame(Seq( (0L, "a b c d e spark", 1.0), (1L, "b d", 0.0), (2L, "spark f g h", 1.0), (3L, "hadoop mapreduce", 0.0) )).toDF("id", "text", "label") // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. val tokenizer = new Tokenizer() .setInputCol("text") .setOutputCol("words") val hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol) .setOutputCol("features") val lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.001) val pipeline = new Pipeline() .setStages(Array(tokenizer, hashingTF, lr)) // Fit the pipeline to training documents. val model = pipeline.fit(training) // Now we can optionally save the fitted pipeline to disk model.write.overwrite().save("/tmp/spark-logistic-regression-model") // We can also save this unfit pipeline to disk pipeline.write.overwrite().save("/tmp/unfit-lr-model") // And load it back in during production val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model") // Prepare test documents, which are unlabeled (id, text) tuples. val test = spark.createDataFrame(Seq( (4L, "spark i j k"), (5L, "l m n"), (6L, "spark hadoop spark"), (7L, "apache hadoop") )).toDF("id", "text") // Make predictions on test documents. model.transform(test) .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => println(s"($id, $text) --> prob=$prob, prediction=$prediction") } } }
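StandaloneSpark.getSparkInstance() is a small helper from the book's repository. If you run this example outside that project, an equivalent local session can be created directly; a minimal sketch, assuming local mode:

import org.apache.spark.sql.SparkSession

// Drop-in stand-in for StandaloneSpark.getSparkInstance() when running the example on its own.
object StandaloneSparkSketch {
  def getSparkInstance(): SparkSession =
    SparkSession.builder()
      .appName("text-classification-pipeline")
      .master("local[*]")
      .getOrCreate()
}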
Example 86
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.classification.RandomForestClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object RandomForestPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val rf = new RandomForestClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setNumTrees(20) .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += rf val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 87
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object DecisionTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val dt = new DecisionTreeClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += dt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 88
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics} import org.apache.spark.sql.DataFrame import scala.collection.mutable object GradientBoostedTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val gbt = new GBTClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxIter(10) stages += vectorAssembler stages += gbt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0)) val labels = model.transform(test).select("label").rdd.map(_.getDouble(0)) val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision println(s" Accuracy : $accuracy") } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 89
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object NaiveBayesPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val nb = new NaiveBayes() stages += vectorAssembler stages += nb val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 90
Source File: LogisticRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.DataFrame object LogisticRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def logisticRegressionPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val lr = new LogisticRegression() val paramGrid = new ParamGridBuilder() .addGrid(lr.regParam, Array(0.1, 0.01)) .addGrid(lr.fitIntercept) .addGrid(lr.elasticNetParam, Array(0.0, 0.25, 0.5, 0.75, 1.0)) .build() val pipeline = new Pipeline().setStages(Array(vectorAssembler, lr)) val trainValidationSplit = new TrainValidationSplit() .setEstimator(pipeline) .setEvaluator(new RegressionEvaluator) .setEstimatorParamMaps(paramGrid) // 80% of the data will be used for training and the remaining 20% for validation. .setTrainRatio(0.8) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) //val model = trainValidationSplit.fit(training) val model = trainValidationSplit.fit(dataFrame) //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // have to do a type conversion for RegressionMetrics val rm = new RegressionMetrics(holdout.rdd.map(x => (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double]))) logger.info("Test Metrics") logger.info("Test Explained Variance:") logger.info(rm.explainedVariance) logger.info("Test R^2 Coef:") logger.info(rm.r2) logger.info("Test MSE:") logger.info(rm.meanSquaredError) logger.info("Test RMSE:") logger.info(rm.rootMeanSquaredError) val totalPoints = dataFrame.count() val lrTotalCorrect = holdout.rdd.map(x => if (x(0).asInstanceOf[Double] == x(1).asInstanceOf[Double]) 1 else 0).sum() val accuracy = lrTotalCorrect/totalPoints println("Accuracy of LogisticRegression is: ", accuracy) } def savePredictions(predictions:DataFrame, testRaw:DataFrame, regressionMetrics: RegressionMetrics, filePath:String) = { println("Mean Squared Error:", regressionMetrics.meanSquaredError) println("Root Mean Squared Error:", regressionMetrics.rootMeanSquaredError) predictions .coalesce(1) .write.format("com.databricks.spark.csv") .option("header", "true") .save(filePath) } }
Example 91
Source File: GeneralizedLinearRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.regression.bikesharing import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer} import org.apache.spark.ml.regression.GeneralizedLinearRegression import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{SparkSession, _} object GeneralizedLinearRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def genLinearRegressionWithVectorFormat(vectorAssembler: VectorAssembler, vectorIndexer: VectorIndexer, dataFrame: DataFrame) = { val lr = new GeneralizedLinearRegression() .setFeaturesCol("features") .setLabelCol("label") .setFamily("gaussian") .setLink("identity") .setMaxIter(10) .setRegParam(0.3) val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr)) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) val model = pipeline.fit(training) val fullPredictions = model.transform(test).cache() val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0)) val labels = fullPredictions.select("label").rdd.map(_.getDouble(0)) val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError println(s" Root mean squared error (RMSE): $RMSE") } def genLinearRegressionWithSVMFormat(spark: SparkSession) = { // Load training data val training = spark.read.format("libsvm") .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt") val lr = new GeneralizedLinearRegression() .setFamily("gaussian") .setLink("identity") .setMaxIter(10) .setRegParam(0.3) // Fit the model val model = lr.fit(training) // Print the coefficients and intercept for generalized linear regression model println(s"Coefficients: ${model.coefficients}") println(s"Intercept: ${model.intercept}") // Summarize the model over the training set and print out some metrics val summary = model.summary println(s"Coefficient Standard Errors: ${summary.coefficientStandardErrors.mkString(",")}") println(s"T Values: ${summary.tValues.mkString(",")}") println(s"P Values: ${summary.pValues.mkString(",")}") println(s"Dispersion: ${summary.dispersion}") println(s"Null Deviance: ${summary.nullDeviance}") println(s"Residual Degree Of Freedom Null: ${summary.residualDegreeOfFreedomNull}") println(s"Deviance: ${summary.deviance}") println(s"Residual Degree Of Freedom: ${summary.residualDegreeOfFreedom}") println(s"AIC: ${summary.aic}") println("Deviance Residuals: ") summary.residuals().show() } }
Example 92
Source File: LinearRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.regression.bikesharing import org.apache.log4j.Logger import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer} import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{DataFrame, SparkSession} object LinearRegressionPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def linearRegressionWithVectorFormat(vectorAssembler: VectorAssembler, vectorIndexer: VectorIndexer, dataFrame: DataFrame) = { val lr = new LinearRegression() .setFeaturesCol("features") .setLabelCol("label") .setRegParam(0.1) .setElasticNetParam(1.0) .setMaxIter(10) val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr)) val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345) val model = pipeline.fit(training) val fullPredictions = model.transform(test).cache() val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0)) val labels = fullPredictions.select("label").rdd.map(_.getDouble(0)) val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError println(s" Root mean squared error (RMSE): $RMSE") } def linearRegressionWithSVMFormat(spark: SparkSession) = { // Load training data val training = spark.read.format("libsvm") .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt") val lr = new LinearRegression() .setMaxIter(10) .setRegParam(0.3) .setElasticNetParam(0.8) // Fit the model val lrModel = lr.fit(training) // Print the coefficients and intercept for linear regression println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}") // Summarize the model over the training set and print out some metrics val trainingSummary = lrModel.summary println(s"numIterations: ${trainingSummary.totalIterations}") println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}") trainingSummary.residuals.show() println(s"RMSE: ${trainingSummary.rootMeanSquaredError}") println(s"r2: ${trainingSummary.r2}") } }
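A hedged usage sketch for linearRegressionWithVectorFormat: it needs a DataFrame with a double label column, a VectorAssembler producing an intermediate vector, and a VectorIndexer whose output column matches the "features" column the regressor reads. The CSV path, column names and maxCategories value below follow the UCI bike-sharing hourly dataset but are assumptions, not taken from the book:

import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.sparksamples.regression.bikesharing.LinearRegressionPipeline

object LinearRegressionPipelineUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("bike-lr").getOrCreate()

    // Hypothetical load of the bike-sharing hourly file; "cnt" is the ride count used as the label.
    val raw = spark.read.option("header", "true").option("inferSchema", "true")
      .csv("data/BikeSharing/hour.csv")
      .withColumn("label", col("cnt").cast("double"))

    val featureCols = Array("season", "yr", "mnth", "hr", "holiday", "weekday",
      "workingday", "weathersit", "temp", "atemp", "hum", "windspeed")

    // Assemble raw columns, then let VectorIndexer mark the low-cardinality ones as categorical.
    val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("rawFeatures")
    val indexer = new VectorIndexer()
      .setInputCol("rawFeatures")
      .setOutputCol("features")
      .setMaxCategories(12)

    LinearRegressionPipeline.linearRegressionWithVectorFormat(assembler, indexer, raw)
    spark.stop()
  }
}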
Example 93
Source File: MultilayerPerceptronClassifierWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier} import org.apache.spark.ml.feature.{IndexToString, RFormula} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} private[r] class MultilayerPerceptronClassifierWrapper private ( val pipeline: PipelineModel ) extends MLWritable { import MultilayerPerceptronClassifierWrapper._ val mlpModel: MultilayerPerceptronClassificationModel = pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel] val weights: Array[Double] = mlpModel.weights.toArray val layers: Array[Int] = mlpModel.layers def transform(dataset: Dataset[_]): DataFrame = { pipeline.transform(dataset) .drop(mlpModel.getFeaturesCol) .drop(mlpModel.getLabelCol) .drop(PREDICTED_LABEL_INDEX_COL) } override def read: MLReader[MultilayerPerceptronClassifierWrapper] = new MultilayerPerceptronClassifierWrapperReader override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path) class MultilayerPerceptronClassifierWrapperReader extends MLReader[MultilayerPerceptronClassifierWrapper]{ override def load(path: String): MultilayerPerceptronClassifierWrapper = { implicit val format = DefaultFormats val pipelinePath = new Path(path, "pipeline").toString val pipeline = PipelineModel.load(pipelinePath) new MultilayerPerceptronClassifierWrapper(pipeline) } } class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper) extends MLWriter { override protected def saveImpl(path: String): Unit = { val rMetadataPath = new Path(path, "rMetadata").toString val pipelinePath = new Path(path, "pipeline").toString val rMetadata = "class" -> instance.getClass.getName val rMetadataJson: String = compact(render(rMetadata)) sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath) instance.pipeline.save(pipelinePath) } } }
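The wrapper above only unwraps a fitted PipelineModel whose second stage is the MLP model; judging by the imports (RFormula, IndexToString, RWrapperUtils), the fit method elided from this listing assembles that pipeline on the R side. A plain-Scala sketch of an equivalent two-stage pipeline, where the toy data and layer sizes are my own choices:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.sql.SparkSession

object MlpPipelineSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("mlp-sketch").getOrCreate()

    val df = spark.createDataFrame(Seq(
      (0.0, 1.2, 0.4), (1.0, 0.1, 2.3), (0.0, 1.0, 0.2), (1.0, 0.3, 2.0)
    )).toDF("label", "x1", "x2")

    // RFormula emits "features"/"label"; the classifier sits at stages(1), as the wrapper expects.
    val rFormula = new RFormula().setFormula("label ~ x1 + x2")
    val mlp = new MultilayerPerceptronClassifier()
      .setLayers(Array(2, 4, 2)) // input size, one hidden layer, two output classes
      .setMaxIter(50)
      .setSeed(42L)

    val pipelineModel = new Pipeline().setStages(Array(rFormula, mlp)).fit(df)
    val mlpModel = pipelineModel.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel]
    println(s"layers = ${mlpModel.layers.mkString(",")}")
    spark.stop()
  }
}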
Example 94
Source File: Preprocessor.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions import config.paramconf.PreprocessParams import functions.clean.Cleaner import functions.segment.Segmenter import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{CountVectorizer, IDF, StopWordsRemover, StringIndexer} import org.apache.spark.sql.DataFrame def preprocess(data: DataFrame): Pipeline = { val spark = data.sparkSession val params = new PreprocessParams val indexModel = new StringIndexer() .setHandleInvalid(params.handleInvalid) .setInputCol("label") .setOutputCol("indexedLabel") .fit(data) val cleaner = new Cleaner() .setFanJian(params.fanjian) .setQuanBan(params.quanban) .setMinLineLen(params.minLineLen) .setInputCol("content") .setOutputCol("cleand") val segmenter = new Segmenter() .isAddNature(params.addNature) .isDelEn(params.delEn) .isDelNum(params.delNum) .isNatureFilter(params.natureFilter) .setMinTermLen(params.minTermLen) .setMinTermNum(params.minTermNum) .setSegType(params.segmentType) .setInputCol(cleaner.getOutputCol) .setOutputCol("segmented") val stopwords = spark.sparkContext.textFile(params.stopwordFilePath).collect() val remover = new StopWordsRemover() .setStopWords(stopwords) .setInputCol(segmenter.getOutputCol) .setOutputCol("removed") val vectorizer = new CountVectorizer() .setMinTF(params.minTF) .setVocabSize(params.vocabSize) .setInputCol(remover.getOutputCol) .setOutputCol("vectorized") val idf = new IDF() .setMinDocFreq(params.minDocFreq) .setInputCol(vectorizer.getOutputCol) .setOutputCol("features") val stages = Array(cleaner, indexModel, segmenter, remover, vectorizer, idf) new Pipeline().setStages(stages) } }
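A usage sketch for preprocess, with two loud assumptions: that the declaration trimmed from this listing exposes the method as functions.Preprocessor.preprocess, and that the stopword file and segmenter resources referenced by PreprocessParams are available. The toy rows are Chinese, since the Cleaner/Segmenter parameters (fanjian, quanban) target Chinese text:

import org.apache.spark.sql.SparkSession

object PreprocessUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("preprocess-usage").getOrCreate()

    // Illustrative rows: a "content" text column and a string "label", as the stages expect.
    val df = spark.createDataFrame(Seq(
      ("sports", "湖人队昨晚赢得了总冠军"),
      ("tech", "新版编译器的构建速度明显提升")
    )).toDF("label", "content")

    val pipeline = functions.Preprocessor.preprocess(df) // returns an unfitted Pipeline
    val model = pipeline.fit(df)                         // cleaner -> indexer -> segmenter -> ... -> IDF
    model.transform(df).select("indexedLabel", "features").show(false)
    spark.stop()
  }
}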
Example 95
Source File: recursive.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import org.apache.spark.ml.{Pipeline, PipelineModel} package object recursive { implicit def p2recursive(pipeline: Pipeline): RecursivePipeline = new RecursivePipeline(pipeline) implicit def pm2recursive(pipelineModel: PipelineModel): RecursivePipelineModel = new RecursivePipelineModel(pipelineModel.uid, pipelineModel) implicit def pm2light(pipelineModel: PipelineModel): LightPipeline = new LightPipeline(pipelineModel) implicit class Recursive(p: Pipeline) { def recursive: RecursivePipeline = { new RecursivePipeline(p) } } implicit class RecursiveModel(p: PipelineModel) { def recursive: RecursivePipelineModel = { new RecursivePipelineModel(p.uid, p) } } }
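The package object above is what makes any plain Pipeline or PipelineModel upgradeable after the fact, either through an implicit conversion or through the .recursive extension method. A short sketch; the DocumentAssembler/Tokenizer stages are only there as placeholders:

import com.johnsnowlabs.nlp.DocumentAssembler
import com.johnsnowlabs.nlp.RecursivePipeline
import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.recursive._ // brings p2recursive / pm2recursive / .recursive into scope
import org.apache.spark.ml.Pipeline

object RecursiveImplicitsSketch {
  def build(): Unit = {
    val documentAssembler = new DocumentAssembler().setInputCol("text").setOutputCol("document")
    val tokenizer = new Tokenizer().setInputCols(Array("document")).setOutputCol("token")

    val plain = new Pipeline().setStages(Array(documentAssembler, tokenizer))

    // Either rely on the implicit conversion...
    val viaConversion: RecursivePipeline = plain
    // ...or call the extension method added by the implicit class.
    val viaExtension = plain.recursive
    println(viaConversion.uid == viaExtension.uid) // both wrap the same uid -> true
  }
}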
Example 96
Source File: RecursivePipeline.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import org.apache.spark.internal.Logging import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter} import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset} import scala.collection.mutable.ListBuffer class RecursivePipeline(override val uid: String, baseStages: Array[PipelineStage]) extends Pipeline { def this() = this(Identifiable.randomUID("RECURSIVE_PIPELINE"), Array.empty) def this(uid: String) = this(uid, Array.empty) def this(pipeline: Pipeline) = this(pipeline.uid, pipeline.getStages) this.setStages(baseStages) override def fit(dataset: Dataset[_]): PipelineModel = { transformSchema(dataset.schema, logging = true) val theStages = $(stages) var indexOfLastEstimator = -1 theStages.view.zipWithIndex.foreach { case (stage, index) => stage match { case _: Estimator[_] => indexOfLastEstimator = index case _ => } } var curDataset = dataset val transformers = ListBuffer.empty[Transformer] theStages.view.zipWithIndex.foreach { case (stage, index) => if (index <= indexOfLastEstimator) { val transformer = stage match { case estimator: HasRecursiveFit[_] => estimator.recursiveFit(curDataset, new Pipeline(uid).setStages(transformers.toArray).fit(dataset)) case estimator: Estimator[_] => estimator.fit(curDataset) case t: Transformer => t case _ => throw new IllegalArgumentException( s"Does not support stage $stage of type ${stage.getClass}") } if (index < indexOfLastEstimator) { curDataset = transformer.transform(curDataset) } transformers += transformer } else { transformers += stage.asInstanceOf[Transformer] } } createPipeline(dataset, transformers.toArray) } } class RecursivePipelineModel(override val uid: String, innerPipeline: PipelineModel) extends Model[RecursivePipelineModel] with MLWritable with Logging { def this(pipeline: PipelineModel) = this(pipeline.uid, pipeline) // drops right at most because is itself included private def createRecursiveAnnotators(dataset: Dataset[_]): PipelineModel = new Pipeline(uid).setStages(innerPipeline.stages.dropRight(1)).fit(dataset) override def copy(extra: ParamMap): RecursivePipelineModel = { new RecursivePipelineModel(uid, innerPipeline.copy(extra)) } override def write: MLWriter = { innerPipeline.write } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) innerPipeline.stages.foldLeft(dataset.toDF)((cur, transformer) => transformer match { case t: HasRecursiveTransform[_] => t.recursiveTransform(cur, createRecursiveAnnotators(dataset)) case t: AnnotatorModel[_] if t.getLazyAnnotator => cur case t: Transformer => t.transform(cur) }) } override def transformSchema(schema: StructType): StructType = { innerPipeline.transformSchema(schema) } }
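From the caller's side a RecursivePipeline is driven exactly like a normal Pipeline; the difference only matters for stages implementing HasRecursiveFit or HasRecursiveTransform, which additionally receive the already-fitted prefix of the pipeline. A minimal fit-and-transform sketch with ordinary spark-nlp stages and a one-row toy DataFrame:

import com.johnsnowlabs.nlp.{DocumentAssembler, RecursivePipeline}
import com.johnsnowlabs.nlp.annotators.Tokenizer
import org.apache.spark.sql.SparkSession

object RecursivePipelineSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("recursive-pipeline").getOrCreate()
    import spark.implicits._
    val data = Seq("Spark NLP ships a recursive pipeline").toDF("text")

    val documentAssembler = new DocumentAssembler().setInputCol("text").setOutputCol("document")
    val tokenizer = new Tokenizer().setInputCols(Array("document")).setOutputCol("token")

    // Fits like an ordinary Pipeline; recursive stages (none here) would also see the partial model.
    val model = new RecursivePipeline()
      .setStages(Array(documentAssembler, tokenizer))
      .fit(data)

    model.transform(data).select("token.result").show(false)
    spark.stop()
  }
}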
Example 97
Source File: PubTator.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.training import com.johnsnowlabs.nlp.annotator.{PerceptronModel, SentenceDetector, Tokenizer} import com.johnsnowlabs.nlp.{Annotation, AnnotatorType, DocumentAssembler, Finisher} import org.apache.spark.ml.Pipeline import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, SparkSession} object PubTator { def readDataset(spark: SparkSession, path: String): DataFrame = { val pubtator = spark.sparkContext.textFile(path) val titles = pubtator.filter(x => x.contains("|a|") | x.contains("|t|")) val titlesText = titles.map(x => x.split("\\|")).groupBy(_.head) .map(x => (x._1.toInt, x._2.foldLeft(Seq[String]())((a, b) => a ++ Seq(b.last)))).map(x => (x._1, x._2.mkString(" "))) val df = spark.createDataFrame(titlesText).toDF("doc_id", "text") val docAsm = new DocumentAssembler().setInputCol("text").setOutputCol("document") val setDet = new SentenceDetector().setInputCols("document").setOutputCol("sentence") val tknz = new Tokenizer().setInputCols("sentence").setOutputCol("token") val pl = new Pipeline().setStages(Array(docAsm, setDet, tknz)) val nlpDf = pl.fit(df).transform(df) val annotations = pubtator.filter(x => !x.contains("|a|") & !x.contains("|t|") & x.nonEmpty) val splitAnnotations = annotations.map(_.split("\\t")).map(x => (x(0), x(1).toInt, x(2).toInt - 1, x(3), x(4), x(5))) val docAnnotations = splitAnnotations.groupBy(_._1).map(x => (x._1, x._2)) .map(x => (x._1.toInt, x._2.zipWithIndex.map(a => (new Annotation(AnnotatorType.CHUNK, a._1._2, a._1._3, a._1._4, Map("entity" -> a._1._5, "chunk" -> a._2.toString), Array[Float]()))).toList ) ) val chunkMeta = new MetadataBuilder().putString("annotatorType", AnnotatorType.CHUNK).build() val annDf = spark.createDataFrame(docAnnotations).toDF("doc_id", "chunk") .withColumn("chunk", col("chunk").as("chunk", chunkMeta)) val alignedDf = nlpDf.join(annDf, Seq("doc_id")).selectExpr("doc_id", "sentence", "token", "chunk") val iobTagging = udf((tokens: Seq[Row], chunkLabels: Seq[Row]) => { val tokenAnnotations = tokens.map(Annotation(_)) val labelAnnotations = chunkLabels.map(Annotation(_)) tokenAnnotations.map(ta => { val tokenLabel = labelAnnotations.filter(la => la.begin <= ta.begin && la.end >= ta.end).headOption val tokenTag = { if (tokenLabel.isEmpty) "O" else { val tokenCSV = tokenLabel.get.metadata.get("entity").get if (tokenCSV == "UnknownType") "O" else { val tokenPrefix = if (ta.begin == tokenLabel.get.begin) "B-" else "I-" val paddedTokenTag = "T" + "%03d".format(tokenCSV.split(",")(0).slice(1, 4).toInt) tokenPrefix + paddedTokenTag } } } Annotation(AnnotatorType.NAMED_ENTITY, ta.begin, ta.end, tokenTag, Map("word" -> ta.result) ) } ) }) val labelMeta = new MetadataBuilder().putString("annotatorType", AnnotatorType.NAMED_ENTITY).build() val taggedDf = alignedDf.withColumn("label", iobTagging(col("token"), col("chunk")).as("label", labelMeta)) val pos = PerceptronModel.pretrained().setInputCols(Array("sentence", "token")).setOutputCol("pos") val finisher = new Finisher().setInputCols("token", "pos", "label").setIncludeMetadata(true) val finishingPipeline = new Pipeline().setStages(Array(pos, finisher)) finishingPipeline.fit(taggedDf).transform(taggedDf) .withColumnRenamed("finished_label", "finished_ner") //CoNLL generator expects finished_ner } }
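readDataset turns a raw PubTator export (title/abstract lines plus tab-separated mention lines) into a token-level IOB-tagged DataFrame whose finished_ner column is what the CoNLL generator expects. A usage sketch with a hypothetical local path; note that the POS step downloads a pretrained PerceptronModel:

import com.johnsnowlabs.nlp.training.PubTator
import org.apache.spark.sql.SparkSession

object PubTatorUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pubtator-usage").getOrCreate()

    // Hypothetical path to a PubTator-formatted corpus file.
    val tagged = PubTator.readDataset(spark, "data/pubtator/corpus.txt")

    tagged.select("doc_id", "finished_token", "finished_ner").show(5, false)
    spark.stop()
  }
}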
Example 98
Source File: WordEmbeddingsTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.nlp.annotators.Tokenizer import com.johnsnowlabs.nlp.base.{DocumentAssembler, RecursivePipeline} import com.johnsnowlabs.nlp.util.io.{ReadAs, ResourceHelper} import org.apache.spark.ml.{Pipeline, PipelineModel} import org.scalatest._ class WordEmbeddingsTestSpec extends FlatSpec { "Word Embeddings" should "correctly embed clinical words not embed non-existent words" ignore { val words = ResourceHelper.spark.read.option("header","true").csv("src/test/resources/embeddings/clinical_words.txt") val notWords = ResourceHelper.spark.read.option("header","true").csv("src/test/resources/embeddings/not_words.txt") val documentAssembler = new DocumentAssembler() .setInputCol("word") .setOutputCol("document") val tokenizer = new Tokenizer() .setInputCols(Array("document")) .setOutputCol("token") val embeddings = WordEmbeddingsModel.pretrained() .setInputCols("document", "token") .setOutputCol("embeddings") .setCaseSensitive(false) val pipeline = new RecursivePipeline() .setStages(Array( documentAssembler, tokenizer, embeddings )) val wordsP = pipeline.fit(words).transform(words).cache() val notWordsP = pipeline.fit(notWords).transform(notWords).cache() val wordsCoverage = WordEmbeddingsModel.withCoverageColumn(wordsP, "embeddings", "cov_embeddings") val notWordsCoverage = WordEmbeddingsModel.withCoverageColumn(notWordsP, "embeddings", "cov_embeddings") wordsCoverage.select("word","cov_embeddings").show() notWordsCoverage.select("word","cov_embeddings").show() val wordsOverallCoverage = WordEmbeddingsModel.overallCoverage(wordsCoverage,"embeddings").percentage val notWordsOverallCoverage = WordEmbeddingsModel.overallCoverage(notWordsCoverage,"embeddings").percentage ResourceHelper.spark.createDataFrame( Seq( ("Words", wordsOverallCoverage),("Not Words", notWordsOverallCoverage) ) ).toDF("Dataset", "OverallCoverage").show() assert(wordsOverallCoverage == 1) assert(notWordsOverallCoverage == 0) } "Word Embeddings" should "store and load from disk" in { val data = ResourceHelper.spark.read.option("header","true").csv("src/test/resources/embeddings/clinical_words.txt") val documentAssembler = new DocumentAssembler() .setInputCol("word") .setOutputCol("document") val tokenizer = new Tokenizer() .setInputCols(Array("document")) .setOutputCol("token") val embeddings = new WordEmbeddings() .setStoragePath("src/test/resources/random_embeddings_dim4.txt", ReadAs.TEXT) .setDimension(4) .setStorageRef("glove_4d") .setInputCols("document", "token") .setOutputCol("embeddings") val pipeline = new Pipeline() .setStages(Array( documentAssembler, tokenizer, embeddings )) val model = pipeline.fit(data) model.write.overwrite().save("./tmp_embeddings_pipeline") model.transform(data).show(5) val loadedPipeline1 = PipelineModel.load("./tmp_embeddings_pipeline") loadedPipeline1.transform(data).show(5) val loadedPipeline2 = PipelineModel.load("./tmp_embeddings_pipeline") loadedPipeline2.transform(data).show(5) } }
Example 99
Source File: ElmoEmbeddingsTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.nlp.annotators.Tokenizer import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector import com.johnsnowlabs.nlp.base.DocumentAssembler import com.johnsnowlabs.nlp.util.io.ResourceHelper.spark.implicits._ import org.apache.spark.ml.Pipeline import org.apache.spark.sql.functions.{size, explode} import org.scalatest._ class ElmoEmbeddingsTestSpec extends FlatSpec { "Elmo Embeddings" should "generate annotations" ignore { System.out.println("Working Directory = " + System.getProperty("user.dir")) val data = Seq( "I like pancakes in the summer. I hate ice cream in winter.", "If I had asked people what they wanted, they would have said faster horses" ).toDF("text") val document = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val sentence = new SentenceDetector() .setInputCols("document") .setOutputCol("sentence") val tokenizer = new Tokenizer() .setInputCols(Array("document")) .setOutputCol("token") val elmoSavedModel = ElmoEmbeddings.pretrained() .setPoolingLayer("word_emb") .setInputCols(Array("token", "document")) .setOutputCol("embeddings") elmoSavedModel.write.overwrite().save("./tmp_elmo_tf") val embeddings = ElmoEmbeddings.load("./tmp_elmo_tf") val pipeline = new Pipeline().setStages(Array( document, sentence, tokenizer, embeddings )) val elmoDDD = pipeline.fit(data).transform(data) elmoDDD.select("embeddings.result").show(false) elmoDDD.select("embeddings.metadata").show(false) val explodeEmbds = elmoDDD.select(explode($"embeddings.embeddings").as("embedding")) elmoDDD.select(size(elmoDDD("embeddings.embeddings")).as("embeddings_size")).show explodeEmbds.select(size($"embedding").as("embeddings_size")).show } }
Example 100
Source File: XlnetEmbeddingsTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.nlp.annotator._ import com.johnsnowlabs.nlp.base._ import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.util.Benchmark import org.apache.spark.ml.Pipeline import org.apache.spark.sql.functions.size import org.scalatest._ class XlnetEmbeddingsTestSpec extends FlatSpec { "Xlnet Embeddings" should "correctly load pretrained model" ignore { val smallCorpus = ResourceHelper.spark.read.option("header","true") .csv("src/test/resources/embeddings/sentence_embeddings.csv") val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val sentence = new SentenceDetector() .setInputCols("document") .setOutputCol("sentence") val tokenizer = new Tokenizer() .setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = XlnetEmbeddings.pretrained() .setInputCols("sentence", "token") .setOutputCol("embeddings") val pipeline = new Pipeline() .setStages(Array( documentAssembler, sentence, tokenizer, embeddings )) val pipelineDF = pipeline.fit(smallCorpus).transform(smallCorpus) println(pipelineDF.count()) pipelineDF.show() // pipelineDF.printSchema() pipelineDF.select("token.result").show(4, false) pipelineDF.select("embeddings.result").show(4, false) pipelineDF.select("embeddings.metadata").show(4, false) pipelineDF.select("embeddings.embeddings").show(4, truncate = 300) pipelineDF.select(size(pipelineDF("embeddings.embeddings")).as("embeddings_size")).show Benchmark.time("Time to save XlnetEmbeddings results") { pipelineDF.select("embeddings").write.mode("overwrite").parquet("./tmp_xlnet_embeddings") } } }
Example 101
Source File: AlbertEmbeddingsTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.nlp.base._ import com.johnsnowlabs.nlp.annotator._ import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.util.Benchmark import org.apache.spark.ml.Pipeline import org.apache.spark.sql.functions.size import org.scalatest._ class AlbertEmbeddingsTestSpec extends FlatSpec { "ALBert Embeddings" should "correctly load pretrained model" ignore { val smallCorpus = ResourceHelper.spark.read.option("header","true") .csv("src/test/resources/embeddings/sentence_embeddings.csv") val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val sentence = new SentenceDetector() .setInputCols("document") .setOutputCol("sentence") val tokenizer = new Tokenizer() .setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = AlbertEmbeddings.pretrained() .setInputCols("sentence", "token") .setOutputCol("embeddings") val pipeline = new Pipeline() .setStages(Array( documentAssembler, sentence, tokenizer, embeddings )) val pipelineDF = pipeline.fit(smallCorpus).transform(smallCorpus) println(pipelineDF.count()) pipelineDF.show() // pipelineDF.printSchema() pipelineDF.select("token.result").show(4, false) pipelineDF.select("embeddings.result").show(4, false) pipelineDF.select("embeddings.metadata").show(4, false) pipelineDF.select("embeddings.embeddings").show(4, truncate = 300) pipelineDF.select(size(pipelineDF("embeddings.embeddings")).as("embeddings_size")).show Benchmark.time("Time to save BertEmbeddings results") { pipelineDF.select("embeddings").write.mode("overwrite").parquet("./tmp_albert_embeddings") } } }
Example 102
Source File: FunctionsTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.annotator.{PerceptronApproach, Tokenizer} import com.johnsnowlabs.nlp.training.POS import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper} import org.apache.spark.ml.Pipeline import org.apache.spark.sql.types.ArrayType import org.scalatest._ class FunctionsTestSpec extends FlatSpec { "functions in functions" should "work successfully" in { import com.johnsnowlabs.nlp.util.io.ResourceHelper.spark.implicits._ val trainingPerceptronDF = POS().readDataset(ResourceHelper.spark, "src/test/resources/anc-pos-corpus-small/", "|", "tags") val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val tokenizer = new Tokenizer() .setInputCols(Array("document")) .setOutputCol("token") val pos = new PerceptronApproach() .setInputCols("document", "token") .setOutputCol("pos") .setPosColumn("tags") .setNIterations(3) val pipeline = new Pipeline() .setStages(Array( documentAssembler, tokenizer, pos )) val model = pipeline.fit(trainingPerceptronDF) val data = model.transform(Seq("Peter is a very good and compromised person.").toDF("text")) import functions._ val mapped = data.mapAnnotationsCol("pos", "modpos", (annotations: Seq[Annotation]) => { annotations.filter(_.result == "JJ") }) val modified = data.mapAnnotationsCol("pos", "modpos", (_: Seq[Annotation]) => { "hello world" }) val filtered = data.filterByAnnotationsCol("pos", (annotations: Seq[Annotation]) => { annotations.exists(_.result == "JJ") }) import org.apache.spark.sql.functions.col val udfed = data.select(mapAnnotations((annotations: Seq[Annotation]) => { annotations.filter(_.result == "JJ") }, ArrayType(Annotation.dataType))(col("pos"))) val udfed2 = data.select(mapAnnotationsStrict((annotations: Seq[Annotation]) => { annotations.filter(_.result == "JJ") })(col("pos"))) mapped.show(truncate = false) modified.show(truncate = false) filtered.show(truncate = false) udfed.show(truncate = false) udfed2.show(truncate = false) } }
Example 103
Source File: DependencyParserBehaviors.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.parser.dep import com.johnsnowlabs.nlp._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.scalatest.FlatSpec import com.johnsnowlabs.util.PipelineModels import org.apache.spark.ml.Pipeline trait DependencyParserBehaviors { this: FlatSpec => def initialAnnotations(testDataSet: Dataset[Row]): Unit = { val fixture = createFixture(testDataSet) it should "add annotations" in { assert(fixture.dependencies.count > 0, "Annotations count should be greater than 0") } it should "add annotations with the correct annotationType" in { fixture.depAnnotations.foreach { a => assert(a.annotatorType == AnnotatorType.DEPENDENCY, s"Annotation type should ${AnnotatorType.DEPENDENCY}") } } it should "annotate each token" in { assert(fixture.tokenAnnotations.size == fixture.depAnnotations.size, s"Every token should be annotated") } it should "annotate each word with a head" in { fixture.depAnnotations.foreach { a => assert(a.result.nonEmpty, s"Result should have a head") } } it should "annotate each word with the correct indexes" in { fixture.depAnnotations .zip(fixture.tokenAnnotations) .foreach { case (dep, token) => assert(dep.begin == token.begin && dep.end == token.end, s"Token and word should have equal indixes") } } } private def createFixture(testDataSet: Dataset[Row]) = new { val dependencies: DataFrame = testDataSet.select("dependency") val depAnnotations: Seq[Annotation] = dependencies .collect .flatMap { r => r.getSeq[Row](0) } .map { r => Annotation(r.getString(0), r.getInt(1), r.getInt(2), r.getString(3), r.getMap[String, String](4)) } val tokens: DataFrame = testDataSet.select("token") val tokenAnnotations: Seq[Annotation] = tokens .collect .flatMap { r => r.getSeq[Row](0) } .map { r => Annotation(r.getString(0), r.getInt(1), r.getInt(2), r.getString(3), r.getMap[String, String](4)) } } def relationshipsBetweenWordsPredictor(testDataSet: Dataset[Row], pipeline: Pipeline): Unit = { val emptyDataSet = PipelineModels.dummyDataset val dependencyParserModel = pipeline.fit(emptyDataSet) it should "train a model" in { val model = dependencyParserModel.stages.last.asInstanceOf[DependencyParserModel] assert(model.isInstanceOf[DependencyParserModel]) } val dependencyParserDataFrame = dependencyParserModel.transform(testDataSet) //dependencyParserDataFrame.collect() dependencyParserDataFrame.select("dependency").show(false) it should "predict relationships between words" in { assert(dependencyParserDataFrame.isInstanceOf[DataFrame]) } } }
Example 104
Source File: LemmatizerTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector import com.johnsnowlabs.nlp._ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.sql.types.ArrayType import org.apache.spark.sql.{Dataset, Row} import org.scalatest._ class LemmatizerTestSpec extends FlatSpec with LemmatizerBehaviors { require(Some(SparkAccessor).isDefined) val lemmatizer = new Lemmatizer "a lemmatizer" should s"be of type ${AnnotatorType.TOKEN}" in { assert(lemmatizer.outputAnnotatorType == AnnotatorType.TOKEN) } val latinBodyData: Dataset[Row] = DataBuilder.basicDataBuild(ContentProvider.latinBody) "A full Normalizer pipeline with latin content" should behave like fullLemmatizerPipeline(latinBodyData) "A lemmatizer" should "be readable and writable" taggedAs Tag("LinuxOnly") in { val lemmatizer = new Lemmatizer().setDictionary("src/test/resources/lemma-corpus-small/lemmas_small.txt", "->", "\t") val path = "./test-output-tmp/lemmatizer" try { lemmatizer.write.overwrite.save(path) val lemmatizerRead = Lemmatizer.read.load(path) assert(lemmatizer.getDictionary.path == lemmatizerRead.getDictionary.path) } catch { case _: java.io.IOException => succeed } } "A lemmatizer" should "work under a pipeline framework" in { val data = ContentProvider.parquetData.limit(1000) val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val sentenceDetector = new SentenceDetector() .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() .setInputCols(Array("sentence")) .setOutputCol("token") val lemmatizer = new Lemmatizer() .setInputCols(Array("token")) .setOutputCol("lemma") .setDictionary("src/test/resources/lemma-corpus-small/lemmas_small.txt", "->", "\t") val finisher = new Finisher() .setInputCols("lemma") val pipeline = new Pipeline() .setStages(Array( documentAssembler, sentenceDetector, tokenizer, lemmatizer, finisher )) val recursivePipeline = new RecursivePipeline() .setStages(Array( documentAssembler, sentenceDetector, tokenizer, lemmatizer, finisher )) val model = pipeline.fit(data) model.transform(data).show() val PIPE_PATH = "./tmp_pipeline" model.write.overwrite().save(PIPE_PATH) val loadedPipeline = PipelineModel.read.load(PIPE_PATH) loadedPipeline.transform(data).show val recursiveModel = recursivePipeline.fit(data) recursiveModel.transform(data).show() recursiveModel.write.overwrite().save(PIPE_PATH) val loadedRecPipeline = PipelineModel.read.load(PIPE_PATH) loadedRecPipeline.transform(data).show succeed } }
Example 105
Source File: LanguageDetectorDLTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.ld.dl import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector import com.johnsnowlabs.nlp.base.DocumentAssembler import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.ml.{Pipeline, PipelineModel} import org.scalatest._ class LanguageDetectorDLTestSpec extends FlatSpec { "LanguageDetectorDL" should "correctly load saved model" in { val smallCorpus = ResourceHelper.spark.read .option("header", true) .option("delimiter", "|") .csv("src/test/resources/language-detector/multilingual_sample.txt") val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val sentence = new SentenceDetector() .setInputCols(Array("document")) .setOutputCol("sentence") val languageDetector = LanguageDetectorDL.pretrained("ld_wiki_20") .setInputCols("sentence") .setOutputCol("language") .setThreshold(0.3f) .setCoalesceSentences(true) val pipeline = new Pipeline() .setStages(Array( documentAssembler, sentence, languageDetector )) val pipelineDF = pipeline.fit(smallCorpus).transform(smallCorpus) println(pipelineDF.count()) smallCorpus.show(2) pipelineDF.show(2) pipelineDF.select("sentence").show(4, false) pipelineDF.select("language.metadata").show(20, false) pipelineDF.select("language.result", "lang").show(20, false) pipeline.fit(smallCorpus).write.overwrite().save("./tmp_ld_pipeline") val pipelineModel = PipelineModel.load("./tmp_ld_pipeline") pipelineModel.transform(smallCorpus).select("language.result", "lang").show(20, false) } }
Example 106
Source File: SentimentDLTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.nlp.annotator._ import com.johnsnowlabs.nlp.base._ import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.ml.Pipeline import org.scalatest._ class SentimentDLTestSpec extends FlatSpec { val spark = ResourceHelper.spark "SentimentDL" should "correctly train on a test dataset" ignore { val smallCorpus = ResourceHelper.spark.read.option("header", "true").csv("src/test/resources/classifier/sentiment.csv") smallCorpus.show val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val useEmbeddings = UniversalSentenceEncoder.pretrained() .setInputCols("document") .setOutputCol("sentence_embeddings") val docClassifier = new SentimentDLApproach() .setInputCols("sentence_embeddings") .setOutputCol("sentiment") .setLabelColumn("label") .setBatchSize(32) .setMaxEpochs(1) .setLr(5e-3f) .setDropout(0.5f) val pipeline = new Pipeline() .setStages( Array( documentAssembler, useEmbeddings, docClassifier ) ) val pipelineModel = pipeline.fit(smallCorpus) pipelineModel.stages.last.asInstanceOf[SentimentDLModel].write.overwrite().save("./tmp_sentimentDL_model") val pipelineDF = pipelineModel.transform(smallCorpus) pipelineDF.select("document").show(10) pipelineDF.select("sentiment").show(10) pipelineDF.select("sentiment.result").show(10, false) pipelineDF.select("sentiment.metadata").show(10, false) } }
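The test above saves only the trained SentimentDLModel stage. Below is a minimal sketch of reloading it into a fresh inference pipeline, reusing documentAssembler, useEmbeddings and smallCorpus from the test; it assumes SentimentDLModel exposes the same load method other Spark NLP models (e.g. ElmoEmbeddings above) do, and the save path matches the one used in the test.

import org.apache.spark.ml.Pipeline

// Reload the saved stage and wire it up with the same upstream annotators
val loadedSentiment = SentimentDLModel.load("./tmp_sentimentDL_model")
  .setInputCols("sentence_embeddings")
  .setOutputCol("sentiment")

val inferencePipeline = new Pipeline()
  .setStages(Array(documentAssembler, useEmbeddings, loadedSentiment))

inferencePipeline.fit(smallCorpus)
  .transform(smallCorpus)
  .select("sentiment.result")
  .show(5, false)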
Example 107
Source File: ClassifierDLTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.nlp.annotator._ import com.johnsnowlabs.nlp.base._ import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.ml.Pipeline import org.scalatest._ class ClassifierDLTestSpec extends FlatSpec { "ClassifierDL" should "correctly train IMDB train dataset" ignore { val smallCorpus = ResourceHelper.spark.read.option("header","true").csv("src/test/resources/classifier/sentiment.csv") println("count of training dataset: ", smallCorpus.count) smallCorpus.show() smallCorpus.printSchema() val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val useEmbeddings = UniversalSentenceEncoder.pretrained() .setInputCols("document") .setOutputCol("sentence_embeddings") val docClassifier = new ClassifierDLApproach() .setInputCols("sentence_embeddings") .setOutputCol("category") .setLabelColumn("label") .setBatchSize(64) .setMaxEpochs(20) .setLr(5e-3f) .setDropout(0.5f) val pipeline = new Pipeline() .setStages( Array( documentAssembler, useEmbeddings, docClassifier ) ) val pipelineModel = pipeline.fit(smallCorpus) pipelineModel.transform(smallCorpus).select("document").show(1, false) } }
Example 108
Source File: StopWordsCleanerTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.AnnotatorType.TOKEN import com.johnsnowlabs.nlp.Annotation import com.johnsnowlabs.nlp.base._ import com.johnsnowlabs.nlp.annotator._ import org.scalatest.FlatSpec import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.ml.Pipeline import org.apache.spark.sql.functions.size class StopWordsCleanerTestSpec extends FlatSpec { "StopWordsCleaner" should "correctly remove stop words from tokenizer's results" in { val testData = ResourceHelper.spark.createDataFrame(Seq( (1, "This is my first sentence. This is my second."), (2, "This is my third sentence. This is my forth.") )).toDF("id", "text") // Let's remove "this" and "is" as stop words val expectedWithoutStopWords = Seq( Annotation(TOKEN, 8, 9, "my", Map("sentence" -> "0")), Annotation(TOKEN, 11, 15, "first", Map("sentence" -> "0")), Annotation(TOKEN, 17, 24, "sentence", Map("sentence" -> "0")), Annotation(TOKEN, 25, 25, ".", Map("sentence" -> "0")), Annotation(TOKEN, 35, 36, "my", Map("sentence" -> "1")), Annotation(TOKEN, 38, 43, "second", Map("sentence" -> "1")), Annotation(TOKEN, 44, 44, ".", Map("sentence" -> "1")), Annotation(TOKEN, 8, 9, "my", Map("sentence" -> "0")), Annotation(TOKEN, 11, 15, "third", Map("sentence" -> "0")), Annotation(TOKEN, 17, 24, "sentence", Map("sentence" -> "0")), Annotation(TOKEN, 25, 25, ".", Map("sentence" -> "0")), Annotation(TOKEN, 35, 36, "my", Map("sentence" -> "1")), Annotation(TOKEN, 38, 42, "forth", Map("sentence" -> "1")), Annotation(TOKEN, 43, 43, ".", Map("sentence" -> "1")) ) val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val sentence = new SentenceDetector() .setInputCols("document") .setOutputCol("sentence") val tokenizer = new Tokenizer() .setInputCols(Array("sentence")) .setOutputCol("token") val stopWords = new StopWordsCleaner() .setInputCols("token") .setOutputCol("cleanTokens") .setStopWords(Array("this", "is")) .setCaseSensitive(false) val pipeline = new Pipeline() .setStages(Array( documentAssembler, sentence, tokenizer, stopWords )) val pipelineDF = pipeline.fit(testData).transform(testData) pipelineDF.select(size(pipelineDF("token.result")).as("totalTokens")).show pipelineDF.select(size(pipelineDF("cleanTokens.result")).as("totalCleanedTokens")).show val tokensWithoutStopWords = Annotation.collect(pipelineDF, "cleanTokens").flatten.toSeq assert(tokensWithoutStopWords == expectedWithoutStopWords) } }
Example 109
Source File: ChunkTokenizerTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector import com.johnsnowlabs.nlp.util.io.ReadAs import com.johnsnowlabs.nlp.{Annotation, DocumentAssembler, Finisher, SparkAccessor} import org.apache.spark.ml.Pipeline import org.scalatest.FlatSpec class ChunkTokenizerTestSpec extends FlatSpec { "a ChunkTokenizer" should "correctly identify origin source and in correct order" in { import SparkAccessor.spark.implicits._ val data = Seq( "Hello world, my name is Michael, I am an artist and I work at Benezar", "Robert, an engineer from Farendell, graduated last year. The other one, Lucas, graduated last week." ).toDS.toDF("text") val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val sentenceDetector = new SentenceDetector() .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() .setInputCols(Array("sentence")) .setOutputCol("token") val entityExtractor = new TextMatcher() .setInputCols("sentence", "token") .setEntities("src/test/resources/entity-extractor/test-chunks.txt", ReadAs.TEXT) .setOutputCol("entity") val chunkTokenizer = new ChunkTokenizer() .setInputCols("entity") .setOutputCol("chunk_token") val pipeline = new Pipeline() .setStages(Array( documentAssembler, sentenceDetector, tokenizer, entityExtractor, chunkTokenizer )) val result = pipeline.fit(data).transform(data) result.show(truncate=true) result.select("entity", "chunk_token").as[(Array[Annotation], Array[Annotation])].foreach(column => { val chunks = column._1 val chunkTokens = column._2 chunkTokens.foreach{chunkToken => { val index = chunkToken.metadata("chunk").toInt require(chunks.apply(index).result.contains(chunkToken.result), s"because ${chunks(index)} does not contain ${chunkToken.result}") }} require(chunkTokens.flatMap(_.metadata.values).distinct.length == chunks.length, s"because amount of chunks ${chunks.length} does not equal to amount of token belongers") }) succeed } }
Example 110
Source File: Doc2ChunkTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.ml.Pipeline import org.scalatest._ class Doc2ChunkTestSpec extends FlatSpec { "a chunk assembler" should "correctly chunk ranges" in { import ResourceHelper.spark.implicits._ val sampleDataset = Seq[(String, String)]( ("Hello world, this is a sentence out of nowhere", "a sentence out"), ("Hey there, there is no chunk here", ""), ("Woah here, don't go so fast", "this is not there") ).toDF("sentence", "target") val answer = Array( Seq[Annotation](Annotation(AnnotatorType.CHUNK, 21, 34, "a sentence out", Map("sentence" -> "0", "chunk" -> "0"))), Seq.empty[Annotation], Seq.empty[Annotation] ) val documentAssembler = new DocumentAssembler().setInputCol("sentence").setOutputCol("document") val chunkAssembler = new Doc2Chunk().setInputCols("document").setChunkCol("target").setOutputCol("chunk") val pipeline = new Pipeline().setStages(Array(documentAssembler, chunkAssembler)) val results = pipeline.fit(Seq.empty[(String, String)].toDF("sentence", "target")) .transform(sampleDataset) .select( "chunk") .as[Seq[Annotation]] .collect() for ((a,b) <- results.zip(answer)) { assert(a == b) } } "a chunk assembler" should "correctly chunk array ranges" in { import ResourceHelper.spark.implicits._ val sampleDataset = Seq[(String, Seq[String])]( ("Hello world, this is a sentence out of nowhere", Seq("world", "out of nowhere")), ("Hey there, there is no chunk here", Seq.empty[String]), ("Woah here, don't go so fast", Seq[String]("this is not there", "so fast")) ).toDF("sentence", "target") val answer = Array( Seq[Annotation]( Annotation(AnnotatorType.CHUNK, 6, 10, "world", Map("sentence" -> "0", "chunk" -> "0")), Annotation(AnnotatorType.CHUNK, 32, 45, "out of nowhere", Map("sentence" -> "0", "chunk" -> "1")) ), Seq.empty[Annotation], Seq[Annotation]( Annotation(AnnotatorType.CHUNK, 20, 26, "so fast", Map("sentence" -> "0", "chunk" -> "1")) ) ) val documentAssembler = new DocumentAssembler().setInputCol("sentence").setOutputCol("document") val chunkAssembler = new Doc2Chunk().setIsArray(true).setInputCols("document").setChunkCol("target").setOutputCol("chunk") val pipeline = new Pipeline().setStages(Array(documentAssembler, chunkAssembler)) val results = pipeline.fit(Seq.empty[(String, Seq[String])].toDF("sentence", "target")) .transform(sampleDataset) .select( "chunk") .as[Seq[Annotation]] .collect() for ((a,b) <- results.zip(answer)) { assert(a == b) } } }
Example 111
Source File: ACMEModel.scala From cdsw-simple-serving with Apache License 2.0 | 5 votes |
// Don't execute these lines in the workbench -- skip to "Start workbench session" package acme import org.apache.spark.ml.PipelineModel import com.cloudera.datascience.cdsw.acme.ACMEData import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.ml.{Pipeline, PipelineModel} import scala.util.Random // Read and cache training data prepared from acme-dataeng: val training = ACMEData.readData() training.cache() training.show() // Build a logistic regression model, val assembler = new VectorAssembler(). setInputCols(training.columns.filter(_ != "Occupancy")). setOutputCol("featureVec") val lr = new LogisticRegression(). setFeaturesCol("featureVec"). setLabelCol("Occupancy"). setRawPredictionCol("rawPrediction") val pipeline = new Pipeline().setStages(Array(assembler, lr)) // and tune that model: val paramGrid = new ParamGridBuilder(). addGrid(lr.regParam, Seq(0.00001, 0.001, 0.1)). addGrid(lr.elasticNetParam, Seq(1.0)). build() val eval = new BinaryClassificationEvaluator(). setLabelCol("Occupancy"). setRawPredictionCol("rawPrediction") val validator = new TrainValidationSplit(). setSeed(Random.nextLong()). setEstimator(pipeline). setEvaluator(eval). setEstimatorParamMaps(paramGrid). setTrainRatio(0.9) val validatorModel = validator.fit(training) val pipelineModel = validatorModel.bestModel.asInstanceOf[PipelineModel] val lrModel = pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel] // Logistic regression model parameters: training.columns.zip(lrModel.coefficients.toArray).foreach(println) // Model hyperparameters: lrModel.getElasticNetParam lrModel.getRegParam // Validation metric (accuracy): validatorModel.validationMetrics.max pipelineModel // End workbench session } }
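If the tuned model above needs to be reused outside the workbench session, the best PipelineModel can be persisted like any other Spark ML model. A minimal sketch reusing pipelineModel and training from above; the save path and the reuse of training as stand-in scoring data are illustrative.

import org.apache.spark.ml.PipelineModel

// Persist the best pipeline selected by TrainValidationSplit
pipelineModel.write.overwrite().save("/tmp/acme-occupancy-model")

// Later, e.g. in a serving job: reload and score rows with the same schema as `training`
val servingModel = PipelineModel.load("/tmp/acme-occupancy-model")
servingModel.transform(training).select("rawPrediction", "prediction").show(5)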
Example 112
Source File: Featurize.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.featurize import com.microsoft.ml.spark.core.contracts.Wrappable import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.ml.{Estimator, Pipeline, PipelineModel} import org.apache.spark.sql._ import org.apache.spark.sql.types._ private[spark] object FeaturizeUtilities { // 2^18 features by default val NumFeaturesDefault = 262144 // 2^12 features for tree-based or NN-based learners val NumFeaturesTreeOrNNBased = 4096 } object Featurize extends DefaultParamsReadable[Featurize] override def fit(dataset: Dataset[_]): PipelineModel = { val pipeline = assembleFeaturesEstimators(getFeatureColumns) pipeline.fit(dataset) } private def assembleFeaturesEstimators(featureColumns: Map[String, Seq[String]]): Pipeline = { val assembleFeaturesEstimators = featureColumns.map(newColToFeatures => { new AssembleFeatures() .setColumnsToFeaturize(newColToFeatures._2.toArray) .setFeaturesCol(newColToFeatures._1) .setNumberOfFeatures(getNumberOfFeatures) .setOneHotEncodeCategoricals(getOneHotEncodeCategoricals) .setAllowImages(getAllowImages) }).toArray new Pipeline().setStages(assembleFeaturesEstimators) } override def copy(extra: ParamMap): Estimator[PipelineModel] = { new Featurize() } @DeveloperApi override def transformSchema(schema: StructType): StructType = assembleFeaturesEstimators(getFeatureColumns).transformSchema(schema) }
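The fragment above follows a common pattern: an Estimator whose fit builds an inner Pipeline of per-column stages and returns the fitted PipelineModel. A minimal sketch of the same pattern with plain Spark ML stages (an illustration, not part of MMLSpark):

import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.Dataset

// Assemble one StringIndexer per categorical column and fit them as a single pipeline
def indexColumns(dataset: Dataset[_], columns: Seq[String]): PipelineModel = {
  val stages: Array[PipelineStage] = columns.map { c =>
    new StringIndexer().setInputCol(c).setOutputCol(s"${c}_indexed")
  }.toArray
  new Pipeline().setStages(stages).fit(dataset)
}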
Example 113
Source File: RepartitionSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.test.base.TestBase import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.MLReadable class RepartitionSuite extends TestBase with TransformerFuzzing[Repartition] { import session.implicits._ lazy val input = Seq( (0, "guitars", "drums"), (1, "piano", "trumpet"), (2, "bass", "cymbals"), (3, "guitars", "drums"), (4, "piano", "trumpet"), (5, "bass", "cymbals"), (6, "guitars", "drums"), (7, "piano", "trumpet"), (8, "bass", "cymbals"), (9, "guitars", "drums"), (10, "piano", "trumpet"), (11, "bass", "cymbals") ).toDF("numbers", "words", "more") test("Work for several values of n") { def test(n: Int): Unit = { val result = new Repartition() .setN(n) .transform(input) assert(result.rdd.getNumPartitions == n) () } List(1, 2, 3, 10).foreach(test) } test("Should allow a user to set the partitions specifically in pipeline transform") { val r = new Repartition().setN(1) val pipe = new Pipeline().setStages(Array(r)) val fitPipe = pipe.fit(input) assert(fitPipe.transform(input).rdd.getNumPartitions==1) assert(fitPipe.transform(input, ParamMap(r.n->5)).rdd.getNumPartitions ==5) } def testObjects(): Seq[TestObject[Repartition]] = List(new TestObject( new Repartition().setN(1), input)) def reader: MLReadable[_] = Repartition }
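The ParamMap override shown for transform also works at fit time, since Pipeline.fit accepts an extra ParamMap applied to its stages for that call only. A minimal Spark-only sketch; wordsDf is a placeholder DataFrame with a "words" column, not part of the suite above.

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.param.ParamMap

val tf = new HashingTF().setInputCol("words").setOutputCol("tf").setNumFeatures(1 << 10)
val pipe = new Pipeline().setStages(Array(tf))

// Override numFeatures for this fit only; the stage keeps the 2^10 it was configured with
val model = pipe.fit(wordsDf, ParamMap(tf.numFeatures -> (1 << 14)))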
Example 114
Source File: TimerSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer} import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.DataFrame class TimerSuite extends EstimatorFuzzing[Timer] { lazy val df: DataFrame = session .createDataFrame(Seq((0, "Hi I"), (1, "I wish for snow today"), (2, "we Cant go to the park, because of the snow!"), (3, ""))) .toDF("label", "sentence") test("Work with transformers and estimators") { val tok = new Tokenizer() .setInputCol("sentence") .setOutputCol("tokens") val df2 = new Timer().setStage(tok).fit(df).transform(df) val df3 = new HashingTF().setInputCol("tokens").setOutputCol("hash").transform(df2) val idf = new IDF().setInputCol("hash").setOutputCol("idf") val df4 = new Timer().setStage(idf).fit(df3).transform(df3) } test("should work within pipelines") { val tok = new Tokenizer() .setInputCol("sentence") .setOutputCol("tokens") val ttok = new Timer().setStage(tok) val hash = new HashingTF().setInputCol("tokens").setOutputCol("hash") val idf = new IDF().setInputCol("hash").setOutputCol("idf") val tidf = new Timer().setStage(idf) val pipe = new Pipeline().setStages(Array(ttok, hash, tidf)) pipe.fit(df).transform(df) } test("should be able to turn off timing") { val tok = new Tokenizer() .setInputCol("sentence") .setOutputCol("tokens") val ttok = new Timer().setStage(tok) val hash = new HashingTF().setInputCol("tokens").setOutputCol("hash") val idf = new IDF().setInputCol("hash").setOutputCol("idf") val tidf = new Timer().setStage(idf) val pipe = new Pipeline().setStages(Array(ttok, hash, tidf)) val model = pipe.fit(df) println("Transforming") println(model.stages(0).params.foreach(println(_))) model.stages(0).asInstanceOf[TimerModel].setDisable(true) model.stages(2).asInstanceOf[TimerModel].setDisable(true) println("here") println(model.stages(0).getParam("disableMaterialization")) model.stages(0).params.foreach(p =>println("foo: " + p.toString)) model.transform(df) } val reader: MLReadable[_] = Timer val modelReader: MLReadable[_] = TimerModel override def testObjects(): Seq[TestObject[Timer]] = Seq(new TestObject[Timer]({ val tok = new Tokenizer() .setInputCol("sentence") .setOutputCol("tokens") new Timer().setStage(tok) }, df)) }
Example 115
Source File: IForestExample.scala From spark-iforest with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.ml import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.iforest.{IForest, IForestModel} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Row, SparkSession} object IForestExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder() .master("local") // test in local mode .appName("iforest example") .getOrCreate() val startTime = System.currentTimeMillis() // Dataset from https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original) val dataset = spark.read.option("inferSchema", "true") .csv("data/anomaly-detection/breastw.csv") // Index label values: 2 -> 0, 4 -> 1 val indexer = new StringIndexer() .setInputCol("_c10") .setOutputCol("label") val assembler = new VectorAssembler() assembler.setInputCols(Array("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7", "_c8", "_c9")) assembler.setOutputCol("features") val iForest = new IForest() .setNumTrees(100) .setMaxSamples(256) .setContamination(0.35) .setBootstrap(false) .setMaxDepth(100) .setSeed(123456L) val pipeline = new Pipeline().setStages(Array(indexer, assembler, iForest)) val model = pipeline.fit(dataset) val predictions = model.transform(dataset) // Save pipeline model model.write.overwrite().save("/tmp/iforest.model") // Load pipeline model val loadedPipelineModel = PipelineModel.load("/tmp/iforest.model") // Get loaded iforest model val loadedIforestModel = loadedPipelineModel.stages(2).asInstanceOf[IForestModel] println(s"The loaded iforest model has no summary: model.hasSummary = ${loadedIforestModel.hasSummary}") val binaryMetrics = new BinaryClassificationMetrics( predictions.select("prediction", "label").rdd.map { case Row(label: Double, ground: Double) => (label, ground) } ) val endTime = System.currentTimeMillis() println(s"Training and predicting time: ${(endTime - startTime) / 1000} seconds.") println(s"The model's auc: ${binaryMetrics.areaUnderROC()}") } } // scalastyle:on println
Example 116
Source File: MultilayerPerceptronClassifierWrapper.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier} import org.apache.spark.ml.feature.{IndexToString, RFormula} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} private[r] class MultilayerPerceptronClassifierWrapper private ( val pipeline: PipelineModel ) extends MLWritable { import MultilayerPerceptronClassifierWrapper._ val mlpModel: MultilayerPerceptronClassificationModel = pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel] val weights: Array[Double] = mlpModel.weights.toArray val layers: Array[Int] = mlpModel.layers def transform(dataset: Dataset[_]): DataFrame = { pipeline.transform(dataset) .drop(mlpModel.getFeaturesCol) .drop(mlpModel.getLabelCol) .drop(PREDICTED_LABEL_INDEX_COL) } override def read: MLReader[MultilayerPerceptronClassifierWrapper] = new MultilayerPerceptronClassifierWrapperReader override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path) class MultilayerPerceptronClassifierWrapperReader extends MLReader[MultilayerPerceptronClassifierWrapper]{ override def load(path: String): MultilayerPerceptronClassifierWrapper = { implicit val format = DefaultFormats val pipelinePath = new Path(path, "pipeline").toString val pipeline = PipelineModel.load(pipelinePath) new MultilayerPerceptronClassifierWrapper(pipeline) } } class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper) extends MLWriter { override protected def saveImpl(path: String): Unit = { val rMetadataPath = new Path(path, "rMetadata").toString val pipelinePath = new Path(path, "pipeline").toString val rMetadata = "class" -> instance.getClass.getName val rMetadataJson: String = compact(render(rMetadata)) sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath) instance.pipeline.save(pipelinePath) } } }
Example 117
Source File: SimpleTextClassificationPipeline.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.ml import scala.beans.BeanInfo import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.{Row, SQLContext} @BeanInfo case class LabeledDocument(id: Long, text: String, label: Double) @BeanInfo case class Document(id: Long, text: String) object SimpleTextClassificationPipeline { def main(args: Array[String]) { val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ // Prepare training documents, which are labeled. val training = sc.parallelize(Seq( LabeledDocument(0L, "a b c d e spark", 1.0), LabeledDocument(1L, "b d", 0.0), LabeledDocument(2L, "spark f g h", 1.0), LabeledDocument(3L, "hadoop mapreduce", 0.0))) // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. val tokenizer = new Tokenizer() .setInputCol("text") .setOutputCol("words") val hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol) .setOutputCol("features") val lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.001) val pipeline = new Pipeline() .setStages(Array(tokenizer, hashingTF, lr)) // Fit the pipeline to training documents. val model = pipeline.fit(training.toDF()) // Prepare test documents, which are unlabeled. val test = sc.parallelize(Seq( Document(4L, "spark i j k"), Document(5L, "l m n"), Document(6L, "spark hadoop spark"), Document(7L, "apache hadoop"))) // Make predictions on test documents. model.transform(test.toDF()) .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => println(s"($id, $text) --> prob=$prob, prediction=$prediction") } sc.stop() } }
Example 118
Source File: SparkXGBoostRegressorSuite.scala From sparkxgboost with Apache License 2.0 | 5 votes |
package rotationsymmetry.sxgboost import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorIndexer import org.apache.spark.ml.regression.DecisionTreeRegressor import org.scalatest.FunSuite import rotationsymmetry.sxgboost.loss.SquareLoss import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext import rotationsymmetry.sxgboost.utils.TestingUtils._ class SparkXGBoostRegressorSuite extends FunSuite with TestData with MLlibTestSparkContext { test("Compare with DecisionTree using simple data") { val data = sqlContext.createDataFrame(sc.parallelize(simpleData, 2)) val featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") .setMaxCategories(2) .fit(data) val sparkXGBoostRegressor = new SparkXGBoostRegressor(new SquareLoss) .setFeaturesCol("indexedFeatures") .setMaxDepth(1) .setNumTrees(1) val sparkXGBoostPipeline = new Pipeline() .setStages(Array(featureIndexer, sparkXGBoostRegressor)) val sXGBoostModel = sparkXGBoostPipeline.fit(data) val dt = new DecisionTreeRegressor() .setFeaturesCol("indexedFeatures") .setMaxDepth(1) val dtPipeLine = new Pipeline() .setStages(Array(featureIndexer, dt)) val dtModel = dtPipeLine.fit(data) val evaluator = new RegressionEvaluator() val sXGBoostrmse = evaluator.evaluate(sXGBoostModel.transform(data)) val dtrmse = evaluator.evaluate(dtModel.transform(data)) assert(sXGBoostrmse ~== dtrmse relTol 1e-5) } test("Compare with DecisionTree using random data") { val data = sqlContext.createDataFrame(randomLabelPointRDD(sc, 40, 10, 2, 999)) val featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") .setMaxCategories(2) .fit(data) val sparkXGBoostRegressor = new SparkXGBoostRegressor(new SquareLoss) .setFeaturesCol("indexedFeatures") .setMaxDepth(5) .setNumTrees(1) val sparkXGBoostPipeline = new Pipeline() .setStages(Array(featureIndexer, sparkXGBoostRegressor)) val sXGBoostModel = sparkXGBoostPipeline.fit(data) val dt = new DecisionTreeRegressor() .setFeaturesCol("indexedFeatures") .setMaxDepth(5) val dtPipeLine = new Pipeline() .setStages(Array(featureIndexer, dt)) val dtModel = dtPipeLine.fit(data) val evaluator = new RegressionEvaluator() val sXGBoostrmse = evaluator.evaluate(sXGBoostModel.transform(data)) val dtrmse = evaluator.evaluate(dtModel.transform(data)) assert(sXGBoostrmse ~== dtrmse relTol 1e-5) } }
Example 119
Source File: SparkXGBoostClassifierSuite.scala From sparkxgboost with Apache License 2.0 | 5 votes |
package rotationsymmetry.sxgboost import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.VectorIndexer import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql.functions.udf import org.scalatest.FunSuite import rotationsymmetry.sxgboost.loss.LogisticLoss import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext class SparkXGBoostClassifierSuite extends FunSuite with TestData with MLlibTestSparkContext { test("test with simple data") { val rawdata = Seq( LabeledPoint(0, Vectors.dense(0.0, 0.0)), LabeledPoint(0, Vectors.dense(0.0, 0.0)), LabeledPoint(1, Vectors.dense(0.0, 0.0)), LabeledPoint(1, Vectors.dense(1.0, 0.0)), LabeledPoint(1, Vectors.dense(1.0, 0.0)), LabeledPoint(0, Vectors.dense(1.0, 0.0)), LabeledPoint(1, Vectors.dense(0.0, 1.0)), LabeledPoint(1, Vectors.dense(0.0, 1.0)), LabeledPoint(0, Vectors.dense(0.0, 1.0)), LabeledPoint(0, Vectors.dense(1.0, 1.0)), LabeledPoint(0, Vectors.dense(1.0, 1.0)), LabeledPoint(1, Vectors.dense(1.0, 1.0)) ) val data = sqlContext.createDataFrame(sc.parallelize(rawdata, 2)) val truthUDF = udf { feature: Vector => if (feature(0) == feature(1)) 0.0 else 1.0 } val dataWithTruth = data.withColumn("truth", truthUDF(data("features"))) val featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") .setMaxCategories(2) .fit(data) val sparkXGBoostClassifier = new SparkXGBoostClassifier(new LogisticLoss) .setFeaturesCol("indexedFeatures") .setMaxDepth(2) .setNumTrees(1) val sparkXGBoostPipeline = new Pipeline() .setStages(Array(featureIndexer, sparkXGBoostClassifier)) val sXGBoostModel = sparkXGBoostPipeline.fit(data) val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("truth") .setPredictionCol("prediction") .setMetricName("precision") val precision = evaluator.evaluate(sXGBoostModel.transform(dataWithTruth)) assert(precision === 1.0) } }
Example 120
Source File: GradientBoostedTreeRegressorExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorIndexer import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor} // $example off$ import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{SQLContext, DataFrame} predictions.select("prediction", "label", "features").show(5) // Select (prediction, true label) and compute test error. val evaluator = new RegressionEvaluator() .setLabelCol("label") // label column name // prediction column name .setPredictionCol("prediction") // RMSE (root mean squared error) measures how dispersed the samples are .setMetricName("rmse") val rmse = evaluator.evaluate(predictions) // RMSE (root mean squared error) measures how dispersed the samples are println("Root Mean Squared Error (RMSE) on test data = " + rmse) val gbtModel = model.stages(1).asInstanceOf[GBTRegressionModel] println("Learned regression GBT model:\n" + gbtModel.toDebugString) // $example off$ sc.stop() } } // scalastyle:on println
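The listing above starts after the pipeline has already been fit, so data, model and predictions come from omitted code. Below is a minimal sketch of that omitted part, following the stock Spark GBT regression example; the libsvm path, split ratio and maxIter are illustrative, and it assumes the sc/sqlContext created in the omitted main method.

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.GBTRegressor
import org.apache.spark.mllib.util.MLUtils

// Illustrative reconstruction: load a libsvm dataset, index features, fit a GBT pipeline
import sqlContext.implicits._
val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

val featureIndexer = new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(4)
  .fit(data)

val gbt = new GBTRegressor()
  .setLabelCol("label")
  .setFeaturesCol("indexedFeatures")
  .setMaxIter(10)

val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
val pipeline = new Pipeline().setStages(Array(featureIndexer, gbt))
val model = pipeline.fit(trainingData)
val predictions = model.transform(testData)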
Example 121
Source File: RandomForestRegressorExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.VectorIndexer import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor} // $example off$ import org.apache.spark.sql.Row import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{SQLContext, DataFrame} predictions.select("prediction", "label", "features").show(5) // Select (prediction, true label) and compute test error. val evaluator = new RegressionEvaluator() .setLabelCol("label") // name of the column that stores the algorithm's predictions, "prediction" by default .setPredictionCol("prediction") // RMSE (root mean squared error) measures how dispersed the samples are .setMetricName("rmse") val rmse = evaluator.evaluate(predictions) //Root Mean Squared Error (RMSE) on test data = 0.09854713827168428 println("Root Mean Squared Error (RMSE) on test data = " + rmse) val rfModel = model.stages(1).asInstanceOf[RandomForestRegressionModel] println("Learned regression forest model:\n" + rfModel.toDebugString) // $example off$ sc.stop() } } // scalastyle:on println
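The same omission applies to this random-forest fragment; under the assumptions of the sketch after the GBT listing above, only the regressor stage changes:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.regression.RandomForestRegressor

// Illustrative stand-in for the omitted stage: a random forest instead of GBT
val rf = new RandomForestRegressor()
  .setLabelCol("label")
  .setFeaturesCol("indexedFeatures")

val pipeline = new Pipeline().setStages(Array(featureIndexer, rf))
val model = pipeline.fit(trainingData)
val predictions = model.transform(testData)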
Example 122
Source File: SparkRWrappers.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.api.r import org.apache.spark.ml.attribute._ import org.apache.spark.ml.feature.RFormula import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel} import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.sql.DataFrame private[r] object SparkRWrappers { def fitRModelFormula( value: String, df: DataFrame, family: String, lambda: Double, alpha: Double): PipelineModel = { val formula = new RFormula().setFormula(value) val estimator = family match { case "gaussian" => new LinearRegression() .setRegParam(lambda) .setElasticNetParam(alpha) .setFitIntercept(formula.hasIntercept) case "binomial" => new LogisticRegression() .setRegParam(lambda) .setElasticNetParam(alpha) .setFitIntercept(formula.hasIntercept) } val pipeline = new Pipeline().setStages(Array(formula, estimator)) pipeline.fit(df) } def getModelWeights(model: PipelineModel): Array[Double] = { model.stages.last match { case m: LinearRegressionModel => Array(m.intercept) ++ m.weights.toArray case _: LogisticRegressionModel => throw new UnsupportedOperationException( "No weights available for LogisticRegressionModel") // SPARK-9492 } } def getModelFeatures(model: PipelineModel): Array[String] = { model.stages.last match { case m: LinearRegressionModel => val attrs = AttributeGroup.fromStructField( m.summary.predictions.schema(m.summary.featuresCol)) Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get) case _: LogisticRegressionModel => throw new UnsupportedOperationException( "No features names available for LogisticRegressionModel") // SPARK-9492 } } }
Example 123
Source File: MultilayerPerceptronClassifierWrapper.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier} import org.apache.spark.ml.feature.{IndexToString, RFormula} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} private[r] class MultilayerPerceptronClassifierWrapper private ( val pipeline: PipelineModel ) extends MLWritable { import MultilayerPerceptronClassifierWrapper._ private val mlpModel: MultilayerPerceptronClassificationModel = pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel] lazy val weights: Array[Double] = mlpModel.weights.toArray lazy val layers: Array[Int] = mlpModel.layers def transform(dataset: Dataset[_]): DataFrame = { pipeline.transform(dataset) .drop(mlpModel.getFeaturesCol) .drop(mlpModel.getLabelCol) .drop(PREDICTED_LABEL_INDEX_COL) } override def read: MLReader[MultilayerPerceptronClassifierWrapper] = new MultilayerPerceptronClassifierWrapperReader override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path) class MultilayerPerceptronClassifierWrapperReader extends MLReader[MultilayerPerceptronClassifierWrapper]{ override def load(path: String): MultilayerPerceptronClassifierWrapper = { implicit val format = DefaultFormats val pipelinePath = new Path(path, "pipeline").toString val pipeline = PipelineModel.load(pipelinePath) new MultilayerPerceptronClassifierWrapper(pipeline) } } class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper) extends MLWriter { override protected def saveImpl(path: String): Unit = { val rMetadataPath = new Path(path, "rMetadata").toString val pipelinePath = new Path(path, "pipeline").toString val rMetadata = "class" -> instance.getClass.getName val rMetadataJson: String = compact(render(rMetadata)) sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath) instance.pipeline.save(pipelinePath) } } }
Example 124
import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer} import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import breeze.linalg._ import breeze.plot._ import org.jfree.chart.axis.NumberTickUnit object ROC extends App { val conf = new SparkConf().setAppName("ROC") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext._ import sqlContext.implicits._ val transformedTest = sqlContext.read.parquet("transformedTest.parquet") val labelScores = transformedTest.select("probability", "label").map { case Row(probability:Vector, label:Double) => (probability(1), label) } val bm = new BinaryClassificationMetrics(labelScores, 300) val roc = bm.roc.collect roc.foreach { println } val falsePositives = roc.map { _._1 } val truePositives = roc.map { _._2 } val f = Figure() val p = f.subplot(0) p += plot(falsePositives, truePositives) p.xlabel = "false positives" p.ylabel = "true positives" p.xlim = (0.0, 0.1) p.xaxis.setTickUnit(new NumberTickUnit(0.01)) p.yaxis.setTickUnit(new NumberTickUnit(0.1)) f.refresh f.saveas("roc.png") }
Example 125
Source File: LogisticRegressionDemo.scala From s4ds with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer} import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.SaveMode case class LabelledDocument(fileName:String, text:String, category:String) object LogisticRegressionDemo extends App { val conf = new SparkConf().setAppName("LrTest") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext._ import sqlContext.implicits._ val spamText = sc.wholeTextFiles("spam/*") val hamText = sc.wholeTextFiles("ham/*") val spamDocuments = spamText.map { case (fileName, text) => LabelledDocument(fileName, text, "spam") } val hamDocuments = hamText.map { case (fileName, text) => LabelledDocument(fileName, text, "ham") } val documentsDF = spamDocuments.union(hamDocuments).toDF documentsDF.persist val Array(trainDF, testDF) = documentsDF.randomSplit(Array(0.7, 0.3)) val indexer = new StringIndexer().setInputCol("category").setOutputCol("label") val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words") val hasher = new HashingTF().setInputCol("words").setOutputCol("features") val lr = new LogisticRegression().setMaxIter(50).setRegParam(0.0) val pipeline = new Pipeline().setStages(Array(indexer, tokenizer, hasher, lr)) val model = pipeline.fit(trainDF) val transformedTrain = model.transform(trainDF) transformedTrain.persist val transformedTest = model.transform(testDF) transformedTest.persist println("in sample misclassified:", transformedTrain.filter($"prediction" !== $"label").count, " / ",transformedTrain.count) println("out sample misclassified:", transformedTest.filter($"prediction" !== $"label").count, " / ",transformedTest.count) transformedTrain.select("fileName", "label", "prediction", "probability") .write.mode(SaveMode.Overwrite).parquet("transformedTrain.parquet") transformedTest.select("fileName", "label", "prediction", "probability") .write.mode(SaveMode.Overwrite).parquet("transformedTest.parquet") }
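The demo imports ParamGridBuilder and CrossValidator but fits a single configuration. Below is a minimal sketch of tuning the same pipeline, reusing pipeline, hasher, lr, trainDF and testDF from above; the grid values and fold count are illustrative.

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

// Search over hashing dimensionality and regularization strength
val paramGrid = new ParamGridBuilder()
  .addGrid(hasher.numFeatures, Array(1 << 16, 1 << 18))
  .addGrid(lr.regParam, Array(0.0, 0.01, 0.1))
  .build()

val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(new BinaryClassificationEvaluator().setLabelCol("label"))
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(3)

val cvModel = cv.fit(trainDF)
val tunedTest = cvModel.transform(testDF)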
Example 126
Source File: SimpleTextClassificationPipeline.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml import scala.beans.BeanInfo import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.{Row, SQLContext} @BeanInfo case class LabeledDocument(id: Long, text: String, label: Double) @BeanInfo case class Document(id: Long, text: String) object SimpleTextClassificationPipeline { def main(args: Array[String]) { val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ // Prepare training documents, which are labeled. val training = sc.parallelize(Seq( LabeledDocument(0L, "a b c d e spark", 1.0), LabeledDocument(1L, "b d", 0.0), LabeledDocument(2L, "spark f g h", 1.0), LabeledDocument(3L, "hadoop mapreduce", 0.0))) // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. val tokenizer = new Tokenizer() .setInputCol("text") .setOutputCol("words") val hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol) .setOutputCol("features") val lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.001) val pipeline = new Pipeline() .setStages(Array(tokenizer, hashingTF, lr)) // Fit the pipeline to training documents. val model = pipeline.fit(training.toDF()) // Prepare test documents, which are unlabeled. val test = sc.parallelize(Seq( Document(4L, "spark i j k"), Document(5L, "l m n"), Document(6L, "spark hadoop spark"), Document(7L, "apache hadoop"))) // Make predictions on test documents. model.transform(test.toDF()) .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => println(s"($id, $text) --> prob=$prob, prediction=$prediction") } sc.stop() } } // scalastyle:on println
Example 127
Source File: SparkRWrappers.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.api.r import org.apache.spark.ml.attribute._ import org.apache.spark.ml.feature.RFormula import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel} import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.sql.DataFrame private[r] object SparkRWrappers { def fitRModelFormula( value: String, df: DataFrame, family: String, lambda: Double, alpha: Double, standardize: Boolean, solver: String): PipelineModel = { val formula = new RFormula().setFormula(value) val estimator = family match { case "gaussian" => new LinearRegression() .setRegParam(lambda) .setElasticNetParam(alpha) .setFitIntercept(formula.hasIntercept) .setStandardization(standardize) .setSolver(solver) case "binomial" => new LogisticRegression() .setRegParam(lambda) .setElasticNetParam(alpha) .setFitIntercept(formula.hasIntercept) .setStandardization(standardize) } val pipeline = new Pipeline().setStages(Array(formula, estimator)) pipeline.fit(df) } def getModelCoefficients(model: PipelineModel): Array[Double] = { model.stages.last match { case m: LinearRegressionModel => { val coefficientStandardErrorsR = Array(m.summary.coefficientStandardErrors.last) ++ m.summary.coefficientStandardErrors.dropRight(1) val tValuesR = Array(m.summary.tValues.last) ++ m.summary.tValues.dropRight(1) val pValuesR = Array(m.summary.pValues.last) ++ m.summary.pValues.dropRight(1) if (m.getFitIntercept) { Array(m.intercept) ++ m.coefficients.toArray ++ coefficientStandardErrorsR ++ tValuesR ++ pValuesR } else { m.coefficients.toArray ++ coefficientStandardErrorsR ++ tValuesR ++ pValuesR } } case m: LogisticRegressionModel => { if (m.getFitIntercept) { Array(m.intercept) ++ m.coefficients.toArray } else { m.coefficients.toArray } } } } def getModelDevianceResiduals(model: PipelineModel): Array[Double] = { model.stages.last match { case m: LinearRegressionModel => m.summary.devianceResiduals case m: LogisticRegressionModel => throw new UnsupportedOperationException( "No deviance residuals available for LogisticRegressionModel") } } def getModelFeatures(model: PipelineModel): Array[String] = { model.stages.last match { case m: LinearRegressionModel => val attrs = AttributeGroup.fromStructField( m.summary.predictions.schema(m.summary.featuresCol)) if (m.getFitIntercept) { Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get) } else { attrs.attributes.get.map(_.name.get) } case m: LogisticRegressionModel => val attrs = AttributeGroup.fromStructField( m.summary.predictions.schema(m.summary.featuresCol)) if (m.getFitIntercept) { Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get) } else { attrs.attributes.get.map(_.name.get) } } } def getModelName(model: PipelineModel): String = { model.stages.last match { case m: LinearRegressionModel => "LinearRegressionModel" case m: LogisticRegressionModel => "LogisticRegressionModel" } } }
Example 128
Source File: OneHotEncoderDemo2.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter11.SparkMachineLearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

object OneHotEncoderDemo2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("OneHotEncoderDemo2")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, "Jason", "Germany"),
        (1, "David", "France"),
        (2, "Martin", "Spain"),
        (3, "Jason", "USA"),
        (4, "Daiel", "UK"),
        (5, "Moahmed", "Bangladesh"),
        (6, "David", "Ireland"),
        (7, "Jason", "Netherlands"))).toDF("id", "name", "address")
    df.show(false)

    // Index the string column, then one-hot encode the resulting indices.
    val indexer = new StringIndexer()
      .setInputCol("name")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")
    val encoded = encoder.transform(indexed)
    encoded.show()

    spark.stop()
  }
}
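Note that this transformer-style OneHotEncoder was deprecated in Spark 2.3 and removed in 3.0, where OneHotEncoder is itself an estimator that must be fit before transforming. A minimal sketch of the newer form, reusing the indexed DataFrame from the example (assumes Spark 3.x):

import org.apache.spark.ml.feature.OneHotEncoder

// Spark 3.x: OneHotEncoder is an Estimator; fit() learns the category sizes.
val encoder3 = new OneHotEncoder()
  .setInputCols(Array("categoryIndex"))
  .setOutputCols(Array("categoryVec"))
val encoded3 = encoder3.fit(indexed).transform(indexed)
encoded3.show(false)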
Example 129
Source File: StringIndexerDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter11.SparkMachineLearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

object StringIndexerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("StringIndexerDemo")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, "Jason", "Germany"),
        (1, "David", "France"),
        (2, "Martin", "Spain"),
        (3, "Jason", "USA"),
        (4, "Daiel", "UK"),
        (5, "Moahmed", "Bangladesh"),
        (6, "David", "Ireland"),
        (7, "Jason", "Netherlands"))).toDF("id", "name", "address")
    df.show(false)

    // Fit a StringIndexer on the "name" column and emit the index as "label".
    val indexer = new StringIndexer()
      .setInputCol("name")
      .setOutputCol("label")
      .fit(df)

    val indexed = indexer.transform(df)
    indexed.show(false)

    spark.stop()
  }
}
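StringIndexer assigns indices by descending label frequency, so the most common name receives index 0. To map the indices back to the original strings, IndexToString can be appended; a minimal sketch reusing indexer and indexed from the example above:

import org.apache.spark.ml.feature.IndexToString

// Reverse the mapping learned by the StringIndexerModel.
val converter = new IndexToString()
  .setInputCol("label")
  .setOutputCol("originalName")
  .setLabels(indexer.labels)
converter.transform(indexed).select("name", "label", "originalName").show(false)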
Example 130
Source File: TokenizerWithNGram.scala From Hands-On-Deep-Learning-with-Apache-Spark with MIT License | 5 votes |
package org.googlielmo.sparknlpbench

import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.util.Benchmark
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.NGram
import org.apache.spark.sql.SparkSession

object TokenizerWithNGram {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession
      .builder()
      .appName("Tokenize with n-gram example")
      .master("local[*]")
      .config("spark.driver.memory", "1G")
      .config("spark.kryoserializer.buffer.max", "200M")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate()

    import sparkSession.implicits._
    sparkSession.sparkContext.setLogLevel("WARN")

    val document = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    val token = new Tokenizer()
      .setInputCols("document")
      .setOutputCol("token")

    val normalizer = new Normalizer()
      .setInputCols("token")
      .setOutputCol("normal")

    val finisher = new Finisher()
      .setInputCols("normal")

    val ngram = new NGram()
      .setN(3)
      .setInputCol("finished_normal")
      .setOutputCol("3-gram")

    val gramAssembler = new DocumentAssembler()
      .setInputCol("3-gram")
      .setOutputCol("3-grams")

    val pipeline = new Pipeline().setStages(Array(document, token, normalizer, finisher, ngram, gramAssembler))

    val testing = Seq(
      (1, "Packt is a famous publishing company"),
      (2, "Guglielmo is an author")
    ).toDS.toDF("_id", "text")

    val result = pipeline.fit(Seq.empty[String].toDS.toDF("text")).transform(testing)
    Benchmark.time("Time to convert and show") { result.show(truncate = false) }

    sparkSession.stop()
  }
}
Example 131
Source File: TrainViveknSentiment.scala From Hands-On-Deep-Learning-with-Apache-Spark with MIT License | 5 votes |
package org.googlielmo.sparknlpbench

import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.util.Benchmark
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession

object TrainViveknSentiment {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder
      .appName("Train Vivek N Sentiment Analysis")
      .master("local[*]")
      .config("spark.driver.memory", "2G")
      .config("spark.kryoserializer.buffer.max", "200M")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate

    spark.sparkContext.setLogLevel("WARN")

    import spark.implicits._

    val training = Seq(
      ("I really liked it!", "positive"),
      ("The cast is horrible", "negative"),
      ("Never going to watch this again or recommend it", "negative"),
      ("It's a waste of time", "negative"),
      ("I loved the main character", "positive"),
      ("The soundtrack was really good", "positive")
    ).toDS.toDF("train_text", "train_sentiment")

    val testing = Array(
      "I don't recommend this movie, it's horrible",
      "Dont waste your time!!!"
    )

    val document = new DocumentAssembler()
      .setInputCol("train_text")
      .setOutputCol("document")

    val token = new Tokenizer()
      .setInputCols("document")
      .setOutputCol("token")

    val normalizer = new Normalizer()
      .setInputCols("token")
      .setOutputCol("normal")

    val vivekn = new ViveknSentimentApproach()
      .setInputCols("document", "normal")
      .setOutputCol("result_sentiment")
      .setSentimentCol("train_sentiment")

    val finisher = new Finisher()
      .setInputCols("result_sentiment")
      .setOutputCols("final_sentiment")

    val pipeline = new Pipeline().setStages(Array(document, token, normalizer, vivekn, finisher))

    val sparkPipeline = pipeline.fit(training)

    //val lightPipeline = new LightPipeline(sparkPipeline)
    //Benchmark.time("Light pipeline quick annotation") { lightPipeline.annotate(testing) }

    Benchmark.time("Spark pipeline, this may be too much for just two rows!") {
      val testingDS = testing.toSeq.toDS.toDF("testing_text")
      println("Updating DocumentAssembler input column")
      document.setInputCol("testing_text")
      sparkPipeline.transform(testingDS).show()
    }
  }
}
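The commented-out LightPipeline lines hint at a faster path for tiny inputs: the fitted stages run locally on the driver instead of launching a distributed job. A minimal sketch of that variant, using the same sparkPipeline and testing array as above:

import com.johnsnowlabs.nlp.LightPipeline

// LightPipeline annotates plain strings on the driver, avoiding DataFrame overhead
// for a handful of documents.
val lightPipeline = new LightPipeline(sparkPipeline)
Benchmark.time("Light pipeline quick annotation") {
  lightPipeline.annotate(testing).foreach(println)
}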
Example 132
Source File: NerDLPipeline.scala From Hands-On-Deep-Learning-with-Apache-Spark with MIT License | 5 votes |
package org.googlielmo.sparknlpbench

import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.annotators.ner.NerConverter
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.util.Benchmark
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession

object NerDLPipeline {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession
      .builder()
      .appName("Ner DL Pipeline")
      .master("local[*]")
      .getOrCreate()

    import sparkSession.implicits._
    sparkSession.sparkContext.setLogLevel("WARN")

    val document = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    val token = new Tokenizer()
      .setInputCols("document")
      .setOutputCol("token")

    val normalizer = new Normalizer()
      .setInputCols("token")
      .setOutputCol("normal")

    val ner = NerDLModel.pretrained()
      .setInputCols("normal", "document")
      .setOutputCol("ner")

    val nerConverter = new NerConverter()
      .setInputCols("document", "normal", "ner")
      .setOutputCol("ner_converter")

    val finisher = new Finisher()
      .setInputCols("ner", "ner_converter")
      .setIncludeMetadata(true)
      .setOutputAsArray(false)
      .setCleanAnnotations(false)
      .setAnnotationSplitSymbol("@")
      .setValueSplitSymbol("#")

    val pipeline = new Pipeline().setStages(Array(document, token, normalizer, ner, nerConverter, finisher))

    val testing = Seq(
      (1, "Packt is a famous publishing company"),
      (2, "Guglielmo is an author")
    ).toDS.toDF("_id", "text")

    val result = pipeline.fit(Seq.empty[String].toDS.toDF("text")).transform(testing)
    Benchmark.time("Time to convert and show") { result.select("ner", "ner_converter").show(truncate = false) }

    sparkSession.stop()
  }
}
Example 133
Source File: GBTLRExample.scala From spark-gbtlr with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.ml

import org.apache.spark.ml.gbtlr.GBTLRClassifier
import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession

// scalastyle:off println
object GBTLRExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[2]")
      .appName("gbtlr example")
      .getOrCreate()

    val startTime = System.currentTimeMillis()

    val dataset = spark.read.option("header", "true").option("inferSchema", "true")
      .option("delimiter", ";").csv("data/bank/bank-full.csv")

    val columnNames = Array("job", "marital", "education",
      "default", "housing", "loan", "contact", "month", "poutcome", "y")
    val indexers = columnNames.map(name => new StringIndexer()
      .setInputCol(name).setOutputCol(name + "_index"))
    val pipeline = new Pipeline().setStages(indexers)
    val data1 = pipeline.fit(dataset).transform(dataset)
    val data2 = data1.withColumnRenamed("y_index", "label")

    val assembler = new VectorAssembler()
    assembler.setInputCols(Array("age", "job_index", "marital_index",
      "education_index", "default_index", "balance", "housing_index",
      "loan_index", "contact_index", "day", "month_index", "duration",
      "campaign", "pdays", "previous", "poutcome_index"))
    assembler.setOutputCol("features")

    val data3 = assembler.transform(data2)
    val data4 = data3.randomSplit(Array(4, 1))

    val gBTLRClassifier = new GBTLRClassifier()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setGBTMaxIter(10)
      .setLRMaxIter(100)
      .setRegParam(0.01)
      .setElasticNetParam(0.5)

    val model = gBTLRClassifier.fit(data4(0))
    val summary = model.evaluate(data4(1))
    val endTime = System.currentTimeMillis()
    val auc = summary.binaryLogisticRegressionSummary
      .asInstanceOf[BinaryLogisticRegressionSummary].areaUnderROC

    println(s"Training and evaluating cost ${(endTime - startTime) / 1000} seconds")
    println(s"The model's auc: ${auc}")
  }
}
// scalastyle:on println
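randomSplit normalizes weights that do not sum to one, so Array(4, 1) above is roughly an 80/20 train/test split. A minimal sketch of an equivalent, reproducible split on the same data3 DataFrame (the variable names and seed are illustrative):

// Equivalent 80/20 split; passing a seed makes the partition deterministic across runs.
val Array(trainData, testData) = data3.randomSplit(Array(0.8, 0.2), seed = 42L)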
Example 134
Source File: Word2Vector.scala From xgbspark-text-classification with Apache License 2.0 | 5 votes |
package com.lenovo.ml

import org.apache.spark.sql.SparkSession
import DataPreprocess.segWords
import org.apache.spark.ml.feature._
import org.apache.spark.ml.Pipeline

object Word2Vector {
  def main(args: Array[String]): Unit = {
    // 1. Create the Spark entry point
    val sparkSession = SparkSession.builder().appName("Word2Vector").enableHiveSupport().getOrCreate()

    // 2. Read the training data, then preprocess the text and segment it into words
    val tableName = args(0)
    val matrix = sparkSession.sql("SELECT text FROM " + tableName + " where text is not null")
    val words = segWords(sparkSession, args(1), args(2), args(3), args(4), matrix).repartition(6).cache()

    // 3. Prepare the data
    val tokenizer = new RegexTokenizer().setInputCol("words").setOutputCol("wordsArray")
    val remover = new StopWordsRemover().setInputCol("wordsArray").setOutputCol("filteredWords")

    // 4. Train the Word2Vec model
    val word2Vec = new Word2Vec().setInputCol("filteredWords").setOutputCol("features").setStepSize(0.025).setNumPartitions(1)
      .setMaxIter(1).setMaxSentenceLength(1000).setWindowSize(5).setVectorSize(args(5).toInt).setMinCount(10).setSeed(12345L)
    val pipeline = new Pipeline().setStages(Array(tokenizer, remover, word2Vec))
    val Word2VecModel = pipeline.fit(words)

    // 5. Save the model
    Word2VecModel.write.save(args(6))

    sparkSession.stop()
  }
}
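To score new text later, the saved pipeline can be reloaded with PipelineModel.load. A minimal sketch, assuming a hypothetical save path and a DataFrame newWords with the same words column used during training:

import org.apache.spark.ml.PipelineModel

// Reload the fitted tokenizer -> stop-word remover -> Word2Vec pipeline and
// produce "features" vectors for previously unseen text.
val savedModel = PipelineModel.load("/path/to/word2vec-pipeline")
savedModel.transform(newWords).select("words", "features").show(false)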
Example 135
Source File: PipelineExampleTest.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.ml

import com.github.dnvriend.TestSpec
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.Row

class PipelineExampleTest extends TestSpec {

  it should "PipelineExample" in withSparkSession { spark =>
    import spark.implicits._

    // Prepare training documents from a list of (id, text, label) tuples.
    val training = Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    ).toDF("id", "text", "label")

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.01)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training)

    // Now we can optionally save the fitted pipeline to disk
    model.write.overwrite().save("/tmp/spark-logistic-regression-model")

    // We can also save this unfit pipeline to disk
    pipeline.write.overwrite().save("/tmp/unfit-lr-model")

    // And load it back in during production
    val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model")

    // Prepare test documents, which are unlabeled (id, text) tuples.
    val test = Seq(
      (4L, "spark i j k"),
      (5L, "l m n"),
      (6L, "mapreduce spark"),
      (7L, "apache hadoop"),
      (8L, "spark f g h"),
      (9L, "d e f spark a b c"),
      (10L, "spark baz bar a b c"),
      (11L, "foo bar a b c spark"),
      (12L, "a b c scala d e f"),
      (13L, "spark mapreduce")
    ).toDF("id", "text")

    // Make predictions on test documents.
    model.transform(test)
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }
  }
}
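The reloaded sameModel is never exercised above, so a short check that it scores the same test set is a natural follow-up; a minimal sketch reusing sameModel and test from the example:

// Predictions from the reloaded PipelineModel should match those of the in-memory model.
sameModel.transform(test)
  .select("id", "text", "prediction")
  .show(false)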
Example 136
Source File: MyPipeLine.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter4

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession
import org.apache.log4j.{Level, Logger}

object MyPipeLine {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("My PipeLine")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val trainset = spark.createDataFrame(Seq(
      (1L, 1, "spark rocks"),
      (2L, 0, "flink is the best"),
      (3L, 1, "Spark rules"),
      (4L, 0, "mapreduce forever"),
      (5L, 0, "Kafka is great")
    )).toDF("id", "label", "words")

    val tokenizer = new Tokenizer()
      .setInputCol("words")
      .setOutputCol("tokens")

    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")

    val lr = new LogisticRegression()
      .setMaxIter(15)
      .setRegParam(0.01)

    // three stage pipeline
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(trainset)

    val testSet = spark.createDataFrame(Seq(
      (10L, 1, "use spark please"),
      (11L, 2, "Kafka")
    )).toDF("id", "label", "words")

    model.transform(testSet).select("probability", "prediction").show(false)

    spark.stop()
  }
}
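To score the fit rather than only print predictions, a BinaryClassificationEvaluator can be run over the transformed training set; a minimal sketch reusing model and trainset (evaluated on the training data purely for illustration, with the integer label cast to double first):

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.sql.functions.col

// Area under the ROC curve, computed from the rawPrediction column the pipeline adds.
val scored = model.transform(trainset.withColumn("label", col("label").cast("double")))
val evaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
println(s"Training AUC: ${evaluator.evaluate(scored)}")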