org.apache.spark.ml.feature.StringIndexer Scala Examples
The following examples show how to use org.apache.spark.ml.feature.StringIndexer.
Each example is taken from an open-source project; the project name, source file, and license are noted above each example.
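For orientation before the full examples: StringIndexer is an Estimator that maps a string column to a column of label indices, ordered by label frequency (the most frequent label gets index 0.0). The sketch below is a minimal, self-contained illustration; the app name, column names, and local-mode setup are illustrative and not taken from any of the projects listed here.

import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.SparkSession

object StringIndexerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("StringIndexerSketch")
      .master("local[*]")
      .getOrCreate()

    // A toy DataFrame with a string column to index
    val df = spark.createDataFrame(Seq(
      (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")
    )).toDF("id", "category")

    // fit() learns the label-to-index mapping; transform() adds the numeric column
    val indexed = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
      .transform(df)

    indexed.show() // "a" -> 0.0 (most frequent), "c" -> 1.0, "b" -> 2.0

    spark.stop()
  }
}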
Example 1
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 6 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object GradientBoostedTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val gbt = new GBTClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxIter(10)

    stages += vectorAssembler
    stages += gbt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
                      regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 2
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 6 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 3
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object GradientBoostedTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val gbt = new GBTClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxIter(10)

    stages += vectorAssembler
    stages += gbt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
                      regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 4
Source File: IndexToStringExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.attribute.Attribute
import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
// $example off$
import org.apache.spark.sql.SparkSession

object IndexToStringExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("IndexToStringExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (0, "a"),
      (1, "b"),
      (2, "c"),
      (3, "a"),
      (4, "a"),
      (5, "c")
    )).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    println(s"Transformed string column '${indexer.getInputCol}' " +
      s"to indexed column '${indexer.getOutputCol}'")
    indexed.show()

    val inputColSchema = indexed.schema(indexer.getOutputCol)
    println(s"StringIndexer will store labels in output column metadata: " +
      s"${Attribute.fromStructField(inputColSchema).toString}\n")

    val converter = new IndexToString()
      .setInputCol("categoryIndex")
      .setOutputCol("originalCategory")

    val converted = converter.transform(indexed)

    println(s"Transformed indexed column '${converter.getInputCol}' back to original string " +
      s"column '${converter.getOutputCol}' using labels in metadata")
    converted.select("id", "categoryIndex", "originalCategory").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 5
Source File: IForestExample.scala From spark-iforest with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.ml

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.iforest.{IForest, IForestModel}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.{Row, SparkSession}

object IForestExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local") // test in local mode
      .appName("iforest example")
      .getOrCreate()

    val startTime = System.currentTimeMillis()

    // Dataset from https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)
    val dataset = spark.read.option("inferSchema", "true")
      .csv("data/anomaly-detection/breastw.csv")

    // Index label values: 2 -> 0, 4 -> 1
    val indexer = new StringIndexer()
      .setInputCol("_c10")
      .setOutputCol("label")

    val assembler = new VectorAssembler()
    assembler.setInputCols(Array("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7", "_c8", "_c9"))
    assembler.setOutputCol("features")

    val iForest = new IForest()
      .setNumTrees(100)
      .setMaxSamples(256)
      .setContamination(0.35)
      .setBootstrap(false)
      .setMaxDepth(100)
      .setSeed(123456L)

    val pipeline = new Pipeline().setStages(Array(indexer, assembler, iForest))
    val model = pipeline.fit(dataset)
    val predictions = model.transform(dataset)

    // Save pipeline model
    model.write.overwrite().save("/tmp/iforest.model")

    // Load pipeline model
    val loadedPipelineModel = PipelineModel.load("/tmp/iforest.model")
    // Get loaded iforest model
    val loadedIforestModel = loadedPipelineModel.stages(2).asInstanceOf[IForestModel]
    println(s"The loaded iforest model has no summary: model.hasSummary = ${loadedIforestModel.hasSummary}")

    val binaryMetrics = new BinaryClassificationMetrics(
      predictions.select("prediction", "label").rdd.map {
        case Row(label: Double, ground: Double) => (label, ground)
      }
    )

    val endTime = System.currentTimeMillis()
    println(s"Training and predicting time: ${(endTime - startTime) / 1000} seconds.")
    println(s"The model's auc: ${binaryMetrics.areaUnderROC()}")
  }
}
// scalastyle:on println
Example 6
Source File: MultiColumnAdapterSpec.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.feature.{StringIndexer, Tokenizer}
import org.apache.spark.ml.util.MLReadable

import scala.collection.mutable

class MultiColumnAdapterSpec extends TestBase with EstimatorFuzzing[MultiColumnAdapter] {

  lazy val wordDF = session.createDataFrame(Seq(
    (0, "This is a test", "this is one too"),
    (1, "could be a test", "bar"),
    (2, "foo", "bar"),
    (3, "foo", "maybe not")))
    .toDF("label", "words1", "words2")
  lazy val inputCols = Array[String]("words1", "words2")
  lazy val outputCols = Array[String]("output1", "output2")
  lazy val stage = new StringIndexer()
  lazy val adaptedEstimator = new MultiColumnAdapter().setBaseStage(stage)
    .setInputCols(inputCols).setOutputCols(outputCols)

  test("parallelize transformers") {
    val stage1 = new Tokenizer()
    val transformer = new MultiColumnAdapter().setBaseStage(stage1)
      .setInputCols(inputCols).setOutputCols(outputCols)
    val tokenizedDF = transformer.fit(wordDF).transform(wordDF)
    val lines = tokenizedDF.getColAs[Array[String]]("output2")
    val trueLines = Array(
      Array("this", "is", "one", "too"),
      Array("bar"),
      Array("bar"),
      Array("maybe", "not")
    )
    assert(lines === trueLines)
  }

  test("parallelize estimator") {
    val stringIndexedDF = adaptedEstimator.fit(wordDF).transform(wordDF)
    val lines1 = stringIndexedDF.getColAs[Array[String]]("output1")
    val trueLines1 = mutable.ArraySeq(1, 2, 0, 0)
    assert(lines1 === trueLines1)

    val lines2 = stringIndexedDF.getColAs[Array[String]]("output2")
    val trueLines2 = mutable.ArraySeq(1, 0, 0, 2)
    assert(lines2 === trueLines2)
  }

  def testObjects(): Seq[TestObject[MultiColumnAdapter]] =
    List(new TestObject(adaptedEstimator, wordDF))

  override def reader: MLReadable[_] = MultiColumnAdapter

  override def modelReader: MLReadable[_] = PipelineModel
}
Example 7
Source File: Preprocessor.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions

import config.paramconf.PreprocessParams
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF, StopWordsRemover, StringIndexer}
import org.apache.spark.sql.DataFrame

  def preprocess(data: DataFrame): Pipeline = {
    val spark = data.sparkSession
    val params = new PreprocessParams

    val indexModel = new StringIndexer()
      .setHandleInvalid(params.handleInvalid)
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(data)

    val cleaner = new Cleaner()
      .setFanJian(params.fanjian)
      .setQuanBan(params.quanban)
      .setMinLineLen(params.minLineLen)
      .setInputCol("content")
      .setOutputCol("cleand")

    val segmenter = new Segmenter()
      .isAddNature(params.addNature)
      .isDelEn(params.delEn)
      .isDelNum(params.delNum)
      .isNatureFilter(params.natureFilter)
      .setMinTermLen(params.minTermLen)
      .setMinTermNum(params.minTermNum)
      .setSegType(params.segmentType)
      .setInputCol(cleaner.getOutputCol)
      .setOutputCol("segmented")

    val stopwords = spark.sparkContext.textFile(params.stopwordFilePath).collect()
    val remover = new StopWordsRemover()
      .setStopWords(stopwords)
      .setInputCol(segmenter.getOutputCol)
      .setOutputCol("removed")

    val vectorizer = new CountVectorizer()
      .setMinTF(params.minTF)
      .setVocabSize(params.vocabSize)
      .setInputCol(remover.getOutputCol)
      .setOutputCol("vectorized")

    val idf = new IDF()
      .setMinDocFreq(params.minDocFreq)
      .setInputCol(vectorizer.getOutputCol)
      .setOutputCol("features")

    val stages = Array(cleaner, indexModel, segmenter, remover, vectorizer, idf)
    new Pipeline().setStages(stages)
  }
}
Example 8
Source File: OneHotEncoderExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
// $example off$
import org.apache.spark.sql.SparkSession

object OneHotEncoderExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("OneHotEncoderExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (0, "a"),
      (1, "b"),
      (2, "c"),
      (3, "a"),
      (4, "a"),
      (5, "c")
    )).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")

    val encoded = encoder.transform(indexed)
    encoded.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 9
Source File: StringIndexerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StringIndexer
// $example off$
import org.apache.spark.sql.SparkSession

object StringIndexerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StringIndexerExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(
      Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
    ).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")

    val indexed = indexer.fit(df).transform(df)
    indexed.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 10
Source File: IndexToStringExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.attribute.Attribute
import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
// $example off$
import org.apache.spark.sql.SparkSession

object IndexToStringExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("IndexToStringExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (0, "a"),
      (1, "b"),
      (2, "c"),
      (3, "a"),
      (4, "a"),
      (5, "c")
    )).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    println(s"Transformed string column '${indexer.getInputCol}' " +
      s"to indexed column '${indexer.getOutputCol}'")
    indexed.show()

    val inputColSchema = indexed.schema(indexer.getOutputCol)
    println(s"StringIndexer will store labels in output column metadata: " +
      s"${Attribute.fromStructField(inputColSchema).toString}\n")

    val converter = new IndexToString()
      .setInputCol("categoryIndex")
      .setOutputCol("originalCategory")

    val converted = converter.transform(indexed)

    println(s"Transformed indexed column '${converter.getInputCol}' back to original string " +
      s"column '${converter.getOutputCol}' using labels in metadata")
    converted.select("id", "categoryIndex", "originalCategory").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 11
Source File: OneHotEncoderExample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.regression.bikesharing

import org.apache.spark.sql.SparkSession

object OneHotEncoderExample {

  def main(args: Array[String]): Unit = {
    import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}

    val spark = SparkSession
      .builder()
      .appName("Spark SQL basic example").master("local[1]")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()

    // For implicit conversions like converting RDDs to DataFrames
    val df = spark.createDataFrame(Seq(
      (0, 3),
      (1, 2),
      (2, 4),
      (3, 3),
      (4, 3),
      (5, 4)
    )).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")

    val encoded = encoder.transform(indexed)
    encoded.select("id", "categoryVec").show()
  }
}
Example 12
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val nb = new NaiveBayes()

    stages += vectorAssembler
    stages += nb
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 13
Source File: StringIndexerExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StringIndexer
// $example off$
import org.apache.spark.sql.SparkSession

object StringIndexerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StringIndexerExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(
      Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
    ).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")

    val indexed = indexer.fit(df).transform(df)
    indexed.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 14
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += dt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 15
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
Example 16
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val nb = new NaiveBayes()

    stages += vectorAssembler
    stages += nb
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/NB.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/NaiveBayes.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
                      regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 17
Source File: GradientBoostedTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object GradientBoostedTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def gradientBoostedTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val gbt = new GBTClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxIter(10)

    stages += vectorAssembler
    stages += gbt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/GBT.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/GBT.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
                      regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 18
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += dt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/DT.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/DecisionTree.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
                      regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 19
Source File: RandomForestPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.mllib.evaluation.{MulticlassMetrics, RegressionMetrics}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable

object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction", "label")

    // have to do a type conversion for RegressionMetrics
    val rm = new RegressionMetrics(holdout.rdd.map(x =>
      (x(0).asInstanceOf[Double], x(1).asInstanceOf[Double])))

    logger.info("Test Metrics")
    logger.info("Test Explained Variance:")
    logger.info(rm.explainedVariance)
    logger.info("Test R^2 Coef:")
    logger.info(rm.r2)
    logger.info("Test MSE:")
    logger.info(rm.meanSquaredError)
    logger.info("Test RMSE:")
    logger.info(rm.rootMeanSquaredError)

    val predictions = model.transform(test).select("prediction").rdd.map(_.getDouble(0))
    val labels = model.transform(test).select("label").rdd.map(_.getDouble(0))
    val accuracy = new MulticlassMetrics(predictions.zip(labels)).precision
    println(s" Accuracy : $accuracy")

    holdout.rdd.map(x => x(0).asInstanceOf[Double]).repartition(1)
      .saveAsTextFile("/home/ubuntu/work/ml-resources/spark-ml/results/RF.xls")

    savePredictions(holdout, test, rm, "/home/ubuntu/work/ml-resources/spark-ml/results/RandomForest.csv")
  }

  def savePredictions(predictions: DataFrame, testRaw: DataFrame,
                      regressionMetrics: RegressionMetrics, filePath: String) = {
    predictions
      .coalesce(1)
      .write.format("com.databricks.spark.csv")
      .option("header", "true")
      .save(filePath)
  }
}
Example 20
Source File: LogisticRegressionDemo.scala From s4ds with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.SaveMode

case class LabelledDocument(fileName: String, text: String, category: String)

object LogisticRegressionDemo extends App {

  val conf = new SparkConf().setAppName("LrTest")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  import sqlContext._
  import sqlContext.implicits._

  val spamText = sc.wholeTextFiles("spam/*")
  val hamText = sc.wholeTextFiles("ham/*")

  val spamDocuments = spamText.map {
    case (fileName, text) => LabelledDocument(fileName, text, "spam")
  }
  val hamDocuments = hamText.map {
    case (fileName, text) => LabelledDocument(fileName, text, "ham")
  }

  val documentsDF = spamDocuments.union(hamDocuments).toDF
  documentsDF.persist

  val Array(trainDF, testDF) = documentsDF.randomSplit(Array(0.7, 0.3))

  val indexer = new StringIndexer().setInputCol("category").setOutputCol("label")
  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
  val hasher = new HashingTF().setInputCol("words").setOutputCol("features")
  val lr = new LogisticRegression().setMaxIter(50).setRegParam(0.0)

  val pipeline = new Pipeline().setStages(Array(indexer, tokenizer, hasher, lr))
  val model = pipeline.fit(trainDF)

  val transformedTrain = model.transform(trainDF)
  transformedTrain.persist

  val transformedTest = model.transform(testDF)
  transformedTest.persist

  println("in sample misclassified:", transformedTrain.filter($"prediction" !== $"label").count,
    " / ", transformedTrain.count)
  println("out sample misclassified:", transformedTest.filter($"prediction" !== $"label").count,
    " / ", transformedTest.count)

  transformedTrain.select("fileName", "label", "prediction", "probability")
    .write.mode(SaveMode.Overwrite).parquet("transformedTrain.parquet")
  transformedTest.select("fileName", "label", "prediction", "probability")
    .write.mode(SaveMode.Overwrite).parquet("transformedTest.parquet")
}
Example 21
Source File: IndexToStringExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.attribute.Attribute
import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
// $example off$
import org.apache.spark.sql.SparkSession

object IndexToStringExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("IndexToStringExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (0, "a"),
      (1, "b"),
      (2, "c"),
      (3, "a"),
      (4, "a"),
      (5, "c")
    )).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    println(s"Transformed string column '${indexer.getInputCol}' " +
      s"to indexed column '${indexer.getOutputCol}'")
    indexed.show()

    val inputColSchema = indexed.schema(indexer.getOutputCol)
    println(s"StringIndexer will store labels in output column metadata: " +
      s"${Attribute.fromStructField(inputColSchema).toString}\n")

    val converter = new IndexToString()
      .setInputCol("categoryIndex")
      .setOutputCol("originalCategory")

    val converted = converter.transform(indexed)

    println(s"Transformed indexed column '${converter.getInputCol}' back to original string " +
      s"column '${converter.getOutputCol}' using labels in metadata")
    converted.select("id", "categoryIndex", "originalCategory").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 22
Source File: GBTLRExample.scala From spark-gbtlr with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.ml

import org.apache.spark.ml.gbtlr.GBTLRClassifier
import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession

// scalastyle:off println

object GBTLRExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[2]")
      .appName("gbtlr example")
      .getOrCreate()

    val startTime = System.currentTimeMillis()

    val dataset = spark.read.option("header", "true").option("inferSchema", "true")
      .option("delimiter", ";").csv("data/bank/bank-full.csv")

    val columnNames = Array("job", "marital", "education",
      "default", "housing", "loan", "contact", "month", "poutcome", "y")
    val indexers = columnNames.map(name => new StringIndexer()
      .setInputCol(name).setOutputCol(name + "_index"))
    val pipeline = new Pipeline().setStages(indexers)
    val data1 = pipeline.fit(dataset).transform(dataset)
    val data2 = data1.withColumnRenamed("y_index", "label")

    val assembler = new VectorAssembler()
    assembler.setInputCols(Array("age", "job_index", "marital_index",
      "education_index", "default_index", "balance", "housing_index",
      "loan_index", "contact_index", "day", "month_index", "duration",
      "campaign", "pdays", "previous", "poutcome_index"))
    assembler.setOutputCol("features")

    val data3 = assembler.transform(data2)
    val data4 = data3.randomSplit(Array(4, 1))

    val gBTLRClassifier = new GBTLRClassifier()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setGBTMaxIter(10)
      .setLRMaxIter(100)
      .setRegParam(0.01)
      .setElasticNetParam(0.5)

    val model = gBTLRClassifier.fit(data4(0))
    val summary = model.evaluate(data4(1))
    val endTime = System.currentTimeMillis()
    val auc = summary.binaryLogisticRegressionSummary
      .asInstanceOf[BinaryLogisticRegressionSummary].areaUnderROC

    println(s"Training and evaluating cost ${(endTime - startTime) / 1000} seconds")
    println(s"The model's auc: ${auc}")
  }
}
// scalastyle:on println
Example 23
Source File: TitanicLogisticRegression.scala From spark-spec with MIT License | 5 votes |
package com.github.mrpowers.spark.spec.ml.classification

import com.github.mrpowers.spark.spec.sql.SparkSessionWrapper
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.DataFrame

object TitanicLogisticRegression extends SparkSessionWrapper {

  def withVectorizedFeatures(
    featureColNames: Array[String] = Array("Gender", "Age", "SibSp", "Parch", "Fare"),
    outputColName: String = "features"
  )(df: DataFrame): DataFrame = {
    val assembler: VectorAssembler = new VectorAssembler()
      .setInputCols(featureColNames)
      .setOutputCol(outputColName)
    assembler.transform(df)
  }

  def withLabel(
    inputColName: String = "Survived",
    outputColName: String = "label"
  )(df: DataFrame) = {
    val labelIndexer: StringIndexer = new StringIndexer()
      .setInputCol(inputColName)
      .setOutputCol(outputColName)

    labelIndexer
      .fit(df)
      .transform(df)
  }

  def model(df: DataFrame = TitanicData.trainingDF()): LogisticRegressionModel = {
    val trainFeatures: DataFrame = df
      .transform(withVectorizedFeatures())
      .transform(withLabel())
      .select("features", "label") // only uses the features and label columns

    new LogisticRegression()
      .fit(trainFeatures)
  }

  def persistModel(): Unit = {
    model().save("./tmp/titanic_model/")
  }
}
Example 24
Source File: StringIndexerDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter11.SparkMachineLearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{ OneHotEncoder, StringIndexer }
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.functions.year
import org.apache.spark.ml.{ Pipeline, PipelineStage }
import org.apache.spark.ml.classification.{ LogisticRegression, LogisticRegressionModel }
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.{ DataFrame, SparkSession }
import scala.collection.mutable
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql._
import org.apache.spark.sql.SQLContext

object StringIndexerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, "Jason", "Germany"),
        (1, "David", "France"),
        (2, "Martin", "Spain"),
        (3, "Jason", "USA"),
        (4, "Daiel", "UK"),
        (5, "Moahmed", "Bangladesh"),
        (6, "David", "Ireland"),
        (7, "Jason", "Netherlands"))).toDF("id", "name", "address")
    df.show(false)

    val indexer = new StringIndexer()
      .setInputCol("name")
      .setOutputCol("label")
      .fit(df)

    val indexed = indexer.transform(df)
    indexed.show(false)

    spark.stop()
  }
}
Example 25
Source File: OneHotEncoderDemo2.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter11.SparkMachineLearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{ OneHotEncoder, StringIndexer }
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.functions.year
import org.apache.spark.ml.{ Pipeline, PipelineStage }
import org.apache.spark.ml.classification.{ LogisticRegression, LogisticRegressionModel }
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.{ DataFrame, SparkSession }
import scala.collection.mutable
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

object OneHotEncoderDemo2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, "Jason", "Germany"),
        (1, "David", "France"),
        (2, "Martin", "Spain"),
        (3, "Jason", "USA"),
        (4, "Daiel", "UK"),
        (5, "Moahmed", "Bangladesh"),
        (6, "David", "Ireland"),
        (7, "Jason", "Netherlands"))).toDF("id", "name", "address")
    df.show(false)

    val indexer = new StringIndexer()
      .setInputCol("name")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")

    val encoded = encoder.transform(indexed)
    encoded.show()

    spark.stop()
  }
}
Example 26
Source File: OneHotEncoderExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object OneHotEncoderExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("OneHotEncoderExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val df = sqlContext.createDataFrame(Seq(
      (0, "a"),
      (1, "b"),
      (2, "c"),
      (3, "a"),
      (4, "a"),
      (5, "c")
    )).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")

    val encoded = encoder.transform(indexed)
    encoded.select("id", "categoryVec").show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 27
Source File: StringIndexerExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StringIndexer
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object StringIndexerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StringIndexerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val df = sqlContext.createDataFrame(
      Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
    ).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")

    val indexed = indexer.fit(df).transform(df)
    indexed.show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 28
Source File: IndexToStringExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.ml.feature.{StringIndexer, IndexToString}
// $example off$

object IndexToStringExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("IndexToStringExample")
    val sc = new SparkContext(conf)
    val sqlContext = SQLContext.getOrCreate(sc)

    // $example on$
    val df = sqlContext.createDataFrame(Seq(
      (0, "a"),
      (1, "b"),
      (2, "c"),
      (3, "a"),
      (4, "a"),
      (5, "c")
    )).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val converter = new IndexToString()
      .setInputCol("categoryIndex")
      .setOutputCol("originalCategory")

    val converted = converter.transform(indexed)
    converted.select("id", "originalCategory").show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 29
Source File: StringIndexerModelSuite.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase}
import com.opendatagroup.hadrian.errors.PFAUserException
import org.apache.spark.SparkException
import org.apache.spark.ml.feature.StringIndexer

class StringIndexerModelSuite extends SparkFeaturePFASuiteBase[StringIndexerResult] {

  import spark.implicits._

  val df = Seq(
    (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")
  ).toDF("id", "category")

  val testHandleInvalidDF = Seq(
    (0, "a"), (1, "b"), (2, "c"), (3, "d"), (4, "e"), (5, "c")
  ).toDF("id", "category")

  val indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex")

  override val sparkTransformer = indexer.fit(df)

  val result = sparkTransformer.transform(df)
  val sparkOutput = result.select(indexer.getOutputCol).toDF()

  override val input = result.select(indexer.getInputCol).toJSON.collect()
  override val expectedOutput = sparkOutput.toJSON.collect()

  // Additional test for handleInvalid
  test("StringIndexer with handleInvalid=keep") {
    val sparkTransformer = indexer.setHandleInvalid("keep").fit(df)
    val result = sparkTransformer.transform(testHandleInvalidDF)
    val input = testHandleInvalidDF.select(indexer.getInputCol).toJSON.collect()
    val expectedOutput = result.select(indexer.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("StringIndexer with handleInvalid=error") {
    val sparkTransformer = indexer.setHandleInvalid("error").fit(df)
    intercept[SparkException] {
      val result = sparkTransformer.transform(testHandleInvalidDF)
      result.foreach(_ => Unit)
    }
    intercept[PFAUserException] {
      val input = testHandleInvalidDF.select(indexer.getInputCol).toJSON.collect()
      // we transform on df here to avoid Spark throwing the error and to ensure we match
      // the sizes of expected input / output. The error should be thrown before the comparison
      // would fail
      val expectedOutput = sparkTransformer.transform(df).select(indexer.getOutputCol).toJSON.collect()
      parityTest(sparkTransformer, input, expectedOutput)
    }
  }
}

case class StringIndexerResult(categoryIndex: Double) extends Result
Example 30
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

import breeze.linalg._
import breeze.plot._
import org.jfree.chart.axis.NumberTickUnit

object ROC extends App {

  val conf = new SparkConf().setAppName("ROC")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  import sqlContext._
  import sqlContext.implicits._

  val transformedTest = sqlContext.read.parquet("transformedTest.parquet")

  val labelScores = transformedTest.select("probability", "label").map {
    case Row(probability: Vector, label: Double) => (probability(1), label)
  }

  val bm = new BinaryClassificationMetrics(labelScores, 300)

  val roc = bm.roc.collect
  roc.foreach { println }

  val falsePositives = roc.map { _._1 }
  val truePositives = roc.map { _._2 }

  val f = Figure()
  val p = f.subplot(0)
  p += plot(falsePositives, truePositives)
  p.xlabel = "false positives"
  p.ylabel = "true positives"
  p.xlim = (0.0, 0.1)
  p.xaxis.setTickUnit(new NumberTickUnit(0.01))
  p.yaxis.setTickUnit(new NumberTickUnit(0.1))
  f.refresh
  f.saveas("roc.png")
}
Example 31
Source File: MLTestSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util

import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.Row

class MLTestSuite extends MLTest {

  import testImplicits._

  test("test transformer on stream data") {

    val data = Seq((0, "a"), (1, "b"), (2, "c"), (3, "d"), (4, "e"), (5, "f"))
      .toDF("id", "label")
    val indexer = new StringIndexer().setStringOrderType("alphabetAsc")
      .setInputCol("label").setOutputCol("indexed")
    val indexerModel = indexer.fit(data)

    testTransformer[(Int, String)](data, indexerModel, "id", "indexed") {
      case Row(id: Int, indexed: Double) =>
        assert(id === indexed.toInt)
    }
    testTransformerByGlobalCheckFunc[(Int, String)] (data, indexerModel, "id", "indexed") { rows =>
      assert(rows.map(_.getDouble(1)).max === 5.0)
    }

    intercept[Exception] {
      testTransformerOnStreamData[(Int, String)](data, indexerModel, "id", "indexed") {
        case Row(id: Int, indexed: Double) =>
          assert(id != indexed.toInt)
      }
    }
    intercept[Exception] {
      testTransformerOnStreamData[(Int, String)](data, indexerModel, "id", "indexed") {
        rows: Seq[Row] =>
          assert(rows.map(_.getDouble(1)).max === 1.0)
      }
    }
  }
}
Example 32
Source File: StringIndexerExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StringIndexer
// $example off$
import org.apache.spark.sql.SparkSession

object StringIndexerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StringIndexerExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(
      Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
    ).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")

    val indexed = indexer.fit(df).transform(df)
    indexed.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 33
Source File: IndexToStringExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.attribute.Attribute
import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
// $example off$
import org.apache.spark.sql.SparkSession

object IndexToStringExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("IndexToStringExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (0, "a"),
      (1, "b"),
      (2, "c"),
      (3, "a"),
      (4, "a"),
      (5, "c")
    )).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    println(s"Transformed string column '${indexer.getInputCol}' " +
      s"to indexed column '${indexer.getOutputCol}'")
    indexed.show()

    val inputColSchema = indexed.schema(indexer.getOutputCol)
    println(s"StringIndexer will store labels in output column metadata: " +
      s"${Attribute.fromStructField(inputColSchema).toString}\n")

    val converter = new IndexToString()
      .setInputCol("categoryIndex")
      .setOutputCol("originalCategory")

    val converted = converter.transform(indexed)

    println(s"Transformed indexed column '${converter.getInputCol}' back to original string " +
      s"column '${converter.getOutputCol}' using labels in metadata")
    converted.select("id", "categoryIndex", "originalCategory").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 34
Source File: StringIndexerExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.StringIndexer // $example off$ import org.apache.spark.sql.Row import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{SQLContext, DataFrame} object StringIndexerExample { def main(args: Array[String]) { // Assumed setup (elided in the snippet): create the contexts and the example DataFrame used below val conf = new SparkConf().setAppName("StringIndexerExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) val df = sqlContext.createDataFrame( Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")) ).toDF("id", "category") val indexer = new StringIndexer() .setInputCol("category") .setOutputCol("categoryIndex") // fit() turns the DataFrame into a Transformer (a fitted StringIndexerModel) // transform() turns one DataFrame into another DataFrame with the indexed column appended val indexed = indexer.fit(df).transform(df) indexed.show() // $example off$ sc.stop() } } // scalastyle:on println
Example 35
Source File: OpStringIndexer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op.UID import com.salesforce.op.features.types._ import com.salesforce.op.stages.impl.feature.{StringIndexerHandleInvalid => Inv} import com.salesforce.op.stages.sparkwrappers.specific.OpEstimatorWrapper import enumeratum._ import org.apache.spark.ml.feature.{StringIndexer, StringIndexerModel} import scala.reflect.runtime.universe.TypeTag def setHandleInvalid(value: StringIndexerHandleInvalid): this.type = { require(Seq(Inv.Skip, Inv.Error, Inv.Keep).contains(value), "OpStringIndexer only supports Skip, Error, and Keep for handle invalid") getSparkMlStage().get.setHandleInvalid(value.entryName.toLowerCase) this } } sealed trait StringIndexerHandleInvalid extends EnumEntry with Serializable object StringIndexerHandleInvalid extends Enum[StringIndexerHandleInvalid] { val values = findValues case object Skip extends StringIndexerHandleInvalid case object Error extends StringIndexerHandleInvalid case object Keep extends StringIndexerHandleInvalid case object NoFilter extends StringIndexerHandleInvalid }
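OpStringIndexer restricts handleInvalid to Skip, Error and Keep because those are the modes the wrapped Spark StringIndexer understands. A minimal sketch of what each mode does in plain Spark ML (trainDf and testDf are assumed placeholder DataFrames with a "category" column):

import org.apache.spark.ml.feature.StringIndexer

val model = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")
  .setHandleInvalid("keep") // "error" (default) fails on unseen labels, "skip" drops those rows,
                            // "keep" assigns them the extra index equal to the number of known labels
  .fit(trainDf)
model.transform(testDf).show()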
Example 36
Source File: TypedStringIndexer.scala From frameless with Apache License 2.0 | 5 votes |
package frameless package ml package feature import frameless.ml.feature.TypedStringIndexer.HandleInvalid import frameless.ml.internals.UnaryInputsChecker import org.apache.spark.ml.feature.{StringIndexer, StringIndexerModel} final class TypedStringIndexer[Inputs] private[ml](stringIndexer: StringIndexer, inputCol: String) extends TypedEstimator[Inputs, TypedStringIndexer.Outputs, StringIndexerModel] { val estimator: StringIndexer = stringIndexer .setInputCol(inputCol) .setOutputCol(AppendTransformer.tempColumnName) def setHandleInvalid(value: HandleInvalid): TypedStringIndexer[Inputs] = copy(stringIndexer.setHandleInvalid(value.sparkValue)) private def copy(newStringIndexer: StringIndexer): TypedStringIndexer[Inputs] = new TypedStringIndexer[Inputs](newStringIndexer, inputCol) } object TypedStringIndexer { case class Outputs(indexedOutput: Double) sealed abstract class HandleInvalid(val sparkValue: String) object HandleInvalid { case object Error extends HandleInvalid("error") case object Skip extends HandleInvalid("skip") case object Keep extends HandleInvalid("keep") } def apply[Inputs](implicit inputsChecker: UnaryInputsChecker[Inputs, String]): TypedStringIndexer[Inputs] = { new TypedStringIndexer[Inputs](new StringIndexer(), inputsChecker.inputCol) } }
Example 37
Source File: OneHotEncoderExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer} // $example off$ import org.apache.spark.sql.SparkSession object OneHotEncoderExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("OneHotEncoderExample") .getOrCreate() // $example on$ val df = spark.createDataFrame(Seq( (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c") )).toDF("id", "category") val indexer = new StringIndexer() .setInputCol("category") .setOutputCol("categoryIndex") .fit(df) val indexed = indexer.transform(df) val encoder = new OneHotEncoder() .setInputCol("categoryIndex") .setOutputCol("categoryVec") val encoded = encoder.transform(indexed) encoded.show() // $example off$ spark.stop() } } // scalastyle:on println
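By default OneHotEncoder drops the last category, so the three categories above become two-element vectors. A minimal sketch of keeping every category, reusing the indexed DataFrame from this example:

val fullEncoder = new OneHotEncoder()
  .setInputCol("categoryIndex")
  .setOutputCol("categoryVecFull")
  .setDropLast(false) // one vector slot per category instead of dropping the last one
fullEncoder.transform(indexed).show()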
Example 38
Source File: TrainValidationSplitParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.validation import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.regression.RandomForestRegressor import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.sql.DataFrame class TrainValidationSplitParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = { val regressor = new RandomForestRegressor(). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction") val paramGrid = new ParamGridBuilder() .addGrid(regressor.numTrees, Array(2, 3, 4)) .build() new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new TrainValidationSplit(). setEvaluator(new RegressionEvaluator(). setLabelCol("loan_amount"). setPredictionCol("prediction")). setEstimator(regressor). setEstimatorParamMaps(paramGrid))).fit(dataset) } override val ignoreSerializationTest = true }
Example 39
Source File: OneVsRestParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest} import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql.DataFrame class OneVsRestParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new OneVsRest().setClassifier(new LogisticRegression()). setLabelCol("fico_index"). setFeaturesCol("features"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "classifier", "labelCol") }
Example 40
Source File: DecisionTreeClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class DecisionTreeClassifierParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new StringIndexer(). setInputCol("approved"). setOutputCol("label"), new DecisionTreeClassifier(). setThresholds(Array(0.4)). setFeaturesCol("features"). setLabelCol("label"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "labelCol", "seed") }
Example 41
Source File: GaussianMixtureParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.clustering import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.clustering.GaussianMixture import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class GaussianMixtureParitySpec extends SparkParityBase { override val dataset: DataFrame = { baseDataset.select("dti", "loan_amount", "fico_score_group_fnl") } override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new GaussianMixture(). setFeaturesCol("features"). setPredictionCol("prediction"). setProbabilityCol("probability"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "k", "maxIter", "seed", "tol") }
Example 42
Source File: KMeansParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.clustering import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.clustering.KMeans import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class KMeansParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "fico_score_group_fnl") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new KMeans(). setFeaturesCol("features"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "initMode", "initSteps", "maxIter", "tol", "k", "seed") }
Example 43
Source File: BisectingKMeansParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.clustering import org.apache.spark.ml.clustering.BisectingKMeans import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class BisectingKMeansParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "fico_score_group_fnl") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new BisectingKMeans(). setFeaturesCol("features"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "k", "maxIter", "seed", "minDivisibleClusterSize") }
Example 44
Source File: VectorIndexerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, VectorIndexer} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class VectorIndexerParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("dti", "loan_amount", "state") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("state"). setOutputCol("state_index"), new VectorAssembler(). setInputCols(Array("dti", "loan_amount", "state_index")). setOutputCol("features"), new VectorIndexer(). setInputCol("features"). setOutputCol("scaled_features"))).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 45
Source File: ReverseStringIndexerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature import org.apache.spark.ml.feature.{IndexToString, StringIndexer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class ReverseStringIndexerParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("state") override val sparkTransformer: Transformer = { val stringIndexer = new StringIndexer(). setInputCol("state"). setOutputCol("state_index"). fit(dataset) val reverseStringIndexer = new IndexToString(). setInputCol("state_index"). setOutputCol("state_reverse"). setLabels(stringIndexer.labels) new Pipeline().setStages(Array(stringIndexer, reverseStringIndexer)).fit(dataset) } override val unserializedParams = Set("stringOrderType") }
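Calling setLabels with stringIndexer.labels is one option; IndexToString can also recover the labels from the metadata that StringIndexerModel writes on the indexed column, as the IndexToString example earlier prints. A minimal sketch of that metadata-based variant, under the assumption that "state_index" was produced by the fitted indexer above:

val reverseFromMetadata = new IndexToString()
  .setInputCol("state_index")
  .setOutputCol("state_reverse")
// no setLabels call: the labels come from the ML attribute metadata on "state_index"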
Example 46
Source File: OneHotEncoderParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class OneHotEncoderParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("state") override val sparkTransformer: Transformer = new Pipeline() .setStages(Array( new StringIndexer().setInputCol("state").setOutputCol("state_index"), new StringIndexer().setInputCol("state").setOutputCol("state_index2"), new OneHotEncoderEstimator() .setInputCols(Array("state_index", "state_index2")) .setOutputCols(Array("state_oh", "state_oh2")) )) .fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 47
Source File: LogisticRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame import org.apache.spark.ml.linalg.Vectors class LogisticRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new LogisticRegressionModel(uid = "logr", coefficients = Vectors.dense(0.44, 0.77), intercept = 0.66).setThreshold(0.7).setFeaturesCol("features"))).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 48
Source File: CrossValidatorParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.validation import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.regression.{DecisionTreeRegressor, RandomForestRegressor} import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} import org.apache.spark.sql.DataFrame class CrossValidatorParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = { val regressor = new RandomForestRegressor(). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction") val paramGrid = new ParamGridBuilder() .addGrid(regressor.numTrees, Array(2, 3, 4)) .build() new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new CrossValidator(). setEvaluator(new RegressionEvaluator(). setLabelCol("loan_amount"). setPredictionCol("prediction")). setEstimator(regressor). setEstimatorParamMaps(paramGrid))).fit(dataset) } override val ignoreSerializationTest = true }
Example 49
Source File: LinearSVCParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification.parity import org.apache.spark.ml.classification.LinearSVCModel import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class LinearSVCParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti") override val sparkTransformer: Transformer = new Pipeline() .setStages(Array( new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new LinearSVCModel("linear_svc", Vectors.dense(0.44, 0.77), 0.66).setThreshold(0.5).setFeaturesCol("features"))) .fit(dataset) // The string order type is ignored, because once the transformer is built based on some order type, we need to serialize only the string to index map // but not the order in which it has to index. This value we can ignore while we check the transformer values. override val unserializedParams: Set[String] = Set("stringOrderType") }
Example 50
Source File: PipelineConstruction.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.ChrunPrediction import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} object PipelineConstruction { // Index labels, adding metadata to the label column. Fit on whole dataset to include all labels in index. val ipindexer = new StringIndexer() .setInputCol("international_plan") .setOutputCol("iplanIndex") val labelindexer = new StringIndexer() .setInputCol("churn") .setOutputCol("label") val featureCols = Array("account_length", "iplanIndex", "num_voice_mail", "total_day_mins", "total_day_calls", "total_evening_mins", "total_evening_calls", "total_night_mins", "total_night_calls", "total_international_mins", "total_international_calls", "total_international_num_calls") val assembler = new VectorAssembler() .setInputCols(featureCols) .setOutputCol("features") }
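The object above only declares the stages. A minimal sketch, not part of the original file, of how they might be chained with a classifier into a Pipeline; DecisionTreeClassifier and trainDF are assumed placeholders for whichever classifier and training DataFrame the project uses:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.DecisionTreeClassifier

val dt = new DecisionTreeClassifier()
  .setLabelCol("label")
  .setFeaturesCol("features")
val pipeline = new Pipeline().setStages(
  Array(PipelineConstruction.ipindexer, PipelineConstruction.labelindexer, PipelineConstruction.assembler, dt))
// val model = pipeline.fit(trainDF) // trainDF: the churn training DataFrame (assumed)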
Example 51
Source File: Preproessing.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML import org.apache.spark.ml.feature.{ StringIndexer, StringIndexerModel} import org.apache.spark.ml.feature.VectorAssembler object Preproessing { var trainSample = 1.0 var testSample = 1.0 val train = "data/insurance_train.csv" val test = "data/insurance_test.csv" val spark = SparkSessionCreate.createSession() import spark.implicits._ println("Reading data from " + train + " file") val trainInput = spark.read .option("header", "true") .option("inferSchema", "true") .format("com.databricks.spark.csv") .load(train) .cache val testInput = spark.read .option("header", "true") .option("inferSchema", "true") .format("com.databricks.spark.csv") .load(test) .cache println("Preparing data for training model") var data = trainInput.withColumnRenamed("loss", "label").sample(false, trainSample) var DF = data.na.drop() // Null check if (data == DF) println("No null values in the DataFrame") else { println("Null values exist in the DataFrame") data = DF } val seed = 12345L val splits = data.randomSplit(Array(0.75, 0.25), seed) val (trainingData, validationData) = (splits(0), splits(1)) trainingData.cache validationData.cache val testData = testInput.sample(false, testSample).cache def isCateg(c: String): Boolean = c.startsWith("cat") def categNewCol(c: String): String = if (isCateg(c)) s"idx_${c}" else c // Function to remove categorical columns with too many categories def removeTooManyCategs(c: String): Boolean = !(c matches "cat(109$|110$|112$|113$|116$)") // Function to select only feature columns (omit id and label) def onlyFeatureCols(c: String): Boolean = !(c matches "id|label") // Definitive set of feature columns val featureCols = trainingData.columns .filter(removeTooManyCategs) .filter(onlyFeatureCols) .map(categNewCol) // StringIndexer for categorical columns (OneHotEncoder should be evaluated as well) val stringIndexerStages = trainingData.columns.filter(isCateg) .map(c => new StringIndexer() .setInputCol(c) .setOutputCol(categNewCol(c)) .fit(trainInput.select(c).union(testInput.select(c)))) // VectorAssembler for training features val assembler = new VectorAssembler() .setInputCols(featureCols) .setOutputCol("features") }
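Two notes on the code above. First, the null check if (data == DF) compares DataFrame references rather than contents, so the "no null values" branch effectively never runs; comparing data.count() with DF.count() is a more reliable check. Second, the indexer stages and assembler are only the feature side; a minimal sketch (an assumption, not taken from the book) of chaining them with a regressor, using GBTRegressor purely for illustration:

import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.regression.GBTRegressor

val gbt = new GBTRegressor()
  .setLabelCol("label")
  .setFeaturesCol("features")
val stages: Array[PipelineStage] = Preproessing.stringIndexerStages ++ Array(Preproessing.assembler, gbt)
val pipeline = new Pipeline().setStages(stages)
// val model = pipeline.fit(Preproessing.trainingData)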
Example 52
Source File: StringIndexerWrapper.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature.preprocess import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne} import com.tencent.angel.spark.automl.feature.TransformerWrapper import org.apache.spark.ml.feature.StringIndexer class StringIndexerWrapper extends TransformerWrapper { override val transformer = new StringIndexer() override var parent: TransformerWrapper = _ override val requiredInputCols: Array[String] = Array("words") override val requiredOutputCols: Array[String] = Array("outStringIndexer") override val hasMultiInputs: Boolean = false override val hasMultiOutputs: Boolean = false override val needAncestorInputs: Boolean = false override val relation: InToOutRelation = OneToOne override def declareInAndOut(): this.type = { // The wrapped transformer is a StringIndexer, so cast to StringIndexer (casting to Tokenizer, as the original did, fails at runtime) transformer.asInstanceOf[StringIndexer].setInputCol(getInputCols(0)) transformer.asInstanceOf[StringIndexer].setOutputCol(getOutputCols(0)) this } }
Example 53
Source File: OneHotEncoderExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer} // $example off$ import org.apache.spark.sql.SparkSession object OneHotEncoderExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("OneHotEncoderExample") .getOrCreate() // $example on$ val df = spark.createDataFrame(Seq( (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c") )).toDF("id", "category") val indexer = new StringIndexer() .setInputCol("category") .setOutputCol("categoryIndex") .fit(df) val indexed = indexer.transform(df) val encoder = new OneHotEncoder() .setInputCol("categoryIndex") .setOutputCol("categoryVec") val encoded = encoder.transform(indexed) encoded.show() // $example off$ spark.stop() } } // scalastyle:on println
Example 54
Source File: StringIndexerExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.StringIndexer // $example off$ import org.apache.spark.sql.SparkSession object StringIndexerExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("StringIndexerExample") .getOrCreate() // $example on$ val df = spark.createDataFrame( Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")) ).toDF("id", "category") val indexer = new StringIndexer() .setInputCol("category") .setOutputCol("categoryIndex") val indexed = indexer.fit(df).transform(df) indexed.show() // $example off$ spark.stop() } } // scalastyle:on println
Example 55
Source File: DecisionTreeRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.regression.DecisionTreeRegressor import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql._ class DecisionTreeRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new DecisionTreeRegressor(). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "labelCol", "seed") }
Example 56
Source File: DecisionTreePipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.DecisionTreeClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object DecisionTreePipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val dt = new DecisionTreeClassifier() .setFeaturesCol(vectorAssembler.getOutputCol) .setLabelCol("indexedLabel") .setMaxDepth(5) .setMaxBins(32) .setMinInstancesPerNode(1) .setMinInfoGain(0.0) .setCacheNodeIds(false) .setCheckpointInterval(10) stages += vectorAssembler stages += dt val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 57
Source File: SupportVectorMachineParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.classification import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.mleap.classification.SVMModel import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.mllib import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql._ class SupportVectorMachineParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new SVMModel(uid = "svm", model = new mllib.classification.SVMModel(weights = Vectors.dense(0.53, 0.67), intercept = 0.77)). setRawPredictionCol("raw_prediction"). setProbabilityCol("probability"))).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 58
Source File: MathBinaryParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature import ml.combust.mleap.core.feature.BinaryOperation.Multiply import ml.combust.mleap.core.feature.MathBinaryModel import org.apache.spark.ml.feature.StringIndexer import org.apache.spark.ml.mleap.feature.MathBinary import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class MathBinaryParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new MathBinary(uid = "math_bin", model = MathBinaryModel(Multiply)). setInputA("fico_index"). setInputB("dti"). setOutputCol("bin_out") )).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 59
Source File: MultinomialLabelerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature import ml.combust.mleap.core.feature.{MultinomialLabelerModel, ReverseStringIndexerModel} import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.mleap.feature.MultinomialLabeler import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class MultinomialLabelerParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new MultinomialLabeler(uid = "multinomial_labeler", model = MultinomialLabelerModel(threshold = 0.1, indexer = ReverseStringIndexerModel(Seq("fico", "dtizy")))). setFeaturesCol("features"). setProbabilitiesCol("probabilities"). setLabelsCol("labels"))).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 60
Source File: MathUnaryParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.feature import ml.combust.mleap.core.feature.MathUnaryModel import ml.combust.mleap.core.feature.UnaryOperation.Tan import org.apache.spark.ml.feature.StringIndexer import org.apache.spark.ml.mleap.feature.MathUnary import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class MathUnaryParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new MathUnary(uid = "math_unary", model = MathUnaryModel(Tan)). setInputCol("dti"). setOutputCol("dti_tan") )).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 61
Source File: TestSparkMl.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.databricks.runtime.testkit import java.io.File import java.nio.file.{Files, StandardCopyOption} import ml.combust.bundle.BundleFile import org.apache.spark.ml.bundle.SparkBundleContext import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.sql.SparkSession import com.databricks.spark.avro._ import ml.combust.mleap.spark.SparkSupport._ import ml.combust.mleap.runtime.MleapSupport._ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression class TestSparkMl(session: SparkSession) extends Runnable { override def run(): Unit = { val sqlContext = session.sqlContext // Create a temporary file and copy the contents of the resource avro to it val path = Files.createTempFile("mleap-databricks-runtime-testkit", ".avro") Files.copy(getClass.getClassLoader.getResource("datasources/lending_club_sample.avro").openStream(), path, StandardCopyOption.REPLACE_EXISTING) val sampleData = sqlContext.read.avro(path.toString) sampleData.show() val stringIndexer = new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index") val featureAssembler = new VectorAssembler(). setInputCols(Array(stringIndexer.getOutputCol, "dti", "loan_amount")). setOutputCol("features") val logisticRegression = new LogisticRegression(). setFeaturesCol(featureAssembler.getOutputCol). setLabelCol("approved"). setPredictionCol("prediction") val pipeline = new Pipeline().setStages(Array(stringIndexer, featureAssembler, logisticRegression)) val model = pipeline.fit(sampleData) val modelPath = Files.createTempFile("mleap-databricks-runtime-testkit", ".zip") Files.delete(modelPath) // Save the model { println("Writing model to...", modelPath) implicit val sbc = SparkBundleContext.defaultContext.withDataset(model.transform(sampleData)) val bf = BundleFile(new File(modelPath.toString)) model.writeBundle.save(bf).get bf.close() } // Load the model { val bf = BundleFile(new File(modelPath.toString)) bf.loadMleapBundle().get bf.close() } } }
Example 62
Source File: TestXgboost.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.databricks.runtime.testkit import java.io.File import java.nio.file.{Files, StandardCopyOption} import ml.combust.bundle.BundleFile import org.apache.spark.ml.bundle.SparkBundleContext import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.sql.SparkSession import com.databricks.spark.avro._ import ml.combust.mleap.spark.SparkSupport._ import ml.combust.mleap.runtime.MleapSupport._ import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier import org.apache.spark.ml.Pipeline class TestXgboost(session: SparkSession) extends Runnable { private val xgboostParams: Map[String, Any] = Map( "eta" -> 0.3, "max_depth" -> 2, "objective" -> "binary:logistic", "early_stopping_rounds" -> 2, "num_round" -> 15, "nworkers" -> 2 ) override def run(): Unit = { val sqlContext = session.sqlContext // Create a temporary file and copy the contents of the resource avro to it val path = Files.createTempFile("mleap-databricks-runtime-testkit", ".avro") Files.copy(getClass.getClassLoader.getResource("datasources/lending_club_sample.avro").openStream(), path, StandardCopyOption.REPLACE_EXISTING) val sampleData = sqlContext.read.avro(path.toString) sampleData.show() val stringIndexer = new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index") val featureAssembler = new VectorAssembler(). setInputCols(Array(stringIndexer.getOutputCol, "dti", "loan_amount")). setOutputCol("features") val xgboostClassifier = new XGBoostClassifier(xgboostParams). setFeaturesCol("features"). setLabelCol("approved"). setPredictionCol("prediction") val pipeline = new Pipeline().setStages(Array(stringIndexer, featureAssembler, xgboostClassifier)) val model = pipeline.fit(sampleData) val modelPath = Files.createTempFile("mleap-databricks-runtime-testkit", ".zip") Files.delete(modelPath) { println("Writing model to...", modelPath) implicit val sbc = SparkBundleContext.defaultContext.withDataset(model.transform(sampleData)) val bf = BundleFile(new File(modelPath.toString)) model.writeBundle.save(bf).get bf.close() } { val bf = BundleFile(new File(modelPath.toString)) bf.loadMleapBundle().get bf.close() } } }
Example 63
Source File: GeneralizedLinearRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.regression.GeneralizedLinearRegression import org.apache.spark.sql._ class GeneralizedLinearRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new OneHotEncoderEstimator(). setInputCols(Array("fico_index")). setOutputCols(Array("fico")), new VectorAssembler(). setInputCols(Array("fico", "dti")). setOutputCol("features"), new GeneralizedLinearRegression(). setFamily("gaussian"). setLink("log"). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "labelCol", "maxIter", "tol", "regParam", "solver", "variancePower") }
Example 64
Source File: NaiveBayesPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.DataFrame import scala.collection.mutable object NaiveBayesPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = { val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345) // Set up Pipeline val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") stages += labelIndexer val nb = new NaiveBayes() stages += vectorAssembler stages += nb val pipeline = new Pipeline().setStages(stages.toArray) // Fit the Pipeline val startTime = System.nanoTime() //val model = pipeline.fit(training) val model = pipeline.fit(dataFrame) val elapsedTime = (System.nanoTime() - startTime) / 1e9 println(s"Training time: $elapsedTime seconds") //val holdout = model.transform(test).select("prediction","label") val holdout = model.transform(dataFrame).select("prediction","label") // Select (prediction, true label) and compute test error val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy") val mAccuracy = evaluator.evaluate(holdout) println("Test set accuracy = " + mAccuracy) } }
Example 65
Source File: RandomForestRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.regression.RandomForestRegressor import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql._ class RandomForestRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new RandomForestRegressor(). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "labelCol", "seed") }
Example 66
Source File: LinearRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler} import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class LinearRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new OneHotEncoderEstimator(). setInputCols(Array("fico_index")). setOutputCols(Array("fico")), new VectorAssembler(). setInputCols(Array("fico", "dti")). setOutputCol("features"), new LinearRegression(). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "elasticNetParam", "maxIter", "tol", "epsilon", "labelCol", "loss", "regParam", "solver") }
Example 67
Source File: AFTSurvivalRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.regression.AFTSurvivalRegression import org.apache.spark.sql._ import org.apache.spark.sql.functions.lit class AFTSurvivalRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount").withColumn("censor", lit(1.0)) override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new OneHotEncoderEstimator(). setInputCols(Array("fico_index")). setOutputCols(Array("fico")), new VectorAssembler(). setInputCols(Array("fico", "dti")). setOutputCol("features"), new AFTSurvivalRegression(). setQuantileProbabilities(Array(0.5)). setFeaturesCol("features"). setLabelCol("loan_amount"). setQuantilesCol("quant"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("labelCol", "stringOrderType", "maxIter", "tol") }
Example 68
Source File: GBTRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.regression.GBTRegressor import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql._ class GBTRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new GBTRegressor(). setFeaturesCol("features"). setLabelCol("loan_amount"). setPredictionCol("prediction"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "labelCol", "seed") }
Example 69
Source File: NaiveBayesClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class NaiveBayesClassifierParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index")). setOutputCol("features"), new StringIndexer(). setInputCol("approved"). setOutputCol("label"), new NaiveBayes(uid = "nb"). setModelType("multinomial"). setThresholds(Array(0.4)). setFeaturesCol("features"). setLabelCol("label"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "labelCol", "smoothing") }
Example 70
Source File: RandomForestClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.classification.RandomForestClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql._ class RandomForestClassifierParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new StringIndexer(). setInputCol("approved"). setOutputCol("label"), new RandomForestClassifier(). setThresholds(Array(0.4)). setFeaturesCol("features"). setLabelCol("label"))).fit(dataset) override val unserializedParams = Set("stringOrderType", "seed") }
Example 71
Source File: GBTClassifierParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql._ class GBTClassifierParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new StringIndexer(). setInputCol("approved"). setOutputCol("label"), new GBTClassifier(). setFeaturesCol("features"). setLabelCol("label"). setThresholds(Array(1.0, 1.0)). setProbabilityCol("myProbability"). setPredictionCol("myPrediction"). setRawPredictionCol("myRawPrediction") )).fit(dataset) override val unserializedParams = Set("stringOrderType", "labelCol", "seed") }