org.apache.spark.ml.feature.HashingTF Scala Examples
The following examples show how to use org.apache.spark.ml.feature.HashingTF.
The original project, source file, and license are noted above each example.
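Before the project-specific listings, here is a minimal, self-contained sketch of the basic usage pattern they all share: tokenize a text column, then hash the tokens into a fixed-length term-frequency vector. The object name, column names, and sample sentences below are illustrative placeholders rather than code from any of the projects; the Tokenizer and HashingTF calls mirror those used in the examples that follow.

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

object HashingTFQuickStart {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("HashingTFQuickStart")
      .getOrCreate()

    // Hypothetical sample data: one sentence per row.
    val sentences = spark.createDataFrame(Seq(
      (0L, "spark hashes tokens into a fixed size vector"),
      (1L, "the same token always lands in the same bucket")
    )).toDF("id", "sentence")

    // Split each sentence into a sequence of tokens.
    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val tokenized = tokenizer.transform(sentences)

    // Hash each token sequence into a sparse term-frequency vector.
    // numFeatures controls the size of the hash space.
    val hashingTF = new HashingTF()
      .setInputCol("words")
      .setOutputCol("rawFeatures")
      .setNumFeatures(1 << 10)

    hashingTF.transform(tokenized).select("id", "rawFeatures").show(false)

    spark.stop()
  }
}

Because a token's feature index is simply its hash modulo numFeatures, a larger numFeatures reduces the chance that two distinct tokens collide into the same bucket, at the cost of a wider (though still sparse) feature vector.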
Example 1
Source File: HashingTFTransformer.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers

import org.apache.spark.ml.feature.HashingTF

import ai.deepsense.deeplang.doperables.SparkTransformerAsMultiColumnTransformer
import ai.deepsense.deeplang.params.Param
import ai.deepsense.deeplang.params.validators.RangeValidator
import ai.deepsense.deeplang.params.wrappers.spark.IntParamWrapper

class HashingTFTransformer extends SparkTransformerAsMultiColumnTransformer[HashingTF] {

  val numFeatures = new IntParamWrapper[HashingTF](
    name = "num features",
    description = Some("The number of features."),
    sparkParamGetter = _.numFeatures,
    validator = RangeValidator(1.0, Int.MaxValue, step = Some(1.0)))

  // With default setting in Bundled Image (1 << 20) makes jvm run out of memory even for few rows.
  setDefault(numFeatures, (1 << 18).toDouble)

  override protected def getSpecificParams: Array[Param[_]] = Array(numFeatures)

  def setNumFeatures(value: Int): this.type = {
    set(numFeatures -> value)
  }
}
Example 2
Source File: HashingTFTransformer.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.spark.wrappers.transformers

import org.apache.spark.ml.feature.HashingTF

import io.deepsense.deeplang.doperables.SparkTransformerAsMultiColumnTransformer
import io.deepsense.deeplang.params.Param
import io.deepsense.deeplang.params.validators.RangeValidator
import io.deepsense.deeplang.params.wrappers.spark.IntParamWrapper

class HashingTFTransformer extends SparkTransformerAsMultiColumnTransformer[HashingTF] {

  val numFeatures = new IntParamWrapper[HashingTF](
    name = "num features",
    description = Some("The number of features."),
    sparkParamGetter = _.numFeatures,
    validator = RangeValidator(1.0, Int.MaxValue, step = Some(1.0)))

  // With default setting in Bundled Image (1 << 20) makes jvm run out of memory even for few rows.
  setDefault(numFeatures, (1 << 18).toDouble)

  override protected def getSpecificParams: Array[Param[_]] = Array(numFeatures)

  def setNumFeatures(value: Int): this.type = {
    set(numFeatures -> value)
  }
}
Example 3
Source File: MyPipeLine.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter4

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession
import org.apache.log4j.{Level, Logger}

object MyPipeLine {

  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("My PipeLine")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val trainset = spark.createDataFrame(Seq(
      (1L, 1, "spark rocks"),
      (2L, 0, "flink is the best"),
      (3L, 1, "Spark rules"),
      (4L, 0, "mapreduce forever"),
      (5L, 0, "Kafka is great")
    )).toDF("id", "label", "words")

    val tokenizer = new Tokenizer()
      .setInputCol("words")
      .setOutputCol("tokens")

    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")

    val lr = new LogisticRegression()
      .setMaxIter(15)
      .setRegParam(0.01)

    // three stage pipeline
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(trainset)

    val testSet = spark.createDataFrame(Seq(
      (10L, 1, "use spark please"),
      (11L, 2, "Kafka")
    )).toDF("id", "label", "words")

    model.transform(testSet).select("probability", "prediction").show(false)

    spark.stop()
  }
}
Example 4
Source File: PipelineExampleTest.scala From apache-spark-test with Apache License 2.0
package com.github.dnvriend.spark.ml

import com.github.dnvriend.TestSpec
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{ HashingTF, Tokenizer }
import org.apache.spark.ml.{ Pipeline, PipelineModel }
import org.apache.spark.sql.Row

class PipelineExampleTest extends TestSpec {

  it should "PipelineExample" in withSparkSession { spark =>
    import spark.implicits._

    // Prepare training documents from a list of (id, text, label) tuples.
    val training = Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    ).toDF("id", "text", "label")

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.01)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training)

    // Now we can optionally save the fitted pipeline to disk
    model.write.overwrite().save("/tmp/spark-logistic-regression-model")

    // We can also save this unfit pipeline to disk
    pipeline.write.overwrite().save("/tmp/unfit-lr-model")

    // And load it back in during production
    val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model")

    // Prepare test documents, which are unlabeled (id, text) tuples.
    val test = Seq(
      (4L, "spark i j k"),
      (5L, "l m n"),
      (6L, "mapreduce spark"),
      (7L, "apache hadoop"),
      (8L, "spark f g h"),
      (9L, "d e f spark a b c"),
      (10L, "spark baz bar a b c"),
      (11L, "foo bar a b c spark"),
      (12L, "a b c scala d e f"),
      (13L, "spark mapreduce")
    ).toDF("id", "text")

    // Make predictions on test documents.
    model.transform(test)
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }
  }
}
Example 5
Source File: NaiveBayes.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter12.NaiveBayes

import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

object NaiveBayesExample {

  def main(args: Array[String]): Unit = {
    // Create the Spark session
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    // Load the data stored in LIBSVM format as a DataFrame.
    val data = spark.read.format("libsvm").load("C:/Users/rezkar/Downloads/spark-2.1.0-bin-hadoop2.7/data/sample.data")

    // Split the data into training and test sets (25% held out for testing).
    val Array(trainingData, validationData) = data.randomSplit(Array(0.75, 0.25), seed = 12345L)

    // Train a NaiveBayes model.
    val nb = new NaiveBayes().setSmoothing(0.00001)
    val model = nb.fit(trainingData)

    // Select example rows to display.
    val predictions = model.transform(validationData)
    predictions.show()

    // Build evaluators for classification performance metrics such as accuracy, precision, recall and F1 measure.
    val evaluator = new BinaryClassificationEvaluator().setLabelCol("label").setMetricName("areaUnderROC")
    val evaluator1 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("accuracy")
    val evaluator2 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("weightedPrecision")
    val evaluator3 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("weightedRecall")
    val evaluator4 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("f1")

    // Compute the classification accuracy, precision, recall, F1 measure and error on the test data.
    val areaUnderROC = evaluator.evaluate(predictions)
    val accuracy = evaluator1.evaluate(predictions)
    val precision = evaluator2.evaluate(predictions)
    val recall = evaluator3.evaluate(predictions)
    val f1 = evaluator4.evaluate(predictions)

    // Print the performance metrics
    println("areaUnderROC = " + areaUnderROC)
    println("Accuracy = " + accuracy)
    println("Precision = " + precision)
    println("Recall = " + recall)
    println("F1 = " + f1)
    println(s"Test Error = ${1 - accuracy}")

    data.show(20)

    spark.stop()
  }
}
Example 6
Source File: SimpleTextClassificationPipeline.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

import scala.beans.BeanInfo

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}

@BeanInfo
case class LabeledDocument(id: Long, text: String, label: Double)

@BeanInfo
case class Document(id: Long, text: String)

object SimpleTextClassificationPipeline {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Prepare training documents, which are labeled.
    val training = sc.parallelize(Seq(
      LabeledDocument(0L, "a b c d e spark", 1.0),
      LabeledDocument(1L, "b d", 0.0),
      LabeledDocument(2L, "spark f g h", 1.0),
      LabeledDocument(3L, "hadoop mapreduce", 0.0)))

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training.toDF())

    // Prepare test documents, which are unlabeled.
    val test = sc.parallelize(Seq(
      Document(4L, "spark i j k"),
      Document(5L, "l m n"),
      Document(6L, "spark hadoop spark"),
      Document(7L, "apache hadoop")))

    // Make predictions on test documents.
    model.transform(test.toDF())
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }

    sc.stop()
  }
}
// scalastyle:on println
Example 7
Source File: TfIdfExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object TfIdfExample {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("TfIdfExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val sentenceData = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (0, "I wish Java could use case classes"),
      (1, "Logistic regression models are neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)
    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
    val featurizedData = hashingTF.transform(wordsData)
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("features", "label").take(3).foreach(println)
    // $example off$
  }
}
// scalastyle:on println
Example 8
Source File: LogisticRegressionDemo.scala From s4ds with Apache License 2.0
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.SaveMode

case class LabelledDocument(fileName: String, text: String, category: String)

object LogisticRegressionDemo extends App {

  val conf = new SparkConf().setAppName("LrTest")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  import sqlContext._
  import sqlContext.implicits._

  val spamText = sc.wholeTextFiles("spam/*")
  val hamText = sc.wholeTextFiles("ham/*")

  val spamDocuments = spamText.map {
    case (fileName, text) => LabelledDocument(fileName, text, "spam")
  }
  val hamDocuments = hamText.map {
    case (fileName, text) => LabelledDocument(fileName, text, "ham")
  }

  val documentsDF = spamDocuments.union(hamDocuments).toDF
  documentsDF.persist

  val Array(trainDF, testDF) = documentsDF.randomSplit(Array(0.7, 0.3))

  val indexer = new StringIndexer().setInputCol("category").setOutputCol("label")
  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
  val hasher = new HashingTF().setInputCol("words").setOutputCol("features")
  val lr = new LogisticRegression().setMaxIter(50).setRegParam(0.0)

  val pipeline = new Pipeline().setStages(Array(indexer, tokenizer, hasher, lr))
  val model = pipeline.fit(trainDF)

  val transformedTrain = model.transform(trainDF)
  transformedTrain.persist

  val transformedTest = model.transform(testDF)
  transformedTest.persist

  println("in sample misclassified:", transformedTrain.filter($"prediction" !== $"label").count,
    " / ", transformedTrain.count)
  println("out sample misclassified:", transformedTest.filter($"prediction" !== $"label").count,
    " / ", transformedTest.count)

  transformedTrain.select("fileName", "label", "prediction", "probability")
    .write.mode(SaveMode.Overwrite).parquet("transformedTrain.parquet")
  transformedTest.select("fileName", "label", "prediction", "probability")
    .write.mode(SaveMode.Overwrite).parquet("transformedTest.parquet")
}
Example 9
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import breeze.linalg._
import breeze.plot._
import org.jfree.chart.axis.NumberTickUnit

object ROC extends App {

  val conf = new SparkConf().setAppName("ROC")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  import sqlContext._
  import sqlContext.implicits._

  val transformedTest = sqlContext.read.parquet("transformedTest.parquet")

  val labelScores = transformedTest.select("probability", "label").map {
    case Row(probability: Vector, label: Double) => (probability(1), label)
  }

  val bm = new BinaryClassificationMetrics(labelScores, 300)
  val roc = bm.roc.collect
  roc.foreach { println }

  val falsePositives = roc.map { _._1 }
  val truePositives = roc.map { _._2 }

  val f = Figure()
  val p = f.subplot(0)
  p += plot(falsePositives, truePositives)
  p.xlabel = "false positives"
  p.ylabel = "true positives"
  p.xlim = (0.0, 0.1)
  p.xaxis.setTickUnit(new NumberTickUnit(0.01))
  p.yaxis.setTickUnit(new NumberTickUnit(0.1))
  f.refresh
  f.saveas("roc.png")
}
Example 10
Source File: TfIdfExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$
import org.apache.spark.sql.SparkSession

object TfIdfExample {

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("TfIdfExample")
      .getOrCreate()

    // $example on$
    val sentenceData = spark.createDataFrame(Seq(
      (0.0, "Hi I heard about Spark"),
      (0.0, "I wish Java could use case classes"),
      (1.0, "Logistic regression models are neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
    val featurizedData = hashingTF.transform(wordsData)
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)

    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("label", "features").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 11
Source File: TfIdfExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

object TfIdfExample {

  def main(args: Array[String]) {
    // Note: the middle of this listing was truncated in the original snippet;
    // the body below follows the SQLContext-based TfIdfExample shown above.
    val conf = new SparkConf().setAppName("TfIdfExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val sentenceData = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (0, "I wish Java could use case classes"),
      (1, "Logistic regression models are neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)
    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
    val featurizedData = hashingTF.transform(wordsData)
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)

    rescaledData.show()
    rescaledData.select("features", "label").take(3).foreach(println)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 12
Source File: OpHashingTFTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.Feature
import com.salesforce.op.features.types._
import com.salesforce.op.test.{SwTransformerSpec, TestFeatureBuilder}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.DataFrame
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class OpHashingTFTest extends SwTransformerSpec[OPVector, HashingTF, OpHashingTF] {

  // scalastyle:off
  val testData = Seq(
    "Hamlet: To be or not to be - that is the question.",
    "Гамлет: Быть или не быть - вот в чём вопрос.",
    "המלט: להיות או לא להיות - זאת השאלה.",
    "Hamlet: Être ou ne pas être - telle est la question."
  ).map(_.toLowerCase.split(" ").toSeq.toTextList)
  // scalastyle:on

  val (inputData, f1): (DataFrame, Feature[TextList]) = TestFeatureBuilder(testData)

  val hashed = f1.tf(numTerms = 5)
  val transformer = hashed.originStage.asInstanceOf[OpHashingTF]

  val expectedResult: Seq[OPVector] = Seq(
    Vectors.sparse(5, Array(0, 1, 2, 3, 4), Array(2.0, 4.0, 2.0, 3.0, 1.0)),
    Vectors.sparse(5, Array(0, 1, 2, 3, 4), Array(4.0, 1.0, 3.0, 1.0, 1.0)),
    Vectors.sparse(5, Array(0, 2, 3, 4), Array(2.0, 2.0, 2.0, 2.0)),
    Vectors.sparse(5, Array(0, 1, 2, 4), Array(3.0, 5.0, 1.0, 2.0))
  ).map(_.toOPVector)

  def hash(
    s: String,
    numOfFeatures: Int = TransmogrifierDefaults.DefaultNumOfFeatures,
    binary: Boolean = false
  ): Int = new org.apache.spark.mllib.feature.HashingTF(numOfFeatures).setBinary(binary).indexOf(s)

  it should "hash categorical data" in {
    val hashed = f1.tf()
    val transformedData = hashed.originStage.asInstanceOf[Transformer].transform(inputData)
    val results = transformedData.select(hashed.name).collect(hashed)

    hashed.name shouldBe hashed.originStage.getOutputFeatureName

    // scalastyle:off
    results.forall(_.value.size == TransmogrifierDefaults.DefaultNumOfFeatures) shouldBe true
    results(0).value(hash("be")) shouldBe 2.0
    results(0).value(hash("that")) shouldBe 1.0
    results(1).value(hash("быть")) shouldBe 2.0
    results(2).value(hash("להיות")) shouldBe 2.0
    results(3).value(hash("être")) shouldBe 2.0
    // scalastyle:on
  }

  it should "hash categorical data with custom numFeatures" in {
    val numFeatures = 100

    val hashed = f1.tf(numTerms = numFeatures)
    val transformedData = hashed.originStage.asInstanceOf[Transformer].transform(inputData)
    val results = transformedData.select(hashed.name).collect(hashed)

    // scalastyle:off
    results.forall(_.value.size == numFeatures) shouldBe true
    results(0).value(hash("be", numOfFeatures = numFeatures)) shouldBe 2.0
    results(1).value(hash("быть", numOfFeatures = numFeatures)) shouldBe 2.0
    results(2).value(hash("question", numOfFeatures = numFeatures)) shouldBe 0.0
    // scalastyle:on
  }

  it should "hash categorical data when binary = true" in {
    val binary = true

    val hashed = f1.tf(binary = binary)
    val transformedData = hashed.originStage.asInstanceOf[Transformer].transform(inputData)
    val results = transformedData.select(hashed.name).collect(hashed)

    // scalastyle:off
    val values = Set(0.0, 1.0)
    results.forall(_.value.toArray.forall(values contains _)) shouldBe true
    results(0).value(hash("be", binary = binary)) shouldBe 1.0
    results(1).value(hash("быть", binary = binary)) shouldBe 1.0
    results(2).value(hash("question", binary = binary)) shouldBe 0.0
    // scalastyle:on
  }
}
Example 13
Source File: TfIdfExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$
import org.apache.spark.sql.SparkSession

object TfIdfExample {

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("TfIdfExample")
      .getOrCreate()

    // $example on$
    val sentenceData = spark.createDataFrame(Seq(
      (0.0, "Hi I heard about Spark"),
      (0.0, "I wish Java could use case classes"),
      (1.0, "Logistic regression models are neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
    val featurizedData = hashingTF.transform(wordsData)
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)

    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("label", "features").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 14
Source File: PipelineSuite.scala From iolap with Apache License 2.0
package org.apache.spark.ml

import org.mockito.Matchers.{any, eq => meq}
import org.mockito.Mockito.when
import org.scalatest.mock.MockitoSugar.mock

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.DataFrame

class PipelineSuite extends SparkFunSuite {

  abstract class MyModel extends Model[MyModel]

  test("pipeline") {
    val estimator0 = mock[Estimator[MyModel]]
    val model0 = mock[MyModel]
    val transformer1 = mock[Transformer]
    val estimator2 = mock[Estimator[MyModel]]
    val model2 = mock[MyModel]
    val transformer3 = mock[Transformer]
    val dataset0 = mock[DataFrame]
    val dataset1 = mock[DataFrame]
    val dataset2 = mock[DataFrame]
    val dataset3 = mock[DataFrame]
    val dataset4 = mock[DataFrame]

    when(estimator0.copy(any[ParamMap])).thenReturn(estimator0)
    when(model0.copy(any[ParamMap])).thenReturn(model0)
    when(transformer1.copy(any[ParamMap])).thenReturn(transformer1)
    when(estimator2.copy(any[ParamMap])).thenReturn(estimator2)
    when(model2.copy(any[ParamMap])).thenReturn(model2)
    when(transformer3.copy(any[ParamMap])).thenReturn(transformer3)

    when(estimator0.fit(meq(dataset0))).thenReturn(model0)
    when(model0.transform(meq(dataset0))).thenReturn(dataset1)
    when(model0.parent).thenReturn(estimator0)
    when(transformer1.transform(meq(dataset1))).thenReturn(dataset2)
    when(estimator2.fit(meq(dataset2))).thenReturn(model2)
    when(model2.transform(meq(dataset2))).thenReturn(dataset3)
    when(model2.parent).thenReturn(estimator2)
    when(transformer3.transform(meq(dataset3))).thenReturn(dataset4)

    val pipeline = new Pipeline()
      .setStages(Array(estimator0, transformer1, estimator2, transformer3))
    val pipelineModel = pipeline.fit(dataset0)

    assert(pipelineModel.stages.length === 4)
    assert(pipelineModel.stages(0).eq(model0))
    assert(pipelineModel.stages(1).eq(transformer1))
    assert(pipelineModel.stages(2).eq(model2))
    assert(pipelineModel.stages(3).eq(transformer3))

    val output = pipelineModel.transform(dataset0)
    assert(output.eq(dataset4))
  }

  test("pipeline with duplicate stages") {
    val estimator = mock[Estimator[MyModel]]
    val pipeline = new Pipeline()
      .setStages(Array(estimator, estimator))
    val dataset = mock[DataFrame]
    intercept[IllegalArgumentException] {
      pipeline.fit(dataset)
    }
  }

  test("PipelineModel.copy") {
    val hashingTF = new HashingTF()
      .setNumFeatures(100)
    val model = new PipelineModel("pipeline", Array[Transformer](hashingTF))
    val copied = model.copy(ParamMap(hashingTF.numFeatures -> 10))
    require(copied.stages(0).asInstanceOf[HashingTF].getNumFeatures === 10,
      "copy should handle extra stage params")
  }
}
Example 15
Source File: SimpleTextClassificationPipeline.scala From iolap with Apache License 2.0
package org.apache.spark.examples.ml

import scala.beans.BeanInfo

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}

@BeanInfo
case class LabeledDocument(id: Long, text: String, label: Double)

@BeanInfo
case class Document(id: Long, text: String)

object SimpleTextClassificationPipeline {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Prepare training documents, which are labeled.
    val training = sc.parallelize(Seq(
      LabeledDocument(0L, "a b c d e spark", 1.0),
      LabeledDocument(1L, "b d", 0.0),
      LabeledDocument(2L, "spark f g h", 1.0),
      LabeledDocument(3L, "hadoop mapreduce", 0.0)))

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training.toDF())

    // Prepare test documents, which are unlabeled.
    val test = sc.parallelize(Seq(
      Document(4L, "spark i j k"),
      Document(5L, "l m n"),
      Document(6L, "spark hadoop spark"),
      Document(7L, "apache hadoop")))

    // Make predictions on test documents.
    model.transform(test.toDF())
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }

    sc.stop()
  }
}
Example 16
Source File: TfIdfExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$
import org.apache.spark.sql.SparkSession

object TfIdfExample {

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("TfIdfExample")
      .getOrCreate()

    // $example on$
    val sentenceData = spark.createDataFrame(Seq(
      (0.0, "Hi I heard about Spark"),
      (0.0, "I wish Java could use case classes"),
      (1.0, "Logistic regression models are neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
    val featurizedData = hashingTF.transform(wordsData)
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)

    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("label", "features").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 17
Source File: HashingTFSpec.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.ml

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.base.TestBase
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.SparseVector

class HashingTFSpec extends TestBase {

  test("operation on tokenized strings") {
    val wordDataFrame = session.createDataFrame(Seq(
      (0, Array("Hi", "I", "can", "not", "foo", "foo")),
      (1, Array("I")),
      (2, Array("Logistic", "regression")),
      (3, Array("Log", "f", "reg"))
    )).toDF("label", "words")

    val hashDF = new HashingTF().setInputCol("words").setOutputCol("hashedTF").transform(wordDataFrame)
    val lines = hashDF.getSVCol("hashedTF")

    val trueLines = List(
      new SparseVector(262144, Array(36073, 51654, 113890, 139098, 242088), Array(1.0, 2.0, 1.0, 1.0, 1.0)),
      new SparseVector(262144, Array(113890), Array(1.0)),
      new SparseVector(262144, Array(13671, 142455), Array(1.0, 1.0)),
      new SparseVector(262144, Array(24152, 74466, 122984), Array(1.0, 1.0, 1.0))
    )
    assert(lines === trueLines)
  }

  test("support several values for number of features") {
    val featureSizes = List(1, 5, 100, 100000)
    val words = Array("Hi", "I", "can", "not", "foo", "bar", "foo", "afk")

    val wordDataFrame = session.createDataFrame(Seq((0, words))).toDF("label", "words")

    val fsResults = featureSizes.map { n =>
      new HashingTF()
        .setNumFeatures(n)
        .setInputCol("words")
        .setOutputCol("hashedTF")
        .transform(wordDataFrame)
        .getSVCol("hashedTF")(0)
    }
    val trueResults = Array(
      new SparseVector(1, Array(0), Array(8.0)),
      new SparseVector(5, Array(0, 2, 3), Array(4.0, 2.0, 2.0)),
      new SparseVector(100, Array(0, 10, 18, 33, 62, 67, 80), Array(1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0)),
      new SparseVector(100000, Array(5833, 9467, 16680, 29018, 68900, 85762, 97510), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0))
    )
    assert(fsResults === trueResults)
  }

  test("treat empty strings as another word") {
    val wordDataFrame = session.createDataFrame(Seq(
      (0, "hey you no way"),
      (1, "")))
      .toDF("label", "sentence")

    val tokenized = new Tokenizer().setInputCol("sentence").setOutputCol("tokens").transform(wordDataFrame)
    val hashDF = new HashingTF().setInputCol("tokens").setOutputCol("HashedTF").transform(tokenized)

    val lines = hashDF.getSVCol("hashedTF")
    assert(lines(1) === new SparseVector(262144, Array(249180), Array(1.0)))
  }

  test("raise an error when applied to a null array") {
    val tokenDataFrame = session.createDataFrame(Seq(
      (0, Some(Array("Hi", "I", "can", "not", "foo"))),
      (1, None))
    ).toDF("label", "tokens")
    assertSparkException[org.apache.spark.SparkException](new HashingTF().setInputCol("tokens"), tokenDataFrame)
  }

  test("raise an error when given strange values of n") {
    List(0, -1, -10).foreach { n =>
      intercept[IllegalArgumentException] {
        new HashingTF().setNumFeatures(n)
      }
    }
  }
}
Example 18
Source File: TimerSuite.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame

class TimerSuite extends EstimatorFuzzing[Timer] {

  lazy val df: DataFrame = session
    .createDataFrame(Seq(
      (0, "Hi I"),
      (1, "I wish for snow today"),
      (2, "we Cant go to the park, because of the snow!"),
      (3, "")))
    .toDF("label", "sentence")

  test("Work with transformers and estimators") {
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")
    val df2 = new Timer().setStage(tok).fit(df).transform(df)
    val df3 = new HashingTF().setInputCol("tokens").setOutputCol("hash").transform(df2)
    val idf = new IDF().setInputCol("hash").setOutputCol("idf")
    val df4 = new Timer().setStage(idf).fit(df3).transform(df3)
  }

  test("should work within pipelines") {
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")
    val ttok = new Timer().setStage(tok)
    val hash = new HashingTF().setInputCol("tokens").setOutputCol("hash")
    val idf = new IDF().setInputCol("hash").setOutputCol("idf")
    val tidf = new Timer().setStage(idf)
    val pipe = new Pipeline().setStages(Array(ttok, hash, tidf))
    pipe.fit(df).transform(df)
  }

  test("should be able to turn off timing") {
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")
    val ttok = new Timer().setStage(tok)
    val hash = new HashingTF().setInputCol("tokens").setOutputCol("hash")
    val idf = new IDF().setInputCol("hash").setOutputCol("idf")
    val tidf = new Timer().setStage(idf)
    val pipe = new Pipeline().setStages(Array(ttok, hash, tidf))
    val model = pipe.fit(df)

    println("Transforming")
    println(model.stages(0).params.foreach(println(_)))
    model.stages(0).asInstanceOf[TimerModel].setDisable(true)
    model.stages(2).asInstanceOf[TimerModel].setDisable(true)
    println("here")
    println(model.stages(0).getParam("disableMaterialization"))
    model.stages(0).params.foreach(p => println("foo: " + p.toString))

    model.transform(df)
  }

  val reader: MLReadable[_] = Timer
  val modelReader: MLReadable[_] = TimerModel

  override def testObjects(): Seq[TestObject[Timer]] = Seq(new TestObject[Timer]({
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")
    new Timer().setStage(tok)
  }, df))
}
Example 19
Source File: TfIdfExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$
import org.apache.spark.sql.SparkSession

object TfIdfExample {

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("TfIdfExample")
      .getOrCreate()

    // $example on$
    val sentenceData = spark.createDataFrame(Seq(
      (0.0, "Hi I heard about Spark"),
      (0.0, "I wish Java could use case classes"),
      (1.0, "Logistic regression models are neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
    val featurizedData = hashingTF.transform(wordsData)
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)

    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("label", "features").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 20
Source File: TextClassificationPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.textclassifier

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
import org.utils.StandaloneSpark

object TextClassificationPipeline {

  def main(args: Array[String]): Unit = {
    val spark = StandaloneSpark.getSparkInstance()

    // Prepare training documents from a list of (id, text, label) tuples.
    val training = spark.createDataFrame(Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    )).toDF("id", "text", "label")

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training)

    // Now we can optionally save the fitted pipeline to disk
    model.write.overwrite().save("/tmp/spark-logistic-regression-model")

    // We can also save this unfit pipeline to disk
    pipeline.write.overwrite().save("/tmp/unfit-lr-model")

    // And load it back in during production
    val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model")

    // Prepare test documents, which are unlabeled (id, text) tuples.
    val test = spark.createDataFrame(Seq(
      (4L, "spark i j k"),
      (5L, "l m n"),
      (6L, "spark hadoop spark"),
      (7L, "apache hadoop")
    )).toDF("id", "text")

    // Make predictions on test documents.
    model.transform(test)
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }
  }
}
Example 21
Source File: HashingTermFrequencyParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame
import ml.combust.mleap.spark.SparkSupport._

class HashingTermFrequencyParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer().
      setInputCol("loan_title").
      setOutputCol("loan_title_tokens"),
    new HashingTF().
      setNumFeatures(1 << 17).
      setInputCol("loan_title_tokens").
      setOutputCol("loan_title_tf"))).fit(dataset)
}
Example 22
Source File: HashingTermFrequencyOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.bundle.dsl._
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.param.Param

class HashingTermFrequencyOp extends SimpleSparkOp[HashingTF] {

  override val Model: OpModel[SparkBundleContext, HashingTF] = new OpModel[SparkBundleContext, HashingTF] {
    override val klazz: Class[HashingTF] = classOf[HashingTF]

    override def opName: String = Bundle.BuiltinOps.feature.hashing_term_frequency

    override def store(model: Model, obj: HashingTF)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("num_features", Value.long(obj.getNumFeatures)).
        withValue("binary", Value.boolean(obj.getBinary))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): HashingTF = {
      new HashingTF(uid = "").setNumFeatures(model.value("num_features").getLong.toInt).
        setBinary(model.value("binary").getBoolean)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: HashingTF): HashingTF = {
    new HashingTF(uid = uid).setBinary(model.getBinary).setNumFeatures(model.getNumFeatures)
  }

  override def sparkInputs(obj: HashingTF): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: HashingTF): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
}
Example 23
Source File: HashingTFWrapper.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne}
import com.tencent.angel.spark.automl.feature.TransformerWrapper
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.feature.HashingTF

class HashingTFWrapper(numFeatures: Int) extends TransformerWrapper {

  override val transformer: Transformer = new HashingTF().setNumFeatures(numFeatures)
  override var parent: TransformerWrapper = _

  override val hasMultiInputs: Boolean = false
  override val hasMultiOutputs: Boolean = false
  override val needAncestorInputs: Boolean = false

  override val relation: InToOutRelation = OneToOne

  override val requiredInputCols: Array[String] = Array("words")
  override val requiredOutputCols: Array[String] = Array("outHashingTF")

  override def declareInAndOut(): this.type = {
    transformer.asInstanceOf[HashingTF].setInputCol(getInputCols(0))
    transformer.asInstanceOf[HashingTF].setOutputCol(getOutputCols(0))
    this
  }
}
Example 24
Source File: LocalHashingTF.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._
import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.mllib.feature.{HashingTF => HTF}

class LocalHashingTF(override val sparkTransformer: HashingTF) extends LocalTransformer[HashingTF] {

  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val htf = new HTF(sparkTransformer.getNumFeatures).setBinary(sparkTransformer.getBinary)
        val newData = column.data.map { m =>
          htf.transform(m.asInstanceOf[Seq[_]]).toList
        }
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}

object LocalHashingTF extends SimpleModelLoader[HashingTF] with TypedTransformerConverter[HashingTF] {

  override def build(metadata: Metadata, data: LocalData): HashingTF = {
    new HashingTF(metadata.uid)
      .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
      .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
      .setBinary(metadata.paramMap("binary").asInstanceOf[Boolean])
      .setNumFeatures(metadata.paramMap("numFeatures").asInstanceOf[Number].intValue())
  }

  override implicit def toLocal(transformer: HashingTF) = new LocalHashingTF(transformer)
}