org.apache.spark.ml.feature.IDF Scala Examples
The following examples show how to use org.apache.spark.ml.feature.IDF.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: TfIdfExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$
import org.apache.spark.sql.SparkSession

/**
 * Minimal TF-IDF walkthrough: tokenizes three labelled sentences, hashes the
 * tokens into 20-feature term-frequency vectors, fits an [[IDF]] model on those
 * vectors and shows the rescaled features next to their labels.
 */
object TfIdfExample {

  /**
   * Runs the example end to end.
   *
   * @param args command-line arguments; not read by this example
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("TfIdfExample")
      .getOrCreate()

    // $example on$
    val sentenceData = spark.createDataFrame(Seq(
      (0.0, "Hi I heard about Spark"),
      (0.0, "I wish Java could use case classes"),
      (1.0, "Logistic regression models are neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)

    val featurizedData = hashingTF.transform(wordsData)
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    // IDF is an estimator: fit it on the term-frequency column first, then
    // use the fitted model to rescale that same column.
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)

    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("label", "features").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 2
Source File: TokenizerSuite.scala From spark-nkp with Apache License 2.0 | 5 votes |
package com.github.uosdmlab.nkp

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF}
import org.apache.spark.sql.SparkSession
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfter, FunSuite}

/**
 * Exercises the package-local `Tokenizer` against a five-row DataFrame of
 * Korean sentences: default parameters, a plain transform, filtering via
 * `setFilter`, and use as the first stage of a CountVectorizer + IDF pipeline.
 */
class TokenizerSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAfter {

  // Re-created by the `before` hook so that parameter changes made by one test
  // (e.g. setFilter) do not carry over into the next one.
  private var tokenizer: Tokenizer = _

  private val spark: SparkSession =
    SparkSession.builder().master("local[2]").appName("Tokenizer Suite").getOrCreate()

  spark.sparkContext.setLogLevel("WARN")

  import spark.implicits._

  /** Stops the session and always delegates to the parent hook afterwards. */
  override protected def afterAll(): Unit =
    try spark.stop()
    finally super.afterAll()

  before {
    tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
  }

  // Shared single-column ("text") input used by every test below.
  private val df = spark
    .createDataset(Seq(
      "아버지가방에들어가신다.",
      "사랑해요 제플린!",
      "스파크는 재밌어",
      "나는야 데이터과학자",
      "데이터야~ 놀자~"
    ))
    .toDF("text")

  test("Default parameters") {
    assert(tokenizer.getFilter sameElements Array.empty[String])
  }

  test("Basic operation") {
    val tokenized = tokenizer.transform(df)

    assert(df.count == tokenized.count)
    assert(tokenized.schema.fieldNames.contains(tokenizer.getOutputCol))
  }

  test("POS filter") {
    val filteringTokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("nvWords")
      .setFilter("N", "V")

    val unfiltered = tokenizer.transform(df)
    val filtered = filteringTokenizer.transform(df)
    val joined = unfiltered.join(filtered, "text")

    assert(df.count == joined.count)
    assert(joined.schema.fieldNames.contains(filteringTokenizer.getOutputCol))

    // No row may have fewer unfiltered tokens than filtered tokens.
    val violations = joined.where(
      s"SIZE(${tokenizer.getOutputCol}) < SIZE(${filteringTokenizer.getOutputCol})")
    assert(violations.count == 0)
  }

  test("TF-IDF pipeline") {
    tokenizer.setFilter("N")

    val cntVec = new CountVectorizer().setInputCol("words").setOutputCol("tf")
    val idf = new IDF().setInputCol("tf").setOutputCol("tfidf")

    val pipeline = new Pipeline().setStages(Array(tokenizer, cntVec, idf))
    val result = pipeline.fit(df).transform(df)

    assert(result.count == df.count)

    // Every stage must have contributed its output column.
    val fields = result.schema.fieldNames
    Seq(tokenizer.getOutputCol, cntVec.getOutputCol, idf.getOutputCol).foreach { col =>
      assert(fields.contains(col))
    }

    result.show()
  }
}
Example 3
Source File: IDFWrapper.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature.preprocess

import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne}
import com.tencent.angel.spark.automl.feature.TransformerWrapper
import org.apache.spark.ml.feature.IDF

/**
 * [[TransformerWrapper]] around Spark ML's [[IDF]] stage: one input column,
 * one output column, related one-to-one.
 */
class IDFWrapper extends TransformerWrapper {

  override val transformer = new IDF()
  override var parent: TransformerWrapper = _

  // Single input, single output, and no inputs borrowed from ancestors.
  override val hasMultiInputs: Boolean = false
  override val hasMultiOutputs: Boolean = false
  override val needAncestorInputs: Boolean = false

  override val relation: InToOutRelation = OneToOne

  override val requiredInputCols: Array[String] = Array("rawFeatures")
  override val requiredOutputCols: Array[String] = Array("outIDF")

  /**
   * Points the wrapped IDF stage at the first input column and the first
   * output column reported by this wrapper.
   *
   * @return this wrapper, to allow call chaining
   */
  override def declareInAndOut(): this.type = {
    val idf = transformer.asInstanceOf[IDF]
    idf.setInputCol(getInputCols(0))
    idf.setOutputCol(getOutputCols(0))
    this
  }
}
Example 4
Source File: TfIdfExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$
import org.apache.spark.sql.SparkSession

/**
 * Minimal TF-IDF walkthrough: tokenizes three labelled sentences, hashes the
 * tokens into 20-feature term-frequency vectors, fits an [[IDF]] model on those
 * vectors and shows the rescaled features next to their labels.
 */
object TfIdfExample {

  /**
   * Runs the example end to end.
   *
   * @param args command-line arguments; not read by this example
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("TfIdfExample")
      .getOrCreate()

    // $example on$
    val sentenceData = spark.createDataFrame(Seq(
      (0.0, "Hi I heard about Spark"),
      (0.0, "I wish Java could use case classes"),
      (1.0, "Logistic regression models are neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)

    val featurizedData = hashingTF.transform(wordsData)
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    // IDF is an estimator: fit it on the term-frequency column first, then
    // use the fitted model to rescale that same column.
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)

    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("label", "features").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 5
Source File: Preprocessor.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions

import config.paramconf.PreprocessParams
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF, StopWordsRemover, StringIndexer}
import org.apache.spark.sql.DataFrame

  /**
   * Assembles the text pre-processing pipeline: cleaning, label indexing,
   * segmentation, stop-word removal, count vectorization and IDF weighting.
   *
   * All tunables come from a freshly constructed `PreprocessParams`. The label
   * indexer is fitted on `data` right here, and the stop-word file is read and
   * collected to the driver right here; the remaining stages are only
   * configured and are returned unfitted inside the pipeline.
   *
   * @param data input frame; the stages read its "label" and "content" columns
   * @return an unfitted [[Pipeline]] whose final output column is "features"
   */
  def preprocess(data: DataFrame): Pipeline = {
    val session = data.sparkSession
    val conf = new PreprocessParams

    // Fitted eagerly, so the pipeline carries an already-trained label indexer.
    val labelIndexer = new StringIndexer()
      .setHandleInvalid(conf.handleInvalid)
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(data)

    val textCleaner = new Cleaner()
      .setFanJian(conf.fanjian)
      .setQuanBan(conf.quanban)
      .setMinLineLen(conf.minLineLen)
      .setInputCol("content")
      .setOutputCol("cleand")

    val wordSegmenter = new Segmenter()
      .isAddNature(conf.addNature)
      .isDelEn(conf.delEn)
      .isDelNum(conf.delNum)
      .isNatureFilter(conf.natureFilter)
      .setMinTermLen(conf.minTermLen)
      .setMinTermNum(conf.minTermNum)
      .setSegType(conf.segmentType)
      .setInputCol(textCleaner.getOutputCol)
      .setOutputCol("segmented")

    // One stop word per line of the configured file, pulled to the driver.
    val stopWordList = session.sparkContext.textFile(conf.stopwordFilePath).collect()
    val stopWordFilter = new StopWordsRemover()
      .setStopWords(stopWordList)
      .setInputCol(wordSegmenter.getOutputCol)
      .setOutputCol("removed")

    val countVectorizer = new CountVectorizer()
      .setMinTF(conf.minTF)
      .setVocabSize(conf.vocabSize)
      .setInputCol(stopWordFilter.getOutputCol)
      .setOutputCol("vectorized")

    val idfWeighting = new IDF()
      .setMinDocFreq(conf.minDocFreq)
      .setInputCol(countVectorizer.getOutputCol)
      .setOutputCol("features")

    new Pipeline().setStages(
      Array(textCleaner, labelIndexer, wordSegmenter, stopWordFilter, countVectorizer, idfWeighting))
  }
} // closes the enclosing definition, whose header is not part of this excerpt
Example 6
Source File: TimerSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame

/**
 * Tests for the `Timer` stage wrapper: wrapping a transformer (Tokenizer) and
 * an estimator (IDF) directly, using wrapped stages inside a [[Pipeline]], and
 * disabling timing on the fitted `TimerModel`s.
 */
class TimerSuite extends EstimatorFuzzing[Timer] {

  // Four labelled sentences, including an empty one.
  lazy val df: DataFrame = session
    .createDataFrame(Seq(
      (0, "Hi I"),
      (1, "I wish for snow today"),
      (2, "we Cant go to the park, because of the snow!"),
      (3, "")))
    .toDF("label", "sentence")

  /** Builds the Timer(Tokenizer) -> HashingTF -> Timer(IDF) pipeline shared by the pipeline tests. */
  private def timedTfIdfPipeline(): Pipeline = {
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")
    val ttok = new Timer().setStage(tok)
    val hash = new HashingTF().setInputCol("tokens").setOutputCol("hash")
    val idf = new IDF().setInputCol("hash").setOutputCol("idf")
    val tidf = new Timer().setStage(idf)
    new Pipeline().setStages(Array(ttok, hash, tidf))
  }

  test("Work with transformers and estimators") {
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")

    val df2 = new Timer().setStage(tok).fit(df).transform(df)
    val df3 = new HashingTF().setInputCol("tokens").setOutputCol("hash").transform(df2)

    val idf = new IDF().setInputCol("hash").setOutputCol("idf")
    // The result is not inspected; the test only checks that wrapping an
    // estimator fits and transforms without throwing.
    new Timer().setStage(idf).fit(df3).transform(df3)
  }

  test("should work within pipelines") {
    timedTfIdfPipeline().fit(df).transform(df)
  }

  test("should be able to turn off timing") {
    val model = timedTfIdfPipeline().fit(df)

    println("Transforming")
    // Print each param directly; wrapping the foreach in println would also
    // print the Unit result as "()".
    model.stages(0).params.foreach(println(_))

    // Stages 0 and 2 are the fitted Timer wrappers; stage 1 is the bare HashingTF.
    model.stages(0).asInstanceOf[TimerModel].setDisable(true)
    model.stages(2).asInstanceOf[TimerModel].setDisable(true)

    println("here")
    println(model.stages(0).getParam("disableMaterialization"))
    model.stages(0).params.foreach(p => println("foo: " + p.toString))

    model.transform(df)
  }

  val reader: MLReadable[_] = Timer
  val modelReader: MLReadable[_] = TimerModel

  /** Supplies a Timer-wrapped Tokenizer, together with `df`, as the single fuzzing test object. */
  override def testObjects(): Seq[TestObject[Timer]] = Seq(new TestObject[Timer]({
    val tok = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("tokens")
    new Timer().setStage(tok)
  }, df))
}
Example 7
Source File: TfIdfExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$
import org.apache.spark.sql.SparkSession

/**
 * Minimal TF-IDF walkthrough: tokenizes three labelled sentences, hashes the
 * tokens into 20-feature term-frequency vectors, fits an [[IDF]] model on those
 * vectors and shows the rescaled features next to their labels.
 */
object TfIdfExample {

  /**
   * Runs the example end to end.
   *
   * @param args command-line arguments; not read by this example
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("TfIdfExample")
      .getOrCreate()

    // $example on$
    val sentenceData = spark.createDataFrame(Seq(
      (0.0, "Hi I heard about Spark"),
      (0.0, "I wish Java could use case classes"),
      (1.0, "Logistic regression models are neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)

    val featurizedData = hashingTF.transform(wordsData)
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    // IDF is an estimator: fit it on the term-frequency column first, then
    // use the fitted model to rescale that same column.
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)

    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("label", "features").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 8
Source File: IDFTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.types._
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.{Estimator, Transformer}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Assertions, FlatSpec, Matchers}

/**
 * Checks the `idf()` feature stage against hand-computed inverse document
 * frequencies on a three-vector, four-column data set.
 */
@RunWith(classOf[JUnitRunner])
class IDFTest extends FlatSpec with TestSparkContext {

  // Columns 0..3 are non-zero in 0, 3, 1 and 2 of these vectors respectively;
  // those counts are the document frequencies used by the expectations below.
  val data = Seq(
    Vectors.sparse(4, Array(1, 3), Array(1.0, 2.0)),
    Vectors.dense(0.0, 1.0, 2.0, 3.0),
    Vectors.sparse(4, Array(1), Array(1.0))
  )

  lazy val (ds, f1) = TestFeatureBuilder(data.map(_.toOPVector))

  Spec[IDF] should "compute inverted document frequency" in {
    val idf = f1.idf()
    val fitted = idf.originStage.asInstanceOf[Estimator[_]].fit(ds)
    val transformed = fitted.asInstanceOf[Transformer].transform(ds)
    val results = transformed.select(idf.name).collect(idf)

    idf.name shouldBe idf.originStage.getOutputFeatureName

    // idf(t) = log((numDocs + 1) / (docFreq(t) + 1))
    val docFreqs = Array(0, 3, 1, 2)
    val weights = docFreqs.map(freq => math.log((data.length + 1.0) / (freq + 1.0)))
    val expected = scaleDataWithIDF(data, Vectors.dense(weights))

    results.zip(expected).foreach { case (res, exp) =>
      res.value.toArray.zip(exp.toArray).foreach { case (x, y) =>
        assert(math.abs(x - y) <= 1e-5)
      }
    }
  }

  it should "compute inverted document frequency when minDocFreq is 1" in {
    val idf = f1.idf(minDocFreq = 1)
    val fitted = idf.originStage.asInstanceOf[Estimator[_]].fit(ds)
    val transformed = fitted.asInstanceOf[Transformer].transform(ds)
    val results = transformed.select(idf.name).collect(idf)

    idf.name shouldBe idf.originStage.getOutputFeatureName

    // Same formula, except a term that occurs in no document is weighted 0.
    val docFreqs = Array(0, 3, 1, 2)
    val weights = docFreqs.map { freq =>
      if (freq > 0) math.log((data.length + 1.0) / (freq + 1.0)) else 0.0
    }
    val expected = scaleDataWithIDF(data, Vectors.dense(weights))

    results.zip(expected).foreach { case (res, exp) =>
      res.value.toArray.zip(exp.toArray).foreach { case (x, y) =>
        assert(math.abs(x - y) <= 1e-5)
      }
    }
  }

  /**
   * Multiplies every vector element-wise by the given IDF weights, keeping the
   * dense or sparse representation of each input vector.
   *
   * @param dataSet term-frequency vectors to scale
   * @param model   one IDF weight per vector index
   * @return the scaled vectors, in input order
   */
  private def scaleDataWithIDF(dataSet: Seq[Vector], model: Vector): Seq[Vector] = {
    val weights = model.toArray
    dataSet.map {
      case dense: DenseVector =>
        Vectors.dense(dense.toArray.zip(weights).map { case (x, w) => x * w })
      case sparse: SparseVector =>
        // Only the stored entries are scaled; implicit zeros stay zero.
        val scaled = sparse.indices.zip(sparse.values).map { case (i, v) => (i, v * model(i)) }
        Vectors.sparse(sparse.size, scaled)
    }
  }
}
Example 9
Source File: TfIdfExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer} // $example off$ import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{SQLContext, DataFrame} rescaledData.show() rescaledData.select("features", "label").take(3).foreach(println) // $example off$ sc.stop() } } // scalastyle:on println
Example 10
Source File: TfIdfExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$
import org.apache.spark.sql.SparkSession

/**
 * Minimal TF-IDF walkthrough: tokenizes three labelled sentences, hashes the
 * tokens into 20-feature term-frequency vectors, fits an [[IDF]] model on those
 * vectors and shows the rescaled features next to their labels.
 */
object TfIdfExample {

  /**
   * Runs the example end to end.
   *
   * @param args command-line arguments; not read by this example
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("TfIdfExample")
      .getOrCreate()

    // $example on$
    val sentenceData = spark.createDataFrame(Seq(
      (0.0, "Hi I heard about Spark"),
      (0.0, "I wish Java could use case classes"),
      (1.0, "Logistic regression models are neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)

    val featurizedData = hashingTF.transform(wordsData)
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    // IDF is an estimator: fit it on the term-frequency column first, then
    // use the fitted model to rescale that same column.
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)

    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("label", "features").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 11
Source File: TfIdfExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Minimal TF-IDF walkthrough on the SQLContext API: tokenizes three labelled
 * sentences, hashes the tokens into 20-feature term-frequency vectors, fits an
 * [[IDF]] model on those vectors and prints the first three rescaled rows.
 */
object TfIdfExample {

  /**
   * Runs the example end to end.
   *
   * @param args command-line arguments; not read by this example
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TfIdfExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val sentenceData = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (0, "I wish Java could use case classes"),
      (1, "Logistic regression models are neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
    val featurizedData = hashingTF.transform(wordsData)

    // IDF is an estimator: fit it on the term-frequency column first, then
    // use the fitted model to rescale that same column.
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)

    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("features", "label").take(3).foreach(println)
    // $example off$

    // Release the context explicitly instead of leaving it to JVM shutdown.
    sc.stop()
  }
}
// scalastyle:on println