org.apache.spark.ml.feature.CountVectorizer Scala Examples
The following examples show how to use org.apache.spark.ml.feature.CountVectorizer.
You can follow the link above each example to view the original project or source file.
Example 1
Source File: CountVectorizerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
import org.apache.spark.sql.SparkSession

object CountVectorizerExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("CountVectorizerExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    cvModel.transform(df).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
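For reference, show(false) here prints the sparse count vectors below (this is the output given for the same snippet in the Spark ML guide; each vector reads as (size, [indices], [values]) over the learned vocabulary a, b, c):

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+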
Example 2
Source File: TokenizerSuite.scala From spark-nkp with Apache License 2.0
package com.github.uosdmlab.nkp

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF}
import org.apache.spark.sql.SparkSession
import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite}

class TokenizerSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAfter {

  private var tokenizer: Tokenizer = _

  private val spark: SparkSession = SparkSession.builder()
    .master("local[2]")
    .appName("Tokenizer Suite")
    .getOrCreate

  spark.sparkContext.setLogLevel("WARN")

  import spark.implicits._

  override protected def afterAll(): Unit = {
    try {
      spark.stop
    } finally {
      super.afterAll()
    }
  }

  before {
    tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
  }

  private val df = spark.createDataset(
    Seq(
      "아버지가방에들어가신다.",
      "사랑해요 제플린!",
      "스파크는 재밌어",
      "나는야 데이터과학자",
      "데이터야~ 놀자~"
    )
  ).toDF("text")

  test("Default parameters") {
    assert(tokenizer.getFilter sameElements Array.empty[String])
  }

  test("Basic operation") {
    val words = tokenizer.transform(df)

    assert(df.count == words.count)
    assert(words.schema.fieldNames.contains(tokenizer.getOutputCol))
  }

  test("POS filter") {
    val nvTokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("nvWords")
      .setFilter("N", "V")

    val words = tokenizer.transform(df).join(nvTokenizer.transform(df), "text")

    assert(df.count == words.count)
    assert(words.schema.fieldNames.contains(nvTokenizer.getOutputCol))
    assert(words.where(s"SIZE(${tokenizer.getOutputCol}) < SIZE(${nvTokenizer.getOutputCol})").count == 0)
  }

  test("TF-IDF pipeline") {
    tokenizer.setFilter("N")

    val cntVec = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("tf")

    val idf = new IDF()
      .setInputCol("tf")
      .setOutputCol("tfidf")

    val pipe = new Pipeline()
      .setStages(Array(tokenizer, cntVec, idf))

    val pipeModel = pipe.fit(df)
    val result = pipeModel.transform(df)

    assert(result.count == df.count)

    val fields = result.schema.fieldNames
    assert(fields.contains(tokenizer.getOutputCol))
    assert(fields.contains(cntVec.getOutputCol))
    assert(fields.contains(idf.getOutputCol))

    result.show
  }
}
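The suite above depends on the project's Korean Tokenizer from spark-nkp. As a rough, self-contained sketch of the same CountVectorizer-then-IDF composition using only Spark's built-in RegexTokenizer (the object name, toy corpus, and column names here are illustrative, not from the project):

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF, RegexTokenizer}
import org.apache.spark.sql.SparkSession

object TfIdfSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("TfIdfSketch").getOrCreate()
    import spark.implicits._

    // Toy English corpus standing in for the Korean sentences used in the suite above
    val df = Seq(
      "spark makes pipelines easy",
      "count vectorizer counts words",
      "idf reweights common words"
    ).toDF("text")

    // RegexTokenizer (whitespace splitting by default) replaces the Korean Tokenizer
    val tokenizer = new RegexTokenizer().setInputCol("text").setOutputCol("words")
    val cntVec = new CountVectorizer().setInputCol("words").setOutputCol("tf")
    val idf = new IDF().setInputCol("tf").setOutputCol("tfidf")

    val model = new Pipeline().setStages(Array(tokenizer, cntVec, idf)).fit(df)
    model.transform(df).select("words", "tfidf").show(false)

    spark.stop()
  }
}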
Example 3
Source File: CountVectorizerParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{CountVectorizer, Tokenizer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class CountVectorizerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer()
      .setInputCol("loan_title")
      .setOutputCol("loan_title_tokens"),
    new CountVectorizer()
      .setInputCol("loan_title_tokens")
      .setOutputCol("loan_title_token_counts")
      .setMinTF(2))).fit(dataset)
}
Example 4
Source File: MinMaxScalerPipelineParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, MinMaxScaler, QuantileDiscretizer, VectorAssembler}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._

class MinMaxScalerPipelineParitySpec extends SparkParityBase {

  private val getKeys: Map[String, Double] => Seq[String] =
    { input: Map[String, Double] => input.keySet.toSeq }

  val keyUdf = functions.udf(getKeys)

  override val dataset = spark.createDataFrame(Seq(
    (Array("1"), 1.0, Map("a" -> 0.1, "b" -> 0.2, "c" -> 0.3), 1),
    (Array("2"), 10.0, Map("d" -> 0.1, "e" -> 0.2, "c" -> 0.3), 0),
    (Array("3"), 20.0, Map("x" -> 0.1, "a" -> 0.2, "b" -> 0.3), 0),
    (Array("4"), 15.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0),
    (Array("5"), 18.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0),
    (Array("6"), 25.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 1),
    (Array("6"), 5.0, Map("a" -> 0.1, "b" -> 0.2, "d" -> 0.3), 0),
    (Array("7"), 30.0, Map("c" -> 0.1, "b" -> 0.2, "w" -> 0.3), 0)))
    .toDF("book_id", "pv", "myInputCol0", "label")
    .withColumn("myInputCol", keyUdf(functions.col("myInputCol0")))
    .drop("myInputCol0")

  override val sparkTransformer = new Pipeline()
    .setStages(Array(
      new CountVectorizer()
        .setInputCol("book_id")
        .setOutputCol("book_id_vec")
        .setMinDF(1)
        .setMinTF(1)
        .setBinary(true),
      new QuantileDiscretizer()
        .setInputCol("pv")
        .setOutputCol("pv_bucket")
        .setNumBuckets(3),
      new CountVectorizer()
        .setInputCol("myInputCol")
        .setOutputCol("myInputCol1_vec")
        .setMinDF(1)
        .setMinTF(1)
        .setBinary(true),
      new VectorAssembler()
        .setInputCols(Array("pv_bucket", "book_id_vec", "myInputCol1_vec"))
        .setOutputCol("vectorFeature"),
      new MinMaxScaler()
        .setInputCol("vectorFeature")
        .setOutputCol("scaledFeatures"))).fit(dataset)
}
Example 5
Source File: LDAParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.feature.{CountVectorizer, StopWordsRemover, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame
import org.scalatest.Ignore

@Ignore
class LDAParitySpec extends SparkParityBase {
  override val dataset: DataFrame = textDataset.select("text")

  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")

  val remover = new StopWordsRemover()
    .setInputCol(tokenizer.getOutputCol)
    .setOutputCol("words_filtered")

  val cv = new CountVectorizer()
    .setInputCol("words_filtered")
    .setOutputCol("features")
    .setVocabSize(50000)

  val lda = new LDA().setK(5).setMaxIter(2)

  override val sparkTransformer: Transformer = new Pipeline()
    .setStages(Array(tokenizer, remover, cv, lda)).fit(dataset)

  override def equalityTest(sparkDataset: DataFrame, mleapDataset: DataFrame): Unit = {
    val sparkPredictionCol = sparkDataset.schema.fieldIndex("topicDistribution")
    val mleapPredictionCol = mleapDataset.schema.fieldIndex("topicDistribution")

    sparkDataset.collect().zip(mleapDataset.collect()).foreach {
      case (sv, mv) =>
        val sparkPrediction = sv.getAs[Vector](sparkPredictionCol)
        val mleapPrediction = mv.getAs[Vector](mleapPredictionCol)

        sparkPrediction.toArray.zip(mleapPrediction.toArray).foreach {
          case (s, m) => assert(Math.abs(m - s) < 0.001)
        }
    }
  }
}
Example 6
Source File: CountVectorizerExample.scala From sparkoscope with Apache License 2.0
The source is identical, line for line, to Example 1 above.
Example 7
Source File: Preprocessor.scala From CkoocNLP with Apache License 2.0
package functions

import config.paramconf.PreprocessParams
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF, StopWordsRemover, StringIndexer}
import org.apache.spark.sql.DataFrame

object Preprocessor { // enclosing declaration restored; it was elided in this excerpt

  def preprocess(data: DataFrame): Pipeline = {
    val spark = data.sparkSession
    val params = new PreprocessParams

    val indexModel = new StringIndexer()
      .setHandleInvalid(params.handleInvalid)
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(data)

    val cleaner = new Cleaner()
      .setFanJian(params.fanjian)
      .setQuanBan(params.quanban)
      .setMinLineLen(params.minLineLen)
      .setInputCol("content")
      .setOutputCol("cleand")

    val segmenter = new Segmenter()
      .isAddNature(params.addNature)
      .isDelEn(params.delEn)
      .isDelNum(params.delNum)
      .isNatureFilter(params.natureFilter)
      .setMinTermLen(params.minTermLen)
      .setMinTermNum(params.minTermNum)
      .setSegType(params.segmentType)
      .setInputCol(cleaner.getOutputCol)
      .setOutputCol("segmented")

    val stopwords = spark.sparkContext.textFile(params.stopwordFilePath).collect()
    val remover = new StopWordsRemover()
      .setStopWords(stopwords)
      .setInputCol(segmenter.getOutputCol)
      .setOutputCol("removed")

    val vectorizer = new CountVectorizer()
      .setMinTF(params.minTF)
      .setVocabSize(params.vocabSize)
      .setInputCol(remover.getOutputCol)
      .setOutputCol("vectorized")

    val idf = new IDF()
      .setMinDocFreq(params.minDocFreq)
      .setInputCol(vectorizer.getOutputCol)
      .setOutputCol("features")

    val stages = Array(cleaner, indexModel, segmenter, remover, vectorizer, idf)
    new Pipeline().setStages(stages)
  }
}
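As a usage sketch (not part of the source): assuming a DataFrame rawData with "label" and "content" columns, and the enclosing Preprocessor object as restored above, the returned Pipeline is fitted and applied like any other Spark ML pipeline:

// Hypothetical usage; `rawData` is an assumed DataFrame with "label" and "content" columns
val pipeline = Preprocessor.preprocess(rawData)
val model = pipeline.fit(rawData)       // fits indexing, segmentation, TF and IDF stages
val prepared = model.transform(rawData) // appends the "features" column used downstream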
Example 8
Source File: CountVectorizerExample.scala From multi-tenancy-spark with Apache License 2.0
The source is identical, line for line, to Example 1 above.
Example 9
Source File: OpCountVectorizer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.sparkwrappers.generic.SwUnaryModel
import com.salesforce.op.stages.sparkwrappers.specific.OpEstimatorWrapper
import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.sql.Dataset

// Note: this excerpt omits the enclosing class declaration; per the imports, the
// class is an OpEstimatorWrapper around Spark's CountVectorizer.

def setVocabSize(value: Int): this.type = {
  getSparkMlStage().get.setVocabSize(value)
  this
}

override def fit(dataset: Dataset[_]): SwUnaryModel[TextList, OPVector, CountVectorizerModel] = {
  val model = super.fit(dataset)
  val vocab = model.getSparkMlStage().map(_.vocabulary).getOrElse(Array.empty[String])
  val tf = getTransientFeatures()

  // One metadata column per (input feature, vocabulary word) pair
  val metadataCols = for {
    f <- tf
    word <- vocab
  } yield OpVectorColumnMetadata(
    parentFeatureName = Seq(f.name),
    parentFeatureType = Seq(f.typeName),
    grouping = None, // TODO do we want to test each word for label pred?
    indicatorValue = Option(word)
  )

  model.setMetadata(OpVectorMetadata(
    getOutputFeatureName, metadataCols,
    Transmogrifier.inputFeaturesToHistory(tf, stageName)).toMetadata)

  model
}
Example 10
Source File: CountVectorizerExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

object CountVectorizerExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("CountVectorizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // The middle of this listing was truncated in the original page; the lines
    // below are reconstructed from the otherwise identical examples above.
    // $example on$
    val df = sqlContext.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // define a CountVectorizerModel with an a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    // transform() converts one DataFrame into another DataFrame
    cvm.transform(df).select("features").show()
    // $example off$
    sc.stop()
  }
}
// scalastyle:on println
Example 11
Source File: CountVectorizerExample.scala From Spark-2.3.1 with Apache License 2.0
The source is identical, line for line, to Example 1 above.
Example 12
Source File: CountVectorizerExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object CountVectorizerExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("CountVectorizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val df = sqlContext.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    cvModel.transform(df).select("features").show()
    // $example off$
  }
}
// scalastyle:on println
Example 13
Source File: CountVectorizerDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter11.SparkMachineLearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}

object CountVectorizerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("CountVectorizerDemo")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, Array("Jason", "David")),
        (1, Array("David", "Martin")),
        (2, Array("Martin", "Jason")),
        (3, Array("Jason", "Daiel")),
        (4, Array("Daiel", "Martin")),
        (5, Array("Moahmed", "Jason")),
        (6, Array("David", "David")),
        (7, Array("Jason", "Martin")))).toDF("id", "name")

    df.show(false)

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("name")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    val feature = cvModel.transform(df)
    feature.show(false)

    spark.stop()
  }
}
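Because setVocabSize(3) keeps only the three most frequent names and setMinDF(2) requires a name to appear in at least two rows, it can help to inspect what the fitted model actually retained. A small addition (not in the book's listing) using the model's vocabulary member:

// Not part of the original demo: print the pruned vocabulary learned by the model.
// With this data it should contain Jason, David and Martin (order may vary on frequency ties).
println(cvModel.vocabulary.mkString(", "))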
Example 14
Source File: CountVectorizerSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.CountVectorizer

class CountVectorizerSuite extends SparkFeaturePFASuiteBase[CountVectorizerResult] {

  val df = spark.createDataFrame(Seq(
    (0, Array("a", "b", "c", "d", "e", "f")),
    (1, Array("a", "b", "b", "c", "a"))
  )).toDF("id", "words")

  val cv = new CountVectorizer()
    .setInputCol("words")
    .setOutputCol("features")

  override val sparkTransformer = cv.fit(df)

  val result = sparkTransformer.transform(df)
  override val input = result.select(cv.getInputCol).toJSON.collect()
  override val expectedOutput = withColumnAsArray(result, cv.getOutputCol).toJSON.collect()

  // Additional test for MinTF
  test("CountVectorizer with MinTF = 0.3") {
    val cv = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setMinTF(0.3)

    val sparkTransformer = cv.fit(df)
    val result = sparkTransformer.transform(df)
    val input = result.select(cv.getInputCol).toJSON.collect()
    val expectedOutput = withColumnAsArray(result, cv.getOutputCol).toJSON.collect()

    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("CountVectorizer with MinTF = 2.0") {
    val cv = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setMinTF(2.0)

    val sparkTransformer = cv.fit(df)
    val result = sparkTransformer.transform(df)
    val input = result.select(cv.getInputCol).toJSON.collect()
    val expectedOutput = withColumnAsArray(result, cv.getOutputCol).toJSON.collect()

    parityTest(sparkTransformer, input, expectedOutput)
  }

  // Additional test for binary
  test("CountVectorizer with binary") {
    val cv = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setBinary(true)

    val sparkTransformer = cv.fit(df)
    val result = sparkTransformer.transform(df)
    val input = result.select(cv.getInputCol).toJSON.collect()
    val expectedOutput = withColumnAsArray(result, cv.getOutputCol).toJSON.collect()

    parityTest(sparkTransformer, input, expectedOutput)
  }
}

case class CountVectorizerResult(features: Seq[Double]) extends Result
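The MinTF tests above lean on a detail worth spelling out: per the Spark documentation, minTF values below 1.0 are interpreted as a fraction of each document's token count, while values of 1.0 or more are absolute counts, and the filter is applied per document at transform time (it does not affect the fitted vocabulary). A minimal standalone check under that reading (the object and app names are illustrative):

import org.apache.spark.ml.feature.CountVectorizer
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.SparkSession

object MinTfCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("MinTfCheck").getOrCreate()

    val df = spark.createDataFrame(Seq(
      (0, Array("a", "b", "c", "d", "e", "f")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    val model = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setMinTF(0.3)
      .fit(df)

    // Row 0 has 6 tokens, so its threshold is 0.3 * 6 = 1.8; every token occurs once,
    // so its vector is empty. Row 1 has 5 tokens (threshold 1.5): only "a" and "b"
    // (2 occurrences each) are counted, and "c" is dropped.
    model.transform(df).select("features").collect()
      .foreach(row => println(row.getAs[Vector](0)))

    spark.stop()
  }
}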