org.apache.spark.ml.feature.StopWordsRemover Scala Examples
The following examples show how to use org.apache.spark.ml.feature.StopWordsRemover.
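Before the examples, a minimal sketch of the transformer's core parameters; the column names and the extra stop word below are illustrative placeholders rather than values taken from any particular example:

import org.apache.spark.ml.feature.StopWordsRemover

// Illustrative configuration; "tokens", "cleanTokens" and "spark" are placeholder values
val remover = new StopWordsRemover()
  .setInputCol("tokens")        // array<string> column, typically produced by a tokenizer
  .setOutputCol("cleanTokens")  // output column with stop words filtered out
  .setCaseSensitive(false)      // matching ignores case (the default)
  .setStopWords(StopWordsRemover.loadDefaultStopWords("english") ++ Array("spark"))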
Example 1
Source File: StopWordsRemoverExample.scala From multi-tenancy-spark with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
import org.apache.spark.sql.SparkSession

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StopWordsRemoverExample")
      .getOrCreate()

    // $example on$
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = spark.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "balloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")

    remover.transform(dataSet).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
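With the default English stop-word list (matching is case-insensitive), the filtered column printed by show(false) should contain roughly:

// Expected "filtered" column for the data set above:
//   id 0: [saw, red, balloon]
//   id 1: [Mary, little, lamb]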
Example 2
Source File: ProcessWord2Vec20.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License

package spark.ml.cookbook.chapter12

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, Word2Vec}
import org.apache.spark.sql.SparkSession

object ProcessWord2Vec20 {

  def main(args: Array[String]) {

    val input = "../data/sparkml2/chapter12/pg62.txt"

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Process Word2Vec App")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    //import spark.implicits._

    Logger.getRootLogger.setLevel(Level.WARN)

    val df = spark.read.text(input).toDF("text")

    val tokenizer = new RegexTokenizer()
      .setPattern("\\W+")
      .setToLowercase(true)
      .setMinTokenLength(4)
      .setInputCol("text")
      .setOutputCol("raw")
    val rawWords = tokenizer.transform(df)

    val stopWords = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("terms")
      .setCaseSensitive(false)
    val wordTerms = stopWords.transform(rawWords)

    wordTerms.show(false)

    val word2Vec = new Word2Vec()
      .setInputCol("terms")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(wordTerms)

    val synonyms = model.findSynonyms("martian", 10)

    synonyms.show(false)

    spark.stop()
  }
}
Example 3
Source File: StopWordsRemoverSuite.scala From aardpfark with Apache License 2.0

package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.StopWordsRemover

class StopWordsRemoverSuite extends SparkFeaturePFASuiteBase[StopWordsResult] {

  val remover = new StopWordsRemover()
    .setInputCol("raw")
    .setOutputCol("filtered")

  val dataset = spark.createDataFrame(Seq(
    (0, Seq("I", "saw", "the", "red", "balloon")),
    (1, Seq("Mary", "had", "a", "little", "lamb")),
    (2, Seq("The", "the"))
  )).toDF("id", "raw")

  override val sparkTransformer = remover

  val result = sparkTransformer.transform(dataset)
  override val input = result.select(remover.getInputCol).toJSON.collect()
  override val expectedOutput = result.select(remover.getOutputCol).toJSON.collect()

  test("StopWordsRemover case sensitive") {
    val transformer = remover.setCaseSensitive(true)
    val result = transformer.transform(dataset)
    val input = result.select(remover.getInputCol).toJSON.collect()
    val expectedOutput = result.select(remover.getOutputCol).toJSON.collect()

    parityTest(transformer, input, expectedOutput)
  }
}

case class StopWordsResult(filtered: Seq[String]) extends Result
Example 4
Source File: StopWordsRemover.scala From aardpfark with Apache License 2.0

package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.document.{Cell, PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.pfa.types.WithSchema
import com.ibm.aardpfark.spark.ml.PFAModel
import com.sksamuel.avro4s.{AvroNamespace, AvroSchema}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.spark.ml.feature.StopWordsRemover

@AvroNamespace("com.ibm.aardpfark.exec.spark.spark.ml.feature")
case class StopWords(words: Seq[String]) extends WithSchema {
  def schema = AvroSchema[this.type]
}

class PFAStopWordsRemover(override val sparkTransformer: StopWordsRemover)
  extends PFAModel[StopWords] {

  import com.ibm.aardpfark.pfa.dsl._

  private val inputCol = sparkTransformer.getInputCol
  private val outputCol = sparkTransformer.getOutputCol
  private val inputExpr = StringExpr(s"input.${inputCol}")

  private val stopWords = sparkTransformer.getStopWords
  private val caseSensitive = sparkTransformer.getCaseSensitive

  private def filterFn = FunctionDef[String, Boolean]("word") { w =>
    Seq(core.not(a.contains(wordsRef, if (caseSensitive) w else s.lower(w))))
  }

  override def inputSchema: Schema = {
    SchemaBuilder.record(withUid(inputBaseName)).fields()
      .name(inputCol).`type`().array().items().stringType().noDefault()
      .endRecord()
  }

  override def outputSchema: Schema = {
    SchemaBuilder.record(withUid(outputBaseName)).fields()
      .name(outputCol).`type`().array().items().stringType().noDefault()
      .endRecord()
  }

  override protected def cell = {
    Cell(StopWords(stopWords))
  }

  private val wordsRef = modelCell.ref("words")

  override def action: PFAExpression = {
    NewRecord(outputSchema, Map(outputCol -> a.filter(inputExpr, filterFn)))
  }

  override def pfa: PFADocument = PFABuilder()
    .withName(sparkTransformer.uid)
    .withMetadata(getMetadata)
    .withInput(inputSchema)
    .withOutput(outputSchema)
    .withCell(modelCell)
    .withAction(action)
    .pfa
}
Example 5
Source File: StopWordsRemoverExample.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License

package com.chapter11.SparkMachineLearning

import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.StopWordsRemover

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val sentence = spark.createDataFrame(Seq(
      (0, "Tokenization,is the process of enchanting words,from the raw text"),
      (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
      (2, " Here,will provide a sample example on how to tockenize sentences"),
      (3, "This way,you can find all matching occurrences")
    )).toDF("id", "sentence")

    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W+")
      .setGaps(true)

    val countTokens = udf { (words: Seq[String]) => words.length }

    val regexTokenized = regexTokenizer.transform(sentence)

    val remover = new StopWordsRemover()
      .setInputCol("words")
      .setOutputCol("filtered")

    val newDF = remover.transform(regexTokenized)
    newDF.select("id", "filtered").show(false)
  }
}
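The countTokens UDF is defined above but never applied in this snippet. One possible follow-up, sketched here as an assumption rather than part of the original example, compares token counts before and after stop-word removal:

// Hypothetical follow-up (not in the original example)
newDF
  .withColumn("tokensBefore", countTokens(col("words")))
  .withColumn("tokensAfter", countTokens(col("filtered")))
  .select("id", "tokensBefore", "tokensAfter")
  .show(false)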
Example 6
Source File: StopWordsRemoverExample.scala From BigDatalog with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StopWordsRemoverExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = sqlContext.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "baloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")

    remover.transform(dataSet).show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 7
Source File: StopWordsRemoverExample.scala From Spark-2.3.1 with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
import org.apache.spark.sql.SparkSession

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StopWordsRemoverExample")
      .getOrCreate()

    // $example on$
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = spark.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "balloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")

    remover.transform(dataSet).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 8
Source File: StopWordsRemoverExample.scala From spark1.52 with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
import org.apache.spark.sql.Row
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StopWordsRemoverExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = sqlContext.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "baloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")

    // transform() turns one DataFrame into another DataFrame
    remover.transform(dataSet).show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 9
Source File: OpStopWordsRemoverTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License

package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.types._
import com.salesforce.op.utils.spark.RichDataset._
import com.salesforce.op.test.{SwTransformerSpec, TestFeatureBuilder}
import org.apache.spark.ml.feature.StopWordsRemover
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class OpStopWordsRemoverTest extends SwTransformerSpec[TextList, StopWordsRemover, OpStopWordsRemover] {
  val data = Seq(
    "I AM groot",
    "Groot call me human",
    "or I will crush you"
  ).map(_.split(" ").toSeq.toTextList)

  val (inputData, textListFeature) = TestFeatureBuilder(data)

  val bigrams = textListFeature.removeStopWords()
  val transformer = bigrams.originStage.asInstanceOf[OpStopWordsRemover]

  val expectedResult = Seq(Seq("groot"), Seq("Groot", "call", "human"), Seq("crush")).map(_.toTextList)

  it should "allow case sensitivity" in {
    val noStopWords = textListFeature.removeStopWords(caseSensitive = true)
    val res = noStopWords.originStage.asInstanceOf[OpStopWordsRemover].transform(inputData)
    res.collect(noStopWords) shouldBe Seq(
      Seq("I", "AM", "groot"), Seq("Groot", "call", "human"), Seq("I", "crush")).map(_.toTextList)
  }

  it should "set custom stop words" in {
    val noStopWords = textListFeature.removeStopWords(stopWords = Array("Groot", "I"))
    val res = noStopWords.originStage.asInstanceOf[OpStopWordsRemover].transform(inputData)
    res.collect(noStopWords) shouldBe Seq(
      Seq("AM"), Seq("call", "me", "human"), Seq("or", "will", "crush", "you")).map(_.toTextList)
  }
}
Example 10
Source File: OpTransformerWrapperTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License

package com.salesforce.op.stages.sparkwrappers.specific

import com.salesforce.op.features.types._
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.feature.{Normalizer, StopWordsRemover}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class OpTransformerWrapperTest extends FlatSpec with TestSparkContext {

  val (testData, featureVector) = TestFeatureBuilder(
    Seq[MultiPickList](
      Set("I", "saw", "the", "red", "balloon").toMultiPickList,
      Set("Mary", "had", "a", "little", "lamb").toMultiPickList
    )
  )

  val (testDataNorm, _, _) = TestFeatureBuilder("label", "features",
    Seq[(Real, OPVector)](
      0.0.toReal -> Vectors.dense(1.0, 0.5, -1.0).toOPVector,
      1.0.toReal -> Vectors.dense(2.0, 1.0, 1.0).toOPVector,
      2.0.toReal -> Vectors.dense(4.0, 10.0, 2.0).toOPVector
    )
  )
  val (targetDataNorm, targetLabelNorm, featureVectorNorm) = TestFeatureBuilder("label", "features",
    Seq[(Real, OPVector)](
      0.0.toReal -> Vectors.dense(0.4, 0.2, -0.4).toOPVector,
      1.0.toReal -> Vectors.dense(0.5, 0.25, 0.25).toOPVector,
      2.0.toReal -> Vectors.dense(0.25, 0.625, 0.125).toOPVector
    )
  )

  Spec[OpTransformerWrapper[_, _, _]] should "remove stop words with caseSensitivity=true" in {
    val remover = new StopWordsRemover().setCaseSensitive(true)
    val swFilter =
      new OpTransformerWrapper[MultiPickList, MultiPickList, StopWordsRemover](remover).setInput(featureVector)
    val output = swFilter.transform(testData)

    output.collect(swFilter.getOutput()) shouldBe Array(
      Seq("I", "saw", "red", "balloon").toMultiPickList,
      Seq("Mary", "little", "lamb").toMultiPickList
    )
  }

  it should "should properly normalize each feature vector instance with non-default norm of 1" in {
    val baseNormalizer = new Normalizer().setP(1.0)
    val normalizer =
      new OpTransformerWrapper[OPVector, OPVector, Normalizer](baseNormalizer).setInput(featureVectorNorm)
    val output = normalizer.transform(testDataNorm)

    val sumSqDist = validateDataframeDoubleColumn(output, normalizer.getOutput().name, targetDataNorm, "features")
    assert(sumSqDist <= 1E-6, "==> the sum of squared distances between actual and expected should be below tolerance.")
  }

  def validateDataframeDoubleColumn(
    normalizedFeatureDF: DataFrame,
    normalizedFeatureName: String,
    targetFeatureDF: DataFrame,
    targetColumnName: String
  ): Double = {
    val sqDistUdf = udf { (leftColVec: Vector, rightColVec: Vector) => Vectors.sqdist(leftColVec, rightColVec) }

    val targetColRename = "targetFeatures"
    val renamedTargedDF = targetFeatureDF.withColumnRenamed(targetColumnName, targetColRename)
    val joinedDF = normalizedFeatureDF.join(renamedTargedDF, Seq("label"))

    // compute sum of squared distances between expected and actual
    val finalDF = joinedDF.withColumn("sqDist", sqDistUdf(joinedDF(normalizedFeatureName), joinedDF(targetColRename)))
    val sumSqDist: Double = finalDF.agg(sum(finalDF("sqDist"))).first().getDouble(0)
    sumSqDist
  }
}
Example 11
Source File: StopWordsRemoverExample.scala From drizzle-spark with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
import org.apache.spark.sql.SparkSession

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StopWordsRemoverExample")
      .getOrCreate()

    // $example on$
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = spark.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "balloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")

    remover.transform(dataSet).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 12
Source File: Preprocessor.scala From CkoocNLP with Apache License 2.0

package functions

import config.paramconf.PreprocessParams
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF, StopWordsRemover, StringIndexer}
import org.apache.spark.sql.DataFrame

class Preprocessor {

  def preprocess(data: DataFrame): Pipeline = {
    val spark = data.sparkSession
    val params = new PreprocessParams

    val indexModel = new StringIndexer()
      .setHandleInvalid(params.handleInvalid)
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(data)

    val cleaner = new Cleaner()
      .setFanJian(params.fanjian)
      .setQuanBan(params.quanban)
      .setMinLineLen(params.minLineLen)
      .setInputCol("content")
      .setOutputCol("cleand")

    val segmenter = new Segmenter()
      .isAddNature(params.addNature)
      .isDelEn(params.delEn)
      .isDelNum(params.delNum)
      .isNatureFilter(params.natureFilter)
      .setMinTermLen(params.minTermLen)
      .setMinTermNum(params.minTermNum)
      .setSegType(params.segmentType)
      .setInputCol(cleaner.getOutputCol)
      .setOutputCol("segmented")

    val stopwords = spark.sparkContext.textFile(params.stopwordFilePath).collect()
    val remover = new StopWordsRemover()
      .setStopWords(stopwords)
      .setInputCol(segmenter.getOutputCol)
      .setOutputCol("removed")

    val vectorizer = new CountVectorizer()
      .setMinTF(params.minTF)
      .setVocabSize(params.vocabSize)
      .setInputCol(remover.getOutputCol)
      .setOutputCol("vectorized")

    val idf = new IDF()
      .setMinDocFreq(params.minDocFreq)
      .setInputCol(vectorizer.getOutputCol)
      .setOutputCol("features")

    val stages = Array(cleaner, indexModel, segmenter, remover, vectorizer, idf)
    new Pipeline().setStages(stages)
  }
}
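The preprocess method only assembles the Pipeline; fitting and applying it is left to the caller. A minimal sketch of such a caller, assuming the method is invoked on a Preprocessor instance and that corpus is a DataFrame with "label" and "content" columns (both names are assumptions for illustration):

// Hypothetical caller; "corpus" is an assumed DataFrame with "label" and "content" columns
val pipeline = new Preprocessor().preprocess(corpus)
val model = pipeline.fit(corpus)
val prepared = model.transform(corpus)
prepared.select("indexedLabel", "features").show(5)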
Example 13
Source File: StopWordsRemoverExample.scala From sparkoscope with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
import org.apache.spark.sql.SparkSession

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("StopWordsRemoverExample")
      .getOrCreate()

    // $example on$
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val dataSet = spark.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "balloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")

    remover.transform(dataSet).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 14
Source File: LDAParitySpec.scala From mleap with Apache License 2.0

package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.feature.{CountVectorizer, StopWordsRemover, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame
import org.scalatest.Ignore

@Ignore
class LDAParitySpec extends SparkParityBase {
  override val dataset: DataFrame = textDataset.select("text")

  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")

  val remover = new StopWordsRemover()
    .setInputCol(tokenizer.getOutputCol)
    .setOutputCol("words_filtered")

  val cv = new CountVectorizer().setInputCol("words_filtered").setOutputCol("features").setVocabSize(50000)

  val lda = new LDA().setK(5).setMaxIter(2)

  override val sparkTransformer: Transformer =
    new Pipeline().setStages(Array(tokenizer, remover, cv, lda)).fit(dataset)

  override def equalityTest(sparkDataset: DataFrame, mleapDataset: DataFrame): Unit = {
    val sparkPredictionCol = sparkDataset.schema.fieldIndex("topicDistribution")
    val mleapPredictionCol = mleapDataset.schema.fieldIndex("topicDistribution")

    sparkDataset.collect().zip(mleapDataset.collect()).foreach {
      case (sv, mv) =>
        val sparkPrediction = sv.getAs[Vector](sparkPredictionCol)
        val mleapPrediction = mv.getAs[Vector](mleapPredictionCol)

        sparkPrediction.toArray.zip(mleapPrediction.toArray).foreach {
          case (s, m) => assert(Math.abs(m - s) < 0.001)
        }
    }
  }
}
Example 15
Source File: StopWordsRemoverParitySpec.scala From mleap with Apache License 2.0

package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.feature.{StopWordsRemover, Tokenizer}
import org.apache.spark.sql.DataFrame

class StopWordsRemoverParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer().
      setInputCol("loan_title").
      setOutputCol("loan_title_tokens"),
    new StopWordsRemover().
      setInputCol("loan_title_tokens").
      setOutputCol("loan_title_stop").
      setStopWords(Array("loan"))
  )).fit(dataset)
}
Example 16
Source File: StopWordsRemoverOp.scala From mleap with Apache License 2.0

package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.StopWordsRemover

class StopWordsRemoverOp extends SimpleSparkOp[StopWordsRemover] {
  override val Model: OpModel[SparkBundleContext, StopWordsRemover] =
    new OpModel[SparkBundleContext, StopWordsRemover] {
      override val klazz: Class[StopWordsRemover] = classOf[StopWordsRemover]

      override def opName: String = Bundle.BuiltinOps.feature.stopwords_remover

      override def store(model: Model, obj: StopWordsRemover)
                        (implicit context: BundleContext[SparkBundleContext]): Model = {
        model.withValue("stop_words", Value.stringList(obj.getStopWords)).
          withValue("case_sensitive", Value.boolean(obj.getCaseSensitive))
      }

      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): StopWordsRemover = {
        new StopWordsRemover(uid = "").setStopWords(model.value("stop_words").getStringList.toArray).
          setCaseSensitive(model.value("case_sensitive").getBoolean)
      }
    }

  override def sparkLoad(uid: String, shape: NodeShape, model: StopWordsRemover): StopWordsRemover = {
    new StopWordsRemover(uid = uid).setStopWords(model.getStopWords).setCaseSensitive(model.getCaseSensitive)
  }

  override def sparkInputs(obj: StopWordsRemover): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: StopWordsRemover): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
}
Example 17
Source File: StopWordsRemoverWrapper.scala From automl with Apache License 2.0

package com.tencent.angel.spark.automl.feature.preprocess

import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne}
import com.tencent.angel.spark.automl.feature.TransformerWrapper
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.feature.StopWordsRemover

class StopWordsRemoverWrapper extends TransformerWrapper {

  override val transformer: Transformer = new StopWordsRemover()
  override var parent: TransformerWrapper = _

  override val hasMultiInputs: Boolean = false
  override val hasMultiOutputs: Boolean = false
  override val needAncestorInputs: Boolean = false

  override val relation: InToOutRelation = OneToOne

  override val requiredInputCols: Array[String] = Array("words")
  override val requiredOutputCols: Array[String] = Array("stopwords")

  override def declareInAndOut(): this.type = {
    transformer.asInstanceOf[StopWordsRemover].setInputCol(getInputCols(0))
    transformer.asInstanceOf[StopWordsRemover].setOutputCol(getOutputCols(0))
    this
  }
}
Example 18
Source File: Components.scala From automl with Apache License 2.0

package com.tencent.angel.spark.automl.feature.preprocess

import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.feature.{StopWordsRemover, Tokenizer}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable.ArrayBuffer

object Components {

  def sample(data: DataFrame, fraction: Double): DataFrame = {
    data.sample(false, fraction)
  }

  def addSampler(components: ArrayBuffer[PipelineStage], inputCol: String, fraction: Double): Unit = {
    val sampler = new Sampler(fraction)
      .setInputCol("features")
    components += sampler
  }

  def addTokenizer(components: ArrayBuffer[PipelineStage], inputCol: String, outputCol: String): Unit = {
    val tokenizer = new Tokenizer()
      .setInputCol(inputCol)
      .setOutputCol(outputCol)
    components += tokenizer
  }

  def addStopWordsRemover(components: ArrayBuffer[PipelineStage], inputCol: String, outputCol: String): Unit = {
    val remover = new StopWordsRemover()
      .setInputCol(inputCol)
      .setOutputCol(outputCol)
    components += remover
  }
}
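The helpers above only append stages to a shared buffer; a short sketch of how they might be chained into a Pipeline (the df DataFrame with a "text" column is an assumption):

// Hypothetical usage of the helpers above; "df" is an assumed DataFrame with a "text" column
import org.apache.spark.ml.{Pipeline, PipelineStage}
import scala.collection.mutable.ArrayBuffer

val stages = new ArrayBuffer[PipelineStage]()
Components.addTokenizer(stages, "text", "words")
Components.addStopWordsRemover(stages, "words", "filtered")

val pipeline = new Pipeline().setStages(stages.toArray)
// val cleaned = pipeline.fit(df).transform(df)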
Example 19
Source File: LocalStopWordsRemover.scala From spark-ml-serving with Apache License 2.0

package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.feature.StopWordsRemover

class LocalStopWordsRemover(override val sparkTransformer: StopWordsRemover)
  extends LocalTransformer[StopWordsRemover] {

  override def transform(localData: LocalData): LocalData = {
    val stopWordsSet = sparkTransformer.getStopWords
    val toLower = (s: String) => if (s != null) s.toLowerCase else s
    val lowerStopWords = stopWordsSet.map(toLower(_)).toSet

    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val newData = column.data.map(r => {
          if (sparkTransformer.getCaseSensitive) {
            r.asInstanceOf[Seq[String]].filter(s => !stopWordsSet.contains(s))
          } else {
            r.asInstanceOf[Seq[String]].filter(s => !lowerStopWords.contains(toLower(s)))
          }
        })
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}

object LocalStopWordsRemover
  extends SimpleModelLoader[StopWordsRemover]
  with TypedTransformerConverter[StopWordsRemover] {

  override def build(metadata: Metadata, data: LocalData): StopWordsRemover = {
    new StopWordsRemover(metadata.uid)
      .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
      .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
      .setCaseSensitive(metadata.paramMap("caseSensitive").asInstanceOf[Boolean])
      .setStopWords(metadata.paramMap("stopWords").asInstanceOf[Seq[String]].toArray)
  }

  override implicit def toLocal(transformer: StopWordsRemover) =
    new LocalStopWordsRemover(transformer)
}