org.apache.spark.ml.feature.RegexTokenizer Scala Examples
The following examples show how to use org.apache.spark.ml.feature.RegexTokenizer.
Each example is attributed to its original project and source file in the header above it.
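Before the project-specific examples, here is a minimal, self-contained sketch of the typical RegexTokenizer workflow; the column names and data are illustrative only and do not come from any of the projects below.

import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.sql.SparkSession

object RegexTokenizerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("RegexTokenizerSketch").getOrCreate()

    // Illustrative input: one string column to be split into tokens.
    val df = spark.createDataFrame(Seq(
      (0, "Hello, RegexTokenizer: split me please!")
    )).toDF("id", "text")

    val tokenizer = new RegexTokenizer()
      .setInputCol("text")      // string column to tokenize
      .setOutputCol("tokens")   // output column of type array<string>
      .setPattern("\\W+")       // regex interpreted as a delimiter when gaps = true
      .setGaps(true)            // true: pattern matches separators; false: pattern matches tokens
      .setMinTokenLength(1)     // drop tokens shorter than this length
      .setToLowercase(true)     // lowercase the input before matching

    tokenizer.transform(df).select("tokens").show(false)
    spark.stop()
  }
}

The examples that follow exercise the same handful of parameters (pattern, gaps, minTokenLength, toLowercase) in different contexts: the official Spark examples, local (non-Spark) re-implementations, and model-serialization bundles.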
Example 1
Source File: TokenizerExample.scala From drizzle-spark with Apache License 2.0 | 7 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
import org.apache.spark.sql.functions._
// $example off$
import org.apache.spark.sql.SparkSession

object TokenizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("TokenizerExample")
      .getOrCreate()

    // $example on$
    val sentenceDataFrame = spark.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat")
    )).toDF("id", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")

    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

    val countTokens = udf { (words: Seq[String]) => words.length }

    val tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("sentence", "words")
      .withColumn("tokens", countTokens(col("words"))).show(false)

    val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("sentence", "words")
      .withColumn("tokens", countTokens(col("words"))).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
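The inline comment above mentions an alternative configuration. As a hedged illustration (not part of the original file, reusing the imports and column names from the example above), the same tokenization can be expressed by matching the tokens themselves instead of the gaps between them:

// Hypothetical variant of the regexTokenizer above: the pattern now describes
// the tokens ("\\w+"), so gap matching must be disabled.
val tokenMatchingTokenizer = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setPattern("\\w+")
  .setGaps(false)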
Example 2
Source File: LocalRegexTokenizer.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.feature.RegexTokenizer

class LocalRegexTokenizer(val sparkTransformer: RegexTokenizer)
  extends LocalTransformer[RegexTokenizer] {

  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(x) =>
        val newData = x.data.map { d =>
          val originStr = d.toString
          val re        = sparkTransformer.getPattern.r
          val str       = if (sparkTransformer.getToLowercase) originStr.toLowerCase() else originStr
          val tokens    = if (sparkTransformer.getGaps) re.split(str).toSeq else re.findAllIn(str).toSeq
          val minLength = sparkTransformer.getMinTokenLength
          tokens.filter(_.length >= minLength).toList
        }
        localData.withColumn(
          LocalDataColumn(
            sparkTransformer.getOutputCol,
            newData
          )
        )
      case None => localData
    }
  }
}

object LocalRegexTokenizer
  extends SimpleModelLoader[RegexTokenizer]
  with TypedTransformerConverter[RegexTokenizer] {

  override def build(metadata: Metadata, data: LocalData): RegexTokenizer = {
    new RegexTokenizer(metadata.uid)
      .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
      .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
      .setPattern(metadata.paramMap("pattern").asInstanceOf[String])
      .setGaps(metadata.paramMap("gaps").asInstanceOf[Boolean])
      .setMinTokenLength(metadata.paramMap("minTokenLength").asInstanceOf[Number].intValue())
      .setToLowercase(metadata.paramMap("toLowercase").asInstanceOf[Boolean])
  }

  override implicit def toLocal(transformer: RegexTokenizer) =
    new LocalRegexTokenizer(transformer)
}
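The transform above mirrors Spark's two tokenization modes with plain scala.util.matching.Regex calls. A small standalone sketch (not part of the project) of that same distinction:

// Illustrative only: gaps = true splits on the pattern, gaps = false collects the matches.
object RegexModes extends App {
  val gapPattern   = "\\W+".r
  val tokenPattern = "\\w+".r
  println(gapPattern.split("Hi, Spark ML!").toList)        // List(Hi, Spark, ML)
  println(tokenPattern.findAllIn("Hi, Spark ML!").toList)  // List(Hi, Spark, ML)
}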
Example 3
Source File: RegexTokenizerOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.RegexTokenizer

class RegexTokenizerOp extends SimpleSparkOp[RegexTokenizer] {

  override val Model: OpModel[SparkBundleContext, RegexTokenizer] =
    new OpModel[SparkBundleContext, RegexTokenizer] {
      val RegexIdentifier = "regex"
      val MatchGapsIdentifier = "match_gaps"
      val MinTokenLengthIdentifer = "token_min_length"
      val LowercaseText = "lowercase_text"

      override val klazz: Class[RegexTokenizer] = classOf[RegexTokenizer]

      override def opName: String = Bundle.BuiltinOps.feature.regex_tokenizer

      override def store(model: Model, obj: RegexTokenizer)
                        (implicit context: BundleContext[SparkBundleContext]): Model = {
        model
          .withValue(RegexIdentifier, Value.string(obj.getPattern))
          .withValue(MatchGapsIdentifier, Value.boolean(obj.getGaps))
          .withValue(MinTokenLengthIdentifer, Value.int(obj.getMinTokenLength))
          .withValue(LowercaseText, Value.boolean(obj.getToLowercase))
      }

      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): RegexTokenizer = {
        new RegexTokenizer(uid = "")
          .setPattern(model.value(RegexIdentifier).getString)
          .setGaps(model.value(MatchGapsIdentifier).getBoolean)
          .setMinTokenLength(model.value(MinTokenLengthIdentifer).getInt)
          .setToLowercase(model.value(LowercaseText).getBoolean)
      }
    }

  override def sparkLoad(uid: String, shape: NodeShape, model: RegexTokenizer): RegexTokenizer = {
    new RegexTokenizer(uid = uid)
      .setPattern(model.getPattern)
      .setGaps(model.getGaps)
      .setMinTokenLength(model.getMinTokenLength)
      .setToLowercase(model.getToLowercase)
  }

  override def sparkInputs(obj: RegexTokenizer): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: RegexTokenizer): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
}
Example 4
Source File: RegexTokenizerParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.sql.DataFrame

class RegexTokenizerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")
  override val sparkTransformer: Transformer = new RegexTokenizer()
    .setInputCol("loan_title")
    .setOutputCol("loan_title_tokens")
    .setGaps(true)
    .setToLowercase(true)
    .setMinTokenLength(2)
    .setPattern("\\s")
}
Example 5
Source File: TokenizerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
import org.apache.spark.sql.functions._
// $example off$
import org.apache.spark.sql.SparkSession

object TokenizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("TokenizerExample")
      .getOrCreate()

    // $example on$
    val sentenceDataFrame = spark.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat")
    )).toDF("id", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")

    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

    val countTokens = udf { (words: Seq[String]) => words.length }

    val tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("sentence", "words")
      .withColumn("tokens", countTokens(col("words"))).show(false)

    val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("sentence", "words")
      .withColumn("tokens", countTokens(col("words"))).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 6
Source File: TokenizerExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
import org.apache.spark.sql.functions._
// $example off$
import org.apache.spark.sql.SparkSession

object TokenizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("TokenizerExample")
      .getOrCreate()

    // $example on$
    val sentenceDataFrame = spark.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat")
    )).toDF("id", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")

    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

    val countTokens = udf { (words: Seq[String]) => words.length }

    val tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("sentence", "words")
      .withColumn("tokens", countTokens(col("words"))).show(false)

    val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("sentence", "words")
      .withColumn("tokens", countTokens(col("words"))).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 7
Source File: TokenizerExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    // [... body of the example elided in the source listing; only the final
    //     statements are shown below ...]

    regexTokenized.show()
    regexTokenized.select("words", "label").take(3).foreach(println)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 8
Source File: TokenizerExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
// $example off$

object TokenizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("TokenizerExample")
      .getOrCreate()

    // $example on$
    val sentenceDataFrame = spark.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat")
    )).toDF("id", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")

    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

    val countTokens = udf { (words: Seq[String]) => words.length }

    val tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("sentence", "words")
      .withColumn("tokens", countTokens(col("words"))).show(false)

    val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("sentence", "words")
      .withColumn("tokens", countTokens(col("words"))).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 9
Source File: TokenizerExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object TokenizerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TokenizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val sentenceDataFrame = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

    val tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("words", "label").take(3).foreach(println)

    val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("words", "label").take(3).foreach(println)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 10
Source File: StopWordsRemoverExample.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter11.SparkMachineLearning

import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.StopWordsRemover

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val sentence = spark.createDataFrame(Seq(
      (0, "Tokenization,is the process of enchanting words,from the raw text"),
      (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
      (2, " Here,will provide a sample example on how to tockenize sentences"),
      (3, "This way,you can find all matching occurrences")
    )).toDF("id", "sentence")

    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W+")
      .setGaps(true)

    val countTokens = udf { (words: Seq[String]) => words.length }

    val regexTokenized = regexTokenizer.transform(sentence)

    val remover = new StopWordsRemover()
      .setInputCol("words")
      .setOutputCol("filtered")

    val newDF = remover.transform(regexTokenized)
    newDF.select("id", "filtered").show(false)
  }
}
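As a hedged follow-up (not in the book's example), the remover's word list is configurable: StopWordsRemover ships language-specific defaults that can be extended before transform is called.

// Assumed extension of the example above: start from the built-in English list
// and add a domain-specific word.
val customRemover = new StopWordsRemover()
  .setInputCol("words")
  .setOutputCol("filtered")
  .setStopWords(StopWordsRemover.loadDefaultStopWords("english") ++ Array("tokenization"))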
Example 11
Source File: TockenizerExample.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter11.SparkMachineLearning

import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession

object TockenizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val sentence = spark.createDataFrame(Seq(
      (0, "Tokenization,is the process of enchanting words,from the raw text"),
      (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
      (2, " Here,will provide a sample example on how to tockenize sentences"),
      (3, "This way,you can find all matching occurrences")
    )).toDF("id", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")

    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W+")
      .setGaps(true)

    val countTokens = udf { (words: Seq[String]) => words.length }

    val tokenized = tokenizer.transform(sentence)
    tokenized.select("sentence", "words")
      .withColumn("tokens", countTokens(col("words")))
      .show(false)

    val regexTokenized = regexTokenizer.transform(sentence)
    regexTokenized.select("sentence", "words")
      .withColumn("tokens", countTokens(col("words")))
      .show(false)
  }
}
Example 12
Source File: RegexTokenizer.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.document.{PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.spark.ml.PFATransformer
import org.apache.avro.SchemaBuilder
import org.apache.spark.ml.feature.RegexTokenizer

// TODO missing token count filter and gaps vs tokens
class PFARegexTokenizer(override val sparkTransformer: RegexTokenizer) extends PFATransformer {

  import com.ibm.aardpfark.pfa.dsl._

  private val inputCol = sparkTransformer.getInputCol
  private val outputCol = sparkTransformer.getOutputCol
  private val inputExpr = StringExpr(s"input.${inputCol}")

  private val pattern = sparkTransformer.getPattern
  private val gaps = sparkTransformer.getGaps
  private val minTokenLength = sparkTransformer.getMinTokenLength
  private val toLowerCase = sparkTransformer.getToLowercase

  override def inputSchema = {
    SchemaBuilder.record(withUid(inputBaseName)).fields()
      .name(inputCol).`type`().stringType().noDefault()
      .endRecord()
  }

  override def outputSchema = {
    SchemaBuilder.record(withUid(outputBaseName)).fields()
      .name(outputCol).`type`().array().items().stringType().noDefault()
      .endRecord()
  }

  override def action: PFAExpression = {
    val a = if (toLowerCase) {
      re.split(s.lower(inputExpr), pattern)
    } else {
      re.split(inputExpr, pattern)
    }
    NewRecord(outputSchema, Map(outputCol -> a))
  }

  override def pfa: PFADocument = {
    PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withAction(action)
      .pfa
  }
}
Example 13
Source File: ProcessWord2Vec20.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter12

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, Word2Vec}
import org.apache.spark.sql.SparkSession

object ProcessWord2Vec20 {

  def main(args: Array[String]) {
    val input = "../data/sparkml2/chapter12/pg62.txt"

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Process Word2Vec App")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    //import spark.implicits._

    Logger.getRootLogger.setLevel(Level.WARN)

    val df = spark.read.text(input).toDF("text")

    val tokenizer = new RegexTokenizer()
      .setPattern("\\W+")
      .setToLowercase(true)
      .setMinTokenLength(4)
      .setInputCol("text")
      .setOutputCol("raw")
    val rawWords = tokenizer.transform(df)

    val stopWords = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("terms")
      .setCaseSensitive(false)

    val wordTerms = stopWords.transform(rawWords)
    wordTerms.show(false)

    val word2Vec = new Word2Vec()
      .setInputCol("terms")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(wordTerms)

    val synonyms = model.findSynonyms("martian", 10)
    synonyms.show(false)

    spark.stop()
  }
}
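A hedged sketch (not from the cookbook) of the same three stages chained as a single Pipeline, assuming the tokenizer, stopWords, word2Vec, and df values defined in the example above:

import org.apache.spark.ml.Pipeline

// Chain RegexTokenizer -> StopWordsRemover -> Word2Vec into one estimator,
// so the whole preprocessing-plus-embedding flow is fit and applied together.
val pipeline = new Pipeline().setStages(Array(tokenizer, stopWords, word2Vec))
val pipelineModel = pipeline.fit(df)
pipelineModel.transform(df).select("result").show(false)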