org.apache.spark.ml.feature.RegexTokenizer Scala Examples

The following examples show how to use org.apache.spark.ml.feature.RegexTokenizer, collected from several open-source projects; the source project and file are noted above each example.
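Before diving in, it helps to know that RegexTokenizer works in one of two modes: in gaps mode (the default) the pattern is treated as a delimiter to split on, while with setGaps(false) the pattern describes the tokens themselves and every match is extracted. A minimal sketch of the two modes, assuming a live SparkSession named spark (the data here is illustrative, not taken from the examples below):

import org.apache.spark.ml.feature.RegexTokenizer

val df = spark.createDataFrame(Seq((0, "Logistic,regression,models")))
  .toDF("id", "sentence")

// Gaps mode: "\\W" is the delimiter, so the text is split on non-word characters.
val splitter = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setPattern("\\W") // setGaps(true) is the default

// Match mode: "\\w+" describes the tokens, and every match becomes one.
val matcher = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setPattern("\\w+")
  .setGaps(false)

// Both yield [logistic, regression, models]; the output is lowercased
// because toLowercase defaults to true.
splitter.transform(df).select("words").show(false)
matcher.transform(df).select("words").show(false)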
Example 1
Source File: TokenizerExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
import org.apache.spark.sql.functions._
// $example off$
import org.apache.spark.sql.SparkSession

object TokenizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("TokenizerExample")
      .getOrCreate()

    // $example on$
    val sentenceDataFrame = spark.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat")
    )).toDF("id", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

    val countTokens = udf { (words: Seq[String]) => words.length }

    val tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("sentence", "words")
        .withColumn("tokens", countTokens(col("words"))).show(false)

    val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("sentence", "words")
        .withColumn("tokens", countTokens(col("words"))).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
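For reference, the two transformers differ only on the comma-separated row: Tokenizer splits on whitespace alone, so "Logistic,regression,models,are,neat" remains a single token, while RegexTokenizer with pattern "\W" splits it into five. The token counts printed are 5, 7, 1 for Tokenizer and 5, 7, 5 for RegexTokenizer.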
Example 2
Source File: LocalRegexTokenizer.scala    From spark-ml-serving   with Apache License 2.0
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.feature.RegexTokenizer

class LocalRegexTokenizer(val sparkTransformer: RegexTokenizer)
  extends LocalTransformer[RegexTokenizer] {

  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(x) =>
        val newData = x.data.map { d =>
          val originStr = d.toString
          val re        = sparkTransformer.getPattern.r
          val str       = if (sparkTransformer.getToLowercase) originStr.toLowerCase() else originStr
          val tokens =
            if (sparkTransformer.getGaps) re.split(str).toSeq else re.findAllIn(str).toSeq
          val minLength = sparkTransformer.getMinTokenLength
          tokens.filter(_.length >= minLength).toList
        }
        localData.withColumn(
          LocalDataColumn(
            sparkTransformer.getOutputCol,
            newData
          )
        )
      case None => localData
    }
  }
}

object LocalRegexTokenizer
  extends SimpleModelLoader[RegexTokenizer]
  with TypedTransformerConverter[RegexTokenizer] {

  override def build(metadata: Metadata, data: LocalData): RegexTokenizer = {
    new RegexTokenizer(metadata.uid)
      .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
      .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
      .setPattern(metadata.paramMap("pattern").asInstanceOf[String])
      .setGaps(metadata.paramMap("gaps").asInstanceOf[Boolean])
      .setMinTokenLength(metadata.paramMap("minTokenLength").asInstanceOf[Number].intValue())
      .setToLowercase(metadata.paramMap("toLowercase").asInstanceOf[Boolean])
  }

  override implicit def toLocal(transformer: RegexTokenizer) =
    new LocalRegexTokenizer(transformer)
} 
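The core of this reimplementation is the branch on getGaps, which mirrors RegexTokenizer's two modes using plain scala.util.matching.Regex calls. A standalone sketch of that logic (the strings here are illustrative):

import scala.util.matching.Regex

val text = "hi there, world"

// gaps = true: the pattern is the separator, so split on it.
val sep: Regex = "\\W+".r
sep.split(text).toSeq            // Seq("hi", "there", "world")

// gaps = false: the pattern describes tokens, so collect every match.
val word: Regex = "\\w+".r
word.findAllIn(text).toSeq       // Seq("hi", "there", "world")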
Example 3
Source File: RegexTokenizerOp.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.RegexTokenizer

class RegexTokenizerOp extends SimpleSparkOp[RegexTokenizer] {
  override val Model: OpModel[SparkBundleContext, RegexTokenizer] = new OpModel[SparkBundleContext, RegexTokenizer] {

    val RegexIdentifier = "regex"
    val MatchGapsIdentifier = "match_gaps"
    val MinTokenLengthIdentifer = "token_min_length"
    val LowercaseText = "lowercase_text"

    override val klazz: Class[RegexTokenizer] = classOf[RegexTokenizer]

    override def opName: String = Bundle.BuiltinOps.feature.regex_tokenizer

    override def store(model: Model, obj: RegexTokenizer)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {

      model
        .withValue(RegexIdentifier, Value.string(obj.getPattern))
        .withValue(MatchGapsIdentifier, Value.boolean(obj.getGaps))
        .withValue(MinTokenLengthIdentifer, Value.int(obj.getMinTokenLength))
        .withValue(LowercaseText, Value.boolean(obj.getToLowercase))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): RegexTokenizer = {

      new RegexTokenizer(uid = "")
        .setPattern(model.value(RegexIdentifier).getString)
        .setGaps(model.value(MatchGapsIdentifier).getBoolean)
        .setMinTokenLength(model.value(MinTokenLengthIdentifer).getInt)
        .setToLowercase(model.value(LowercaseText).getBoolean)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: RegexTokenizer): RegexTokenizer = {
    new RegexTokenizer(uid = uid)
    .setPattern(model.getPattern)
    .setGaps(model.getGaps)
    .setMinTokenLength(model.getMinTokenLength)
    .setToLowercase(model.getToLowercase)
  }

  override def sparkInputs(obj: RegexTokenizer): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: RegexTokenizer): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
} 
Example 4
Source File: RegexTokenizerParitySpec.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.sql.DataFrame

class RegexTokenizerParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")
  override val sparkTransformer: Transformer = new RegexTokenizer()
    .setInputCol("loan_title")
    .setOutputCol("loan_title_tokens")
    .setGaps(true)
    .setToLowercase(true)
    .setMinTokenLength(2)
    .setPattern("\\s")
} 
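To make this parity configuration concrete, here is a minimal sketch of what it produces on hypothetical loan_title values (assuming a live SparkSession named spark):

val df = spark.createDataFrame(Seq(
  (0, "Debt consolidation"),
  (1, "Pay off a credit card")
)).toDF("id", "loan_title")

// Pattern "\\s" with gaps = true splits on single whitespace characters;
// minTokenLength = 2 then drops one-character tokens such as "a".
sparkTransformer.transform(df).select("loan_title_tokens").show(false)
// [debt, consolidation]
// [pay, off, credit, card]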
Example 5
Source File: TokenizerExample.scala    From sparkoscope   with Apache License 2.0
The code is identical to Example 1.
Example 6
Source File: TokenizerExample.scala    From multi-tenancy-spark   with Apache License 2.0
The code is identical to Example 1.
Example 7
Source File: TokenizerExample.scala    From spark1.52   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SQLContext}

object TokenizerExample {
  def main(args: Array[String]): Unit = {
    // NOTE: the middle of this listing was missing from the original page;
    // it is reconstructed here along the lines of the standard Spark 1.x
    // TokenizerExample (compare Example 9).
    val conf = new SparkConf().setAppName("TokenizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val sentenceDataFrame = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat")
    )).toDF("label", "sentence")

    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

    val regexTokenized: DataFrame = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.show()
    regexTokenized.select("words", "label").take(3).foreach(println)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 8
Source File: TokenizerExample.scala    From Spark-2.3.1   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
// $example off$

object TokenizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("TokenizerExample")
      .getOrCreate()

    // $example on$
    val sentenceDataFrame = spark.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat")
    )).toDF("id", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

    val countTokens = udf { (words: Seq[String]) => words.length }

    val tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("sentence", "words")
        .withColumn("tokens", countTokens(col("words"))).show(false)

    val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("sentence", "words")
        .withColumn("tokens", countTokens(col("words"))).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 9
Source File: TokenizerExample.scala    From BigDatalog   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object TokenizerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TokenizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val sentenceDataFrame = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

    val tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("words", "label").take(3).foreach(println)
    val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("words", "label").take(3).foreach(println)
    // $example off$
    sc.stop()
  }
}
// scalastyle:on println 
Example 10
Source File: StopWordsRemoverExample.scala    From Scala-and-Spark-for-Big-Data-Analytics   with MIT License
package com.chapter11.SparkMachineLearning

import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.StopWordsRemover

object StopWordsRemoverExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val sentence = spark.createDataFrame(Seq(
      (0, "Tokenization,is the process of enchanting words,from the raw text"),
      (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
      (2, " Here,will provide a sample example on how to tockenize sentences"),
      (3, "This way,you can find all matching occurrences"))).toDF("id", "sentence")

    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W+")
      .setGaps(true)

    val countTokens = udf { (words: Seq[String]) => words.length }
    val regexTokenized = regexTokenizer.transform(sentence)

    val remover = new StopWordsRemover()
      .setInputCol("words")
      .setOutputCol("filtered")

    val newDF = remover.transform(regexTokenized)
    newDF.select("id", "filtered").show(false)

  }
} 
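The tokenizer and remover above can also be chained into a single Pipeline, which keeps the two stages together as one reusable unit. A minimal sketch reusing the stages defined in the example:

import org.apache.spark.ml.Pipeline

val pipeline = new Pipeline().setStages(Array(regexTokenizer, remover))
// fit is effectively a no-op here since both stages are plain transformers,
// but it yields a PipelineModel that can be saved and reloaded.
val model = pipeline.fit(sentence)
model.transform(sentence).select("id", "filtered").show(false)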
Example 11
Source File: TockenizerExample.scala    From Scala-and-Spark-for-Big-Data-Analytics   with MIT License
package com.chapter11.SparkMachineLearning
import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession

object TockenizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val sentence = spark.createDataFrame(Seq(
      (0, "Tokenization,is the process of enchanting words,from the raw text"),
      (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
      (2, " Here,will provide a sample example on how to tockenize sentences"),
      (3, "This way,you can find all matching occurrences"))).toDF("id", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W+")
      .setGaps(true)

    val countTokens = udf { (words: Seq[String]) => words.length }

    val tokenized = tokenizer.transform(sentence)
    
    tokenized.select("sentence", "words")
            .withColumn("tokens", countTokens(col("words")))
            .show(false)

    val regexTokenized = regexTokenizer.transform(sentence)
    
    regexTokenized.select("sentence", "words")   
                .withColumn("tokens", countTokens(col("words")))
                .show(false)
  }
} 
Example 12
Source File: RegexTokenizer.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.document.{PFABuilder, PFADocument}
import com.ibm.aardpfark.pfa.expression.PFAExpression
import com.ibm.aardpfark.spark.ml.PFATransformer
import org.apache.avro.SchemaBuilder

import org.apache.spark.ml.feature.RegexTokenizer


// TODO missing token count filter and gaps vs tokens
class PFARegexTokenizer(override val sparkTransformer: RegexTokenizer) extends PFATransformer {
  import com.ibm.aardpfark.pfa.dsl._

  private val inputCol = sparkTransformer.getInputCol
  private val outputCol = sparkTransformer.getOutputCol
  private val inputExpr = StringExpr(s"input.${inputCol}")

  private val pattern = sparkTransformer.getPattern
  private val gaps = sparkTransformer.getGaps
  private val minTokenLength = sparkTransformer.getMinTokenLength
  private val toLowerCase = sparkTransformer.getToLowercase

  override def inputSchema = {
    SchemaBuilder.record(withUid(inputBaseName)).fields()
      .name(inputCol).`type`().stringType().noDefault()
      .endRecord()
  }

  override def outputSchema = {
    SchemaBuilder.record(withUid(outputBaseName)).fields()
      .name(outputCol).`type`().array().items().stringType().noDefault()
      .endRecord()
  }

  override def action: PFAExpression = {
    val a = if (toLowerCase) {
      re.split(s.lower(inputExpr), pattern)
    } else {
      re.split(inputExpr, pattern)
    }
    NewRecord(outputSchema, Map(outputCol -> a))
  }

  override def pfa: PFADocument = {
    PFABuilder()
      .withName(sparkTransformer.uid)
      .withMetadata(getMetadata)
      .withInput(inputSchema)
      .withOutput(outputSchema)
      .withAction(action)
      .pfa
  }
} 
Example 13
package spark.ml.cookbook.chapter12

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, Word2Vec}
import org.apache.spark.sql.SparkSession

object ProcessWord2Vec20 {

  def main(args: Array[String]) {

    val input = "../data/sparkml2/chapter12/pg62.txt"

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Process Word2Vec  App")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    //import spark.implicits._

    Logger.getRootLogger.setLevel(Level.WARN)

    val df = spark.read.text(input).toDF("text")

    val tokenizer = new RegexTokenizer()
      .setPattern("\\W+")
      .setToLowercase(true)
      .setMinTokenLength(4)
      .setInputCol("text")
      .setOutputCol("raw")
    val rawWords = tokenizer.transform(df)

    val stopWords = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("terms")
      .setCaseSensitive(false)

    val wordTerms = stopWords.transform(rawWords)

    wordTerms.show(false)

    val word2Vec = new Word2Vec()
      .setInputCol("terms")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(wordTerms)

    val synonyms = model.findSynonyms("martian", 10)

    synonyms.show(false)

    spark.stop()
  }
}
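findSynonyms returns a DataFrame with word and similarity columns. A short sketch, assuming the model trained above, of pulling the results onto the driver:

import org.apache.spark.sql.Row

model.findSynonyms("martian", 10).collect().foreach {
  case Row(word: String, similarity: Double) =>
    println(f"$word%-15s $similarity%.4f")
}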