org.apache.spark.ml.feature.Word2Vec Scala Examples

The following examples show how to use org.apache.spark.ml.feature.Word2Vec. Each example notes its source file, the project it comes from, and that project's license.
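Most of the examples below share one core pattern: build a DataFrame whose rows are arrays of tokens, fit a Word2Vec estimator to obtain a Word2VecModel, then transform the DataFrame to append a document-vector column (the average of the word vectors). The minimal sketch below summarizes that pattern up front; the object name, column names, and toy sentences are illustrative and not taken from any of the projects.

import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.sql.SparkSession

object Word2VecQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("Word2VecQuickStart")
      .master("local[*]")
      .getOrCreate()

    // Each row is a bag of words (Array[String]).
    val docs = spark.createDataFrame(Seq(
      Tuple1("the quick brown fox".split(" ")),
      Tuple1("the lazy dog".split(" "))
    )).toDF("text")

    // Estimator: configure, then fit to obtain a Word2VecModel.
    val word2Vec = new Word2Vec()
      .setInputCol("text")    // column of Seq[String]
      .setOutputCol("result") // column of Vector: the average of the word vectors
      .setVectorSize(3)       // embedding dimension
      .setMinCount(0)         // keep words that occur only once (toy data only)
    val model = word2Vec.fit(docs)

    // Transformer: append the document-vector column.
    model.transform(docs).show(false)

    // The model also exposes the learned vocabulary and nearest neighbours.
    model.getVectors.show(false)
    model.findSynonyms("fox", 2).show(false)

    spark.stop()
  }
}

In real use setMinCount is usually left at its default of 5 and setVectorSize raised to something like 100; the tiny values here and in the examples below just keep toy output readable.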
Example 1
Source File: Word2VecExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
// $example off$
import org.apache.spark.sql.SparkSession

object Word2VecExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("Word2Vec example")
      .getOrCreate()

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = spark.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)

    val result = model.transform(documentDF)
    result.collect().foreach { case Row(text: Seq[_], features: Vector) =>
      println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n") }
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 2
Source File: Word2VecWrapper.scala    From automl   with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne}
import com.tencent.angel.spark.automl.feature.TransformerWrapper
import org.apache.spark.ml.feature.Word2Vec

class Word2VecWrapper extends TransformerWrapper {

  override val transformer = new Word2Vec()
  override var parent: TransformerWrapper = _

  override val requiredInputCols: Array[String] = Array("sentences")
  override val requiredOutputCols: Array[String] = Array("outWord2Vec")

  override val hasMultiInputs: Boolean = false
  override val hasMultiOutputs: Boolean = false
  override val needAncestorInputs: Boolean = false

  override val relation: InToOutRelation = OneToOne

  override def declareInAndOut(): this.type = {
    transformer.setInputCol(getInputCols(0))
    transformer.setOutputCol(getOutputCols(0))
    this
  }
} 
Example 3
Source File: WordToVectorParitySpec.scala    From mleap   with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{Tokenizer, Word2Vec}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql._


class WordToVectorParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new Tokenizer().
    setInputCol("loan_title").
    setOutputCol("loan_title_tokens"),
    new Word2Vec(uid = "words").
      setInputCol("loan_title_tokens").
      setOutputCol("loan_title_token_counts"))).fit(dataset)

  override val unserializedParams = Set("seed")
} 
Example 4
Source File: Word2VecMl.scala    From Machine-Learning-with-Spark-Second-Edition   with MIT License
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.sql.SparkSession

object Word2VecMl {
  case class Record(name: String)

  def main(args: Array[String]) {
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder
      .appName("Word2Vec Sample").config(spConfig)
      .getOrCreate()

    import spark.implicits._

    // wholeTextFiles yields an RDD of (path, content) pairs, one per file.
    val rawRDD = spark.sparkContext
      .wholeTextFiles("./data/20news-bydate-train/alt.atheism/*")

    // Keep printable characters and drop '(' characters from each document.
    val temp = rawRDD.map { case (_, content) =>
      content.filter(_ >= ' ').filter(_ != '(')
    }

    val textDF = temp.map(x => x.split(" ")).map(Tuple1.apply)
      .toDF("text")
    println(textDF.first())
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(textDF)
    val result = model.transform(textDF)
    result.select("result").take(3).foreach(println)
    val ds = model.findSynonyms("philosophers", 5).select("word")
    ds.rdd.saveAsTextFile("./output/alien-synonyms" +  System.nanoTime())
    ds.show()
    spark.stop()
  }
} 
Example 5
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.sql.SparkSession

object Word2VecExample {
  def main(args: Array[String]) {
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder
      .appName("Word2Vec example").config(spConfig)
      .getOrCreate()

    // Without toDF() the single tuple column keeps its default name "_1".
    val documentDF1 = spark.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply))
    documentDF1.show(1)

    val documentDF = spark.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")


    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)
    val result = model.transform(documentDF)
    result.select("result").take(3).foreach(println)

    spark.stop()
  }
}
Example 6
Source File: Word2VecExample.scala    From sparkoscope   with Apache License 2.0
Code identical to Example 1: sparkoscope ships the upstream Spark Word2VecExample.scala unchanged.
Example 7
Source File: Word2VecSpec.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.ml

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.base.TestBase
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class Word2VecSpec extends TestBase {

  def genTokenizedText(): DataFrame = {
    session.createDataFrame(Seq(
      (0, Array("I", "walked", "the", "dog", "down", "the", "street")),
      (1, Array("I", "walked", "with", "the", "dog")),
      (2, Array("I", "walked", "the", "pup"))
    )).toDF("label", "words")
  }

  def genW2V(): Word2Vec = new Word2Vec().setSeed(1234).setMinCount(0)

  test("operation on tokenized strings") {
    val df = genTokenizedText()

    val df2 = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df).transform(df)

    val lines = df2.getDVCol("features")
    assert(lines.forall(_.size == 2))
  }

  test("return vectors") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df)
    val vectors = model.getVectors.getDVCol("vector")
    assert(vectors(0).size == 2)
  }

  test("return synonyms") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df)
    val synonyms = model.findSynonyms("dog", 2).getColAs[String]("word")
    assert(synonyms.length === 2)
  }

  test("raise an error when applied to a null array") {
    val tokenDataFrame = session.createDataFrame(Seq(
      (0, Some(Array("Hi", "I", "can", "not", "foo"))),
      (1, None))
    ).toDF("label", "tokens")
    assertSparkException[org.apache.spark.SparkException](genW2V().setInputCol("tokens"), tokenDataFrame)
  }

  test("raise an error when given strange values of parameters") {
    def base(): Word2Vec = genW2V().setInputCol("words")
    def assertIllegalArgument[T](f: T => Any, args: T*): Unit =
      args.foreach { n => interceptWithoutLogging[IllegalArgumentException] { f(n) } }
    assertIllegalArgument[Int](base.setMinCount,             -1, -10)
    assertIllegalArgument[Int](base.setMaxIter,              -1, -10)
    assertIllegalArgument[Int](base.setVectorSize,        0, -1, -10)
    assertIllegalArgument[Int](base.setWindowSize,        0, -1, -10)
    assertIllegalArgument[Int](base.setMaxSentenceLength, 0, -1, -10)
    assertIllegalArgument[Int](base.setNumPartitions,     0, -1, -10)
    assertIllegalArgument[Double](base.setStepSize, 0.0, -1.0, -10.0)
  }

  test("return a vector of zeros when it encounters an OOV word") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2).setMinCount(1).setInputCol("words").setOutputCol("features").fit(df)
    val df2 = session.createDataFrame(Seq(
      (0, Array("ketchup")))).toDF("label", "words")
    val results = model.transform(df2)
    val lines = results.getDVCol("features")
    val trueLines = List(new DenseVector(Array(0.0, 0.0)))
    assert(lines === trueLines)
  }

  test("be able to set vector size") {
    val df = genTokenizedText()
    val vectorSizes = List(1, 10, 100)
    vectorSizes.foreach { n =>
      val results =
          genW2V().setVectorSize(n)
            .setInputCol("words").setOutputCol("features").fit(df).transform(df)
            .getDVCol("features")
        assert(results(0).size === n)
    }
  }

} 
Example 8
Source File: Word2VecExample.scala    From multi-tenancy-spark   with Apache License 2.0
Code identical to Example 1: multi-tenancy-spark ships the upstream Spark Word2VecExample.scala unchanged.
Example 9
Source File: Word2VecExample.scala    From spark1.52   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Word2Vec

// $example off$
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object Word2VecExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Word2Vec example")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = sqlContext.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)
    val result = model.transform(documentDF)
    result.show()
    result.select("result").take(3).foreach(println)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 10
Source File: Word2VecExample.scala    From Spark-2.3.1   with Apache License 2.0
Code identical to Example 1: Spark-2.3.1 ships the same Word2VecExample.scala unchanged.
Example 11
Source File: Word2VecExample.scala    From BigDatalog   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Word2Vec
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object Word2VecExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Word2Vec example")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = sqlContext.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)
    val result = model.transform(documentDF)
    result.select("result").take(3).foreach(println)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 12
package spark.ml.cookbook.chapter12

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, Word2Vec}
import org.apache.spark.sql.SparkSession

object ProcessWord2Vec20 {

  def main(args: Array[String]) {

    val input = "../data/sparkml2/chapter12/pg62.txt"

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Process Word2Vec  App")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    //import spark.implicits._

    Logger.getRootLogger.setLevel(Level.WARN)

    val df = spark.read.text(input).toDF("text")

    val tokenizer = new RegexTokenizer()
      .setPattern("\\W+")
      .setToLowercase(true)
      .setMinTokenLength(4)
      .setInputCol("text")
      .setOutputCol("raw")
    val rawWords = tokenizer.transform(df)

    val stopWords = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("terms")
      .setCaseSensitive(false)

    val wordTerms = stopWords.transform(rawWords)

    wordTerms.show(false)

    val word2Vec = new Word2Vec()
      .setInputCol("terms")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(wordTerms)

    val synonyms = model.findSynonyms("martian", 10)

    synonyms.show(false)

    spark.stop()
  }
}