org.apache.spark.ml.feature.Word2Vec Scala Examples
The following examples show how to use org.apache.spark.ml.feature.Word2Vec.
The original project and source file are noted above each example.
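In Spark ML, Word2Vec is an Estimator: it takes a column of token sequences, trains a Word2VecModel that maps each word to a fixed-size vector, and the fitted model transforms each document into the average of its word vectors. Before the project examples, here is a minimal sketch of that typical pattern (the object, application, and column names are illustrative, not taken from any project below):

import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.sql.SparkSession

object Word2VecSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("Word2VecSketch").getOrCreate()

    // One row per document; each row is an array of tokens.
    val docs = spark.createDataFrame(Seq(
      Tuple1("spark is fast".split(" ")),
      Tuple1("spark uses word2vec".split(" "))
    )).toDF("tokens")

    val model = new Word2Vec()
      .setInputCol("tokens")    // column of Seq[String]
      .setOutputCol("vector")   // averaged word vectors, one per row
      .setVectorSize(3)
      .setMinCount(0)           // keep every token, even singletons
      .fit(docs)

    model.transform(docs).show(truncate = false)
    model.findSynonyms("spark", 2).show()  // nearest words by cosine similarity
    spark.stop()
  }
}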
Example 1
Source File: Word2VecExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
// $example off$
import org.apache.spark.sql.SparkSession

object Word2VecExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("Word2Vec example")
      .getOrCreate()

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = spark.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)

    val result = model.transform(documentDF)
    result.collect().foreach { case Row(text: Seq[_], features: Vector) =>
      println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n")
    }
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
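Note that the output column holds one vector per row: Word2VecModel averages the vectors of all words in a document, so short and long documents produce vectors of the same size (here 3). setMinCount(0) keeps every token in the vocabulary, which matters for toy data like this; the default of 5 would discard nearly all of these words, since most appear only once.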
Example 2
Source File: Word2VecWrapper.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature.preprocess

import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne}
import com.tencent.angel.spark.automl.feature.TransformerWrapper
import org.apache.spark.ml.feature.Word2Vec

class Word2VecWrapper extends TransformerWrapper {

  override val transformer = new Word2Vec()
  override var parent: TransformerWrapper = _

  override val requiredInputCols: Array[String] = Array("sentences")
  override val requiredOutputCols: Array[String] = Array("outWord2Vec")

  override val hasMultiInputs: Boolean = false
  override val hasMultiOutputs: Boolean = false
  override val needAncestorInputs: Boolean = false

  override val relation: InToOutRelation = OneToOne

  override def declareInAndOut(): this.type = {
    transformer.setInputCol(getInputCols(0))
    transformer.setOutputCol(getOutputCols(0))
    this
  }
}
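The wrapper here is an adapter: it declares the columns the transformer requires ("sentences" in, "outWord2Vec" out) and its one-to-one input/output relation so that the surrounding AutoML framework can wire transformers together automatically; declareInAndOut() then copies the resolved column names onto the underlying Word2Vec instance.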
Example 3
Source File: WordToVectorParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.feature.{Tokenizer, Word2Vec}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._

class WordToVectorParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("loan_title")
  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new Tokenizer().
      setInputCol("loan_title").
      setOutputCol("loan_title_tokens"),
    new Word2Vec(uid = "words").
      setInputCol("loan_title_tokens").
      setOutputCol("loan_title_token_counts"))).fit(dataset)

  override val unserializedParams = Set("seed")
}
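unserializedParams = Set("seed") excludes the random seed from the serialization round-trip comparison, presumably because the seed only influences training and is not needed to score with the fitted model.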
Example 4
Source File: Word2VecMl.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.sql.SparkSession

object Word2VecMl {

  case class Record(name: String)

  def main(args: Array[String]) {
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder
      .appName("Word2Vec Sample").config(spConfig)
      .getOrCreate()
    import spark.implicits._

    val rawDF = spark.sparkContext
      .wholeTextFiles("./data/20news-bydate-train/alt.atheism/*")
    // Keep printable characters and drop '(' characters from each file body.
    val temp = rawDF.map(x => {
      (x._2.filter(_ >= ' ').filter(!_.toString.startsWith("(")))
    })
    val textDF = temp.map(x => x.split(" ")).map(Tuple1.apply)
      .toDF("text")
    print(textDF.first())

    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(textDF)

    val result = model.transform(textDF)
    result.select("result").take(3).foreach(println)

    val ds = model.findSynonyms("philosophers", 5).select("word")
    ds.rdd.saveAsTextFile("./output/alien-synonyms" + System.nanoTime())
    ds.show()

    spark.stop()
  }
}
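One caveat worth knowing: findSynonyms looks the query word up in the trained vocabulary and throws if it is absent, so this example only runs cleanly when "philosophers" actually occurs in the training text; when the vocabulary is not known in advance, guard the call or inspect model.getVectors first.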
Example 5
Source File: Word2VecExample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.Word2Vec
// $example off$
import org.apache.spark.sql.SparkSession

object Word2VecExample {
  def main(args: Array[String]) {
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder
      .appName("Word2Vec example").config(spConfig)
      .getOrCreate()

    val documentDF1 = spark.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply))
    documentDF1.show(1)

    val documentDF = spark.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)

    val result = model.transform(documentDF)
    result.select("result").take(3).foreach(println)
    // $example off$
    spark.stop()
  }
}
// scalastyle:on println
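The first DataFrame is built without .toDF, so documentDF1.show(1) displays the default column name _1; the second copy renames the column to text, which is what setInputCol("text") expects. That renaming is the only difference between the two.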
Example 6
Source File: Word2VecExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
// $example off$
import org.apache.spark.sql.SparkSession

object Word2VecExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("Word2Vec example")
      .getOrCreate()

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = spark.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)

    val result = model.transform(documentDF)
    result.collect().foreach { case Row(text: Seq[_], features: Vector) =>
      println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n")
    }
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 7
Source File: Word2VecSpec.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.ml

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.base.TestBase
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class Word2VecSpec extends TestBase {

  def genTokenizedText(): DataFrame = {
    session.createDataFrame(Seq(
      (0, Array("I", "walked", "the", "dog", "down", "the", "street")),
      (1, Array("I", "walked", "with", "the", "dog")),
      (2, Array("I", "walked", "the", "pup"))
    )).toDF("label", "words")
  }

  def genW2V(): Word2Vec = new Word2Vec().setSeed(1234).setMinCount(0)

  test("operation on tokenized strings") {
    val df = genTokenizedText()
    val df2 = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df).transform(df)
    val lines = df2.getDVCol("features")
    assert(lines.forall(_.size == 2))
  }

  test("return vectors") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df)
    val vectors = model.getVectors.getDVCol("vector")
    assert(vectors(0).size == 2)
  }

  test("return synonyms") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df)
    val synonyms = model.findSynonyms("dog", 2).getColAs[String]("word")
    assert(synonyms.length === 2)
  }

  test("raise an error when applied to a null array") {
    val tokenDataFrame = session.createDataFrame(Seq(
      (0, Some(Array("Hi", "I", "can", "not", "foo"))),
      (1, None))
    ).toDF("label", "tokens")
    assertSparkException[org.apache.spark.SparkException](genW2V().setInputCol("tokens"), tokenDataFrame)
  }

  test("raise an error when given strange values of parameters") {
    def base(): Word2Vec = genW2V().setInputCol("words")
    def assertIllegalArgument[T](f: T => Any, args: T*): Unit = args.foreach {
      n => interceptWithoutLogging[IllegalArgumentException] { f(n) }
    }
    assertIllegalArgument[Int](base.setMinCount, -1, -10)
    assertIllegalArgument[Int](base.setMaxIter, -1, -10)
    assertIllegalArgument[Int](base.setVectorSize, 0, -1, -10)
    assertIllegalArgument[Int](base.setWindowSize, 0, -1, -10)
    assertIllegalArgument[Int](base.setMaxSentenceLength, 0, -1, -10)
    assertIllegalArgument[Int](base.setNumPartitions, 0, -1, -10)
    assertIllegalArgument[Double](base.setStepSize, 0.0, -1.0, -10.0)
  }

  test("return a vector of zeros when it encounters an OOV word") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2).setMinCount(1)
      .setInputCol("words").setOutputCol("features").fit(df)
    val df2 = session.createDataFrame(Seq(
      (0, Array("ketchup")))).toDF("label", "words")
    val results = model.transform(df2)
    val lines = results.getDVCol("features")
    val trueLines = List(new DenseVector(Array(0.0, 0.0)))
    assert(lines === trueLines)
  }

  test("be able to set vector size") {
    val df = genTokenizedText()
    val vectorSizes = List(1, 10, 100)
    vectorSizes.foreach { n =>
      val results = genW2V().setVectorSize(n)
        .setInputCol("words").setOutputCol("features").fit(df).transform(df)
        .getDVCol("features")
      assert(results(0).size === n)
    }
  }
}
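The OOV test documents a useful subtlety: the word "ketchup" never enters the trained vocabulary, and because Word2VecModel averages the vectors of known words only, a document consisting entirely of unknown words transforms to the zero vector rather than raising an error.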
Example 8
Source File: Word2VecExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
// $example off$
import org.apache.spark.sql.SparkSession

object Word2VecExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("Word2Vec example")
      .getOrCreate()

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = spark.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)

    val result = model.transform(documentDF)
    result.collect().foreach { case Row(text: Seq[_], features: Vector) =>
      println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n")
    }
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 9
Source File: Word2VecExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Word2Vec
// $example off$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

object Word2VecExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Word2Vec example")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = sqlContext.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)
    val result = model.transform(documentDF)
    result.show()
    result.select("result").take(3).foreach(println)
    // $example off$
    sc.stop()
  }
}
// scalastyle:on println
Example 10
Source File: Word2VecExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
// $example off$
import org.apache.spark.sql.SparkSession

object Word2VecExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("Word2Vec example")
      .getOrCreate()

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = spark.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)

    val result = model.transform(documentDF)
    result.collect().foreach { case Row(text: Seq[_], features: Vector) =>
      println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n")
    }
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 11
Source File: Word2VecExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Word2Vec
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object Word2VecExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Word2Vec example")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = sqlContext.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)
    val result = model.transform(documentDF)
    result.select("result").take(3).foreach(println)
    // $example off$
  }
}
// scalastyle:on println
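This variant targets the pre-2.0 API: SQLContext was the entry point for DataFrame functionality before SparkSession subsumed it in Spark 2.0. The Word2Vec code itself is unchanged.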
Example 12
Source File: ProcessWord2Vec20.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter12

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, Word2Vec}
import org.apache.spark.sql.SparkSession

object ProcessWord2Vec20 {

  def main(args: Array[String]) {
    val input = "../data/sparkml2/chapter12/pg62.txt"

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Process Word2Vec App")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()
    //import spark.implicits._

    Logger.getRootLogger.setLevel(Level.WARN)

    val df = spark.read.text(input).toDF("text")

    val tokenizer = new RegexTokenizer()
      .setPattern("\\W+")
      .setToLowercase(true)
      .setMinTokenLength(4)
      .setInputCol("text")
      .setOutputCol("raw")
    val rawWords = tokenizer.transform(df)

    val stopWords = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("terms")
      .setCaseSensitive(false)
    val wordTerms = stopWords.transform(rawWords)
    wordTerms.show(false)

    val word2Vec = new Word2Vec()
      .setInputCol("terms")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(wordTerms)

    val synonyms = model.findSynonyms("martian", 10)
    synonyms.show(false)

    spark.stop()
  }
}
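The three stages above are applied by hand; the same tokenize, stop-word removal, and embedding flow can also be expressed as a single org.apache.spark.ml.Pipeline, which makes the fitted stages easy to persist and reuse. A sketch under the same column names (the pipeline variable names are illustrative, not from the cookbook source):

import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, Word2Vec}

// Same tokenize -> stop-word removal -> embedding flow as one Pipeline.
val pipeline = new Pipeline().setStages(Array[PipelineStage](
  new RegexTokenizer()
    .setPattern("\\W+").setToLowercase(true).setMinTokenLength(4)
    .setInputCol("text").setOutputCol("raw"),
  new StopWordsRemover()
    .setInputCol("raw").setOutputCol("terms").setCaseSensitive(false),
  new Word2Vec()
    .setInputCol("terms").setOutputCol("result")
    .setVectorSize(3).setMinCount(0)
))
val pipelineModel = pipeline.fit(df)          // df: the DataFrame read above
val embedded = pipelineModel.transform(df).select("result")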