org.apache.spark.mllib.feature.Word2Vec Scala Examples
The following examples show how to use org.apache.spark.mllib.feature.Word2Vec, the RDD-based Word2Vec implementation in Spark's older mllib package.
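Before the per-project listings, here is a minimal, self-contained sketch of the API all of them share. The corpus path and the tuning calls (setVectorSize, setMinCount, setSeed) are illustrative choices, not taken from any example below:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}

object Word2VecSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("Word2VecSketch").setMaster("local[2]"))
    // Word2Vec.fit expects an RDD of token sequences, e.g. RDD[Seq[String]].
    val sentences = sc.textFile("data/corpus.txt").map(_.split(" ").toSeq) // hypothetical path
    val model: Word2VecModel = new Word2Vec()
      .setVectorSize(100) // embedding dimension (100 is the library default)
      .setMinCount(5)     // ignore words seen fewer than 5 times (the default)
      .setSeed(42L)       // fix the seed for repeatable runs
      .fit(sentences)
    // findSynonyms returns the requested number of nearest words with cosine similarities.
    model.findSynonyms("spark", 5).foreach { case (word, sim) => println(s"$word $sim") }
    sc.stop()
  }
}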
Example 1
Source File: Word2VecExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
// $example off$

object Word2VecExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Word2VecExample")
    val sc = new SparkContext(conf)

    // $example on$
    val input = sc.textFile("data/mllib/sample_lda_data.txt").map(line => line.split(" ").toSeq)

    val word2vec = new Word2Vec()

    val model = word2vec.fit(input)

    val synonyms = model.findSynonyms("1", 5)

    for ((synonym, cosineSimilarity) <- synonyms) {
      println(s"$synonym $cosineSimilarity")
    }

    // Save and load model
    model.save(sc, "myModelPath")
    val sameModel = Word2VecModel.load(sc, "myModelPath")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
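Beyond findSynonyms, a saved-and-reloaded model can map a single word to its vector with transform. Querying a word that is not in the vocabulary throws, so this sketch guards first (the token "1" is taken from the example's LDA sample data). Note also that model.save typically fails if myModelPath already exists from an earlier run, so delete the directory between runs:

if (sameModel.getVectors.contains("1")) {
  val vec = sameModel.transform("1") // an mllib Vector of length vectorSize (100 by default)
  println(vec.size)
}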
Example 2
Source File: explore_movies.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.exploredataset

import breeze.linalg.CSCMatrix
import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.Word2Vec
import org.sparksamples.Util

import scala.collection.mutable.ListBuffer

object explore_movies {

  def processRegex(input: String): String = {
    val pattern = "^[^(]*".r
    pattern.findFirstIn(input).get
  }

  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "Explore Users in Movie Dataset")
    val raw_title = org.sparksamples.Util.getMovieDataDF().select("name")
    raw_title.show()

    raw_title.createOrReplaceTempView("titles")
    Util.spark.udf.register("processRegex", processRegex _)
    val processed_titles = Util.spark.sql("select processRegex(name) from titles")
    processed_titles.show()

    val titles_rdd = processed_titles.rdd.map(r => r(0).toString)
    titles_rdd.take(5).foreach(println)
    println(titles_rdd.first())

    val title_terms = titles_rdd.map(x => x.split(" "))
    title_terms.take(5).foreach(_.foreach(println))
    println(title_terms.count())

    val all_terms_dic = new ListBuffer[String]()
    val all_terms = title_terms.flatMap(terms => terms).distinct().collect()
    for (term <- all_terms) {
      all_terms_dic += term
    }
    println(all_terms_dic.length)
    println(all_terms_dic.indexOf("Dead"))
    println(all_terms_dic.indexOf("Rooms"))

    val all_terms_withZip = title_terms.flatMap(terms => terms).distinct().zipWithIndex().collectAsMap()
    println(all_terms_withZip.get("Dead"))
    println(all_terms_withZip.get("Rooms"))

    val word2vec = new Word2Vec()
    val rdd_terms = titles_rdd.map(title => title.split(" ").toSeq)
    val model = word2vec.fit(rdd_terms)
    println(model.findSynonyms("Dead", 40))

    val term_vectors = title_terms.map(terms => create_vector(terms, all_terms_dic))
    term_vectors.take(5).foreach(println)
    sc.stop()
  }

  def create_vector(title_terms: Array[String], all_terms_dic: ListBuffer[String]): CSCMatrix[Int] = {
    val x = CSCMatrix.zeros[Int](1, all_terms_dic.length)
    title_terms.foreach { term =>
      if (all_terms_dic.contains(term)) {
        x.update(0, all_terms_dic.indexOf(term), 1)
      }
    }
    x
  }

  def convert(year: String): String = {
    try {
      year.substring(year.length - 4, year.length)
    } catch {
      case e: Exception => "1900"
    }
  }
}
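One caveat in this example: println(model.findSynonyms("Dead", 40)) passes the returned Array[(String, Double)] straight to println, which prints the array's default toString rather than the synonyms themselves. A small sketch of a readable dump:

model.findSynonyms("Dead", 40).foreach { case (term, sim) =>
  println(f"$term%-20s $sim%.4f")
}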
Example 3
Source File: ConvertWordsToVectors.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.featureext

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.Word2Vec

object ConvertWordsToVectors {
  def main(args: Array[String]) {
    val file = "/home/ubuntu/work/ml-resources/spark-ml/Chapter_04/data/text8_10000"
    val conf = new SparkConf().setMaster("local").setAppName("Word2Vector")
    val sc = new SparkContext(conf)
    val input = sc.textFile(file).map(line => line.split(" ").toSeq)
    val word2vec = new Word2Vec()
    val model = word2vec.fit(input)
    val vectors = model.getVectors
    vectors.foreach { case (word, vector) => println(word + "-->" + vector.mkString(" ")) }
  }
}
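Here getVectors returns a Map[String, Array[Float]] with one entry per word that met the minCount threshold, each array holding vectorSize components (100 by default), so the loop above prints the full embedding table. A sketch that looks up one word instead of dumping everything (the query word is illustrative, not guaranteed to be in this corpus):

model.getVectors.get("anarchism") match {
  case Some(vector) => println(vector.take(10).mkString(" ")) // first 10 components
  case None         => println("word not in vocabulary")
}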
Example 4
Source File: ConvertWordsToVectors.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.featureext

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.Word2Vec

object ConvertWordsToVectors {
  def main(args: Array[String]) {
    val file = "/home/ubuntu/work/rajdeepd-spark-ml/spark-ml/Chapter_04/data/text8_10000"
    val conf = new SparkConf().setMaster("local").setAppName("Word2Vector")
    val sc = new SparkContext(conf)
    val input = sc.textFile(file).map(line => line.split(" ").toSeq)
    val word2vec = new Word2Vec()
    val model = word2vec.fit(input)
    val vectors = model.getVectors
    vectors.foreach { case (word, vector) => println(word + "-->" + vector.mkString(" ")) }
  }
}
Example 5
Source File: Word2VecMllib.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.Word2Vec
import org.apache.spark.mllib.linalg.{SparseVector => SV}

object Word2VecMllib {
  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "Word2Vector App")
    val path = "./data/20news-bydate-train/alt.atheism/*"
    val rdd = sc.wholeTextFiles(path)
    val text = rdd.map { case (file, text) => text }
    val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head }
    val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap
    val dim = math.pow(2, 18).toInt
    // tokenize comes from the book's TFIDFExtraction object (not shown in this excerpt)
    val tokens = text.map(doc => TFIDFExtraction.tokenize(doc))

    val word2vec = new Word2Vec()
    // word2vec.setSeed(42) // uncomment to generate the same results on each run
    val word2vecModel = word2vec.fit(tokens)

    word2vecModel.findSynonyms("philosophers", 5).foreach(println)
    sc.stop()
  }
}
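This example leans on TFIDFExtraction.tokenize from elsewhere in the book's code, which is not included here. As a rough stand-in, under the assumption that the original lowercases, splits on non-word characters, and keeps alphabetic tokens (the book's version also filters stopwords and rare terms), a minimal tokenizer might look like:

def tokenize(doc: String): Seq[String] =
  doc.split("""\W+""")
    .map(_.toLowerCase)
    .filter(token => token.nonEmpty && token.forall(_.isLetter))
    .toSeq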
Example 6
Source File: Word2VecExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
// $example off$

object Word2VecExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Word2VecExample")
    val sc = new SparkContext(conf)

    // $example on$
    val input = sc.textFile("data/mllib/sample_lda_data.txt").map(line => line.split(" ").toSeq)

    val word2vec = new Word2Vec()

    val model = word2vec.fit(input)

    val synonyms = model.findSynonyms("1", 5)

    for ((synonym, cosineSimilarity) <- synonyms) {
      println(s"$synonym $cosineSimilarity")
    }

    // Save and load model
    model.save(sc, "myModelPath")
    val sameModel = Word2VecModel.load(sc, "myModelPath")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 7
Source File: Main.scala From stellar-random-walk with Apache License 2.0
package au.csiro.data61.randomwalk

import au.csiro.data61.randomwalk.algorithm.{UniformRandomWalk, VCutRandomWalk}
import au.csiro.data61.randomwalk.common.CommandParser.TaskName
import au.csiro.data61.randomwalk.common.{CommandParser, Params, Property}
import com.typesafe.config.Config
import org.apache.log4j.LogManager
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.scalactic.{Bad, Every, Good, One, Or}
import spark.jobserver.SparkJobInvalid
import spark.jobserver.api._

object Main extends SparkJob {
  lazy val logger = LogManager.getLogger("myLogger")

  def main(args: Array[String]) {
    CommandParser.parse(args) match {
      case Some(params) =>
        val conf = new SparkConf().setAppName("stellar-random-walk")
        val context: SparkContext = new SparkContext(conf)
        runJob(context, null, params)
      case None => sys.exit(1)
    }
  }

  override def validate(sc: SparkContext, runtime: JobEnvironment, config: Config):
      JobData Or Every[SparkJobInvalid] = {
    val args = config.getString("rw.input").split("\\s+")
    CommandParser.parse(args) match {
      case Some(params) => Good(params)
      // The original excerpt omits this case; without it an unparsable
      // config raises a MatchError instead of a validation failure.
      case None => Bad(One(SparkJobInvalid("Invalid configuration")))
    }
  }
}
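The excerpt stops before the interesting part: the Word2Vec import signals that, after random walks over the graph are generated, each walk is fed to Word2Vec as a "sentence" of vertex-id "words" to learn node embeddings (the DeepWalk/node2vec idea). A hedged sketch of that stage, with walks: RDD[Array[Int]] standing in for whatever UniformRandomWalk or VCutRandomWalk actually produces:

import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
import org.apache.spark.rdd.RDD

def embedWalks(walks: RDD[Array[Int]]): Word2VecModel = {
  // Treat each walk as a sentence and each vertex id as a token.
  val sentences = walks.map(_.map(_.toString).toSeq)
  new Word2Vec()
    .setVectorSize(128) // a common node-embedding dimension; an assumption here
    .setMinCount(0)     // keep every vertex, even rarely visited ones
    .fit(sentences)
}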
Example 8
Source File: Word2VecExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
// $example off$

object Word2VecExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Word2VecExample")
    val sc = new SparkContext(conf)

    // $example on$
    val input = sc.textFile("data/mllib/sample_lda_data.txt").map(line => line.split(" ").toSeq)

    val word2vec = new Word2Vec()

    val model = word2vec.fit(input)

    val synonyms = model.findSynonyms("1", 5)

    for ((synonym, cosineSimilarity) <- synonyms) {
      println(s"$synonym $cosineSimilarity")
    }

    // Save and load model
    model.save(sc, "myModelPath")
    val sameModel = Word2VecModel.load(sc, "myModelPath")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 9
Source File: Word2VecExample.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.Word2Vec
import org.apache.spark.mllib.feature.Word2VecModel

object Word2VecExample {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("Word2VecExample")
    val sc = new SparkContext(sparkConf)
    val input = sc.textFile("../data/mllib/text8").map(line => line.split(" ").toSeq)
    val word2vec = new Word2Vec()
    // fit() trains the Word2Vec model on the tokenized input
    val model = word2vec.fit(input)
    val synonyms = model.findSynonyms("china", 40)
    for ((synonym, cosineSimilarity) <- synonyms) {
      // similarity score
      println(s"$synonym $cosineSimilarity")
    }
    // Save and load model
    // model.save(sc, "myModelPath")
    // val sameModel = Word2VecModel.load(sc, "myModelPath")
    sc.stop()
  }
}
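The query findSynonyms("china", 40) assumes "china" survived the default minCount of 5 in the text8 sample; asking for synonyms of an out-of-vocabulary word throws. A defensive variant:

val query = "china"
if (model.getVectors.contains(query)) {
  model.findSynonyms(query, 40).foreach { case (word, sim) => println(s"$word $sim") }
} else {
  println(s"'$query' not in vocabulary")
}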
Example 10
Source File: Word2VecExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
// $example off$

object Word2VecExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Word2VecExample")
    val sc = new SparkContext(conf)

    // $example on$
    val input = sc.textFile("data/mllib/sample_lda_data.txt").map(line => line.split(" ").toSeq)

    val word2vec = new Word2Vec()

    val model = word2vec.fit(input)

    val synonyms = model.findSynonyms("1", 5)

    for ((synonym, cosineSimilarity) <- synonyms) {
      println(s"$synonym $cosineSimilarity")
    }

    // Save and load model
    model.save(sc, "myModelPath")
    val sameModel = Word2VecModel.load(sc, "myModelPath")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println