org.apache.spark.mllib.clustering.LDA Scala Examples
The following examples show how to use org.apache.spark.mllib.clustering.LDA.
Each example is taken from an open-source project; the source file, originating project, and license are noted above each listing.
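As a starting point, here is a minimal, self-contained sketch of the core RDD-based workflow that all of the examples below build on. The toy term-count vectors, app name, and parameter values are illustrative only and are not taken from any of the projects.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors

object MinimalLDAExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("MinimalLDAExample").setMaster("local[*]"))

    // LDA expects an RDD of (documentId, termCountVector) pairs.
    // These three toy documents use a 5-term vocabulary.
    val documents = sc.parallelize(Seq(
      Vectors.dense(1.0, 2.0, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 1.0, 3.0, 1.0, 0.0),
      Vectors.dense(2.0, 0.0, 1.0, 4.0, 0.0)
    )).zipWithIndex().map { case (vec, id) => (id, vec) }

    // Cluster the toy documents into 2 topics.
    val ldaModel = new LDA().setK(2).setMaxIterations(20).run(documents)

    // Each topic is a distribution over the vocabulary; print the top 3 terms per topic.
    println(s"Learned ${ldaModel.k} topics over a vocabulary of ${ldaModel.vocabSize} terms")
    ldaModel.describeTopics(maxTermsPerTopic = 3).zipWithIndex.foreach {
      case ((termIndices, termWeights), topic) =>
        println(s"Topic $topic: " + termIndices.zip(termWeights).mkString(", "))
    }

    sc.stop()
  }
}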
Example 1
Source File: LDAModelReuse.scala from Scala-Machine-Learning-Projects (MIT License)
package com.packt.ScalaML.TopicModelling

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDA}

object LDAModelReuse {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "data/")
      .appName("LDAModelReuse")
      .getOrCreate()

    // Restore the previously trained model for reuse
    val savedLDAModel = DistributedLDAModel.load(spark.sparkContext, "model/LDATrainedModel/")

    // LDAforTM and Params are helper classes defined elsewhere in the book's package
    val lda = new LDAforTM() // actual computations are done here
    // Load the parameters used to train the LDA model
    val defaultParams = Params().copy(input = "data/4UK1UkTX.csv", savedLDAModel)
    // Run topic modelling with the default parameters, without saving the trained model again
    lda.run(defaultParams, false)

    spark.stop()
  }
}
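For context, the model restored above must first have been trained and saved by a separate run. A minimal sketch of that earlier step is shown below; it is not part of the book's source, the docTermVectors corpus and the topic and iteration counts are placeholders, and only the save path mirrors the example.

import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDA}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// docTermVectors: (documentId, termCountVector) pairs built elsewhere
// (see Examples 2 and 3 below for full preprocessing pipelines).
def trainAndSave(docTermVectors: RDD[(Long, Vector)]): Unit = {
  // The default (EM) optimizer produces a DistributedLDAModel, which is what
  // DistributedLDAModel.load() in LDAModelReuse expects to find on disk.
  val ldaModel = new LDA().setK(10).setMaxIterations(100).run(docTermVectors)
  ldaModel.asInstanceOf[DistributedLDAModel]
    .save(docTermVectors.sparkContext, "model/LDATrainedModel/")
}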
Example 2
Source File: LDATextExample.scala from Machine-Learning-with-Spark-Second-Edition (MIT License)
package org.sparksamples.lda

import scala.collection.mutable
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

object LDATextExample {

  val PATH = "/home/ubuntu/work/spark-src/spark/"
  val sc = new SparkContext("local[2]", "First Spark App")

  def main(args: Array[String]): Unit = {
    // Load documents from text files, 1 document per file
    val corpus: RDD[String] = sc.wholeTextFiles(PATH + "docs/*.md").map(_._2)

    // Split each document into a sequence of terms (words)
    val tokenized: RDD[Seq[String]] =
      corpus.map(_.toLowerCase.split("\\s"))
        .map(_.filter(_.length > 3).filter(_.forall(java.lang.Character.isLetter)))

    // Choose the vocabulary.
    // termCounts: (term, termCount) pairs, sorted by descending count
    val termCounts: Array[(String, Long)] =
      tokenized.flatMap(_.map(_ -> 1L)).reduceByKey(_ + _).collect().sortBy(-_._2)
    // vocabArray: chosen vocab (the most common terms are dropped as stopwords)
    val numStopwords = 20
    val vocabArray: Array[String] =
      termCounts.takeRight(termCounts.size - numStopwords).map(_._1)
    // vocab: Map term -> term index
    val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap

    // Convert documents into term count vectors
    val documents: RDD[(Long, Vector)] =
      tokenized.zipWithIndex.map { case (tokens, id) =>
        val counts = new mutable.HashMap[Int, Double]()
        tokens.foreach { term =>
          if (vocab.contains(term)) {
            val idx = vocab(term)
            counts(idx) = counts.getOrElse(idx, 0.0) + 1.0
          }
        }
        (id, Vectors.sparse(vocab.size, counts.toSeq))
      }

    // Set LDA parameters
    val numTopics = 10
    val lda = new LDA().setK(numTopics).setMaxIterations(10)
    val ldaModel = lda.run(documents)
    // (The average log-likelihood of the corpus can be computed from the trained model;
    // see the note after this example.)

    // Print topics, showing the 10 top-weighted terms for each topic.
    val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 10)
    topicIndices.foreach { case (terms, termWeights) =>
      println("TOPIC:")
      terms.zip(termWeights).foreach { case (term, weight) =>
        println(s"${vocabArray(term.toInt)}\t$weight")
      }
      println()
    }
  }
}
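The commented-out average log-likelihood line in the original file can be completed as follows. This continuation reuses ldaModel and documents from the example and assumes the default EM optimizer, whose result can be cast to DistributedLDAModel; it is a sketch, not part of the original source.

import org.apache.spark.mllib.clustering.DistributedLDAModel

// With the default EM optimizer, run() returns a DistributedLDAModel,
// which exposes the log-likelihood of the training corpus.
val distModel = ldaModel.asInstanceOf[DistributedLDAModel]
val avgLogLikelihood = distModel.logLikelihood / documents.count().toDouble
println(s"Average log-likelihood per document: $avgLogLikelihood")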
Example 3
Source File: lda-script.scala from practical-data-science-with-hadoop-and-spark (Apache License 2.0)
// lda-script.scala: intended to be run in the spark-shell, which provides the SparkContext `sc`
import collection.JavaConversions._
import scala.collection.mutable

import opennlp.tools.tokenize.SimpleTokenizer
import opennlp.tools.stemmer.PorterStemmer

import org.apache.spark.rdd._
import org.apache.spark.mllib.clustering.{OnlineLDAOptimizer, DistributedLDAModel, LDA}
import org.apache.spark.mllib.linalg.{Vector, SparseVector, Vectors}
import org.apache.spark.mllib.feature.IDF

// add the openNLP jar to the Spark Context
sc.addJar("opennlp-tools-1.6.0.jar")

// Load documents from text files, 1 element (text string) per file
val corpus = sc.wholeTextFiles("ohsumed/C*", 20).map(x => x._2)

// read stop words from file
val stopwordFile = "stop-words.txt"
val st_words = sc.textFile(stopwordFile).collect()
  .flatMap(_.stripMargin.split("\\s+")).map(_.toLowerCase).toSet
val stopwords = sc.broadcast(st_words)

// Tokenize and stem each document, dropping stop words and very short words
val minWordLength = 3
val tokenized: RDD[(Long, Array[String])] = corpus.zipWithIndex().map { case (text, id) =>
  val tokenizer = SimpleTokenizer.INSTANCE
  val stemmer = new PorterStemmer()
  val tokens = tokenizer.tokenize(text)
  val words = tokens.filter(w => (w.length >= minWordLength) && (!stopwords.value.contains(w)))
    .map(w => stemmer.stem(w))
  id -> words
}.filter(_._2.length > 0)

tokenized.cache()
val numDocs = tokenized.count()

// Count word occurrences across the whole corpus
val wordCounts: RDD[(String, Long)] = tokenized.flatMap { case (_, tokens) =>
  tokens.map(_ -> 1L)
}.reduceByKey(_ + _)
wordCounts.cache()
val fullVocabSize = wordCounts.count()

// Keep only the vSize most frequent terms as the vocabulary
val vSize = 10000
val (vocab: Map[String, Int], selectedTokenCount: Long) = {
  val sortedWC: Array[(String, Long)] = wordCounts.sortBy(_._2, ascending = false).take(vSize)
  (sortedWC.map(_._1).zipWithIndex.toMap, sortedWC.map(_._2).sum)
}

val documents = tokenized.map { case (id, tokens) =>
  // Filter tokens by vocabulary, and create word count vector representation of document.
  val wc = new mutable.HashMap[Int, Int]()
  tokens.foreach { term =>
    if (vocab.contains(term)) {
      val termIndex = vocab(term)
      wc(termIndex) = wc.getOrElse(termIndex, 0) + 1
    }
  }
  val indices = wc.keys.toArray.sorted
  val values = indices.map(i => wc(i).toDouble)
  val sb = Vectors.sparse(vocab.size, indices, values)
  (id, sb)
}

// Reverse index: term index -> term
val vocabArray = new Array[String](vocab.size)
vocab.foreach { case (term, i) => vocabArray(i) = term }

// Re-weight the term counts with TF-IDF
val tf = documents.map { case (id, vec) => vec }.cache()
val idfVals = new IDF().fit(tf).idf.toArray
val tfidfDocs: RDD[(Long, Vector)] = documents.map { case (id, vec) =>
  val indices = vec.asInstanceOf[SparseVector].indices
  val counts = new mutable.HashMap[Int, Double]()
  for (idx <- indices) {
    counts(idx) = vec(idx) * idfVals(idx)
  }
  (id, Vectors.sparse(vocab.size, counts.toSeq))
}

// Train an online LDA model and print the top terms of each topic
val numTopics = 5
val numIterations = 50
val lda = new LDA().setK(numTopics).setMaxIterations(numIterations).setOptimizer("online")
val ldaModel = lda.run(tfidfDocs)

val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 5)
topicIndices.foreach { case (terms, termWeights) =>
  println("TOPIC:")
  terms.zip(termWeights).foreach { case (term, weight) =>
    println(s"${vocabArray(term.toInt)}\t$weight")
  }
  println()
}
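Because the script selects the online optimizer, the returned model is a LocalLDAModel, which can also infer per-document topic mixtures. The following continuation, reusing ldaModel and tfidfDocs from the script, is a sketch of that extra step and is not part of the original file.

import org.apache.spark.mllib.clustering.LocalLDAModel

// The online optimizer yields a LocalLDAModel, which can infer a topic
// distribution for each (documentId, termVector) pair.
val localModel = ldaModel.asInstanceOf[LocalLDAModel]
val topicMixtures = localModel.topicDistributions(tfidfDocs)
topicMixtures.take(3).foreach { case (docId, dist) =>
  println(s"Document $docId: ${dist.toArray.mkString(", ")}")
}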
Example 4
Source File: LDAExample.scala from Swallow (Apache License 2.0)
package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.{LDA, DistributedLDAModel, LocalLDAModel}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import scopt.OptionParser

object LDAExample {

  case class Params(
      inputPath: String = null,
      outputPath: String = null,
      numTopics: Int = 10,
      maxIterations: Int = 10,
      optimizer: String = "online",
      maxResultSize: String = "1g")

  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("LDA") {
      head("LDA: an example app for LDA.")
      opt[String]("optimizer")
        .text(s"optimizer, default: ${defaultParams.optimizer}")
        .action((x, c) => c.copy(optimizer = x))
      opt[String]("maxResultSize")
        .text(s"max resultSize, default: ${defaultParams.maxResultSize}")
        .action((x, c) => c.copy(maxResultSize = x))
      opt[Int]("numTopics")
        .text(s"number of topics, default: ${defaultParams.numTopics}")
        .action((x, c) => c.copy(numTopics = x))
      opt[Int]("maxIterations")
        .text(s"number of max iterations, default: ${defaultParams.maxIterations}")
        .action((x, c) => c.copy(maxIterations = x))
      arg[String]("<inputPath>")
        .required()
        .text("input path")
        .action((x, c) => c.copy(inputPath = x))
      arg[String]("<outputPath>")
        .required()
        .text("output path")
        .action((x, c) => c.copy(outputPath = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf()
      .setAppName(s"LDA Example with $params")
      .set("spark.driver.maxResultSize", params.maxResultSize)
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    // The input is expected to be an RDD[(Long, Vector)] written with saveAsObjectFile
    val corpus: RDD[(Long, Vector)] = sc.objectFile(params.inputPath)

    // Cluster the documents into numTopics topics using LDA
    val ldaModel = new LDA()
      .setK(params.numTopics)
      .setMaxIterations(params.maxIterations)
      .setOptimizer(params.optimizer)
      .run(corpus)

    // Save and load the model.
    ldaModel.save(sc, params.outputPath)
    val savedModel = LocalLDAModel.load(sc, params.outputPath)

    sc.stop()
  }
}
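The benchmark expects params.inputPath to contain an RDD[(Long, Vector)] written with saveAsObjectFile. A corpus prepared as in Examples 2 or 3 could be serialized into that format roughly as follows; the helper name and path handling here are illustrative, not part of HiBench.

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// corpus: (documentId, termCountVector) pairs built by a preprocessing
// pipeline such as the ones in Examples 2 and 3.
def writeCorpus(corpus: RDD[(Long, Vector)], path: String): Unit = {
  // saveAsObjectFile writes serialized pairs, which is exactly what
  // sc.objectFile[(Long, Vector)](path) in run() reads back.
  corpus.saveAsObjectFile(path)
}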
Example 5
Source File: DRTest.scala from spark-flow (Apache License 2.0)
package com.bloomberg.sparkflow.dc

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.scalatest._

import scala.util.Random
import com.bloomberg.sparkflow._

// The original excerpt starts mid-class; the surrounding test class and the
// opening of the first test are reconstructed here so that the snippet parses.
class DRTest extends FunSuite with SharedSparkContext {

  test("lda") {
    // Build a random corpus using spark-flow's deferred collections (DC)
    val randomVecs = parallelize(1 to 100)
      .map(i => Vectors.dense(Seq.fill(10)(Random.nextDouble()).toArray))
    val corpus = randomVecs.zipWithUniqueId().map { case (k, v) => (v, k) }
    // mapToResult runs LDA on the underlying RDD and wraps the model as a deferred result
    val ldaModel = corpus.mapToResult(rdd => new LDA().setK(3).run(rdd))
  }

  test("regularSpark") {
    val numbers: RDD[Int] = sc.parallelize(1 to 10)
    val doubles: RDD[Double] = numbers.map(_.toDouble)
    val sum: Double = doubles.sum()
    val normalized: RDD[Double] = doubles.map(_ / sum)
  }
}
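For comparison, the same random-corpus fit written against plain Spark MLlib, without the spark-flow DC wrapper, would look roughly like this; sc is an existing SparkContext and the sizes mirror the test above. This is a sketch, not part of the spark-flow test suite.

import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors
import scala.util.Random

// 100 random 10-dimensional "documents", keyed by unique ids, as in the test.
val randomVecs = sc.parallelize(1 to 100)
  .map(_ => Vectors.dense(Array.fill(10)(Random.nextDouble())))
val corpus = randomVecs.zipWithUniqueId().map { case (vec, id) => (id, vec) }

// Fit a 3-topic model directly on the RDD.
val ldaModel = new LDA().setK(3).run(corpus)
println(s"Fitted ${ldaModel.k} topics over ${ldaModel.vocabSize} terms")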