org.apache.spark.mllib.clustering.DistributedLDAModel Scala Examples
The following examples show how to use org.apache.spark.mllib.clustering.DistributedLDAModel.
Each example notes its original project, source file, and license.
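Before the examples, a minimal sketch of the save/load round trip may help orient the reader: only LDA trained with the "em" optimizer produces a DistributedLDAModel. The corpus, path, and parameter values below are placeholders, not taken from any of the projects that follow.

import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDA}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// Minimal sketch: train with the EM optimizer (which returns a DistributedLDAModel),
// save it, and restore it later. Paths and parameter values are illustrative.
def saveAndRestore(sc: SparkContext, corpus: RDD[(Long, Vector)]): DistributedLDAModel = {
  val model = new LDA()
    .setK(10)
    .setMaxIterations(20)
    .setOptimizer("em") // "em" is what yields a DistributedLDAModel
    .run(corpus)
    .asInstanceOf[DistributedLDAModel]

  model.save(sc, "model/exampleLDAModel")               // persist topic-term and doc-topic state
  DistributedLDAModel.load(sc, "model/exampleLDAModel") // reload in a later job
}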
Example 1
Source File: LDAModelReuse.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML.TopicModelling

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDA}

object LDAModelReuse {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "data/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    // Restoring the model for reuse
    val savedLDAModel = DistributedLDAModel.load(spark.sparkContext, "model/LDATrainedModel/")

    val lda = new LDAforTM() // actual computations are done here

    // Loading the parameters to train the LDA model
    val defaultParams = Params().copy(input = "data/4UK1UkTX.csv", savedLDAModel)

    // Training the LDA model with the default parameters but don't save the trained model again
    lda.run(defaultParams, false)
    spark.stop()
  }
}
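LDAforTM and Params in Example 1 are classes defined elsewhere in the Scala-Machine-Learning-Projects code base. If you only need the restored model itself, the loaded DistributedLDAModel can be queried directly with the spark.mllib API; the sketch below assumes the same model path and an already-created SparkContext.

import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.DistributedLDAModel

// Sketch: inspect a restored model without the project-specific wrappers.
// The path must match wherever the model was previously saved.
def inspectRestoredModel(sc: SparkContext): Unit = {
  val model = DistributedLDAModel.load(sc, "model/LDATrainedModel/")
  model.describeTopics(maxTermsPerTopic = 5).zipWithIndex.foreach {
    case ((termIndices, termWeights), topic) =>
      val terms = termIndices.zip(termWeights)
        .map { case (t, w) => f"$t:$w%.4f" }
        .mkString(", ")
      println(s"Topic $topic: $terms")
  }
}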
Example 2
Source File: lda-script.scala From practical-data-science-with-hadoop-and-spark with Apache License 2.0
import collection.JavaConversions._
import scala.collection.mutable

import opennlp.tools.tokenize.SimpleTokenizer
import opennlp.tools.stemmer.PorterStemmer

import org.apache.spark.rdd._
import org.apache.spark.mllib.clustering.{OnlineLDAOptimizer, DistributedLDAModel, LDA}
import org.apache.spark.mllib.linalg.{Vector, SparseVector, Vectors}
import org.apache.spark.mllib.feature.IDF

// add openNLP jar to the Spark Context
sc.addJar("opennlp-tools-1.6.0.jar")

// Load documents from text files, 1 element (text string) per file
val corpus = sc.wholeTextFiles("ohsumed/C*", 20).map(x => x._2)

// read stop words from file
val stopwordFile = "stop-words.txt"
val st_words = sc.textFile(stopwordFile).collect()
  .flatMap(_.stripMargin.split("\\s+")).map(_.toLowerCase).toSet
val stopwords = sc.broadcast(st_words)

val minWordLength = 3
val tokenized: RDD[(Long, Array[String])] = corpus.zipWithIndex().map { case (text, id) =>
  val tokenizer = SimpleTokenizer.INSTANCE
  val stemmer = new PorterStemmer()
  val tokens = tokenizer.tokenize(text)
  val words = tokens.filter(w => (w.length >= minWordLength) && (!stopwords.value.contains(w)))
    .map(w => stemmer.stem(w))
  id -> words
}.filter(_._2.length > 0)

tokenized.cache()
val numDocs = tokenized.count()

val wordCounts: RDD[(String, Long)] = tokenized.flatMap { case (_, tokens) =>
  tokens.map(_ -> 1L)
}.reduceByKey(_ + _)
wordCounts.cache()

val fullVocabSize = wordCounts.count()
val vSize = 10000
val (vocab: Map[String, Int], selectedTokenCount: Long) = {
  val sortedWC: Array[(String, Long)] = wordCounts.sortBy(_._2, ascending = false).take(vSize)
  (sortedWC.map(_._1).zipWithIndex.toMap, sortedWC.map(_._2).sum)
}

val documents = tokenized.map { case (id, tokens) =>
  // Filter tokens by vocabulary, and create word count vector representation of document.
  val wc = new mutable.HashMap[Int, Int]()
  tokens.foreach { term =>
    if (vocab.contains(term)) {
      val termIndex = vocab(term)
      wc(termIndex) = wc.getOrElse(termIndex, 0) + 1
    }
  }
  val indices = wc.keys.toArray.sorted
  val values = indices.map(i => wc(i).toDouble)
  val sb = Vectors.sparse(vocab.size, indices, values)
  (id, sb)
}

val vocabArray = new Array[String](vocab.size)
vocab.foreach { case (term, i) => vocabArray(i) = term }

val tf = documents.map { case (id, vec) => vec }.cache()
val idfVals = new IDF().fit(tf).idf.toArray
val tfidfDocs: RDD[(Long, Vector)] = documents.map { case (id, vec) =>
  val indices = vec.asInstanceOf[SparseVector].indices
  val counts = new mutable.HashMap[Int, Double]()
  for (idx <- indices) {
    counts(idx) = vec(idx) * idfVals(idx)
  }
  (id, Vectors.sparse(vocab.size, counts.toSeq))
}

val numTopics = 5
val numIterations = 50
val lda = new LDA().setK(numTopics).setMaxIterations(numIterations).setOptimizer("online")
val ldaModel = lda.run(tfidfDocs)

val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 5)
topicIndices.foreach { case (terms, termWeights) =>
  println("TOPIC:")
  terms.zip(termWeights).foreach { case (term, weight) =>
    println(s"${vocabArray(term.toInt)}\t$weight")
  }
  println()
}
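Note that Example 2 sets the optimizer to "online", so lda.run actually returns a LocalLDAModel even though DistributedLDAModel is imported. To obtain a DistributedLDAModel, with per-document topic distributions kept on the cluster, the EM optimizer would be used instead. A hedged sketch, reusing the tfidfDocs, numTopics, and numIterations values from above:

import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDA, LocalLDAModel}

// Sketch: the same training call with the EM optimizer, which returns a
// DistributedLDAModel instead of a LocalLDAModel.
val emModel = new LDA()
  .setK(numTopics)
  .setMaxIterations(numIterations)
  .setOptimizer("em")
  .run(tfidfDocs)
  .asInstanceOf[DistributedLDAModel]

// Distributed-only views, e.g. the most representative documents per topic,
// plus an explicit conversion to a local model if needed downstream.
val topDocs = emModel.topDocumentsPerTopic(3)
val localModel: LocalLDAModel = emModel.toLocal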
Example 3
Source File: LDAExample.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.{LDA, DistributedLDAModel, LocalLDAModel}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import scopt.OptionParser

object LDAExample {

  case class Params(
      inputPath: String = null,
      outputPath: String = null,
      numTopics: Int = 10,
      maxIterations: Int = 10,
      optimizer: String = "online",
      maxResultSize: String = "1g")

  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("LDA") {
      head("LDA: an example app for LDA.")
      opt[String]("optimizer")
        .text(s"optimizer, default: ${defaultParams.optimizer}")
        .action((x, c) => c.copy(optimizer = x))
      opt[String]("maxResultSize")
        .text(s"max resultSize, default: ${defaultParams.maxResultSize}")
        .action((x, c) => c.copy(maxResultSize = x))
      opt[Int]("numTopics")
        .text(s"number of topics, default: ${defaultParams.numTopics}")
        .action((x, c) => c.copy(numTopics = x))
      opt[Int]("maxIterations")
        .text(s"number of max iterations, default: ${defaultParams.maxIterations}")
        .action((x, c) => c.copy(maxIterations = x))
      arg[String]("<inputPath>")
        .required()
        .text("input path")
        .action((x, c) => c.copy(inputPath = x))
      arg[String]("<outputPath>")
        .required()
        .text("output path")
        .action((x, c) => c.copy(outputPath = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf()
      .setAppName(s"LDA Example with $params")
      .set("spark.driver.maxResultSize", params.maxResultSize)
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    val corpus: RDD[(Long, Vector)] = sc.objectFile(params.inputPath)

    // Cluster the documents into numTopics topics using LDA
    val ldaModel = new LDA()
      .setK(params.numTopics)
      .setMaxIterations(params.maxIterations)
      .setOptimizer(params.optimizer)
      .run(corpus)

    // Save and load model.
    ldaModel.save(sc, params.outputPath)
    val savedModel = LocalLDAModel.load(sc, params.outputPath)

    sc.stop()
  }
}
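Example 3 defaults to the "online" optimizer and therefore loads the saved model back with LocalLDAModel.load. If the job were run with --optimizer em, the persisted model would be a DistributedLDAModel, and the matching loader should be used. A small hedged sketch of choosing the loader by optimizer:

import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDAModel, LocalLDAModel}

// Sketch: pick the loader that matches the optimizer used at training time.
// "em" persists a DistributedLDAModel; "online" persists a LocalLDAModel.
def loadSavedModel(sc: SparkContext, path: String, optimizer: String): LDAModel =
  optimizer.toLowerCase match {
    case "em" => DistributedLDAModel.load(sc, path)
    case _    => LocalLDAModel.load(sc, path)
  }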