org.apache.spark.mllib.clustering.LDA Scala Examples
The following examples show how to use org.apache.spark.mllib.clustering.LDA.
Each example is taken from an open-source project; the source file, originating project, and license are noted above each listing.
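As a starting point, here is a minimal, self-contained sketch of the core RDD-based workflow that all of the examples below build on. The toy term-count vectors, app name, and parameter values are illustrative only and are not taken from any of the projects.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors

object MinimalLDAExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("MinimalLDAExample").setMaster("local[*]"))

    // LDA expects an RDD of (documentId, termCountVector) pairs.
    // These three toy documents use a 5-term vocabulary.
    val documents = sc.parallelize(Seq(
      Vectors.dense(1.0, 2.0, 0.0, 0.0, 1.0),
      Vectors.dense(0.0, 1.0, 3.0, 1.0, 0.0),
      Vectors.dense(2.0, 0.0, 1.0, 4.0, 0.0)
    )).zipWithIndex().map { case (vec, id) => (id, vec) }

    // Cluster the toy documents into 2 topics.
    val ldaModel = new LDA().setK(2).setMaxIterations(20).run(documents)

    // Each topic is a distribution over the vocabulary; print the top 3 terms per topic.
    println(s"Learned ${ldaModel.k} topics over a vocabulary of ${ldaModel.vocabSize} terms")
    ldaModel.describeTopics(maxTermsPerTopic = 3).zipWithIndex.foreach {
      case ((termIndices, termWeights), topic) =>
        println(s"Topic $topic: " + termIndices.zip(termWeights).mkString(", "))
    }

    sc.stop()
  }
}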
Example 1
Source File: LDAModelReuse.scala from Scala-Machine-Learning-Projects (MIT License)
package com.packt.ScalaML.TopicModelling

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDA}

object LDAModelReuse {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "data/")
      .appName("LDAModelReuse")
      .getOrCreate()

    // Restore the previously trained model for reuse
    val savedLDAModel = DistributedLDAModel.load(spark.sparkContext, "model/LDATrainedModel/")

    // LDAforTM and Params are helper classes defined elsewhere in the book's package
    val lda = new LDAforTM() // actual computations are done here
    // Load the parameters used to train the LDA model
    val defaultParams = Params().copy(input = "data/4UK1UkTX.csv", savedLDAModel)
    // Run topic modelling with the default parameters, without saving the trained model again
    lda.run(defaultParams, false)

    spark.stop()
  }
}
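For context, the model restored above must first have been trained and saved by a separate run. A minimal sketch of that earlier step is shown below; it is not part of the book's source, the docTermVectors corpus and the topic and iteration counts are placeholders, and only the save path mirrors the example.

import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDA}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// docTermVectors: (documentId, termCountVector) pairs built elsewhere
// (see Examples 2 and 3 below for full preprocessing pipelines).
def trainAndSave(docTermVectors: RDD[(Long, Vector)]): Unit = {
  // The default (EM) optimizer produces a DistributedLDAModel, which is what
  // DistributedLDAModel.load() in LDAModelReuse expects to find on disk.
  val ldaModel = new LDA().setK(10).setMaxIterations(100).run(docTermVectors)
  ldaModel.asInstanceOf[DistributedLDAModel]
    .save(docTermVectors.sparkContext, "model/LDATrainedModel/")
}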
Example 2
Source File: LDATextExample.scala from Machine-Learning-with-Spark-Second-Edition (MIT License)
package org.sparksamples.lda

import scala.collection.mutable
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

object LDATextExample {

  val PATH = "/home/ubuntu/work/spark-src/spark/"
  val sc = new SparkContext("local[2]", "First Spark App")

  def main(args: Array[String]): Unit = {
    // Load documents from text files, 1 document per file
    val corpus: RDD[String] = sc.wholeTextFiles(PATH + "docs/*.md").map(_._2)

    // Split each document into a sequence of terms (words)
    val tokenized: RDD[Seq[String]] =
      corpus.map(_.toLowerCase.split("\\s"))
        .map(_.filter(_.length > 3).filter(_.forall(java.lang.Character.isLetter)))

    // Choose the vocabulary.
    // termCounts: (term, termCount) pairs, sorted by descending count
    val termCounts: Array[(String, Long)] =
      tokenized.flatMap(_.map(_ -> 1L)).reduceByKey(_ + _).collect().sortBy(-_._2)
    // vocabArray: chosen vocab (the most common terms are dropped as stopwords)
    val numStopwords = 20
    val vocabArray: Array[String] =
      termCounts.takeRight(termCounts.size - numStopwords).map(_._1)
    // vocab: Map term -> term index
    val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap

    // Convert documents into term count vectors
    val documents: RDD[(Long, Vector)] =
      tokenized.zipWithIndex.map { case (tokens, id) =>
        val counts = new mutable.HashMap[Int, Double]()
        tokens.foreach { term =>
          if (vocab.contains(term)) {
            val idx = vocab(term)
            counts(idx) = counts.getOrElse(idx, 0.0) + 1.0
          }
        }
        (id, Vectors.sparse(vocab.size, counts.toSeq))
      }

    // Set LDA parameters
    val numTopics = 10
    val lda = new LDA().setK(numTopics).setMaxIterations(10)
    val ldaModel = lda.run(documents)
    // (The average log-likelihood of the corpus can be computed from the trained model;
    // see the note after this example.)

    // Print topics, showing the 10 top-weighted terms for each topic.
    val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 10)
    topicIndices.foreach { case (terms, termWeights) =>
      println("TOPIC:")
      terms.zip(termWeights).foreach { case (term, weight) =>
        println(s"${vocabArray(term.toInt)}\t$weight")
      }
      println()
    }
  }
}
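The commented-out average log-likelihood line in the original file can be completed as follows. This continuation reuses ldaModel and documents from the example and assumes the default EM optimizer, whose result can be cast to DistributedLDAModel; it is a sketch, not part of the original source.

import org.apache.spark.mllib.clustering.DistributedLDAModel

// With the default EM optimizer, run() returns a DistributedLDAModel,
// which exposes the log-likelihood of the training corpus.
val distModel = ldaModel.asInstanceOf[DistributedLDAModel]
val avgLogLikelihood = distModel.logLikelihood / documents.count().toDouble
println(s"Average log-likelihood per document: $avgLogLikelihood")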
Example 3
Source File: lda-script.scala from practical-data-science-with-hadoop-and-spark (Apache License 2.0)
// lda-script.scala: intended to be run in the spark-shell, which provides the SparkContext `sc`
import collection.JavaConversions._
import scala.collection.mutable

import opennlp.tools.tokenize.SimpleTokenizer
import opennlp.tools.stemmer.PorterStemmer

import org.apache.spark.rdd._
import org.apache.spark.mllib.clustering.{OnlineLDAOptimizer, DistributedLDAModel, LDA}
import org.apache.spark.mllib.linalg.{Vector, SparseVector, Vectors}
import org.apache.spark.mllib.feature.IDF

// add the openNLP jar to the Spark Context
sc.addJar("opennlp-tools-1.6.0.jar")

// Load documents from text files, 1 element (text string) per file
val corpus = sc.wholeTextFiles("ohsumed/C*", 20).map(x => x._2)

// read stop words from file
val stopwordFile = "stop-words.txt"
val st_words = sc.textFile(stopwordFile).collect()
  .flatMap(_.stripMargin.split("\\s+")).map(_.toLowerCase).toSet
val stopwords = sc.broadcast(st_words)

// Tokenize and stem each document, dropping stop words and very short words
val minWordLength = 3
val tokenized: RDD[(Long, Array[String])] = corpus.zipWithIndex().map { case (text, id) =>
  val tokenizer = SimpleTokenizer.INSTANCE
  val stemmer = new PorterStemmer()
  val tokens = tokenizer.tokenize(text)
  val words = tokens.filter(w => (w.length >= minWordLength) && (!stopwords.value.contains(w)))
    .map(w => stemmer.stem(w))
  id -> words
}.filter(_._2.length > 0)

tokenized.cache()
val numDocs = tokenized.count()

// Count word occurrences across the whole corpus
val wordCounts: RDD[(String, Long)] = tokenized.flatMap { case (_, tokens) =>
  tokens.map(_ -> 1L)
}.reduceByKey(_ + _)
wordCounts.cache()
val fullVocabSize = wordCounts.count()

// Keep only the vSize most frequent terms as the vocabulary
val vSize = 10000
val (vocab: Map[String, Int], selectedTokenCount: Long) = {
  val sortedWC: Array[(String, Long)] = wordCounts.sortBy(_._2, ascending = false).take(vSize)
  (sortedWC.map(_._1).zipWithIndex.toMap, sortedWC.map(_._2).sum)
}

val documents = tokenized.map { case (id, tokens) =>
  // Filter tokens by vocabulary, and create word count vector representation of document.
  val wc = new mutable.HashMap[Int, Int]()
  tokens.foreach { term =>
    if (vocab.contains(term)) {
      val termIndex = vocab(term)
      wc(termIndex) = wc.getOrElse(termIndex, 0) + 1
    }
  }
  val indices = wc.keys.toArray.sorted
  val values = indices.map(i => wc(i).toDouble)
  val sb = Vectors.sparse(vocab.size, indices, values)
  (id, sb)
}

// Reverse index: term index -> term
val vocabArray = new Array[String](vocab.size)
vocab.foreach { case (term, i) => vocabArray(i) = term }

// Re-weight the term counts with TF-IDF
val tf = documents.map { case (id, vec) => vec }.cache()
val idfVals = new IDF().fit(tf).idf.toArray
val tfidfDocs: RDD[(Long, Vector)] = documents.map { case (id, vec) =>
  val indices = vec.asInstanceOf[SparseVector].indices
  val counts = new mutable.HashMap[Int, Double]()
  for (idx <- indices) {
    counts(idx) = vec(idx) * idfVals(idx)
  }
  (id, Vectors.sparse(vocab.size, counts.toSeq))
}

// Train an online LDA model and print the top terms of each topic
val numTopics = 5
val numIterations = 50
val lda = new LDA().setK(numTopics).setMaxIterations(numIterations).setOptimizer("online")
val ldaModel = lda.run(tfidfDocs)

val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 5)
topicIndices.foreach { case (terms, termWeights) =>
  println("TOPIC:")
  terms.zip(termWeights).foreach { case (term, weight) =>
    println(s"${vocabArray(term.toInt)}\t$weight")
  }
  println()
}
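Because the script selects the online optimizer, the returned model is a LocalLDAModel, which can also infer per-document topic mixtures. The following continuation, reusing ldaModel and tfidfDocs from the script, is a sketch of that extra step and is not part of the original file.

import org.apache.spark.mllib.clustering.LocalLDAModel

// The online optimizer yields a LocalLDAModel, which can infer a topic
// distribution for each (documentId, termVector) pair.
val localModel = ldaModel.asInstanceOf[LocalLDAModel]
val topicMixtures = localModel.topicDistributions(tfidfDocs)
topicMixtures.take(3).foreach { case (docId, dist) =>
  println(s"Document $docId: ${dist.toArray.mkString(", ")}")
}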
Example 4
Source File: LDAExample.scala from Swallow (Apache License 2.0)
package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.{LDA, DistributedLDAModel, LocalLDAModel}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import scopt.OptionParser

object LDAExample {

  case class Params(
      inputPath: String = null,
      outputPath: String = null,
      numTopics: Int = 10,
      maxIterations: Int = 10,
      optimizer: String = "online",
      maxResultSize: String = "1g")

  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("LDA") {
      head("LDA: an example app for LDA.")
      opt[String]("optimizer")
        .text(s"optimizer, default: ${defaultParams.optimizer}")
        .action((x, c) => c.copy(optimizer = x))
      opt[String]("maxResultSize")
        .text(s"max resultSize, default: ${defaultParams.maxResultSize}")
        .action((x, c) => c.copy(maxResultSize = x))
      opt[Int]("numTopics")
        .text(s"number of topics, default: ${defaultParams.numTopics}")
        .action((x, c) => c.copy(numTopics = x))
      opt[Int]("maxIterations")
        .text(s"number of max iterations, default: ${defaultParams.maxIterations}")
        .action((x, c) => c.copy(maxIterations = x))
      arg[String]("<inputPath>")
        .required()
        .text("input path")
        .action((x, c) => c.copy(inputPath = x))
      arg[String]("<outputPath>")
        .required()
        .text("output path")
        .action((x, c) => c.copy(outputPath = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf()
      .setAppName(s"LDA Example with $params")
      .set("spark.driver.maxResultSize", params.maxResultSize)
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    // The input is expected to be an RDD[(Long, Vector)] written with saveAsObjectFile
    val corpus: RDD[(Long, Vector)] = sc.objectFile(params.inputPath)

    // Cluster the documents into numTopics topics using LDA
    val ldaModel = new LDA()
      .setK(params.numTopics)
      .setMaxIterations(params.maxIterations)
      .setOptimizer(params.optimizer)
      .run(corpus)

    // Save and load the model.
    ldaModel.save(sc, params.outputPath)
    val savedModel = LocalLDAModel.load(sc, params.outputPath)

    sc.stop()
  }
}
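The benchmark expects params.inputPath to contain an RDD[(Long, Vector)] written with saveAsObjectFile. A corpus prepared as in Examples 2 or 3 could be serialized into that format roughly as follows; the helper name and path handling here are illustrative, not part of HiBench.

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// corpus: (documentId, termCountVector) pairs built by a preprocessing
// pipeline such as the ones in Examples 2 and 3.
def writeCorpus(corpus: RDD[(Long, Vector)], path: String): Unit = {
  // saveAsObjectFile writes serialized pairs, which is exactly what
  // sc.objectFile[(Long, Vector)](path) in run() reads back.
  corpus.saveAsObjectFile(path)
}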
Example 5
Source File: DRTest.scala from spark-flow (Apache License 2.0)
package com.bloomberg.sparkflow.dc

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.scalatest._

import scala.util.Random
import com.bloomberg.sparkflow._

// The original excerpt starts mid-class; the surrounding test class and the
// opening of the first test are reconstructed here so that the snippet parses.
class DRTest extends FunSuite with SharedSparkContext {

  test("lda") {
    // Build a random corpus using spark-flow's deferred collections (DC)
    val randomVecs = parallelize(1 to 100)
      .map(i => Vectors.dense(Seq.fill(10)(Random.nextDouble()).toArray))
    val corpus = randomVecs.zipWithUniqueId().map { case (k, v) => (v, k) }
    // mapToResult runs LDA on the underlying RDD and wraps the model as a deferred result
    val ldaModel = corpus.mapToResult(rdd => new LDA().setK(3).run(rdd))
  }

  test("regularSpark") {
    val numbers: RDD[Int] = sc.parallelize(1 to 10)
    val doubles: RDD[Double] = numbers.map(_.toDouble)
    val sum: Double = doubles.sum()
    val normalized: RDD[Double] = doubles.map(_ / sum)
  }
}
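For comparison, the same random-corpus fit written against plain Spark MLlib, without the spark-flow DC wrapper, would look roughly like this; sc is an existing SparkContext and the sizes mirror the test above. This is a sketch, not part of the spark-flow test suite.

import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors
import scala.util.Random

// 100 random 10-dimensional "documents", keyed by unique ids, as in the test.
val randomVecs = sc.parallelize(1 to 100)
  .map(_ => Vectors.dense(Array.fill(10)(Random.nextDouble())))
val corpus = randomVecs.zipWithUniqueId().map { case (vec, id) => (id, vec) }

// Fit a 3-topic model directly on the RDD.
val ldaModel = new LDA().setK(3).run(corpus)
println(s"Fitted ${ldaModel.k} topics over ${ldaModel.vocabSize} terms")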