org.apache.spark.ml.clustering.LDA Scala Examples
The following examples show how to use org.apache.spark.ml.clustering.LDA.
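As a quick orientation before the full examples, here is a minimal sketch of the core workflow: fit an LDA estimator on a DataFrame whose "features" column holds term-count vectors, then inspect the topics and per-document topic distributions. An existing SparkSession named spark is assumed, and the input path is a placeholder.

import org.apache.spark.ml.clustering.LDA

// Assumes an existing SparkSession `spark`; the path below is a placeholder
// for a libsvm file of term-count vectors.
val corpus = spark.read.format("libsvm").load("data/sample_lda_libsvm_data.txt")

val model = new LDA().setK(5).setMaxIter(10).fit(corpus)

model.describeTopics(3).show(false)   // top-weighted terms per topic
model.transform(corpus).show(false)   // adds a topicDistribution column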
Example 1
Source File: LDAParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.clustering

import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.feature.{CountVectorizer, StopWordsRemover, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.sql.DataFrame
import org.scalatest.Ignore

@Ignore
class LDAParitySpec extends SparkParityBase {
  override val dataset: DataFrame = textDataset.select("text")

  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
  val remover = new StopWordsRemover()
    .setInputCol(tokenizer.getOutputCol)
    .setOutputCol("words_filtered")
  val cv = new CountVectorizer()
    .setInputCol("words_filtered")
    .setOutputCol("features")
    .setVocabSize(50000)
  val lda = new LDA().setK(5).setMaxIter(2)

  override val sparkTransformer: Transformer =
    new Pipeline().setStages(Array(tokenizer, remover, cv, lda)).fit(dataset)

  override def equalityTest(sparkDataset: DataFrame, mleapDataset: DataFrame): Unit = {
    val sparkPredictionCol = sparkDataset.schema.fieldIndex("topicDistribution")
    val mleapPredictionCol = mleapDataset.schema.fieldIndex("topicDistribution")

    sparkDataset.collect().zip(mleapDataset.collect()).foreach {
      case (sv, mv) =>
        val sparkPrediction = sv.getAs[Vector](sparkPredictionCol)
        val mleapPrediction = mv.getAs[Vector](mleapPredictionCol)

        sparkPrediction.toArray.zip(mleapPrediction.toArray).foreach {
          case (s, m) => assert(Math.abs(m - s) < 0.001)
        }
    }
  }
}
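The spec above only asserts that Spark and MLeap produce matching topicDistribution vectors, element-wise within a 0.001 tolerance. If you also want to see what the pipeline learned, a sketch like the following (not part of the spec; it reuses the spec's sparkTransformer) pulls the fitted LDA model out of the pipeline and prints its topics:

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.clustering.LDAModel

// `sparkTransformer` is the fitted pipeline from the spec; LDA is its last stage.
val ldaModel = sparkTransformer
  .asInstanceOf[PipelineModel]
  .stages.last
  .asInstanceOf[LDAModel]

ldaModel.describeTopics(5).show(truncate = false)  // top 5 terms per topic, by weight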
Example 2
Source File: LDAExample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.lda

// scalastyle:off println
// $example on$
import org.apache.spark.SparkConf
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.sql.SparkSession
// $example off$

object LDAExample {

  def main(args: Array[String]): Unit = {
    val spConfig = (new SparkConf).setMaster("local[1]").setAppName("SparkApp")
      .set("spark.driver.allowMultipleContexts", "true")

    val spark = SparkSession
      .builder()
      .appName("Spark SQL Example")
      .config(spConfig)
      .getOrCreate()

    // $example on$
    // Loads data. SPARK_PATH points at the local Spark checkout and is
    // defined elsewhere in the project.
    val dataset = spark.read.format("libsvm")
      .load(SPARK_PATH + "data/mllib/sample_lda_libsvm_data.txt")

    // Trains an LDA model.
    val lda = new LDA().setK(10).setMaxIter(10)
    val model = lda.fit(dataset)

    val ll = model.logLikelihood(dataset)
    val lp = model.logPerplexity(dataset)
    println(s"The lower bound on the log likelihood of the entire corpus: $ll")
    println(s"The upper bound on perplexity: $lp")

    // Describe topics.
    val topics = model.describeTopics(3)
    println("The topics described by their top-weighted terms:")
    topics.show(false)

    // Shows the result.
    val transformed = model.transform(dataset)
    transformed.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
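This example leaves every LDA parameter except k and maxIter at its default. In practice a few more knobs matter; the sketch below names the commonly tuned ones, with placeholder values that are illustrative assumptions, not recommendations:

// Commonly tuned LDA parameters (values are placeholders):
val tunedLda = new LDA()
  .setK(10)
  .setMaxIter(10)
  .setOptimizer("online")       // "online" (default, variational Bayes) or "em"
  .setDocConcentration(1.1)     // alpha: prior on document-topic distributions
  .setTopicConcentration(1.1)   // beta: prior on topic-term distributions
  .setSubsamplingRate(0.05)     // mini-batch fraction, used by the online optimizer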
Example 3
Source File: OpLDATest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.types._
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Assertions, FlatSpec, Matchers}

@RunWith(classOf[JUnitRunner])
class OpLDATest extends FlatSpec with TestSparkContext {

  val inputData = Seq(
    (0.0, Vectors.sparse(11, Array(0, 1, 2, 4, 5, 6, 7, 10), Array(1.0, 2.0, 6.0, 2.0, 3.0, 1.0, 1.0, 3.0))),
    (1.0, Vectors.sparse(11, Array(0, 1, 3, 4, 7, 10), Array(1.0, 3.0, 1.0, 3.0, 2.0, 1.0))),
    (2.0, Vectors.sparse(11, Array(0, 1, 2, 5, 6, 8, 9), Array(1.0, 4.0, 1.0, 4.0, 9.0, 1.0, 2.0))),
    (3.0, Vectors.sparse(11, Array(0, 1, 3, 6, 8, 9, 10), Array(2.0, 1.0, 3.0, 5.0, 2.0, 3.0, 9.0))),
    (4.0, Vectors.sparse(11, Array(0, 1, 2, 3, 4, 6, 9, 10), Array(3.0, 1.0, 1.0, 9.0, 3.0, 2.0, 1.0, 3.0))),
    (5.0, Vectors.sparse(11, Array(0, 1, 3, 4, 5, 6, 7, 8, 9), Array(4.0, 2.0, 3.0, 4.0, 5.0, 1.0, 1.0, 1.0, 4.0))),
    (6.0, Vectors.sparse(11, Array(0, 1, 3, 6, 8, 9, 10), Array(2.0, 1.0, 3.0, 5.0, 2.0, 2.0, 9.0))),
    (7.0, Vectors.sparse(11, Array(0, 1, 2, 3, 4, 5, 6, 9, 10), Array(1.0, 1.0, 1.0, 9.0, 2.0, 1.0, 2.0, 1.0, 3.0))),
    (8.0, Vectors.sparse(11, Array(0, 1, 3, 4, 5, 6, 7), Array(4.0, 4.0, 3.0, 4.0, 2.0, 1.0, 3.0))),
    (9.0, Vectors.sparse(11, Array(0, 1, 2, 4, 6, 8, 9, 10), Array(2.0, 8.0, 2.0, 3.0, 2.0, 2.0, 7.0, 2.0))),
    (10.0, Vectors.sparse(11, Array(0, 1, 2, 3, 5, 6, 9, 10), Array(1.0, 1.0, 1.0, 9.0, 2.0, 2.0, 3.0, 3.0))),
    (11.0, Vectors.sparse(11, Array(0, 1, 4, 5, 6, 7, 9), Array(4.0, 1.0, 4.0, 5.0, 1.0, 3.0, 1.0)))
  ).map(v => v._1.toReal -> v._2.toOPVector)

  lazy val (ds, f1, f2) = TestFeatureBuilder(inputData)
  lazy val inputDS = ds.persist()

  val seed = 1234567890L
  val k = 3
  val maxIter = 100

  lazy val expected = new LDA()
    .setFeaturesCol(f2.name)
    .setK(k)
    .setSeed(seed)
    .fit(inputDS)
    .transform(inputDS)
    .select("topicDistribution")
    .collect()
    .toSeq
    .map(_.getAs[Vector](0))

  Spec[OpLDA] should "convert document term vectors into topic vectors" in {
    val f2Vec = new OpLDA().setInput(f2).setK(k).setSeed(seed).setMaxIter(maxIter)
    val testTransformedData = f2Vec.fit(inputDS).transform(inputDS)
    val output = f2Vec.getOutput()
    val estimate = testTransformedData.collect(output)
    val mse = computeMeanSqError(estimate, expected)
    val expectedMse = 0.5
    withClue(s"Computed mse $mse (expected $expectedMse)") {
      mse should be < expectedMse
    }
  }

  it should "convert document term vectors into topic vectors (shortcut version)" in {
    val output = f2.lda(k = k, seed = seed, maxIter = maxIter)
    val f2Vec = output.originStage.asInstanceOf[OpLDA]
    val testTransformedData = f2Vec.fit(inputDS).transform(inputDS)
    val estimate = testTransformedData.collect(output)
    val mse = computeMeanSqError(estimate, expected)
    val expectedMse = 0.5
    withClue(s"Computed mse $mse (expected $expectedMse)") {
      mse should be < expectedMse
    }
  }

  private def computeMeanSqError(estimate: Seq[OPVector], expected: Seq[Vector]): Double = {
    val n = estimate.length.toDouble
    estimate.zip(expected).map { case (est, exp) => Vectors.sqdist(est.value, exp) }.sum / n
  }
}
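The computeMeanSqError helper averages Vectors.sqdist, the squared Euclidean distance, over all document pairs, and the test accepts any mean below 0.5. A tiny worked example of that distance, with made-up topic distributions:

import org.apache.spark.ml.linalg.Vectors

val a = Vectors.dense(0.20, 0.30, 0.50)
val b = Vectors.dense(0.25, 0.25, 0.50)
// (0.20 - 0.25)^2 + (0.30 - 0.25)^2 + (0.50 - 0.50)^2 = 0.005
val d = Vectors.sqdist(a, b)  // 0.005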