org.apache.spark.mllib.feature.IDF Scala Examples
The following examples show how to use org.apache.spark.mllib.feature.IDF.
Each example is taken from an open-source project; the source file and license are noted above each listing.
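Before the full examples, here is a minimal sketch of the fit/transform flow they all share: HashingTF hashes each tokenized document into a term-frequency vector, IDF().fit builds an IDFModel from those vectors, and IDFModel.transform rescales them into TF-IDF vectors. This sketch is not taken from any of the projects below; the object name IDFQuickStart and the input path docs.txt are placeholders.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

object IDFQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("IDFQuickStart").setMaster("local"))

    // One whitespace-tokenized document per line; "docs.txt" is a placeholder path.
    val documents: RDD[Seq[String]] = sc.textFile("docs.txt").map(_.split(" ").toSeq)

    // Hash each document's tokens into a fixed-size term-frequency vector.
    val hashingTF = new HashingTF()
    val tf: RDD[Vector] = hashingTF.transform(documents)
    tf.cache()

    // Fit the IDF model on the TF vectors, then rescale them to TF-IDF.
    // minDocFreq = 2 ignores terms that appear in fewer than two documents (optional).
    val idf = new IDF(minDocFreq = 2).fit(tf)
    val tfidf: RDD[Vector] = idf.transform(tf)

    tfidf.take(5).foreach(println)
    sc.stop()
  }
}

The examples below follow the same pattern and then build document similarity, classification, topic modelling, or summarization on top of the TF-IDF vectors.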
Example 1
Source File: TFIDF.scala From AI with Apache License 2.0
package com.bigchange.mllib

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.{SparseVector => SV}
import org.apache.spark.{SparkConf, SparkContext}

import scala.io.Source

object TFIDF {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("TfIdfTest")
      .setMaster("local")
    val sc = new SparkContext(conf)

    // Load documents (one per line); zipWithIndex uses each line number as the doc id.
    val documents = sc.parallelize(Source.fromFile("J:\\github\\dataSet\\TFIDF-DOC").getLines()
      .filter(_.trim.length > 0).toSeq)
      .map(_.split(" ").toSeq)
      .zipWithIndex()

    // Feature dimension
    val hashingTF = new HashingTF(Math.pow(2, 18).toInt)

    // Line number as doc id; the tokens of each line become a TF vector.
    val idAndTFVector = documents.map { case (seq, num) =>
      val tf = hashingTF.transform(seq)
      (num + 1, tf)
    }
    idAndTFVector.cache()

    // Build the IDF model
    val idf = new IDF().fit(idAndTFVector.values)

    // Transform TF vectors into TF-IDF vectors
    val idAndTFIDFVector = idAndTFVector.mapValues(v => idf.transform(v))

    // Broadcast the TF-IDF vectors
    val idAndTFIDFVectorBroadCast = sc.broadcast(idAndTFIDFVector.collect())

    // Compute pairwise document cosine similarity
    val docSims = idAndTFIDFVector.flatMap { case (id1, idf1) =>
      // Filter out the document itself
      val idfs = idAndTFIDFVectorBroadCast.value.filter(_._1 != id1)
      val sv1 = idf1.asInstanceOf[SV]
      import breeze.linalg._
      val bsv1 = new SparseVector[Double](sv1.indices, sv1.values, sv1.size)
      idfs.map { case (id2, idf2) =>
        val sv2 = idf2.asInstanceOf[SV]
        val bsv2 = new SparseVector[Double](sv2.indices, sv2.values, sv2.size)
        val cosSim = bsv1.dot(bsv2) / (norm(bsv1) * norm(bsv2))
        (id1, id2, cosSim)
      }
    }
    docSims.foreach(println)

    sc.stop()
  }
}
Example 2
Source File: TfIdfSample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.featureext

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.feature.IDF
import org.sparksamples.Util

object TfIdfSample {

  def main(args: Array[String]) {
    // TODO replace with path specific to your machine
    val file = Util.SPARK_HOME + "/README.md"
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)

    val documents: RDD[Seq[String]] = sc.textFile(file).map(_.split(" ").toSeq)
    print("Documents Size:" + documents.count)

    val hashingTF = new HashingTF()
    val tf = hashingTF.transform(documents)
    for (tf_ <- tf) {
      println(s"$tf_")
    }
    tf.cache()

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)
    println("tfidf size : " + tfidf.count)
    for (tfidf_ <- tfidf) {
      println(s"$tfidf_")
    }
  }
}
Example 3
Source File: TfIdfSample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.featureext

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.feature.IDF

object TfIdfSample {

  def main(args: Array[String]) {
    // TODO replace with path specific to your machine
    val file = "/home/ubuntu/work/spark-1.6.0-bin-hadoop2.6//README.md"
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)

    val documents: RDD[Seq[String]] = sc.textFile(file).map(_.split(" ").toSeq)
    print("Documents Size:" + documents.count)

    val hashingTF = new HashingTF()
    val tf = hashingTF.transform(documents)
    for (tf_ <- tf) {
      println(s"$tf_")
    }
    tf.cache()

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)
    println("tfidf size : " + tfidf.count)
    for (tfidf_ <- tfidf) {
      println(s"$tfidf_")
    }
  }
}
Example 4
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{SparseVector => SV}

object DocumentClassification {

  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "First Spark App")

    val path = "../data/20news-bydate-train/*"
    val rdd = sc.wholeTextFiles(path)
    val text = rdd.map { case (file, text) => text }
    val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head }
    val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap
    val dim = math.pow(2, 18).toInt
    val hashingTF = new HashingTF(dim)

    val tokens = text.map(doc => TFIDFExtraction.tokenize(doc))
    val tf = hashingTF.transform(tokens)
    tf.cache
    val v = tf.first.asInstanceOf[SV]

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)

    val zipped = newsgroups.zip(tfidf)
    val train = zipped.map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) }
    train.cache

    val model = NaiveBayes.train(train, lambda = 0.1)

    val testPath = "../data/20news-bydate-test/*"
    val testRDD = sc.wholeTextFiles(testPath)
    val testLabels = testRDD.map { case (file, text) =>
      val topic = file.split("/").takeRight(2).head
      newsgroupsMap(topic)
    }
    val testTf = testRDD.map { case (file, text) =>
      hashingTF.transform(TFIDFExtraction.tokenize(text))
    }
    val testTfIdf = idf.transform(testTf)
    val zippedTest = testLabels.zip(testTfIdf)
    val test = zippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) }

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
    println(accuracy)
    // Updated Dec 2016 by Rajdeep
    // 0.7928836962294211
    val metrics = new MulticlassMetrics(predictionAndLabel)
    println(metrics.weightedFMeasure)
    // 0.7822644376431702

    val rawTokens = rdd.map { case (file, text) => text.split(" ") }
    val rawTF = rawTokens.map(doc => hashingTF.transform(doc))
    val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) =>
      LabeledPoint(newsgroupsMap(topic), vector)
    }
    val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)
    val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) }
    val rawZippedTest = testLabels.zip(rawTestTF)
    val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) }
    val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label))
    val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count()
    println(rawAccuracy)
    // 0.7661975570897503
    val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
    println(rawMetrics.weightedFMeasure)
    // older value 0.7628947184990661
    // dec 2016 : 0.7653320418573546

    sc.stop()
  }
}
Example 5
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{SparseVector => SV}
import org.apache.spark.mllib.util.MLUtils
//import org.apache.spark.ml.feature.HashingTF
//import org.apache.spark.ml.feature.IDF

object DocumentClassification {

  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "First Spark App")

    val path = "../data/20news-bydate-train/*"
    val rdd = sc.wholeTextFiles(path)
    val text = rdd.map { case (file, text) => text }
    val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head }
    val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap
    val dim = math.pow(2, 18).toInt
    val hashingTF = new HashingTF(dim)

    val tokens = text.map(doc => TFIDFExtraction.tokenize(doc))
    val tf = hashingTF.transform(tokens)
    tf.cache
    val v = tf.first.asInstanceOf[SV]

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)

    val zipped = newsgroups.zip(tfidf)
    println(zipped.first())
    val train = zipped.map { case (topic, vector) =>
      LabeledPoint(newsgroupsMap(topic), vector)
    }
    // TODO uncomment to generate libsvm format
    MLUtils.saveAsLibSVMFile(train, "./output/20news-by-date-train-libsvm")
    train.cache

    val model = NaiveBayes.train(train, lambda = 0.1)

    val testPath = "../data/20news-bydate-test/*"
    val testRDD = sc.wholeTextFiles(testPath)
    val testLabels = testRDD.map { case (file, text) =>
      val topic = file.split("/").takeRight(2).head
      newsgroupsMap(topic)
    }
    val testTf = testRDD.map { case (file, text) =>
      hashingTF.transform(TFIDFExtraction.tokenize(text))
    }
    val testTfIdf = idf.transform(testTf)
    val zippedTest = testLabels.zip(testTfIdf)
    val test = zippedTest.map { case (topic, vector) =>
      println(topic)
      println(vector)
      LabeledPoint(topic, vector)
    }
    // TODO uncomment to generate libsvm format
    MLUtils.saveAsLibSVMFile(test, "./output/20news-by-date-test-libsvm")

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
    println(accuracy)
    // Updated Dec 2016 by Rajdeep
    // 0.7928836962294211
    val metrics = new MulticlassMetrics(predictionAndLabel)
    println(metrics.accuracy)
    println(metrics.weightedFalsePositiveRate)
    println(metrics.weightedPrecision)
    println(metrics.weightedFMeasure)
    println(metrics.weightedRecall)
    // 0.7822644376431702

    val rawTokens = rdd.map { case (file, text) => text.split(" ") }
    val rawTF = rawTokens.map(doc => hashingTF.transform(doc))
    val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) =>
      LabeledPoint(newsgroupsMap(topic), vector)
    }
    val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)
    val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) }
    val rawZippedTest = testLabels.zip(rawTestTF)
    val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) }
    val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label))
    val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count()
    println(rawAccuracy)
    // 0.7661975570897503
    val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
    println(rawMetrics.weightedFMeasure)
    // older value 0.7628947184990661
    // dec 2016 : 0.7653320418573546

    sc.stop()
  }
}
Example 6
Source File: lda-script.scala From practical-data-science-with-hadoop-and-spark with Apache License 2.0
import collection.JavaConversions._
import scala.collection.mutable

import opennlp.tools.tokenize.SimpleTokenizer
import opennlp.tools.stemmer.PorterStemmer

import org.apache.spark.rdd._
import org.apache.spark.mllib.clustering.{OnlineLDAOptimizer, DistributedLDAModel, LDA}
import org.apache.spark.mllib.linalg.{Vector, SparseVector, Vectors}
import org.apache.spark.mllib.feature.IDF

// add openNLP jar to the Spark Context
sc.addJar("opennlp-tools-1.6.0.jar")

// Load documents from text files, 1 element (text string) per file
val corpus = sc.wholeTextFiles("ohsumed/C*", 20).map(x => x._2)

// read stop words from file
val stopwordFile = "stop-words.txt"
val st_words = sc.textFile(stopwordFile).collect()
  .flatMap(_.stripMargin.split("\\s+")).map(_.toLowerCase).toSet
val stopwords = sc.broadcast(st_words)

val minWordLength = 3
val tokenized: RDD[(Long, Array[String])] = corpus.zipWithIndex().map { case (text, id) =>
  val tokenizer = SimpleTokenizer.INSTANCE
  val stemmer = new PorterStemmer()
  val tokens = tokenizer.tokenize(text)
  val words = tokens.filter(w => (w.length >= minWordLength) && (!stopwords.value.contains(w)))
    .map(w => stemmer.stem(w))
  id -> words
}.filter(_._2.length > 0)
tokenized.cache()
val numDocs = tokenized.count()

val wordCounts: RDD[(String, Long)] = tokenized.flatMap { case (_, tokens) =>
  tokens.map(_ -> 1L)
}.reduceByKey(_ + _)
wordCounts.cache()
val fullVocabSize = wordCounts.count()

val vSize = 10000
val (vocab: Map[String, Int], selectedTokenCount: Long) = {
  val sortedWC: Array[(String, Long)] = wordCounts.sortBy(_._2, ascending = false).take(vSize)
  (sortedWC.map(_._1).zipWithIndex.toMap, sortedWC.map(_._2).sum)
}

val documents = tokenized.map { case (id, tokens) =>
  // Filter tokens by vocabulary, and create word count vector representation of document.
  val wc = new mutable.HashMap[Int, Int]()
  tokens.foreach { term =>
    if (vocab.contains(term)) {
      val termIndex = vocab(term)
      wc(termIndex) = wc.getOrElse(termIndex, 0) + 1
    }
  }
  val indices = wc.keys.toArray.sorted
  val values = indices.map(i => wc(i).toDouble)
  val sb = Vectors.sparse(vocab.size, indices, values)
  (id, sb)
}

val vocabArray = new Array[String](vocab.size)
vocab.foreach { case (term, i) => vocabArray(i) = term }

val tf = documents.map { case (id, vec) => vec }.cache()
val idfVals = new IDF().fit(tf).idf.toArray

val tfidfDocs: RDD[(Long, Vector)] = documents.map { case (id, vec) =>
  val indices = vec.asInstanceOf[SparseVector].indices
  val counts = new mutable.HashMap[Int, Double]()
  for (idx <- indices) {
    counts(idx) = vec(idx) * idfVals(idx)
  }
  (id, Vectors.sparse(vocab.size, counts.toSeq))
}

val numTopics = 5
val numIterations = 50
val lda = new LDA().setK(numTopics).setMaxIterations(numIterations).setOptimizer("online")
val ldaModel = lda.run(tfidfDocs)

val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 5)
topicIndices.foreach { case (terms, termWeights) =>
  println("TOPIC:")
  terms.zip(termWeights).foreach { case (term, weight) =>
    println(s"${vocabArray(term.toInt)}\t$weight")
  }
  println()
}
Example 7
Source File: DocumentSegmenter.scala From lexrank-summarizer with MIT License
package io.github.karlhigley.lexrank

import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.SparseVector

import chalk.text.analyze.PorterStemmer
import chalk.text.segment.JavaSentenceSegmenter
import chalk.text.tokenize.SimpleEnglishTokenizer

case class Document(id: String, text: String)
case class Sentence(id: Long, docId: String, text: String)
case class SentenceTokens(id: Long, docId: String, tokens: Seq[String])

class DocumentSegmenter extends Serializable {

  def apply(documents: RDD[Document]) = {
    val sentences = extractSentences(documents)
    val tokenized = tokenize(sentences)
    (sentences, tokenized)
  }

  private def extractSentences(documents: RDD[Document]): RDD[Sentence] = {
    documents
      .flatMap(d => segment(d.text).map(t => (d.id, t)))
      .zipWithIndex()
      .map({
        case ((docId, sentenceText), sentenceId) => Sentence(sentenceId, docId, sentenceText)
      })
  }

  private def tokenize(sentences: RDD[Sentence]): RDD[SentenceTokens] = {
    val tokenizer = SimpleEnglishTokenizer()
    val nonWord = "[^a-z]*".r

    sentences.map(s => {
      val tokens = tokenizer(s.text.toLowerCase).toSeq
        .map(nonWord.replaceAllIn(_, ""))
        .filter(_.length > 3)
        .map(stem)

      SentenceTokens(s.id, s.docId, tokens)
    })
  }

  private def segment(text: String): Seq[String] = {
    JavaSentenceSegmenter(text).toSeq
  }

  private def stem(token: String): String = {
    PorterStemmer(token)
  }
}
Example 8
Source File: Featurizer.scala From lexrank-summarizer with MIT License
package io.github.karlhigley.lexrank

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.{SparseVector, Vector}

case class SentenceFeatures(id: Long, docId: String, features: SparseVector)

class Featurizer(numStopwords: Int = 0) extends Serializable {

  private val hashingTF = new HashingTF()
  private val byIDF = Ordering[Double].on[(Int, Double)](_._2)

  def apply(tokens: RDD[SentenceTokens]): RDD[SentenceFeatures] = {
    val idf = new IDF(minDocFreq = 2)

    val termFrequencies = tokens.map(t => {
      (t.id, t.docId, hashingTF.transform(t.tokens))
    })

    val idfModel = idf.fit(termFrequencies.map({ case (_, _, tf) => tf }))

    val stopwordIndices = identifyStopwords(idfModel.idf.toSparse, numStopwords)

    termFrequencies
      .map({ case (id, docId, tf) =>
        val tfidf = idfModel.transform(tf).toSparse
        val features = removeStopwords(tfidf, stopwordIndices)
        SentenceFeatures(id, docId, features)
      })
      .filter(_.features.indices.size > 0)
  }

  def indexOf(token: String): Int = {
    hashingTF.indexOf(token)
  }

  private def identifyStopwords(idf: SparseVector, numStopwords: Int) = {
    featureTuples(idf).sorted(byIDF).take(numStopwords).map(_._1)
  }

  private def removeStopwords(tf: SparseVector, stopwordIndices: Array[Int]) = {
    val (indices, values) = featureTuples(tf)
      .filter(p => !stopwordIndices.contains(p._1))
      .unzip
    new SparseVector(tf.size, indices.toArray, values.toArray)
  }

  private def featureTuples(featureVector: SparseVector) = {
    featureVector.indices.zip(featureVector.values)
  }
}