org.apache.spark.mllib.feature.IDF Scala Examples
The following examples show how to use org.apache.spark.mllib.feature.IDF.
Each example is taken from an open-source project; the source file and license are noted above each listing.
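Before the full examples, here is a minimal sketch of the fit/transform flow they all share: HashingTF hashes each tokenized document into a term-frequency vector, IDF().fit builds an IDFModel from those vectors, and IDFModel.transform rescales them into TF-IDF vectors. This sketch is not taken from any of the projects below; the object name IDFQuickStart and the input path docs.txt are placeholders.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

object IDFQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("IDFQuickStart").setMaster("local"))

    // One whitespace-tokenized document per line; "docs.txt" is a placeholder path.
    val documents: RDD[Seq[String]] = sc.textFile("docs.txt").map(_.split(" ").toSeq)

    // Hash each document's tokens into a fixed-size term-frequency vector.
    val hashingTF = new HashingTF()
    val tf: RDD[Vector] = hashingTF.transform(documents)
    tf.cache()

    // Fit the IDF model on the TF vectors, then rescale them to TF-IDF.
    // minDocFreq = 2 ignores terms that appear in fewer than two documents (optional).
    val idf = new IDF(minDocFreq = 2).fit(tf)
    val tfidf: RDD[Vector] = idf.transform(tf)

    tfidf.take(5).foreach(println)
    sc.stop()
  }
}

The examples below follow the same pattern and then build document similarity, classification, topic modelling, or summarization on top of the TF-IDF vectors.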
Example 1
Source File: TFIDF.scala From AI with Apache License 2.0
package com.bigchange.mllib

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.{SparseVector => SV}
import org.apache.spark.{SparkConf, SparkContext}

import scala.io.Source

object TFIDF {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("TfIdfTest")
      .setMaster("local")
    val sc = new SparkContext(conf)

    // Load documents (one per line); zipWithIndex uses each line number as the doc id.
    val documents = sc.parallelize(Source.fromFile("J:\\github\\dataSet\\TFIDF-DOC").getLines()
      .filter(_.trim.length > 0).toSeq)
      .map(_.split(" ").toSeq)
      .zipWithIndex()

    // Feature dimension
    val hashingTF = new HashingTF(Math.pow(2, 18).toInt)

    // Line number as doc id; the tokens of each line become a TF vector.
    val idAndTFVector = documents.map { case (seq, num) =>
      val tf = hashingTF.transform(seq)
      (num + 1, tf)
    }
    idAndTFVector.cache()

    // Build the IDF model
    val idf = new IDF().fit(idAndTFVector.values)

    // Transform TF vectors into TF-IDF vectors
    val idAndTFIDFVector = idAndTFVector.mapValues(v => idf.transform(v))

    // Broadcast the TF-IDF vectors
    val idAndTFIDFVectorBroadCast = sc.broadcast(idAndTFIDFVector.collect())

    // Compute pairwise document cosine similarity
    val docSims = idAndTFIDFVector.flatMap { case (id1, idf1) =>
      // Filter out the document itself
      val idfs = idAndTFIDFVectorBroadCast.value.filter(_._1 != id1)
      val sv1 = idf1.asInstanceOf[SV]
      import breeze.linalg._
      val bsv1 = new SparseVector[Double](sv1.indices, sv1.values, sv1.size)
      idfs.map { case (id2, idf2) =>
        val sv2 = idf2.asInstanceOf[SV]
        val bsv2 = new SparseVector[Double](sv2.indices, sv2.values, sv2.size)
        val cosSim = bsv1.dot(bsv2) / (norm(bsv1) * norm(bsv2))
        (id1, id2, cosSim)
      }
    }
    docSims.foreach(println)

    sc.stop()
  }
}
Example 2
Source File: TfIdfSample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.featureext

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.feature.IDF
import org.sparksamples.Util

object TfIdfSample {

  def main(args: Array[String]) {
    // TODO replace with path specific to your machine
    val file = Util.SPARK_HOME + "/README.md"
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)

    val documents: RDD[Seq[String]] = sc.textFile(file).map(_.split(" ").toSeq)
    print("Documents Size:" + documents.count)

    val hashingTF = new HashingTF()
    val tf = hashingTF.transform(documents)
    for (tf_ <- tf) {
      println(s"$tf_")
    }
    tf.cache()

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)
    println("tfidf size : " + tfidf.count)
    for (tfidf_ <- tfidf) {
      println(s"$tfidf_")
    }
  }
}
Example 3
Source File: TfIdfSample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.featureext

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.feature.IDF

object TfIdfSample {

  def main(args: Array[String]) {
    // TODO replace with path specific to your machine
    val file = "/home/ubuntu/work/spark-1.6.0-bin-hadoop2.6//README.md"
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)

    val documents: RDD[Seq[String]] = sc.textFile(file).map(_.split(" ").toSeq)
    print("Documents Size:" + documents.count)

    val hashingTF = new HashingTF()
    val tf = hashingTF.transform(documents)
    for (tf_ <- tf) {
      println(s"$tf_")
    }
    tf.cache()

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)
    println("tfidf size : " + tfidf.count)
    for (tfidf_ <- tfidf) {
      println(s"$tfidf_")
    }
  }
}
Example 4
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{SparseVector => SV}

object DocumentClassification {

  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "First Spark App")

    val path = "../data/20news-bydate-train/*"
    val rdd = sc.wholeTextFiles(path)
    val text = rdd.map { case (file, text) => text }
    val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head }
    val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap
    val dim = math.pow(2, 18).toInt
    val hashingTF = new HashingTF(dim)

    val tokens = text.map(doc => TFIDFExtraction.tokenize(doc))
    val tf = hashingTF.transform(tokens)
    tf.cache
    val v = tf.first.asInstanceOf[SV]

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)

    val zipped = newsgroups.zip(tfidf)
    val train = zipped.map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) }
    train.cache

    val model = NaiveBayes.train(train, lambda = 0.1)

    val testPath = "../data/20news-bydate-test/*"
    val testRDD = sc.wholeTextFiles(testPath)
    val testLabels = testRDD.map { case (file, text) =>
      val topic = file.split("/").takeRight(2).head
      newsgroupsMap(topic)
    }
    val testTf = testRDD.map { case (file, text) =>
      hashingTF.transform(TFIDFExtraction.tokenize(text))
    }
    val testTfIdf = idf.transform(testTf)
    val zippedTest = testLabels.zip(testTfIdf)
    val test = zippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) }

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
    println(accuracy)
    // Updated Dec 2016 by Rajdeep
    // 0.7928836962294211
    val metrics = new MulticlassMetrics(predictionAndLabel)
    println(metrics.weightedFMeasure)
    // 0.7822644376431702

    val rawTokens = rdd.map { case (file, text) => text.split(" ") }
    val rawTF = rawTokens.map(doc => hashingTF.transform(doc))
    val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) =>
      LabeledPoint(newsgroupsMap(topic), vector)
    }
    val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)
    val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) }
    val rawZippedTest = testLabels.zip(rawTestTF)
    val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) }
    val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label))
    val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count()
    println(rawAccuracy)
    // 0.7661975570897503
    val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
    println(rawMetrics.weightedFMeasure)
    // older value 0.7628947184990661
    // dec 2016 : 0.7653320418573546

    sc.stop()
  }
}
Example 5
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{SparseVector => SV}
import org.apache.spark.mllib.util.MLUtils
//import org.apache.spark.ml.feature.HashingTF
//import org.apache.spark.ml.feature.IDF

object DocumentClassification {

  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "First Spark App")

    val path = "../data/20news-bydate-train/*"
    val rdd = sc.wholeTextFiles(path)
    val text = rdd.map { case (file, text) => text }
    val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head }
    val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap
    val dim = math.pow(2, 18).toInt
    val hashingTF = new HashingTF(dim)

    val tokens = text.map(doc => TFIDFExtraction.tokenize(doc))
    val tf = hashingTF.transform(tokens)
    tf.cache
    val v = tf.first.asInstanceOf[SV]

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)

    val zipped = newsgroups.zip(tfidf)
    println(zipped.first())
    val train = zipped.map { case (topic, vector) =>
      LabeledPoint(newsgroupsMap(topic), vector)
    }
    // TODO uncomment to generate libsvm format
    MLUtils.saveAsLibSVMFile(train, "./output/20news-by-date-train-libsvm")
    train.cache

    val model = NaiveBayes.train(train, lambda = 0.1)

    val testPath = "../data/20news-bydate-test/*"
    val testRDD = sc.wholeTextFiles(testPath)
    val testLabels = testRDD.map { case (file, text) =>
      val topic = file.split("/").takeRight(2).head
      newsgroupsMap(topic)
    }
    val testTf = testRDD.map { case (file, text) =>
      hashingTF.transform(TFIDFExtraction.tokenize(text))
    }
    val testTfIdf = idf.transform(testTf)
    val zippedTest = testLabels.zip(testTfIdf)
    val test = zippedTest.map { case (topic, vector) =>
      println(topic)
      println(vector)
      LabeledPoint(topic, vector)
    }
    // TODO uncomment to generate libsvm format
    MLUtils.saveAsLibSVMFile(test, "./output/20news-by-date-test-libsvm")

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
    println(accuracy)
    // Updated Dec 2016 by Rajdeep
    // 0.7928836962294211
    val metrics = new MulticlassMetrics(predictionAndLabel)
    println(metrics.accuracy)
    println(metrics.weightedFalsePositiveRate)
    println(metrics.weightedPrecision)
    println(metrics.weightedFMeasure)
    println(metrics.weightedRecall)
    // 0.7822644376431702

    val rawTokens = rdd.map { case (file, text) => text.split(" ") }
    val rawTF = rawTokens.map(doc => hashingTF.transform(doc))
    val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) =>
      LabeledPoint(newsgroupsMap(topic), vector)
    }
    val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)
    val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) }
    val rawZippedTest = testLabels.zip(rawTestTF)
    val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) }
    val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label))
    val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count()
    println(rawAccuracy)
    // 0.7661975570897503
    val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
    println(rawMetrics.weightedFMeasure)
    // older value 0.7628947184990661
    // dec 2016 : 0.7653320418573546

    sc.stop()
  }
}
Example 6
Source File: lda-script.scala From practical-data-science-with-hadoop-and-spark with Apache License 2.0
import collection.JavaConversions._
import scala.collection.mutable

import opennlp.tools.tokenize.SimpleTokenizer
import opennlp.tools.stemmer.PorterStemmer

import org.apache.spark.rdd._
import org.apache.spark.mllib.clustering.{OnlineLDAOptimizer, DistributedLDAModel, LDA}
import org.apache.spark.mllib.linalg.{Vector, SparseVector, Vectors}
import org.apache.spark.mllib.feature.IDF

// add openNLP jar to the Spark Context
sc.addJar("opennlp-tools-1.6.0.jar")

// Load documents from text files, 1 element (text string) per file
val corpus = sc.wholeTextFiles("ohsumed/C*", 20).map(x => x._2)

// read stop words from file
val stopwordFile = "stop-words.txt"
val st_words = sc.textFile(stopwordFile).collect()
  .flatMap(_.stripMargin.split("\\s+")).map(_.toLowerCase).toSet
val stopwords = sc.broadcast(st_words)

val minWordLength = 3
val tokenized: RDD[(Long, Array[String])] = corpus.zipWithIndex().map { case (text, id) =>
  val tokenizer = SimpleTokenizer.INSTANCE
  val stemmer = new PorterStemmer()
  val tokens = tokenizer.tokenize(text)
  val words = tokens.filter(w => (w.length >= minWordLength) && (!stopwords.value.contains(w)))
    .map(w => stemmer.stem(w))
  id -> words
}.filter(_._2.length > 0)
tokenized.cache()
val numDocs = tokenized.count()

val wordCounts: RDD[(String, Long)] = tokenized.flatMap { case (_, tokens) =>
  tokens.map(_ -> 1L)
}.reduceByKey(_ + _)
wordCounts.cache()
val fullVocabSize = wordCounts.count()

val vSize = 10000
val (vocab: Map[String, Int], selectedTokenCount: Long) = {
  val sortedWC: Array[(String, Long)] = wordCounts.sortBy(_._2, ascending = false).take(vSize)
  (sortedWC.map(_._1).zipWithIndex.toMap, sortedWC.map(_._2).sum)
}

val documents = tokenized.map { case (id, tokens) =>
  // Filter tokens by vocabulary, and create word count vector representation of document.
  val wc = new mutable.HashMap[Int, Int]()
  tokens.foreach { term =>
    if (vocab.contains(term)) {
      val termIndex = vocab(term)
      wc(termIndex) = wc.getOrElse(termIndex, 0) + 1
    }
  }
  val indices = wc.keys.toArray.sorted
  val values = indices.map(i => wc(i).toDouble)
  val sb = Vectors.sparse(vocab.size, indices, values)
  (id, sb)
}

val vocabArray = new Array[String](vocab.size)
vocab.foreach { case (term, i) => vocabArray(i) = term }

val tf = documents.map { case (id, vec) => vec }.cache()
val idfVals = new IDF().fit(tf).idf.toArray

val tfidfDocs: RDD[(Long, Vector)] = documents.map { case (id, vec) =>
  val indices = vec.asInstanceOf[SparseVector].indices
  val counts = new mutable.HashMap[Int, Double]()
  for (idx <- indices) {
    counts(idx) = vec(idx) * idfVals(idx)
  }
  (id, Vectors.sparse(vocab.size, counts.toSeq))
}

val numTopics = 5
val numIterations = 50
val lda = new LDA().setK(numTopics).setMaxIterations(numIterations).setOptimizer("online")
val ldaModel = lda.run(tfidfDocs)

val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 5)
topicIndices.foreach { case (terms, termWeights) =>
  println("TOPIC:")
  terms.zip(termWeights).foreach { case (term, weight) =>
    println(s"${vocabArray(term.toInt)}\t$weight")
  }
  println()
}
Example 7
Source File: DocumentSegmenter.scala From lexrank-summarizer with MIT License
package io.github.karlhigley.lexrank

import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.SparseVector

import chalk.text.analyze.PorterStemmer
import chalk.text.segment.JavaSentenceSegmenter
import chalk.text.tokenize.SimpleEnglishTokenizer

case class Document(id: String, text: String)
case class Sentence(id: Long, docId: String, text: String)
case class SentenceTokens(id: Long, docId: String, tokens: Seq[String])

class DocumentSegmenter extends Serializable {

  def apply(documents: RDD[Document]) = {
    val sentences = extractSentences(documents)
    val tokenized = tokenize(sentences)
    (sentences, tokenized)
  }

  private def extractSentences(documents: RDD[Document]): RDD[Sentence] = {
    documents
      .flatMap(d => segment(d.text).map(t => (d.id, t)))
      .zipWithIndex()
      .map({
        case ((docId, sentenceText), sentenceId) => Sentence(sentenceId, docId, sentenceText)
      })
  }

  private def tokenize(sentences: RDD[Sentence]): RDD[SentenceTokens] = {
    val tokenizer = SimpleEnglishTokenizer()
    val nonWord = "[^a-z]*".r

    sentences.map(s => {
      val tokens = tokenizer(s.text.toLowerCase).toSeq
        .map(nonWord.replaceAllIn(_, ""))
        .filter(_.length > 3)
        .map(stem)

      SentenceTokens(s.id, s.docId, tokens)
    })
  }

  private def segment(text: String): Seq[String] = {
    JavaSentenceSegmenter(text).toSeq
  }

  private def stem(token: String): String = {
    PorterStemmer(token)
  }
}
Example 8
Source File: Featurizer.scala From lexrank-summarizer with MIT License
package io.github.karlhigley.lexrank

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.{SparseVector, Vector}

case class SentenceFeatures(id: Long, docId: String, features: SparseVector)

class Featurizer(numStopwords: Int = 0) extends Serializable {

  private val hashingTF = new HashingTF()
  private val byIDF = Ordering[Double].on[(Int, Double)](_._2)

  def apply(tokens: RDD[SentenceTokens]): RDD[SentenceFeatures] = {
    val idf = new IDF(minDocFreq = 2)

    val termFrequencies = tokens.map(t => {
      (t.id, t.docId, hashingTF.transform(t.tokens))
    })

    val idfModel = idf.fit(termFrequencies.map({ case (_, _, tf) => tf }))

    val stopwordIndices = identifyStopwords(idfModel.idf.toSparse, numStopwords)

    termFrequencies
      .map({ case (id, docId, tf) =>
        val tfidf = idfModel.transform(tf).toSparse
        val features = removeStopwords(tfidf, stopwordIndices)
        SentenceFeatures(id, docId, features)
      })
      .filter(_.features.indices.size > 0)
  }

  def indexOf(token: String): Int = {
    hashingTF.indexOf(token)
  }

  private def identifyStopwords(idf: SparseVector, numStopwords: Int) = {
    featureTuples(idf).sorted(byIDF).take(numStopwords).map(_._1)
  }

  private def removeStopwords(tf: SparseVector, stopwordIndices: Array[Int]) = {
    val (indices, values) = featureTuples(tf)
      .filter(p => !stopwordIndices.contains(p._1))
      .unzip
    new SparseVector(tf.size, indices.toArray, values.toArray)
  }

  private def featureTuples(featureVector: SparseVector) = {
    featureVector.indices.zip(featureVector.values)
  }
}