org.apache.spark.mllib.feature.HashingTF Scala Examples
The following examples show how to use org.apache.spark.mllib.feature.HashingTF.
Each example is shown with its source file name, originating project, and license.
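As a quick orientation before the full examples, the sketch below shows the minimal HashingTF workflow: hash tokenized documents into fixed-length term-frequency vectors, then rescale them with IDF. The toy documents, application name, and feature count are made up for illustration; the HashingTF, IDF, and SparkContext calls are the standard MLlib APIs used throughout the examples that follow.

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object HashingTFSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("HashingTFSketch").setMaster("local[*]"))

    // Each document is a sequence of tokens; the documents here are made up for illustration.
    val documents: RDD[Seq[String]] = sc.parallelize(Seq(
      "spark is a fast engine".split(" ").toSeq,
      "hashing tf maps tokens to indices".split(" ").toSeq
    ))

    // Hash each token to a column index and count term frequencies per document.
    val hashingTF = new HashingTF(1 << 18)
    val tf: RDD[Vector] = hashingTF.transform(documents)
    tf.cache()

    // Fit IDF on the TF vectors, then rescale them to TF-IDF.
    val idf = new IDF().fit(tf)
    val tfidf: RDD[Vector] = idf.transform(tf)

    tfidf.collect().foreach(println)
    sc.stop()
  }
}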
Example 1
Source File: TFIDF.scala From AI with Apache License 2.0
package com.bigchange.mllib

import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.{SparseVector => SV}
import org.apache.spark.{SparkConf, SparkContext}

import scala.io.Source

object TFIDF {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("TfIdfTest")
      .setMaster("local")
    val sc = new SparkContext(conf)

    // Load documents (one per line); zipWithIndex assigns each line's index as its doc id
    val documents = sc.parallelize(Source.fromFile("J:\\github\\dataSet\\TFIDF-DOC").getLines()
      .filter(_.trim.length > 0).toSeq)
      .map(_.split(" ").toSeq)
      .zipWithIndex()

    // feature number
    val hashingTF = new HashingTF(Math.pow(2, 18).toInt)

    // use the line number as the doc id and build a TF vector from each line's tokens
    val idAndTFVector = documents.map { case (seq, num) =>
      val tf = hashingTF.transform(seq)
      (num + 1, tf)
    }
    idAndTFVector.cache()

    // build IDF model
    val idf = new IDF().fit(idAndTFVector.values)

    // transform TF vectors to TF-IDF vectors
    val idAndTFIDFVector = idAndTFVector.mapValues(v => idf.transform(v))

    // broadcast TF-IDF vectors
    val idAndTFIDFVectorBroadCast = sc.broadcast(idAndTFIDFVector.collect())

    // compute pairwise document cosine similarity
    val docSims = idAndTFIDFVector.flatMap { case (id1, idf1) =>
      // filter out the same doc id
      val idfs = idAndTFIDFVectorBroadCast.value.filter(_._1 != id1)
      val sv1 = idf1.asInstanceOf[SV]
      import breeze.linalg._
      val bsv1 = new SparseVector[Double](sv1.indices, sv1.values, sv1.size)
      idfs.map { case (id2, idf2) =>
        val sv2 = idf2.asInstanceOf[SV]
        val bsv2 = new SparseVector[Double](sv2.indices, sv2.values, sv2.size)
        val cosSim = bsv1.dot(bsv2) / (norm(bsv1) * norm(bsv2))
        (id1, id2, cosSim)
      }
    }

    docSims.foreach(println)

    sc.stop()
  }
}
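A note on the Breeze conversion in Example 1: the same cosine similarity can be computed directly from the index/value arrays of the MLlib SparseVectors. The helper below is a hypothetical sketch of that alternative (CosineSimilaritySketch and cosineSimilarity are illustrative names, not part of the original example):

import org.apache.spark.mllib.linalg.{SparseVector => SV}

object CosineSimilaritySketch {
  // Hypothetical helper: cosine similarity of two MLlib sparse vectors,
  // computed without converting to Breeze.
  def cosineSimilarity(a: SV, b: SV): Double = {
    val bValuesByIndex = b.indices.zip(b.values).toMap
    // dot product over the indices present in vector a
    val dot = a.indices.zip(a.values).map { case (i, v) => v * bValuesByIndex.getOrElse(i, 0.0) }.sum
    val normA = math.sqrt(a.values.map(x => x * x).sum)
    val normB = math.sqrt(b.values.map(x => x * x).sum)
    if (normA == 0.0 || normB == 0.0) 0.0 else dot / (normA * normB)
  }
}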
Example 2
Source File: Util.scala From spark-twitter-sentiment with Apache License 2.0
package com.dhruv

import org.apache.commons.cli.{Options, ParseException, PosixParser}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.feature.HashingTF
import twitter4j.auth.OAuthAuthorization
import twitter4j.conf.ConfigurationBuilder

object Utils {
  val numFeatures = 1000
  val tf = new HashingTF(numFeatures)

  val CONSUMER_KEY = "consumerKey"
  val CONSUMER_SECRET = "consumerSecret"
  val ACCESS_TOKEN = "accessToken"
  val ACCESS_TOKEN_SECRET = "accessTokenSecret"

  val THE_OPTIONS = {
    val options = new Options()
    options.addOption(CONSUMER_KEY, true, "Twitter OAuth Consumer Key")
    options.addOption(CONSUMER_SECRET, true, "Twitter OAuth Consumer Secret")
    options.addOption(ACCESS_TOKEN, true, "Twitter OAuth Access Token")
    options.addOption(ACCESS_TOKEN_SECRET, true, "Twitter OAuth Access Token Secret")
    options
  }

  def parseCommandLineWithTwitterCredentials(args: Array[String]) = {
    val parser = new PosixParser
    try {
      val cl = parser.parse(THE_OPTIONS, args)
      System.setProperty("twitter4j.oauth.consumerKey", cl.getOptionValue(CONSUMER_KEY))
      System.setProperty("twitter4j.oauth.consumerSecret", cl.getOptionValue(CONSUMER_SECRET))
      System.setProperty("twitter4j.oauth.accessToken", cl.getOptionValue(ACCESS_TOKEN))
      System.setProperty("twitter4j.oauth.accessTokenSecret", cl.getOptionValue(ACCESS_TOKEN_SECRET))
      cl.getArgList.toArray
    } catch {
      case e: ParseException =>
        System.err.println("Parsing failed. Reason: " + e.getMessage)
        System.exit(1)
    }
  }

  def getAuth = {
    Some(new OAuthAuthorization(new ConfigurationBuilder().build()))
  }

  def featurize(s: String): Vector = {
    tf.transform(s.sliding(2).toSeq)
  }

  object IntParam {
    def unapply(str: String): Option[Int] = {
      try {
        Some(str.toInt)
      } catch {
        case e: NumberFormatException => None
      }
    }
  }
}
Example 3
Source File: TfIdfSample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.featureext

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.feature.IDF
import org.sparksamples.Util

object TfIdfSample {
  def main(args: Array[String]) {
    //TODO replace with path specific to your machine
    val file = Util.SPARK_HOME + "/README.md"
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)
    val documents: RDD[Seq[String]] = sc.textFile(file).map(_.split(" ").toSeq)
    print("Documents Size:" + documents.count)
    val hashingTF = new HashingTF()
    val tf = hashingTF.transform(documents)
    for (tf_ <- tf) {
      println(s"$tf_")
    }
    tf.cache()
    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)
    println("tfidf size : " + tfidf.count)
    for (tfidf_ <- tfidf) {
      println(s"$tfidf_")
    }
  }
}
Example 4
Source File: TfIdfSample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.featureext

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.feature.IDF

object TfIdfSample {
  def main(args: Array[String]) {
    //TODO replace with path specific to your machine
    val file = "/home/ubuntu/work/spark-1.6.0-bin-hadoop2.6//README.md"
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)
    val documents: RDD[Seq[String]] = sc.textFile(file).map(_.split(" ").toSeq)
    print("Documents Size:" + documents.count)
    val hashingTF = new HashingTF()
    val tf = hashingTF.transform(documents)
    for (tf_ <- tf) {
      println(s"$tf_")
    }
    tf.cache()
    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)
    println("tfidf size : " + tfidf.count)
    for (tfidf_ <- tfidf) {
      println(s"$tfidf_")
    }
  }
}
Example 5
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{SparseVector => SV}

object DocumentClassification {

  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "First Spark App")

    val path = "../data/20news-bydate-train/*"
    val rdd = sc.wholeTextFiles(path)
    val text = rdd.map { case (file, text) => text }
    val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head }
    val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap
    val dim = math.pow(2, 18).toInt
    val hashingTF = new HashingTF(dim)

    var tokens = text.map(doc => TFIDFExtraction.tokenize(doc))
    val tf = hashingTF.transform(tokens)
    tf.cache
    val v = tf.first.asInstanceOf[SV]

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)

    val zipped = newsgroups.zip(tfidf)
    val train = zipped.map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) }
    train.cache

    val model = NaiveBayes.train(train, lambda = 0.1)

    val testPath = "../data/20news-bydate-test/*"
    val testRDD = sc.wholeTextFiles(testPath)
    val testLabels = testRDD.map { case (file, text) =>
      val topic = file.split("/").takeRight(2).head
      newsgroupsMap(topic)
    }
    val testTf = testRDD.map { case (file, text) =>
      hashingTF.transform(TFIDFExtraction.tokenize(text))
    }
    val testTfIdf = idf.transform(testTf)
    val zippedTest = testLabels.zip(testTfIdf)
    val test = zippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) }

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
    println(accuracy)
    // Updated Dec 2016 by Rajdeep
    // 0.7928836962294211
    val metrics = new MulticlassMetrics(predictionAndLabel)
    println(metrics.weightedFMeasure)
    // 0.7822644376431702

    val rawTokens = rdd.map { case (file, text) => text.split(" ") }
    val rawTF = rawTokens.map(doc => hashingTF.transform(doc))
    val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) }
    val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)

    val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) }
    val rawZippedTest = testLabels.zip(rawTestTF)
    val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) }
    val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label))
    val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count()
    println(rawAccuracy)
    // 0.7661975570897503
    val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
    println(rawMetrics.weightedFMeasure)
    // older value 0.7628947184990661
    // dec 2016 : 0.7653320418573546

    sc.stop()
  }
}
Example 6
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{SparseVector => SV}
import org.apache.spark.mllib.util.MLUtils
//import org.apache.spark.ml.feature.HashingTF
//import org.apache.spark.ml.feature.IDF

object DocumentClassification {

  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "First Spark App")

    val path = "../data/20news-bydate-train/*"
    val rdd = sc.wholeTextFiles(path)
    val text = rdd.map { case (file, text) => text }
    val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head }
    val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap
    val dim = math.pow(2, 18).toInt
    val hashingTF = new HashingTF(dim)

    var tokens = text.map(doc => TFIDFExtraction.tokenize(doc))
    val tf = hashingTF.transform(tokens)
    tf.cache
    val v = tf.first.asInstanceOf[SV]

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)

    val zipped = newsgroups.zip(tfidf)
    println(zipped.first())
    val train = zipped.map {
      case (topic, vector) => {
        LabeledPoint(newsgroupsMap(topic), vector)
      }
    }

    //TODO uncomment to generate libsvm format
    MLUtils.saveAsLibSVMFile(train, "./output/20news-by-date-train-libsvm")

    train.cache
    val model = NaiveBayes.train(train, lambda = 0.1)

    val testPath = "../data/20news-bydate-test/*"
    val testRDD = sc.wholeTextFiles(testPath)
    val testLabels = testRDD.map { case (file, text) =>
      val topic = file.split("/").takeRight(2).head
      newsgroupsMap(topic)
    }
    val testTf = testRDD.map { case (file, text) =>
      hashingTF.transform(TFIDFExtraction.tokenize(text))
    }
    val testTfIdf = idf.transform(testTf)
    val zippedTest = testLabels.zip(testTfIdf)
    val test = zippedTest.map {
      case (topic, vector) => {
        println(topic)
        println(vector)
        LabeledPoint(topic, vector)
      }
    }

    //TODO uncomment to generate libsvm format
    MLUtils.saveAsLibSVMFile(test, "./output/20news-by-date-test-libsvm")

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
    println(accuracy)
    // Updated Dec 2016 by Rajdeep
    // 0.7928836962294211
    val metrics = new MulticlassMetrics(predictionAndLabel)
    println(metrics.accuracy)
    println(metrics.weightedFalsePositiveRate)
    println(metrics.weightedPrecision)
    println(metrics.weightedFMeasure)
    println(metrics.weightedRecall)
    // 0.7822644376431702

    val rawTokens = rdd.map { case (file, text) => text.split(" ") }
    val rawTF = rawTokens.map(doc => hashingTF.transform(doc))
    val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) }
    val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)

    val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) }
    val rawZippedTest = testLabels.zip(rawTestTF)
    val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) }
    val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label))
    val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count()
    println(rawAccuracy)
    // 0.7661975570897503
    val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
    println(rawMetrics.weightedFMeasure)
    // older value 0.7628947184990661
    // dec 2016 : 0.7653320418573546

    sc.stop()
  }
}
Example 7
Source File: Classifier.scala From CSYE7200_Old with MIT License
package edu.neu.coe.csye7200.spam

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object Classifier extends App {

  val conf = new SparkConf().setAppName("spam").setMaster("local[*]")
  val sc = new SparkContext(conf)

  val spam = sc.textFile("spark-app//input//test//spam.txt")
  val norm = sc.textFile("spark-app//input//test//normal.txt")

  val tf = new HashingTF(10000)
  val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
  val normFeatures = norm.map(email => tf.transform(email.split(" ")))

  val posExamples = spamFeatures.map(f => LabeledPoint(1, f))
  val negExamples = normFeatures.map(f => LabeledPoint(0, f))
  val trainingData = posExamples.union(negExamples)
  trainingData.cache()

  val model = new LogisticRegressionWithSGD().run(trainingData)

  val posTest = tf.transform("Subject: Cheap Stuff From: <omg.fu> O M G GET cheap stuff by sending money to Robin Hillyard".split(" "))
  val negTest = tf.transform("Subject: Spark From: Robin Hillyard<[email protected]> Hi Adam, I started studying Spark the other day".split(" "))

  println(s"Prediction for positive test example: ${model.predict(posTest)}")
  println(s"Prediction for negative test example: ${model.predict(negTest)}")
}
Example 8
Source File: MllibHelper.scala From twitter-stream-ml with GNU General Public License v3.0
package com.giorgioinf.twtml.spark

import java.text.Normalizer
import org.apache.spark.Logging
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import scala.math.BigDecimal
import twitter4j.Status

object MllibHelper extends Logging {

  val numNumberFeatures = 4

  var numRetweetBegin = 100
  var numRetweetEnd = 1000
  var numTextFeatures = 1000
  var hashText = new HashingTF(numTextFeatures)
  var numFeatures = numTextFeatures + numNumberFeatures
  var numberFeatureIndices = (numTextFeatures to numFeatures - 1).toArray

  def reset(conf: ConfArguments) {
    numRetweetBegin = conf.numRetweetBegin
    numRetweetEnd = conf.numRetweetEnd
    numTextFeatures = conf.numTextFeatures
    // reassign the object-level fields (declaring fresh local vars here would only shadow them)
    hashText = new HashingTF(numTextFeatures)
    numFeatures = numTextFeatures + numNumberFeatures
    numberFeatureIndices = (numTextFeatures to numFeatures - 1).toArray
    log.debug(s"retweet range: ($numRetweetBegin - $numRetweetEnd), numTextFeatures: $numTextFeatures")
  }

  def featurizeText(statuses: Status): SparseVector = {
    val text = statuses.getRetweetedStatus
      .getText
      .toLowerCase

    // Separate accents from characters and then remove non-unicode
    // characters
    val noAccentText = Normalizer
      .normalize(text, Normalizer.Form.NFD)
      .replaceAll("\\p{M}", "")

    // bigrams: hash character bigrams of the normalized text
    hashText.transform(noAccentText.sliding(2).toSeq)
      .asInstanceOf[SparseVector]
  }

  def featurizeNumbers(statuses: Status): Vector = {
    val user = statuses.getRetweetedStatus.getUser
    val created = statuses.getRetweetedStatus.getCreatedAt
    val timeLeft = (System.currentTimeMillis - created.getTime)

    Vectors.dense(
      user.getFollowersCount * Math.pow(10, -12),
      user.getFavouritesCount * Math.pow(10, -12),
      user.getFriendsCount * Math.pow(10, -12),
      timeLeft * Math.pow(10, -14)
      //retweeted.getURLEntities.length,
      //retweeted.getUserMentionEntities.length
    )
  }

  def featurize(statuses: Status): LabeledPoint = {
    val textFeatures = featurizeText(statuses)
    val numberFeatures = featurizeNumbers(statuses)
    val features = Vectors.sparse(
      numFeatures,
      textFeatures.indices ++ numberFeatureIndices,
      textFeatures.values ++ numberFeatures.toArray
    )
    LabeledPoint(statuses.getRetweetedStatus.getRetweetCount.toDouble, features)
  }

  def retweetInterval(statuses: Status, start: Long, end: Long): Boolean = {
    val n = statuses.getRetweetedStatus.getRetweetCount
    (n >= start && n <= end)
  }

  def filtrate(statuses: Status): Boolean = {
    (
      statuses.isRetweet &&
      //statuses.getLang == "en" &&
      retweetInterval(statuses, numRetweetBegin, numRetweetEnd)
    )
  }
}
Example 9
Source File: Utils.scala From awesome-recommendation-engine with Apache License 2.0
package com.databricks.apps.twitter_classifier

import org.apache.commons.cli.{Options, ParseException, PosixParser}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.feature.HashingTF
import twitter4j.auth.OAuthAuthorization
import twitter4j.conf.ConfigurationBuilder

object Utils {
  val numFeatures = 1000
  val tf = new HashingTF(numFeatures)

  val CONSUMER_KEY = "consumerKey"
  val CONSUMER_SECRET = "consumerSecret"
  val ACCESS_TOKEN = "accessToken"
  val ACCESS_TOKEN_SECRET = "accessTokenSecret"

  val THE_OPTIONS = {
    val options = new Options()
    options.addOption(CONSUMER_KEY, true, "Twitter OAuth Consumer Key")
    options.addOption(CONSUMER_SECRET, true, "Twitter OAuth Consumer Secret")
    options.addOption(ACCESS_TOKEN, true, "Twitter OAuth Access Token")
    options.addOption(ACCESS_TOKEN_SECRET, true, "Twitter OAuth Access Token Secret")
    options
  }

  def parseCommandLineWithTwitterCredentials(args: Array[String]) = {
    val parser = new PosixParser
    try {
      val cl = parser.parse(THE_OPTIONS, args)
      //System.setProperty("twitter4j.oauth.consumerKey", cl.getOptionValue(CONSUMER_KEY))
      //System.setProperty("twitter4j.oauth.consumerSecret", cl.getOptionValue(CONSUMER_SECRET))
      //System.setProperty("twitter4j.oauth.accessToken", cl.getOptionValue(ACCESS_TOKEN))
      //System.setProperty("twitter4j.oauth.accessTokenSecret", cl.getOptionValue(ACCESS_TOKEN_SECRET))
      System.setProperty("twitter4j.oauth.consumerKey", "jREUiik4pE9bKcBUYr5xsV7jt")
      System.setProperty("twitter4j.oauth.consumerSecret", "LIUbDpJzgoJ8gz3w3OgQFGcMnMLyjPi9S3uBmtEdaLGzUBqkM9")
      System.setProperty("twitter4j.oauth.accessToken", "453844423-3P6XqQ8hXWY1K47gEL1LU9lRg9kcrzfEXDvVTMZM")
      System.setProperty("twitter4j.oauth.accessTokenSecret", "vrDBfnE1ya425mYIjM80OH8HmyYOQ3RUotk3t8gdFy6Yy")
      cl.getArgList.toArray
    } catch {
      case e: ParseException =>
        System.err.println("Parsing failed. Reason: " + e.getMessage)
        System.exit(1)
    }
  }

  def getAuth = {
    Some(new OAuthAuthorization(new ConfigurationBuilder().build()))
  }

  def featurize(s: String): Vector = {
    tf.transform(s.sliding(2).toSeq)
  }

  object IntParam {
    def unapply(str: String): Option[Int] = {
      try {
        Some(str.toInt)
      } catch {
        case e: NumberFormatException => None
      }
    }
  }
}
Example 10
Source File: DocumentSegmenter.scala From lexrank-summarizer with MIT License
package io.github.karlhigley.lexrank

import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.SparseVector

import chalk.text.analyze.PorterStemmer
import chalk.text.segment.JavaSentenceSegmenter
import chalk.text.tokenize.SimpleEnglishTokenizer

case class Document(id: String, text: String)
case class Sentence(id: Long, docId: String, text: String)
case class SentenceTokens(id: Long, docId: String, tokens: Seq[String])

class DocumentSegmenter extends Serializable {

  def apply(documents: RDD[Document]) = {
    val sentences = extractSentences(documents)
    val tokenized = tokenize(sentences)
    (sentences, tokenized)
  }

  private def extractSentences(documents: RDD[Document]): RDD[Sentence] = {
    documents
      .flatMap(d => segment(d.text).map(t => (d.id, t)))
      .zipWithIndex()
      .map({
        case ((docId, sentenceText), sentenceId) => Sentence(sentenceId, docId, sentenceText)
      })
  }

  private def tokenize(sentences: RDD[Sentence]): RDD[SentenceTokens] = {
    val tokenizer = SimpleEnglishTokenizer()
    val nonWord = "[^a-z]*".r

    sentences.map(s => {
      val tokens = tokenizer(s.text.toLowerCase).toSeq
        .map(nonWord.replaceAllIn(_, ""))
        .filter(_.length > 3)
        .map(stem)

      SentenceTokens(s.id, s.docId, tokens)
    })
  }

  private def segment(text: String): Seq[String] = {
    JavaSentenceSegmenter(text).toSeq
  }

  private def stem(token: String): String = {
    PorterStemmer(token)
  }
}
Example 11
Source File: Featurizer.scala From lexrank-summarizer with MIT License
package io.github.karlhigley.lexrank

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.{SparseVector, Vector}

case class SentenceFeatures(id: Long, docId: String, features: SparseVector)

class Featurizer(numStopwords: Int = 0) extends Serializable {

  private val hashingTF = new HashingTF()
  private val byIDF = Ordering[Double].on[(Int, Double)](_._2)

  def apply(tokens: RDD[SentenceTokens]): RDD[SentenceFeatures] = {
    val idf = new IDF(minDocFreq = 2)

    val termFrequencies = tokens.map(t => {
      (t.id, t.docId, hashingTF.transform(t.tokens))
    })

    val idfModel = idf.fit(termFrequencies.map({ case (_, _, tf) => tf }))

    val stopwordIndices = identifyStopwords(idfModel.idf.toSparse, numStopwords)

    termFrequencies
      .map({
        case (id, docId, tf) =>
          val tfidf = idfModel.transform(tf).toSparse
          val features = removeStopwords(tfidf, stopwordIndices)
          SentenceFeatures(id, docId, features)
      })
      .filter(_.features.indices.size > 0)
  }

  def indexOf(token: String): Int = {
    hashingTF.indexOf(token)
  }

  private def identifyStopwords(idf: SparseVector, numStopwords: Int) = {
    featureTuples(idf).sorted(byIDF).take(numStopwords).map(_._1)
  }

  private def removeStopwords(tf: SparseVector, stopwordIndices: Array[Int]) = {
    val (indices, values) =
      featureTuples(tf)
        .filter(p => !stopwordIndices.contains(p._1))
        .unzip
    new SparseVector(tf.size, indices.toArray, values.toArray)
  }

  private def featureTuples(featureVector: SparseVector) = {
    featureVector.indices.zip(featureVector.values)
  }
}
Example 12
Source File: Classifier.scala From Scalaprof with GNU General Public License v2.0
package edu.neu.coe.scala.spark.spam

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object Classifier extends App {

  val conf = new SparkConf().setAppName("spam")
  val sc = new SparkContext(conf)

  val spam = sc.textFile("spam.txt")
  val norm = sc.textFile("normal.txt")

  val tf = new HashingTF(10000)
  val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
  val normFeatures = norm.map(email => tf.transform(email.split(" ")))

  val posExamples = spamFeatures.map(f => LabeledPoint(1, f))
  val negExamples = normFeatures.map(f => LabeledPoint(0, f))
  val trainingData = posExamples.union(negExamples)
  trainingData.cache()

  val model = new LogisticRegressionWithSGD().run(trainingData)

  val posTest = tf.transform("Subject: Cheap Stuff From: <omg.fu> O M G GET cheap stuff by sending money to Robin Hillyard".split(" "))
  val negTest = tf.transform("Subject: Spark From: Robin Hillyard<[email protected]> Hi Adam, I started studying Spark the other day".split(" "))

  println(s"Prediction for positive test example: ${model.predict(posTest)}")
  println(s"Prediction for negative test example: ${model.predict(negTest)}")
}