org.apache.spark.mllib.classification.NaiveBayes Scala Examples
The following examples show how to use org.apache.spark.mllib.classification.NaiveBayes.
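Before the individual examples, here is a minimal end-to-end sketch of the API they all share: load LabeledPoint data, train with NaiveBayes.train (or new NaiveBayes().run), and score a held-out split. The local master setting and sample data path are placeholder assumptions for this sketch, not taken from any example below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

object NaiveBayesQuickStart {
  def main(args: Array[String]): Unit = {
    // Placeholder assumptions: local master, Spark's bundled sample data set.
    val conf = new SparkConf().setAppName("NaiveBayesQuickStart").setMaster("local[*]")
    val sc = new SparkContext(conf)

    // LIBSVM format: label index1:value1 index2:value2 ...
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    val Array(training, test) = data.randomSplit(Array(0.8, 0.2), seed = 42L)

    // lambda is the additive (Laplace) smoothing parameter.
    val model = NaiveBayes.train(training, lambda = 1.0)

    val accuracy = test.map(p => (model.predict(p.features), p.label))
      .filter { case (pred, label) => pred == label }
      .count().toDouble / test.count()
    println(s"Test accuracy = $accuracy")

    sc.stop()
  }
}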
Example 1
Source File: SparseNaiveBayes.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()
    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println
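In all of the SparseNaiveBayes listings, lambda is the additive (Laplace) smoothing constant. For the multinomial model it enters the per-class feature log-probability estimates as

$$\theta_{ci} = \log\frac{N_{ci} + \lambda}{N_c + \lambda n}$$

where $N_{ci}$ is the total count of feature $i$ in class $c$, $N_c = \sum_i N_{ci}$, and $n$ is the number of features. The default $\lambda = 1.0$ keeps features never seen in a class from getting zero probability.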
Example 2
Source File: SparseNaiveBayes.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()
    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println
Example 3
Source File: NaiveBayesExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

// $example on$
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
// $example off$
import org.apache.spark.{SparkConf, SparkContext}

object NaiveBayesExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("NaiveBayesExample")
    val sc = new SparkContext(conf)
    // $example on$
    val data = sc.textFile("data/mllib/sample_naive_bayes_data.txt")
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }

    // Split data into training (60%) and test (40%).
    val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0)
    val test = splits(1)

    val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()

    // Save and load model
    model.save(sc, "target/tmp/myNaiveBayesModel")
    val sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")
    // $example off$
  }
}
// scalastyle:on println
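The modelType argument of NaiveBayes.train accepts "multinomial" (the default) or "bernoulli"; the Bernoulli variant models presence/absence of each feature and requires strictly 0/1 feature values. A one-line variant of the call above, assuming the training data has been binarized accordingly:

// Bernoulli naive Bayes: feature vectors must contain only 0s and 1s.
val bernoulliModel = NaiveBayes.train(training, lambda = 1.0, modelType = "bernoulli")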
Example 4
Source File: SparseNaiveBayes.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()
    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println
Example 5
Source File: NaiveBayesExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.util.MLUtils
// $example off$

object NaiveBayesExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("NaiveBayesExample")
    val sc = new SparkContext(conf)
    // $example on$
    // Load and parse the data file.
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

    // Split data into training (60%) and test (40%).
    val Array(training, test) = data.randomSplit(Array(0.6, 0.4))

    val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()

    // Save and load model
    model.save(sc, "target/tmp/myNaiveBayesModel")
    val sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 6
Source File: SparseNaiveBayes.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    // The option parser was lost in this excerpt; it is identical to the one
    // in the other SparseNaiveBayes listings on this page.
    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because they will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()
    // numTraining = 81, numTest = 19.
    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    // Test accuracy = 1.0.
    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println
Example 7
Source File: SparseNaiveBayes.scala From iolap with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()
    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
Example 8
Source File: SparseNaiveBayes.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()
    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println
Example 9
Source File: NaiveBayesExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.util.MLUtils
// $example off$

object NaiveBayesExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("NaiveBayesExample")
    val sc = new SparkContext(conf)
    // $example on$
    // Load and parse the data file.
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

    // Split data into training (60%) and test (40%).
    val Array(training, test) = data.randomSplit(Array(0.6, 0.4))

    val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()

    // Save and load model
    model.save(sc, "target/tmp/myNaiveBayesModel")
    val sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")
    // $example off$
  }
}
// scalastyle:on println
Example 10
Source File: NaiveBayesExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.util.MLUtils
// $example off$

object NaiveBayesExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("NaiveBayesExample")
    val sc = new SparkContext(conf)
    // $example on$
    // Load and parse the data file.
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

    // Split data into training (60%) and test (40%).
    val Array(training, test) = data.randomSplit(Array(0.6, 0.4))

    val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()

    // Save and load model
    model.save(sc, "target/tmp/myNaiveBayesModel")
    val sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")
    // $example off$
  }
}
// scalastyle:on println
Example 11
Source File: NaiveBayesExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.util.MLUtils
// $example off$

object NaiveBayesExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("NaiveBayesExample")
    val sc = new SparkContext(conf)
    // $example on$
    // Load and parse the data file.
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

    // Split data into training (60%) and test (40%).
    val Array(training, test) = data.randomSplit(Array(0.6, 0.4))

    val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()

    // Save and load model
    model.save(sc, "target/tmp/myNaiveBayesModel")
    val sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")
    // $example off$
  }
}
// scalastyle:on println
Example 12
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{SparseVector => SV}
import org.apache.spark.mllib.util.MLUtils
//import org.apache.spark.ml.feature.HashingTF
//import org.apache.spark.ml.feature.IDF

object DocumentClassification {

  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "First Spark App")

    val path = "../data/20news-bydate-train/*"
    val rdd = sc.wholeTextFiles(path)
    val text = rdd.map { case (file, text) => text }
    // The newsgroup (class label) is the parent directory of each file.
    val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head }
    val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap

    // Hash tokens into a 2^18-dimensional term-frequency vector.
    val dim = math.pow(2, 18).toInt
    val hashingTF = new HashingTF(dim)
    val tokens = text.map(doc => TFIDFExtraction.tokenize(doc))
    val tf = hashingTF.transform(tokens)
    tf.cache
    val v = tf.first.asInstanceOf[SV]

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)

    val zipped = newsgroups.zip(tfidf)
    println(zipped.first())
    val train = zipped.map { case (topic, vector) =>
      LabeledPoint(newsgroupsMap(topic), vector)
    }
    // TODO uncomment to generate libsvm format
    MLUtils.saveAsLibSVMFile(train, "./output/20news-by-date-train-libsvm")
    train.cache

    val model = NaiveBayes.train(train, lambda = 0.1)

    val testPath = "../data/20news-bydate-test/*"
    val testRDD = sc.wholeTextFiles(testPath)
    val testLabels = testRDD.map { case (file, text) =>
      val topic = file.split("/").takeRight(2).head
      newsgroupsMap(topic)
    }
    // Reuse the IDF model fitted on the training set to weight the test term frequencies.
    val testTf = testRDD.map { case (file, text) =>
      hashingTF.transform(TFIDFExtraction.tokenize(text))
    }
    val testTfIdf = idf.transform(testTf)
    val zippedTest = testLabels.zip(testTfIdf)
    val test = zippedTest.map { case (topic, vector) =>
      println(topic)
      println(vector)
      LabeledPoint(topic, vector)
    }
    // TODO uncomment to generate libsvm format
    MLUtils.saveAsLibSVMFile(test, "./output/20news-by-date-test-libsvm")

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
    println(accuracy)
    // Updated Dec 2016 by Rajdeep
    // 0.7928836962294211
    val metrics = new MulticlassMetrics(predictionAndLabel)
    println(metrics.accuracy)
    println(metrics.weightedFalsePositiveRate)
    println(metrics.weightedPrecision)
    println(metrics.weightedFMeasure)
    println(metrics.weightedRecall)
    // 0.7822644376431702

    // Baseline: raw whitespace tokenization with plain term frequencies (no TF-IDF).
    val rawTokens = rdd.map { case (file, text) => text.split(" ") }
    val rawTF = rawTokens.map(doc => hashingTF.transform(doc))
    val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) =>
      LabeledPoint(newsgroupsMap(topic), vector)
    }
    val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)

    val rawTestTF = testRDD.map { case (file, text) =>
      hashingTF.transform(text.split(" "))
    }
    val rawZippedTest = testLabels.zip(rawTestTF)
    val rawTest = rawZippedTest.map { case (topic, vector) =>
      LabeledPoint(topic, vector)
    }
    val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label))
    val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count()
    println(rawAccuracy)
    // 0.7661975570897503
    val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
    println(rawMetrics.weightedFMeasure)
    // older value 0.7628947184990661
    // dec 2016 : 0.7653320418573546
    sc.stop()
  }
}
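Both DocumentClassification listings (this one and Example 13) call TFIDFExtraction.tokenize, which is defined in a companion file of the book's code that is not shown here. A minimal stand-in consistent with how it is used (String => Seq[String]) might look like the following; the exact filtering rules are an assumption, not the book's implementation.

// Hypothetical stand-in for the book's TFIDFExtraction.tokenize.
// Assumptions: lowercase, split on non-word characters, keep purely
// alphabetic tokens longer than two characters.
object TFIDFExtraction {
  def tokenize(doc: String): Seq[String] = {
    doc.toLowerCase
      .split("""\W+""")
      .filter(token => token.nonEmpty && token.forall(_.isLetter) && token.length > 2)
      .toSeq
  }
}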
Example 13
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{SparseVector => SV}

object DocumentClassification {

  def main(args: Array[String]) {
    val sc = new SparkContext("local[2]", "First Spark App")

    val path = "../data/20news-bydate-train/*"
    val rdd = sc.wholeTextFiles(path)
    val text = rdd.map { case (file, text) => text }
    val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head }
    val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap

    val dim = math.pow(2, 18).toInt
    val hashingTF = new HashingTF(dim)
    val tokens = text.map(doc => TFIDFExtraction.tokenize(doc))
    val tf = hashingTF.transform(tokens)
    tf.cache
    val v = tf.first.asInstanceOf[SV]

    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)

    val zipped = newsgroups.zip(tfidf)
    val train = zipped.map { case (topic, vector) =>
      LabeledPoint(newsgroupsMap(topic), vector)
    }
    train.cache

    val model = NaiveBayes.train(train, lambda = 0.1)

    val testPath = "../data/20news-bydate-test/*"
    val testRDD = sc.wholeTextFiles(testPath)
    val testLabels = testRDD.map { case (file, text) =>
      val topic = file.split("/").takeRight(2).head
      newsgroupsMap(topic)
    }
    val testTf = testRDD.map { case (file, text) =>
      hashingTF.transform(TFIDFExtraction.tokenize(text))
    }
    val testTfIdf = idf.transform(testTf)
    val zippedTest = testLabels.zip(testTfIdf)
    val test = zippedTest.map { case (topic, vector) =>
      LabeledPoint(topic, vector)
    }

    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
    println(accuracy)
    // Updated Dec 2016 by Rajdeep
    // 0.7928836962294211
    val metrics = new MulticlassMetrics(predictionAndLabel)
    println(metrics.weightedFMeasure)
    // 0.7822644376431702

    val rawTokens = rdd.map { case (file, text) => text.split(" ") }
    val rawTF = rawTokens.map(doc => hashingTF.transform(doc))
    val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) =>
      LabeledPoint(newsgroupsMap(topic), vector)
    }
    val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)

    val rawTestTF = testRDD.map { case (file, text) =>
      hashingTF.transform(text.split(" "))
    }
    val rawZippedTest = testLabels.zip(rawTestTF)
    val rawTest = rawZippedTest.map { case (topic, vector) =>
      LabeledPoint(topic, vector)
    }
    val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label))
    val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count()
    println(rawAccuracy)
    // 0.7661975570897503
    val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
    println(rawMetrics.weightedFMeasure)
    // older value 0.7628947184990661
    // dec 2016 : 0.7653320418573546
    sc.stop()
  }
}
Example 14
Source File: PipeClassificationSvm.scala From sddf with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.classification

import scala.beans.BeanInfo

// NaiveBayes is imported but unused here; this file trains an SVM and appears
// on this page only because of the import.
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

class PipeClassificationSvm(numIterations: Int = 100) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("numIterations", numIterations))

  def trainModelAndClassify(
      trainingData: RDD[LabeledPoint],
      symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {
    val model = SVMWithSGD.train(trainingData, numIterations)
    log.debug("Classification Model:" + model)

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}

object PipeClassificationSvm {
  def apply(numIterations: Int = 100) = {
    new PipeClassificationSvm(numIterations)
  }
}
Example 15
Source File: PipeClassificationNaiveBayes.scala From sddf with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.classification

import scala.beans.BeanInfo

import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.classification.NaiveBayesModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

class PipeClassificationNaiveBayes(lambda: Double = 1.0) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("lambda", lambda))

  def trainModelAndClassify(
      trainingData: RDD[LabeledPoint],
      symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {
    val model = NaiveBayes.train(trainingData, lambda)

    log.debug("Classification Model:" + model)
    log.debug("Classification Model labels: " + model.labels.mkString(" "))
    log.debug("Classification Model pi: " + model.pi.mkString(" "))
    // The original concatenated the Unit result of foreach, which logs "()";
    // build the string with map/mkString instead.
    log.debug("Classification Model theta: " + model.theta.map(_.mkString(" ")).mkString("; "))

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}

object PipeClassificationNaiveBayes {
  def apply(lambda: Double = 1.0) = {
    new PipeClassificationNaiveBayes(lambda)
  }
}
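For reference, NaiveBayesModel.pi holds the log of the class priors and theta the log of the class-conditional feature probabilities, so the debug statements above print raw log values. The companion object makes wiring this stage into a pipeline a one-liner; assuming the surrounding sddf setup:

// Construct the naive Bayes classification stage with a custom smoothing value.
val nbStage = PipeClassificationNaiveBayes(lambda = 0.5)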
Example 16
Source File: DigitRecognizer.scala From AI with Apache License 2.0
package com.bigchange.train

import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object DigitRecognizer {

  // The excerpt is truncated before this point; this helper is a
  // reconstruction of the obvious intent: train a naive Bayes model with the
  // given smoothing parameter and model type.
  def trainNBWithParams(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = {
    new NaiveBayes().setLambda(lambda).setModelType(modelType).run(input)
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("DigitRecognizer").setMaster("local[2]"))

    // How the digit data is loaded is not shown in the excerpt; a LIBSVM-format
    // load from the first CLI argument is assumed here purely as a placeholder.
    val testData: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, args(0)).cache()

    Seq(0.001, 0.01, 0.1, 1.0, 10.0).foreach { param =>
      val nbModel = trainNBWithParams(testData, param, "multinomial")
      val predictResult = testData.map { labeledPoint =>
        val predicted = nbModel.predict(labeledPoint.features)
        if (predicted > 0.5) 1 else 0
      }.reduce(_ + _)
      // The original divided two integers before multiplying by 1.0, which
      // truncated the result; convert to Double before dividing instead.
      val accuracy = predictResult.toDouble / testData.count
      println(s"nb model with lambda:$param,modelType:multinomial,Accuracy:$accuracy")
    }

    sc.stop()
  }
}
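One constraint worth noting for digit data: MLlib's multinomial naive Bayes requires nonnegative feature values, which raw grayscale pixel intensities (0 to 255) already satisfy. If you want intensities on a common [0, 1] scale, a hypothetical pre-processing step (names assumed, not from the example) could be:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// Scale 0-255 grayscale intensities into [0, 1]; naive Bayes only requires
// that features stay nonnegative, so this step is optional.
val scaled = testData.map(lp =>
  LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map(_ / 255.0))))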
Example 17
Source File: SparseNaiveBayes.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

object SparseNaiveBayes {

  case class Params(
      input: String = null,
      minPartitions: Int = 0,
      numFeatures: Int = -1,
      lambda: Double = 1.0) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SparseNaiveBayes") {
      head("SparseNaiveBayes: an example naive Bayes app for LIBSVM data.")
      opt[Int]("numPartitions")
        .text("min number of partitions")
        .action((x, c) => c.copy(minPartitions = x))
      opt[Int]("numFeatures")
        .text("number of features")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Double]("lambda")
        .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
        .action((x, c) => c.copy(lambda = x))
      arg[String]("<input>")
        .text("input paths to labeled examples in LIBSVM format")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"SparseNaiveBayes with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val minPartitions =
      if (params.minPartitions > 0) params.minPartitions else sc.defaultMinPartitions

    val examples =
      MLUtils.loadLibSVMFile(sc, params.input, params.numFeatures, minPartitions)
    // Cache examples because it will be used in both training and evaluation.
    examples.cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0)
    val test = splits(1)

    val numTraining = training.count()
    val numTest = test.count()
    println(s"numTraining = $numTraining, numTest = $numTest.")

    val model = new NaiveBayes().setLambda(params.lambda).run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))
    val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / numTest

    println(s"Test accuracy = $accuracy.")

    sc.stop()
  }
}
// scalastyle:on println