org.apache.spark.mllib.regression.LabeledPoint Scala Examples
The following examples show how to use org.apache.spark.mllib.regression.LabeledPoint.
Each example is taken from an open-source project; the heading above the code names the source file, the project, and its license, and the vote count reflects user ratings on the original site.
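Before the project examples, here is a minimal, self-contained sketch of constructing and parsing LabeledPoint values (the object name is illustrative only and does not come from any of the projects below):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object LabeledPointBasics {
  def main(args: Array[String]): Unit = {
    // a positive example with a dense feature vector
    val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))
    // a negative example with a sparse feature vector of size 3 (non-zeros at indices 0 and 2)
    val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)))
    // LabeledPoint.parse reads the textual form produced by LabeledPoint.toString,
    // the same parser used by the streaming examples further down this page
    val parsed = LabeledPoint.parse(pos.toString)
    println(s"label=${parsed.label}, features=${parsed.features}, negative=$neg")
  }
}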
Example 1
Source File: SparkIntroduction.scala From reactive-machine-learning-systems with MIT License | 6 votes |
package com.reactivemachinelearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.apache.spark.mllib.linalg.Vectors

object SparkIntroduction {

  def main(args: Array[String]) {
    // handle args
    // setup
    val session = SparkSession.builder.appName("Simple ModelExample").getOrCreate()
    import session.implicits._

    // Load and parse the train and test data
    val inputBasePath = "example_data"
    val outputBasePath = "."
    val trainingDataPath = inputBasePath + "/training.txt"
    val testingDataPath = inputBasePath + "/testing.txt"
    val currentOutputPath = outputBasePath + System.currentTimeMillis()

    val trainingData = session.read.textFile(trainingDataPath)
    val trainingParsed = trainingData.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    val testingData = session.read.textFile(testingDataPath)
    val testingParsed = testingData.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    // Building the model
    val numIterations = 100
    val model = LinearRegressionWithSGD.train(trainingParsed.rdd, numIterations)

    // Evaluate model on testing examples
    val predictionsAndLabels = testingParsed.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }

    // Report performance statistics
    val metrics = new MulticlassMetrics(predictionsAndLabels.rdd)
    val precision = metrics.precision
    val recall = metrics.recall
    println(s"Precision: $precision Recall: $recall")

    // Save model
    model.save(session.sparkContext, currentOutputPath)
  }
}
Example 2
Source File: PCAOnSourceVectorExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
// $example off$

object PCAOnSourceVectorExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAOnSourceVectorExample")
    val sc = new SparkContext(conf)
    // $example on$
    val data: RDD[LabeledPoint] = sc.parallelize(Seq(
      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)),
      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)),
      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0))))

    // Compute the top 5 principal components.
    val pca = new PCA(5).fit(data.map(_.features))

    // Project vectors to the linear space spanned by the top 5 principal
    // components, keeping the label
    val projected = data.map(p => p.copy(features = pca.transform(p.features)))
    // $example off$
    val collect = projected.collect()
    println("Projected vector of principal component:")
    collect.foreach { vector => println(vector) }
  }
}
// scalastyle:on println
Example 3
Source File: PCAExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} // $example off$ @deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0") object PCAExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAExample") val sc = new SparkContext(conf) // $example on$ val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) val numIterations = 100 val model = LinearRegressionWithSGD.train(training, numIterations) val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) val valuesAndPreds = test.map { point => val score = model.predict(point.features) (score, point.label) } val valuesAndPreds_pca = test_pca.map { point => val score = model_pca.predict(point.features) (score, point.label) } val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() println("Mean Squared Error = " + MSE) println("PCA Mean Squared Error = " + MSE_pca) // $example off$ sc.stop() } } // scalastyle:on println
Example 4
Source File: LinearRegressionWithSGDExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
// $example off$

@deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0")
object LinearRegressionWithSGDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/ridge-data/lpsa.data")
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    // Building the model
    val numIterations = 100
    val stepSize = 0.00000001
    val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize)

    // Evaluate model on training examples and compute training error
    val valuesAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    println("training Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 5
Source File: StreamingLinearRegressionExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
// $example off$
import org.apache.spark.streaming._

object StreamingLinearRegressionExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>")
      System.exit(1)
    }

    val conf = new SparkConf().setAppName("StreamingLinearRegressionExample")
    val ssc = new StreamingContext(conf, Seconds(1))

    // $example on$
    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache()
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val numFeatures = 3
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(numFeatures))

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
    // $example off$

    ssc.stop()
  }
}
// scalastyle:on println
Example 6
Source File: StreamingKMeansExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} // $example off$ object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } // $example on$ val conf = new SparkConf().setAppName("StreamingKMeansExample") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(args(3).toInt) .setDecayFactor(1.0) .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ } } // scalastyle:on println
Example 7
Source File: DataValidators.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

// Excerpt: only multiLabelValidator is shown here; the enclosing object
// declaration is restored so the snippet compiles, and the object's other
// members are omitted.
@DeveloperApi
object DataValidators extends Logging {

  @Since("1.3.0")
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
}
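A brief usage note for the excerpt above: a validator is simply a function from RDD[LabeledPoint] to Boolean, so it can be applied directly to a training set. The `trainingRdd` value below is an assumed RDD[LabeledPoint], not part of the original file.

// hypothetical usage sketch: returns true when every label lies in {0, 1, 2}
val validForThreeClasses: Boolean = DataValidators.multiLabelValidator(3)(trainingRdd)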
Example 8
Source File: LogisticRegressionDataGenerator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @Since("0.8.0") def generateLogisticRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } @Since("0.8.0") def main(args: Array[String]) { if (args.length != 5) { // scalastyle:off println println("Usage: LogisticRegressionGenerator " + "<master> <output_dir> <num_examples> <num_features> <num_partitions>") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val eps = 3 val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator") val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts) data.saveAsTextFile(outputPath) sc.stop() } }
Example 9
Source File: SVMDataGenerator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object SVMDataGenerator { @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 10
Source File: ChiSqSelectorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { test("ChiSqSelector transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(8.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } test("ChiSqSelector by FPR transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2) val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(2.0, Vectors.dense(Array(9.0)))) val model = new ChiSqSelector().setSelectorType("fpr").setAlpha(0.1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } test("model load / save") { val model = ChiSqSelectorSuite.createModel() val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString try { model.save(sc, path) val sameModel = ChiSqSelectorModel.load(sc, path) ChiSqSelectorSuite.checkEqual(model, sameModel) } finally { Utils.deleteRecursively(tempDir) } } } object ChiSqSelectorSuite extends SparkFunSuite { def createModel(): ChiSqSelectorModel = { val arr = Array(1, 2, 3, 4) new ChiSqSelectorModel(arr) } def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = { assert(a.selectedFeatures.deep == b.selectedFeatures.deep) } }
Example 11
Source File: EnsembleTestHelper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 12
Source File: PythonMLLibAPISuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors} import org.apache.spark.mllib.recommendation.Rating import org.apache.spark.mllib.regression.LabeledPoint class PythonMLLibAPISuite extends SparkFunSuite { SerDe.initialize() test("pickle vector") { val vectors = Seq( Vectors.dense(Array.empty[Double]), Vectors.dense(0.0), Vectors.dense(0.0, -2.0), Vectors.sparse(0, Array.empty[Int], Array.empty[Double]), Vectors.sparse(1, Array.empty[Int], Array.empty[Double]), Vectors.sparse(2, Array(1), Array(-2.0))) vectors.foreach { v => val u = SerDe.loads(SerDe.dumps(v)) assert(u.getClass === v.getClass) assert(u === v) } } test("pickle labeled point") { val points = Seq( LabeledPoint(0.0, Vectors.dense(Array.empty[Double])), LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)), LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])), LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])), LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0)))) points.foreach { p => val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint] assert(q.label === p.label) assert(q.features.getClass === p.features.getClass) assert(q.features === p.features) } } test("pickle double") { for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) { val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double] // We use `equals` here for comparison because we cannot use `==` for NaN assert(x.equals(deser)) } } test("pickle matrix") { val values = Array[Double](0, 1.2, 3, 4.56, 7, 8) val matrix = Matrices.dense(2, 3, values) val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix] assert(matrix === nm) // Test conversion for empty matrix val empty = Array.empty[Double] val emptyMatrix = Matrices.dense(0, 0, empty) val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4)) val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix] assert(sm.toArray === nsm.toArray) val smt = new SparseMatrix( 3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9), isTransposed = true) val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix] assert(smt.toArray === nsmt.toArray) } test("pickle rating") { val rat = new Rating(1, 2, 3.0) val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating] assert(rat == rat2) // Test name of class only occur once val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray val bytes = SerDe.dumps(rats) assert(bytes.toString.split("Rating").length == 1) assert(bytes.length / 10 < 25) // 25 bytes per rating } }
Example 13
Source File: DigitRecognizer.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.train

import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

// Excerpt: the enclosing object and method, the `testData` RDD[LabeledPoint],
// and the `trainNBWithParams` helper are defined elsewhere in the original
// source file and are not shown in this listing.
    val predictResult = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
      val nbModel = trainNBWithParams(testData, param, "multinomial")
      val predictResult = testData.map { labeledPoint =>
        val predicted = nbModel.predict(labeledPoint.features)
        if (predicted > 0.5) 1 else 0
      }.reduce(_ + _)
      // divide as Double so the accuracy is not truncated by integer division
      val accuracy = predictResult.toDouble / testData.count()
      println(s"nb model with lambda:$param,modelType:multinomial,Accuracy:$accuracy")
    }
  }
}
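The `trainNBWithParams` helper referenced above is not included in the excerpt. A minimal sketch of what such a helper could look like, assuming it simply wraps NaiveBayes.train with a smoothing parameter and model type (an assumption, not the project's actual implementation):

import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

// assumed helper: trains a Naive Bayes model with the given lambda and model type
def trainNBWithParams(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel =
  NaiveBayes.train(input, lambda, modelType)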
Example 14
Source File: StreamingLogisticRegression.scala From AI with Apache License 2.0 | 5 votes |
// scalastyle:off println package com.bigchange.mllib import com.bigchange.util.{FileUtil, TimeUtil} import org.apache.spark.SparkConf import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} object StreamingLogisticRegression { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: StreamingLogisticRegression <trainingDir> <testDir> <batchDuration> <numFeatures>") System.exit(1) } val conf = new SparkConf().setMaster("local").setAppName("StreamingLogisticRegression") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingLogisticRegressionWithSGD() .setInitialWeights(Vectors.zeros(args(3).toInt)) model.trainOn(trainingData) // model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() model.predictOnValues(testData.map(lp => (lp.label, lp.features))).map(x => x._1 +"\t" +x._2).foreachRDD(rdd =>{ val value = rdd.collect() FileUtil.normalFileWriter("F:\\datatest\\ai\\StreamingLogisticRegression\\"+TimeUtil.getCurrentHour,value) }) ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 15
Source File: StreamingSimpleModel.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.streaming

import breeze.linalg.DenseVector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingSimpleModel {

  def main(args: Array[String]) {

    val ssc = new StreamingContext("local", "test", Seconds(10))
    val stream = ssc.socketTextStream("localhost", 9999)

    val numberFeatures = 100
    val zeroVector = DenseVector.zeros[Double](numberFeatures)
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.dense(zeroVector.data))
      .setNumIterations(1)
      .setStepSize(0.01)

    val labeledStream = stream.map { event =>
      val split = event.split("\t")
      val y = split(0).toDouble
      val features = split(1).split(",").map(_.toDouble)
      LabeledPoint(label = y, features = Vectors.dense(features))
    }

    model.trainOn(labeledStream)

    // use a DStream transformation to score each batch with the latest model
    val predictAndTrue = labeledStream.transform { rdd =>
      val latestModel = model.latestModel()
      rdd.map { point =>
        val predict = latestModel.predict(point.features)
        predict - point.label
      }
    }

    // compute the MSE and RMSE for each batch
    predictAndTrue.foreachRDD { rdd =>
      val mse = rdd.map(x => x * x).mean()
      val rmse = math.sqrt(mse)
      println(s"current batch, MSE: $mse, RMSE:$rmse")
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
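For reference, each socket line in the example above is expected to be a label, a tab, and a comma-separated feature vector. A small, hypothetical formatter (not part of the original file) that produces lines in that shape:

// builds one input line in the "label\tf1,f2,..." format that the stream parser above expects
def formatLine(lp: LabeledPoint): String =
  s"${lp.label}\t${lp.features.toArray.mkString(",")}"

Feeding such lines to localhost:9999 (for example with netcat) drives both training and the per-batch MSE/RMSE output.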
Example 16
Source File: TrainingSetModel.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.visualisation.model import org.apache.spark.SparkContext.rddToPairRDDFunctions import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate class TrainingSetModel extends BasicAnalysable { var _trainingsSetLabeled: Option[RDD[LabeledPoint]] = None def trainingsSetLabeled = { if (_trainingsSetLabeled.isDefined) { _trainingsSetLabeled.get } else { throw new Exception("Training Set not defined") } } def trainingsSetLabeled_=(trainingsSetLabeled: RDD[LabeledPoint]) = _trainingsSetLabeled = Option(trainingsSetLabeled) lazy val trainingsSetSize = trainingsSetLabeled.count() lazy val trainingSetTruePostiveCount = { val duplicatesFiltered = labelsCounted.filter(_._1 == Duplicate) // reducing is invoked on one single entity and is only used for type conversion. duplicatesFiltered.map(_._2).reduce(_ + _) } lazy val trainingSetTrueNegativeCount = { val duplicatesFiltered = labelsCounted.filter(_._1 == NoDuplicate) // reducing is invoked on one single entity and is only used for type conversion. duplicatesFiltered.map(_._2).reduce(_ + _) } private lazy val labelsCounted = { val keyValue = trainingsSetLabeled.map(lPoint => (lPoint.label, 1)) keyValue.reduceByKey(_ + _) } }
Example 17
Source File: PipeClassificationNaiveBayes.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import scala.beans.BeanInfo import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import org.apache.spark.mllib.classification.NaiveBayesModel class PipeClassificationNaiveBayes(lambda: Double = 1.0) extends AbstractPipeClassification { val paramMap: Map[String, Any] = Map(("lambda", lambda)) def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = { val model = NaiveBayes.train(trainingData, lambda) log.debug("Classification Model:" + model) log.debug("Classification Model labels :" + model.labels.mkString(" ")) log.debug("Classification Model pi: " + model.pi.mkString(" ")) log.debug("Classification Model theta: " + model.theta.foreach(_.mkString(" "))) // Marking Missing Values as Not Equal (0) symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2)))) } } object PipeClassificationNaiveBayes { def apply(lambda: Double = 1.0) = { new PipeClassificationNaiveBayes(lambda) } }
Example 18
Source File: PipeClassificationTrainingDataGenerator.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import scala.compat.Platform import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import com.rockymadden.stringmetric.StringMetric import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.logging.Logging import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.similarity.SimilarityCalculator import de.unihamburg.vsis.sddf.sparkextensions.RddUtils.securlyZipRdds import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable class PipeClassificationTrainingDataGenerator( truePositiveCount: Int = 500, trueNegativeCount: Int = 500)( implicit featureMeasures: Array[(Int, StringMetric[Double])]) extends PipeElement[SymPairSim, (SymPairSim, RDD[LabeledPoint])] with Logging { override def step(input: SymPairSim)(implicit pipeContext: AbstractPipeContext) = { pipeContext match { case pc: GoldstandardContext with CorpusContext => { var truePositiveFraction = truePositiveCount / pc.goldstandard.count.toDouble var trueNegativeFraction = trueNegativeCount / pc.corpus.count.toDouble log.debug("True positive pair fraction taken from the gold standard for training purposes: " + truePositiveFraction) log.debug("True negative pair fraction taken from the corpus for training purposes: " + trueNegativeFraction) if (truePositiveFraction > 1.0) { truePositiveFraction = 1.0 log.debug("True positive pair fraction limited to 1.0") } if (trueNegativeFraction > 1.0) { trueNegativeFraction = 1.0 log.debug("True negative pair fraction limited to 1.0") } val result = generateTrainingData(pc.corpus, pc.goldstandard, truePositiveFraction, trueNegativeFraction) (input, result) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } object PipeClassificationTrainingDataGenerator { val All = -1 def apply( truePositiveCount: Int = 500, trueNegativeCount: Int = 500)( implicit featureMeasures: Array[(Int, StringMetric[Double])]) = { new PipeClassificationTrainingDataGenerator(truePositiveCount, trueNegativeCount) } }
Example 19
Source File: PipeClassificationDecisionTree.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable import de.unihamburg.vsis.sddf.Parameterized import org.apache.spark.mllib.classification.ClassificationModel class PipeClassificationDecisionTree( impurity: String = "gini", maxDepth: Int = 5, maxBins: Int = 32) extends AbstractPipeClassification { val paramMap: Map[String, Any] = Map(("impurity", impurity), ("maxDepth", maxDepth), ("maxBins", maxBins)) def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = { val model = DecisionTree.trainClassifier(trainingData, numClasses = 2, categoricalFeaturesInfo = Map[Int, Int](), impurity, maxDepth, maxBins) log.debug("Decision Tree Model:" + model) log.debug("Decision Tree:" + model.toDebugString) // Marking Missing Values as Not Equal (0) symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2)))) } } object PipeClassificationDecisionTree { def apply( impurity: String = "gini", maxDepth: Int = 5, maxBins: Int = 32) = { new PipeClassificationDecisionTree(impurity, maxDepth, maxBins) } }
Example 20
Source File: PipeClassificationSvm.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import scala.beans.BeanInfo import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import org.apache.spark.mllib.classification.SVMWithSGD class PipeClassificationSvm(numIterations: Int = 100) extends AbstractPipeClassification { val paramMap: Map[String, Any] = Map(("numIterations", numIterations)) def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = { val model = SVMWithSGD.train(trainingData, numIterations) log.debug("Classification Model:" + model) // Marking Missing Values as Not Equal (0) symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2)))) } } object PipeClassificationSvm { def apply(numIterations: Int = 100) = { new PipeClassificationSvm(numIterations) } }
Example 21
Source File: PipeAnalyseClassificationTraining.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.ResultContext import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel class PipeAnalyseClassificationTraining extends PipeElementPassthrough[(SymPairSim, RDD[LabeledPoint])] { override val _analysable: TrainingSetModel = new TrainingSetModel def substep( input: (SymPairSim, RDD[LabeledPoint]))( implicit pipeContext: AbstractPipeContext): Unit = { _analysable.trainingsSetLabeled = input._2 pipeContext match { case pc: ResultContext => { pc.trainingSetModel = Some(_analysable) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } } object PipeAnalyseClassificationTraining { def apply() = new PipeAnalyseClassificationTraining }
Example 22
Source File: AbstractPipeClassification.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.Parameterized import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable abstract class AbstractPipeClassification() extends PipeElement[(SymPairSim, RDD[LabeledPoint]), SymPairSim] with Parameterized { override val _analysable = new AlgoAnalysable _analysable.algo = this def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] def step(input: (SymPairSim, RDD[LabeledPoint]))(implicit pipeContext: AbstractPipeContext): SymPairSim = { pipeContext match { case pc: CorpusContext with GoldstandardContext => { val symPairSim = input._1 val trainingsSet = input._2 val prediction = trainModelAndClassify(trainingsSet, symPairSim) val duplicatePairs = prediction.filter(_._3 == Duplicate).map(tri => (tri._1, tri._2)) duplicatePairs } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } }
Example 23
Source File: PipeDecisionTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.classification import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.classification.PipeClassificationDecisionTree import de.unihamburg.vsis.sddf.classification.PipeClassificationNaiveBayes import de.unihamburg.vsis.sddf.classification.PipeClassificationSvm import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.test.util.LocalSparkContext class PipeClassificationTest extends FunSuite with LocalSparkContext with BeforeAndAfterAll{ var input: (SymPairSim, RDD[LabeledPoint]) = _ override def beforeAll() { super.beforeAll() val tuple1 = Tuple("test1","test1","test1") tuple1.id = 1 val tuple2 = Tuple("test2","test2","test2") tuple2.id = 2 val tuple3 = Tuple("hans","franz","wurst") tuple3.id = 3 val symPairSim: SymPairSim = sc.parallelize(Seq( (new SymPair(tuple1, tuple2), Array(1D,1D,0D)) ,(new SymPair(tuple2, tuple3), Array(0D,0D,1D)) )) val trainingData: RDD[LabeledPoint] = sc.parallelize(Seq( LabeledPoint(label = Duplicate, features = Vectors.dense(Array(0.99,1.0,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.875,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.1))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.89,0.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.1,0.0,1.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.0,0.2,1.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.06,0.0,0.89))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.21,0.19,0.91))) )) input = (symPairSim, trainingData) } override def afterAll() { super.afterAll() } test("naive bayes classification test") { val classificationPipe = new PipeClassificationNaiveBayes() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } test("svm classification test") { val classificationPipe = new PipeClassificationSvm() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } test("decision tree classification test") { val classificationPipe = new PipeClassificationDecisionTree() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } }
Example 24
Source File: FactorizationMachineCtrModel.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{FMModel, FMWithSGD, LabeledPoint}
import org.apache.spark.sql.DataFrame

class FactorizationMachineCtrModel extends BaseCtrModel {

  var _model: FMModel = _

  def train(samples: DataFrame): Unit = {
    // calculate inner product between item embedding and user embedding
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    _pipelineModel = FeatureEngineering.preProcessInnerProductSamples(samplesWithInnerProduct)
    val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct)

    val formatSamples = preparedSamples.rdd.map { row =>
      new LabeledPoint(row.getAs[Int]("label").toDouble,
        Vectors.fromML(row.getAs[DenseVector]("scaledFeatures")))
    }

    _model = FMWithSGD.train(formatSamples, task = 1, numIterations = 200, stepSize = 0.15,
      miniBatchFraction = 1, dim = (true, true, 2), regParam = (0, 0, 0), initStd = 0.1)
  }

  override def transform(samples: DataFrame): DataFrame = {
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct)
    _model.predict(preparedSamples)
  }
}
Example 25
Source File: LogisticRegressionModel.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg.Vector import org.apache.spark.mllib.classification.{LogisticRegressionModel => MLlibLRM} import org.apache.spark.mllib.linalg.{Vector => MLlibVector} import org.apache.spark.mllib.optimization.{SquaredL2Updater, LogisticGradient, LBFGS} import org.apache.spark.mllib.regression.{GeneralizedLinearAlgorithm, LabeledPoint} import org.apache.spark.mllib.util.DataValidators import org.apache.spark.rdd.RDD import keystoneml.utils.MLlibUtils.breezeVectorToMLlib import keystoneml.workflow.{LabelEstimator, Transformer} import scala.reflect.ClassTag private[this] class LogisticRegressionWithLBFGS(numClasses: Int, numFeaturesValue: Int) extends GeneralizedLinearAlgorithm[MLlibLRM] with Serializable { this.numFeatures = numFeaturesValue override val optimizer = new LBFGS(new LogisticGradient, new SquaredL2Updater) override protected val validators = List(multiLabelValidator) require(numClasses > 1) numOfLinearPredictor = numClasses - 1 if (numClasses > 2) { optimizer.setGradient(new LogisticGradient(numClasses)) } private def multiLabelValidator: RDD[LabeledPoint] => Boolean = { data => if (numOfLinearPredictor > 1) { DataValidators.multiLabelValidator(numOfLinearPredictor + 1)(data) } else { DataValidators.binaryLabelValidator(data) } } override protected def createModel(weights: MLlibVector, intercept: Double) = { if (numOfLinearPredictor == 1) { new MLlibLRM(weights, intercept) } else { new MLlibLRM(weights, intercept, numFeatures, numOfLinearPredictor + 1) } } } override def fit(in: RDD[T], labels: RDD[Int]): LogisticRegressionModel[T] = { val labeledPoints = labels.zip(in).map(x => LabeledPoint(x._1, breezeVectorToMLlib(x._2))) val trainer = new LogisticRegressionWithLBFGS(numClasses, numFeatures) trainer.setValidateData(false).optimizer.setNumIterations(numIters).setRegParam(regParam) val model = trainer.run(labeledPoints) new LogisticRegressionModel(model) } }
Example 26
Source File: VLBFGS1.scala From spark-vl-bfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import java.util.Random import scala.language.implicitConversions import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.optim.VectorFreeLBFGS.{Oracle, VectorSpace} import org.apache.spark.ml.optim.VectorRDDFunctions._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.storage.StorageLevel private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = { data.cartesian(dx).map { case (points, x) => val g = Vectors.zeros(x.size) points.foreach { case LabeledPoint(b, a) => val err = BLAS.dot(a, x) - b BLAS.axpy(err, a, g) } g }.treeSum() } def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]") val sc = new SparkContext(conf) sc.setCheckpointDir("/tmp/checkpoint") val n = 1000 val p = 100 val random = new Random(0L) val xExact = Vectors.dense(Array.fill(p)(random.nextDouble())) val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) => val random = new Random(100 + idx) part.map { v => val target = BLAS.dot(v, xExact) + 0.1 * random.nextGaussian() LabeledPoint(target, v) } }.glom() .cache() val x = solve(data).first() println(s"x_exact = $xExact") println(s"x_vlbfgs = $x") sc.stop() } }
Example 27
Source File: spark-latest.scala From ann-benchmark with Apache License 2.0 | 5 votes |
import org.apache.log4j._
Logger.getRootLogger.setLevel(Level.OFF)

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier

// maximum number of worker nodes in cluster
val numNodes = 5
// batch size, ~10K is good for GPU
val batchSize = 1000
// number of iterations to run
val numIterations = 5

val train = MLUtils.loadLibSVMFile(sc, "file:///data/mnist/mnist.scale")
//val layers = Array[Int](780, 2500, 2000, 1500, 1000, 500, 10)
val layers = Array[Int](780, 10)
val trainer = new MultilayerPerceptronClassifier().setLayers(layers).setBlockSize(1000).setSeed(1234L).setMaxIter(1)

for (i <- 1 to numNodes) {
  val dataPartitions = sc.parallelize(1 to i, i)
  val sample = train.sample(true, 1.0 / i, 11L).collect
  val parallelData = sqlContext.createDataFrame(dataPartitions.flatMap(x => sample))
  parallelData.persist
  parallelData.count
  val t = System.nanoTime()
  val model = trainer.fit(parallelData)
  println(i + "\t" + batchSize + "\t" + (System.nanoTime() - t) / (numIterations * 1e9))
  parallelData.unpersist()
}
Example 28
Source File: spark.scala From ann-benchmark with Apache License 2.0 | 5 votes |
import org.apache.log4j._ Logger.getRootLogger.setLevel(Level.OFF) import org.apache.spark.mllib.ann.{FeedForwardTrainer, FeedForwardTopology} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.classification.ANNClassifier // maximum number of worker nodes in cluster val numNodes = 5 // batch size, ~10K is good for GPU val batchSize = 1000 // number of iterations to run val numIterations = 5 val train = MLUtils.loadLibSVMFile(sc, "/mnist.scale") val topology = FeedForwardTopology.multiLayerPerceptron(Array[Int](780, 2500, 2000, 1500, 1000, 500, 10), false) val trainer = new FeedForwardTrainer(topology, 780, 10).setBatchSize(batchSize) trainer.SGDOptimizer.setNumIterations(numIterations).setMiniBatchFraction(1.0).setStepSize(0.03) // parallalize the data for N nodes, persist, run X iterations and print average time for each run for (i <- 1 to numNodes) { val dataPartitions = sc.parallelize(1 to i, i) val sample = train.sample(true, 1.0 / i, 11L).collect val parallelData = dataPartitions.flatMap(x => sample) parallelData.persist parallelData.count val t = System.nanoTime() val model = new ANNClassifier(trainer).train(parallelData) println(i + "\t" + batchSize + "\t" + (System.nanoTime() - t) / (numIterations * 1e9)) }
Example 29
Source File: L9-7FeatureExtraction.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.feature.ChiSqSelector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object FeatureExtractionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: FeatureExtractionApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length).map(f => f / 2048)))) datastream.foreachRDD(rdd => { val selector = new ChiSqSelector(5) val model = selector.fit(rdd) val filtered = rdd.map(p => LabeledPoint(p.label, model.transform(p.features))) filtered.take(20).foreach(println) }) ssc.start() ssc.awaitTermination() } }
Example 30
Source File: L9-9LogisticRegression.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD object LogisticRegressionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: LogisticRegressionApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) val test = walkingOrRunning.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) val train = walkingOrRunning.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() val model = new StreamingLogisticRegressionWithSGD() .setInitialWeights(Vectors.zeros(4)) .setStepSize(0.0001) .setNumIterations(1) model.trainOn(train) model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd .map(v => math.pow((v._1 - v._2), 2)).mean()))) ssc.start() ssc.awaitTermination() } }
Example 31
Source File: L9-1LinearRegression.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object LinearRegressionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: LinearRegressionApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(2).toDouble, f(3).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) val test = datastream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) val train = datastream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() val model = new StreamingLinearRegressionWithSGD() .setInitialWeights(Vectors.zeros(4)) .setStepSize(0.0001) .setNumIterations(1) model.trainOn(train) model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd .map(v => math.pow((v._1 - v._2), 2)).mean()))) ssc.start() ssc.awaitTermination() } }
Example 32
Source File: T9-4DataTypes.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Matrices import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix import org.apache.spark.mllib.linalg.distributed.IndexedRow import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix import org.apache.spark.mllib.linalg.distributed.MatrixEntry import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object DataTypesApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: DataTypesApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) val denseV = substream.map(f => Vectors.dense(f.slice(1, 5))) denseV.print() val sparseV = substream.map(f => f.slice(1, 5).toList).map(f => f.zipWithIndex.map { case (s, i) => (i, s) }) .map(f => f.filter(v => v._2 != 0)).map(l => Vectors.sparse(l.size, l)) sparseV.print() val labeledP = substream.map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) labeledP.print() val denseM = substream.map(f => Matrices.dense(3, 16, f.slice(3, 19) ++ f.slice(20, 36) ++ f.slice(37, 53))) denseM.print() denseV.foreachRDD(rdd => { val rowM = new RowMatrix(rdd) println(rowM) }) denseV.foreachRDD(rdd => { val iRdd = rdd.zipWithIndex.map(v => new IndexedRow(v._2, v._1)) val iRowM = new IndexedRowMatrix(iRdd) println(iRowM) }) substream.foreachRDD(rdd => { val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) val cRowM = new CoordinateMatrix(entries) println(cRowM) }) substream.foreachRDD(rdd => { val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) val blockM = new CoordinateMatrix(entries).toBlockMatrix println(blockM) }) ssc.start() ssc.awaitTermination() } }
Example 33
Source File: L9-5ChiSq.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object ChiSqApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: ChiSqApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) .filter(f => f(0) == 4.0 || f(0) == 5.0) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) .foreachRDD(rdd => { Statistics.chiSqTest(rdd).zipWithIndex.foreach(v => println("%s, column no. %d".format(v._1, v._2))) }) ssc.start() ssc.awaitTermination() } }
Example 34
Source File: L9-4Correlation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CorrelationApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CorrelationApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) walkingOrRunning.map(f => f.features).foreachRDD(rdd => { val corrSpearman = Statistics.corr(rdd, "spearman") val corrPearson = Statistics.corr(rdd, "pearson") println("Correlation Spearman: \n" + corrSpearman) println("Correlation Pearson: \n" + corrPearson) }) ssc.start() ssc.awaitTermination() } }
Example 35
Source File: L9-8PCA.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object PCAApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: PCAApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) datastream.foreachRDD(rdd => { val pca = new PCA(rdd.first().features.size / 2) .fit(rdd.map(_.features)) val testTrain = rdd.randomSplit(Array(0.3, 0.7)) val test = testTrain(0).map(lp => lp.copy(features = pca.transform(lp.features))) val train = testTrain(1).map(lp => lp.copy(features = pca.transform(lp.features))) train.take(20).foreach(println) }) ssc.start() ssc.awaitTermination() } }
Example 36
Source File: L9-10KMeans.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object KMeansClusteringApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: KMeansClusteringApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val orientationStream = substream .map(f => Seq(1, 4, 5, 6, 10, 11, 12, 20, 21, 22, 26, 27, 28, 36, 37, 38, 42, 43, 44).map(i => f(i)).toArray) .map(arr => arr.map(_.toDouble)) .filter(f => f(0) == 1.0 || f(0) == 2.0 || f(0) == 3.0) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) val test = orientationStream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) val train = orientationStream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() val model = new StreamingKMeans() .setK(3) .setDecayFactor(0) .setRandomCenters(18, 0.0) model.trainOn(train.map(v => v.features)) val prediction = model.predictOnValues(test.map(v => (v.label, v.features))) ssc.start() ssc.awaitTermination() } }
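The prediction stream computed at the end of KMeansClusteringApp is never passed to an output operation, so Spark Streaming never actually evaluates or displays it. A minimal, hypothetical addition inside main (not part of the original source) that prints a few (label, predicted cluster) pairs per batch would be:

// Hypothetical output step: materialize the prediction DStream each batch
prediction.print()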
Example 37
Source File: LIBLINEAR.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.liblinear import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.mllib.regression.LabeledPoint import de.bwaldvogel.liblinear.SolverType import se.uu.farmbio.cp.ICP object LIBLINEAR { private def calibrationSplit( trainingData: Array[LabeledPoint], calibrationSizeP: Int, calibrationSizeN: Int) = { val shuffData = Random.shuffle(trainingData.toList) val positives = shuffData.filter { p => p.label == 1.0 } val negatives = shuffData.filter { p => p.label != 1.0 } val calibration = ( positives.take(calibrationSizeP) ++ negatives.take(calibrationSizeN)) .toArray val properTraining = ( //Negative labels go first negatives.takeRight(negatives.length - calibrationSizeN) ++ positives.takeRight(positives.length - calibrationSizeP)) .toArray (properTraining, calibration) } private[liblinear] def splitFractional( trainingData: Array[LabeledPoint], calibrationFraction: Double) = { val calibrationSizeP = (trainingData.count(_.label == 1.0) * calibrationFraction).toInt val calibrationSizeN = (trainingData.count(_.label != 1.0) * calibrationFraction).toInt calibrationSplit(trainingData, calibrationSizeP, calibrationSizeN) } def trainAggregatedICPClassifier( sc: SparkContext, trainingData: Array[LabeledPoint], calibrationFraction: Double = 0.2, numberOfICPs: Int = 30, solverType: SolverType = SolverType.L2R_L2LOSS_SVC_DUAL, regParam: Double = 1, tol: Double = 0.01) = { //Broadcast the dataset val trainBroadcast = sc.broadcast(trainingData) //Train ICPs for different calibration samples val icps = sc.parallelize((1 to numberOfICPs)).map { _ => //Sample calibration val (properTraining, calibration) = splitFractional(trainBroadcast.value, calibrationFraction) //Train ICP val alg = new LibLinAlg( properTraining, solverType, regParam, tol) ICP.trainClassifier(alg, numClasses = 2, calibration) } new AggregatedICPClassifier(icps) } }
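trainAggregatedICPClassifier above only needs a SparkContext and an in-memory array of LabeledPoints; everything else has defaults. A minimal, hypothetical driver sketch (the file path and the comma-separated "label,features" format are assumptions; only the trainer call and its defaults come from the source above):

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import se.uu.farmbio.cp.liblinear.LIBLINEAR

val sc = new SparkContext("local[2]", "AggregatedICPExample") // assumed local context
// Assumed input format: "label,f1 f2 f3 ..."
val training: Array[LabeledPoint] = sc.textFile("data/binary_train.txt")
  .map { line =>
    val parts = line.split(',')
    LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
  }
  .collect()
// Defaults from the source: 20% calibration fraction, 30 aggregated ICPs, L2R_L2LOSS_SVC_DUAL solver
val model = LIBLINEAR.trainAggregatedICPClassifier(sc, training)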
Example 38
Source File: LibLinAlg.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.liblinear import org.apache.spark.mllib.classification.SVMModel import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import de.bwaldvogel.liblinear.Feature import de.bwaldvogel.liblinear.FeatureNode import de.bwaldvogel.liblinear.Linear import de.bwaldvogel.liblinear.Parameter import de.bwaldvogel.liblinear.Problem import de.bwaldvogel.liblinear.SolverType import se.uu.farmbio.cp.UnderlyingAlgorithm import se.uu.farmbio.cp.Deserializer object LibLinAlg { private def vectorToFeatures(v: Vector) = { val indices = v.toSparse.indices val values = v.toSparse.values indices .zip(values) .sortBy { case (i, v) => i } .map { case (i, v) => new FeatureNode(i + 1, v) .asInstanceOf[Feature] } } private def train( input: Array[LabeledPoint], solverType: SolverType, c: Double, tol: Double) = { //configure problem val problem = new Problem problem.l = input.length problem.n = input(0).features.size problem.x = input.map { p => vectorToFeatures(p.features) } problem.y = input.map(_.label + 1.0) problem.bias = -1.0 //train val parameter = new Parameter(solverType, c, tol) val libLinModel = Linear.train(problem, parameter) //convert to Spark SVMModel val weights = libLinModel.getFeatureWeights val intercept = libLinModel.getBias val svmModel = new SVMModel(Vectors.dense(weights).toSparse, intercept) svmModel.clearThreshold svmModel } } object LibLinAlgDeserializer extends Deserializer[LibLinAlg] { override def deserialize(alg: String) = { val splitted = alg.split(",", 2) val intercept = splitted(0) val weights = splitted(1) val model = new SVMModel(Vectors.parse(weights).toSparse, intercept.toDouble) model.clearThreshold() new LibLinAlg(model) } } class LibLinAlg( val svmModel: SVMModel) extends UnderlyingAlgorithm( (features: Vector) => svmModel.predict(features)) { def this( training: Array[LabeledPoint], solverType: SolverType, regParam: Double, tol: Double) = { this(LibLinAlg.train(training, solverType, regParam, tol)) } override def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { score } else { -score } } override def toString = { this.svmModel.intercept + "," + this.svmModel.weights.toString } }
Example 39
package se.uu.farmbio.cp.alg import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.mllib.tree.loss.LogLoss import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.UnderlyingAlgorithm //Define a GBTs UnderlyingAlgorithm private object GBT { def trainingProcedure( input: RDD[LabeledPoint], numIterations: Int): (Vector => Double) = { //Configuration val boostingStrategy = BoostingStrategy.defaultParams("Regression") boostingStrategy.numIterations = numIterations boostingStrategy.treeStrategy.maxDepth = 5 boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]() boostingStrategy.loss = LogLoss //Training val remappedInput = input.map(x => new LabeledPoint((x.label * 2) - 1, x.features)) val model = new GradientBoostedTrees(boostingStrategy) .run(input = remappedInput) model.predict } } class GBT( private val input: RDD[LabeledPoint], private val numIterations: Int) extends UnderlyingAlgorithm( GBT.trainingProcedure(input,numIterations)) { override def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { -score } else { score } } }
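The GBT class above is just an UnderlyingAlgorithm wrapper, so it is meant to be plugged into the conformal-prediction helpers of the same project (the ICP object shown in a later example on this page). A minimal, hypothetical wiring sketch (trainingData is an assumed RDD[LabeledPoint]; the calibration size and iteration count are illustrative):

import se.uu.farmbio.cp.ICP
import se.uu.farmbio.cp.alg.GBT

// trainingData: RDD[LabeledPoint], assumed to be already loaded
// Split off a calibration set, train GBT on the remainder, then calibrate the ICP model
val (calibration, properTraining) = ICP.calibrationSplit(trainingData, numOfCalibSamples = 32)
val gbt = new GBT(properTraining.cache(), numIterations = 20)
val icpModel = ICP.trainClassifier(gbt, numClasses = 2, calibration)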
Example 40
package se.uu.farmbio.cp.alg import org.apache.spark.mllib.classification.SVMModel import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.optimization.HingeGradient import org.apache.spark.mllib.optimization.LBFGS import org.apache.spark.mllib.optimization.SquaredL2Updater import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.UnderlyingAlgorithm //Define a SVMs UnderlyingAlgorithm private object SVM { def trainingProcedure( input: RDD[LabeledPoint], maxNumItearations: Int, regParam: Double, numCorrections: Int, convergenceTol: Double) = { //Train SVM with LBFGS val numFeatures = input.take(1)(0).features.size val training = input.map(x => (x.label, MLUtils.appendBias(x.features))).cache() val initialWeightsWithIntercept = Vectors.dense(new Array[Double](numFeatures + 1)) val (weightsWithIntercept, _) = LBFGS.runLBFGS( training, new HingeGradient(), new SquaredL2Updater(), numCorrections, convergenceTol, maxNumItearations, regParam, initialWeightsWithIntercept) //Create the model using the weights val model = new SVMModel( Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)), weightsWithIntercept(weightsWithIntercept.size - 1)) //Return raw score predictor model.clearThreshold() model } } class SVM(val model: SVMModel) extends UnderlyingAlgorithm(model.predict) { def this( input: RDD[LabeledPoint], maxNumItearations: Int = 100, regParam: Double = 0.1, numCorrections: Int = 10, convergenceTol: Double = 1e-4) = { this(SVM.trainingProcedure( input, maxNumItearations, regParam, numCorrections, convergenceTol)) } def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { -score } else { score } } }
Example 41
Source File: LogisticRegression.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.alg import org.apache.spark.mllib.classification.LogisticRegressionModel import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.optimization.LBFGS import org.apache.spark.mllib.optimization.LogisticGradient import org.apache.spark.mllib.optimization.SquaredL2Updater import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.UnderlyingAlgorithm //Define a LogisticRegression UnderlyingAlgorithm private object LogisticRegression { def trainingProcedure( input: RDD[LabeledPoint], maxNumItearations: Int, regParam: Double, numCorrections: Int, convergenceTol: Double): (Vector => Double) = { //Train Logistic Regression with LBFGS val numFeatures = input.take(1)(0).features.size val training = input.map(x => (x.label, MLUtils.appendBias(x.features))).cache() val initialWeightsWithIntercept = Vectors.dense(new Array[Double](numFeatures + 1)) val (weightsWithIntercept, _) = LBFGS.runLBFGS( training, new LogisticGradient(), new SquaredL2Updater(), numCorrections, convergenceTol, maxNumItearations, regParam, initialWeightsWithIntercept) //Create the model using the weights val model = new LogisticRegressionModel( Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)), weightsWithIntercept(weightsWithIntercept.size - 1)) //Return raw score predictor model.clearThreshold() model.predict } } class LogisticRegression( private val input: RDD[LabeledPoint], private val maxNumItearations: Int = 100, private val regParam: Double = 0.1, private val numCorrections: Int = 10, private val convergenceTol: Double = 1e-4) extends UnderlyingAlgorithm( LogisticRegression.trainingProcedure( input, maxNumItearations, regParam, numCorrections, convergenceTol)) { override def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { 1-score } else { score } } }
Example 42
package se.uu.farmbio.cp

import org.apache.spark.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

object ICP extends Logging {

  private def simpleSplit(
    input: RDD[LabeledPoint],
    numOfCalibSamples: Int) = {

    //Compute the calibration fraction using a binomial upper bound
    val n = input.count
    val fraction = numOfCalibSamples.toDouble / n
    val delta = 1e-4
    val minSamplingRate = 1e-10
    val gamma = -math.log(delta) / n
    val calibFraction = math.min(1,
      math.max(minSamplingRate, fraction + gamma + math.sqrt(gamma * gamma + 2 * gamma * fraction)))

    //calibFraction is enough most of the time, but not always
    val splits = input.randomSplit(Array(calibFraction, 1 - calibFraction))
    var sample = splits(0).collect
    while (sample.length < numOfCalibSamples) {
      logWarning("Needed to re-sample calibration set due to insufficient sample size.")
      val split = input.randomSplit(Array(calibFraction, 1 - calibFraction))
      sample = split(0).collect //collect the newly drawn split, not the original one
    }
    val calibration = sample.take(numOfCalibSamples)
    val additional = sample.takeRight(sample.length - numOfCalibSamples)
    val sc = input.context
    (calibration, splits(1) ++ sc.parallelize(additional))

  }

  private def stratifiedSplit(
    input: RDD[LabeledPoint],
    numOfCalibSamples: Int) = {
    logWarning("Stratified sampling is supported only for binary classification.")
    //Calibration split, making sure there is some data for both classes
    val class0 = input.filter(_.label == 0.0)
    val class1 = input.filter(_.label == 1.0)
    val count0 = class0.count
    val count1 = class1.count
    val posRatio = count1.doubleValue / (count0 + count1)
    val posSize = if (numOfCalibSamples * posRatio < 19) {
      logWarning("Raising the number of positive samples to 19 (allows sig >= 0.5)")
      19
    } else {
      (numOfCalibSamples * posRatio).ceil.toInt
    }
    val negSize = numOfCalibSamples - posSize
    val (negSmpl, negTr) = ICP.simpleSplit(class0, negSize)
    val (posSmpl, posTr) = ICP.simpleSplit(class1, posSize)
    val properTraining = negTr ++ posTr
    val calibration = negSmpl ++ posSmpl
    (calibration, properTraining)
  }

  def calibrationSplit(
    input: RDD[LabeledPoint],
    numOfCalibSamples: Int,
    stratified: Boolean = false) = {
    if (stratified) {
      logWarning("Stratified sampling needs to count the dataset, so use it wisely.")
      ICP.stratifiedSplit(input, numOfCalibSamples)
    } else {
      ICP.simpleSplit(input, numOfCalibSamples)
    }
  }

  def trainClassifier[A <: UnderlyingAlgorithm](
    alg: A,
    numClasses: Int,
    calibSet: Array[LabeledPoint]): ICPClassifierModel[A] = {
    //Compute alphas for each class (mondrian approach)
    val alphas = (0 to numClasses - 1).map { i =>
      calibSet.filter(_.label == i) //filter current label
        .map(newSmpl => alg.nonConformityMeasure(newSmpl)) //compute alpha
    }
    new ICPClassifierModelImpl(alg, alphas)
  }

}
Example 43
Source File: TestUtils.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp import scala.util.Random import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object TestUtils { def generate4ClassesData(instances: Int, seed: Long): Seq[LabeledPoint] = { val rnd = new Random(seed) Seq.fill(instances)((rnd.nextInt(100), rnd.nextInt(100))).map(r => { val label = if (r._1 < 50 && r._2 < 50) { 0.0 } else if (r._1 < 50) { 1.0 } else if (r._2 < 50) { 2.0 } else { 3.0 } new LabeledPoint(label, Vectors.dense(Array(r._1.toDouble, r._2.toDouble))) }) } def generate4ClassesTrainCalibTest(significance: Double) = { val numClasses = 4 val calibSamples = 4 * numClasses * (1 / significance - 1).ceil.toInt //4 times the minimum val training = generate4ClassesData(instances = 80, seed = Random.nextLong) val test = generate4ClassesData(instances = 20, seed = Random.nextLong) val calibration = generate4ClassesData(instances = calibSamples, seed = Random.nextLong) .toArray (training, calibration, test) } def generateBinaryData(instances: Int, seed: Long): Seq[LabeledPoint] = { val rnd = new Random(seed) Seq.fill(instances)(rnd.nextInt(100)).map(r => { val label = if (r < 50) { 0.0 } else { 1.0 } new LabeledPoint(label, Vectors.dense(r)) }) } def testPerformance[T <: UnderlyingAlgorithm]( model: ICPClassifierModel[T], test: RDD[LabeledPoint], sig: Double = 0.2, minEff: Double = 0.6, minRec: Double = 0.6) = { val pvAndLab = test.map { p => (model.mondrianPv(p.features), p.label) } val metrics = new BinaryClassificationICPMetrics(pvAndLab, Array(sig)) val eff = metrics.efficiencyBySignificance(sig) val rec = metrics.recallBySignificance(sig) eff >= minEff && rec >= minRec } }
Example 44
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t"))

    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Count correct predictions while the default threshold is still set,
    // so that predict() returns 0/1 class labels rather than raw scores
    val svmTotalCorrect = data.map { point =>
      if (svmModel.predict(point.features) == point.label) 1 else 0
    }.sum()

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()
    println(svmAccuracy)

    // Clear the threshold only after the accuracy computation, so that
    // subsequent calls to predict() return raw scores
    svmModel.clearThreshold()
  }
}
Example 46
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def svmPipeline(sc: SparkContext) = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t"))

    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // Count correct predictions while the default threshold is still set,
    // so that predict() returns 0/1 class labels rather than raw scores
    val svmTotalCorrect = data.map { point =>
      if (svmModel.predict(point.features) == point.label) 1 else 0
    }.sum()

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()
    println(svmAccuracy)

    // Clear the threshold only after the accuracy computation, so that
    // subsequent calls to predict() return raw scores
    svmModel.clearThreshold()
  }
}
Example 47
Source File: GMMClustering.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.gmm

// scalastyle:off println

// $example on$
import org.apache.spark.SparkConf
import org.apache.spark.ml.clustering.{GaussianMixture, KMeans}
// $example off$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SparkSession

object GMMClustering {

  def main(args: Array[String]): Unit = {
    val spConfig = (new SparkConf).setMaster("local[1]").setAppName("SparkApp")
      .set("spark.driver.allowMultipleContexts", "true")

    val spark = SparkSession
      .builder()
      .appName("Spark SQL Example")
      .config(spConfig)
      .getOrCreate()

    val datasetUsers = spark.read.format("libsvm").load(
      "./data/movie_lens_libsvm/movie_lens_users_libsvm/part-00000")
    datasetUsers.show(3)

    val gmmUsers = new GaussianMixture().setK(5).setSeed(1L)
    val modelUsers = gmmUsers.fit(datasetUsers)

    for (i <- 0 until modelUsers.gaussians.length) {
      println("Users : weight=%f\ncov=%s\nmean=\n%s\n" format
        (modelUsers.weights(i), modelUsers.gaussians(i).cov, modelUsers.gaussians(i).mean))
    }

    val dataSetItems = spark.read.format("libsvm").load(
      "./data/movie_lens_libsvm/movie_lens_items_libsvm/part-00000")

    val gmmItems = new GaussianMixture().setK(5).setSeed(1L)
    val modelItems = gmmItems.fit(dataSetItems)

    // Report the item model's own parameters (the original printed the user model's again)
    for (i <- 0 until modelItems.gaussians.length) {
      println("Items : weight=%f\ncov=%s\nmean=\n%s\n" format
        (modelItems.weights(i), modelItems.gaussians(i).cov, modelItems.gaussians(i).mean))
    }
    spark.stop()
  }

  def loadInLibSVMFormat(line: String, noOfFeatures: Int): LabeledPoint = {
    val items = line.split(' ')
    val label = items.head.toDouble
    val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
      val indexAndValue = item.split(':')
      val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based.
      val value = indexAndValue(1).toDouble
      (index, value)
    }.unzip

    // check if indices are one-based and in ascending order
    var previous = -1
    var i = 0
    val indicesLength = indices.length
    while (i < indicesLength) {
      val current = indices(i)
      require(current > previous, "indices should be one-based and in ascending order")
      previous = current
      i += 1
    }
    LabeledPoint(label, Vectors.sparse(noOfFeatures, indices, values))
  }
}
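The loadInLibSVMFormat helper above parses one LIBSVM-formatted line into a LabeledPoint, shifting the 1-based indices to 0-based. A small illustrative call (the line content and feature count are made up):

// "1.0 3:0.5 7:1.2" -> label 1.0, sparse vector of size 10 with entries at indices 2 and 6
val lp = GMMClustering.loadInLibSVMFormat("1.0 3:0.5 7:1.2", noOfFeatures = 10)
println(lp) // (1.0,(10,[2,6],[0.5,1.2]))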
Example 48
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.feature.{HashingTF, IDF} import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.{ SparseVector => SV } object DocumentClassification { def main(args: Array[String]) { val sc = new SparkContext("local[2]", "First Spark App") val path = "../data/20news-bydate-train/*" val rdd = sc.wholeTextFiles(path) val text = rdd.map { case (file, text) => text } val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head } val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap val dim = math.pow(2, 18).toInt val hashingTF = new HashingTF(dim) var tokens = text.map(doc => TFIDFExtraction.tokenize(doc)) val tf = hashingTF.transform(tokens) tf.cache val v = tf.first.asInstanceOf[SV] val idf = new IDF().fit(tf) val tfidf = idf.transform(tf) val zipped = newsgroups.zip(tfidf) val train = zipped.map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) } train.cache val model = NaiveBayes.train(train, lambda = 0.1) val testPath = "../data/20news-bydate-test/*" val testRDD = sc.wholeTextFiles(testPath) val testLabels = testRDD.map { case (file, text) => val topic = file.split("/").takeRight(2).head newsgroupsMap(topic) } val testTf = testRDD.map { case (file, text) => hashingTF.transform(TFIDFExtraction.tokenize(text)) } val testTfIdf = idf.transform(testTf) val zippedTest = testLabels.zip(testTfIdf) val test = zippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) } val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() println(accuracy) // Updated Dec 2016 by Rajdeep //0.7928836962294211 val metrics = new MulticlassMetrics(predictionAndLabel) println(metrics.weightedFMeasure) //0.7822644376431702 val rawTokens = rdd.map { case (file, text) => text.split(" ") } val rawTF = rawTokens.map(doc => hashingTF.transform(doc)) val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) } val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1) val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) } val rawZippedTest = testLabels.zip(rawTestTF) val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) } val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label)) val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count() println(rawAccuracy) // 0.7661975570897503 val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel) println(rawMetrics.weightedFMeasure) // older value 0.7628947184990661 // dec 2016 : 0.7653320418573546 sc.stop() } }
Example 49
Source File: DocumentClassification.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.feature.{HashingTF, IDF} import org.apache.spark.mllib.linalg.SparseVector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.{SparseVector => SV} import org.apache.spark.mllib.util.MLUtils //import org.apache.spark.ml.feature.HashingTF //import org.apache.spark.ml.feature.IDF object DocumentClassification { def main(args: Array[String]) { val sc = new SparkContext("local[2]", "First Spark App") val path = "../data/20news-bydate-train/*" val rdd = sc.wholeTextFiles(path) val text = rdd.map { case (file, text) => text } val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head } val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap val dim = math.pow(2, 18).toInt val hashingTF = new HashingTF(dim) var tokens = text.map(doc => TFIDFExtraction.tokenize(doc)) val tf = hashingTF.transform(tokens) tf.cache val v = tf.first.asInstanceOf[SV] val idf = new IDF().fit(tf) val tfidf = idf.transform(tf) val zipped = newsgroups.zip(tfidf) println(zipped.first()) val train = zipped.map { case (topic, vector) => { LabeledPoint(newsgroupsMap(topic), vector) } } //TODO uncomment to generate libsvm format MLUtils.saveAsLibSVMFile(train,"./output/20news-by-date-train-libsvm") train.cache val model = NaiveBayes.train(train, lambda = 0.1) val testPath = "../data/20news-bydate-test/*" val testRDD = sc.wholeTextFiles(testPath) val testLabels = testRDD.map { case (file, text) => val topic = file.split("/").takeRight(2).head newsgroupsMap(topic) } val testTf = testRDD.map { case (file, text) => hashingTF.transform(TFIDFExtraction.tokenize(text)) } val testTfIdf = idf.transform(testTf) val zippedTest = testLabels.zip(testTfIdf) val test = zippedTest.map { case (topic, vector) => { println(topic) println(vector) LabeledPoint(topic, vector) } } //TODO uncomment to generate libsvm format MLUtils.saveAsLibSVMFile(test,"./output/20news-by-date-test-libsvm") val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() println(accuracy) // Updated Dec 2016 by Rajdeep //0.7928836962294211 val metrics = new MulticlassMetrics(predictionAndLabel) println(metrics.accuracy) println(metrics.weightedFalsePositiveRate) println(metrics.weightedPrecision) println(metrics.weightedFMeasure) println(metrics.weightedRecall) //0.7822644376431702 val rawTokens = rdd.map { case (file, text) => text.split(" ") } val rawTF = rawTokens.map(doc => hashingTF.transform(doc)) val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) } val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1) val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) } val rawZippedTest = testLabels.zip(rawTestTF) val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) } val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label)) val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count() println(rawAccuracy) // 0.7661975570897503 val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel) println(rawMetrics.weightedFMeasure) // older value 0.7628947184990661 // dec 2016 : 0.7653320418573546 sc.stop() } }
Example 50
Source File: LinearRegression.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.linearregression import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object LinearRegression{ def main(args: Array[String]) { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = Util.get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val first_point = data.first() println("Linear Model feature vector:" + first_point.features.toString) println("Linear Model feature vector length: " + first_point.features.size) val iterations = 10 val step = 0.025 val intercept =true //LinearRegressionWithSGD.tr val linear_model = LinearRegressionWithSGD.train(data, iterations, step) val x = linear_model.predict(data.first().features) val true_vs_predicted = data.map(p => (p.label, linear_model.predict(p.features))) val true_vs_predicted_csv = data.map(p => p.label + " ," + linear_model.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) val save = true if (save){ true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv") } val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressioWithSGD") } }
Example 51
Source File: LinearRegressionWithLog.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.linearregression import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object LinearRegressionWithLog{ def main(args: Array[String]) { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = Util.get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Math.log(Util.extractLabel(r)), Util.extractFeatures(r, catLen, mappings))) } val first_point = data.first() println("Linear Model feature vector:" + first_point.features.toString) println("Linear Model feature vector length: " + first_point.features.size) val iterations = 10 //val step = 0.2 val step = 0.025 val intercept =true //LinearRegressionWithSGD.tr val linear_model = LinearRegressionWithSGD.train(data, iterations, step) val x = linear_model.predict(data.first().features) val true_vs_predicted = data.map(p => (Math.exp(p.label), Math.exp(linear_model.predict(p.features)))) val true_vs_predicted_csv = data.map(p => p.label + " ," + linear_model.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) val save = false if (save){ true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv") } val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressioWithSGD Log") } }
Example 52
Source File: DecisionTreeUtil.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.decisiontree import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object DecisionTreeUtil { def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = Util.get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val data_dt = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r))) } val splits = data_dt.randomSplit(Array(0.8, 0.2), seed = 11L) val training = splits(0).cache() val test = splits(1) return (training, test) } def evaluate(train: RDD[LabeledPoint],test: RDD[LabeledPoint], categoricalFeaturesInfo: scala.Predef.Map[Int, Int], maxDepth :Int, maxBins: Int): Double = { val impurity = "variance" val decisionTreeModel = DecisionTree.trainRegressor(train, categoricalFeaturesInfo, impurity,maxDepth, maxBins ) val true_vs_predicted = test.map(p => (p.label, decisionTreeModel.predict(p.features))) val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) return rmsle } }
Example 53
Source File: DecisionTreeCategoricalFeaturesApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.decisiontree import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object DecisionTreeCategoricalFeaturesApp{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { val save = true //val sc = new SparkContext("local[2]", "First Spark App") val sc = Util.sc // we take the raw data in CSV format and convert it into a set of records // of the form (user, product, price) val rawData = sc.textFile("../data/hour_noheader.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) val first = records.first() println(numData.toInt) records.cache() print("Mapping of first categorical feature column: " + get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen println("Feature vector length for categorical features:"+ catLen) println("Feature vector length for numerical features:" + numLen) println("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val data_dt = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r))) } val first_point = data_dt.first() println("Decision Tree feature vector:" + first_point.features.toString) println("Decision Tree feature vector length: " + first_point.features.size) def getCatFeatures(): scala.Predef.Map[Int, Int] = { var d = scala.Predef.Map[Int, Int]() for(a <- 2 until 10){ d += (a-2 -> (get_mapping(records, a).size + 1)) //d.put(a-2,get_mapping(records, a).size + 1) } return d } val cat_features = getCatFeatures() //dict([(i - 2, len(get_mapping(records, i)) + 1) for i in range(2,10)]) //val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]() val impurity = "variance" val maxDepth = 5 val maxBins = 32 val decisionTreeModel= DecisionTree.trainRegressor(data_dt, cat_features, impurity, maxDepth, maxBins) //val decisionTreeModel = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo, // impurity, maxDepth, maxBins ) val preds = decisionTreeModel.predict(data_dt.map( p=> p.features)) val actual = data.map( p=> p.label) val true_vs_predicted_dt = actual.zip(preds) val true_vs_predicted_csv = data.map(p => p.label + " ," + decisionTreeModel.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) if (save){ true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_categorical_" + date + ".csv") } print("Decision Tree depth: " + decisionTreeModel.depth) print("Decision Tree number of nodes: " + decisionTreeModel.numNodes) Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree Categorical Features") } }
Example 54
Source File: DecisionTreeWithLog.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.decisiontree

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object DecisionTreeWithLog {

  def get_mapping(rdd: RDD[Array[String]], idx: Int): Map[String, Long] = {
    return rdd.map(fields => fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    val save = false
    val sc = Util.sc
    // we take the raw data in CSV format and convert it into a set of records
    // of the form (user, product, price)
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()

    val records = rawData.map(line => line.split(","))
    val first = records.first()

    println(numData.toInt)
    records.cache()
    print("Mapping of first categorical feature column: " + get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for (i <- 2 to 9) {
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach(m => (catLen += m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen
    println("Feature vector length for categorical features:" + catLen)
    println("Feature vector length for numerical features:" + numLen)
    println("Total feature vector length: " + totalLen)

    val data_dt = {
      records.map(r => LabeledPoint(Math.log(Util.extractLabel(r)), Util.extract_features_dt(r)))
    }

    val first_point = data_dt.first()
    println("Decision Tree feature vector:" + first_point.features.toString)
    println("Decision Tree feature vector length: " + first_point.features.size)

    val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32

    val decisionTreeModel = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins)

    val preds = decisionTreeModel.predict(data_dt.map(p => p.features))
    val preds_2 = preds.map(p => Math.exp(p))
    val actual = data_dt.map(p => Math.exp(p.label))
    // compare exponentiated predictions with exponentiated labels, so both are back on the original scale
    val true_vs_predicted_dt = actual.zip(preds_2)

    if (save) {
      val true_vs_predicted_csv = data_dt.map(p => p.label + " ," + decisionTreeModel.predict(p.features))
      val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
      val date = format.format(new java.util.Date())
      true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_" + date + ".csv")
    }

    print("Decision Tree depth: " + decisionTreeModel.depth)
    print("Decision Tree number of nodes: " + decisionTreeModel.numNodes)

    Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree With Log")
    Util.sc.stop()
  }
}
Example 55
Source File: RidgeRegressionApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples import org.apache.spark.mllib.regression.{LabeledPoint, RidgeRegressionWithSGD} import org.apache.spark.rdd.RDD import scala.collection.Map import scala.collection.mutable.ListBuffer object RidgeRegressionApp{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { //val sc = new SparkContext("local[2]", "First Spark App") val sc = Util.sc // we take the raw data in CSV format and convert it into a set of records // of the form (user, product, price) val rawData = sc.textFile("../data/hour_noheader.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) records.cache() //print("Mapping of first categorical feature column: " + get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val first_point = data.first() println("Linear Model feature vector:" + first_point.features.toString) println("Linear Model feature vector length: " + first_point.features.size) val iterations = 10 val step = 0.1 val intercept =false val rr = new RidgeRegressionWithSGD() rr.optimizer.setNumIterations(iterations) rr.optimizer.setStepSize(0.1) val rrModel = rr.run(data) val true_vs_predicted = data.map(p => (p.label, rrModel.predict(p.features))) val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean() val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean() val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) println("Ridge Regression - Mean Squared Error: " + mse) println("Ridge Regression - Mean Absolute Error: " + mae) println("Ridge Regression - Root Mean Squared Log Error:" + rmsle) } }
Example 56
Source File: GradientBoostedTreesUtil.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.gradientboosted import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object GradientBoostedTreesUtil { def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val splits = data.randomSplit(Array(0.8, 0.2), seed = 11L) val training = splits(0).cache() val test = splits(1) return (training, test) } def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def evaluate(train: RDD[LabeledPoint],test: RDD[LabeledPoint], iterations:Int, maxDepth:Int, maxBins: Int): Double ={ var boostingStrategy = BoostingStrategy.defaultParams("Regression") boostingStrategy.setNumIterations(iterations) boostingStrategy.treeStrategy.setMaxDepth(maxDepth) boostingStrategy.treeStrategy.setMaxBins(maxBins) val model = GradientBoostedTrees.train(train, boostingStrategy) // // @classmethod // @since("1.3.0") // def trainRegressor(cls, data, categoricalFeaturesInfo, // loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3, // maxBins=32): val true_vs_predicted = test.map(p => (p.label, model.predict(p.features))) val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) return rmsle } }
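Because evaluate above returns the RMSLE for a given (iterations, maxDepth, maxBins) combination, it lends itself to a simple parameter sweep. A minimal sketch (the parameter values are illustrative, not from the source):

val (train, test) = GradientBoostedTreesUtil.getTrainTestData()
for (iterations <- Seq(1, 5, 10, 20)) {
  val rmsle = GradientBoostedTreesUtil.evaluate(train, test, iterations, maxDepth = 3, maxBins = 32)
  println(s"iterations=$iterations -> RMSLE=$rmsle")
}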
Example 57
Source File: GradientBoostedTreesApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.gradientboosted import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object GradientBoostedTreesApp{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { //val conf = new SparkConf().setMaster("local").setAppName("GradientBoostedTreesRegressionApp") val sc = Util.sc // we take the raw data in CSV format and convert it into a set of records // of the form (user, product, price) val rawData = sc.textFile("../data/hour_noheader.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) records.cache() var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val first_point = data.first() println("Gradient Boosted Trees Model feature vector:" + first_point.features.toString) println("Gradient Boosted Trees Model feature vector length: " + first_point.features.size) var boostingStrategy = BoostingStrategy.defaultParams("Regression") boostingStrategy.setNumIterations(3)// Note: Use more iterations in practice. boostingStrategy.treeStrategy.setMaxDepth(5) val model = GradientBoostedTrees.train(data, boostingStrategy) val true_vs_predicted = data.map(p => (p.label, model.predict(p.features))) val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } val save = true if(save){ val true_vs_predicted_csv = data.map(p => p.label + " ," + model.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) true_vs_predicted_csv.saveAsTextFile("./output/gradient_boosted_trees_" + date + ".csv") } val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean() val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean() val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) println("Gradient Boosted Trees - Mean Squared Error: " + mse) println("Gradient Boosted Trees - Mean Absolute Error: " + mae) println("Gradient Boosted Trees - Root Mean Squared Log Error:" + rmsle) } }
Example 58
Source File: LinearRegressionWithIntercept.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.linearregression import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object LinearRegressionWithIntercept{ def main(args: Array[String]) { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = Util.get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val data1 = { records.map(r => Util.extractFeatures(r, catLen, mappings)) } val first_point = data.first() println("Linear Model feature vector:" + first_point.features.toString) println("Linear Model feature vector length: " + first_point.features.size) val iterations = 10 val step = 0.025 val intercept =true val linReg = new LinearRegressionWithSGD().setIntercept(intercept) linReg.optimizer.setNumIterations(iterations).setStepSize(step) val linear_model = linReg.run(data) print(data.first()); val x = linear_model.predict(data.first().features) val true_vs_predicted = data.map(p => (p.label, linear_model.predict(p.features))) val true_vs_predicted_csv = data.map(p => p.label + " ," + linear_model.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) val save = true if (save){ true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv") } val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean() val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean() val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) println("Linear Model - Mean Squared Error: " + mse) println("Linear Model - Mean Absolute Error: " + mae) println("Linear Model - Root Mean Squared Log Error:" + rmsle) } }
Example 61
Source File: IsotonicRegressionApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples import org.apache.spark.mllib.regression.{IsotonicRegression, LabeledPoint} import org.apache.spark.rdd.RDD import scala.collection.Map import scala.collection.mutable.ListBuffer object IsotonicRegressionApp{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { val sc = Util.sc // read the raw bike-sharing data in CSV format and split each line into an array of fields val rawData = sc.textFile("../data/hour_noheader_1000.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) records.cache() var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val parsedData = records.map { r => (Util.extractLabel(r), Util.extractSumFeature(r, catLen, mappings), 1.0) } val iterations = 10 val step = 0.1 val intercept =false val x = new IsotonicRegression().setIsotonic(false) val model = x.run(parsedData) val parsedData1: RDD[Double] = parsedData.map(r => r._2) //val model = GradientBoostedTrees.train(data, boostingStrategy) val true_vs_predicted = parsedData.map(p => (p._1, model.predict(p._2))) val save = true if(save){ val true_vs_predicted_csv = parsedData.map(p => ( p._1+ "," + model.predict(p._2))) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) true_vs_predicted_csv.saveAsTextFile("./output/isotonic_regression_" + date + ".csv") } val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean() val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean() val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) Util.calculatePrintMetrics(true_vs_predicted, "Isotonic Regression") } }
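The example above feeds IsotonicRegression through project-specific feature extraction. Stripped of that, a minimal sketch of the mllib isotonic regression API on toy (label, feature, weight) triples, assuming a SparkContext named sc is in scope:

import org.apache.spark.mllib.regression.IsotonicRegression

// (label, feature, weight) triples -- the input type IsotonicRegression.run expects.
val points = sc.parallelize(Seq(
  (1.0, 1.0, 1.0), (2.0, 2.0, 1.0), (1.5, 3.0, 1.0), (3.0, 4.0, 1.0)))

val model = new IsotonicRegression().setIsotonic(true).run(points)

// Predict for a single feature value and for an RDD of feature values.
println(model.predict(2.5))
val predictions = model.predict(points.map(_._2))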
Example 62
Source File: LinearRegressionDataGen.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.datageneration.mlgenerator import org.apache.spark.mllib.util.LinearDataGenerator import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row, SparkSession} import com.ibm.sparktc.sparkbench.utils.{SaveModes, SparkBenchException} import com.ibm.sparktc.sparkbench.utils.GeneralFunctions.{getOrDefault, getOrThrow, time} import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults} import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} object LinearRegressionDataGen extends WorkloadDefaults { val name = "data-generation-lr" // Application parameters #1million points have 200M data size val numOfExamples: Int = 40000 val numOfFeatures: Int = 4 val eps: Double = 0.5 val intercepts: Double = 0.1 val numOfPartitions: Int = 10 val maxIteration: Int = 3 override def apply(m: Map[String, Any]) = new LinearRegressionDataGen( numRows = getOrThrow(m, "rows").asInstanceOf[Int], numCols = getOrThrow(m, "cols").asInstanceOf[Int], output = Some(getOrThrow(m, "output").asInstanceOf[String]), saveMode = getOrDefault[String](m, "save-mode", SaveModes.error), eps = getOrDefault[Double](m, "eps", eps), intercepts = getOrDefault[Double](m, "intercepts", intercepts), numPartitions = getOrDefault[Int](m, "partitions", numOfPartitions) ) } case class LinearRegressionDataGen ( numRows: Int, numCols: Int, input: Option[String] = None, output: Option[String], saveMode: String, eps: Double, intercepts: Double, numPartitions: Int ) extends Workload { override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = { val timestamp = System.currentTimeMillis() val (generateTime, data): (Long, RDD[LabeledPoint]) = time { LinearDataGenerator.generateLinearRDD( spark.sparkContext, numRows, numCols, eps, numPartitions, intercepts ) } import spark.implicits._ val (convertTime, dataDF) = time { data.toDF } val (saveTime, _) = time { val outputstr = output.get if(outputstr.endsWith(".csv")) throw SparkBenchException("LabeledPoints cannot be saved to CSV. Please try outputting to Parquet instead.") writeToDisk(output.get, saveMode, dataDF, spark) }//TODO you can't output this to CSV. Parquet is fine val timeResultSchema = StructType( List( StructField("name", StringType, nullable = false), StructField("timestamp", LongType, nullable = false), StructField("generate", LongType, nullable = true), StructField("convert", LongType, nullable = true), StructField("save", LongType, nullable = true), StructField("total_runtime", LongType, nullable = false) ) ) val total = generateTime + convertTime + saveTime val timeList = spark.sparkContext.parallelize(Seq(Row("kmeans", timestamp, generateTime, convertTime, saveTime, total))) spark.createDataFrame(timeList, timeResultSchema) } }
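Outside of the spark-bench workload wrapper, the same synthetic data can be produced in a few lines. A minimal sketch using the generator defaults listed above (the output path is illustrative only):

import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder.appName("lr-datagen-sketch").getOrCreate()
import spark.implicits._

// 40,000 rows with 4 features, noise eps = 0.5, intercept 0.1, over 10 partitions.
val data = LinearDataGenerator.generateLinearRDD(spark.sparkContext, 40000, 4, 0.5, 10, 0.1)

// The Vector features column is not representable in CSV, which is why the
// workload above throws for .csv outputs; Parquet works fine.
data.toDF().write.parquet("/tmp/linear-regression-data.parquet")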
Example 63
Source File: PCAOnSourceVectorExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD // $example off$ object PCAOnSourceVectorExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnSourceVectorExample") val sc = new SparkContext(conf) // $example on$ val data: RDD[LabeledPoint] = sc.parallelize(Seq( new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)), new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)))) // Compute the top 5 principal components. val pca = new PCA(5).fit(data.map(_.features)) // Project vectors to the linear space spanned by the top 5 principal // components, keeping the label val projected = data.map(p => p.copy(features = pca.transform(p.features))) // $example off$ val collect = projected.collect() println("Projected vector of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 64
Source File: PCAExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} // $example off$ @deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0") object PCAExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAExample") val sc = new SparkContext(conf) // $example on$ val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) val numIterations = 100 val model = LinearRegressionWithSGD.train(training, numIterations) val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) val valuesAndPreds = test.map { point => val score = model.predict(point.features) (score, point.label) } val valuesAndPreds_pca = test_pca.map { point => val score = model_pca.predict(point.features) (score, point.label) } val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() println("Mean Squared Error = " + MSE) println("PCA Mean Squared Error = " + MSE_pca) // $example off$ sc.stop() } } // scalastyle:on println
Example 65
Source File: LinearRegressionWithSGDExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD // $example off$ @deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0") object LinearRegressionWithSGDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/ridge-data/lpsa.data") val parsedData = data.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // Building the model val numIterations = 100 val stepSize = 0.00000001 val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) // Evaluate model on training examples and compute training error val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() println("training Mean Squared Error = " + MSE) // Save and load model model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel") val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel") // $example off$ sc.stop() } } // scalastyle:on println
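Instead of hand-rolling the MSE as above, mllib ships a RegressionMetrics helper. A small sketch reusing the valuesAndPreds RDD from this example (note RegressionMetrics expects (prediction, observation) ordering, so the tuple is swapped):

import org.apache.spark.mllib.evaluation.RegressionMetrics

val metrics = new RegressionMetrics(valuesAndPreds.map { case (label, pred) => (pred, label) })
println(s"MSE  = ${metrics.meanSquaredError}")
println(s"RMSE = ${metrics.rootMeanSquaredError}")
println(s"R^2  = ${metrics.r2}")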
Example 66
Source File: StreamingLinearRegressionExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD // $example off$ import org.apache.spark.streaming._ object StreamingLinearRegressionExample { def main(args: Array[String]): Unit = { if (args.length != 2) { System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>") System.exit(1) } val conf = new SparkConf().setAppName("StreamingLinearRegressionExample") val ssc = new StreamingContext(conf, Seconds(1)) // $example on$ val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache() val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val numFeatures = 3 val model = new StreamingLinearRegressionWithSGD() .setInitialWeights(Vectors.zeros(numFeatures)) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ ssc.stop() } } // scalastyle:on println
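The streaming example reads whatever new text files appear under its training and test directories and parses them with LabeledPoint.parse. A hedged sketch of what such a file could contain and how to drop one in (paths and file names are illustrative; the directories must already exist):

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

// LabeledPoint.parse accepts the "(label,[f1,f2,f3])" form; three features here
// to match numFeatures = 3 in the example above.
val lines = Seq("(1.0,[0.5,0.3,0.1])", "(0.0,[0.2,0.9,0.4])").mkString("\n")
Files.write(Paths.get("/tmp/trainingDir/batch-0001.txt"),
  lines.getBytes(StandardCharsets.UTF_8))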
Example 67
Source File: StreamingKMeansExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} // $example off$ object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } // $example on$ val conf = new SparkConf().setAppName("StreamingKMeansExample") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(args(3).toInt) .setDecayFactor(1.0) .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ } } // scalastyle:on println
Example 68
Source File: DataValidators.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @Since("1.3.0") def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data => val numInvalid = data.filter(x => x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count() if (numInvalid != 0) { logError("Classification labels should be in {0 to " + (k - 1) + "}. " + "Found " + numInvalid + " invalid labels") } numInvalid == 0 } }
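A short usage sketch for the validator above (DataValidators is a DeveloperApi object; a SparkContext named sc is assumed to be in scope):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.DataValidators

val labeled = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0, 2.0)),
  LabeledPoint(2.0, Vectors.dense(0.5, 1.5))))

// Returns true only if every label lies in {0, 1, 2}; otherwise it logs the
// number of invalid labels and returns false.
val threeClassValidator = DataValidators.multiLabelValidator(3)
println(s"labels valid for 3 classes: ${threeClassValidator(labeled)}")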
Example 69
Source File: LogisticRegressionDataGenerator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @Since("0.8.0") def generateLogisticRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } @Since("0.8.0") def main(args: Array[String]) { if (args.length != 5) { // scalastyle:off println println("Usage: LogisticRegressionGenerator " + "<master> <output_dir> <num_examples> <num_features> <num_partitions>") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val eps = 3 val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator") val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts) data.saveAsTextFile(outputPath) sc.stop() } }
Example 70
Source File: SVMDataGenerator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object SVMDataGenerator { @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 71
Source File: ChiSqSelectorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { test("ChiSqSelector transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = Seq(LabeledPoint(0.0, Vectors.dense(Array(8.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSeq assert(filteredData === preFilteredData) } test("ChiSqSelector by fpr transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2) val preFilteredData = Seq(LabeledPoint(0.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(2.0, Vectors.dense(Array(9.0)))) val model: ChiSqSelectorModel = new ChiSqSelector().setSelectorType("fpr") .setFpr(0.1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSeq assert(filteredData === preFilteredData) } test("model load / save") { val model = ChiSqSelectorSuite.createModel() val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString try { model.save(sc, path) val sameModel = ChiSqSelectorModel.load(sc, path) ChiSqSelectorSuite.checkEqual(model, sameModel) } finally { Utils.deleteRecursively(tempDir) } } } object ChiSqSelectorSuite extends SparkFunSuite { def createModel(): ChiSqSelectorModel = { val arr = Array(1, 2, 3, 4) new ChiSqSelectorModel(arr) } def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = { assert(a.selectedFeatures.deep == b.selectedFeatures.deep) } }
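Outside of the test suite, the selector is driven by the same three calls exercised above; a minimal sketch over an RDD[LabeledPoint] of categorical features (reusing labeledDiscreteData from the first test):

import org.apache.spark.mllib.feature.ChiSqSelector
import org.apache.spark.mllib.regression.LabeledPoint

// Keep the single most predictive feature according to the chi-squared test,
// then rewrite each LabeledPoint with the reduced feature vector.
val model = new ChiSqSelector(1).fit(labeledDiscreteData)
val reduced = labeledDiscreteData.map { lp =>
  LabeledPoint(lp.label, model.transform(lp.features))
}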
Example 72
Source File: EnsembleTestHelper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 73
Source File: PythonMLLibAPISuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors} import org.apache.spark.mllib.recommendation.Rating import org.apache.spark.mllib.regression.LabeledPoint class PythonMLLibAPISuite extends SparkFunSuite { SerDe.initialize() test("pickle vector") { val vectors = Seq( Vectors.dense(Array.empty[Double]), Vectors.dense(0.0), Vectors.dense(0.0, -2.0), Vectors.sparse(0, Array.empty[Int], Array.empty[Double]), Vectors.sparse(1, Array.empty[Int], Array.empty[Double]), Vectors.sparse(2, Array(1), Array(-2.0))) vectors.foreach { v => val u = SerDe.loads(SerDe.dumps(v)) assert(u.getClass === v.getClass) assert(u === v) } } test("pickle labeled point") { val points = Seq( LabeledPoint(0.0, Vectors.dense(Array.empty[Double])), LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)), LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])), LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])), LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0)))) points.foreach { p => val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint] assert(q.label === p.label) assert(q.features.getClass === p.features.getClass) assert(q.features === p.features) } } test("pickle double") { for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) { val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double] // We use `equals` here for comparison because we cannot use `==` for NaN assert(x.equals(deser)) } } test("pickle matrix") { val values = Array[Double](0, 1.2, 3, 4.56, 7, 8) val matrix = Matrices.dense(2, 3, values) val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix] assert(matrix === nm) // Test conversion for empty matrix val empty = Array.empty[Double] val emptyMatrix = Matrices.dense(0, 0, empty) val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4)) val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix] assert(sm.toArray === nsm.toArray) val smt = new SparseMatrix( 3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9), isTransposed = true) val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix] assert(smt.toArray === nsmt.toArray) } test("pickle rating") { val rat = new Rating(1, 2, 3.0) val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating] assert(rat == rat2) // Test name of class only occur once val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray val bytes = SerDe.dumps(rats) assert(bytes.toString.split("Rating").length == 1) assert(bytes.length / 10 < 25) // 25 bytes per rating } }
Example 74
Source File: Prediction.scala From uberdata with Apache License 2.0 | 5 votes |
package eleflow.uberdata.data import eleflow.uberdata.model.TypeMixin.TrainedData import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import eleflow.uberdata.enums.SupportedAlgorithm case class Prediction(validationPrediction: RDD[(Double, Double)], model: TrainedData[scala.Serializable], testDataSet: RDD[((Double, Any), LabeledPoint)], validationPredictionId: RDD[(Any, Double)], trainPredictionId: RDD[(Any, LabeledPoint)], testPredictionId: RDD[(Any, Double)]) case class MultiplePrediction( multiplePredictionValidation: Map[SupportedAlgorithm.Algorithm, RDD[(Double, Double)]], validationDataSet: RDD[((Double, Any), LabeledPoint)], trainDataSet: RDD[((Double, Any), LabeledPoint)], multiplePredictionTest: Map[SupportedAlgorithm.Algorithm, RDD[(Any, Double)]], testDataSet: RDD[((Double, Any), LabeledPoint)], models: List[TrainedData[Serializable]] )
Example 75
Source File: QuadraticRenyiEntropy.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.prototype import breeze.linalg.DenseVector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import io.github.mandar2812.dynaml.kernels.DensityKernel override def entropy(data: List[DenseVector[Double]]): Double = { val dim = data.head.length val root_two: breeze.linalg.Vector[Double] = DenseVector.fill(dim, sqrt(2)) val product = for(i <- data.view; j <- data.view) yield (i, j) -1*log_e(product.map((couple) => { val point1: DenseVector[Double] = couple._1 / sqrt(2.0) val point2: DenseVector[Double] = couple._2 / sqrt(2.0) density.eval(point1 - point2) }).sum) } override def entropy[K](data: RDD[(K, LabeledPoint)]): Double = { val dim = data.first()._2.features.size -1*log_e(data.cartesian(data).map((couple) =>{ val point1: DenseVector[Double] = DenseVector(couple._1._2.features.toArray) / sqrt(2.0) val point2: DenseVector[Double] = DenseVector(couple._2._2.features.toArray) / sqrt(2.0) density.eval(point1 - point2) }).reduce((a,b) => a + b)) } def entropyDifference(entropy: Double, data: List[DenseVector[Double]], add: DenseVector[Double], remove: DenseVector[Double]): Double = { val dim = data.head.length val expEntropy = math.exp(-1.0*entropy) val product1 = for(i <- data.view) yield (remove, i) val subtractEnt = 2*product1.map((couple) => { density.eval((couple._1 - couple._2) / sqrt(2.0)) }).sum - density.eval(DenseVector.zeros(dim)) val product2 = for(i <- data.view) yield (add, i) val addEnt = 2*product2.map((couple) => { density.eval((couple._1 - couple._2) / sqrt(2.0)) }).sum - 2*density.eval((add - remove) / sqrt(2.0)) + density.eval(DenseVector.zeros(dim)) -1.0*log_e(expEntropy + addEnt - subtractEnt) - entropy } }
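For reference, the double sum above is the Parzen-window estimator of the quadratic Renyi entropy; in its usual normalized form it reads (a hedged summary, with G_sigma denoting the density kernel):

\hat{H}_2(X) = -\log\!\left( \frac{1}{N^2} \sum_{i=1}^{N} \sum_{j=1}^{N} G_{\sigma\sqrt{2}}(x_i - x_j) \right)

The code evaluates the kernel at (x_i - x_j)/sqrt(2) and drops the 1/N^2 factor, which only shifts the estimate by constant terms; those constants cancel in entropyDifference, where two entropies over sets of the same size are compared.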
Example 76
Source File: SparkLogisticGLM.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.lm //Breeze Imports import breeze.linalg.DenseVector import breeze.numerics.sigmoid import breeze.stats.distributions.Gaussian import io.github.mandar2812.dynaml.optimization.ProbitGradient import org.apache.spark.mllib.linalg.Vectors //DynaML Imports import io.github.mandar2812.dynaml.optimization.{ GradientDescentSpark, LogisticGradient, RegularizedOptimizer, SquaredL2Updater} //Spark Imports import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD class SparkProbitGLM( data: RDD[(DenseVector[Double], Double)], numPoints: Long, map: (DenseVector[Double]) => DenseVector[Double] = identity[DenseVector[Double]]) extends SparkLogisticGLM(data, numPoints, map) { private val standardGaussian = new Gaussian(0, 1.0) override val h: (Double) => Double = (x: Double) => standardGaussian.cdf(x) override protected val optimizer: RegularizedOptimizer[ DenseVector[Double], DenseVector[Double], Double, RDD[LabeledPoint]] = new GradientDescentSpark(new ProbitGradient, new SquaredL2Updater) }
Example 77
Source File: GradientDescentSpark.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.optimization import breeze.linalg.DenseVector import org.apache.log4j.{Logger, Priority} import org.apache.spark.AccumulatorParam import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD override def optimize(nPoints: Long, ParamOutEdges: RDD[LabeledPoint], initialP: DenseVector[Double]) : DenseVector[Double] = GradientDescentSpark.runBatchSGD( nPoints, this.regParam, this.numIterations, this.updater, this.gradient, this.stepSize, initialP, ParamOutEdges, this.miniBatchFraction ) } object GradientDescentSpark { private val logger = Logger.getLogger(this.getClass) def runBatchSGD( nPoints: Long, regParam: Double, numIterations: Int, updater: Updater, gradient: Gradient, stepSize: Double, initial: DenseVector[Double], POutEdges: RDD[LabeledPoint], miniBatchFraction: Double): DenseVector[Double] = { var count = 1 var oldW: DenseVector[Double] = initial var newW = oldW val sc = POutEdges.context val gradb = sc.broadcast(gradient) logger.log(Priority.INFO, "Training model using SGD") while(count <= numIterations) { val cumGradient = sc.accumulator(DenseVector.zeros[Double](initial.length))(new VectorAccumulator()) val wb = sc.broadcast(oldW) POutEdges sample(withReplacement = false, fraction = miniBatchFraction) foreach ((ed) => { val features = DenseVector(ed.features.toArray) val label = ed.label val (g, _) = gradb.value.compute(features, label, wb.value) cumGradient += g }) newW = updater.compute(oldW, cumGradient.value / nPoints.toDouble, stepSize, count, regParam)._1 oldW = newW count += 1 } newW } } class VectorAccumulator extends AccumulatorParam[DenseVector[Double]] { override def addInPlace(r1: DenseVector[Double], r2: DenseVector[Double]): DenseVector[Double] = r1 + r2 override def zero(initialValue: DenseVector[Double]): DenseVector[Double] = DenseVector.zeros(initialValue.length) }
Example 78
Source File: ConjugateGradientSpark.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.optimization import breeze.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import scala.util.Random def runCG(A: DenseMatrix[Double], b: DenseVector[Double], x: DenseVector[Double], epsilon: Double, MAX_ITERATIONS: Int): DenseVector[Double] = { val residual = b - (A*x) val p = residual var count = 1.0 var alpha = math.pow(norm(residual, 2), 2)/(p.t * (A*p)) var beta = 0.0 while(norm(residual, 2) >= epsilon && count <= MAX_ITERATIONS) { //update x axpy(alpha, p, x) //before updating residual, calculate norm (required for beta) val de = math.pow(norm(residual, 2), 2) //update residual axpy(-1.0*alpha, A*p, residual) //calculate beta beta = math.pow(norm(residual, 2), 2)/de //update p p :*= beta axpy(1.0, residual, p) //update alpha alpha = math.pow(norm(residual, 2), 2)/(p.t * (A*p)) count += 1 } x } }
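The runCG method above is the textbook conjugate gradient iteration for a symmetric positive-definite matrix A; written out, with x_0 given, r_0 = b - A x_0 and p_0 = r_0, it computes

\alpha_k = \frac{r_k^\top r_k}{p_k^\top A p_k}, \qquad
x_{k+1} = x_k + \alpha_k p_k, \qquad
r_{k+1} = r_k - \alpha_k A p_k, \qquad
\beta_k = \frac{r_{k+1}^\top r_{k+1}}{r_k^\top r_k}, \qquad
p_{k+1} = r_{k+1} + \beta_k p_k

and stops when the residual norm drops below epsilon or MAX_ITERATIONS is reached, exactly as in the while condition above.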
Example 79
Source File: Test_example_CNN.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package tests import org.apache.log4j.{ Level, Logger } import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.storage.StorageLevel import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.linalg.{ Vector, Vectors } import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.regression.LabeledPoint import breeze.linalg.{ Matrix => BM, CSCMatrix => BSM, DenseMatrix => BDM, Vector => BV, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy, svd => brzSvd, max => Bmax, min => Bmin, sum => Bsum } import scala.collection.mutable.ArrayBuffer import CNN.CNN object Test_example_CNN { def main(args: Array[String]) { // 1. Set up the Spark context val conf = new SparkConf().setAppName("CNNtest") val sc = new SparkContext(conf) // 2. Load the training data Logger.getRootLogger.setLevel(Level.WARN) val data_path = "/deeplearn/train_d3.txt" val examples = sc.textFile(data_path).cache() val train_d1 = examples.map { line => val f1 = line.split("\t") val f = f1.map(f => f.toDouble) val y = f.slice(0, 10) val x = f.slice(10, f.length) (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0) } val train_d = train_d1.map(f => (f._1, f._2)) // 3. Set the training parameters and train the CNN model // opts: training options val opts = Array(50.0, 1.0, 0.0) train_d.cache val numExamples = train_d.count() println(s"numExamples = $numExamples.") val CNNmodel = new CNN(). setMapsize(new BDM(1, 2, Array(28.0, 28.0))). setTypes(Array("i", "c", "s", "c", "s")). setLayer(5). setOnum(10). setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)). setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)). setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)). setAlpha(1.0). CNNtrain(train_d, opts) // 4. Evaluate the model val CNNforecast = CNNmodel.predict(train_d) val CNNerror = CNNmodel.Loss(CNNforecast) println(s"NNerror = $CNNerror.") val printf1 = CNNforecast.map(f => (f.label.data, f.predict_label.data)).take(200) println("Predicted values") for (i <- 0 until printf1.length) { val outi = printf1(i)._2.mkString("\t") println(outi) } } }
Example 80
Source File: HandsOnKMeanStreaming.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example import org.apache.spark._ import org.apache.spark.streaming._ import org.apache.spark.mllib.clustering.StreamingKMeans object HandsOnKMeanStreaming { def main(args: Array[String]): Unit = { val conf = new SparkConf().setMaster("local[2]").setAppName("HandsOnKMeanStreaming") val ssc = new StreamingContext(conf, Seconds(10)) val model = new StreamingKMeans(). setK(4). // number of clusters is 4 setDecayFactor(1.0). // decay factor (the forgetfulness of the previous centroids) setRandomCenters(3, 0.0) // 3 dimensions and 0 weight import org.apache.spark.mllib.linalg.Vectors val trainingData = ssc.textFileStream("file:/tmp/k-means-train-data").map(Vectors.parse).cache() trainingData.print() import org.apache.spark.mllib.regression.LabeledPoint val testData = ssc.textFileStream("file:/tmp/k-means-test-data").map(LabeledPoint.parse) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTerminationOrTimeout(1000*60*3) // Wait for the computation to terminate (3 minutes) } }
Example 81
Source File: HandsOnLinRegStreaming.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example import org.apache.spark._ import org.apache.spark.streaming._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD object HandsOnLinRegStreaming { def main(args: Array[String]): Unit = { val conf = new SparkConf().setMaster("local[2]").setAppName("HandsOnLinRegStreaming") val ssc = new StreamingContext(conf, Seconds(10)) val numFeatures = 3 val model = new StreamingLinearRegressionWithSGD().setInitialWeights(Vectors.zeros(numFeatures)) val trainingData = ssc.textFileStream("file:/tmp/lin-reg-train-data").map(LabeledPoint.parse).cache() trainingData.print() // output training data for debug purpose val testData = ssc.textFileStream("file:/tmp/lin-reg-test-data").map(LabeledPoint.parse) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTerminationOrTimeout(1000*60*3) // Wait for the computation to terminate (3 minutes) } }
Example 82
Source File: LinearRegExample.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example import org.apache.spark.sql.SparkSession object LinearRegExample { val homeDir = System.getProperty("user.home") def main(args: Array[String]): Unit = { // 1. Set Spark session val spark = SparkSession.builder().master("local").getOrCreate() // 2. Set logging level to WARNING spark.sparkContext.setLogLevel("WARN") // 3. Import necessary classes from Spark MLLib package that are needed for linear regression import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD // 4. Load the data val data = spark.sparkContext.textFile(s"${homeDir}/lpsa.data") // 5. Parse the data into LabeledPoint and cache val parsedData = data.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // 6. Build the model by setting number of iterations, step size val numIterations = 100 val stepSize = 0.00000001 val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) // 7. Evaluate model on training examples and compute training error val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() println(s"training Mean Squared Error $MSE") // 8. Save the model model.save(spark.sparkContext, s"${homeDir}/LinearRegressionWithSGDModel") // 9. Load the saved model val sameModel = LinearRegressionModel.load(spark.sparkContext, s"${homeDir}/LinearRegressionWithSGDModel") // 10. Output the model println(sameModel) } }
Example 83
Source File: LRAccuracyTest.scala From SparseML with Apache License 2.0 | 5 votes |
package MLlib import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel, SparseLogisticRegressionWithLBFGS} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkContext, SparkConf} object LRAccuracyTest { def main(args: Array[String]) { val conf = new SparkConf().setAppName(s"LogisticRegressionTest with $args").setMaster("local") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").map( l => LabeledPoint(l.label, l.features.toSparse)) // Split data into training (60%) and test (40%). val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) // Run training algorithm to build the model val model = new SparseLogisticRegressionWithLBFGS() .setNumClasses(5) .run(training) // Compute raw scores on the test set. val predictionAndLabels = test.map { case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } // Get evaluation metrics. val metrics = new MulticlassMetrics(predictionAndLabels) val precision = metrics.precision println("Precision = " + precision) } }
Example 84
Source File: MnistExample.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{KMeans, ScalableKMeans, SparseKMeans} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql.SparkSession object MnistExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val spark = SparkSession.builder.appName("svm").master("local[8]").getOrCreate() val trainRDD = spark.sparkContext.textFile("data/mnist/mnist_train.csv", 8) .map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => Vectors.dense(arr.slice(1, 785))) val model = new KMeans() .setK(10) .setInitializationMode("random") .setMaxIterations(10) .run(trainRDD) println("final clusters:") println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) } }
Example 85
Source File: KMeanTest.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector} import scala.util.Random //spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9 //guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15 object ScalableKMeanTest { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}") val sc = new SparkContext(conf) val k = args(0).toInt val dimension = args(1).toInt val recordNum = args(2).toInt val sparsity = args(3).toDouble val iterations = args(4).toInt val means = args(5) val parNumber = args(6).toInt val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => { val ran = new Random() val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray val vec: Vector = new SparseVector(dimension, indexArr, valueArr) vec }).cache() println(args.mkString(", ")) println(data.count() + " records generated") val st = System.nanoTime() val model = if(means == "my") { println("running scalable kmeans") val model = new ScalableKMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } else { println("running mllib kmeans") val model = new KMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } println((System.nanoTime() - st) / 1e9 + " seconds cost") println("final clusters: " + model.clusterCenters.length) println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) sc.stop() } }
Example 86
Source File: Classifier.scala From CSYE7200_Old with MIT License | 5 votes |
package edu.neu.coe.csye7200.spam import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.feature.HashingTF import org.apache.spark.mllib.classification.LogisticRegressionWithSGD import org.apache.spark.SparkConf import org.apache.spark.SparkContext object Classifier extends App { val conf = new SparkConf().setAppName("spam").setMaster("local[*]") val sc = new SparkContext(conf) val spam = sc.textFile("spark-app//input//test//spam.txt") val norm = sc.textFile("spark-app//input//test//normal.txt") val tf = new HashingTF(10000) val spamFeatures = spam.map(email => tf.transform(email.split(" "))) val normFeatures = norm.map(email => tf.transform(email.split(" "))) val posExamples = spamFeatures.map(f => LabeledPoint(1, f)) val negExamples = normFeatures.map(f => LabeledPoint(0, f)) val trainingData = posExamples.union(negExamples) trainingData.cache() val model = new LogisticRegressionWithSGD().run(trainingData) val posTest = tf.transform("Subject: Cheap Stuff From: <omg.fu> O M G GET cheap stuff by sending money to Robin Hillyard".split(" ")) val negTest = tf.transform("Subject: Spark From: Robin Hillyard<[email protected]> Hi Adam, I started studying Spark the other day".split(" ")) println(s"Prediction for positive test example: ${model.predict(posTest)}") println(s"Prediction for negative test example: ${model.predict(negTest)}") }
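The spam classifier above prints two single predictions but never measures accuracy. A hedged sketch of scoring the training set itself with BinaryClassificationMetrics (a real evaluation would use a held-out split; clearThreshold() makes the model emit raw scores instead of 0/1 labels):

import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

// Raw scores against true labels, on the same trainingData used above.
val rawModel = model.clearThreshold()
val scoreAndLabels = trainingData.map(p => (rawModel.predict(p.features), p.label))
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
println(s"Training AUROC: ${metrics.areaUnderROC()}")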
Example 87
Source File: MnistCSVDriver.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.{SparkConf, SparkContext} object MnistCSVDriver { def main(args: Array[String]) { val topology = new CNNTopology topology.addLayer(CNNLayer.buildConvolutionLayer(new Scale(28, 28))) topology.addLayer(CNNLayer.buildConvLayer(6, new Scale(5, 5))) topology.addLayer(CNNLayer.buildSampLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvLayer(12, new Scale(5, 5))) topology.addLayer(CNNLayer.buildSampLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvLayer(12, new Scale(4, 4))) val cnn: CNN = new CNN(topology).setMaxIterations(500000).setMiniBatchSize(16) Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/mnist/mnist_train.csv", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => new LabeledPoint(arr(0), Vectors.dense(arr.slice(1, 785).map(v => if(v > 0) 1.0 else 0)))) val start = System.nanoTime() cnn.trainOneByOne(data) println("Training time: " + (System.nanoTime() - start) / 1e9) } }
Example 88
Source File: PCAOnSourceVectorExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD // $example off$ object PCAOnSourceVectorExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnSourceVectorExample") val sc = new SparkContext(conf) // $example on$ val data: RDD[LabeledPoint] = sc.parallelize(Seq( new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)), new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)))) // Compute the top 5 principal components. val pca = new PCA(5).fit(data.map(_.features)) // Project vectors to the linear space spanned by the top 5 principal // components, keeping the label val projected = data.map(p => p.copy(features = pca.transform(p.features))) // $example off$ val collect = projected.collect() println("Projected vector of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 89
Source File: PCAExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} // $example off$ @deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0") object PCAExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAExample") val sc = new SparkContext(conf) // $example on$ val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) val numIterations = 100 val model = LinearRegressionWithSGD.train(training, numIterations) val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) val valuesAndPreds = test.map { point => val score = model.predict(point.features) (score, point.label) } val valuesAndPreds_pca = test_pca.map { point => val score = model_pca.predict(point.features) (score, point.label) } val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() println("Mean Squared Error = " + MSE) println("PCA Mean Squared Error = " + MSE_pca) // $example off$ sc.stop() } } // scalastyle:on println
Example 90
Source File: LinearRegressionWithSGDExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD // $example off$ @deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0") object LinearRegressionWithSGDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/ridge-data/lpsa.data") val parsedData = data.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // Building the model val numIterations = 100 val stepSize = 0.00000001 val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) // Evaluate model on training examples and compute training error val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() println("training Mean Squared Error = " + MSE) // Save and load model model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel") val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel") // $example off$ sc.stop() } } // scalastyle:on println
Example 91
Source File: StreamingLinearRegressionExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD // $example off$ import org.apache.spark.streaming._ object StreamingLinearRegressionExample { def main(args: Array[String]): Unit = { if (args.length != 2) { System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>") System.exit(1) } val conf = new SparkConf().setAppName("StreamingLinearRegressionExample") val ssc = new StreamingContext(conf, Seconds(1)) // $example on$ val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache() val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val numFeatures = 3 val model = new StreamingLinearRegressionWithSGD() .setInitialWeights(Vectors.zeros(numFeatures)) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ ssc.stop() } } // scalastyle:on println
Example 92
Source File: StreamingKMeansExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} // $example off$ object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } // $example on$ val conf = new SparkConf().setAppName("StreamingKMeansExample") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(args(3).toInt) .setDecayFactor(1.0) .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ } } // scalastyle:on println
Example 93
Source File: DataValidators.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @Since("1.3.0") def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data => val numInvalid = data.filter(x => x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count() if (numInvalid != 0) { logError("Classification labels should be in {0 to " + (k - 1) + "}. " + "Found " + numInvalid + " invalid labels") } numInvalid == 0 } }
Example 94
Source File: LogisticRegressionDataGenerator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @Since("0.8.0") def generateLogisticRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } @Since("0.8.0") def main(args: Array[String]) { if (args.length != 5) { // scalastyle:off println println("Usage: LogisticRegressionGenerator " + "<master> <output_dir> <num_examples> <num_features> <num_partitions>") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val eps = 3 val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator") val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts) data.saveAsTextFile(outputPath) sc.stop() } }
Example 95
Source File: SVMDataGenerator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object SVMDataGenerator { @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 96
Source File: ChiSqSelectorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { test("ChiSqSelector transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = Seq(LabeledPoint(0.0, Vectors.dense(Array(8.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSeq assert(filteredData === preFilteredData) } test("ChiSqSelector by fpr transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2) val preFilteredData = Seq(LabeledPoint(0.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(2.0, Vectors.dense(Array(9.0)))) val model: ChiSqSelectorModel = new ChiSqSelector().setSelectorType("fpr") .setFpr(0.1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSeq assert(filteredData === preFilteredData) } test("model load / save") { val model = ChiSqSelectorSuite.createModel() val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString try { model.save(sc, path) val sameModel = ChiSqSelectorModel.load(sc, path) ChiSqSelectorSuite.checkEqual(model, sameModel) } finally { Utils.deleteRecursively(tempDir) } } } object ChiSqSelectorSuite extends SparkFunSuite { def createModel(): ChiSqSelectorModel = { val arr = Array(1, 2, 3, 4) new ChiSqSelectorModel(arr) } def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = { assert(a.selectedFeatures.deep == b.selectedFeatures.deep) } }
Example 97
Source File: EnsembleTestHelper.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 98
Source File: PythonMLLibAPISuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors} import org.apache.spark.mllib.recommendation.Rating import org.apache.spark.mllib.regression.LabeledPoint class PythonMLLibAPISuite extends SparkFunSuite { SerDe.initialize() test("pickle vector") { val vectors = Seq( Vectors.dense(Array.empty[Double]), Vectors.dense(0.0), Vectors.dense(0.0, -2.0), Vectors.sparse(0, Array.empty[Int], Array.empty[Double]), Vectors.sparse(1, Array.empty[Int], Array.empty[Double]), Vectors.sparse(2, Array(1), Array(-2.0))) vectors.foreach { v => val u = SerDe.loads(SerDe.dumps(v)) assert(u.getClass === v.getClass) assert(u === v) } } test("pickle labeled point") { val points = Seq( LabeledPoint(0.0, Vectors.dense(Array.empty[Double])), LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)), LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])), LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])), LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0)))) points.foreach { p => val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint] assert(q.label === p.label) assert(q.features.getClass === p.features.getClass) assert(q.features === p.features) } } test("pickle double") { for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) { val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double] // We use `equals` here for comparison because we cannot use `==` for NaN assert(x.equals(deser)) } } test("pickle matrix") { val values = Array[Double](0, 1.2, 3, 4.56, 7, 8) val matrix = Matrices.dense(2, 3, values) val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix] assert(matrix === nm) // Test conversion for empty matrix val empty = Array.empty[Double] val emptyMatrix = Matrices.dense(0, 0, empty) val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4)) val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix] assert(sm.toArray === nsm.toArray) val smt = new SparseMatrix( 3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9), isTransposed = true) val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix] assert(smt.toArray === nsmt.toArray) } test("pickle rating") { val rat = new Rating(1, 2, 3.0) val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating] assert(rat == rat2) // Test name of class only occur once val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray val bytes = SerDe.dumps(rats) assert(bytes.toString.split("Rating").length == 1) assert(bytes.length / 10 < 25) // 25 bytes per rating } }
Example 99
Source File: MllibHelper.scala From twitter-stream-ml with GNU General Public License v3.0 | 5 votes |
package com.giorgioinf.twtml.spark

import java.text.Normalizer
import org.apache.spark.Logging
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import scala.math.BigDecimal
import twitter4j.Status

object MllibHelper extends Logging {

  val numNumberFeatures = 4

  var numRetweetBegin = 100
  var numRetweetEnd = 1000
  var numTextFeatures = 1000
  var hashText = new HashingTF(numTextFeatures)
  var numFeatures = numTextFeatures + numNumberFeatures
  var numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray

  def reset(conf:ConfArguments) {
    numRetweetBegin = conf.numRetweetBegin
    numRetweetEnd = conf.numRetweetEnd
    numTextFeatures = conf.numTextFeatures
    // reassign the object-level fields (declaring new local vars here would leave them unchanged)
    hashText = new HashingTF(numTextFeatures)
    numFeatures = numTextFeatures + numNumberFeatures
    numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray
    log.debug(s"retweet range: ($numRetweetBegin - $numRetweetEnd), numTextFeatures: $numTextFeatures")
  }

  def featurizeText(statuses: Status): SparseVector = {
    val text = statuses.getRetweetedStatus
      .getText
      .toLowerCase

    // Separate accents from characters and then remove non-unicode
    // characters
    val noAccentText = Normalizer
      .normalize(text, Normalizer.Form.NFD)
      .replaceAll("\\p{M}", "")

    // bigrams of the accent-stripped text
    hashText.transform(noAccentText.sliding(2).toSeq)
      .asInstanceOf[SparseVector]
  }

  def featurizeNumbers(statuses: Status): Vector = {
    val user = statuses.getRetweetedStatus.getUser
    val created = statuses.getRetweetedStatus.getCreatedAt
    val timeLeft = (System.currentTimeMillis - created.getTime)

    Vectors.dense(
      user.getFollowersCount * Math.pow(10, -12),
      user.getFavouritesCount * Math.pow(10, -12),
      user.getFriendsCount * Math.pow(10, -12),
      timeLeft * Math.pow(10, -14)
      //retweeted.getURLEntities.length,
      //retweeted.getUserMentionEntities.length
    )
  }

  def featurize(statuses: Status): LabeledPoint = {
    val textFeatures = featurizeText(statuses)
    val numberFeatures = featurizeNumbers(statuses)
    val features = Vectors.sparse(
      numFeatures,
      textFeatures.indices ++ numberFeatureIndices,
      textFeatures.values ++ numberFeatures.toArray
    )
    LabeledPoint(statuses.getRetweetedStatus.getRetweetCount.toDouble, features)
  }

  def retweetInterval(statuses: Status, start:Long, end:Long):Boolean = {
    val n = statuses.getRetweetedStatus.getRetweetCount
    (n >= start && n <= end)
  }

  def filtrate(statuses: Status): Boolean = {
    (
      statuses.isRetweet &&
      //statuses.getLang == "en" &&
      retweetInterval(statuses, numRetweetBegin, numRetweetEnd)
    )
  }
}
Example 100
Source File: DatasetExample.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import java.io.File

import com.google.common.io.Files
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext, DataFrame}

object DatasetExample {

  case class Params(
      input: String = "data/mllib/sample_libsvm_data.txt",
      dataFormat: String = "libsvm") extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DatasetExample") {
      head("Dataset: an example app using DataFrame as a Dataset for ML.")
      opt[String]("input")
        .text(s"input path to dataset")
        .action((x, c) => c.copy(input = x))
      opt[String]("dataFormat")
        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
        .action((x, c) => c.copy(dataFormat = x)) // copy into dataFormat, not input
      checkConfig { params => success }
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DatasetExample with $params")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._ // for implicit conversions

    // Load input data
    val origData: RDD[LabeledPoint] = params.dataFormat match {
      case "dense" => MLUtils.loadLabeledPoints(sc, params.input)
      case "libsvm" => MLUtils.loadLibSVMFile(sc, params.input)
    }
    println(s"Loaded ${origData.count()} instances from file: ${params.input}")

    // Convert input data to DataFrame explicitly.
    val df: DataFrame = origData.toDF()
    println(s"Inferred schema:\n${df.schema.prettyJson}")
    println(s"Converted to DataFrame with ${df.count()} records")

    // Select columns
    val labelsDf: DataFrame = df.select("label")
    val labels: RDD[Double] = labelsDf.map { case Row(v: Double) => v }
    val numLabels = labels.count()
    val meanLabel = labels.fold(0.0)(_ + _) / numLabels
    println(s"Selected label column with average value $meanLabel")

    val featuresDf: DataFrame = df.select("features")
    val features: RDD[Vector] = featuresDf.map { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    val tmpDir = Files.createTempDir()
    tmpDir.deleteOnExit()
    val outputDir = new File(tmpDir, "dataset").toString
    println(s"Saving to $outputDir as Parquet file.")
    df.write.parquet(outputDir)

    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDataset = sqlContext.read.parquet(outputDir)

    println(s"Schema from Parquet: ${newDataset.schema.prettyJson}")
    val newFeatures = newDataset.select("features").map { case Row(v: Vector) => v }
    val newFeaturesSummary = newFeatures.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${newFeaturesSummary.mean.toString}")

    sc.stop()
  }
}
Example 101
Source File: StreamingKMeansExample.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } val conf = new SparkConf().setMaster("local").setAppName("StreamingKMeansExample") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(args(3).toInt) .setDecayFactor(1.0) .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() } }
Example 102
Source File: DataValidators.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.annotation.DeveloperApi import org.apache.spark.Logging import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data => val numInvalid = data.filter(x => x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count() if (numInvalid != 0) { logError("Classification labels should be in {0 to " + (k - 1) + "}. " + "Found " + numInvalid + " invalid labels") } numInvalid == 0 } }
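The snippet above keeps only the multiLabelValidator helper from DataValidators. A minimal usage sketch, assuming an existing SparkContext named sc and labels in {0, 1, 2} (the sample data below is made up for illustration):

// Sketch: a validator is just a function RDD[LabeledPoint] => Boolean; here k = 3 classes.
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.DataValidators

val trainingData = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0, 2.0)),
  LabeledPoint(2.0, Vectors.dense(0.5, -1.0))))

val isValid = DataValidators.multiLabelValidator(3)(trainingData)
println(s"labels valid for 3-class training: $isValid")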
Example 103
Source File: LogisticRegressionDataGenerator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.annotation.DeveloperApi import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.Vectors def generateLogisticRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } def main(args: Array[String]) { if (args.length != 5) { println("Usage: LogisticRegressionGenerator " + "<master> <output_dir> <num_examples> <num_features> <num_partitions>") System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val eps = 3 val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator") val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts) data.saveAsTextFile(outputPath) sc.stop() } }
Example 104
Source File: SVMDataGenerator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint @DeveloperApi object SVMDataGenerator { def main(args: Array[String]) { if (args.length < 2) { println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 105
Source File: RandomForestRegressorSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.impl.TreeTests import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => OldRandomForest} import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame def compareAPIs( data: RDD[LabeledPoint], rf: RandomForestRegressor, categoricalFeatures: Map[Int, Int]): Unit = { val oldStrategy = rf.getOldStrategy(categoricalFeatures, numClasses = 0, OldAlgo.Regression, rf.getOldImpurity) val oldModel = OldRandomForest.trainRegressor( data, oldStrategy, rf.getNumTrees, rf.getFeatureSubsetStrategy, rf.getSeed.toInt) val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0) val newModel = rf.fit(newData) // Use parent from newTree since this is not checked anyways. val oldModelAsNew = RandomForestRegressionModel.fromOld( oldModel, newModel.parent.asInstanceOf[RandomForestRegressor], categoricalFeatures) TreeTests.checkEqual(oldModelAsNew, newModel) } }
Example 106
Source File: DecisionTreeRegressorSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.impl.TreeTests import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree, DecisionTreeSuite => OldDecisionTreeSuite} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame class DecisionTreeRegressorSuite extends SparkFunSuite with MLlibTestSparkContext { import DecisionTreeRegressorSuite.compareAPIs private var categoricalDataPointsRDD: RDD[LabeledPoint] = _ override def beforeAll() { super.beforeAll() categoricalDataPointsRDD = sc.parallelize(OldDecisionTreeSuite.generateCategoricalDataPoints()) } ///////////////////////////////////////////////////////////////////////////// // Tests calling train() ///////////////////////////////////////////////////////////////////////////// test("Regression stump with 3-ary (ordered) categorical features") { val dt = new DecisionTreeRegressor() .setImpurity("variance") .setMaxDepth(2) .setMaxBins(100) val categoricalFeatures = Map(0 -> 3, 1-> 3) compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures) } test("Regression stump with binary (ordered) categorical features") { val dt = new DecisionTreeRegressor() .setImpurity("variance") .setMaxDepth(2) .setMaxBins(100) val categoricalFeatures = Map(0 -> 2, 1-> 2) compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures) } ///////////////////////////////////////////////////////////////////////////// // Tests of model save/load ///////////////////////////////////////////////////////////////////////////// // TODO: test("model save/load") SPARK-6725 } private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite { def compareAPIs( data: RDD[LabeledPoint], dt: DecisionTreeRegressor, categoricalFeatures: Map[Int, Int]): Unit = { val oldStrategy = dt.getOldStrategy(categoricalFeatures) val oldTree = OldDecisionTree.train(data, oldStrategy) val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0) val newTree = dt.fit(newData) // Use parent from newTree since this is not checked anyways. val oldTreeAsNew = DecisionTreeRegressionModel.fromOld( oldTree, newTree.parent.asInstanceOf[DecisionTreeRegressor], categoricalFeatures) TreeTests.checkEqual(oldTreeAsNew, newTree) } }
Example 107
Source File: ChiSqSelectorSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { test("ChiSqSelector transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(6.0))), LabeledPoint(1.0, Vectors.dense(Array(8.0))), LabeledPoint(2.0, Vectors.dense(Array(5.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } }
Example 108
Source File: EnsembleTestHelper.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter import scala.collection.mutable object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input.map(_.label)).map { case (prediction, label) => prediction - label } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 109
Source File: PythonMLLibAPISuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, SparseMatrix} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.recommendation.Rating class PythonMLLibAPISuite extends SparkFunSuite { SerDe.initialize() test("pickle vector") { val vectors = Seq( Vectors.dense(Array.empty[Double]), Vectors.dense(0.0), Vectors.dense(0.0, -2.0), Vectors.sparse(0, Array.empty[Int], Array.empty[Double]), Vectors.sparse(1, Array.empty[Int], Array.empty[Double]), Vectors.sparse(2, Array(1), Array(-2.0))) vectors.foreach { v => val u = SerDe.loads(SerDe.dumps(v)) assert(u.getClass === v.getClass) assert(u === v) } } test("pickle labeled point") { val points = Seq( LabeledPoint(0.0, Vectors.dense(Array.empty[Double])), LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)), LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])), LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])), LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0)))) points.foreach { p => val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint] assert(q.label === p.label) assert(q.features.getClass === p.features.getClass) assert(q.features === p.features) } } test("pickle double") { for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) { val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double] // We use `equals` here for comparison because we cannot use `==` for NaN assert(x.equals(deser)) } } test("pickle matrix") { val values = Array[Double](0, 1.2, 3, 4.56, 7, 8) val matrix = Matrices.dense(2, 3, values) val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix] assert(matrix === nm) // Test conversion for empty matrix val empty = Array[Double]() val emptyMatrix = Matrices.dense(0, 0, empty) val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4)) val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix] assert(sm.toArray === nsm.toArray) val smt = new SparseMatrix( 3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9), isTransposed = true) val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix] assert(smt.toArray === nsmt.toArray) } test("pickle rating") { val rat = new Rating(1, 2, 3.0) val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating] assert(rat == rat2) // Test name of class only occur once val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray val bytes = SerDe.dumps(rats) assert(bytes.toString.split("Rating").length == 1) assert(bytes.length / 10 < 25) // 25 bytes per rating } }
Example 110
Source File: TreePoint.scala From sparkxgboost with Apache License 2.0 | 5 votes |
package rotationsymmetry.sxgboost import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD private[sxgboost] case class TreePoint(label: Double, binnedFeature: Array[Int]) private[sxgboost] object TreePoint { def convertToTreeRDD( input: RDD[LabeledPoint], splitsBundle: Array[Array[Split]]): RDD[TreePoint] = { val thresholdsBundle: Array[Array[Double]] = splitsBundle.map { splits => splits.map(_.asInstanceOf[OrderedSplit].threshold) } input.map { x => TreePoint.labeledPointToTreePoint(x, thresholdsBundle) } } def labeledPointToTreePoint( labeledPoint: LabeledPoint, thresholdsBundle: Array[Array[Double]]): TreePoint = { val numFeatures = labeledPoint.features.size val bins = new Array[Int](numFeatures) var featureIndex = 0 while (featureIndex < numFeatures) { bins(featureIndex) = findBin(featureIndex, labeledPoint, thresholdsBundle(featureIndex)) featureIndex += 1 } new TreePoint(labeledPoint.label, bins) } def findBin( featureIndex: Int, labeledPoint: LabeledPoint, thresholds: Array[Double]): Int = { val featureValue = labeledPoint.features(featureIndex) val idx = java.util.Arrays.binarySearch(thresholds, featureValue) if (idx >= 0) { idx } else { -idx - 1 } } }
Example 111
Source File: LogisticLoss.scala From sparkxgboost with Apache License 2.0 | 5 votes |
package rotationsymmetry.sxgboost.loss

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

class LogisticLoss extends Loss {

  override def diff1(label: Double, f: Double): Double = {
    val e = Math.exp(f)
    -label + e / (1 + e)
  }

  override def diff2(label: Double, f: Double): Double = {
    val e = Math.exp(f)
    e / Math.pow(1 + e, 2)
  }

  override def toPrediction(score: Double): Double = {
    1 / (1 + Math.exp(-score))
  }

  override def getInitialBias(input: RDD[LabeledPoint]): Double = {
    val totalWeight = input.count()
    val scaledLabels = input.map(lp => lp.label / totalWeight)
    val p = scaledLabels.treeReduce(_+_)
    Math.log(p / (1 - p))
  }
}
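For reference, diff1 above reduces to sigmoid(f) - label, since e / (1 + e) = 1 / (1 + exp(-f)). A quick sanity-check sketch, assuming the sxgboost classes are on the classpath:

// Sketch: compare diff1 against the closed form sigmoid(f) - label.
import rotationsymmetry.sxgboost.loss.LogisticLoss

val loss = new LogisticLoss()
val sigmoid = (f: Double) => 1.0 / (1.0 + math.exp(-f))
val (label, f) = (1.0, 0.3)
assert(math.abs(loss.diff1(label, f) - (sigmoid(f) - label)) < 1e-12)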
Example 112
Source File: SquareLoss.scala From sparkxgboost with Apache License 2.0 | 5 votes |
package rotationsymmetry.sxgboost.loss import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD class SquareLoss extends Loss { override def diff1(label: Double, f: Double): Double = 2 * (f - label) override def diff2(label: Double, f: Double): Double = 2.0 override def toPrediction(score: Double): Double = score override def getInitialBias(input: RDD[LabeledPoint]): Double = { val totalWeight = input.count() val scaledLabels = input.map(lp => lp.label / totalWeight) scaledLabels.treeReduce(_+_) } }
Example 113
Source File: MetaData.scala From sparkxgboost with Apache License 2.0 | 5 votes |
package rotationsymmetry.sxgboost import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD private[sxgboost] class MetaData( val numFeatures: Int, val numBins: Array[Int]) extends Serializable { } private[sxgboost] object MetaData { def getMetaData(input: RDD[LabeledPoint], splits: Array[Array[Split]]): MetaData = { val numFeatures = input.first().features.size // The number of Bins is the number of splits + 1 val numBins = splits.map(_.length + 1) new MetaData(numFeatures, numBins) } }
Example 114
Source File: TestData.scala From sparkxgboost with Apache License 2.0 | 5 votes |
package rotationsymmetry.sxgboost import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD trait TestData { val simpleData = Seq( LabeledPoint(0.1, Vectors.dense(0, 0)), LabeledPoint(0.2, Vectors.dense(0, 1)), LabeledPoint(0.3, Vectors.dense(0, 2)), LabeledPoint(0.4, Vectors.dense(1, 0)), LabeledPoint(0.5, Vectors.dense(1, 1)), LabeledPoint(0.6, Vectors.dense(1, 2)) ) val simpleBinnedData = Seq( TreePoint(0.1, Array(0, 0)), TreePoint(0.2, Array(0, 1)), TreePoint(0.3, Array(0, 2)), TreePoint(0.4, Array(1, 0)), TreePoint(0.5, Array(1, 1)), TreePoint(0.6, Array(1, 2)) ) val simpleMetaData = new MetaData(2, Array(3, 4)) def randomLabelPointRDD( sc: SparkContext, numRows: Long, numCols: Int, numPartitions: Int, seed: Long): RDD[LabeledPoint] = { val featuresBundle = RandomRDDs.normalVectorRDD(sc, numRows, numCols, numPartitions, seed) val labels = RandomRDDs.normalRDD(sc, numRows, numPartitions, seed + 999) (labels zip featuresBundle).map { case (label, features) => LabeledPoint(label, features)} } }
Example 115
Source File: TreePointSuite.scala From sparkxgboost with Apache License 2.0 | 5 votes |
package rotationsymmetry.sxgboost import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.scalatest.FunSuite class TreePointSuite extends FunSuite{ test("findBin") { val labeledPoint = LabeledPoint(0, Vectors.dense(0.0, 1.0, 1.5, 2.1)) val thresholds = Array[Double](1.0, 2.0) assert(TreePoint.findBin(0, labeledPoint, thresholds) == 0) assert(TreePoint.findBin(1, labeledPoint, thresholds) == 0) assert(TreePoint.findBin(2, labeledPoint, thresholds) == 1) assert(TreePoint.findBin(3, labeledPoint, thresholds) == 2) } }
Example 116
Source File: LogisticLossSuite.scala From sparkxgboost with Apache License 2.0 | 5 votes |
package rotationsymmetry.sxgboost.loss import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.scalatest.FunSuite import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext import rotationsymmetry.sxgboost.utils.TestingUtils._ class LogisticLossSuite extends FunSuite with MLlibTestSparkContext with NumericDiff{ val loss = new LogisticLoss() def numericLoss(label: Double, f: Double): Double = { - label * f + Math.log(1 + Math.exp(f)) } test("diff's match numerical counterparts") { val delta =0.001 Seq[Double](0.0, 1.0).foreach { label => Seq[Double](-1.5, -0.8, 0, 0.8, 1.5).foreach { f => assert(numericDiff1(label, f, delta) ~== loss.diff1(label, f) relTol 1e-3) assert(numericDiff2(label, f, delta) ~== loss.diff2(label, f) relTol 1e-3) } } } test("initial bias") { val data = Seq( LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(0.0, Vectors.dense(0.0)) ) val p = data.map(lp => lp.label).sum / data.length val theta = Math.log(p / (1 - p)) val rdd = sc.parallelize(data, 2) assert(loss.getInitialBias(rdd) ~== theta relTol 1e-5) } test("prediction from score"){ assert(loss.toPrediction(0.0) ~== 0.5 relTol 1e-5) assert(loss.toPrediction(1.0) ~== 0.7310586 relTol 1e-5) assert(loss.toPrediction(-1.0) ~== 0.2689414 relTol 1e-5) } }
Example 117
Source File: PoissonLossSuite.scala From sparkxgboost with Apache License 2.0 | 5 votes |
package rotationsymmetry.sxgboost.loss import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.scalatest.FunSuite import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext import rotationsymmetry.sxgboost.utils.TestingUtils._ class PoissonLossSuite extends FunSuite with MLlibTestSparkContext with NumericDiff { val loss = new PoissonLoss() def numericLoss(label: Double, f: Double): Double = { - label * Math.log(f) + f } test("diff's match numerical counterparts") { val delta = 0.0001 Seq[Double](0.0, 1.0, 2.0, 3.0).foreach { label => Seq[Double](0.1, 0.8, 1.5).foreach { f => assert(numericDiff1(label, f, delta) ~== loss.diff1(label, f) relTol 1e-3) assert(numericDiff2(label, f, delta) ~== loss.diff2(label, f) relTol 1e-3) } } } test("initial bias") { val data = Seq( LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(0.0, Vectors.dense(0.0)) ) val mean = data.map(lp => lp.label).sum / data.length val rdd = sc.parallelize(data, 2) assert(loss.getInitialBias(rdd) ~== mean relTol 1e-5) } test("prediction from score") { assert(loss.toPrediction(0.0) === 0.0) assert(loss.toPrediction(1.0) === 1.0) assert(loss.toPrediction(1.5) === 1.5) } }
Example 118
Source File: SparkXGBoostClassifierSuite.scala From sparkxgboost with Apache License 2.0 | 5 votes |
package rotationsymmetry.sxgboost import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.VectorIndexer import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql.functions.udf import org.scalatest.FunSuite import rotationsymmetry.sxgboost.loss.LogisticLoss import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext class SparkXGBoostClassifierSuite extends FunSuite with TestData with MLlibTestSparkContext { test("test with simple data") { val rawdata = Seq( LabeledPoint(0, Vectors.dense(0.0, 0.0)), LabeledPoint(0, Vectors.dense(0.0, 0.0)), LabeledPoint(1, Vectors.dense(0.0, 0.0)), LabeledPoint(1, Vectors.dense(1.0, 0.0)), LabeledPoint(1, Vectors.dense(1.0, 0.0)), LabeledPoint(0, Vectors.dense(1.0, 0.0)), LabeledPoint(1, Vectors.dense(0.0, 1.0)), LabeledPoint(1, Vectors.dense(0.0, 1.0)), LabeledPoint(0, Vectors.dense(0.0, 1.0)), LabeledPoint(0, Vectors.dense(1.0, 1.0)), LabeledPoint(0, Vectors.dense(1.0, 1.0)), LabeledPoint(1, Vectors.dense(1.0, 1.0)) ) val data = sqlContext.createDataFrame(sc.parallelize(rawdata, 2)) val truthUDF = udf { feature: Vector => if (feature(0) == feature(1)) 0.0 else 1.0 } val dataWithTruth = data.withColumn("truth", truthUDF(data("features"))) val featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") .setMaxCategories(2) .fit(data) val sparkXGBoostClassifier = new SparkXGBoostClassifier(new LogisticLoss) .setFeaturesCol("indexedFeatures") .setMaxDepth(2) .setNumTrees(1) val sparkXGBoostPipeline = new Pipeline() .setStages(Array(featureIndexer, sparkXGBoostClassifier)) val sXGBoostModel = sparkXGBoostPipeline.fit(data) val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("truth") .setPredictionCol("prediction") .setMetricName("precision") val precision = evaluator.evaluate(sXGBoostModel.transform(dataWithTruth)) assert(precision === 1.0) } }
Example 119
Source File: SVMWithSGDDemo.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

    // Logistic regression trained with the L-BFGS optimizer; supports multiclass classification
    // (BFGS is a rank-two quasi-Newton method)
    val modelBFGS = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training)
    // Compute raw scores on the test set.
    val predictionAndLabels = test.map {
      // A LabeledPoint pairs a local vector (dense or sparse) with a label
      case LabeledPoint(label, features) =>
        val prediction = modelBFGS.predict(features)
        (prediction, label)
    }
    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)
  }
}
Example 120
Source File: LogisticRegressionWithLBFGSDeom.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

    // Logistic regression trained with the L-BFGS optimizer; supports multiclass classification
    // (BFGS is a rank-two quasi-Newton method)
    val modelBFGS = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training)
    // Compute raw scores on the test set.
    val predictionAndLabels = test.map {
      // A LabeledPoint pairs a local vector (dense or sparse) with a label
      case LabeledPoint(label, features) =>
        val prediction = modelBFGS.predict(features)
        (prediction, label)
    }
    // Get evaluation metrics.
    val metricsBFGS = new MulticlassMetrics(predictionAndLabels)
    val precision = metricsBFGS.precision
    println("Precision = " + precision)
  }
}
Example 121
Source File: StreamingLinearRegression.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingLinearRegression {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: StreamingLinearRegression <trainingDir> <testDir> <batchDuration> <numFeatures>")
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingLinearRegression")
    // batch interval
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))
    // A LabeledPoint pairs a local vector (dense or sparse) with a label
    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingLinearRegressionWithSGD() // SGD: stochastic gradient descent
      // initial weights; the default is the zero vector
      .setInitialWeights(Vectors.zeros(args(3).toInt))

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
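The training and test directories above are read as text files and parsed with LabeledPoint.parse, which accepts MLlib's one-line LabeledPoint format. A small sketch of that format (the values are chosen only for illustration):

// Sketch: the dense one-line text format accepted by LabeledPoint.parse
import org.apache.spark.mllib.regression.LabeledPoint

val lp = LabeledPoint.parse("(1.0,[0.5,-0.3,2.0])")
println(lp.label)        // 1.0
println(lp.features(2))  // 2.0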
Example 122
Source File: DecisionTreeExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

    // Load the file
    val data = sc.textFile("../data/mllib/tennis.csv")
    // Parse the data and load it into LabeledPoints
    val parsedData = data.map { line =>
      val parts = line.split(',').map(_.toDouble)
      // A LabeledPoint pairs a local vector (dense or sparse) with a label
      LabeledPoint(parts(0), Vectors.dense(parts.tail))
    }
    // Train the algorithm on this data
    val model = DecisionTree.train(parsedData, Classification, Entropy, 3)
    // Create a vector representing no rain, strong wind, and low temperature
    val v = Vectors.dense(0.0, 1.0, 0.0)
    // Predict whether to play tennis
    model.predict(v)
  }
}
Example 123
Source File: LogisticRegressionWithLBFGSExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.linalg.{ Vector, Vectors }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.regression.LabeledPoint

    // A LabeledPoint pairs a local vector (dense or sparse) with a label
    val points = Array(
      LabeledPoint(0.0, Vectors.dense(0.245)),
      LabeledPoint(0.0, Vectors.dense(0.247)),
      LabeledPoint(1.0, Vectors.dense(0.285)),
      LabeledPoint(1.0, Vectors.dense(0.299)),
      LabeledPoint(1.0, Vectors.dense(0.327)),
      LabeledPoint(1.0, Vectors.dense(0.347)),
      LabeledPoint(0.0, Vectors.dense(0.356)),
      LabeledPoint(1.0, Vectors.dense(0.36)),
      LabeledPoint(0.0, Vectors.dense(0.363)),
      LabeledPoint(1.0, Vectors.dense(0.364)),
      LabeledPoint(0.0, Vectors.dense(0.398)),
      LabeledPoint(1.0, Vectors.dense(0.4)),
      LabeledPoint(0.0, Vectors.dense(0.409)),
      LabeledPoint(1.0, Vectors.dense(0.421)),
      LabeledPoint(0.0, Vectors.dense(0.432)),
      LabeledPoint(1.0, Vectors.dense(0.473)),
      LabeledPoint(1.0, Vectors.dense(0.509)),
      LabeledPoint(1.0, Vectors.dense(0.529)),
      LabeledPoint(0.0, Vectors.dense(0.561)),
      LabeledPoint(0.0, Vectors.dense(0.569)),
      LabeledPoint(1.0, Vectors.dense(0.594)),
      LabeledPoint(1.0, Vectors.dense(0.638)),
      LabeledPoint(1.0, Vectors.dense(0.656)),
      LabeledPoint(1.0, Vectors.dense(0.816)),
      LabeledPoint(1.0, Vectors.dense(0.853)),
      LabeledPoint(1.0, Vectors.dense(0.938)),
      LabeledPoint(1.0, Vectors.dense(1.036)),
      LabeledPoint(1.0, Vectors.dense(1.045)))

    // Create an RDD from the data above
    val spiderRDD = sc.parallelize(points)
    // Train a model on the data (fitting an intercept is meaningful when all predictors are 0)
    // Logistic regression trained with the L-BFGS optimizer; supports multiclass classification
    val lr = new LogisticRegressionWithLBFGS().setIntercept(true)
    val model = lr.run(spiderRDD)
    // Predict the outcome for a spider of size 0.938
    val predict = model.predict(Vectors.dense(0.938))
  }
}
Example 124
Source File: StreamingLogisticRegression.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingLogisticRegression {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: StreamingLogisticRegression <trainingDir> <testDir> <batchDuration> <numFeatures>")
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingLogisticRegression")
    // batch interval
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    // A LabeledPoint pairs a local vector (dense or sparse) with a label
    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    // SGD-based logistic regression; supports binary classification only
    val model = new StreamingLogisticRegressionWithSGD()
      // initial weights; the default is the zero vector
      .setInitialWeights(Vectors.zeros(args(3).toInt))

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 125
Source File: SVMWithSGDExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.SparkConf

object SVMWithSGDExample {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SVMWithSGDExample").setMaster("local[4]")
    val sc = new SparkContext(conf)
    // Load the data as an RDD
    val svmData = MLUtils.loadLibSVMFile(sc, "../data/mllib/sample_libsvm_data.txt")
    // Count the number of records
    svmData.count
    // Split the dataset in half: one half for training, one half for testing
    val trainingAndTest = svmData.randomSplit(Array(0.5, 0.5))
    // Assign the training and test data
    val trainingData = trainingAndTest(0)
    val testData = trainingAndTest(1)
    // Train the algorithm and build the model over 100 iterations (SGD: stochastic gradient descent)
    val model = SVMWithSGD.train(trainingData, 100)
    // The model can predict a label for any feature vector; try the first point of the test data
    val label = model.predict(testData.first.features)
    // Create tuples whose first element is the predicted label and second is the actual label
    val predictionsAndLabels = testData.map(r => (model.predict(r.features), r.label))
    // Count how many predicted labels do not match the actual labels
    predictionsAndLabels.filter(p => p._1 != p._2).count
  }
}
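The example ends by counting mismatches without reporting them. A short follow-up sketch, reusing the predictionsAndLabels and testData values defined above, turns that count into a test error rate:

// Sketch: report the fraction of test points the SVM misclassifies.
val numTest = testData.count()
val numWrong = predictionsAndLabels.filter(p => p._1 != p._2).count()
val testError = numWrong.toDouble / numTest
println(s"Test error = $testError")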
Example 126
Source File: DecisionTreeTest.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Gini

object DecisionTreeTest {

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KMeansClustering")
    val sc = new SparkContext(sparkConf)

    val data = sc.textFile("../data/mllib/sample_tree_data.csv")
    val parsedData = data.map { line =>
      val parts = line.split(',').map(_.toDouble)
      // A LabeledPoint pairs a local vector (dense or sparse) with a label
      LabeledPoint(parts(0), Vectors.dense(parts.tail))
    }

    val maxDepth = 5 // maximum tree depth; a stopping criterion that helps prevent overfitting
    val model = DecisionTree.train(parsedData, Classification, Gini, maxDepth)

    val labelAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }

    val trainErr = labelAndPreds.filter(r => r._1 != r._2).count().toDouble / parsedData.count
    println("Training Error = " + trainErr)
  }
}
Example 127
Source File: StreamingKMeansExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingKMeansExample {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingKMeansExample")
    // batch interval
    val ssc = new StreamingContext(conf, Seconds(3.toLong))

    // file stream over the training directory, parsed into vectors
    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    // test directory
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      // number of clusters
      .setK(args(3).toInt)
      // decay factor
      .setDecayFactor(1.0)
      // random initial centers of the given dimension
      .setRandomCenters(args(4).toInt, 0.0)

    // train the clustering model on the stream
    model.trainOn(trainingData)
    // predict cluster membership for new data points
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 128
Source File: ChiSqSelector.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import scala.collection.mutable.ArrayBuilder import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { val indices = Statistics.chiSqTest(data) .zipWithIndex.sortBy { case (res, _) => -res.statistic } .take(numTopFeatures) .map { case (_, indices) => indices } .sorted new ChiSqSelectorModel(indices) } }
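The fit method shown above belongs to MLlib's ChiSqSelector. A minimal end-to-end sketch, assuming an existing SparkContext named sc and discretized features (the sample rows below are made up for illustration):

// Sketch: select the single most predictive feature by chi-squared statistic.
import org.apache.spark.mllib.feature.ChiSqSelector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

val discreteData = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(8.0, 7.0, 0.0)),
  LabeledPoint(1.0, Vectors.dense(0.0, 9.0, 6.0)),
  LabeledPoint(1.0, Vectors.dense(0.0, 9.0, 8.0))))

val selector = new ChiSqSelector(1)              // keep the top 1 feature
val model = selector.fit(discreteData)           // chooses feature indices by chi-squared test
val reduced = discreteData.map(lp => LabeledPoint(lp.label, model.transform(lp.features)))
reduced.collect().foreach(println)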
Example 129
Source File: LogLoss.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.mllib.util.MLUtils

  @Since("1.2.0")
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  override private[mllib] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
}
Example 130
Source File: LogisticRegressionDataGenerator.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import com.intel.hibench.sparkbench.common.IOCommon import scala.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD def generateLogisticRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } def main(args: Array[String]) { val conf = new SparkConf().setAppName("LogisticRegressionDataGenerator") val sc = new SparkContext(conf) var outputPath = "" var numExamples: Int = 200000 var numFeatures: Int = 20 val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism) val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism") .getOrElse((parallel / 2).toString).toInt val eps = 3 if (args.length == 3) { outputPath = args(0) numExamples = args(1).toInt numFeatures = args(2).toInt println(s"Output Path: $outputPath") println(s"Num of Examples: $numExamples") println(s"Num of Features: $numFeatures") } else { System.err.println( s"Usage: $LogisticRegressionDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>" ) System.exit(1) } val data = generateLogisticRDD(sc, numExamples, numFeatures, eps, numPartitions) data.saveAsObjectFile(outputPath) sc.stop() } }
Example 131
Source File: MLPSuite.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.neuralNetwork import com.github.cloudml.zen.ml.util.{Utils, SparkUtils, MnistDatasetSuite} import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.linalg.{Vector => SV} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.scalatest.{FunSuite, Matchers} class MLPSuite extends FunSuite with MnistDatasetSuite with Matchers { ignore("MLP") { val (data, numVisible) = mnistTrainDataset(5000) val topology = Array(numVisible, 500, 10) val nn = MLP.train(data, 20, 1000, topology, fraction = 0.02, learningRate = 0.1, weightCost = 0.0) // val nn = MLP.runLBFGS(data, topology, 100, 4000, 1e-5, 0.001) // MLP.runSGD(data, nn, 37, 6000, 0.1, 0.5, 0.0) val (dataTest, _) = mnistTrainDataset(10000, 5000) println("Error: " + MLP.error(dataTest, nn, 100)) } ignore("binary classification") { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val dataSetFile = s"$sparkHome/data/a5a" val checkpoint = s"$sparkHome/target/tmp" sc.setCheckpointDir(checkpoint) val data = MLUtils.loadLibSVMFile(sc, dataSetFile).map { case LabeledPoint(label, features) => val y = BDV.zeros[Double](2) y := 0.04 / y.length y(if (label > 0) 0 else 1) += 0.96 (features, SparkUtils.fromBreeze(y)) }.persist() val trainSet = data.filter(_._1.hashCode().abs % 5 == 3).persist() val testSet = data.filter(_._1.hashCode().abs % 5 != 3).persist() val numVisible = trainSet.first()._1.size val topology = Array(numVisible, 30, 2) var nn = MLP.train(trainSet, 100, 1000, topology, fraction = 0.02, learningRate = 0.05, weightCost = 0.0) val modelPath = s"$checkpoint/model" nn.save(sc, modelPath) nn = MLP.load(sc, modelPath) val scoreAndLabels = testSet.map { case (features, label) => val out = nn.predict(SparkUtils.toBreeze(features).toDenseVector.asDenseMatrix.t) // Utils.random.nextInt(2).toDouble (out(0, 0), if (label(0) > 0.5) 1.0 else 0.0) }.persist() scoreAndLabels.repartition(1).map(t => s"${t._1}\t${t._2}"). saveAsTextFile(s"$checkpoint/mlp/${System.currentTimeMillis()}") val testAccuracy = new BinaryClassificationMetrics(scoreAndLabels).areaUnderROC() println(f"Test AUC = $testAccuracy%1.6f") } }
Example 132
Source File: LogisticRegressionSuite.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.regression import com.github.cloudml.zen.ml.util._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.scalatest.{Matchers, FunSuite} import com.github.cloudml.zen.ml.util.SparkUtils._ class LogisticRegressionSuite extends FunSuite with SharedSparkContext with Matchers { test("LogisticRegression MIS") { val zenHome = sys.props.getOrElse("zen.test.home", fail("zen.test.home is not set!")) val dataSetFile = classOf[LogisticRegressionSuite].getClassLoader().getResource("binary_classification_data.txt").toString() val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile) val max = dataSet.map(_.features.activeValuesIterator.map(_.abs).sum + 1L).max val maxIter = 10 val stepSize = 1 / (2 * max) val trainDataSet = dataSet.zipWithUniqueId().map { case (LabeledPoint(label, features), id) => val newLabel = if (label > 0.0) 1.0 else -1.0 (id, LabeledPoint(newLabel, features)) } val lr = new LogisticRegressionMIS(trainDataSet, stepSize) val pps = new Array[Double](maxIter) var i = 0 val startedAt = System.currentTimeMillis() while (i < maxIter) { lr.run(1) val q = lr.forward(i) pps(i) = lr.loss(q) i += 1 } println((System.currentTimeMillis() - startedAt) / 1e3) pps.foreach(println) val ppsDiff = pps.init.zip(pps.tail).map { case (lhs, rhs) => lhs - rhs } assert(ppsDiff.count(_ > 0).toDouble / ppsDiff.size > 0.05) assert(pps.head - pps.last > 0) } test("LogisticRegression SGD") { val zenHome = sys.props.getOrElse("zen.test.home", fail("zen.test.home is not set!")) val dataSetFile = classOf[LogisticRegressionSuite].getClassLoader().getResource("binary_classification_data.txt").toString() val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile) val maxIter = 10 val stepSize = 1 val trainDataSet = dataSet.zipWithIndex().map { case (LabeledPoint(label, features), id) => val newLabel = if (label > 0.0) 1.0 else 0 (id, LabeledPoint(newLabel, features)) } val lr = new LogisticRegressionSGD(trainDataSet, stepSize) val pps = new Array[Double](maxIter) var i = 0 val startedAt = System.currentTimeMillis() while (i < maxIter) { lr.run(1) val margin = lr.forward(i) pps(i) = lr.loss(margin) i += 1 } println((System.currentTimeMillis() - startedAt) / 1e3) pps.foreach(println) val ppsDiff = pps.init.zip(pps.tail).map { case (lhs, rhs) => lhs - rhs } assert(ppsDiff.count(_ > 0).toDouble / ppsDiff.size > 0.05) assert(pps.head - pps.last > 0) } }
Example 133
Source File: MovieLensUtils.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.examples.ml import breeze.linalg.{SparseVector => BSV} import com.github.cloudml.zen.ml.util.Logging import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.{SparseVector => SSV} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel private[zen] object MovieLensUtils extends Logging { def genSamplesWithTime( sc: SparkContext, dataFile: String, numPartitions: Int = -1, newLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK): (RDD[(Long, LabeledPoint)], RDD[(Long, LabeledPoint)], Array[Long]) = { val line = sc.textFile(dataFile).first() val splitString = if (line.contains(",")) "," else "::" var movieLens = sc.textFile(dataFile, sc.defaultParallelism).mapPartitions { iter => iter.filter(t => !t.startsWith("userId") && !t.isEmpty).map { line => val Array(userId, movieId, rating, timestamp) = line.split(splitString) (userId.toInt, movieId.toInt, rating.toDouble, timestamp.toInt) } } movieLens = movieLens.repartition(if (numPartitions > 0) numPartitions else sc.defaultParallelism) movieLens.persist(newLevel).count() val daySeconds = 60 * 60 * 24 val maxUserId = movieLens.map(_._1).max + 1 val maxMovieId = movieLens.map(_._2).max + 1 val maxTime = movieLens.map(_._4 / daySeconds).max() val minTime = movieLens.map(_._4 / daySeconds).min() val maxDay = maxTime - minTime + 1 val numFeatures = maxUserId + maxMovieId + maxDay val dataSet = movieLens.map { case (userId, movieId, rating, timestamp) => val sv = BSV.zeros[Double](numFeatures) sv(userId) = 1.0 sv(movieId + maxUserId) = 1.0 sv(timestamp / daySeconds - minTime + maxUserId + maxMovieId) = 1.0 val gen = (1125899906842597L * timestamp).abs val labeledPoint = new LabeledPoint(rating, new SSV(sv.length, sv.index.slice(0, sv.used), sv.data.slice(0, sv.used))) (gen, labeledPoint) }.persist(newLevel) dataSet.count() movieLens.unpersist() val trainSet = dataSet.filter(t => t._1 % 5 > 0).map(_._2).zipWithIndex().map(_.swap).persist(newLevel) val testSet = dataSet.filter(t => t._1 % 5 == 0).map(_._2).zipWithIndex().map(_.swap).persist(newLevel) trainSet.count() testSet.count() dataSet.unpersist() val views = Array(maxUserId, maxMovieId + maxUserId, numFeatures).map(_.toLong) (trainSet, testSet, views) } }
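The feature layout built above is a one-hot concatenation: user ids occupy indices [0, maxUserId), movie ids the next maxMovieId slots, and the rating day the remaining slots. A small sketch of that index arithmetic, with hypothetical sizes:

// Sketch with made-up sizes: 1000 users, 500 movies, ratings spanning 30 days.
val maxUserId = 1000
val maxMovieId = 500
val minDay = 0

def featureIndices(userId: Int, movieId: Int, day: Int): Seq[Int] = Seq(
  userId,                                   // user one-hot slot
  maxUserId + movieId,                      // movie one-hot slot
  maxUserId + maxMovieId + (day - minDay))  // day one-hot slot

println(featureIndices(userId = 42, movieId = 7, day = 12))  // List(42, 1007, 1512)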
Example 134
Source File: NetflixPrizeUtils.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.examples.ml import java.text.SimpleDateFormat import java.util.{Locale, TimeZone} import breeze.linalg.{SparseVector => BSV} import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.{SparseVector => SSV} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import scala.collection.mutable.ArrayBuffer object NetflixPrizeUtils { def genSamplesWithTime( sc: SparkContext, input: String, numPartitions: Int = -1, newLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK): (RDD[(Long, LabeledPoint)], RDD[(Long, LabeledPoint)], Array[Long]) = { val probeFile = s"$input/probe.txt" val dataSetFile = s"$input/training_set val views = Array(maxUserId, maxMovieId + maxUserId, numFeatures).map(_.toLong) (trainSet, testSet, views) } }
Example 135
Source File: GradientBoostingTreeDataGenerator.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import com.intel.hibench.sparkbench.common.IOCommon import scala.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object GradientBoostingTreeDataGenerator { def generateGBTRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } def main(args: Array[String]) { val conf = new SparkConf().setAppName("GradientBoostingTreeDataGenerator") val sc = new SparkContext(conf) var outputPath = "" var numExamples: Int = 200000 var numFeatures: Int = 20 val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism) val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism") .getOrElse((parallel / 2).toString).toInt val eps = 0.3 if (args.length == 3) { outputPath = args(0) numExamples = args(1).toInt numFeatures = args(2).toInt println(s"Output Path: $outputPath") println(s"Num of Examples: $numExamples") println(s"Num of Features: $numFeatures") } else { System.err.println( s"Usage: $GradientBoostingTreeDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>" ) System.exit(1) } val data = generateGBTRDD(sc, numExamples, numFeatures, eps, numPartitions) data.saveAsObjectFile(outputPath) sc.stop() } }
Example 136
Source File: GradientBoostedTreeDataGenerator.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import com.intel.hibench.sparkbench.common.IOCommon import scala.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object GradientBoostedTreeDataGenerator { def generateGBTRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } def main(args: Array[String]) { val conf = new SparkConf().setAppName("GradientBoostingTreeDataGenerator") val sc = new SparkContext(conf) var outputPath = "" var numExamples: Int = 200000 var numFeatures: Int = 20 val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism) val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism") .getOrElse((parallel / 2).toString).toInt val eps = 0.3 if (args.length == 3) { outputPath = args(0) numExamples = args(1).toInt numFeatures = args(2).toInt println(s"Output Path: $outputPath") println(s"Num of Examples: $numExamples") println(s"Num of Features: $numFeatures") } else { System.err.println( s"Usage: $GradientBoostingTreeDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>" ) System.exit(1) } val data = generateGBTRDD(sc, numExamples, numFeatures, eps, numPartitions) data.saveAsObjectFile(outputPath) sc.stop() } }
Example 137
Source File: LinearRegressionDataGenerator.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import com.intel.hibench.sparkbench.common.IOCommon import scala.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.annotation.{DeveloperApi, Since} import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.rdd.RDD object LinearRegressionDataGenerator { def generateLinearRDD( sc: SparkContext, numExamples: Int, numFeatures: Int, eps: Double, numParts: Int = 3, seed: Long = System.currentTimeMillis()): RDD[LabeledPoint] = { val random = new Random() // Random values distributed uniformly in [-0.5, 0.5] val weights = Array.fill(numFeatures)(random.nextDouble() - 0.5) val data : RDD[LabeledPoint] = sc.parallelize(0 until numExamples, numParts).mapPartitions{ part => val rnd = new Random(seed) // mean for each feature val xMean = Array.fill[Double](weights.length)(0.0) // variance for each feature val xVariance = Array.fill[Double](weights.length)(1.0 / 3.0) def rndElement(i: Int) = {(rnd.nextDouble() - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)} part.map{ _ => val features = Vectors.dense(weights.indices.map{rndElement(_)}.toArray) val label = blas.ddot(weights.length, weights, 1, features.toArray ,1) + eps * rnd.nextGaussian() LabeledPoint(label, features) } } data } def main(args: Array[String]) { val conf = new SparkConf().setAppName("LinearRegressionDataGenerator") val sc = new SparkContext(conf) var outputPath = "" var numExamples: Int = 1000 var numFeatures: Int = 50 var eps: Double = 1.0 val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism) val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism") .getOrElse((parallel / 2).toString).toInt if (args.length == 3) { outputPath = args(0) numExamples = args(1).toInt numFeatures = args(2).toInt println(s"Output Path: $outputPath") println(s"Num of Examples: $numExamples") println(s"Num of Features: $numFeatures") } else { System.err.println( s"Usage: $LinearRegressionDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>" ) System.exit(1) } val data = generateLinearRDD(sc, numExamples, numFeatures, eps, numPartitions) data.saveAsObjectFile(outputPath) sc.stop() } }
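A minimal local driver sketch for the generator above, assuming it is compiled as the object LinearRegressionDataGenerator in the com.intel.hibench.sparkbench.ml package shown; the sketch object name and the /tmp output path are made up. It writes the object file that the LinearRegression workload later in this list reads back:

package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

object LinearRegressionDataGeneratorSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("LinearDataSketch").setMaster("local[2]"))

    // 1,000 examples, 10 features, noise level 0.1, 2 partitions.
    val data: RDD[LabeledPoint] =
      LinearRegressionDataGenerator.generateLinearRDD(sc, 1000, 10, 0.1, numParts = 2)
    data.saveAsObjectFile("/tmp/linear-data")

    // Read it back the same way the training job does.
    val reloaded: RDD[LabeledPoint] = sc.objectFile("/tmp/linear-data")
    println(s"examples: ${reloaded.count()}, features: ${reloaded.first().features.size}")

    sc.stop()
  }
}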
Example 138
Source File: GradientBoostingTree.scala From Swallow with Apache License 2.0 | 5 votes |
// scalastyle:off println package com.intel.hibench.sparkbench.ml import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint object GradientBoostingTree { def main(args: Array[String]): Unit = { var inputPath = "" var numIterations: Int = 3 val numClasses: Int = 2 val maxDepth: Int = 5 if (args.length == 2) { inputPath = args(0) numIterations = args(1).toInt } val conf = new SparkConf() .setAppName("GradientBoostingTree") val sc = new SparkContext(conf) // Load and parse the data file. //val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val data: RDD[LabeledPoint] = sc.objectFile(inputPath) // Split the data into training and test sets (30% held out for testing) val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) // Train a GradientBoostedTrees model. // The defaultParams for Classification use LogLoss by default. val boostingStrategy = BoostingStrategy.defaultParams("Classification") boostingStrategy.numIterations = numIterations boostingStrategy.treeStrategy.numClasses = numClasses boostingStrategy.treeStrategy.maxDepth = maxDepth // Empty categoricalFeaturesInfo indicates all features are continuous. boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]() val model = GradientBoostedTrees.train(trainingData, boostingStrategy) // Evaluate model on test instances and compute test error val labelAndPreds = testData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() println("Test Error = " + testErr) sc.stop() } }
Example 139
Source File: LinearRegression.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD import org.apache.spark.rdd.RDD import scopt.OptionParser object LinearRegression { case class Params( dataPath: String = null, numIterations: Int = 100, stepSize: Double = 0.00000001 ) def main(args: Array[String]): Unit = { val defaultParams = Params() val parser = new OptionParser[Params]("Linear"){ head("Linear Regression: an example of linear regression with SGD optimizer") opt[Int]("numIterations") .text(s"numIterations, default: ${defaultParams.numIterations}") .action((x,c) => c.copy(numIterations = x)) opt[Double]("stepSize") .text(s"stepSize, default: ${defaultParams.stepSize}") .action((x,c) => c.copy(stepSize = x)) arg[String]("<dataPath>") .required() .text("Input path for data") .action((x,c) => c.copy(dataPath = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"LinearRegressionWithSGD with $params") .set("spark.shuffle.compress", "false") .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec") .set("spark.smartCompress", "false") val sc = new SparkContext(conf) val dataPath = params.dataPath val numIterations = params.numIterations val stepSize = params.stepSize // Load training data in LabeledPoint format. val data: RDD[LabeledPoint] = sc.objectFile(dataPath) // Building the model val model = LinearRegressionWithSGD.train(data, numIterations, stepSize) // Evaluate model on training examples and compute training error val valuesAndPreds = data.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() println("Training Mean Squared Error = " + MSE) sc.stop() } }
Example 140
Source File: PCADataGenerator.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import com.intel.hibench.sparkbench.common.IOCommon import scala.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object PCADataGenerator { def generatePCARDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = rnd.nextGaussian() val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() - 0.5 } LabeledPoint(y, Vectors.dense(x)) } data } def main(args: Array[String]) { val conf = new SparkConf().setAppName("PCADataGenerator") val sc = new SparkContext(conf) var outputPath = "" var numExamples: Int = 100 var numFeatures: Int = 8 val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism) val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism") .getOrElse((parallel / 2).toString).toInt val eps = 3 if (args.length == 3) { outputPath = args(0) numExamples = args(1).toInt numFeatures = args(2).toInt println(s"Output Path: $outputPath") println(s"Num of Examples: $numExamples") println(s"Num of Features: $numFeatures") } else { System.err.println( s"Usage: $PCADataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>" ) System.exit(1) } val data = generatePCARDD(sc, numExamples, numFeatures, eps, numPartitions) data.saveAsObjectFile(outputPath) sc.stop() } }
Example 141
Source File: RandomForestDataGenerator.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import com.intel.hibench.sparkbench.common.IOCommon import scala.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object RandomForestDataGenerator { def generateRFRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } def main(args: Array[String]) { val conf = new SparkConf().setAppName("RandomForestDataGenerator") val sc = new SparkContext(conf) var outputPath = "" var numExamples: Int = 200000 var numFeatures: Int = 20 val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism) val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism") .getOrElse((parallel / 2).toString).toInt val eps = 0.3 if (args.length == 3) { outputPath = args(0) numExamples = args(1).toInt numFeatures = args(2).toInt println(s"Output Path: $outputPath") println(s"Num of Examples: $numExamples") println(s"Num of Features: $numFeatures") } else { System.err.println( s"Usage: $RandomForestDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>" ) System.exit(1) } val data = generateRFRDD(sc, numExamples, numFeatures, eps, numPartitions) data.saveAsObjectFile(outputPath) sc.stop() } }
Example 142
Source File: PCAExample.scala From Swallow with Apache License 2.0 | 5 votes |
// scalastyle:off println package com.intel.hibench.sparkbench.ml import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} import org.apache.spark.rdd.RDD object PCAExample { def main(args: Array[String]): Unit = { var inputPath = "" var maxResultSize = "1g" if (args.length == 2) { inputPath = args(0) maxResultSize = args(1) } val conf = new SparkConf() .setAppName("PCAExample") .set("spark.driver.maxResultSize", maxResultSize) .set("spark.shuffle.compress", "false") .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec") .set("spark.smartCompress", "false") val sc = new SparkContext(conf) val data: RDD[LabeledPoint] = sc.objectFile(inputPath) val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) val numIterations = 100 val model = LinearRegressionWithSGD.train(training, numIterations) val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) val valuesAndPreds = test.map { point => val score = model.predict(point.features) (score, point.label) } val valuesAndPreds_pca = test_pca.map { point => val score = model_pca.predict(point.features) (score, point.label) } sc.stop() } } // scalastyle:on println
Example 143
Source File: GradientBoostedTree.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint import scopt.OptionParser object GradientBoostedTree { case class Params( numClasses: Int = 2, maxDepth: Int = 30, maxBins: Int = 32, numIterations: Int = 20, learningRate: Double = 0.1, dataPath: String = null ) def main(args: Array[String]): Unit = { val defaultParams = Params() val parser = new OptionParser[Params]("GBT"){ head("GBT: an example of Gradient Boosted Tree for classification") opt[Int]("numClasses") .text(s"numClasses, default: ${defaultParams.numClasses}") .action((x,c) => c.copy(numClasses = x)) opt[Int]("maxDepth") .text(s"maxDepth, default: ${defaultParams.maxDepth}") .action((x,c) => c.copy(maxDepth = x)) opt[Int]("maxBins") .text(s"maxBins, default: ${defaultParams.maxBins}") .action((x,c) => c.copy(maxBins = x)) opt[Int]("numIterations") .text(s"numIterations, default: ${defaultParams.numIterations}") .action((x,c) => c.copy(numIterations = x)) opt[Double]("learningRate") .text(s"learningRate, default: ${defaultParams.learningRate}") .action((x,c) => c.copy(learningRate = x)) arg[String]("<dataPath>") .required() .text("data path for Gradient Boosted Tree") .action((x,c) => c.copy(dataPath = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"Gradient Boosted Tree with $params") val sc = new SparkContext(conf) val dataPath = params.dataPath val numClasses = params.numClasses val maxDepth = params.maxDepth val maxBins = params.maxBins val numIterations = params.numIterations val learningRate = params.learningRate // Load data file. val data: RDD[LabeledPoint] = sc.objectFile(dataPath) // Split the data into training and test sets (30% held out for testing) val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) // Train a GradientBoostedTrees model. val boostingStrategy = BoostingStrategy.defaultParams("Classification") boostingStrategy.numIterations = numIterations boostingStrategy.learningRate = learningRate boostingStrategy.treeStrategy.numClasses = numClasses boostingStrategy.treeStrategy.maxDepth = maxDepth boostingStrategy.treeStrategy.maxBins = maxBins // Empty categoricalFeaturesInfo indicates all features are continuous. boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]() val model = GradientBoostedTrees.train(trainingData, boostingStrategy) // Evaluate model on test instances and compute test error val labelAndPreds = testData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() println("Test Error = " + testErr) sc.stop() } }
Example 144
Source File: SVMWithSGDExample.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint import scopt.OptionParser object SVMWithSGDExample { case class Params( numIterations: Int = 100, stepSize: Double = 1.0, regParam: Double = 0.01, dataPath: String = null ) def main(args: Array[String]): Unit = { val defaultParams = Params() val parser = new OptionParser[Params]("SVM") { head("SVM: an example of SVM for classification.") opt[Int]("numIterations") .text(s"numIterations, default: ${defaultParams.numIterations}") .action((x,c) => c.copy(numIterations = x)) opt[Double]("stepSize") .text(s"stepSize, default: ${defaultParams.stepSize}") .action((x,c) => c.copy(stepSize = x)) opt[Double]("regParam") .text(s"regParam, default: ${defaultParams.regParam}") .action((x,c) => c.copy(regParam = x)) arg[String]("<dataPath>") .required() .text("data path of SVM") .action((x, c) => c.copy(dataPath = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"SVM with $params") .set("spark.shuffle.compress", "false") .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec") .set("spark.smartCompress", "false") val sc = new SparkContext(conf) val dataPath = params.dataPath val numIterations = params.numIterations val stepSize = params.stepSize val regParam = params.regParam val data: RDD[LabeledPoint] = sc.objectFile(dataPath) // Split data into training (60%) and test (40%). val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) // Run training algorithm to build the model val model = SVMWithSGD.train(training, numIterations, stepSize, regParam) // Clear the default threshold. model.clearThreshold() // Compute raw scores on the test set. val scoreAndLabels = test.map { point => val score = model.predict(point.features) (score, point.label) } // Get evaluation metrics. val metrics = new BinaryClassificationMetrics(scoreAndLabels) val auROC = metrics.areaUnderROC() println("Area under ROC = " + auROC) sc.stop() } }
Example 145
Source File: RandomForestClassification.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import com.intel.hibench.sparkbench.common.IOCommon import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.tree.RandomForest import org.apache.spark.mllib.tree.model.RandomForestModel import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint import scopt.OptionParser object RandomForestClassification { case class Params( inputPath: String = null, numTrees: Int = 3, numClasses: Int = 2, featureSubsetStrategy: String = "auto", impurity: String = "gini", maxDepth: Int = 4, maxBins: Int = 32) def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("RF") { head("RF: an example app.") opt[Int]("numTrees") .text(s"numTrees, default: ${defaultParams.numTrees}") .action((x, c) => c.copy(numTrees = x)) opt[Int]("numClasses") .text(s"numClasses, default: ${defaultParams.numClasses}") .action((x, c) => c.copy(numClasses = x)) opt[Int]("maxDepth") .text(s"maxDepth, default: ${defaultParams.maxDepth}") .action((x, c) => c.copy(maxDepth = x)) opt[Int]("maxBins") .text(s"maxBins, default: ${defaultParams.maxBins}") .action((x, c) => c.copy(maxBins = x)) opt[String]("featureSubsetStrategy") .text(s"featureSubsetStrategy, default: ${defaultParams.featureSubsetStrategy}") .action((x, c) => c.copy(featureSubsetStrategy = x)) opt[String]("impurity") .text(s"impurity (smoothing constant), default: ${defaultParams.impurity}") .action((x, c) => c.copy(impurity = x)) arg[String]("<inputPath>") .required() .text("Input path of dataset") .action((x, c) => c.copy(inputPath = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"RFC with $params") .set("spark.shuffle.compress", "false") .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec") .set("spark.smartCompress", "false") val sc = new SparkContext(conf) // $example on$ // Load and parse the data file. val data: RDD[LabeledPoint] = sc.objectFile(params.inputPath) // Split the data into training and test sets (30% held out for testing) val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) // Train a RandomForest model. // Empty categoricalFeaturesInfo indicates all features are continuous. val categoricalFeaturesInfo = Map[Int, Int]() val model = RandomForest.trainClassifier(trainingData, params.numClasses, categoricalFeaturesInfo, params.numTrees, params.featureSubsetStrategy, params.impurity, params.maxDepth, params.maxBins) // Evaluate model on test instances and compute test error val labelAndPreds = testData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() println("Test Error = " + testErr) sc.stop() } }
Example 146
Source File: MVMSuite.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.recommendation import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, sum => brzSum} import com.github.cloudml.zen.ml.util._ import com.google.common.io.Files import org.apache.spark.mllib.linalg.{DenseVector => SDV, SparseVector => SSV, Vector => SV} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.scalatest.{FunSuite, Matchers} class MVMSuite extends FunSuite with SharedSparkContext with Matchers { test("binary classification") { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val dataSetFile = classOf[MVMSuite].getClassLoader().getResource("binary_classification_data.txt").toString() val checkpoint = s"$sparkHome/target/tmp" sc.setCheckpointDir(checkpoint) val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile).zipWithIndex().map { case (LabeledPoint(label, features), id) => val newLabel = if (label > 0.0) 1.0 else 0.0 (id, LabeledPoint(newLabel, features)) } val stepSize = 0.1 val regParam = 1e-2 val l2 = (regParam, regParam, regParam) val rank = 20 val useAdaGrad = true val trainSet = dataSet.cache() val fm = new FMClassification(trainSet, stepSize, l2, rank, useAdaGrad) val maxIter = 10 val pps = new Array[Double](maxIter) var i = 0 val startedAt = System.currentTimeMillis() while (i < maxIter) { fm.run(1) pps(i) = fm.saveModel().loss(trainSet) i += 1 } println((System.currentTimeMillis() - startedAt) / 1e3) pps.foreach(println) val ppsDiff = pps.init.zip(pps.tail).map { case (lhs, rhs) => lhs - rhs } assert(ppsDiff.count(_ < 0).toDouble / ppsDiff.size > 0.05) val fmModel = fm.saveModel() val tempDir = Files.createTempDir() tempDir.deleteOnExit() val path = tempDir.toURI.toString fmModel.save(sc, path) val sameModel = FMModel.load(sc, path) assert(sameModel.k === fmModel.k) assert(sameModel.classification === fmModel.classification) assert(sameModel.factors.sortByKey().map(_._2).collect() === fmModel.factors.sortByKey().map(_._2).collect()) } ignore("url_combined classification") { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val dataSetFile = classOf[MVMSuite].getClassLoader().getResource("binary_classification_data.txt").toString() val checkpointDir = s"$sparkHome/target/tmp" sc.setCheckpointDir(checkpointDir) val dataSet = MLUtils.loadLibSVMFile(sc, dataSetFile).zipWithIndex().map { case (LabeledPoint(label, features), id) => val newLabel = if (label > 0.0) 1.0 else 0.0 (id, LabeledPoint(newLabel, features)) }.cache() val numFeatures = dataSet.first()._2.features.size val stepSize = 0.1 val numIterations = 500 val regParam = 1e-3 val rank = 20 val views = Array(20, numFeatures / 2, numFeatures).map(_.toLong) val useAdaGrad = true val useWeightedLambda = true val miniBatchFraction = 1 val Array(trainSet, testSet) = dataSet.randomSplit(Array(0.8, 0.2)) trainSet.cache() testSet.cache() val fm = new MVMClassification(trainSet, stepSize, views, regParam, 0.0, rank, useAdaGrad, useWeightedLambda, miniBatchFraction) fm.run(numIterations) val model = fm.saveModel() println(f"Test loss: ${model.loss(testSet.cache())}%1.4f") } }
Example 147
Source File: SVMDataGenerator.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import com.intel.hibench.sparkbench.common.IOCommon import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object SVMDataGenerator { def generateSVMRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, nparts: Int = 2): RDD[LabeledPoint] = { val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples,nparts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data } def main(args: Array[String]) { val conf = new SparkConf().setAppName("SVMDataGenerator") val sc = new SparkContext(conf) var outputPath = "" var numExamples: Int = 200000 var numFeatures: Int = 20 val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism) val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism") .getOrElse((parallel / 2).toString).toInt if (args.length == 3) { outputPath = args(0) numExamples = args(1).toInt numFeatures = args(2).toInt println(s"Output Path: $outputPath") println(s"Num of Examples: $numExamples") println(s"Num of Features: $numFeatures") } else { System.err.println( s"Usage: $SVMDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>" ) System.exit(1) } val data = generateSVMRDD(sc, numExamples, numFeatures, numPartitions) data.saveAsObjectFile(outputPath) sc.stop() } }
Example 148
Source File: LogisticRegression.scala From Swallow with Apache License 2.0 | 5 votes |
// scalastyle:off println package com.intel.hibench.sparkbench.ml import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object LogisticRegression { def main(args: Array[String]): Unit = { var inputPath = "" if (args.length == 1) { inputPath = args(0) } val conf = new SparkConf().setAppName("LogisticRegressionWithLBFGS") .set("spark.shuffle.compress", "false") .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec") .set("spark.smartCompress", "false") val sc = new SparkContext(conf) // $example on$ // Load training data in LIBSVM format. val data: RDD[LabeledPoint] = sc.objectFile(inputPath) // Split data into training (60%) and test (40%). val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) // Run training algorithm to build the model val model = new LogisticRegressionWithLBFGS() .setNumClasses(10) .run(training) // Compute raw scores on the test set. val predictionAndLabels = test.map { case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } val accuracy = predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / predictionAndLabels.count() println(s"Accuracy = $accuracy") sc.stop() } } // scalastyle:on println
Example 149
Source File: MyRegressionMetrics.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter4 import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.sql.SparkSession object MyRegressionMetrics { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .master("local[*]") .appName("myRegressionMetrics") .config("spark.sql.warehouse.dir", ".") .getOrCreate() val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter4/breast-cancer-wisconsin.data") val data = rawData.map(_.trim) .filter(text => !(text.isEmpty || text.indexOf("?") > -1)) .map { line => val values = line.split(',').map(_.toDouble) val slicedValues = values.slice(1, values.size) val featureVector = Vectors.dense(slicedValues.init) val label = values.last / 2 -1 LabeledPoint(label, featureVector) } val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) val categoricalFeaturesInfo = Map[Int, Int]() val impurity = "variance" val maxDepth = 5 val maxBins = 32 val model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo, impurity, maxDepth, maxBins) // Instantiate metrics object val predictionsAndLabels = testData.map(example => (model.predict(example.features), example.label) ) val metrics = new RegressionMetrics(predictionsAndLabels) // Squared error println(s"MSE = ${metrics.meanSquaredError}") println(s"RMSE = ${metrics.rootMeanSquaredError}") // R-squared println(s"R-squared = ${metrics.r2}") // Mean absolute error println(s"MAE = ${metrics.meanAbsoluteError}") // Explained variance println(s"Explained variance = ${metrics.explainedVariance}") // $example off$ spark.stop() } }
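The expression values.last / 2 - 1 turns the Wisconsin dataset's class codes (2 = benign, 4 = malignant) into the 0.0/1.0 labels MLlib expects; the same mapping reappears in the tree and forest examples below. A tiny worked check of that arithmetic (the object name is made up):

object LabelMappingSketch {
  def main(args: Array[String]): Unit = {
    // Last column of breast-cancer-wisconsin.data: 2 = benign, 4 = malignant.
    val mapped = Seq(2.0, 4.0).map(v => v / 2 - 1)
    println(mapped) // List(0.0, 1.0)
  }
}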
Example 150
Source File: MyBinaryClassification.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter4 import org.apache.spark.sql.SparkSession import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils object MyBinaryClassification { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .master("local[*]") .appName("myBinaryClassification") .config("spark.sql.warehouse.dir", ".") .getOrCreate() // Load training data in LIBSVM format //https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html val data = MLUtils.loadLibSVMFile(spark.sparkContext, "../data/sparkml2/chapter4/myBinaryClassificationData.txt") // Split data into training (60%) and test (40%) val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L) training.cache() // Run training algorithm to build the model val model = new LogisticRegressionWithLBFGS() .setNumClasses(2) .run(training) // Clear the prediction threshold so the model will return probabilities model.clearThreshold // Compute raw scores on the test set val predictionAndLabels = test.map { case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } // Instantiate metrics object val metrics = new BinaryClassificationMetrics(predictionAndLabels) // Precision by threshold val precision = metrics.precisionByThreshold precision.foreach { case (t, p) => println(s"Threshold: $t, Precision: $p") } // Recall by threshold val recall = metrics.recallByThreshold recall.foreach { case (t, r) => println(s"Threshold: $t, Recall: $r") } val PRC = metrics.pr val f1Score = metrics.fMeasureByThreshold f1Score.foreach { case (t, f) => println(s"Threshold: $t, F-score: $f, Beta = 1") } val beta = 0.5 val fScore = metrics.fMeasureByThreshold(beta) fScore.foreach { case (t, f) => println(s"Threshold: $t, F-score: $f, Beta = 0.5") } val auPRC = metrics.areaUnderPR println("Area under precision-recall curve = " + auPRC) val thresholds = precision.map(_._1) val roc = metrics.roc val auROC = metrics.areaUnderROC println("Area under ROC = " + auROC) spark.stop() } }
Example 151
Source File: MyStreamingKMeans.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter8 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} object MyStreamingKMeans { def main(args: Array[String]) { val trainingDir = "../data/sparkml2/chapter8/trainingDir" val testDir = "../data/sparkml2/chapter8/testDir" val batchDuration = 10 val numClusters = 2 val numDimensions = 3 Logger.getLogger("org").setLevel(Level.ERROR) // setup SparkSession to use for interactions with Spark val spark = SparkSession .builder .master("local[*]") .appName("myStreamingKMeans") .config("spark.sql.warehouse.dir", ".") .getOrCreate() val ssc = new StreamingContext(spark.sparkContext, Seconds(batchDuration.toLong)) val trainingData = ssc.textFileStream(trainingDir).map(Vectors.parse) val testData = ssc.textFileStream(testDir).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(numClusters) .setDecayFactor(1.0) .setRandomCenters(numDimensions, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 152
Source File: MyGradientBoostingClassification.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter10 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel import org.apache.spark.rdd.RDD import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.sql.SparkSession object MyGradientBoostingClassification { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val spark = SparkSession .builder .master("local[*]") .appName("MyGradientBoostedTreesClassification") .config("spark.sql.warehouse.dir", ".") .getOrCreate() val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data") val data = rawData.map(_.trim) .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1)) .map { line => val values = line.split(',').map(_.toDouble) val slicedValues = values.slice(1, values.size) val featureVector = Vectors.dense(slicedValues.init) val label = values.last / 2 -1 LabeledPoint(label, featureVector) } val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) println("Training Data count:"+trainingData.count()) println("Test Data Count:"+testData.count()) val algo = "Classification" val numIterations = 3 val numClasses = 2 val maxDepth = 5 val maxBins = 32 val categoricalFeatureInfo = Map[Int,Int]() val boostingStrategy = BoostingStrategy.defaultParams(algo) boostingStrategy.setNumIterations(numIterations) boostingStrategy.treeStrategy.setNumClasses(numClasses) boostingStrategy.treeStrategy.setMaxDepth(maxDepth) boostingStrategy.treeStrategy.setMaxBins(maxBins) boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeatureInfo evaluate(trainingData, testData, boostingStrategy) println("===================") spark.stop() } def evaluate( trainingData: RDD[LabeledPoint], testData: RDD[LabeledPoint], boostingStrategy : BoostingStrategy ) :Unit = { val model = GradientBoostedTrees.train(trainingData, boostingStrategy) val metrics = getMetrics(model, testData) println("Confusion Matrix :") println(metrics.confusionMatrix) println("Model Accuracy: "+metrics.precision) println("Model Error: "+ (1-metrics.precision)) // (0 until boostingStrategy.treeStrategy.getNumClasses()).map( // category => (metrics.precision(category), metrics.recall(category)) // ).foreach(println) // println("My Classification GBT model:\n" + model.toDebugString) } def getMetrics(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): MulticlassMetrics = { val predictionsAndLabels = data.map(example => (model.predict(example.features), example.label) ) new MulticlassMetrics(predictionsAndLabels) } }
Example 153
Source File: MyRandomForestClassification.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter10 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.RandomForestModel import org.apache.spark.rdd.RDD import org.apache.spark.mllib.tree.RandomForest import org.apache.spark.sql.SparkSession object MyRandomForestClassification { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val spark = SparkSession .builder .master("local[*]") .appName("MyRandomForestClassification") .config("spark.sql.warehouse.dir", ".") .getOrCreate() val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data") val data = rawData.map(_.trim) .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1)) .map { line => val values = line.split(',').map(_.toDouble) val slicedValues = values.slice(1, values.size) val featureVector = Vectors.dense(slicedValues.init) val label = values.last / 2 -1 LabeledPoint(label, featureVector) } val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) println("Training Data count:"+trainingData.count()) println("Test Data Count:"+testData.count()) val numClasses = 2 val categoricalFeaturesInfo = Map[Int, Int]() val numTrees = 3 // Use more in practice. val featureSubsetStrategy = "auto" // Let the algorithm choose. // val impurity = "gini" val maxDepth = 4 val maxBins = 32 evaluate(trainingData, testData, numClasses,categoricalFeaturesInfo,numTrees, featureSubsetStrategy, "gini", maxDepth, maxBins) evaluate(trainingData, testData, numClasses,categoricalFeaturesInfo,numTrees, featureSubsetStrategy, "entropy", maxDepth, maxBins) println("=============") spark.stop() } def evaluate( trainingData: RDD[LabeledPoint], testData: RDD[LabeledPoint], numClasses: Int, categoricalFeaturesInfo: Map[Int,Int], numTrees: Int, featureSubsetStrategy: String, impurity: String, maxDepth: Int, maxBins:Int ) :Unit = { val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy,impurity, maxDepth, maxBins) val metrics = getMetrics(model, testData) println("Using Impurity :"+ impurity) println("Confusion Matrix :") println(metrics.confusionMatrix) println("Model Accuracy: "+metrics.precision) println("Model Error: "+ (1-metrics.precision)) // (0 until numClasses).map( // category => (metrics.precision(category), metrics.recall(category)) // ).foreach(println) println("My Random Forest Model:\n" + model.toDebugString) } def getMetrics(model: RandomForestModel, data: RDD[LabeledPoint]): MulticlassMetrics = { val predictionsAndLabels = data.map(example => (model.predict(example.features), example.label) ) new MulticlassMetrics(predictionsAndLabels) } }
Example 154
Source File: MyGradientBoostingRegression.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter10 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel import org.apache.spark.rdd.RDD import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.sql.SparkSession object MyGradientBoostingRegression { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val spark = SparkSession .builder .master("local[*]") .appName("MyGradientBoostedTreesRegression") .config("spark.sql.warehouse.dir", ".") .getOrCreate() val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data") val data = rawData.map(_.trim) .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1)) .map { line => val values = line.split(',').map(_.toDouble) val slicedValues = values.slice(1, values.size) val featureVector = Vectors.dense(slicedValues.init) val label = values.last / 2 -1 LabeledPoint(label, featureVector) } val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) println("Training Data count:"+trainingData.count()) println("Test Data Count:"+testData.count()) val algo = "Regression" val numIterations = 3 val maxDepth = 5 val maxBins = 32 val categoricalFeatureInfo = Map[Int,Int]() val boostingStrategy = BoostingStrategy.defaultParams(algo) boostingStrategy.setNumIterations(numIterations) boostingStrategy.treeStrategy.setMaxDepth(maxDepth) boostingStrategy.treeStrategy.setMaxBins(maxBins) boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeatureInfo val model = GradientBoostedTrees.train(trainingData, boostingStrategy) val metrics = getMetrics(model, testData) println("Test Mean Squared Error = " + metrics.meanSquaredError) println("My regression GBT model:\n" + model.toDebugString) spark.stop() } def getMetrics(model: GradientBoostedTreesModel, data: RDD[LabeledPoint]): RegressionMetrics = { val predictionsAndLabels = data.map(example => (model.predict(example.features), example.label) ) new RegressionMetrics(predictionsAndLabels) } }
Example 155
Source File: MyDecisionTreeRegression.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter10 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.mllib.tree.model.DecisionTreeModel import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object MyDecisionTreeRegression { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val spark = SparkSession .builder .master("local[*]") .appName("MyDecisionTreeRegression") .config("spark.sql.warehouse.dir", ".") .getOrCreate() val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data") val data = rawData.map(_.trim) .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1)) .map { line => val values = line.split(',').map(_.toDouble) val slicedValues = values.slice(1, values.size) val featureVector = Vectors.dense(slicedValues.init) val label = values.last / 2 -1 LabeledPoint(label, featureVector) } val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) val categoricalFeaturesInfo = Map[Int, Int]() val impurity = "variance" val maxDepth = 5 val maxBins = 32 val model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo, impurity, maxDepth, maxBins) val metrics = getMetrics(model, testData) println("Test Mean Squared Error = " + metrics.meanSquaredError) println("My regression tree model:\n" + model.toDebugString) spark.stop() } def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): RegressionMetrics = { val predictionsAndLabels = data.map(example => (model.predict(example.features), example.label) ) new RegressionMetrics(predictionsAndLabels) } }
Example 156
Source File: MyRandomForestRegression.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter10 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.RandomForestModel import org.apache.spark.rdd.RDD import org.apache.spark.mllib.tree.RandomForest import org.apache.spark.sql.SparkSession object MyRandomForestRegression { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val spark = SparkSession .builder .master("local[*]") .appName("MyRandomForestRegression") .config("spark.sql.warehouse.dir", ".") .getOrCreate() val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data") val data = rawData.map(_.trim) .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1)) .map { line => val values = line.split(',').map(_.toDouble) val slicedValues = values.slice(1, values.size) val featureVector = Vectors.dense(slicedValues.init) val label = values.last / 2 -1 LabeledPoint(label, featureVector) } val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) println("Training Data count:"+trainingData.count()) println("Test Data Count:"+testData.count()) val numClasses = 2 val categoricalFeaturesInfo = Map[Int, Int]() val numTrees = 3 // Use more in practice. val featureSubsetStrategy = "auto" // Let the algorithm choose. val impurity = "variance" val maxDepth = 4 val maxBins = 32 val model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins) val metrics = getMetrics(model, testData) println("Test Mean Squared Error = " + metrics.meanSquaredError) println("My Random Forest model:\n" + model.toDebugString) spark.stop() } def getMetrics(model: RandomForestModel, data: RDD[LabeledPoint]): RegressionMetrics = { val predictionsAndLabels = data.map(example => (model.predict(example.features), example.label) ) new RegressionMetrics(predictionsAndLabels) } } // scalastyle:on println
Example 157
Source File: MyDecisionTreeClassification.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter10 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.DecisionTreeModel import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object MyDecisionTreeClassification { def main(args: Array[String]): Unit = { Logger.getLogger("org").setLevel(Level.ERROR) val spark = SparkSession .builder .master("local[*]") .appName("MyDecisionTreeClassification") .config("spark.sql.warehouse.dir", ".") .getOrCreate() val rawData = spark.sparkContext.textFile("../data/sparkml2/chapter10/breast-cancer-wisconsin.data") val data = rawData.map(_.trim) .filter(text => !(text.isEmpty || text.startsWith("#") || text.indexOf("?") > -1)) .map { line => val values = line.split(',').map(_.toDouble) val slicedValues = values.slice(1, values.size) val featureVector = Vectors.dense(slicedValues.init) val label = values.last / 2 -1 LabeledPoint(label, featureVector) } println(rawData.count()) println(data.count()) val splits = data.randomSplit(Array(0.7, 0.3)) val (trainingData, testData) = (splits(0), splits(1)) val numClasses = 2 val categoricalFeaturesInfo = Map[Int, Int]() val maxDepth = 5 val maxBins = 32 evaluate(trainingData, testData, numClasses, categoricalFeaturesInfo, "gini", maxDepth, maxBins) evaluate(trainingData, testData, numClasses, categoricalFeaturesInfo, "entropy", maxDepth, maxBins) spark.stop() } def evaluate( trainingData: RDD[LabeledPoint], testData: RDD[LabeledPoint], numClasses: Int, categoricalFeaturesInfo: Map[Int,Int], impurity: String, maxDepth: Int, maxBins:Int ) :Unit = { val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins) val metrics = getMetrics(model, testData) println("Using Impurity :"+ impurity) println("Confusion Matrix :") println(metrics.confusionMatrix) println("Decision Tree Accuracy: "+metrics.precision) println("Decision Tree Error: "+ (1-metrics.precision)) (0 until numClasses).map( category => (metrics.precision(category), metrics.recall(category)) ).foreach(println) } def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): MulticlassMetrics = { val predictionsAndLabels = data.map(example => (model.predict(example.features), example.label) ) new MulticlassMetrics(predictionsAndLabels) } }
Example 158
Source File: IrisData.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter13 import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext object IrisData { def readFromFile(sc: SparkContext) = { sc.textFile("../data/sparkml2/chapter13/iris.data") .filter(s => !s.isEmpty) .zipWithIndex() } def toLabelPoints(records: (String, Long)): LabeledPoint = { val (record, recordId) = records val fields = record.split(",") LabeledPoint(recordId, Vectors.dense(fields(0).toDouble, fields(1).toDouble, fields(2).toDouble, fields(3).toDouble)) } def buildLabelLookup(records: RDD[(String, Long)]) = { records.map { case (record: String, id: Long) => { val fields = record.split(",") (id, fields(4)) } }.collect().toMap } }
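A short sketch wiring the three helpers above together on a local SparkContext; the label of each LabeledPoint is the record's index, which the lookup map turns back into a species name (the sketch object name is made up):

package spark.ml.cookbook.chapter13

import org.apache.spark.{SparkConf, SparkContext}

object IrisDataSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("IrisDataSketch").setMaster("local[*]"))

    val records = IrisData.readFromFile(sc)           // RDD[(line, index)]
    val lookup  = IrisData.buildLabelLookup(records)  // index -> species name
    val points  = records.map(IrisData.toLabelPoints) // RDD[LabeledPoint]

    points.take(3).foreach(lp => println(s"${lookup(lp.label.toLong)} -> ${lp.features}"))
    sc.stop()
  }
}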
Example 159
Source File: LogisticStreaming.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter13 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.streaming.{Seconds, StreamingContext} import scala.collection.mutable.Queue object LogisticStreaming { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getRootLogger.setLevel(Level.WARN) val spark = SparkSession .builder .master("local[*]") .appName("Logistic Streaming App") .config("spark.sql.warehouse.dir", ".") .getOrCreate() import spark.implicits._ val ssc = new StreamingContext(spark.sparkContext, Seconds(2)) val rawDF = spark.read .text("../data/sparkml2/chapter13/pima-indians-diabetes.data").as[String] val buf = rawDF.rdd.map(value => { val data = value.split(",") (data.init.toSeq, data.last) }) val lps = buf.map{ case (feature: Seq[String], label: String) => val featureVector = feature.map(_.toDouble).toArray[Double] LabeledPoint(label.toDouble, Vectors.dense(featureVector)) } val trainQueue = new Queue[RDD[LabeledPoint]]() val testQueue = new Queue[RDD[LabeledPoint]]() val trainingStream = ssc.queueStream(trainQueue) val testStream = ssc.queueStream(testQueue) val numFeatures = 8 val model = new StreamingLogisticRegressionWithSGD() .setInitialWeights(Vectors.zeros(numFeatures)) .setNumIterations(15) .setStepSize(0.5) .setMiniBatchFraction(0.25) model.trainOn(trainingStream) val result = model.predictOnValues(testStream.map(lp => (lp.label, lp.features))) result.map{ case (label: Double, prediction: Double) => (label, prediction) }.print() ssc.start() val Array(trainData, test) = lps.randomSplit(Array(.80, .20)) trainQueue += trainData Thread.sleep(4000) val testGroups = test.randomSplit(Array(.50, .50)) testGroups.foreach(group => { testQueue += group Thread.sleep(2000) }) ssc.stop() } }
Example 160
Source File: KMeansStreaming.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter13 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} import scala.collection.mutable.Queue object KMeansStreaming { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.ERROR) val spark = SparkSession .builder .master("local[*]") .appName("KMean Streaming App") .config("spark.sql.warehouse.dir", ".") .config("spark.executor.memory", "2g") .getOrCreate() val ssc = new StreamingContext(spark.sparkContext, Seconds(1)) Logger.getRootLogger.setLevel(Level.WARN) val irisData = IrisData.readFromFile(spark.sparkContext) val lookup = IrisData.buildLabelLookup(irisData) val trainQueue = new Queue[RDD[LabeledPoint]]() val testQueue = new Queue[RDD[LabeledPoint]]() val trainingStream = ssc.queueStream(trainQueue) val testStream = ssc.queueStream(testQueue) val model = new StreamingKMeans().setK(3) .setDecayFactor(1.0) .setRandomCenters(4, 0.0) model.trainOn(trainingStream.map(lp => lp.features)) val values = model.predictOnValues(testStream.map(lp => (lp.label, lp.features))) values.foreachRDD(n => n.foreach(v => { println(v._2, v._1, lookup(v._1.toLong)) })) ssc.start() val irisLabelPoints = irisData.map(record => IrisData.toLabelPoints(record)) val Array(trainData, test) = irisLabelPoints.randomSplit(Array(.80, .20)) trainQueue += irisLabelPoints Thread.sleep(2000) val testGroups = test.randomSplit(Array(.25, .25, .25, .25)) testGroups.foreach(group => { testQueue += group println("-" * 25) Thread.sleep(1000) }) ssc.stop() } }
Example 161
Source File: Classifier.scala From Scalaprof with GNU General Public License v2.0 | 5 votes |
package edu.neu.coe.scala.spark.spam import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.feature.HashingTF import org.apache.spark.mllib.classification.LogisticRegressionWithSGD import org.apache.spark.SparkConf import org.apache.spark.SparkContext object Classifier extends App { val conf = new SparkConf().setAppName("spam") val sc = new SparkContext(conf) val spam = sc.textFile("spam.txt") val norm = sc.textFile("normal.txt") val tf = new HashingTF(10000) val spamFeatures = spam.map(email => tf.transform(email.split(" "))) val normFeatures = norm.map(email => tf.transform(email.split(" "))) val posExamples = spamFeatures.map(f => LabeledPoint(1, f)) val negExamples = normFeatures.map(f => LabeledPoint(0, f)) val trainingData = posExamples.union(negExamples) trainingData.cache() val model = new LogisticRegressionWithSGD().run(trainingData) val posTest = tf.transform("Subject: Cheap Stuff From: <omg.fu> O M G GET cheap stuff by sending money to Robin Hillyard".split(" ")) val negTest = tf.transform("Subject: Spark From: Robin Hillyard<[email protected]> Hi Adam, I started studying Spark the other day".split(" ")) println(s"Prediction for positive test example: ${model.predict(posTest)}") println(s"Prediction for negative test example: ${model.predict(negTest)}") }
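The spam classifier above only spot-checks two hand-written messages. A quick, purely illustrative addition that measures accuracy on the training set itself, reusing the model and trainingData values defined in the example:

// Fraction of training examples the model labels correctly (not a substitute for a held-out test set).
val trainAccuracy = trainingData
  .map(lp => (model.predict(lp.features), lp.label))
  .filter { case (prediction, label) => prediction == label }
  .count().toDouble / trainingData.count()
println(s"Training accuracy: $trainAccuracy")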
Example 162
Source File: FeaturesParser.scala From spark-anomaly-detection with MIT License | 5 votes |
package com.micvog.ml import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object FeaturesParser{ def parseFeatures(rawdata: RDD[String]): RDD[Vector] = { val rdd: RDD[Array[Double]] = rawdata.map(_.split(",").map(_.toDouble)) val vectors: RDD[Vector] = rdd.map(arrDouble => Vectors.dense(arrDouble)) vectors } def parseFeaturesWithLabel(cvData: RDD[String]): RDD[LabeledPoint] = { val rdd: RDD[Array[Double]] = cvData.map(_.split(",").map(_.toDouble)) val labeledPoints = rdd.map(arrDouble => new LabeledPoint(arrDouble(0), Vectors.dense(arrDouble.slice(1, arrDouble.length)))) labeledPoints } }
Example 163
Source File: LogisticRegressionDataGenerator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.annotation.{Since, DeveloperApi}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

@DeveloperApi
@Since("0.8.0")
object LogisticRegressionDataGenerator {

  @Since("0.8.0")
  def generateLogisticRDD(
      sc: SparkContext,
      nexamples: Int,
      nfeatures: Int,
      eps: Double,
      nparts: Int = 2,
      probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)
      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length != 5) {
      // scalastyle:off println
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

    data.saveAsTextFile(outputPath)
    sc.stop()
  }
}
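A small, hypothetical usage sketch for the generator above, producing a synthetic RDD directly instead of going through main; the SparkContext setup and object name GeneratorDemo are assumptions for illustration only.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.LogisticRegressionDataGenerator

object GeneratorDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("generator-demo"))
    // 1,000 examples with 10 Gaussian features; class-1 examples are shifted by eps = 3.0.
    val synthetic = LogisticRegressionDataGenerator.generateLogisticRDD(sc, 1000, 10, 3.0)
    synthetic.take(3).foreach(println)
    sc.stop()
  }
}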
Example 164
Source File: SVMDataGenerator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object SVMDataGenerator { @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures + 1)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 165
Source File: ChiSqSelectorSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext

// Feature extraction and transformation: chi-squared selection (ChiSqSelector) on sparse and dense vectors.
class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("ChiSqSelector transform test (sparse & dense vector)") {
    // Labelled discrete data. A LabeledPoint is a local vector (sparse or dense) with an associated label.
    val labeledDiscreteData = sc.parallelize(
      Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))),
        LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))),
        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2)
    // Expected data after filtering down to the single most predictive feature.
    val preFilteredData =
      Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
        LabeledPoint(1.0, Vectors.dense(Array(6.0))),
        LabeledPoint(1.0, Vectors.dense(Array(8.0))),
        LabeledPoint(2.0, Vectors.dense(Array(5.0))))
    // fit() trains a ChiSqSelectorModel on the labelled data.
    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
    // transform() applies the fitted selector to each feature vector.
    val filteredData = labeledDiscreteData.map { lp =>
      LabeledPoint(lp.label, model.transform(lp.features))
    }.collect().toSet
    assert(filteredData == preFilteredData)
  }
}
Example 166
Source File: EnsembleTestHelper.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.util.StatCounter

import scala.collection.mutable

object EnsembleTestHelper {

  def validateRegressor(
      model: TreeEnsembleModel,
      input: Seq[LabeledPoint],
      required: Double,
      metricName: String = "mse") {
    val predictions = input.map(x => model.predict(x.features))
    val errors = predictions.zip(input.map(_.label)).map { case (prediction, label) =>
      label - prediction
    }
    val metric = metricName match {
      case "mse" =>
        errors.map(err => err * err).sum / errors.size
      case "mae" =>
        // MAE (mean absolute error): the mean of the absolute values of the individual errors.
        // math.abs returns the absolute value of a number.
        errors.map(math.abs).sum / errors.size
    }
    assert(metric <= required,
      s"validateRegressor calculated $metricName $metric but required $required.")
  }

  def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = {
    val arr = new Array[LabeledPoint](numInstances)
    for (i <- 0 until numInstances) {
      val label = if (i < numInstances / 10) {
        0.0
      } else if (i < numInstances / 2) {
        1.0
      } else if (i < numInstances * 0.9) {
        0.0
      } else {
        1.0
      }
      val features = Array.fill[Double](numFeatures)(i.toDouble)
      arr(i) = new LabeledPoint(label, Vectors.dense(features))
    }
    arr
  }
}
Example 167
Source File: PythonMLLibAPISuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, SparseMatrix} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.recommendation.Rating class PythonMLLibAPISuite extends SparkFunSuite { SerDe.initialize() test("pickle vector") { val vectors = Seq( Vectors.dense(Array.empty[Double]), Vectors.dense(0.0), Vectors.dense(0.0, -2.0), Vectors.sparse(0, Array.empty[Int], Array.empty[Double]), Vectors.sparse(1, Array.empty[Int], Array.empty[Double]), Vectors.sparse(2, Array(1), Array(-2.0))) vectors.foreach { v => val u = SerDe.loads(SerDe.dumps(v)) assert(u.getClass === v.getClass) assert(u === v) } } test("pickle labeled point") { val points = Seq( LabeledPoint(0.0, Vectors.dense(Array.empty[Double])), LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)), LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])), LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])), LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0)))) points.foreach { p => val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint] assert(q.label === p.label) assert(q.features.getClass === p.features.getClass) assert(q.features === p.features) } } test("pickle double") { for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) { val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double] // We use `equals` here for comparison because we cannot use `==` for NaN assert(x.equals(deser)) } } test("pickle matrix") { val values = Array[Double](0, 1.2, 3, 4.56, 7, 8) val matrix = Matrices.dense(2, 3, values) val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix] assert(matrix === nm) // Test conversion for empty matrix val empty = Array[Double]() val emptyMatrix = Matrices.dense(0, 0, empty) val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4)) val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix] assert(sm.toArray === nsm.toArray) val smt = new SparseMatrix( 3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9), isTransposed = true) val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix] assert(smt.toArray === nsmt.toArray) } test("pickle rating") { val rat = new Rating(1, 2, 3.0) val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating] assert(rat == rat2) // Test name of class only occur once val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray val bytes = SerDe.dumps(rats) assert(bytes.toString.split("Rating").length == 1) assert(bytes.length / 10 < 25) // 25 bytes per rating } }
Example 168
Source File: PCAOnSourceVectorExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD // $example off$ object PCAOnSourceVectorExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnSourceVectorExample") val sc = new SparkContext(conf) // $example on$ val data: RDD[LabeledPoint] = sc.parallelize(Seq( new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)), new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)))) // Compute the top 5 principal components. val pca = new PCA(5).fit(data.map(_.features)) // Project vectors to the linear space spanned by the top 5 principal // components, keeping the label val projected = data.map(p => p.copy(features = pca.transform(p.features))) // $example off$ val collect = projected.collect() println("Projected vector of principal component:") collect.foreach { vector => println(vector) } sc.stop() } } // scalastyle:on println
Example 169
Source File: PCAExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} // $example off$ @deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0") object PCAExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAExample") val sc = new SparkContext(conf) // $example on$ val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) val numIterations = 100 val model = LinearRegressionWithSGD.train(training, numIterations) val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) val valuesAndPreds = test.map { point => val score = model.predict(point.features) (score, point.label) } val valuesAndPreds_pca = test_pca.map { point => val score = model_pca.predict(point.features) (score, point.label) } val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() println(s"Mean Squared Error = $MSE") println(s"PCA Mean Squared Error = $MSE_pca") // $example off$ sc.stop() } } // scalastyle:on println
Example 170
Source File: LinearRegressionWithSGDExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD // $example off$ @deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0") object LinearRegressionWithSGDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/ridge-data/lpsa.data") val parsedData = data.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // Building the model val numIterations = 100 val stepSize = 0.00000001 val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) // Evaluate model on training examples and compute training error val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() println(s"training Mean Squared Error $MSE") // Save and load model model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel") val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel") // $example off$ sc.stop() } } // scalastyle:on println
Example 171
Source File: StreamingLinearRegressionExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD // $example off$ import org.apache.spark.streaming._ object StreamingLinearRegressionExample { def main(args: Array[String]): Unit = { if (args.length != 2) { System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>") System.exit(1) } val conf = new SparkConf().setAppName("StreamingLinearRegressionExample") val ssc = new StreamingContext(conf, Seconds(1)) // $example on$ val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache() val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val numFeatures = 3 val model = new StreamingLinearRegressionWithSGD() .setInitialWeights(Vectors.zeros(numFeatures)) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ ssc.stop() } } // scalastyle:on println
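LabeledPoint.parse, used by the streaming example above, expects each line in the monitored training and test directories to be the string form of a labeled point, i.e. (label,[f1,f2,...]). A small sketch of the round trip, e.g. in a spark-shell session:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

val lp = LabeledPoint(1.0, Vectors.dense(0.5, -1.2, 3.0))
val line = lp.toString                 // "(1.0,[0.5,-1.2,3.0])"
val parsed = LabeledPoint.parse(line)  // back to a LabeledPoint
assert(parsed == lp)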
Example 172
Source File: StreamingKMeansExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} // $example off$ object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } // $example on$ val conf = new SparkConf().setAppName("StreamingKMeansExample") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(args(3).toInt) .setDecayFactor(1.0) .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ } } // scalastyle:on println
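For the streaming k-means example, the training directory is expected to contain bare vectors in the [f1,f2,...] form accepted by Vectors.parse, while the test directory holds labeled points in the format shown after the previous example. For instance:

import org.apache.spark.mllib.linalg.Vectors

// A line such as "[5.1,3.5,1.4,0.2]" in the training directory parses to a dense vector.
val v = Vectors.parse("[5.1,3.5,1.4,0.2]")
println(v.size)  // 4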
Example 173
Source File: DataValidators.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

@DeveloperApi
@Since("0.8.0")
object DataValidators extends Logging {

  @Since("1.3.0")
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
}
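A sketch of how such a validator is typically wired in before training, assuming an already-prepared RDD[LabeledPoint] named training and a three-class problem; the checkInput helper is illustrative only.

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.DataValidators
import org.apache.spark.rdd.RDD

def checkInput(training: RDD[LabeledPoint]): Unit = {
  // multiLabelValidator(3) returns a function RDD[LabeledPoint] => Boolean.
  val validators: Seq[RDD[LabeledPoint] => Boolean] = Seq(DataValidators.multiLabelValidator(3))
  require(validators.forall(_(training)), "Input data failed multi-label validation")
}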
Example 174
Source File: LogisticRegressionDataGenerator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

@DeveloperApi
@Since("0.8.0")
object LogisticRegressionDataGenerator {

  @Since("0.8.0")
  def generateLogisticRDD(
      sc: SparkContext,
      nexamples: Int,
      nfeatures: Int,
      eps: Double,
      nparts: Int = 2,
      probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)
      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length != 5) {
      // scalastyle:off println
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

    data.saveAsTextFile(outputPath)
    sc.stop()
  }
}
Example 175
Source File: SVMDataGenerator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object SVMDataGenerator { @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 176
Source File: EnsembleTestHelper.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 177
Source File: PythonMLLibAPISuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors} import org.apache.spark.mllib.recommendation.Rating import org.apache.spark.mllib.regression.LabeledPoint class PythonMLLibAPISuite extends SparkFunSuite { SerDe.initialize() test("pickle vector") { val vectors = Seq( Vectors.dense(Array.empty[Double]), Vectors.dense(0.0), Vectors.dense(0.0, -2.0), Vectors.sparse(0, Array.empty[Int], Array.empty[Double]), Vectors.sparse(1, Array.empty[Int], Array.empty[Double]), Vectors.sparse(2, Array(1), Array(-2.0))) vectors.foreach { v => val u = SerDe.loads(SerDe.dumps(v)) assert(u.getClass === v.getClass) assert(u === v) } } test("pickle labeled point") { val points = Seq( LabeledPoint(0.0, Vectors.dense(Array.empty[Double])), LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)), LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])), LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])), LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0)))) points.foreach { p => val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint] assert(q.label === p.label) assert(q.features.getClass === p.features.getClass) assert(q.features === p.features) } } test("pickle double") { for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) { val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double] // We use `equals` here for comparison because we cannot use `==` for NaN assert(x.equals(deser)) } } test("pickle matrix") { val values = Array[Double](0, 1.2, 3, 4.56, 7, 8) val matrix = Matrices.dense(2, 3, values) val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix] assert(matrix === nm) // Test conversion for empty matrix val empty = Array.empty[Double] val emptyMatrix = Matrices.dense(0, 0, empty) val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4)) val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix] assert(sm.toArray === nsm.toArray) val smt = new SparseMatrix( 3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9), isTransposed = true) val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix] assert(smt.toArray === nsmt.toArray) } test("pickle rating") { val rat = new Rating(1, 2, 3.0) val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating] assert(rat == rat2) // Test name of class only occur once val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray val bytes = SerDe.dumps(rats) assert(bytes.toString.split("Rating").length == 1) assert(bytes.length / 10 < 25) // 25 bytes per rating } }
Example 178
Source File: DataValidators.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import org.apache.spark.Logging
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

@DeveloperApi
@Since("0.8.0")
object DataValidators extends Logging {

  @Since("1.3.0")
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
}
Example 179
Source File: MainRun.scala From spark-anomaly-detection with MIT License | 5 votes |
import com.micvog.ml.{AnomalyDetection, FeaturesParser} import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object MainRun { val rawFilePath = "./src/test/resources/training.csv" val cvFilePath = "./src/test/resources/cross_val.csv" def main(args: Array[String]) { val conf = new SparkConf().setAppName("Anomaly Detection Spark") val sc = new SparkContext(conf) val rawdata = sc.textFile(rawFilePath, 2).cache() val cvData = sc.textFile(cvFilePath, 2).cache() //convert raw data to vectors val trainingVec: RDD[Vector] = FeaturesParser.parseFeatures(rawdata) val cvLabeledVec: RDD[LabeledPoint] = FeaturesParser.parseFeaturesWithLabel(cvData) val data = trainingVec.cache() val anDet: AnomalyDetection = new AnomalyDetection() //derive model val model = anDet.run(data) val dataCvVec = cvLabeledVec.cache() val optimalModel = anDet.optimize(dataCvVec, model) //find outliers in CV val cvVec = cvLabeledVec.map(_.features) val results = optimalModel.predict(cvVec) val outliers = results.filter(_._2).collect() outliers.foreach(v => println(v._1)) println("\nFound %s outliers\n".format(outliers.length)) } }
Example 180
Source File: AnomalyDetection$Test.scala From spark-anomaly-detection with MIT License | 5 votes |
package com.micvog.ml import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.scalactic.Equality import org.scalatest.{FlatSpec, FunSuite, Matchers} class AnomalyDetection$Test extends FlatSpec with Matchers with SharedSparkContext { { val point = Vectors.dense(Array(14.8593411857427, 14.9006647394062)) val means = Vectors.dense(Array(14.1122257839456, 14.9977105081362)) val variances = Vectors.dense(Array(1.83263141349452, 1.70974533082878)) "probFunction" should "return correct product value" in { val p = AnomalyDetection.probFunction(point, means, variances) assert(p === 0.0769984879544 +- 0.0001) } "predict" should "predict the anomaly" in { assert(!AnomalyDetection.predict(point, means, variances, 0.05)) } "predict" should "predict non anomaly" in { assert(AnomalyDetection.predict(point, means, variances, 0.08)) } } private def vectorequality() = { new Equality[Vector] { def areEqual(a: Vector, b: Any): Boolean = b match { case v: Vector => v.toArray.zip(a.toArray).map(pair => pair._1 === pair._2 +- 0.001).reduce((a, b) => a && b) case _ => false } } } def trainModel(): AnomalyDetectionModel = { val trainingExamplesFilePath = "./src/test/resources/training.csv" val trainingData = sc.textFile(trainingExamplesFilePath, 2).cache() val trainingRdd = FeaturesParser.parseFeatures(trainingData) new AnomalyDetection().run(trainingRdd) } "run" should "return model with correct mean and variance" in { val model: AnomalyDetectionModel = trainModel() //use scalactic's more relaxing equality implicit val vectorEq = vectorequality() assert(model.means === Vectors.dense(Array(79.9843751617201, 5.13662727300755))) assert(model.variances === Vectors.dense(Array(356.44539323536225, 3.79818173645375))) } "optimize" should "calculate epsilon and F1 score" in { val cvFilePath = "./src/test/resources/cross_val.csv" val cvData = sc.textFile(cvFilePath, 2).cache() val cvPointsRdd: RDD[LabeledPoint] = FeaturesParser.parseFeaturesWithLabel(cvData) val model = trainModel() val optimalModel = new AnomalyDetection().optimize(cvPointsRdd, model) assert(optimalModel.epsilon === 3.382218E-4 +- 0.0000000001) } }
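The AnomalyDetection object exercised by this test is not part of the listing. Judging from the means/variances/epsilon checks above, probFunction presumably evaluates a product of independent univariate Gaussian densities; the following is a minimal sketch under that assumption, not the project's actual implementation.

import org.apache.spark.mllib.linalg.Vector

object GaussianAnomaly {

  // Density of `point` under a diagonal (independent-feature) Gaussian model.
  def probFunction(point: Vector, means: Vector, variances: Vector): Double =
    point.toArray.zip(means.toArray.zip(variances.toArray)).map {
      case (x, (mu, sigma2)) =>
        math.exp(-math.pow(x - mu, 2) / (2.0 * sigma2)) / math.sqrt(2.0 * math.Pi * sigma2)
    }.product

  // A point is flagged when its density falls at or below the epsilon threshold.
  def predict(point: Vector, means: Vector, variances: Vector, epsilon: Double): Boolean =
    probFunction(point, means, variances) <= epsilon
}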
Example 181
Source File: NaiveBayesExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib // $example on$ import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint // $example off$ import org.apache.spark.{SparkConf, SparkContext} object NaiveBayesExample { def main(args: Array[String]) : Unit = { val conf = new SparkConf().setAppName("NaiveBayesExample") val sc = new SparkContext(conf) // $example on$ val data = sc.textFile("data/mllib/sample_naive_bayes_data.txt") val parsedData = data.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) } // Split data into training (60%) and test (40%). val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0) val test = splits(1) val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial") val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() // Save and load model model.save(sc, "target/tmp/myNaiveBayesModel") val sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel") // $example off$ } } // scalastyle:on println
Example 182
Source File: StreamingKMeansExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } val conf = new SparkConf().setMaster("local").setAppName("StreamingKMeansExample") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(args(3).toInt) .setDecayFactor(1.0) .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 183
Source File: LogLoss.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.mllib.util.MLUtils

@Since("1.2.0")
@DeveloperApi
object LogLoss extends Loss {

  @Since("1.2.0")
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  override private[mllib] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
}
Example 184
Source File: DataValidators.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import org.apache.spark.Logging
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

@DeveloperApi
@Since("0.8.0")
object DataValidators extends Logging {

  @Since("1.3.0")
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
}
Example 185
Source File: LogisticRegressionDataGenerator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.annotation.{Since, DeveloperApi}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

@DeveloperApi
@Since("0.8.0")
object LogisticRegressionDataGenerator {

  @Since("0.8.0")
  def generateLogisticRDD(
      sc: SparkContext,
      nexamples: Int,
      nfeatures: Int,
      eps: Double,
      nparts: Int = 2,
      probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)
      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length != 5) {
      // scalastyle:off println
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

    data.saveAsTextFile(outputPath)
    sc.stop()
  }
}
Example 186
Source File: SVMDataGenerator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object SVMDataGenerator { @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures + 1)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 187
Source File: ChiSqSelectorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{Row, SQLContext} class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("Test Chi-Square selector") { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ val data = Seq( LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))) ) val preFilteredData = Seq( Vectors.dense(0.0), Vectors.dense(6.0), Vectors.dense(8.0), Vectors.dense(5.0) ) val df = sc.parallelize(data.zip(preFilteredData)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") val model = new ChiSqSelector() .setNumTopFeatures(1) .setFeaturesCol("data") .setLabelCol("label") .setOutputCol("filtered") model.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } } test("ChiSqSelector read/write") { val t = new ChiSqSelector() .setFeaturesCol("myFeaturesCol") .setLabelCol("myLabelCol") .setOutputCol("myOutputCol") .setNumTopFeatures(2) testDefaultReadWrite(t) } test("ChiSqSelectorModel read/write") { val oldModel = new feature.ChiSqSelectorModel(Array(1, 3)) val instance = new ChiSqSelectorModel("myChiSqSelectorModel", oldModel) val newInstance = testDefaultReadWrite(instance) assert(newInstance.selectedFeatures === instance.selectedFeatures) } }
Example 188
Source File: DecisionTreeRegressorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.impl.TreeTests import org.apache.spark.ml.util.MLTestingUtils import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree, DecisionTreeSuite => OldDecisionTreeSuite} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame class DecisionTreeRegressorSuite extends SparkFunSuite with MLlibTestSparkContext { import DecisionTreeRegressorSuite.compareAPIs private var categoricalDataPointsRDD: RDD[LabeledPoint] = _ override def beforeAll() { super.beforeAll() categoricalDataPointsRDD = sc.parallelize(OldDecisionTreeSuite.generateCategoricalDataPoints()) } ///////////////////////////////////////////////////////////////////////////// // Tests calling train() ///////////////////////////////////////////////////////////////////////////// test("Regression stump with 3-ary (ordered) categorical features") { val dt = new DecisionTreeRegressor() .setImpurity("variance") .setMaxDepth(2) .setMaxBins(100) .setSeed(1) val categoricalFeatures = Map(0 -> 3, 1-> 3) compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures) } test("Regression stump with binary (ordered) categorical features") { val dt = new DecisionTreeRegressor() .setImpurity("variance") .setMaxDepth(2) .setMaxBins(100) val categoricalFeatures = Map(0 -> 2, 1-> 2) compareAPIs(categoricalDataPointsRDD, dt, categoricalFeatures) } test("copied model must have the same parent") { val categoricalFeatures = Map(0 -> 2, 1-> 2) val df = TreeTests.setMetadata(categoricalDataPointsRDD, categoricalFeatures, numClasses = 0) val model = new DecisionTreeRegressor() .setImpurity("variance") .setMaxDepth(2) .setMaxBins(8).fit(df) MLTestingUtils.checkCopy(model) } ///////////////////////////////////////////////////////////////////////////// // Tests of model save/load ///////////////////////////////////////////////////////////////////////////// // TODO: test("model save/load") SPARK-6725 } private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite { def compareAPIs( data: RDD[LabeledPoint], dt: DecisionTreeRegressor, categoricalFeatures: Map[Int, Int]): Unit = { val numFeatures = data.first().features.size val oldStrategy = dt.getOldStrategy(categoricalFeatures) val oldTree = OldDecisionTree.train(data, oldStrategy) val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0) val newTree = dt.fit(newData) // Use parent from newTree since this is not checked anyways. val oldTreeAsNew = DecisionTreeRegressionModel.fromOld( oldTree, newTree.parent.asInstanceOf[DecisionTreeRegressor], categoricalFeatures) TreeTests.checkEqual(oldTreeAsNew, newTree) assert(newTree.numFeatures === numFeatures) } }
Example 189
Source File: ChiSqSelectorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { test("ChiSqSelector transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(6.0))), LabeledPoint(1.0, Vectors.dense(Array(8.0))), LabeledPoint(2.0, Vectors.dense(Array(5.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } test("model load / save") { val model = ChiSqSelectorSuite.createModel() val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString try { model.save(sc, path) val sameModel = ChiSqSelectorModel.load(sc, path) ChiSqSelectorSuite.checkEqual(model, sameModel) } finally { Utils.deleteRecursively(tempDir) } } } object ChiSqSelectorSuite extends SparkFunSuite { def createModel(): ChiSqSelectorModel = { val arr = Array(1, 2, 3, 4) new ChiSqSelectorModel(arr) } def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = { assert(a.selectedFeatures.deep == b.selectedFeatures.deep) } }
Example 190
Source File: EnsembleTestHelper.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter import scala.collection.mutable object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 191
Source File: PythonMLLibAPISuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, SparseMatrix} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.recommendation.Rating class PythonMLLibAPISuite extends SparkFunSuite { SerDe.initialize() test("pickle vector") { val vectors = Seq( Vectors.dense(Array.empty[Double]), Vectors.dense(0.0), Vectors.dense(0.0, -2.0), Vectors.sparse(0, Array.empty[Int], Array.empty[Double]), Vectors.sparse(1, Array.empty[Int], Array.empty[Double]), Vectors.sparse(2, Array(1), Array(-2.0))) vectors.foreach { v => val u = SerDe.loads(SerDe.dumps(v)) assert(u.getClass === v.getClass) assert(u === v) } } test("pickle labeled point") { val points = Seq( LabeledPoint(0.0, Vectors.dense(Array.empty[Double])), LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)), LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])), LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])), LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0)))) points.foreach { p => val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint] assert(q.label === p.label) assert(q.features.getClass === p.features.getClass) assert(q.features === p.features) } } test("pickle double") { for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) { val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double] // We use `equals` here for comparison because we cannot use `==` for NaN assert(x.equals(deser)) } } test("pickle matrix") { val values = Array[Double](0, 1.2, 3, 4.56, 7, 8) val matrix = Matrices.dense(2, 3, values) val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix] assert(matrix === nm) // Test conversion for empty matrix val empty = Array[Double]() val emptyMatrix = Matrices.dense(0, 0, empty) val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4)) val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix] assert(sm.toArray === nsm.toArray) val smt = new SparseMatrix( 3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9), isTransposed = true) val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix] assert(smt.toArray === nsmt.toArray) } test("pickle rating") { val rat = new Rating(1, 2, 3.0) val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating] assert(rat == rat2) // Test name of class only occur once val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray val bytes = SerDe.dumps(rats) assert(bytes.toString.split("Rating").length == 1) assert(bytes.length / 10 < 25) // 25 bytes per rating } }
Example 192
Source File: HogHBaseReputation.scala From hogzilla with GNU General Public License v2.0 | 5 votes |
package org.hogzilla.hbase import scala.math.random import java.lang.Math import org.apache.spark._ import org.apache.hadoop.hbase.client.HBaseAdmin import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName} import org.apache.hadoop.hbase.mapreduce.TableInputFormat import org.apache.spark.mllib.regression.{LabeledPoint,LinearRegressionModel,LinearRegressionWithSGD} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.hadoop.hbase.client.HTable import org.apache.hadoop.hbase.filter.SingleColumnValueFilter import org.apache.hadoop.hbase.filter.BinaryComparator import org.apache.hadoop.hbase.filter.FilterList import org.apache.hadoop.hbase.filter.CompareFilter import java.util.ArrayList import org.apache.hadoop.hbase.client.Scan import org.apache.hadoop.hbase.filter.Filter import scala.collection.mutable.HashSet import org.apache.hadoop.hbase.client.Put object HogHBaseReputation { // Ex: MX, whitelist def getReputationList(listName:String, listType:String):Set[String] = { val list = new HashSet[String] val filters: ArrayList[Filter] = new ArrayList(); val colValFilter1 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list_type"), CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listType))) colValFilter1.setFilterIfMissing(false); val colValFilter2 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list"), CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listName))) colValFilter2.setFilterIfMissing(false); filters.add(colValFilter1); filters.add(colValFilter2); val filterList = new FilterList( FilterList.Operator.MUST_PASS_ALL, filters); val scan = new Scan() scan.setFilter(filterList) val it = HogHBaseRDD.hogzilla_reputation.getScanner(scan).iterator() while(it.hasNext()) { list.add( Bytes.toString(it.next().getValue(Bytes.toBytes("rep"),Bytes.toBytes("ip"))) ) } list.toSet } def saveReputationList(listName:String, listType:String, ip:String) = { val put = new Put(Bytes.toBytes(ip+"-"+listName+"-"+listType)) put.add(Bytes.toBytes("rep"), Bytes.toBytes("list_type"), Bytes.toBytes(listType)) put.add(Bytes.toBytes("rep"), Bytes.toBytes("list"), Bytes.toBytes(listName)) put.add(Bytes.toBytes("rep"), Bytes.toBytes("ip"), Bytes.toBytes(ip)) HogHBaseRDD.hogzilla_reputation.put(put) } }
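A short, hypothetical usage sketch for the helper above: persisting one IP address into a named reputation list and reading the whole list back. The list name, list type, and address are made up for illustration; the HBase table layout is whatever HogHBaseRDD has already configured.

// Add an address to the "MX" whitelist, then fetch the whitelist as a Set[String].
HogHBaseReputation.saveReputationList("MX", "whitelist", "10.1.2.3")
val whitelisted: Set[String] = HogHBaseReputation.getReputationList("MX", "whitelist")
if (whitelisted.contains("10.1.2.3")) println("address is whitelisted")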