org.apache.spark.mllib.linalg.Vectors Scala Examples
The following examples show how to use org.apache.spark.mllib.linalg.Vectors.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: DataFrameExample.scala From drizzle-spark with Apache License 2.0 | 7 votes |
// scalastyle:off println package org.apache.spark.examples.ml import java.io.File import scopt.OptionParser import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.util.Utils object DataFrameExample { case class Params(input: String = "data/mllib/sample_libsvm_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") .text(s"input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val spark = SparkSession .builder .appName(s"DataFrameExample with $params") .getOrCreate() // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") val df: DataFrame = spark.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") // Show statistical summary of labels. val labelSummary = df.describe("label") labelSummary.show() // Convert features column to an RDD of vectors. val features = df.select("features").rdd.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(Vectors.fromML(feat)), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") // Save the records in a parquet file. val tmpDir = Utils.createTempDir() val outputDir = new File(tmpDir, "dataframe").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) println(s"Schema from Parquet:") newDF.printSchema() spark.stop() } } // scalastyle:on println
Example 2
Source File: SummaryStatisticsExample.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} // $example off$ object SummaryStatisticsExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SummaryStatisticsExample") val sc = new SparkContext(conf) // $example on$ val observations = sc.parallelize( Seq( Vectors.dense(1.0, 10.0, 100.0), Vectors.dense(2.0, 20.0, 200.0), Vectors.dense(3.0, 30.0, 300.0) ) ) // Compute column summary statistics. val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) println(summary.mean) // a dense vector containing the mean value for each column println(summary.variance) // column-wise variance println(summary.numNonzeros) // number of nonzeros in each column // $example off$ sc.stop() } } // scalastyle:on println
Example 3
Source File: DenseKMeans.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 4
Source File: SparkIntroduction.scala From reactive-machine-learning-systems with MIT License | 6 votes |
package com.reactivemachinelearning import org.apache.spark.sql.SparkSession import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} import org.apache.spark.mllib.linalg.Vectors object SparkIntroduction { def main(args: Array[String]) { // handle args // setup val session = SparkSession.builder.appName("Simple ModelExample").getOrCreate() import session.implicits._ // Load and parse the train and test data val inputBasePath = "example_data" val outputBasePath = "." val trainingDataPath = inputBasePath + "/training.txt" val testingDataPath = inputBasePath + "/testing.txt" val currentOutputPath = outputBasePath + System.currentTimeMillis() val trainingData = session.read.textFile(trainingDataPath) val trainingParsed = trainingData.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val testingData = session.read.textFile(testingDataPath) val testingParsed = testingData.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // Building the model val numIterations = 100 val model = LinearRegressionWithSGD.train(trainingParsed.rdd, numIterations) // Evaluate model on testing examples val predictionsAndLabels = testingParsed.map { case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } // Report performance statistics val metrics = new MulticlassMetrics(predictionsAndLabels.rdd) val precision = metrics.precision val recall = metrics.recall println(s"Precision: $precision Recall: $recall") // Save model model.save(session.sparkContext, currentOutputPath) } }
Example 5
import java.io.{File, PrintWriter} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.clustering.GaussianMixture import org.apache.spark.sql.functions._ def computeGaussianMixtureModel( pathToTextFile: String, quantity: Int) { case class Point(x: Double, y: Double) def save(f: File)(func: PrintWriter => Unit) { val p = new PrintWriter(f) try { func(p) } finally { p.close() } } val filename = pathToTextFile.split("\\.")(0) val outputFilename = s"$filename-GMM-k${quantity}.tsv" val points = sc .textFile(pathToTextFile) .map { line => line.trim.split("\\s+") } .map { row => Point(row(0).toDouble, row(1).toDouble) } val features = points .map { p => Vectors.dense(p.x, p.y) } features.cache() val gmm = new GaussianMixture() .setK(quantity) .run(features) val predictions = features .map { f => (f(0), f(1), gmm.predict(f) + 1) } .collect save(new File(outputFilename)) { println(s"OUTPUT TO: ${outputFilename}") f => predictions.foreach{ case (x, y, ccid) => f.println(s"${x}\t${y}\t${ccid}") } } }
Example 6
Source File: PCAOnSourceVectorExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD // $example off$ object PCAOnSourceVectorExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnSourceVectorExample") val sc = new SparkContext(conf) // $example on$ val data: RDD[LabeledPoint] = sc.parallelize(Seq( new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)), new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)))) // Compute the top 5 principal components. val pca = new PCA(5).fit(data.map(_.features)) // Project vectors to the linear space spanned by the top 5 principal // components, keeping the label val projected = data.map(p => p.copy(features = pca.transform(p.features))) // $example off$ val collect = projected.collect() println("Projected vector of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 7
Source File: PCAOnRowMatrixExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object PCAOnRowMatrixExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnRowMatrixExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 4 principal components. // Principal components are stored in a local dense matrix. val pc: Matrix = mat.computePrincipalComponents(4) // Project the rows to the linear space spanned by the top 4 principal components. val projected: RowMatrix = mat.multiply(pc) // $example off$ val collect = projected.rows.collect() println("Projected Row Matrix of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 8
Source File: TallSkinnyPCA.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnyPCA { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnyPCA <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnyPCA") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) println("Principal components are:\n" + pc) sc.stop() } } // scalastyle:on println
Example 9
Source File: GaussianMixtureExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object GaussianMixtureExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("GaussianMixtureExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/gmm_data.txt") val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using GaussianMixture val gmm = new GaussianMixture().setK(2).run(parsedData) // Save and load model gmm.save(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") val sameModel = GaussianMixtureModel.load(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") // output parameters of max-likelihood model for (i <- 0 until gmm.k) { println("weight=%f\nmu=%s\nsigma=\n%s\n" format (gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma)) } // $example off$ sc.stop() } } // scalastyle:on println
Example 10
Source File: PCAExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} // $example off$ @deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0") object PCAExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAExample") val sc = new SparkContext(conf) // $example on$ val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) val numIterations = 100 val model = LinearRegressionWithSGD.train(training, numIterations) val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) val valuesAndPreds = test.map { point => val score = model.predict(point.features) (score, point.label) } val valuesAndPreds_pca = test_pca.map { point => val score = model_pca.predict(point.features) (score, point.label) } val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() println("Mean Squared Error = " + MSE) println("PCA Mean Squared Error = " + MSE_pca) // $example off$ sc.stop() } } // scalastyle:on println
Example 11
Source File: CosineSimilarity.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} object CosineSimilarity { case class Params(inputFile: String = null, threshold: Double = 0.1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("CosineSimilarity") { head("CosineSimilarity: an example app.") opt[Double]("threshold") .required() .text(s"threshold similarity: to tradeoff computation vs quality estimate") .action((x, c) => c.copy(threshold = x)) arg[String]("<inputFile>") .required() .text(s"input file, one row per line, space-separated") .action((x, c) => c.copy(inputFile = x)) note( """ |For example, the following command runs this app on a dataset: | | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \ | examplesjar.jar \ | --threshold 0.1 data/mllib/sample_svm_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName("CosineSimilarity") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(params.inputFile).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) }.cache() val mat = new RowMatrix(rows) // Compute similar columns perfectly, with brute force. val exact = mat.columnSimilarities() // Compute similar columns with estimation using DIMSUM val approx = mat.columnSimilarities(params.threshold) val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) } val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) } val MAE = exactEntries.leftOuterJoin(approxEntries).values.map { case (u, Some(v)) => math.abs(u - v) case (u, None) => math.abs(u) }.mean() println(s"Average absolute error in estimate is: $MAE") sc.stop() } } // scalastyle:on println
Example 12
Source File: ElementwiseProductExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.ElementwiseProduct import org.apache.spark.mllib.linalg.Vectors // $example off$ object ElementwiseProductExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("ElementwiseProductExample") val sc = new SparkContext(conf) // $example on$ // Create some vector data; also works for sparse vectors val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))) val transformingVector = Vectors.dense(0.0, 1.0, 2.0) val transformer = new ElementwiseProduct(transformingVector) // Batch transform and per-row transform give the same results: val transformedData = transformer.transform(data) val transformedData2 = data.map(x => transformer.transform(x)) // $example off$ println("transformedData: ") transformedData.foreach(x => println(x)) println("transformedData2: ") transformedData2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 13
Source File: SVDExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") } } // scalastyle:on println
Example 14
Source File: TallSkinnySVD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnySVD { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnySVD <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnySVD") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println("Singular values are " + svd.s) sc.stop() } } // scalastyle:on println
Example 15
Source File: StandardScalerExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils // $example off$ object StandardScalerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("StandardScalerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 16
Source File: KMeansExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object KMeansExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("KMeansExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/kmeans_data.txt") val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using KMeans val numClusters = 2 val numIterations = 20 val clusters = KMeans.train(parsedData, numClusters, numIterations) // Evaluate clustering by computing Within Set Sum of Squared Errors val WSSSE = clusters.computeCost(parsedData) println("Within Set Sum of Squared Errors = " + WSSSE) // Save and load model clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel") val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel") // $example off$ sc.stop() } } // scalastyle:on println
Example 17
Source File: MultivariateSummarizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Summarize labels val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(Vectors.dense(lp.label)), (sum1, sum2) => sum1.merge(sum2)) // Summarize features val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(lp.features), (sum1, sum2) => sum1.merge(sum2)) println() println(s"Summary statistics") println(s"\tLabel\tFeatures") println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}") println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}") println( s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}") println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}") println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}") println() sc.stop() } } // scalastyle:on println
Example 18
Source File: LinearRegressionWithSGDExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD // $example off$ @deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0") object LinearRegressionWithSGDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/ridge-data/lpsa.data") val parsedData = data.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // Building the model val numIterations = 100 val stepSize = 0.00000001 val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) // Evaluate model on training examples and compute training error val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() println("training Mean Squared Error = " + MSE) // Save and load model model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel") val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel") // $example off$ sc.stop() } } // scalastyle:on println
Example 19
Source File: StreamingLinearRegressionExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD // $example off$ import org.apache.spark.streaming._ object StreamingLinearRegressionExample { def main(args: Array[String]): Unit = { if (args.length != 2) { System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>") System.exit(1) } val conf = new SparkConf().setAppName("StreamingLinearRegressionExample") val ssc = new StreamingContext(conf, Seconds(1)) // $example on$ val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache() val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val numFeatures = 3 val model = new StreamingLinearRegressionWithSGD() .setInitialWeights(Vectors.zeros(numFeatures)) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ ssc.stop() } } // scalastyle:on println
Example 20
Source File: BisectingKMeansExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib // scalastyle:off println import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.BisectingKMeans import org.apache.spark.mllib.linalg.{Vector, Vectors} // $example off$ object BisectingKMeansExample { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample") val sc = new SparkContext(sparkConf) // $example on$ // Loads and parses data def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble)) val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache() // Clustering the data into 6 clusters by BisectingKMeans. val bkm = new BisectingKMeans().setK(6) val model = bkm.run(data) // Show the compute cost and the cluster centers println(s"Compute Cost: ${model.computeCost(data)}") model.clusterCenters.zipWithIndex.foreach { case (center, idx) => println(s"Cluster Center ${idx}: ${center}") } // $example off$ sc.stop() } } // scalastyle:on println
Example 21
Source File: StreamingKMeansExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} // $example off$ object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } // $example on$ val conf = new SparkConf().setAppName("StreamingKMeansExample") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(args(3).toInt) .setDecayFactor(1.0) .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ } } // scalastyle:on println
Example 22
Source File: HashingTF.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import java.lang.{Iterable => JavaIterable} import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.unsafe.hash.Murmur3_x86_32._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils private[spark] def murmur3Hash(term: Any): Int = { term match { case null => seed case b: Boolean => hashInt(if (b) 1 else 0, seed) case b: Byte => hashInt(b, seed) case s: Short => hashInt(s, seed) case i: Int => hashInt(i, seed) case l: Long => hashLong(l, seed) case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed) case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) case s: String => val utf8 = UTF8String.fromString(s) hashUnsafeBytes(utf8.getBaseObject, utf8.getBaseOffset, utf8.numBytes(), seed) case _ => throw new SparkException("HashingTF with murmur3 algorithm does not " + s"support type ${term.getClass.getCanonicalName} of input data.") } } }
Example 23
Source File: Normalizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 24
Source File: GaussianMixtureModelWrapper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import scala.collection.JavaConverters import org.apache.spark.SparkContext import org.apache.spark.mllib.clustering.GaussianMixtureModel import org.apache.spark.mllib.linalg.{Vector, Vectors} val gaussians: Array[Byte] = { val modelGaussians = model.gaussians.map { gaussian => Array[Any](gaussian.mu, gaussian.sigma) } SerDe.dumps(JavaConverters.seqAsJavaListConverter(modelGaussians).asJava) } def predictSoft(point: Vector): Vector = { Vectors.dense(model.predictSoft(point)) } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 25
Source File: Word2VecModelWrapper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import java.util.{List => JList, Map => JMap} import scala.collection.JavaConverters._ import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.feature.Word2VecModel import org.apache.spark.mllib.linalg.{Vector, Vectors} def findSynonyms(vector: Vector, num: Int): JList[Object] = { prepareResult(model.findSynonyms(vector, num)) } private def prepareResult(result: Array[(String, Double)]) = { val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } def getVectors: JMap[String, JList[Float]] = { model.getVectors.map { case (k, v) => (k, v.toList.asJava) }.asJava } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 26
Source File: MatrixFactorizationModelWrapper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating} import org.apache.spark.rdd.RDD private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel) extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) { def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] = predict(SerDe.asTupleRDD(userAndProducts.rdd)) def getUserFeatures: RDD[Array[Any]] = { SerDe.fromTuple2RDD(userFeatures.map { case (user, feature) => (user, Vectors.dense(feature)) }.asInstanceOf[RDD[(Any, Any)]]) } def getProductFeatures: RDD[Array[Any]] = { SerDe.fromTuple2RDD(productFeatures.map { case (product, feature) => (product, Vectors.dense(feature)) }.asInstanceOf[RDD[(Any, Any)]]) } def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = { SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]]) } def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = { SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]]) } }
Example 27
Source File: MultivariateGaussian.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution import breeze.linalg.{diag, eigSym, max, DenseMatrix => DBM, DenseVector => DBV, Vector => BV} import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} import org.apache.spark.mllib.util.MLUtils private def calculateCovarianceConstants: (DBM[Double], Double) = { val eigSym.EigSym(d, u) = eigSym(sigma.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t // For numerical stability, values are considered to be non-zero only if they exceed tol. // This prevents any inverted value from exceeding (eps * n * max(d))^-1 val tol = MLUtils.EPSILON * max(d) * d.length try { // log(pseudo-determinant) is sum of the logs of all non-zero singular values val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum // calculate the root-pseudo-inverse of the diagonal matrix of singular values // by inverting the square root of all non-zero values val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray)) (pinvS * u.t, -0.5 * (mu.size * math.log(2.0 * math.Pi) + logPseudoDetSigma)) } catch { case uex: UnsupportedOperationException => throw new IllegalArgumentException("Covariance matrix has no non-zero singular values") } } }
Example 28
Source File: SpearmanCorrelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Example 29
Source File: Updater.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization import scala.math._ import breeze.linalg.{axpy => brzAxpy, norm => brzNorm, Vector => BV} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.mllib.linalg.{Vector, Vectors} @DeveloperApi class SquaredL2Updater extends Updater { override def compute( weightsOld: Vector, gradient: Vector, stepSize: Double, iter: Int, regParam: Double): (Vector, Double) = { // add up both updates from the gradient of the loss (= step) as well as // the gradient of the regularizer (= regParam * weightsOld) // w' = w - thisIterStepSize * (gradient + regParam * w) // w' = (1 - thisIterStepSize * regParam) * w - thisIterStepSize * gradient val thisIterStepSize = stepSize / math.sqrt(iter) val brzWeights: BV[Double] = weightsOld.asBreeze.toDenseVector brzWeights :*= (1.0 - thisIterStepSize * regParam) brzAxpy(-thisIterStepSize, gradient.asBreeze, brzWeights) val norm = brzNorm(brzWeights, 2.0) (Vectors.fromBreeze(brzWeights), 0.5 * regParam * norm * norm) } }
Example 30
Source File: LogisticRegressionDataGenerator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @Since("0.8.0") def generateLogisticRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } @Since("0.8.0") def main(args: Array[String]) { if (args.length != 5) { // scalastyle:off println println("Usage: LogisticRegressionGenerator " + "<master> <output_dir> <num_examples> <num_features> <num_partitions>") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val eps = 3 val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator") val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts) data.saveAsTextFile(outputPath) sc.stop() } }
Example 31
Source File: SVMDataGenerator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object SVMDataGenerator { @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 32
Source File: LabeledPoint.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = { LabeledPoint(point.label, Vectors.fromML(point.features)) } }
Example 33
Source File: ChiSqSelectorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { test("ChiSqSelector transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(8.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } test("ChiSqSelector by FPR transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2) val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(2.0, Vectors.dense(Array(9.0)))) val model = new ChiSqSelector().setSelectorType("fpr").setAlpha(0.1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } test("model load / save") { val model = ChiSqSelectorSuite.createModel() val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString try { model.save(sc, path) val sameModel = ChiSqSelectorModel.load(sc, path) ChiSqSelectorSuite.checkEqual(model, sameModel) } finally { Utils.deleteRecursively(tempDir) } } } object ChiSqSelectorSuite extends SparkFunSuite { def createModel(): ChiSqSelectorModel = { val arr = Array(1, 2, 3, 4) new ChiSqSelectorModel(arr) } def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = { assert(a.selectedFeatures.deep == b.selectedFeatures.deep) } }
Example 34
Source File: ElementwiseProductSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 35
Source File: IDFSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 36
Source File: PCASuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } }
Example 37
Source File: HashingTFSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext { test("hashing tf on a single doc") { val hashingTF = new HashingTF(1000) val doc = "a a b b c d".split(" ") val n = hashingTF.numFeatures val termFreqs = Seq( (hashingTF.indexOf("a"), 2.0), (hashingTF.indexOf("b"), 2.0), (hashingTF.indexOf("c"), 1.0), (hashingTF.indexOf("d"), 1.0)) assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n), "index must be in range [0, #features)") assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing") val expected = Vectors.sparse(n, termFreqs) assert(hashingTF.transform(doc) === expected) } test("hashing tf on an RDD") { val hashingTF = new HashingTF val localDocs: Seq[Seq[String]] = Seq( "a a b b b c d".split(" "), "a b c d a b c".split(" "), "c b a c b a a".split(" ")) val docs = sc.parallelize(localDocs, 2) assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet) } test("applying binary term freqs") { val hashingTF = new HashingTF(100).setBinary(true) val doc = "a a b c c c".split(" ") val n = hashingTF.numFeatures val expected = Vectors.sparse(n, Seq( (hashingTF.indexOf("a"), 1.0), (hashingTF.indexOf("b"), 1.0), (hashingTF.indexOf("c"), 1.0))) assert(hashingTF.transform(doc) ~== expected absTol 1e-14) } }
Example 38
Source File: EnsembleTestHelper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 39
Source File: PythonMLLibAPISuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors} import org.apache.spark.mllib.recommendation.Rating import org.apache.spark.mllib.regression.LabeledPoint class PythonMLLibAPISuite extends SparkFunSuite { SerDe.initialize() test("pickle vector") { val vectors = Seq( Vectors.dense(Array.empty[Double]), Vectors.dense(0.0), Vectors.dense(0.0, -2.0), Vectors.sparse(0, Array.empty[Int], Array.empty[Double]), Vectors.sparse(1, Array.empty[Int], Array.empty[Double]), Vectors.sparse(2, Array(1), Array(-2.0))) vectors.foreach { v => val u = SerDe.loads(SerDe.dumps(v)) assert(u.getClass === v.getClass) assert(u === v) } } test("pickle labeled point") { val points = Seq( LabeledPoint(0.0, Vectors.dense(Array.empty[Double])), LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)), LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])), LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])), LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0)))) points.foreach { p => val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint] assert(q.label === p.label) assert(q.features.getClass === p.features.getClass) assert(q.features === p.features) } } test("pickle double") { for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) { val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double] // We use `equals` here for comparison because we cannot use `==` for NaN assert(x.equals(deser)) } } test("pickle matrix") { val values = Array[Double](0, 1.2, 3, 4.56, 7, 8) val matrix = Matrices.dense(2, 3, values) val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix] assert(matrix === nm) // Test conversion for empty matrix val empty = Array.empty[Double] val emptyMatrix = Matrices.dense(0, 0, empty) val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4)) val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix] assert(sm.toArray === nsm.toArray) val smt = new SparseMatrix( 3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9), isTransposed = true) val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix] assert(smt.toArray === nsmt.toArray) } test("pickle rating") { val rat = new Rating(1, 2, 3.0) val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating] assert(rat == rat2) // Test name of class only occur once val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray val bytes = SerDe.dumps(rats) assert(bytes.toString.split("Rating").length == 1) assert(bytes.length / 10 < 25) // 25 bytes per rating } }
Example 40
Source File: MultivariateGaussianSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Matrices, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) val mu = Vectors.dense(0.0) val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) } test("SPARK-11302") { val x = Vectors.dense(629, 640, 1.7188, 618.19) val mu = Vectors.dense( 1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697) val sigma = Matrices.dense(4, 4, Array( 166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053, 169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484, 12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373, 164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207)) val dist = new MultivariateGaussian(mu, sigma) // Agrees with R's dmvnorm: 7.154782e-05 assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9) } }
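MultivariateGaussian implements the multivariate normal density $f(x) = (2\pi)^{-d/2} \lvert\Sigma\rvert^{-1/2} \exp\big(-\tfrac{1}{2}(x-\mu)^\top \Sigma^{-1} (x-\mu)\big)$. The first univariate assertion is simply $1/\sqrt{2\pi} \approx 0.39894$, and the degenerate test relies on the implementation handling a singular $\Sigma$ through a pseudo-inverse and pseudo-determinant.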
Example 41
Source File: KMeansPMMLModelExportSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export import org.dmg.pmml.ClusteringModel import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.clustering.KMeansModel import org.apache.spark.mllib.linalg.Vectors class KMeansPMMLModelExportSuite extends SparkFunSuite { test("KMeansPMMLModelExport generate PMML format") { val clusterCenters = Array( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0)) val kmeansModel = new KMeansModel(clusterCenters) val modelExport = PMMLModelExportFactory.createPMMLModelExport(kmeansModel) // assert that the PMML format is as expected assert(modelExport.isInstanceOf[PMMLModelExport]) val pmml = modelExport.asInstanceOf[PMMLModelExport].getPmml assert(pmml.getHeader.getDescription === "k-means clustering") // check that the number of fields match the single vector size assert(pmml.getDataDictionary.getNumberOfFields === clusterCenters(0).size) // This verify that there is a model attached to the pmml object and the model is a clustering // one. It also verifies that the pmml model has the same number of clusters of the spark model. val pmmlClusteringModel = pmml.getModels.get(0).asInstanceOf[ClusteringModel] assert(pmmlClusteringModel.getNumberOfClusters === clusterCenters.length) } }
Example 42
Source File: PMMLModelExportFactorySuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel} import org.apache.spark.mllib.clustering.KMeansModel import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel} import org.apache.spark.mllib.util.LinearDataGenerator class PMMLModelExportFactorySuite extends SparkFunSuite { test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") { val clusterCenters = Array( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0)) val kmeansModel = new KMeansModel(clusterCenters) val modelExport = PMMLModelExportFactory.createPMMLModelExport(kmeansModel) assert(modelExport.isInstanceOf[KMeansPMMLModelExport]) } test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a " + "LinearRegressionModel, RidgeRegressionModel or LassoModel") { val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17) val linearRegressionModel = new LinearRegressionModel(linearInput(0).features, linearInput(0).label) val linearModelExport = PMMLModelExportFactory.createPMMLModelExport(linearRegressionModel) assert(linearModelExport.isInstanceOf[GeneralizedLinearPMMLModelExport]) val ridgeRegressionModel = new RidgeRegressionModel(linearInput(0).features, linearInput(0).label) val ridgeModelExport = PMMLModelExportFactory.createPMMLModelExport(ridgeRegressionModel) assert(ridgeModelExport.isInstanceOf[GeneralizedLinearPMMLModelExport]) val lassoModel = new LassoModel(linearInput(0).features, linearInput(0).label) val lassoModelExport = PMMLModelExportFactory.createPMMLModelExport(lassoModel) assert(lassoModelExport.isInstanceOf[GeneralizedLinearPMMLModelExport]) } test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport " + "when passing a LogisticRegressionModel or SVMModel") { val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17) val logisticRegressionModel = new LogisticRegressionModel(linearInput(0).features, linearInput(0).label) val logisticRegressionModelExport = PMMLModelExportFactory.createPMMLModelExport(logisticRegressionModel) assert(logisticRegressionModelExport.isInstanceOf[BinaryClassificationPMMLModelExport]) val svmModel = new SVMModel(linearInput(0).features, linearInput(0).label) val svmModelExport = PMMLModelExportFactory.createPMMLModelExport(svmModel) assert(svmModelExport.isInstanceOf[BinaryClassificationPMMLModelExport]) } test("PMMLModelExportFactory throw IllegalArgumentException " + "when passing a Multinomial Logistic Regression") { val multiclassLogisticRegressionModel = new LogisticRegressionModel( weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0, numFeatures = 2, numClasses = 3) intercept[IllegalArgumentException] { PMMLModelExportFactory.createPMMLModelExport(multiclassLogisticRegressionModel) } } test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") { val invalidModel = new Object intercept[IllegalArgumentException] { PMMLModelExportFactory.createPMMLModelExport(invalidModel) } } }
Example 43
Source File: CoordinateMatrixSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { val m = 5 val n = 4 var mat: CoordinateMatrix = _ override def beforeAll() { super.beforeAll() val entries = sc.parallelize(Seq( (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0), (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map { case (i, j, value) => MatrixEntry(i, j, value) } mat = new CoordinateMatrix(entries) } test("size") { assert(mat.numRows() === m) assert(mat.numCols() === n) } test("empty entries") { val entries = sc.parallelize(Seq[MatrixEntry](), 1) val emptyMat = new CoordinateMatrix(entries) intercept[RuntimeException] { emptyMat.numCols() } intercept[RuntimeException] { emptyMat.numRows() } } test("toBreeze") { val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(mat.toBreeze() === expected) } test("transpose") { val transposed = mat.transpose() assert(mat.toBreeze().t === transposed.toBreeze()) } test("toIndexedRowMatrix") { val indexedRowMatrix = mat.toIndexedRowMatrix() val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(indexedRowMatrix.toBreeze() === expected) } test("toRowMatrix") { val rowMatrix = mat.toRowMatrix() val rows = rowMatrix.rows.collect().toSet val expected = Set( Vectors.dense(1.0, 2.0, 0.0, 0.0), Vectors.dense(0.0, 3.0, 4.0, 0.0), Vectors.dense(0.0, 0.0, 5.0, 6.0), Vectors.dense(7.0, 0.0, 0.0, 8.0), Vectors.dense(0.0, 9.0, 0.0, 0.0)) assert(rows === expected) } test("toBlockMatrix") { val blockMat = mat.toBlockMatrix(2, 2) assert(blockMat.numRows() === m) assert(blockMat.numCols() === n) assert(blockMat.toBreeze() === mat.toBreeze()) intercept[IllegalArgumentException] { mat.toBlockMatrix(-1, 2) } intercept[IllegalArgumentException] { mat.toBlockMatrix(2, 0) } } }
Example 44
Source File: LabeledPointSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.Vectors class LabeledPointSuite extends SparkFunSuite { test("parse labeled points") { val points = Seq( LabeledPoint(1.0, Vectors.dense(1.0, 0.0)), LabeledPoint(0.0, Vectors.sparse(2, Array(1), Array(-1.0)))) points.foreach { p => assert(p === LabeledPoint.parse(p.toString)) } } test("parse labeled points with whitespaces") { val point = LabeledPoint.parse("(0.0, [1.0, 2.0])") assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0))) } test("parse labeled points with v0.9 format") { val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0") assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0))) } test("conversions between new ml LabeledPoint and mllib LabeledPoint") { val points: Seq[LabeledPoint] = Seq( LabeledPoint(1.0, Vectors.dense(1.0, 0.0)), LabeledPoint(0.0, Vectors.sparse(2, Array(1), Array(-1.0)))) val newPoints: Seq[NewLabeledPoint] = points.map(_.asML) points.zip(newPoints).foreach { case (p1, p2) => assert(p1 === LabeledPoint.fromML(p2)) } } }
Example 45
Source File: RidgeRegressionSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.{LinearDataGenerator, LocalClusterSparkContext, MLlibTestSparkContext} import org.apache.spark.util.Utils private object RidgeRegressionSuite { val model = new RidgeRegressionModel(weights = Vectors.dense(0.1, 0.2, 0.3), intercept = 0.5) } class RidgeRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { def predictionError(predictions: Seq[Double], input: Seq[LabeledPoint]): Double = { predictions.zip(input).map { case (prediction, expected) => (prediction - expected.label) * (prediction - expected.label) }.sum / predictions.size } test("ridge regression can help avoid overfitting") { // For small number of examples and large variance of error distribution, // ridge regression should give smaller generalization error that linear regression. val numExamples = 50 val numFeatures = 20 // Pick weights as random values distributed uniformly in [-0.5, 0.5] val random = new Random(42) val w = Array.fill(numFeatures)(random.nextDouble() - 0.5) // Use half of data for training and other half for validation val data = LinearDataGenerator.generateLinearInput(3.0, w, 2 * numExamples, 42, 10.0) val testData = data.take(numExamples) val validationData = data.takeRight(numExamples) val testRDD = sc.parallelize(testData, 2).cache() val validationRDD = sc.parallelize(validationData, 2).cache() // First run without regularization. val linearReg = new LinearRegressionWithSGD() linearReg.optimizer.setNumIterations(200) .setStepSize(1.0) val linearModel = linearReg.run(testRDD) val linearErr = predictionError( linearModel.predict(validationRDD.map(_.features)).collect(), validationData) val ridgeReg = new RidgeRegressionWithSGD() ridgeReg.optimizer.setNumIterations(200) .setRegParam(0.1) .setStepSize(1.0) val ridgeModel = ridgeReg.run(testRDD) val ridgeErr = predictionError( ridgeModel.predict(validationRDD.map(_.features)).collect(), validationData) // Ridge validation error should be lower than linear regression. assert(ridgeErr < linearErr, "ridgeError (" + ridgeErr + ") was not less than linearError(" + linearErr + ")") } test("model save/load") { val model = RidgeRegressionSuite.model val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString // Save model, load it back, and compare. try { model.save(sc, path) val sameModel = RidgeRegressionModel.load(sc, path) assert(model.weights == sameModel.weights) assert(model.intercept == sameModel.intercept) } finally { Utils.deleteRecursively(tempDir) } } } class RidgeRegressionClusterSuite extends SparkFunSuite with LocalClusterSparkContext { test("task size should be small in both training and prediction") { val m = 4 val n = 200000 val points = sc.parallelize(0 until m, 2).mapPartitionsWithIndex { (idx, iter) => val random = new Random(idx) iter.map(i => LabeledPoint(1.0, Vectors.dense(Array.fill(n)(random.nextDouble())))) }.cache() // If we serialize data directly in the task closure, the size of the serialized task would be // greater than 1MB and hence Spark would throw an error. val model = RidgeRegressionWithSGD.train(points, 2) val predictions = model.predict(points.map(_.features)) } }
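Why the test expects ridgeErr to be lower than linearErr: the L2 penalty shrinks the weights and reduces variance on this small, noisy training set. Up to MLlib's exact scaling constants, the objective being minimized is roughly $\min_w \frac{1}{n}\sum_i \tfrac{1}{2}(w^\top x_i - y_i)^2 + \tfrac{\lambda}{2}\lVert w \rVert_2^2$, with $\lambda$ supplied through setRegParam.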
Example 46
Source File: TestMPSLinearProgramSolver.scala From spark-lp with Apache License 2.0 | 5 votes |
// Note: this excerpt omitted the original package and import block; the imports below are
// reconstructed from the other spark-lp examples in this collection and may differ slightly
// from the original file.
import java.io.File

import com.joptimizer.optimizers.LPStandardConverter
import com.joptimizer.util.MPSParser

import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.optimization.lp.LP
import org.apache.spark.mllib.optimization.lp.VectorSpace._
import org.apache.spark.{SparkConf, SparkContext}

object TestMPSLinearProgramSolver {

  def main(args: Array[String]) {

    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("TestMPSLinearProgramSolver")
    val sc = new SparkContext(conf)

    // Parse the provided MPS file.
    val parser = new MPSParser()
    val mpsFile = new File(args(0))
    parser.parse(mpsFile)

    // Convert the parsed linear program to standard form.
    val converter = new LPStandardConverter(true)
    converter.toStandardForm(parser.getC, parser.getG, parser.getH, parser.getA, parser.getB,
      parser.getLb, parser.getUb)

    // Convert the parameters of the linear program to spark lp compatible formats.
    val numPartitions = 2
    val c: DVector = sc.parallelize(converter.getStandardC.toArray, numPartitions)
      .glom.map(new DenseVector(_))
    val B: DMatrix = sc.parallelize(converter.getStandardA.toArray.transpose.map(
      Vectors.dense(_).toSparse: Vector), numPartitions)
    val b = new DenseVector(converter.getStandardB.toArray)

    println("Start solving ... ")
    val (optimalVal, optimalX) = LP.solve(c, B, b, sc = sc)
    println("optimalVal: " + optimalVal)
    //println("optimalX: " + optimalX.collectElements.mkString(", "))

    sc.stop()
  }
}
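LPStandardConverter rewrites the parsed problem in standard form, $\min_x c^\top x$ subject to $Ax = b$, $x \ge 0$, which is the shape fed to LP.solve above: the distributed c vector, the row-distributed constraint matrix B, and the dense right-hand side b.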
Example 47
Source File: LinopMatrixAdjoint.scala From spark-lp with Apache License 2.0 | 5 votes |
override def apply(x: DVector): DenseVector = { val n = this.n matrix.zipPartitions(x)((matrixPartition, xPartition) => Iterator.single( matrixPartition.checkedZip(xPartition.next.values.toIterator).aggregate( // NOTE A DenseVector result is assumed here (not sparse safe). Vectors.zeros(n).toDense)( seqop = (_, _) match { case (sum, (matrix_i, x_i)) => { // Multiply an element of x by its corresponding matrix row, and add to the // accumulation sum vector. BLAS.axpy(x_i, matrix_i, sum) sum } }, combop = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } )) ).treeAggregate(Vectors.zeros(n).toDense)( seqOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 }, combOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } , depth ) } }
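This operator applies the adjoint (transpose) of the row-distributed matrix: for a distributed vector $x$ it computes $A^\top x = \sum_i x_i a_i$, where $a_i$ is the $i$-th matrix row. Each partition accumulates its partial sum with BLAS.axpy, and the per-partition sums are then combined with treeAggregate at the given depth.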
Example 48
Source File: SpLinopMatrixSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp import org.scalatest.FunSuite import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.lp.VectorSpace._ import org.apache.spark.mllib.optimization.lp.fs.dvector.dmatrix._ class SpLinopMatrixSuite extends FunSuite with MLlibTestSparkContext { test("SpLinopMatrix.apply is implemented properly") { val matrix: DMatrix = sc.parallelize(Array( Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)), 2) val vector: DVector = sc.parallelize(Array(2.0, 3.0), 2).glom.map(new DenseVector(_)) val expectApply: DMatrix = sc.parallelize(Array( Vectors.dense(2.0 * 1.0, 2.0 * 2.0, 2.0 * 3.0), Vectors.dense(3.0 * 4.0, 3.0 * 5.0, 3.0 * 6.0)), 2) assert((new SpLinopMatrix(vector))(matrix).collect().deep == expectApply.collect().deep, // or sameElements "SpLinopMatrix.apply should return the correct result.") } }
Example 49
Source File: InitializeSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp import org.scalatest.FunSuite import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.mllib.optimization.lp.VectorSpace._ import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, _} import org.apache.spark.mllib.optimization.tfocs.VectorSpace.{DMatrix, DVector} class InitializeSuite extends FunSuite with MLlibTestSparkContext { val numPartitions = 2 val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0) val BArray = Array( Array(12.0, 16.0, 30.0, 1.0, 0.0), Array(24.0, 16.0, 12.0, 0.0, 1.0), Array(-1.0, 0.0, 0.0, 0.0, 0.0), Array(0.0, -1.0, 0.0, 0.0, 0.0), Array(0.0, 0.0, -1.0, 0.0, 0.0), Array(0.0, 0.0, 0.0, 1.0, 0.0), Array(0.0, 0.0, 0.0, 0.0, 1.0)) val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0) lazy val c: DVector = sc.parallelize(cArray, numPartitions).glom.map(new DenseVector(_)) lazy val rows: DMatrix = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_)) lazy val b: DenseVector = new DenseVector(bArray) val cBrz = new BDV[Double](cArray) val BBrz = new BDM[Double](7, 5, BArray.flatMap(x => x), offset = 0, majorStride = 5, isTranspose = true) val bBrz = new BDV[Double](bArray) // (BT * B) ^(-1) val BTBInv = inv(BBrz.t * BBrz) // xTilda = B * BTBInv * b val xTilda: BDV[Double] = BBrz * (BTBInv * bBrz) // lambdaTilda = BTBInv * (B^T * c) val lambdaTilda: BDV[Double] = BTBInv * (BBrz.t * cBrz) // sTilda = c - B * lambdaTilda val sTilda = cBrz - BBrz * lambdaTilda val deltax = Math.max(1.5 * max(xTilda), 0) val deltas = Math.max(1.5 * max(sTilda), 0) val xHat = xTilda :+ deltax val sHat = sTilda :+ deltas val deltaxHat: Double = 0.5 * (xHat.t * sHat) / sum(sHat) val deltasHat: Double = 0.5 * (xHat.t * sHat) / sum(xHat) // x = xHat + deltaxHat * e val expectedx: BDV[Double] = xHat :+ deltaxHat // val expectedLambda = lambdaTilda val expecteds: BDV[Double] = sHat :+ deltasHat test("Initialize.init is implemented properly") { val result = Initialize.init(c, rows, b) //println(LP.solve(c, rows, b, 1e-4, 1).collect()) assert(Vectors.dense(expectedx.toArray) ~= Vectors.dense(result._1.flatMap(_.toArray).collect()) relTol 1e-6, "Initialize.init x0 is not computed correctly.") assert(Vectors.dense(lambdaTilda.toArray) ~= Vectors.dense(result._2.toArray) relTol 1e-6, "Initialize.init lambda0 is not computed correctly.") assert(Vectors.dense(expecteds.toArray) ~= Vectors.dense(result._3.flatMap(_.toArray).collect()) relTol 1e-6, "Initialize.init s0 should return the correct answer.") } }
Example 50
Source File: LPSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp

import org.scalatest.FunSuite

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.mllib.optimization.lp.VectorSpace._
import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace
import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace

class LPSuite extends FunSuite with MLlibTestSparkContext {

  val numPartitions = 2
  val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0)
  val BArray = Array(
    Array(12.0, 16.0, 30.0, 1.0, 0.0),
    Array(24.0, 16.0, 12.0, 0.0, 1.0),
    Array(-1.0, 0.0, 0.0, 0.0, 0.0),
    Array(0.0, -1.0, 0.0, 0.0, 0.0),
    Array(0.0, 0.0, -1.0, 0.0, 0.0),
    Array(0.0, 0.0, 0.0, 1.0, 0.0),
    Array(0.0, 0.0, 0.0, 0.0, 1.0))
  val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0)

  lazy val c = sc.parallelize(cArray, numPartitions).glom.map(new DenseVector(_))
  lazy val rows = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_))
  lazy val b = new DenseVector(bArray)

  test("LP solve is implemented properly") {
    val (v, x) = LP.solve(c, rows, b, sc = sc)
    // Solution obtained from scipy.optimize.linprog and the Octave GLPK LP solver, with fun_val = 12.083.
    val expectedSol = Vectors.dense(
      Array(1.66666667, 5.83333333, 40.0, 0.0, 0.0, 13.33333333, 9.16666667))
    val xx = Vectors.dense(x.flatMap(_.toArray).collect())
    println(s"$xx")
    println("optimal min value: " + v)
    assert(xx ~== expectedSol absTol 1e-6, "LP.solve x should return the correct answer.")
  }
}
Example 51
Source File: Main.scala From didactic-computing-machine with GNU Affero General Public License v3.0 | 5 votes |
package example import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.{SparkConf, SparkContext} object Main extends App { val conf = new SparkConf() .setAppName("K means cluster") .setMaster("local") val sc = SparkContext .getOrCreate(conf) val data = sc.parallelize( Vector( Vector(-4.0, -1.0, -4.0), Vector(2.0, 0.0, 0.0), Vector(1.0, -2.0, 4.0), Vector(-3.0, -4.0, -1.0), Vector(2.0, -4.0, 0.0), Vector(2.0, 1.0, -5), Vector(3.0, -3.0, 0.0), Vector(-1.0, -1.0, 1.0) ).map(t => Vectors.dense(t.toArray))) val numOfClusters = 3 val numOfIterations = 100 val clusters = KMeans.train(data, numOfClusters, numOfIterations) println("Cluster centers") clusters.clusterCenters.foreach(println) println("Squared Errors") println(clusters.computeCost(data)) println("Predictions") println(clusters.predict(Vectors.dense(0.0, 0.0, 0.0))) println(clusters.predict(Vectors.dense(-3.0, -2.0, 1.5))) }
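A small follow-up sketch, reusing the data RDD and iteration count from the example above (the candidate range 2 to 6 is an arbitrary assumption): a common way to choose the number of clusters is to compare the within-set sum of squared errors reported by computeCost.

import org.apache.spark.mllib.clustering.KMeans

val costs = (2 to 6).map { k =>
  val model = KMeans.train(data, k, numOfIterations)   // data: RDD[Vector] built above
  k -> model.computeCost(data)
}
costs.foreach { case (k, wssse) => println(s"k=$k WSSSE=$wssse") }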
Example 52
Source File: DigitRecognizer.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.train

import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

  // Excerpt: the enclosing object and main method, the trainNBWithParams helper and the
  // testData RDD are defined earlier in the original source file and are omitted here.
  val predictResult = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
    val nbModel = trainNBWithParams(testData, param, "multinomial")
    val predictResult = testData.map { labeledPoint =>
      val predicted = nbModel.predict(labeledPoint.features)
      if (predicted > 0.5) 1 else 0
    }.reduce(_ + _)
    // Use floating-point division; the original predictResult / testData.count * 1.0
    // divided two integers first and therefore always produced 0.0 or 1.0.
    val accuracy = predictResult.toDouble / testData.count
    println(s"nb model with lambda:$param,modelType:multinomial,Accuracy:$accuracy")
  }
  // The two closing braces below close main() and the enclosing object, whose
  // declarations were elided from this excerpt.
  }
}
Example 53
Source File: DenseKMeans.scala From AI with Apache License 2.0 | 5 votes |
// scalastyle:off println
package com.bigchange.mllib

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      var input: String = null,
      k: Int = 2,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()
    defaultParams.input = args(0)
    run(defaultParams)
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()
    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    // Return the K-means cost (sum of squared distances of points to their nearest center)
    // for this model.
    val cost = model.computeCost(examples)

    // Get the k cluster centers.
    val centerPoint = model.clusterCenters
    val one = centerPoint(0)
    val two = centerPoint(1)
    println(s"centerPoint=$one,$two.")

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println
Example 54
Source File: StreamingLogisticRegression.scala From AI with Apache License 2.0 | 5 votes |
// scalastyle:off println package com.bigchange.mllib import com.bigchange.util.{FileUtil, TimeUtil} import org.apache.spark.SparkConf import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} object StreamingLogisticRegression { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: StreamingLogisticRegression <trainingDir> <testDir> <batchDuration> <numFeatures>") System.exit(1) } val conf = new SparkConf().setMaster("local").setAppName("StreamingLogisticRegression") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingLogisticRegressionWithSGD() .setInitialWeights(Vectors.zeros(args(3).toInt)) model.trainOn(trainingData) // model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() model.predictOnValues(testData.map(lp => (lp.label, lp.features))).map(x => x._1 +"\t" +x._2).foreachRDD(rdd =>{ val value = rdd.collect() FileUtil.normalFileWriter("F:\\datatest\\ai\\StreamingLogisticRegression\\"+TimeUtil.getCurrentHour,value) }) ssc.start() ssc.awaitTermination() } } // scalastyle:on println
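The streams above are plain text files parsed with LabeledPoint.parse, so every line must use the LabeledPoint string format. A minimal sketch of what such lines look like (the three-feature values below are illustrative, not taken from the original project):

import org.apache.spark.mllib.regression.LabeledPoint

val sampleLines = Seq(
  "(1.0,[0.5,1.2,0.0])",    // label 1.0 with dense features
  "(0.0,(3,[1],[-1.0]))"    // label 0.0 with sparse features: size, indices, values
)
sampleLines.map(LabeledPoint.parse).foreach(println)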
Example 55
Source File: StreamingSimpleModel.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.streaming

import breeze.linalg.DenseVector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingSimpleModel {

  def main(args: Array[String]) {

    val ssc = new StreamingContext("local", "test", Seconds(10))
    val stream = ssc.socketTextStream("localhost", 9999)

    val numberFeatures = 100
    val zeroVector = DenseVector.zeros[Double](numberFeatures)
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.dense(zeroVector.data))
      .setNumIterations(1)
      .setStepSize(0.01)

    val labeledStream = stream.map { event =>
      val split = event.split("\t")
      val y = split(0).toDouble
      val features = split(1).split(",").map(_.toDouble)
      LabeledPoint(label = y, features = Vectors.dense(features))
    }

    model.trainOn(labeledStream)

    // Use DStream's transform operator to score each batch with the latest model.
    val predictAndTrue = labeledStream.transform { rdd =>
      val latestModel = model.latestModel()
      rdd.map { point =>
        val predict = latestModel.predict(point.features)
        predict - point.label
      }
    }

    // Compute the MSE and RMSE for each batch.
    predictAndTrue.foreachRDD { rdd =>
      val mse = rdd.map(x => x * x).mean()
      val rmse = math.sqrt(mse)
      println(s"current batch, MSE: $mse, RMSE:$rmse")
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 56
Source File: PipeClassificationNaiveBayes.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification

import scala.beans.BeanInfo

import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

import org.apache.spark.mllib.classification.NaiveBayesModel

class PipeClassificationNaiveBayes(lambda: Double = 1.0) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("lambda", lambda))

  def trainModelAndClassify(
      trainingData: RDD[LabeledPoint],
      symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {

    val model = NaiveBayes.train(trainingData, lambda)

    log.debug("Classification Model:" + model)
    log.debug("Classification Model labels :" + model.labels.mkString(" "))
    log.debug("Classification Model pi: " + model.pi.mkString(" "))
    // The original concatenated the Unit result of foreach, which always logged "()";
    // build the theta string explicitly instead.
    log.debug("Classification Model theta: " + model.theta.map(_.mkString(" ")).mkString("; "))

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}

object PipeClassificationNaiveBayes {
  def apply(lambda: Double = 1.0) = {
    new PipeClassificationNaiveBayes(lambda)
  }
}
Example 57
Source File: PipeClassificationTrainingDataGenerator.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import scala.compat.Platform import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import com.rockymadden.stringmetric.StringMetric import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.logging.Logging import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.similarity.SimilarityCalculator import de.unihamburg.vsis.sddf.sparkextensions.RddUtils.securlyZipRdds import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable class PipeClassificationTrainingDataGenerator( truePositiveCount: Int = 500, trueNegativeCount: Int = 500)( implicit featureMeasures: Array[(Int, StringMetric[Double])]) extends PipeElement[SymPairSim, (SymPairSim, RDD[LabeledPoint])] with Logging { override def step(input: SymPairSim)(implicit pipeContext: AbstractPipeContext) = { pipeContext match { case pc: GoldstandardContext with CorpusContext => { var truePositiveFraction = truePositiveCount / pc.goldstandard.count.toDouble var trueNegativeFraction = trueNegativeCount / pc.corpus.count.toDouble log.debug("True positive pair fraction taken from the gold standard for training purposes: " + truePositiveFraction) log.debug("True negative pair fraction taken from the corpus for training purposes: " + trueNegativeFraction) if (truePositiveFraction > 1.0) { truePositiveFraction = 1.0 log.debug("True positive pair fraction limited to 1.0") } if (trueNegativeFraction > 1.0) { trueNegativeFraction = 1.0 log.debug("True negative pair fraction limited to 1.0") } val result = generateTrainingData(pc.corpus, pc.goldstandard, truePositiveFraction, trueNegativeFraction) (input, result) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } object PipeClassificationTrainingDataGenerator { val All = -1 def apply( truePositiveCount: Int = 500, trueNegativeCount: Int = 500)( implicit featureMeasures: Array[(Int, StringMetric[Double])]) = { new PipeClassificationTrainingDataGenerator(truePositiveCount, trueNegativeCount) } }
Example 58
Source File: PipeClassificationDecisionTree.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable import de.unihamburg.vsis.sddf.Parameterized import org.apache.spark.mllib.classification.ClassificationModel class PipeClassificationDecisionTree( impurity: String = "gini", maxDepth: Int = 5, maxBins: Int = 32) extends AbstractPipeClassification { val paramMap: Map[String, Any] = Map(("impurity", impurity), ("maxDepth", maxDepth), ("maxBins", maxBins)) def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = { val model = DecisionTree.trainClassifier(trainingData, numClasses = 2, categoricalFeaturesInfo = Map[Int, Int](), impurity, maxDepth, maxBins) log.debug("Decision Tree Model:" + model) log.debug("Decision Tree:" + model.toDebugString) // Marking Missing Values as Not Equal (0) symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2)))) } } object PipeClassificationDecisionTree { def apply( impurity: String = "gini", maxDepth: Int = 5, maxBins: Int = 32) = { new PipeClassificationDecisionTree(impurity, maxDepth, maxBins) } }
Example 59
Source File: PipeClassificationSvm.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import scala.beans.BeanInfo import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import org.apache.spark.mllib.classification.SVMWithSGD class PipeClassificationSvm(numIterations: Int = 100) extends AbstractPipeClassification { val paramMap: Map[String, Any] = Map(("numIterations", numIterations)) def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = { val model = SVMWithSGD.train(trainingData, numIterations) log.debug("Classification Model:" + model) // Marking Missing Values as Not Equal (0) symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2)))) } } object PipeClassificationSvm { def apply(numIterations: Int = 100) = { new PipeClassificationSvm(numIterations) } }
Example 60
Source File: PipeDecisionTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.classification import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.classification.PipeClassificationDecisionTree import de.unihamburg.vsis.sddf.classification.PipeClassificationNaiveBayes import de.unihamburg.vsis.sddf.classification.PipeClassificationSvm import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.test.util.LocalSparkContext class PipeClassificationTest extends FunSuite with LocalSparkContext with BeforeAndAfterAll{ var input: (SymPairSim, RDD[LabeledPoint]) = _ override def beforeAll() { super.beforeAll() val tuple1 = Tuple("test1","test1","test1") tuple1.id = 1 val tuple2 = Tuple("test2","test2","test2") tuple2.id = 2 val tuple3 = Tuple("hans","franz","wurst") tuple3.id = 3 val symPairSim: SymPairSim = sc.parallelize(Seq( (new SymPair(tuple1, tuple2), Array(1D,1D,0D)) ,(new SymPair(tuple2, tuple3), Array(0D,0D,1D)) )) val trainingData: RDD[LabeledPoint] = sc.parallelize(Seq( LabeledPoint(label = Duplicate, features = Vectors.dense(Array(0.99,1.0,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.875,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.1))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.89,0.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.1,0.0,1.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.0,0.2,1.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.06,0.0,0.89))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.21,0.19,0.91))) )) input = (symPairSim, trainingData) } override def afterAll() { super.afterAll() } test("naive bayes classification test") { val classificationPipe = new PipeClassificationNaiveBayes() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } test("svm classification test") { val classificationPipe = new PipeClassificationSvm() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } test("decision tree classification test") { val classificationPipe = new PipeClassificationDecisionTree() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } }
Example 61
Source File: BisectingKMeansModel.scala From bisecting-kmeans with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.bisectingkmeans import breeze.linalg.{Vector => BV, norm => breezeNorm} import org.apache.spark.Logging import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() this.node.toLinkageMatrix.foreach {x => val row = new java.util.ArrayList[java.lang.Double]() row.add(x._1.toDouble) row.add(x._2.toDouble) row.add(x._3.toDouble) row.add(x._4.toDouble) javaList.add(row) } javaList } }
Example 62
Source File: TestMPSLinearProgram.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.examples import java.io.File import com.joptimizer.optimizers.LPStandardConverter import com.joptimizer.util.MPSParser import org.apache.spark.mllib.linalg.{ DenseVector, Vector, Vectors } import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.SolverSLP import org.apache.spark.{ SparkConf, SparkContext } object TestMPSLinearProgram { def main(args: Array[String]) { val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestMPSLinearProgram") val sc = new SparkContext(sparkConf) // Parse the provided MPS file. val parser = new MPSParser() var mpsFile = new File(args(0)) parser.parse(mpsFile) // Convert the parsed linear program to standard form. val converter = new LPStandardConverter(true) converter.toStandardForm(parser.getC, parser.getG, parser.getH, parser.getA, parser.getB, parser.getLb, parser.getUb) // Convert the parameters of the linear program to spark tfocs compatible formats. val c = sc.parallelize(converter.getStandardC.toArray).glom.map(new DenseVector(_)) val A = sc.parallelize(converter.getStandardA.toArray.transpose.map( Vectors.dense(_).toSparse: Vector)) val b = new DenseVector(converter.getStandardB.toArray) val n = converter.getStandardN val mu = 1e-2 // Solve the linear program using SolverSLP, finding the optimal x vector 'optimalX'. val (optimalX, _) = SolverSLP.run(c, A, b, mu) println("optimalX: " + optimalX.collectElements.mkString(", ")) sc.stop() } }
Example 63
Source File: TestLASSO.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.examples import scala.util.Random import org.apache.spark.mllib.linalg.{ BLAS, DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.SolverL1RLS import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.{ SparkConf, SparkContext } object TestLASSO { def main(args: Array[String]) { val rnd = new Random(34324) val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestLASSO") val sc = new SparkContext(sparkConf) val n = 1024 // Design matrix column count. val m = n / 2 // Design matrix row count. val k = m / 5 // Count of nonzero weights. // Generate the design matrix using random normal values, then normalize the columns. val unnormalizedA = RandomRDDs.normalVectorRDD(sc, m, n, 0, rnd.nextLong) val AColumnNormSq = unnormalizedA.treeAggregate(Vectors.zeros(n).toDense)( seqOp = (sum, rowA) => { val rowASq = Vectors.dense(rowA.toArray.map(rowA_i => rowA_i * rowA_i)) BLAS.axpy(1.0, rowASq, sum) sum }, combOp = (sum1, sum2) => { BLAS.axpy(1.0, sum2, sum1) sum1 }) val A = unnormalizedA.map(rowA => Vectors.dense(rowA.toArray.zip(AColumnNormSq.toArray).map { case (rowA_i, normsq_i) => rowA_i / math.sqrt(normsq_i) })) // Generate the actual 'x' vector, including 'k' nonzero values. val x = Vectors.zeros(n).toDense for (i <- rnd.shuffle(0 to n - 1).take(k)) { x.values(i) = rnd.nextGaussian } // Generate the 'b' vector using the design matrix and weights, adding gaussian noise. val bOriginal = new DenseVector(A.map(rowA => BLAS.dot(rowA, x)).collect) val snr = 30 // SNR in dB val sigma = math.pow(10, ((10 * math.log10(math.pow(Vectors.norm(bOriginal, 2), 2) / n) - snr) / 20)) val b = sc.parallelize(bOriginal.values.map(_ + sigma * rnd.nextGaussian)) .glom .map(new DenseVector(_)) // Set 'lambda' using the noise standard deviation. val lambda = 2 * sigma * math.sqrt(2 * math.log(n)) // Solve the lasso problem using SolverL1RLS, finding the estimated x vector 'estimatedX'. val (estimatedX, _) = SolverL1RLS.run(A, b, lambda) println("estimatedX: " + estimatedX.values.mkString(", ")) sc.stop() } }
Example 64
Source File: SolverL1RLS.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.fs.dvector.double._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.double._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector._ import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.mllib.optimization.tfocs.vs.dvector._ import org.apache.spark.mllib.optimization.tfocs.vs.vector._ def run(A: DMatrix, b: DVector, lambda: Double, x0: Option[DenseVector] = None): (DenseVector, Array[Double]) = { val (x, TFOCS.OptimizationData(lossHistory, _, _)) = TFOCS.optimize(new SmoothQuad(b), new LinopMatrix(A), new ProxL1(lambda), x0.getOrElse(Vectors.zeros(A.first().size).toDense)) (x, lossHistory) } }
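Putting the pieces together, SolverL1RLS minimizes the L1-regularized least-squares objective $\tfrac{1}{2}\lVert Ax - b \rVert_2^2 + \lambda \lVert x \rVert_1$: SmoothQuad(b) supplies the quadratic term, LinopMatrix(A) supplies the linear operator, and ProxL1(lambda) supplies the non-smooth penalty.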
Example 65
Source File: SolverSLP.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.apache.spark.mllib.linalg.{ BLAS, DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.mllib.optimization.tfocs.fs.dvector.double._ import org.apache.spark.mllib.optimization.tfocs.fs.dvectordouble.vector._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.double._ import org.apache.spark.mllib.optimization.tfocs.vs.dvector._ object SolverSLP { def run( c: DVector, A: DMatrix, b: DenseVector, mu: Double, x0: Option[DVector] = None, z0: Option[DenseVector] = None, numContinuations: Int = 10, tol: Double = 1e-4, initialTol: Double = 1e-3, dualTolCheckInterval: Int = 10): (DVector, Array[Double]) = { val minusB = b.copy BLAS.scal(-1.0, minusB) TFOCS_SCD.optimize(new ProxShiftRPlus(c), new LinopMatrixAdjoint(A, minusB), new ProxZero(), mu, x0.getOrElse(c.mapElements(_ => 0.0)), z0.getOrElse(Vectors.zeros(b.size).toDense), numContinuations, tol, initialTol, dualTolCheckInterval) } }
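SolverSLP targets the standard-form LP $\min_x c^\top x$ subject to $Ax = b$, $x \ge 0$, after adding a smoothing term of roughly $\tfrac{\mu}{2}\lVert x - x_0 \rVert_2^2$; the numContinuations and tolerance parameters control a continuation loop that successively tightens this approximation toward the original LP.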
Example 66
Source File: ProxL1.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.vector.double import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.{ ProxCapableFunction, ProxMode, ProxValue } class ProxL1(q: Double) extends ProxCapableFunction[DenseVector] { require(q > 0) override def apply(z: DenseVector, t: Double, mode: ProxMode): ProxValue[DenseVector] = { // NOTE DenseVectors are assumed here (not sparse safe). val shrinkage = q * t val minimizer = shrinkage match { case 0.0 => z case _ => new DenseVector(z.values.map(z_i => z_i * (1.0 - math.min(shrinkage / math.abs(z_i), 1.0)))) } val f = if (mode.f) Some(apply(minimizer)) else None ProxValue(f, Some(minimizer)) } override def apply(x: DenseVector): Double = q * Vectors.norm(x, 1) }
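The minimizer computed above is the soft-thresholding (shrinkage) operator for the scaled L1 norm, $\operatorname{prox}_{tq\lVert\cdot\rVert_1}(z)_i = \operatorname{sign}(z_i)\max(\lvert z_i \rvert - tq, 0)$, which is algebraically identical to the $z_i (1 - \min(tq/\lvert z_i \rvert, 1))$ expression used in the code.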
Example 67
Source File: LinopMatrixAdjoint.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector import org.apache.spark.mllib.linalg.BLAS import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.CheckedIteratorFunctions._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector.LinopMatrix import org.apache.spark.mllib.optimization.tfocs.LinearOperator import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.storage.StorageLevel class LinopMatrixAdjoint(@transient private val matrix: DMatrix) extends LinearOperator[DVector, DenseVector] { if (matrix.getStorageLevel == StorageLevel.NONE) { matrix.cache() } private lazy val n = matrix.first().size override def apply(x: DVector): DenseVector = { val n = this.n matrix.zipPartitions(x)((matrixPartition, xPartition) => Iterator.single( matrixPartition.checkedZip(xPartition.next.values.toIterator).aggregate( // NOTE A DenseVector result is assumed here (not sparse safe). Vectors.zeros(n).toDense)( seqop = (_, _) match { case (sum, (matrix_i, x_i)) => { // Multiply an element of x by its corresponding matrix row, and add to the // accumulation sum vector. BLAS.axpy(x_i, matrix_i, sum) sum } }, combop = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } )) ).treeAggregate(Vectors.zeros(n).toDense)( seqOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 }, combOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } ) } override def t: LinearOperator[DenseVector, DVector] = new LinopMatrix(matrix) }
Example 68
Source File: SmoothQuad.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.dvector.double import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.{ Mode, SmoothFunction, Value } import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.storage.StorageLevel class SmoothQuad(x0: DVector) extends SmoothFunction[DVector] { if (x0.getStorageLevel == StorageLevel.NONE) { x0.cache() } override def apply(x: DVector, mode: Mode): Value[DVector] = { // Compute the squared error gradient (just the difference between vectors). val g = x.diff(x0) // If both f and g are requested then g will be read twice, so cache it. if (mode.f && mode.g) g.cache() val f = if (mode.f) { // Compute the squared error. // TODO If f is required but not g, then performance might be improved by reimplementing as // a single aggregate using 'x' and 'x0' without an intermediate 'g' DVector, which breaks // per-element pipelining. Some(g.aggregate(0.0)((sum, gPart) => sum + math.pow(Vectors.norm(gPart, 2), 2), _ + _) / 2.0) } else { None } Value(f, Some(g)) } }
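In formula form, SmoothQuad(x0) evaluates $f(x) = \tfrac{1}{2}\lVert x - x_0 \rVert_2^2$ with gradient $\nabla f(x) = x - x_0$, which is why the gradient is just the element-wise difference and the function value is accumulated as half the sum of squared per-partition norms.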
Example 69
Source File: VectorSpaceSuite.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.scalatest.FunSuite import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.mllib.optimization.tfocs.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.tfocs.vs.dvectordouble.DVectorDoubleSpace import org.apache.spark.mllib.optimization.tfocs.vs.vector.DenseVectorSpace class VectorSpaceSuite extends FunSuite with MLlibTestSparkContext { test("DenseVectorSpace.combine is implemented properly") { val alpha = 1.1 val a = new DenseVector(Array(2.0, 3.0)) val beta = 4.0 val b = new DenseVector(Array(5.0, 6.0)) val expectedCombination = Vectors.dense(1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0) assert(DenseVectorSpace.combine(alpha, a, beta, b) == expectedCombination, "DenseVectorSpace.combine should return the correct result.") } test("DenseVectorSpace.dot is implemented properly") { val a = new DenseVector(Array(2.0, 3.0)) val b = new DenseVector(Array(5.0, 6.0)) val expectedDot = 2.0 * 5.0 + 3.0 * 6.0 assert(DenseVectorSpace.dot(a, b) == expectedDot, "DenseVectorSpace.dot should return the correct result.") } test("DVectorSpace.combine is implemented properly") { val alpha = 1.1 val a = sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2) val beta = 4.0 val b = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2) val combination = DVectorSpace.combine(alpha, a, beta, b) val expectedCombination = Vectors.dense(1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0, 1.1 * 4.0 + 4.0 * 7.0) assert(Vectors.dense(combination.collectElements) == expectedCombination, "DVectorSpace.combine should return the correct result.") } test("DVectorSpace.dot is implemented properly") { val a = sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2) val b = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2) val expectedDot = 2.0 * 5.0 + 3.0 * 6.0 + 4.0 * 7.0 assert(DVectorSpace.dot(a, b) == expectedDot, "DVectorSpace.dot should return the correct result.") } test("DVectorDoubleSpace.combine is implemented properly") { val alpha = 1.1 val a = (sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2), 9.9) val beta = 4.0 val b = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2), 11.11) val combination = DVectorDoubleSpace.combine(alpha, a, beta, b) val expectedCombination = (Vectors.dense(1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0, 1.1 * 4.0 + 4.0 * 7.0), 1.1 * 9.9 + 4.0 * 11.11) assert(Vectors.dense(combination._1.collectElements) == expectedCombination._1, "DVectorVectorSpace.combine should return the correct result.") assert(combination._2 == expectedCombination._2, "DVectorVectorSpace.combine should return the correct result.") } test("DVectorDoubleSpace.dot is implemented properly") { val a = (sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2), 9.9) val b = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2), 11.11) val expectedDot = 2.0 * 5.0 + 3.0 * 6.0 + 4.0 * 7.0 + 9.9 * 11.11 assert(DVectorDoubleSpace.dot(a, b) == expectedDot, "DVectorVectorSpace.dot should return the correct result.") } }
Example 70
Source File: LinearOperatorSuite.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.scalatest.FunSuite import org.apache.spark.SparkException import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector.LinopMatrix import org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector.LinopMatrixAdjoint import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvectordouble.{ LinopMatrix => LinopMatrixVector } import org.apache.spark.mllib.optimization.tfocs.fs.dvectordouble.vector.{ LinopMatrixAdjoint => LinopMatrixVectorAdjoint } import org.apache.spark.mllib.util.MLlibTestSparkContext class LinearOperatorSuite extends FunSuite with MLlibTestSparkContext { lazy val matrix = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)), 2) lazy val vector = new DenseVector(Array(2.2, 3.3, 4.4)) test("LinopMatrix multiplies properly") { val f = new LinopMatrix(matrix) val x = new DenseVector(Array(7.0, 8.0, 9.0)) val result = f(x) val expectedResult = Vectors.dense(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9) assert(Vectors.dense(result.collectElements) == expectedResult, "should return the correct product") } test("LinopMatrixAdjoint multiplies properly") { val f = new LinopMatrixAdjoint(matrix) val y = sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2) val result = f(y) val expectedResult = Vectors.dense(1 * 5 + 4 * 6, 2 * 5 + 5 * 6, 3 * 5 + 6 * 6) assert(result == expectedResult, "should return the correct product") } test("LinopMatrixAdjoint checks for mismatched partition vectors") { val f = new LinopMatrixAdjoint(matrix) val y = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2) intercept[SparkException] { f(y) } } test("LinopMatrixVector multiplies properly") { val f = new LinopMatrixVector(matrix, vector) val x = new DenseVector(Array(7.0, 8.0, 9.0)) val result = f(x) val expectedResult = (new DenseVector(Array(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9)), 7.0 * 2.2 + 8.0 * 3.3 + 9.0 * 4.4) assert(Vectors.dense(result._1.collectElements) == expectedResult._1, "should return the correct product") assert(result._2 == expectedResult._2, "should return the correct product") } test("LinopMatrixVectorAdjoint multiplies properly") { var f = new LinopMatrixVectorAdjoint(matrix, vector) val y = (sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2), 8.8) val result = f(y) val expectedResult = Vectors.dense(1 * 5 + 4 * 6 + 2.2, 2 * 5 + 5 * 6 + 3.3, 3 * 5 + 6 * 6 + 4.4) assert(result == expectedResult, "should return the correct product") } test("LinopMatrixVectorAdjoint checks for mismatched partition vectors") { val f = new LinopMatrixVectorAdjoint(matrix, vector) val y = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2), 8.8) intercept[SparkException] { f(y) } } }
Example 71
Source File: LocalLDAModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.clustering import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.{DataUtils, ParamUtils} import org.apache.spark.ml.clustering.{LocalLDAModel => SparkLocalLDA} import org.apache.spark.mllib.clustering.{LocalLDAModel => OldSparkLocalLDA} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} import org.apache.spark.sql.SparkSession import DataUtils._ import scala.reflect.runtime.universe class LocalLDAModel(override val sparkTransformer: SparkLocalLDA) extends LocalTransformer[SparkLocalLDA] { lazy val oldModel: OldSparkLocalLDA = { val mirror = universe.runtimeMirror(sparkTransformer.getClass.getClassLoader) val parentTerm = universe.typeOf[SparkLocalLDA].decl(universe.TermName("oldLocalModel")).asTerm mirror.reflect(sparkTransformer).reflectField(parentTerm).get.asInstanceOf[OldSparkLocalLDA] } override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getFeaturesCol) match { case Some(column) => val newData = column.data.mapToMlLibVectors.map(oldModel.topicDistribution(_).toList) localData.withColumn( LocalDataColumn( sparkTransformer.getTopicDistributionCol, newData ) ) case None => localData } } } object LocalLDAModel extends SimpleModelLoader[SparkLocalLDA] with TypedTransformerConverter[SparkLocalLDA] { override def build(metadata: Metadata, data: LocalData): SparkLocalLDA = { val topics = DataUtils.constructMatrix( data.column("topicsMatrix").get.data.head.asInstanceOf[Map[String, Any]] ) val gammaShape = data.column("gammaShape").get.data.head.asInstanceOf[java.lang.Double] val topicConcentration = data.column("topicConcentration").get.data.head.asInstanceOf[java.lang.Double] val docConcentration = DataUtils.constructVector( data.column("docConcentration").get.data.head.asInstanceOf[Map[String, Any]] ) val vocabSize = data.column("vocabSize").get.data.head.asInstanceOf[java.lang.Integer] val oldLdaCtor = classOf[OldSparkLocalLDA].getDeclaredConstructor( classOf[Matrix], classOf[Vector], classOf[Double], classOf[Double] ) val oldLDA = oldLdaCtor.newInstance( Matrices.fromML(topics), Vectors.fromML(docConcentration), topicConcentration, gammaShape ) val ldaCtor = classOf[SparkLocalLDA].getDeclaredConstructor( classOf[String], classOf[Int], classOf[OldSparkLocalLDA], classOf[SparkSession] ) val lda = ldaCtor.newInstance(metadata.uid, vocabSize, oldLDA, null) ParamUtils.set(lda, lda.optimizer, metadata) ParamUtils.set(lda, lda.keepLastCheckpoint, metadata) ParamUtils.set(lda, lda.seed, metadata) ParamUtils.set(lda, lda.featuresCol, metadata) ParamUtils.set(lda, lda.learningDecay, metadata) ParamUtils.set(lda, lda.checkpointInterval, metadata) ParamUtils.set(lda, lda.learningOffset, metadata) ParamUtils.set(lda, lda.maxIter, metadata) ParamUtils.set(lda, lda.k, metadata) lda } override implicit def toLocal(sparkTransformer: SparkLocalLDA): LocalTransformer[SparkLocalLDA] = new LocalLDAModel(sparkTransformer) }
Example 72
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.etl.machinelearning.kudu import com.hadooparchitecturebook.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object MlLibOnKudu { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> " + "<kuduMaster> " + "<taxiTable> " + "<numOfCenters> " + "<numOfIterations> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val taxiTable = args(2) val numOfCenters = args(3).toInt val numOfIterations = args(4).toInt val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val sqlContext = new SQLContext(sc) val kuduOptions = Map( "kudu.table" -> taxiTable, "kudu.master" -> kuduMaster) sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load. registerTempTable("ny_taxi_trip_tmp") //Vector val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => { val taxiTrip = NyTaxiYellowTripBuilder.build(r) generateVectorOnly(taxiTrip) }) println("--Running KMeans") val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations) println(" > vector centers:") clusters.clusterCenters.foreach(v => println(" >> " + v)) println("--Running corr") val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson") println(" > corr: " + correlMatrix.toString) println("--Running colStats") val colStats = Statistics.colStats(vectorRDD) println(" > max: " + colStats.max) println(" > count: " + colStats.count) println(" > mean: " + colStats.mean) println(" > min: " + colStats.min) println(" > normL1: " + colStats.normL1) println(" > normL2: " + colStats.normL2) println(" > numNonZeros: " + colStats.numNonzeros) println(" > variance: " + colStats.variance) //Labeled Points }
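To try the KMeans and column-statistics calls used above without a Kudu cluster, the following is a minimal self-contained sketch (it assumes an existing SparkContext named sc; the vector values are invented for illustration):
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

// two obvious clusters around (1, 1) and (8.5, 8.75)
val vectorRDD: RDD[Vector] = sc.parallelize(Seq(
  Vectors.dense(1.0, 1.0), Vectors.dense(1.2, 0.8),
  Vectors.dense(8.0, 9.0), Vectors.dense(9.0, 8.5)))
val clusters = KMeans.train(vectorRDD, 2, 10)        // k = 2, 10 iterations
clusters.clusterCenters.foreach(println)
println(clusters.predict(Vectors.dense(8.5, 8.8)))   // cluster id of a new point
println(Statistics.colStats(vectorRDD).mean)         // column means, as in the example above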
Example 73
Source File: get_labels_from_VT_signatures.scala From gsoc_relationship with Apache License 2.0 | 5 votes |
import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.clustering.KMeans import PreProcessingConfig._ case class VT_sample_label_rdd_class(sha256:String, label:Array[Double]) def OnehotEncode(number : Double): Array[Double]={ var Listnew = Array.iterate(0.0,kmeans_cluster_number)(a=>0.0) Listnew(number.toInt)=1 return Listnew } val VT_sample_signatures_final_array_rdd = spark.read.format("parquet").load(VT_sample_signatures_final_array_file).rdd.map(row => new VT_sample_signatures_final_array_rdd_class(row(0).toString,row(1).asInstanceOf[Seq[Double]].toArray)) val VT_sample_signatures_with_sha_rddvector = VT_sample_signatures_final_array_rdd.map(x=>(x.sha256,Vectors.dense(x.array_results))) val VT_sample_signatures_rddvector = VT_sample_signatures_with_sha_rddvector.map(x=>x._2) val KMeans_Model = KMeans.train(VT_sample_signatures_rddvector,kmeans_cluster_number,30,2) val VT_sample_signatures_label_with_sha_rdd = VT_sample_signatures_with_sha_rddvector.map(x=>(x._1,KMeans_Model.predict(x._2))) val VT_sample_label_rdd = VT_sample_signatures_label_with_sha_rdd.map(x=>new VT_sample_label_rdd_class(x._1, OnehotEncode(x._2.toDouble))) VT_sample_label_rdd.toDF().write.format("parquet").save(VT_sample_label_file)
Example 74
Source File: X2PHelper.scala From spark-tsne with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib import breeze.linalg._ import breeze.numerics._ import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLUtils object X2PHelper { case class VectorWithNorm(vector: Vector, norm: Double) def fastSquaredDistance(v1: VectorWithNorm, v2: VectorWithNorm): Double = { MLUtils.fastSquaredDistance(v1.vector, v1.norm, v2.vector, v2.norm) } def Hbeta(D: DenseVector[Double], beta: Double = 1.0) : (Double, DenseVector[Double]) = { val P: DenseVector[Double] = exp(- D * beta) val sumP = sum(P) if(sumP == 0) { (0.0, DenseVector.zeros(D.size)) }else { val H = log(sumP) + (beta * sum(D :* P) / sumP) (H, P / sumP) } } }
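MLUtils.fastSquaredDistance is package-private to Spark's mllib, which is why this helper is declared inside the org.apache.spark.mllib package. If an exact public alternative is acceptable, Vectors.sqdist returns the plain squared Euclidean distance; a minimal sketch, not from the original project:
import org.apache.spark.mllib.linalg.Vectors

val v1 = Vectors.dense(1.0, 2.0, 3.0)
val v2 = Vectors.dense(4.0, 6.0, 3.0)
// exact squared distance; fastSquaredDistance trades a little precision for
// speed by reusing precomputed norms
println(Vectors.sqdist(v1, v2)) // 3*3 + 4*4 + 0*0 = 25.0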
Example 75
Source File: X2P.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne import breeze.linalg.DenseVector import org.apache.spark.mllib.X2PHelper._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry, RowMatrix} import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ import org.slf4j.LoggerFactory object X2P { private def logger = LoggerFactory.getLogger(X2P.getClass) def apply(x: RowMatrix, tol: Double = 1e-5, perplexity: Double = 30.0): CoordinateMatrix = { require(tol >= 0, "Tolerance must be non-negative") require(perplexity > 0, "Perplexity must be positive") val mu = (3 * perplexity).toInt //TODO: Expose this as parameter val logU = Math.log(perplexity) val norms = x.rows.map(Vectors.norm(_, 2.0)) norms.persist() val rowsWithNorm = x.rows.zip(norms).map{ case (v, norm) => VectorWithNorm(v, norm) } val neighbors = rowsWithNorm.zipWithIndex() .cartesian(rowsWithNorm.zipWithIndex()) .flatMap { case ((u, i), (v, j)) => if(i < j) { val dist = fastSquaredDistance(u, v) Seq((i, (j, dist)), (j, (i, dist))) } else Seq.empty } .topByKey(mu)(Ordering.by(e => -e._2)) val p_betas = neighbors.map { case (i, arr) => var betamin = Double.NegativeInfinity var betamax = Double.PositiveInfinity var beta = 1.0 val d = DenseVector(arr.map(_._2)) var (h, p) = Hbeta(d, beta) //logInfo("data was " + d.toArray.toList) //logInfo("array P was " + p.toList) // Evaluate whether the perplexity is within tolerance def Hdiff = h - logU var tries = 0 while (Math.abs(Hdiff) > tol && tries < 50) { //If not, increase or decrease precision if (Hdiff > 0) { betamin = beta beta = if (betamax.isInfinite) beta * 2 else (beta + betamax) / 2 } else { betamax = beta beta = if (betamin.isInfinite) beta / 2 else (beta + betamin) / 2 } // Recompute the values val HP = Hbeta(d, beta) h = HP._1 p = HP._2 tries = tries + 1 } //logInfo("array P is " + p.toList) (arr.map(_._1).zip(p.toArray).map { case (j, v) => MatrixEntry(i, j, v) }, beta) } logger.info("Mean value of sigma: " + p_betas.map(x => math.sqrt(1 / x._2)).mean) new CoordinateMatrix(p_betas.flatMap(_._1)) } }
Example 76
Source File: X2PSuite.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne import org.apache.spark.SharedSparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.scalatest.{FunSuite, Matchers} class X2PSuite extends FunSuite with SharedSparkContext with Matchers { test("Test X2P against tsne.jl implementation") { val input = new RowMatrix( sc.parallelize(Seq(1 to 3, 4 to 6, 7 to 9, 10 to 12)) .map(x => Vectors.dense(x.map(_.toDouble).toArray)) ) val output = X2P(input, 1e-5, 2).toRowMatrix().rows.collect().map(_.toArray.toList) println(output.toList) //output shouldBe List(List(0, .5, .5), List(.5, 0, .5), List(.5, .5, .0)) } }
Example 77
Source File: BugDemonstrationTest.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} class BugDemonstrationTest extends FunSuite with Matchers with BeforeAndAfterAll { private var sparkSession : SparkSession = _ override def beforeAll(): Unit = { super.beforeAll() sparkSession = SparkSession.builder().appName("BugTests").master("local[2]").getOrCreate() } override def afterAll(): Unit = { super.afterAll() sparkSession.stop() } test("This demonstrates a bug was fixed in tsne-spark 2.1") { val sc = sparkSession.sparkContext val observations = sc.parallelize( Seq( Vectors.dense(1.0, 10.0, 100.0), Vectors.dense(2.0, 20.0, 200.0), Vectors.dense(3.0, 30.0, 300.0) ) ) // Compute column summary statistics. val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) val expectedMean = Vectors.dense(2.0,20.0,200.0) val resultMean = summary.mean assertEqualEnough(resultMean, expectedMean) val expectedVariance = Vectors.dense(1.0,100.0,10000.0) assertEqualEnough(summary.variance, expectedVariance) val expectedNumNonZeros = Vectors.dense(3.0, 3.0, 3.0) assertEqualEnough(summary.numNonzeros, expectedNumNonZeros) } private def assertEqualEnough(sample: Vector, expected: Vector): Unit = { expected.toArray.zipWithIndex.foreach{ case(d: Double, i: Int) => sample(i) should be (d +- 1E-12) } } }
Example 78
Source File: MNIST.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne.examples import java.io.{BufferedWriter, OutputStreamWriter} import com.github.saurfang.spark.tsne.impl._ import com.github.saurfang.spark.tsne.tree.SPTree import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.{SparkConf, SparkContext} import org.slf4j.LoggerFactory object MNIST { private def logger = LoggerFactory.getLogger(MNIST.getClass) def main (args: Array[String]) { val conf = new SparkConf() .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .registerKryoClasses(Array(classOf[SPTree])) val sc = new SparkContext(conf) val hadoopConf = sc.hadoopConfiguration val fs = FileSystem.get(hadoopConf) val dataset = sc.textFile("data/MNIST/mnist.csv.gz") .zipWithIndex() .filter(_._2 < 6000) .sortBy(_._2, true, 60) .map(_._1) .map(_.split(",")) .map(x => (x.head.toInt, x.tail.map(_.toDouble))) .cache() //logInfo(dataset.collect.map(_._2.toList).toList.toString) //val features = dataset.map(x => Vectors.dense(x._2)) //val scaler = new StandardScaler(true, true).fit(features) //val scaledData = scaler.transform(features) // .map(v => Vectors.dense(v.toArray.map(x => if(x.isNaN || x.isInfinite) 0.0 else x))) // .cache() val data = dataset.flatMap(_._2) val mean = data.mean() val std = data.stdev() val scaledData = dataset.map(x => Vectors.dense(x._2.map(v => (v - mean) / std))).cache() val labels = dataset.map(_._1).collect() val matrix = new RowMatrix(scaledData) val pcaMatrix = matrix.multiply(matrix.computePrincipalComponents(50)) pcaMatrix.rows.cache() val costWriter = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(s".tmp/MNIST/cost.txt"), true))) //SimpleTSNE.tsne(pcaMatrix, perplexity = 20, maxIterations = 200) BHTSNE.tsne(pcaMatrix, maxIterations = 500, callback = { //LBFGSTSNE.tsne(pcaMatrix, perplexity = 10, maxNumIterations = 500, numCorrections = 10, convergenceTol = 1e-8) case (i, y, loss) => if(loss.isDefined) logger.info(s"$i iteration finished with loss $loss") val os = fs.create(new Path(s".tmp/MNIST/result${"%05d".format(i)}.csv"), true) val writer = new BufferedWriter(new OutputStreamWriter(os)) try { (0 until y.rows).foreach { row => writer.write(labels(row).toString) writer.write(y(row, ::).inner.toArray.mkString(",", ",", "\n")) } if(loss.isDefined) costWriter.write(loss.get + "\n") } finally { writer.close() } }) costWriter.close() sc.stop() } }
Example 79
Source File: FactorizationMachineCtrModel.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.ctrmodel import com.ggstar.features.FeatureEngineering import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{FMModel, FMWithSGD, LabeledPoint} import org.apache.spark.sql.DataFrame class FactorizationMachineCtrModel extends BaseCtrModel { var _model:FMModel = _ def train(samples:DataFrame) : Unit = { //calculate inner product between item embedding and user embedding val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples) _pipelineModel = FeatureEngineering.preProcessInnerProductSamples(samplesWithInnerProduct) val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct) val formatSamples = preparedSamples.rdd.map( row =>{ new LabeledPoint(row.getAs[Int]("label").toDouble, Vectors.fromML(row.getAs[DenseVector]("scaledFeatures"))) }) _model = FMWithSGD.train(formatSamples, task = 1, numIterations = 200, stepSize = 0.15, miniBatchFraction = 1, dim = (true, true, 2), regParam = (0, 0, 0), initStd = 0.1) } override def transform(samples:DataFrame):DataFrame = { val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples) val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct) _model.predict(preparedSamples) } }
Example 80
Source File: Preparator.scala From pio-template-sr with Apache License 2.0 | 5 votes |
package org.template.sr import org.apache.predictionio.controller.PPreparator import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.apache.spark.ml.feature.StandardScaler import org.apache.spark.sql.DataFrame import org.apache.spark.ml.feature.StandardScalerModel import org.apache.spark.sql.SQLContext import org.apache.spark.mllib.linalg.Vectors class PreparedData( val rows: DataFrame, val dsp: DataSourceParams, val ssModel: org.apache.spark.mllib.feature.StandardScalerModel ) extends Serializable class Preparator extends PPreparator[TrainingData, PreparedData] { def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ if (trainingData.dsp.useStandardScaler) { val training = trainingData.rows.map(x=>(x._1,x._2,Vectors.dense(x._3))).toDF("label", "censor", "features") val scaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").setWithStd(trainingData.dsp.standardScalerWithStd).setWithMean(trainingData.dsp.standardScalerWithMean) val scalerModel = scaler.fit(training) val scaledData = scalerModel.transform(training) val s1 = scaledData.select("label","censor","scaledFeatures").withColumnRenamed("scaledFeatures","features") //Prepare old StandardScaler val oldScaler = new org.apache.spark.mllib.feature.StandardScaler(withMean = trainingData.dsp.standardScalerWithMean, withStd = trainingData.dsp.standardScalerWithStd) val oldSSModel = oldScaler.fit(trainingData.rows.map(x=>(Vectors.dense(x._3)))) new PreparedData(rows = s1, dsp = trainingData.dsp, ssModel = oldSSModel) } else { new PreparedData(rows = trainingData.rows.map(x=>(x._1,x._2,Vectors.dense(x._3))).toDF("label", "censor", "features"), dsp = trainingData.dsp, ssModel = null) } } }
Example 81
Source File: SRAlgorithm.scala From pio-template-sr with Apache License 2.0 | 5 votes |
package org.template.sr import org.apache.predictionio.controller.P2LAlgorithm import org.apache.predictionio.controller.Params import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import grizzled.slf4j.Logger import org.apache.spark.mllib.linalg.{Vectors,DenseVector} import org.apache.spark.ml.feature.StandardScalerModel import org.apache.spark.ml.regression.{AFTSurvivalRegression,AFTSurvivalRegressionModel} case class AlgorithmParams( val quantileProbabilities: Array[Double], val fitIntercept: Boolean, val maxIter: Int, val convTolerance: Double ) extends Params class SRModel( val aAFTSRModel: AFTSurvivalRegressionModel, val ssModel: org.apache.spark.mllib.feature.StandardScalerModel, val useStandardScaler: Boolean ) extends Serializable {} class SRAlgorithm(val ap: AlgorithmParams) extends P2LAlgorithm[PreparedData, SRModel, Query, PredictedResult] { @transient lazy val logger = Logger[this.type] def train(sc: SparkContext, data: PreparedData): SRModel = { println("Training SR model.") val aft = new AFTSurvivalRegression().setQuantileProbabilities(ap.quantileProbabilities).setQuantilesCol("quantiles").setFitIntercept(ap.fitIntercept).setMaxIter(ap.maxIter).setTol(ap.convTolerance) val model = aft.fit(data.rows) new SRModel(aAFTSRModel = model, ssModel=data.ssModel, useStandardScaler = data.dsp.useStandardScaler) } def predict(model: SRModel, query: Query): PredictedResult = { // val qryRow0 = Vectors.dense(query.features) val qryRow = if (model.useStandardScaler) { model.ssModel.transform(qryRow0) } else { qryRow0 } val score = model.aAFTSRModel.predict(qryRow) val quantilesVec = model.aAFTSRModel.predictQuantiles(qryRow) PredictedResult(coefficients = model.aAFTSRModel.coefficients.toArray, intercept = model.aAFTSRModel.intercept, scale = model.aAFTSRModel.scale, prediction = score, quantiles = quantilesVec.toArray) } }
Example 82
Source File: VLBFGS1.scala From spark-vl-bfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import java.util.Random import scala.language.implicitConversions import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.optim.VectorFreeLBFGS.{Oracle, VectorSpace} import org.apache.spark.ml.optim.VectorRDDFunctions._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.storage.StorageLevel private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = { data.cartesian(dx).map { case (points, x) => val g = Vectors.zeros(x.size) points.foreach { case LabeledPoint(b, a) => val err = BLAS.dot(a, x) - b BLAS.axpy(err, a, g) } g }.treeSum() } def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]") val sc = new SparkContext(conf) sc.setCheckpointDir("/tmp/checkpoint") val n = 1000 val p = 100 val random = new Random(0L) val xExact = Vectors.dense(Array.fill(p)(random.nextDouble())) val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) => val random = new Random(100 + idx) part.map { v => val target = BLAS.dot(v, xExact) + 0.1 * random.nextGaussian() LabeledPoint(target, v) } }.glom() .cache() val x = solve(data).first() println(s"x_exact = $xExact") println(s"x_vlbfgs = $x") sc.stop() } }
Example 83
Source File: KMeans.scala From spark-tda with Apache License 2.0 | 5 votes |
import java.io.{File, PrintWriter} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.sql.functions._ def computeKMeans( pathToTextFile: String, quantity: Int, iteration: Int) { case class Point(x: Double, y: Double) def save(f: File)(func: PrintWriter => Unit) { val p = new PrintWriter(f) try { func(p) } finally { p.close() } } val filename = pathToTextFile.split("\\.")(0) val outputFilename = s"$filename-KMEANS-k${quantity}-i${iteration}.tsv" val points = sc .textFile(pathToTextFile) .map { line => line.trim.split("\\s+") } .map { row => Point(row(0).toDouble, row(1).toDouble) } val features = points .map { p => Vectors.dense(p.x, p.y) } features.cache() val kmeans = KMeans.train(features, quantity, iteration) val predictions = features .map { f => (f(0), f(1), kmeans.predict(f) + 1) } .collect save(new File(outputFilename)) { println(s"OUTPUT TO: ${outputFilename}") f => predictions.foreach{ case (x, y, ccid) => f.println(s"${x}\t${y}\t${ccid}") } } }

Example 84
Source File: spark-latest.scala From ann-benchmark with Apache License 2.0 | 5 votes |
import org.apache.log4j._ Logger.getRootLogger.setLevel(Level.OFF) import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.ml.classification.MultilayerPerceptronClassifier // maximum number of worker nodes in cluster val numNodes = 5 // batch size, ~10K is good for GPU val batchSize = 1000 // number of iterations to run val numIterations = 5 val train = MLUtils.loadLibSVMFile(sc, "file:///data/mnist/mnist.scale") //val layers = Array[Int](780, 2500, 2000, 1500, 1000, 500, 10) val layers = Array[Int](780, 10) val trainer = new MultilayerPerceptronClassifier().setLayers(layers).setBlockSize(1000).setSeed(1234L).setMaxIter(1) for (i <- 1 to numNodes) { val dataPartitions = sc.parallelize(1 to i, i) val sample = train.sample(true, 1.0 / i, 11L).collect val parallelData = sqlContext.createDataFrame(dataPartitions.flatMap(x => sample)) parallelData.persist parallelData.count val t = System.nanoTime() val model = trainer.fit(parallelData) println(i + "\t" + batchSize + "\t" + (System.nanoTime() - t) / (numIterations * 1e9)) parallelData.unpersist() }
Example 85
Source File: spark.scala From ann-benchmark with Apache License 2.0 | 5 votes |
import org.apache.log4j._ Logger.getRootLogger.setLevel(Level.OFF) import org.apache.spark.mllib.ann.{FeedForwardTrainer, FeedForwardTopology} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.classification.ANNClassifier // maximum number of worker nodes in cluster val numNodes = 5 // batch size, ~10K is good for GPU val batchSize = 1000 // number of iterations to run val numIterations = 5 val train = MLUtils.loadLibSVMFile(sc, "/mnist.scale") val topology = FeedForwardTopology.multiLayerPerceptron(Array[Int](780, 2500, 2000, 1500, 1000, 500, 10), false) val trainer = new FeedForwardTrainer(topology, 780, 10).setBatchSize(batchSize) trainer.SGDOptimizer.setNumIterations(numIterations).setMiniBatchFraction(1.0).setStepSize(0.03) // parallalize the data for N nodes, persist, run X iterations and print average time for each run for (i <- 1 to numNodes) { val dataPartitions = sc.parallelize(1 to i, i) val sample = train.sample(true, 1.0 / i, 11L).collect val parallelData = dataPartitions.flatMap(x => sample) parallelData.persist parallelData.count val t = System.nanoTime() val model = new ANNClassifier(trainer).train(parallelData) println(i + "\t" + batchSize + "\t" + (System.nanoTime() - t) / (numIterations * 1e9)) }
Example 86
Source File: IDFOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.IDFModel import org.apache.spark.ml.param.Param import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.Vectors class IDFOp extends SimpleSparkOp[IDFModel] { override val Model: OpModel[SparkBundleContext, IDFModel] = new OpModel[SparkBundleContext, IDFModel] { override val klazz: Class[IDFModel] = classOf[IDFModel] override def opName: String = Bundle.BuiltinOps.feature.idf override def store(model: Model, obj: IDFModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("idf", Value.vector(obj.idf.toArray)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): IDFModel = { val idfModel = new feature.IDFModel(Vectors.dense(model.value("idf").getTensor[Double].toArray)) new IDFModel(uid = "", idfModel = idfModel) } } override def sparkLoad(uid: String, shape: NodeShape, model: IDFModel): IDFModel = { new IDFModel(uid = uid, idfModel = new feature.IDFModel(Vectors.dense(model.idf.toArray))) } override def sparkInputs(obj: IDFModel): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: IDFModel): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
Example 87
Source File: KMeansOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.clustering import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.clustering.KMeansModel import org.apache.spark.ml.linalg.{DenseVector, SparseVector} import org.apache.spark.mllib.clustering import org.apache.spark.mllib.linalg.Vectors class KMeansOp extends SimpleSparkOp[KMeansModel] { override val Model: OpModel[SparkBundleContext, KMeansModel] = new OpModel[SparkBundleContext, KMeansModel] { override val klazz: Class[KMeansModel] = classOf[KMeansModel] override def opName: String = Bundle.BuiltinOps.clustering.k_means override def store(model: Model, obj: KMeansModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("cluster_centers", Value.tensorList(obj.clusterCenters.map(cc => Tensor.denseVector(cc.toArray)))). withValue("num_features", Value.long(obj.clusterCenters.head.size)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): KMeansModel = { val clusterCenters = model.value("cluster_centers"). getTensorList[Double].toArray. map(t => Vectors.dense(t.toArray)) val mllibModel = new clustering.KMeansModel(clusterCenters) new KMeansModel(uid = "", parentModel = mllibModel) } } override def sparkLoad(uid: String, shape: NodeShape, model: KMeansModel): KMeansModel = { val clusterCenters = model.clusterCenters.map { case DenseVector(values) => Vectors.dense(values) case SparseVector(size, indices, values) => Vectors.sparse(size, indices, values) } new KMeansModel(uid = uid, parentModel = new clustering.KMeansModel(clusterCenters)) } override def sparkInputs(obj: KMeansModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: KMeansModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol) } }
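The DenseVector/SparseVector pattern match in sparkLoad above preserves the storage of each cluster center. In Spark 2.0+ the same ml-to-mllib conversion can also be written with Vectors.fromML, which likewise keeps dense and sparse representations; a small illustrative sketch, not part of the MLeap op:
import org.apache.spark.ml.linalg.{Vectors => MLVectors}
import org.apache.spark.mllib.linalg.{Vector, Vectors}

val mlVec = MLVectors.sparse(3, Array(1), Array(4.2))
val mllibVec: Vector = Vectors.fromML(mlVec) // stays sparse
println(mllibVec) // (3,[1],[4.2])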
Example 88
Source File: SparkNodeWrapper.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.clustering.bundle.tree.clustering import ml.bundle.ctree.Node import ml.combust.bundle.tree.cluster.NodeWrapper import org.apache.spark.mllib.clustering.{ClusteringTreeNode, VectorWithNorm} import org.apache.spark.mllib.linalg.Vectors object SparkNodeWrapper extends NodeWrapper[ClusteringTreeNode] { override def node(n: ClusteringTreeNode): Node = { Node(index = n.index, norm = n.centerWithNorm.norm, values = n.centerWithNorm.vector.toArray.toSeq, numChildren = n.children.length) } override def children(n: ClusteringTreeNode): Array[ClusteringTreeNode] = n.children override def create(node: Node, children: Seq[ClusteringTreeNode]): ClusteringTreeNode = { new ClusteringTreeNode(index = node.index, size = 0, centerWithNorm = new VectorWithNorm(Vectors.dense(node.values.toArray), node.norm), cost = 0.0, height = 0, children = children.toArray) } }
Example 89
Source File: SupportVectorMachineOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.extension.ops.classification import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.mleap.classification.SVMModel import org.apache.spark.mllib.linalg.Vectors class SupportVectorMachineOp extends SimpleSparkOp[SVMModel] { override val Model: OpModel[SparkBundleContext, SVMModel] = new OpModel[SparkBundleContext, SVMModel] { override val klazz: Class[SVMModel] = classOf[SVMModel] override def opName: String = Bundle.BuiltinOps.classification.support_vector_machine override def store(model: Model, obj: SVMModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val thresholds = if(obj.isSet(obj.thresholds)) { Some(obj.getThresholds) } else None model.withValue("coefficients", Value.vector(obj.model.weights.toArray)). withValue("intercept", Value.double(obj.model.intercept)). withValue("num_classes", Value.long(2)). withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): SVMModel = { if(model.value("num_classes").getLong != 2) { throw new IllegalArgumentException("only binary logistic regression supported in Spark") } val weights = Vectors.dense(model.value("coefficients").getTensor[Double].toArray) val svm = new org.apache.spark.mllib.classification.SVMModel( weights = weights, intercept = model.value("intercept").getDouble ) val svmModel = new SVMModel(uid = "", model = svm) model.getValue("thresholds"). map(t => svmModel.setThresholds(t.getDoubleList.toArray)). getOrElse(svmModel) } } override def sparkLoad(uid: String, shape: NodeShape, model: SVMModel): SVMModel = { val m = new SVMModel(uid = uid, model = model.model) if(model.isDefined(model.thresholds)) { m.setThresholds(model.getThresholds) } m } override def sparkInputs(obj: SVMModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: SVMModel): Seq[SimpleParamSpec] = { Seq("raw_prediction" -> obj.rawPredictionCol, "probability" -> obj.probabilityCol, "prediction" -> obj.predictionCol) } }
Example 90
Source File: SupportVectorMachineParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.classification import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.mleap.classification.SVMModel import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.mllib import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql._ class SupportVectorMachineParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new SVMModel(uid = "svm", model = new mllib.classification.SVMModel(weights = Vectors.dense(0.53, 0.67), intercept = 0.77)). setRawPredictionCol("raw_prediction"). setProbabilityCol("probability"))).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 91
Source File: HivemallUtils.scala From hivemall-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, DataFrame, Row, UserDefinedFunction} object HivemallUtils { // # of maximum dimensions for feature vectors val maxDims = 100000000 def funcVectorizer(dense: Boolean = false, dims: Int = maxDims) : UserDefinedFunction = { udf(funcVectorizerImpl(dense, dims)) } private def funcVectorizerImpl(dense: Boolean, dims: Int) : Seq[String] => Vector = { if (dense) { // Dense features i: Seq[String] => { val features = new Array[Double](dims) i.map { ft => val s = ft.split(":").ensuring(_.size == 2) features(s(0).toInt) = s(1).toDouble } Vectors.dense(features) } } else { // Sparse features i: Seq[String] => { val features = i.map { ft => // val s = ft.split(":").ensuring(_.size == 2) val s = ft.split(":") (s(0).toInt, s(1).toDouble) } Vectors.sparse(dims, features) } } } }
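The sparse branch above uses the Vectors.sparse overload that takes (index, value) pairs, while the dense branch writes into a fixed-size array. A minimal sketch of the two constructors with a made-up "index:value" feature sequence (hypothetical values, not from Hivemall):
import org.apache.spark.mllib.linalg.Vectors

val raw = Seq("0:1.5", "3:2.0")                    // "index:value" features
val pairs = raw.map { ft =>
  val Array(i, v) = ft.split(":")
  (i.toInt, v.toDouble)
}
val sparse = Vectors.sparse(4, pairs)              // Seq[(Int, Double)] overload
val dense = Vectors.dense(sparse.toArray)          // same contents, dense storage
println(sparse) // (4,[0,3],[1.5,2.0])
println(dense)  // [1.5,0.0,0.0,2.0]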
Example 92
Source File: L9-3Statistics.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.Statistics import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object StatisticsApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: StatisticsApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) substream.map(f => Vectors.dense(f.slice(1, 5))).foreachRDD(rdd => { val stats = Statistics.colStats(rdd) println("Count: " + stats.count) println("Max: " + stats.max.toArray.mkString(" ")) println("Min: " + stats.min.toArray.mkString(" ")) println("Mean: " + stats.mean.toArray.mkString(" ")) println("L1-Norm: " + stats.normL1.toArray.mkString(" ")) println("L2-Norm: " + stats.normL2.toArray.mkString(" ")) println("Number of non-zeros: " + stats.numNonzeros.toArray.mkString(" ")) println("Variance: " + stats.variance.toArray.mkString(" ")) }) ssc.start() ssc.awaitTermination() } }
Example 93
Source File: L9-7FeatureExtraction.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.feature.ChiSqSelector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object FeatureExtractionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: FeatureExtractionApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length).map(f => f / 2048)))) datastream.foreachRDD(rdd => { val selector = new ChiSqSelector(5) val model = selector.fit(rdd) val filtered = rdd.map(p => LabeledPoint(p.label, model.transform(p.features))) filtered.take(20).foreach(println) }) ssc.start() ssc.awaitTermination() } }
Example 94
Source File: L9-9LogisticRegression.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD object LogisticRegressionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: LogisticRegressionApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) val test = walkingOrRunning.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) val train = walkingOrRunning.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() val model = new StreamingLogisticRegressionWithSGD() .setInitialWeights(Vectors.zeros(4)) .setStepSize(0.0001) .setNumIterations(1) model.trainOn(train) model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd .map(v => math.pow((v._1 - v._2), 2)).mean()))) ssc.start() ssc.awaitTermination() } }
Example 95
Source File: L9-1LinearRegression.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object LinearRegressionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: LinearRegressionApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(2).toDouble, f(3).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) val test = datastream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) val train = datastream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() val model = new StreamingLinearRegressionWithSGD() .setInitialWeights(Vectors.zeros(4)) .setStepSize(0.0001) .setNumIterations(1) model.trainOn(train) model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd .map(v => math.pow((v._1 - v._2), 2)).mean()))) ssc.start() ssc.awaitTermination() } }
Example 96
Source File: T9-4DataTypes.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Matrices import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix import org.apache.spark.mllib.linalg.distributed.IndexedRow import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix import org.apache.spark.mllib.linalg.distributed.MatrixEntry import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object DataTypesApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: DataTypesApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) val denseV = substream.map(f => Vectors.dense(f.slice(1, 5))) denseV.print() val sparseV = substream.map(f => f.slice(1, 5).toList).map(f => f.zipWithIndex.map { case (s, i) => (i, s) }) .map(f => f.filter(v => v._2 != 0)).map(l => Vectors.sparse(l.size, l)) sparseV.print() val labeledP = substream.map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) labeledP.print() val denseM = substream.map(f => Matrices.dense(3, 16, f.slice(3, 19) ++ f.slice(20, 36) ++ f.slice(37, 53))) denseM.print() denseV.foreachRDD(rdd => { val rowM = new RowMatrix(rdd) println(rowM) }) denseV.foreachRDD(rdd => { val iRdd = rdd.zipWithIndex.map(v => new IndexedRow(v._2, v._1)) val iRowM = new IndexedRowMatrix(iRdd) println(iRowM) }) substream.foreachRDD(rdd => { val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) val cRowM = new CoordinateMatrix(entries) println(cRowM) }) substream.foreachRDD(rdd => { val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) val blockM = new CoordinateMatrix(entries).toBlockMatrix println(blockM) }) ssc.start() ssc.awaitTermination() } }
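A small caveat on the sparse-vector line above: Vectors.sparse expects the full vector dimension as its first argument, whereas l.size here is the number of (index, value) pairs left after dropping zeros, so the declared size can end up smaller than the largest index whenever one of the four sliced values is zero (recent Spark versions reject that with a require failure). A minimal corrected sketch with made-up values:
import org.apache.spark.mllib.linalg.Vectors

val values = Array(0.0, 2.5, 0.0, 7.1)             // four features, two of them zero
val pairs = values.zipWithIndex
  .collect { case (v, i) if v != 0.0 => (i, v) }   // keep (index, value) of non-zeros
val sv = Vectors.sparse(values.length, pairs)      // size = full dimension, not pairs.length
println(sv) // (4,[1,3],[2.5,7.1])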
Example 97
Source File: L9-5ChiSq.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object ChiSqApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: ChiSqApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) .filter(f => f(0) == 4.0 || f(0) == 5.0) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) .foreachRDD(rdd => { Statistics.chiSqTest(rdd).zipWithIndex.foreach(v => println("%s, column no. %d".format(v._1, v._2))) }) ssc.start() ssc.awaitTermination() } }
Example 98
Source File: L9-4Correlation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CorrelationApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CorrelationApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) walkingOrRunning.map(f => f.features).foreachRDD(rdd => { val corrSpearman = Statistics.corr(rdd, "spearman") val corrPearson = Statistics.corr(rdd, "pearson") println("Correlation Spearman: \n" + corrSpearman) println("Correlation Pearson: \n" + corrPearson) }) ssc.start() ssc.awaitTermination() } }
Example 99
Source File: L9-6Preprocessing.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.feature.StandardScaler import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object PreprocessingApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: PreprocessingApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") substream.map(f => Array(f(2), f(4), f(5), f(6))) .map(f => f.map(v => v.toDouble)) .map(f => Vectors.dense(f)) .foreachRDD(rdd => { val scalerModel = new StandardScaler().fit(rdd) val scaledRDD = scalerModel.transform(rdd) }) ssc.start() ssc.awaitTermination() } }
Example 100
Source File: L9-8PCA.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object PCAApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: PCAApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) datastream.foreachRDD(rdd => { val pca = new PCA(rdd.first().features.size / 2) .fit(rdd.map(_.features)) val testTrain = rdd.randomSplit(Array(0.3, 0.7)) val test = testTrain(0).map(lp => lp.copy(features = pca.transform(lp.features))) val train = testTrain(1).map(lp => lp.copy(features = pca.transform(lp.features))) train.take(20).foreach(println) }) ssc.start() ssc.awaitTermination() } }
Example 101
Source File: L9-10KMeans.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object KMeansClusteringApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: KMeansClusteringApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val orientationStream = substream .map(f => Seq(1, 4, 5, 6, 10, 11, 12, 20, 21, 22, 26, 27, 28, 36, 37, 38, 42, 43, 44).map(i => f(i)).toArray) .map(arr => arr.map(_.toDouble)) .filter(f => f(0) == 1.0 || f(0) == 2.0 || f(0) == 3.0) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) val test = orientationStream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) val train = orientationStream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() val model = new StreamingKMeans() .setK(3) .setDecayFactor(0) .setRandomCenters(18, 0.0) model.trainOn(train.map(v => v.features)) val prediction = model.predictOnValues(test.map(v => (v.label, v.features))) ssc.start() ssc.awaitTermination() } }
Example 102
Source File: value_model.scala From Spark_Personas with MIT License | 5 votes |
// Load the RFM input table (lenovo_id, monetary, frequency, recency).
// Note: the repeated `val result` definitions below assume execution in the spark-shell/REPL, where redefining a val is allowed.
val input_df = hiveContext.sql("select t.lenovo_id,t.monetary,cast(t.frequency as int) as frequency,t.recency from model_input_rfm_t t")
val row_nums = input_df.count.toInt   // total number of rows
val row_partition = row_nums / 5      // quintile partition point
val row_partition6 = row_nums / 6     // sextile partition point
val input_sort_monetary = input_df.sort($"monetary".desc).collect()
val input_sort_frequency = input_df.sort($"frequency".desc).collect() //wrong
val input_sort_recency = input_df.sort($"recency".desc).collect()
// monetary cut points
val monetary_1 = input_sort_monetary(row_partition * 1).get(1).asInstanceOf[Number].intValue
val monetary_2 = input_sort_monetary(row_partition * 2).get(1).asInstanceOf[Number].intValue
val monetary_3 = input_sort_monetary(row_partition * 3).get(1).asInstanceOf[Number].intValue
val monetary_4 = input_sort_monetary(row_partition * 4).get(1).asInstanceOf[Number].intValue
// frequency cut points
val frequency_1 = input_sort_frequency(row_partition * 1).get(2).asInstanceOf[Integer].toInt
val frequency_2 = input_sort_frequency(row_partition * 2).get(2).asInstanceOf[Integer].toInt
val frequency_3 = input_sort_frequency(row_partition * 3).get(2).asInstanceOf[Integer].toInt
val frequency_4 = input_sort_frequency(row_partition * 4).get(2).asInstanceOf[Integer].toInt
// recency cut points (yyyy-MM-dd reformatted to yyyyMMdd)
val result = input_sort_recency(row_partition6 * 1).get(3).asInstanceOf[String].toString
val recency_1 = result.substring(0, 4) + result.substring(5, 7) + result.substring(8, 10)
val result = input_sort_recency(row_partition6 * 2).get(3).asInstanceOf[String].toString
val recency_2 = result.substring(0, 4) + result.substring(5, 7) + result.substring(8, 10)
val result = input_sort_recency(row_partition6 * 3).get(3).asInstanceOf[String].toString
val recency_3 = result.substring(0, 4) + result.substring(5, 7) + result.substring(8, 10)
val result = input_sort_recency(row_partition6 * 4).get(3).asInstanceOf[String].toString
val recency_4 = result.substring(0, 4) + result.substring(5, 7) + result.substring(8, 10)
val result = input_sort_recency(row_partition6 * 5).get(3).asInstanceOf[String].toString
val recency_5 = result.substring(0, 4) + result.substring(5, 7) + result.substring(8, 10)
// Score monetary into 1-5 points.
val io_monetary = hiveContext.sql("(select t1.lenovo_id, t1.frequency, t1.monetary, t1.recency, t1.points, t1.flag, t1.cluster from model_output_rfm_t t1 where 1=0 ) union all (select t2.lenovo_id, t2.frequency, t2.monetary, t2.recency,(case when t2.monetary > "+monetary_1+" then 5 when t2.monetary >"+monetary_2+" then 4 when t2.monetary > "+monetary_3+" then 3 when t2.monetary >"+monetary_4+" then 2 else 1 end) as points, ' ', ' ' from model_input_rfm_t t2)")
io_monetary.registerTempTable("temporary_monetary") // monetary temp table
// Add the frequency score (tens digit) on top of the monetary score.
val io_frequency = hiveContext.sql("(select t1.lenovo_id, t1.frequency, t1.monetary, t1.recency, t1.points, t1.flag, t1.cluster from model_output_rfm_t t1 where 1=0 ) union all (select t2.lenovo_id, t2.frequency, t2.monetary, t2.recency,(case when t2.frequency> "+frequency_1+" then (50+t3.points) when t2.frequency>"+frequency_2+" then (40+t3.points) when t2.frequency> "+frequency_3+" then (30+t3.points) when t2.frequency>"+frequency_4+" then (20+t3.points) else (10+t3.points) end) as points, ' ', ' ' from model_input_rfm_t t2,temporary_monetary t3 where t2.lenovo_id = t3.lenovo_id)")
io_frequency.registerTempTable("temporary_frequency") // frequency temp table
// Normalization
val result = hiveContext.sql("select max(cast(frequency as int)) from model_input_rfm_t") // maximum frequency
val max_frequency = result.collect()(0).get(0).asInstanceOf[Integer].toInt
val result = hiveContext.sql("select min(cast(frequency as int)) from temporary_frequency") // minimum frequency
val min_frequency = result.collect()(0).get(0).asInstanceOf[Integer].toInt
val region_frequency = max_frequency - min_frequency
val result = hiveContext.sql("select max(unix_timestamp(concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2)),'yyyyMMdd')) from temporary_frequency t2")
val max_recency = result.collect()(0).get(0).asInstanceOf[Long] // latest time
val result = hiveContext.sql("select min(unix_timestamp(concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2)),'yyyyMMdd')) from temporary_frequency t2")
val min_recency = result.collect()(0).get(0).asInstanceOf[Long] // earliest time
val region_recency = max_recency - min_recency // recency range
val result = hiveContext.sql("select max(monetary) from model_input_rfm_t")
val max_monetary = result.collect()(0).get(0).asInstanceOf[Float] // maximum monetary value
//val result = hiveContext.sql("select min(monetary) from model_input_rfm_t")
//val min_monetary = result.collect()(0).get(0).asInstanceOf[Float] // minimum monetary value
val min_monetary = 0
val region_monetary = max_monetary - min_monetary // monetary range
// Add the recency score (hundreds digit) and normalize the three dimensions to [0, 1].
val io_recency = hiveContext.sql("(select t1.lenovo_id, t1.frequency, t1.monetary, t1.recency, t1.points, t1.flag, t1.cluster from model_output_rfm_t t1 where 1=0 ) union all (select t2.lenovo_id, ((t2.frequency - "+min_frequency+")/" + region_frequency + ") as frequency, ((t2.monetary - "+min_monetary+") /" + region_monetary+") as monetary, ((unix_timestamp(t2.recency,'yyyy-MM-dd')- "+min_recency+") / " + region_recency + ") as recency,(case when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))> "+recency_1+" then (600+t3.points) when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))>"+recency_2+" then (500+t3.points) when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))> "+recency_3+" then (400+t3.points) when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))>"+recency_4+" then (300+t3.points) when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))>"+recency_5+" then (200+t3.points) else (100+t3.points) end) as points, ' ', ' ' from model_input_rfm_t t2,temporary_frequency t3 where t2.lenovo_id = t3.lenovo_id)")
io_recency.registerTempTable("temporary_recency") // recency temp table
// Clustering
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}
// Convert the DataFrame to an RDD directly with io_recency.rdd.
val parsedData = io_recency.rdd.map(s => Vectors.dense(s.get(1).asInstanceOf[String].toDouble, s.get(2).asInstanceOf[Double], s.get(3).asInstanceOf[String].toDouble)) //.cache()
val numClusters = 8
val numIterations = 20
val model = KMeans.train(parsedData, numClusters, numIterations)
model.clusterCenters.foreach(println)
val WSSSE = model.computeCost(parsedData)
println("Within Set Sum of Squared Errors = " + WSSSE)
val insertData = io_recency.rdd.map(s => Vectors.dense(s.get(0).asInstanceOf[String].toLong, s.get(1).asInstanceOf[String].toDouble, s.get(2).asInstanceOf[Double], s.get(3).asInstanceOf[String].toDouble, s.get(4).asInstanceOf[Integer].toInt, ' ', model.predict(Vectors.dense(s.get(1).asInstanceOf[String].toDouble, s.get(2).asInstanceOf[Double], s.get(3).asInstanceOf[String].toDouble)))) //.cache()
import spark.implicits._
case class Cluster(lenovo_id: Long, frequency: Double, monetary: Double, recency: Double, points: Double, flag: Double, cluster: Double)
val rdd_df = insertData.map(attributes => Cluster(attributes(0).toLong, attributes(1).toDouble, attributes(2).toDouble, attributes(3).toDouble, attributes(4).toDouble, attributes(5).toDouble, attributes(6).toDouble)).toDF()
rdd_df.registerTempTable("temporary_cluster")
hiveContext.sql("insert overwrite table userfigure_local.model_output_rfm_t partition (l_day='2016-10-01') select * from temporary_cluster")
val io_cluster = hiveContext.sql("(select t1.lenovo_id, t1.frequency, t1.monetary, t1.recency, t1.points, t1.flag, t1.cluster from model_output_rfm_t t1 where 1=0 ) union all (select t2.lenovo_id, t2.frequency, t2.monetary, t2.recency,t2.points, t2.flag,t2.cluster from temporary_cluster t2)")
io_cluster.registerTempTable("temporary_io_cluster") // the original selected from an undefined "table1"; insert from the registered union instead
hiveContext.sql("insert into model_output_rfm_t partition(l_day='2016-10-01') select * from temporary_io_cluster")
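The cut points above are found by collecting and sorting the whole table on the driver, which does not scale. If this job were moved to Spark 2.0+, DataFrame.stat.approxQuantile could compute them on the executors instead. A hedged sketch, reusing the input_df above; the variable names and the 0.001 relative error are assumptions, not part of the original job:

// A minimal sketch, assuming Spark 2.0+ and the input_df defined above.
val Array(monetaryQ1, monetaryQ2, monetaryQ3, monetaryQ4) =
  input_df.stat.approxQuantile("monetary", Array(0.2, 0.4, 0.6, 0.8), 0.001)
val frequencyCuts =
  input_df.stat.approxQuantile("frequency", Array(0.2, 0.4, 0.6, 0.8), 0.001)
println(s"monetary cut points: $monetaryQ1, $monetaryQ2, $monetaryQ3, $monetaryQ4")
println(s"frequency cut points: ${frequencyCuts.mkString(", ")}")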
Example 103
Source File: activity_model.scala From Spark_Personas with MIT License | 5 votes |
// Normalization
// Note: repeated `val result` definitions assume execution in the spark-shell.
val result = h.sql("select max(visit_times) from model_input_active_t") // maximum visit count
val max_visit_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(visit_times) from model_input_active_t") // minimum visit count
val min_visit_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_visit_times = if ((max_visit_times - min_visit_times) == 0) 1 else (max_visit_times - min_visit_times)
val result = h.sql("select max(last_online_time) from model_input_active_t") // longest time since last login
val max_last_online_time = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(last_online_time) from model_input_active_t") // shortest time since last login
val min_last_online_time = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_last_online_time = if ((max_last_online_time - min_last_online_time) == 0) 1 else (max_last_online_time - min_last_online_time)
val result = h.sql("select max(pay_times) from model_input_active_t") // maximum payment count
val max_pay_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(pay_times) from model_input_active_t") // minimum payment count
val min_pay_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_pay_times = if ((max_pay_times - min_pay_times) == 0) 1 else (max_pay_times - min_pay_times)
val result = h.sql("select max(comment_times) from model_input_active_t") // maximum inquiry/comment count
val max_comment_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(comment_times) from model_input_active_t") // minimum inquiry/comment count
val min_comment_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_comment_times = if ((max_comment_times - min_comment_times) == 0) 1 else (max_comment_times - min_comment_times)
val result = h.sql("select max(stay_time) from model_input_active_t") // maximum stay time
val max_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val result = h.sql("select min(stay_time) from model_input_active_t") // minimum stay time
val min_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val region_stay_time = if ((max_stay_time - min_stay_time) == 0) 1 else (max_stay_time - min_stay_time)
val result = h.sql("select max(visit_day_times) from model_input_active_t") // maximum number of active days
val max_visit_day_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(visit_day_times) from model_input_active_t") // minimum number of active days
val min_visit_day_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_visit_day_times = if ((max_visit_day_times - min_visit_day_times) == 0) 1 else (max_visit_day_times - min_visit_day_times)
// Weights: visit_times 0.2, visit_targetpage_percen 0.1, last_online_time 0.1, pay_times 0.2, comment_times 0.2, stay_time 0.1, visit_day_times 0.1
val normalization = h.sql("select t1.cookie , ((t1.visit_times- "+min_visit_times+")*0.2/"+region_visit_times+") as visit_times, t1.visit_targetpage_percen*0.1, ((t1.last_online_time- "+min_last_online_time+")*0.1/"+region_last_online_time+") as last_online_time, ((t1.pay_times- "+min_pay_times+")*0.2/"+region_pay_times+") as pay_times, ((t1.comment_times- "+min_comment_times+")*0.2/"+region_comment_times+") as comment_times, ((t1.stay_time- "+min_stay_time+")*0.1/"+region_stay_time+") as stay_time, ((t1.visit_day_times- "+min_visit_day_times+")*0.1/"+region_visit_day_times+") as visit_day_times from model_input_active_t t1")
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// There is no direct DataFrame-to-Vectors API; convert the DataFrame to an RDD and build each row with Vectors.dense.
val data = normalization.rdd.map(line => Vectors.dense(line.get(1).toString.toDouble, line.get(2).toString.toDouble, line.get(3).toString.toDouble, line.get(4).toString.toDouble, line.get(5).toString.toDouble, line.get(6).toString.toDouble, line.get(7).toString.toDouble))
val rm = new RowMatrix(data)
val pc = rm.computePrincipalComponents(1)
val mx = rm.multiply(pc)
// to be continued
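Since the example stops at the projection ("to be continued"), here is a self-contained sketch of the same RowMatrix PCA pattern on toy vectors. The SparkContext named sc and the toy values are assumptions for illustration only:

// Toy-data sketch of computePrincipalComponents + multiply (assumes a SparkContext named sc).
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val toyRows = sc.parallelize(Seq(
  Vectors.dense(0.2, 0.1, 0.0, 0.4),
  Vectors.dense(0.1, 0.3, 0.2, 0.1),
  Vectors.dense(0.4, 0.2, 0.1, 0.3)))
val toyMatrix = new RowMatrix(toyRows)
val topComponent = toyMatrix.computePrincipalComponents(1) // 4 x 1 local matrix
val projected = toyMatrix.multiply(topComponent)           // each row reduced to one coordinate
projected.rows.collect().foreach(println)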
Example 104
Source File: loyalty_model.scala From Spark_Personas with MIT License | 5 votes |
// Normalization
// Note: repeated `val result` definitions assume execution in the spark-shell.
val result = hiveContext.sql("select max(login_times) from model_input_loyal_t") // maximum login count
val max_login_times = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(login_times) from model_input_loyal_t") // minimum login count
val min_login_times = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_login_times = max_login_times - min_login_times
val result = hiveContext.sql("select max(stay_time) from model_input_loyal_t") // maximum stay time
val max_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val result = hiveContext.sql("select min(stay_time) from model_input_loyal_t") // minimum stay time
val min_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val region_stay_time = max_stay_time - min_stay_time
val result = hiveContext.sql("select max(view_days) from model_input_loyal_t") // maximum number of view days
val max_view_days = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(view_days) from model_input_loyal_t") // minimum number of view days
val min_view_days = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_view_days = max_view_days - min_view_days
val result = hiveContext.sql("select max(pv) from model_input_loyal_t") // maximum page views
val max_pv = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(pv) from model_input_loyal_t") // minimum page views
val min_pv = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_pv = max_pv - min_pv
val result = hiveContext.sql("select max(unix_timestamp(t2.last_viewtime,'yyyy-MM-dd')) from model_input_loyal_t t2")
val max_last_viewtime = result.collect()(0).get(0).asInstanceOf[Long].toDouble // latest view time
val result = hiveContext.sql("select min(unix_timestamp(t2.last_viewtime,'yyyy-MM-dd')) from model_input_loyal_t t2")
val min_last_viewtime = result.collect()(0).get(0).asInstanceOf[Long].toDouble // earliest view time
val region_last_viewtime = max_last_viewtime - min_last_viewtime
// Weights: login_times 0.2, stay_time 0.3, view_days 0.3, pv 0.15, last_viewtime 0.05
val normalization = hiveContext.sql("select t1.cookie , (((t1.login_times - "+min_login_times+") * 0.2/"+region_login_times+") + ((t1.stay_time- "+min_stay_time+") * 0.3/"+region_stay_time+") +((t1.view_days - "+min_view_days+")* 0.3/"+region_view_days+") +((t1.pv - "+min_pv+")* 0.15/"+region_pv+") +((unix_timestamp(t1.last_viewtime,'yyyy-MM-dd')- "+min_last_viewtime+")*0.05 / " + region_last_viewtime + "))*100 as loyalty_score from model_input_loyal_t t1")
normalization.registerTempTable("temporary_points") // normalized-score temp table
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.linalg.Vectors
val rdd = normalization.rdd.map(s => Vectors.dense(s.get(1).asInstanceOf[Double].toDouble))
val summary = Statistics.colStats(rdd)
println(summary.mean)
val means = summary.mean(0)
println(summary.variance)
val standard_deviation = math.sqrt(summary.variance(0)) // colStats reports the variance; take the square root for the standard deviation
// Keep scores within five standard deviations of the mean; because the mean is small the lower bound may go negative, so clamp it at 0 and leave the upper bound unchanged.
val r = means - standard_deviation * 5
val low_bound = if (r > 0) r else 0
val up_bound = means + standard_deviation * 5
// Loyalty levels (the original Chinese labels were garbled in the source; English equivalents are used here).
val loyalty_temporary = hiveContext.sql("(select t1.lenovo_id,t1.loyalty_score,t1.loyalty_level from model_output_loyal_t t1 where 1=0) union all (select t2.cookie, t2.loyalty_score,(case when t2.loyalty_score <= "+low_bound+" then 'low' when t2.loyalty_score < "+up_bound+" then 'medium' else 'high' end) as loyalty_level from temporary_points t2)")
loyalty_temporary.registerTempTable("temporary_loyalty")
hiveContext.sql("insert overwrite table data.model_output_loyal_t partition (l_day='2016-10-01') select * from temporary_loyalty")
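The mean-plus-or-minus-k-standard-deviations bound used above can be exercised on toy scores without the Hive tables. A minimal sketch; the SparkContext named sc and the toy values are assumptions:

// Toy sketch of the mean +/- 5 * stddev bounds (assumes a SparkContext named sc).
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

val toyScores = sc.parallelize(Seq(12.0, 35.0, 47.0, 55.0, 61.0, 88.0).map(v => Vectors.dense(v)))
val stats = Statistics.colStats(toyScores)
val mean = stats.mean(0)
val stddev = math.sqrt(stats.variance(0)) // colStats returns the variance, so take the square root
val lower = math.max(mean - 5 * stddev, 0.0)
val upper = mean + 5 * stddev
println(s"mean=$mean stddev=$stddev bounds=[$lower, $upper]")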
Example 105
Source File: LibLinAlg.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.liblinear import org.apache.spark.mllib.classification.SVMModel import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import de.bwaldvogel.liblinear.Feature import de.bwaldvogel.liblinear.FeatureNode import de.bwaldvogel.liblinear.Linear import de.bwaldvogel.liblinear.Parameter import de.bwaldvogel.liblinear.Problem import de.bwaldvogel.liblinear.SolverType import se.uu.farmbio.cp.UnderlyingAlgorithm import se.uu.farmbio.cp.Deserializer object LibLinAlg { private def vectorToFeatures(v: Vector) = { val indices = v.toSparse.indices val values = v.toSparse.values indices .zip(values) .sortBy { case (i, v) => i } .map { case (i, v) => new FeatureNode(i + 1, v) .asInstanceOf[Feature] } } private def train( input: Array[LabeledPoint], solverType: SolverType, c: Double, tol: Double) = { //configure problem val problem = new Problem problem.l = input.length problem.n = input(0).features.size problem.x = input.map { p => vectorToFeatures(p.features) } problem.y = input.map(_.label + 1.0) problem.bias = -1.0 //train val parameter = new Parameter(solverType, c, tol) val libLinModel = Linear.train(problem, parameter) //convert to Spark SVMModel val weights = libLinModel.getFeatureWeights val intercept = libLinModel.getBias val svmModel = new SVMModel(Vectors.dense(weights).toSparse, intercept) svmModel.clearThreshold svmModel } } object LibLinAlgDeserializer extends Deserializer[LibLinAlg] { override def deserialize(alg: String) = { val splitted = alg.split(",", 2) val intercept = splitted(0) val weights = splitted(1) val model = new SVMModel(Vectors.parse(weights).toSparse, intercept.toDouble) model.clearThreshold() new LibLinAlg(model) } } class LibLinAlg( val svmModel: SVMModel) extends UnderlyingAlgorithm( (features: Vector) => svmModel.predict(features)) { def this( training: Array[LabeledPoint], solverType: SolverType, regParam: Double, tol: Double) = { this(LibLinAlg.train(training, solverType, regParam, tol)) } override def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { score } else { -score } } override def toString = { this.svmModel.intercept + "," + this.svmModel.weights.toString } }
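LibLinAlgDeserializer expects the "intercept,weights" text that toString produces, so the two halves round-trip through Vectors.parse. A small sketch with toy weights (no liblinear dependency needed); the values are illustrative only:

// Round-trip sketch of the "intercept,weights" serialization format used above (toy values).
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.linalg.Vectors

val original = new SVMModel(Vectors.dense(0.5, -1.2, 3.0).toSparse, 0.25)
val serialized = original.intercept + "," + original.weights.toString
val Array(interceptText, weightsText) = serialized.split(",", 2) // split only at the first comma
val restored = new SVMModel(Vectors.parse(weightsText).toSparse, interceptText.toDouble)
println(restored.weights + " / " + restored.intercept)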
Example 106
package se.uu.farmbio.cp.alg import org.apache.spark.mllib.classification.SVMModel import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.optimization.HingeGradient import org.apache.spark.mllib.optimization.LBFGS import org.apache.spark.mllib.optimization.SquaredL2Updater import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.UnderlyingAlgorithm //Define a SVMs UnderlyingAlgorithm private object SVM { def trainingProcedure( input: RDD[LabeledPoint], maxNumItearations: Int, regParam: Double, numCorrections: Int, convergenceTol: Double) = { //Train SVM with LBFGS val numFeatures = input.take(1)(0).features.size val training = input.map(x => (x.label, MLUtils.appendBias(x.features))).cache() val initialWeightsWithIntercept = Vectors.dense(new Array[Double](numFeatures + 1)) val (weightsWithIntercept, _) = LBFGS.runLBFGS( training, new HingeGradient(), new SquaredL2Updater(), numCorrections, convergenceTol, maxNumItearations, regParam, initialWeightsWithIntercept) //Create the model using the weights val model = new SVMModel( Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)), weightsWithIntercept(weightsWithIntercept.size - 1)) //Return raw score predictor model.clearThreshold() model } } class SVM(val model: SVMModel) extends UnderlyingAlgorithm(model.predict) { def this( input: RDD[LabeledPoint], maxNumItearations: Int = 100, regParam: Double = 0.1, numCorrections: Int = 10, convergenceTol: Double = 1e-4) = { this(SVM.trainingProcedure( input, maxNumItearations, regParam, numCorrections, convergenceTol)) } def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { -score } else { score } } }
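For reference, the same LBFGS call used in trainingProcedure can be exercised at toy scale. The two data points, the SparkContext named sc, and the parameter values below are assumptions, not part of the original class:

// Toy sketch of LBFGS with a hinge loss (assumes a SparkContext named sc; labels are 0/1 as HingeGradient expects).
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.{HingeGradient, LBFGS, SquaredL2Updater}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils

val toyPoints = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(0.1, 0.2)),
  LabeledPoint(1.0, Vectors.dense(0.9, 0.8))))
val withBias = toyPoints.map(p => (p.label, MLUtils.appendBias(p.features))).cache()
val initialWeights = Vectors.dense(new Array[Double](3)) // 2 features + bias term
val (weights, lossHistory) = LBFGS.runLBFGS(
  withBias, new HingeGradient(), new SquaredL2Updater(),
  10, 1e-4, 100, 0.1, initialWeights)
println(s"weights=$weights, final loss=${lossHistory.last}")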
Example 107
Source File: LogisticRegression.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.alg import org.apache.spark.mllib.classification.LogisticRegressionModel import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.optimization.LBFGS import org.apache.spark.mllib.optimization.LogisticGradient import org.apache.spark.mllib.optimization.SquaredL2Updater import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.UnderlyingAlgorithm //Define a LogisticRegression UnderlyingAlgorithm private object LogisticRegression { def trainingProcedure( input: RDD[LabeledPoint], maxNumItearations: Int, regParam: Double, numCorrections: Int, convergenceTol: Double): (Vector => Double) = { //Train Logistic Regression with LBFGS val numFeatures = input.take(1)(0).features.size val training = input.map(x => (x.label, MLUtils.appendBias(x.features))).cache() val initialWeightsWithIntercept = Vectors.dense(new Array[Double](numFeatures + 1)) val (weightsWithIntercept, _) = LBFGS.runLBFGS( training, new LogisticGradient(), new SquaredL2Updater(), numCorrections, convergenceTol, maxNumItearations, regParam, initialWeightsWithIntercept) //Create the model using the weights val model = new LogisticRegressionModel( Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)), weightsWithIntercept(weightsWithIntercept.size - 1)) //Return raw score predictor model.clearThreshold() model.predict } } class LogisticRegression( private val input: RDD[LabeledPoint], private val maxNumItearations: Int = 100, private val regParam: Double = 0.1, private val numCorrections: Int = 10, private val convergenceTol: Double = 1e-4) extends UnderlyingAlgorithm( LogisticRegression.trainingProcedure( input, maxNumItearations, regParam, numCorrections, convergenceTol)) { override def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { 1-score } else { score } } }
Example 108
Source File: TestUtils.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp import scala.util.Random import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object TestUtils { def generate4ClassesData(instances: Int, seed: Long): Seq[LabeledPoint] = { val rnd = new Random(seed) Seq.fill(instances)((rnd.nextInt(100), rnd.nextInt(100))).map(r => { val label = if (r._1 < 50 && r._2 < 50) { 0.0 } else if (r._1 < 50) { 1.0 } else if (r._2 < 50) { 2.0 } else { 3.0 } new LabeledPoint(label, Vectors.dense(Array(r._1.toDouble, r._2.toDouble))) }) } def generate4ClassesTrainCalibTest(significance: Double) = { val numClasses = 4 val calibSamples = 4 * numClasses * (1 / significance - 1).ceil.toInt //4 times the minimum val training = generate4ClassesData(instances = 80, seed = Random.nextLong) val test = generate4ClassesData(instances = 20, seed = Random.nextLong) val calibration = generate4ClassesData(instances = calibSamples, seed = Random.nextLong) .toArray (training, calibration, test) } def generateBinaryData(instances: Int, seed: Long): Seq[LabeledPoint] = { val rnd = new Random(seed) Seq.fill(instances)(rnd.nextInt(100)).map(r => { val label = if (r < 50) { 0.0 } else { 1.0 } new LabeledPoint(label, Vectors.dense(r)) }) } def testPerformance[T <: UnderlyingAlgorithm]( model: ICPClassifierModel[T], test: RDD[LabeledPoint], sig: Double = 0.2, minEff: Double = 0.6, minRec: Double = 0.6) = { val pvAndLab = test.map { p => (model.mondrianPv(p.features), p.label) } val metrics = new BinaryClassificationICPMetrics(pvAndLab, Array(sig)) val eff = metrics.efficiencyBySignificance(sig) val rec = metrics.recallBySignificance(sig) eff >= minEff && rec >= minRec } }
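The helpers above return local Seqs and Arrays; a typical driver would parallelize them before training. A hypothetical sketch, where the SparkContext named sc is an assumption:

// Hypothetical driver sketch for the TestUtils helpers (assumes a SparkContext named sc).
val (training, calibration, test) = TestUtils.generate4ClassesTrainCalibTest(significance = 0.2)
val trainingRDD = sc.parallelize(training)
val testRDD = sc.parallelize(test)
println(s"training=${trainingRDD.count()}, calibration=${calibration.length}, test=${testRDD.count()}")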
Example 109
Source File: GamerSparkSQLExample.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.gamer.aggregates import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object GamerSparkSQLExample { def main(args:Array[String]): Unit = { if (args.length == 0) { println("{kudumaster} {runLocal}") return } Logger.getRootLogger.setLevel(Level.ERROR) val kuduMaster = args(0) val runLocal = args(1).equals("l") println("Loading Spark Context") var sc:SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") sc = new SparkContext(sparkConfig) } println("Loading Spark Context: Finished") println("Setting up Tables") val sqlContext = new SQLContext(sc) sqlContext.load("org.kududb.spark", Map("kudu.table" -> "gamer", "kudu.master" -> kuduMaster)).registerTempTable("gamer") println("Query 1: SELECT count(*) FROM gamer") val startTimeQ1 = System.currentTimeMillis() sqlContext.sql("SELECT count(*) FROM gamer").take(10).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1)) println("Query 2: SELECT * FROM gamer limit 100") val startTimeQ2 = System.currentTimeMillis() sqlContext.sql("SELECT * FROM gamer limit 100").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2)) println("Query 3: SELECT * FROM gamer order_by last_time_played desc limit 100") val startTimeQ3 = System.currentTimeMillis() sqlContext.sql("SELECT * FROM gamer order by last_time_played desc limit 100").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3)) println("Query 4: SELECT max(games_played), max(oks), max(damage_given) FROM gamer") val startTimeQ4 = System.currentTimeMillis() sqlContext.sql("SELECT max(games_played), max(oks), max(damage_given) FROM gamer").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 4: " + (System.currentTimeMillis() - startTimeQ4)) println("Query 5 + MLLIB: SELECT gamer_id, oks, games_won, games_played FROM gamer" ) val startTimeQ5 = System.currentTimeMillis() val resultDf = sqlContext.sql("SELECT gamer_id, oks, games_won, games_played FROM gamer") val parsedData = resultDf.map(r => { val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble, r.getInt(3).toDouble) Vectors.dense(array) }) val dataCount = parsedData.count() if (dataCount > 0) { val clusters = KMeans.train(parsedData, 3, 5) clusters.clusterCenters.foreach(v => println(" Vector Center:" + v)) } //TODO add Mllib here println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5)) } }
Example 110
Source File: BasicSparkSQLExamples.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.basic import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object BasicSparkSQLExamples { def main(args:Array[String]): Unit = { if (args.length == 0) { println("<kuduMaster> <tablename> <runLocal>") } Logger.getRootLogger.setLevel(Level.ERROR) val kuduMaster = args(0) val tableName = args(1) val runLocal = args(2).equals("l") println("starting") var sc:SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") sc = new SparkContext(sparkConfig) } try { println("Setting up Tables") val sqlContext = new SQLContext(sc) sqlContext.load("org.kududb.spark", Map("kudu.table" -> tableName, "kudu.master" -> kuduMaster)).registerTempTable(tableName) println("Query 1: SELECT count(*) FROM " + tableName) val startTimeQ1 = System.currentTimeMillis() sqlContext.sql("SELECT count(*) FROM " + tableName).take(10).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1)) println("Query 2: SELECT key_id, col_1 FROM " + tableName + " limit 100") val startTimeQ2 = System.currentTimeMillis() sqlContext.sql("SELECT key_id, col_1 FROM " + tableName + " limit 100 ").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2)) val q3 = "select key_id from " + tableName + " a join (SELECT max(col_1) col_max FROM " + tableName + ") b on (a.col_1 = b.col_max)" println("Query 3: " + q3) val startTimeQ3 = System.currentTimeMillis() sqlContext.sql(q3).take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3)) println("Query 5 + MLLIB: SELECT key_id, col_1, col_2 FROM " + tableName ) val startTimeQ5 = System.currentTimeMillis() val resultDf = sqlContext.sql("SELECT key_id, col_1, col_2 FROM " + tableName + " limit 1000") val parsedData = resultDf.map(r => { val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble) Vectors.dense(array) }) val clusters = KMeans.train(parsedData, 3, 4) clusters.clusterCenters.foreach(v => println(" Vector Center:" + v)) //TODO add Mllib here println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5)) } finally { sc.stop() } } }
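Both Kudu examples train a k-means model and print only the cluster centers. A sketch of scoring the clustering afterwards; the toy vectors and the SparkContext named sc are assumptions:

// Sketch of scoring a trained k-means model on toy data (assumes a SparkContext named sc).
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

val toyData = sc.parallelize(Seq(
  Vectors.dense(1.0, 1.0), Vectors.dense(1.1, 0.9),
  Vectors.dense(8.0, 8.0), Vectors.dense(8.2, 7.9)))
val toyModel = KMeans.train(toyData, 2, 10)
println("WSSSE = " + toyModel.computeCost(toyData)) // within-set sum of squared errors
println("cluster of (1,1) = " + toyModel.predict(Vectors.dense(1.0, 1.0)))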
Example 111
package org.dizhang.seqspark.stat import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} import org.apache.spark.mllib.feature.{PCA => SPCA} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.dizhang.seqspark.ds.{DenseCounter, Genotype, SparseCounter} import org.dizhang.seqspark.util.General._ import org.dizhang.seqspark.worker.Data import org.slf4j.LoggerFactory } def pc(n: Int): BDM[Double] = { val model = new SPCA(n) val data = this.prepare if (data.isEmpty()) { new BDM[Double](0, 0) } else { val res = model.fit(data).pc.values new BDM(res.length/n, n, res) } } }
Example 112
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.SVMWithSGD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint object SVMPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def svmPipeline(sc: SparkContext) = { val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t")) val data = records.map { r => val trimmed = r.map(_.replaceAll("\"", "")) val label = trimmed(r.size - 1).toInt val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble) LabeledPoint(label, Vectors.dense(features)) } // params for SVM val numIterations = 10 // Run training algorithm to build the model val svmModel = SVMWithSGD.train(data, numIterations) // Clear the default threshold. svmModel.clearThreshold() val svmTotalCorrect = data.map { point => if(svmModel.predict(point.features) == point.label) 1 else 0 }.sum() // calculate accuracy val svmAccuracy = svmTotalCorrect / data.count() println(svmAccuracy) } }
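The pipeline reports plain accuracy on the training set; because the threshold is cleared, a ranking metric such as area under the ROC curve is also easy to compute. A hedged addition that would sit inside svmPipeline after svmModel and data are defined:

// Optional sketch: AUC for the raw scores (place inside svmPipeline, after training).
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

val scoreAndLabels = data.map(point => (svmModel.predict(point.features), point.label))
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
println("Area under ROC = " + metrics.areaUnderROC())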
Example 114
Source File: StandardScalarSample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkConf, SparkContext} object StandardScalarSample { def main(args: Array[String]) { val conf = new SparkConf().setMaster("local").setAppName("Word2Vector") val sc = new SparkContext(conf) val data = MLUtils.loadLibSVMFile(sc, org.sparksamples.Util.SPARK_HOME + "/data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) println(data1.first()) // Without converting the features into dense vectors, transformation with zero mean will raise // exception on sparse vector. // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) println(data2.first()) } }
Example 115
Source File: StandardScalarSample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkConf, SparkContext} object StandardScalarSample { def main(args: Array[String]) { val conf = new SparkConf().setMaster("local").setAppName("Word2Vector") val sc = new SparkContext(conf) val data = MLUtils.loadLibSVMFile(sc, "/home/ubuntu/work/spark-1.6.0-bin-hadoop2.6/data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) println(data1.first()) // Without converting the features into dense vectors, transformation with zero mean will raise // exception on sparse vector. // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) println(data2.first()) } }
Example 116
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.SVMWithSGD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint object SVMPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def svmPipeline(sc: SparkContext) = { val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t")) val data = records.map { r => val trimmed = r.map(_.replaceAll("\"", "")) val label = trimmed(r.size - 1).toInt val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble) LabeledPoint(label, Vectors.dense(features)) } // params for SVM val numIterations = 10 // Run training algorithm to build the model val svmModel = SVMWithSGD.train(data, numIterations) // Clear the default threshold. svmModel.clearThreshold() val svmTotalCorrect = data.map { point => if(svmModel.predict(point.features) == point.label) 1 else 0 }.sum() // calculate accuracy val svmAccuracy = svmTotalCorrect / data.count() println(svmAccuracy) } }
Example 117
Source File: MovieLensDataPowerIterationClustering.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples import org.apache.spark.mllib.recommendation.{ALS, Rating} import org.apache.spark.{SparkConf, SparkContext} object MovieLensDataPowerIterationClustering { val PATH= "../data/ml-100k" def main(args: Array[String]): Unit = { val spConfig = (new SparkConf).setMaster("local[1]").setAppName("SparkApp"). set("spark.driver.allowMultipleContexts", "true") val sc = new SparkContext(spConfig) //val path = PATH + "../data/" //val rdd = sc.wholeTextFiles(path) val movies = sc.textFile(PATH + "/u.item") println(movies.first) val genres = sc.textFile(PATH + "/u.genre") genres.take(5).foreach(println) val genreMap = genres.filter(!_.isEmpty).map(line => line.split("\\|")). map(array => (array(1), array(0))).collectAsMap val titlesAndGenres = movies.map(_.split("\\|")).map { array => val genres = array.toSeq.slice(5, array.size) val genresAssigned = genres.zipWithIndex.filter { case (g, idx) => g == "1" }.map { case (g, idx) => genreMap(idx.toString) } (array(0).toInt, (array(1), genresAssigned)) } val rawData = sc.textFile(PATH + "/u.data") val rawRatings = rawData.map(_.split("\t").take(3)) val ratings = rawRatings.map{ case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) } ratings.cache val alsModel = ALS.train(ratings, 50, 10, 0.1) import org.apache.spark.mllib.linalg.Vectors val movieFactors = alsModel.productFeatures.map { case (id, factor) => (id, Vectors.dense(factor)) } val movieVectors = movieFactors.map(_._2) val userFactors = alsModel.userFeatures.map { case (id, factor) => (id, Vectors.dense(factor)) } val userVectors = userFactors.map(_._2) val numClusters = 5 val numIterations = 10 val numRuns = 3 import org.apache.spark.mllib.clustering.PowerIterationClustering //val bKMeans = new PowerIterationClustering()() val piClustering = new PowerIterationClustering() piClustering.setMaxIterations(10) piClustering.setK(numClusters) println("done") } }
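This example configures PowerIterationClustering but stops before running it; PIC consumes an affinity graph of (srcId, dstId, similarity) triples rather than the ALS factor vectors directly. A sketch on a toy affinity graph; the triples and the SparkContext named sc are assumptions standing in for whatever similarity the author intended:

// Running PIC on a toy affinity graph (assumes a SparkContext named sc).
import org.apache.spark.mllib.clustering.PowerIterationClustering

val similarities = sc.parallelize(Seq(
  (0L, 1L, 0.9), (1L, 2L, 0.8), (0L, 2L, 0.7), // a tight triangle
  (3L, 4L, 0.9), (4L, 5L, 0.8), (3L, 5L, 0.7), // a second tight triangle
  (2L, 3L, 0.1)))                              // weak link between the two groups
val picModel = new PowerIterationClustering().setK(2).setMaxIterations(10).run(similarities)
picModel.assignments.collect().foreach(a => println(s"point ${a.id} -> cluster ${a.cluster}"))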
Example 118
Source File: GMMClustering.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.gmm

// scalastyle:off println
// $example on$
import org.apache.spark.SparkConf
import org.apache.spark.ml.clustering.{GaussianMixture, KMeans}
// $example off$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SparkSession

object GMMClustering {

  def main(args: Array[String]): Unit = {
    val spConfig = (new SparkConf).setMaster("local[1]").setAppName("SparkApp").
      set("spark.driver.allowMultipleContexts", "true")
    val spark = SparkSession
      .builder()
      .appName("Spark SQL Example")
      .config(spConfig)
      .getOrCreate()

    val datasetUsers = spark.read.format("libsvm").load(
      "./data/movie_lens_libsvm/movie_lens_users_libsvm/part-00000")
    datasetUsers.show(3)
    val gmmUsers = new GaussianMixture().setK(5).setSeed(1L)
    val modelUsers = gmmUsers.fit(datasetUsers)
    for (i <- 0 until modelUsers.gaussians.length) {
      println("Users : weight=%f\ncov=%s\nmean=\n%s\n" format
        (modelUsers.weights(i), modelUsers.gaussians(i).cov, modelUsers.gaussians(i).mean))
    }

    val dataSetItems = spark.read.format("libsvm").load(
      "./data/movie_lens_libsvm/movie_lens_items_libsvm/part-00000")
    val gmmItems = new GaussianMixture().setK(5).setSeed(1L)
    val modelItems = gmmItems.fit(dataSetItems)
    // Report the item model's parameters (the original printed the user model here by mistake).
    for (i <- 0 until modelItems.gaussians.length) {
      println("Items : weight=%f\ncov=%s\nmean=\n%s\n" format
        (modelItems.weights(i), modelItems.gaussians(i).cov, modelItems.gaussians(i).mean))
    }
    spark.stop()
  }

  def loadInLibSVMFormat(line: String, noOfFeatures: Int): LabeledPoint = {
    val items = line.split(' ')
    val label = items.head.toDouble
    val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
      val indexAndValue = item.split(':')
      val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based.
      val value = indexAndValue(1).toDouble
      (index, value)
    }.unzip
    // check if indices are one-based and in ascending order
    var previous = -1
    var i = 0
    val indicesLength = indices.length
    while (i < indicesLength) {
      val current = indices(i)
      require(current > previous, "indices should be one-based and in ascending order")
      previous = current
      i += 1
    }
    val d = noOfFeatures
    LabeledPoint(label, Vectors.sparse(d, indices, values))
  }
}
Example 119
Source File: LDATextExample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.lda import scala.collection.mutable import org.apache.spark.mllib.clustering.LDA import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext object LDATextExample { val PATH = "/home/ubuntu/work/spark-src/spark/" val sc = new SparkContext("local[2]", "First Spark App") def main(args: Array[String]): Unit = { // Load documents from text files, 1 document per file val corpus: RDD[String] = sc.wholeTextFiles(PATH + "docs/*.md").map(_._2) // Split each document into a sequence of terms (words) val tokenized: RDD[Seq[String]] = corpus.map(_.toLowerCase.split("\\s")).map(_.filter(_.length > 3). filter(_.forall(java.lang.Character.isLetter))) // Choose the vocabulary. // termCounts: Sorted list of (term, termCount) pairs val termCounts: Array[(String, Long)] = tokenized.flatMap(_.map(_ -> 1L)).reduceByKey(_ + _).collect().sortBy(-_._2) // vocabArray: Chosen vocab (removing common terms) val numStopwords = 20 val vocabArray: Array[String] = termCounts.takeRight(termCounts.size - numStopwords).map(_._1) // vocab: Map term -> term index val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap // Convert documents into term count vectors val documents: RDD[(Long, Vector)] = tokenized.zipWithIndex.map { case (tokens, id) => val counts = new mutable.HashMap[Int, Double]() tokens.foreach { term => if (vocab.contains(term)) { val idx = vocab(term) counts(idx) = counts.getOrElse(idx, 0.0) + 1.0 } } (id, Vectors.sparse(vocab.size, counts.toSeq)) } // Set LDA parameters val numTopics = 10 val lda = new LDA().setK(numTopics).setMaxIterations(10) val ldaModel = lda.run(documents) //val avgLogLikelihood = ldaModel. / documents.count() // Print topics, showing top-weighted 10 terms for each topic. val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 10) topicIndices.foreach { case (terms, termWeights) => println("TOPIC:") terms.zip(termWeights).foreach { case (term, weight) => println(s"${vocabArray(term.toInt)}\t$weight") } println() } } }
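The commented-out average log-likelihood can be recovered when the default EM optimizer is used, because run() then returns a DistributedLDAModel. A sketch under that assumption, to be placed after ldaModel is trained:

// Average log-likelihood per document (assumes the default EM optimizer, so the cast is safe).
import org.apache.spark.mllib.clustering.DistributedLDAModel

val distributedModel = ldaModel.asInstanceOf[DistributedLDAModel]
val avgLogLikelihood = distributedModel.logLikelihood / documents.count()
println(s"Average log-likelihood per document: $avgLogLikelihood")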
Example 120
Source File: SparkMatrix.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package linalg.matrix import org.apache.spark.ml.linalg.Matrix import org.apache.spark.ml.linalg.Matrices import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.distributed.IndexedRow import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.MatrixEntry object SparkMatrix { def main(args: Array[String]) { val dMatrix: Matrix = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0)) println("dMatrix: \n" + dMatrix) val sMatrixOne: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5, 6, 7)) println("sMatrixOne: \n" + sMatrixOne) val sMatrixTwo: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(5, 6, 7)) println("sMatrixTwo: \n" + sMatrixTwo) val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp") val sc = new SparkContext(spConfig) val denseData = Seq( Vectors.dense(0.0, 1.0, 2.1), Vectors.dense(3.0, 2.0, 4.0), Vectors.dense(5.0, 7.0, 8.0), Vectors.dense(9.0, 0.0, 1.1) ) val sparseData = Seq( Vectors.sparse(3, Seq((1, 1.0), (2, 2.1))), Vectors.sparse(3, Seq((0, 3.0), (1, 2.0), (2, 4.0))), Vectors.sparse(3, Seq((0, 5.0), (1, 7.0), (2, 8.0))), Vectors.sparse(3, Seq((0, 9.0), (2, 1.0))) ) val denseMat = new RowMatrix(sc.parallelize(denseData, 2)) val sparseMat = new RowMatrix(sc.parallelize(sparseData, 2)) println("Dense Matrix - Num of Rows :" + denseMat.numRows()) println("Dense Matrix - Num of Cols:" + denseMat.numCols()) println("Sparse Matrix - Num of Rows :" + sparseMat.numRows()) println("Sparse Matrix - Num of Cols:" + sparseMat.numCols()) val data = Seq( (0L, Vectors.dense(0.0, 1.0, 2.0)), (1L, Vectors.dense(3.0, 4.0, 5.0)), (3L, Vectors.dense(9.0, 0.0, 1.0)) ).map(x => IndexedRow(x._1, x._2)) val indexedRows: RDD[IndexedRow] = sc.parallelize(data, 2) val indexedRowsMat = new IndexedRowMatrix(indexedRows) println("Indexed Row Matrix - No of Rows: " + indexedRowsMat.numRows()) println("Indexed Row Matrix - No of Cols: " + indexedRowsMat.numCols()) val entries = sc.parallelize(Seq( (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0), (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map { case (i, j, value) => MatrixEntry(i, j, value) } val coordinateMat = new CoordinateMatrix(entries) println("Coordinate Matrix - No of Rows: " + coordinateMat.numRows()) println("Coordinate Matrix - No of Cols: " + coordinateMat.numCols()) sc.stop() } }
Example 121
Source File: SparkSVDExampleOne.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package linalg.svd import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition, Vector, Vectors} object SparkSVDExampleOne { def main(args: Array[String]) { val denseData = Seq( Vectors.dense(0.0, 1.0, 2.0, 1.0, 5.0, 3.3, 2.1), Vectors.dense(3.0, 4.0, 5.0, 3.1, 4.5, 5.1, 3.3), Vectors.dense(6.0, 7.0, 8.0, 2.1, 6.0, 6.7, 6.8), Vectors.dense(9.0, 0.0, 1.0, 3.4, 4.3, 1.0, 1.0) ) val spConfig = (new SparkConf).setMaster("local").setAppName("SparkSVDDemo") val sc = new SparkContext(spConfig) val mat: RowMatrix = new RowMatrix(sc.parallelize(denseData, 2)) // Compute the top 20 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(7, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. println("U:" + U) println("s:" + s) println("V:" + V) sc.stop() } }
Example 122
Source File: SparkSGD.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package linalg.sgd import scala.util.Random import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.optimization.GradientDescent import org.apache.spark.mllib.optimization.SquaredL2Updater import org.apache.spark.mllib.optimization.LogisticGradient import org.apache.spark.SparkContext object SparkSGD { def main(args: Array[String]): Unit = { val m = 4 val n = 200000 val sc = new SparkContext("local[2]", "") val points = sc.parallelize(0 until m, 2).mapPartitionsWithIndex { (idx, iter) => val random = new Random(idx) iter.map(i => (1.0, Vectors.dense(Array.fill(n)(random.nextDouble())))) }.cache() val (weights, loss) = GradientDescent.runMiniBatchSGD( points, new LogisticGradient, new SquaredL2Updater, 0.1, 2, 1.0, 1.0, Vectors.dense(new Array[Double](n))) println("w:" + weights(0)) println("loss:" + loss(0)) sc.stop() } }
Example 123
Source File: DataFrameExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml import java.io.File import scopt.OptionParser import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.util.Utils object DataFrameExample { case class Params(input: String = "data/mllib/sample_libsvm_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") .text(s"input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val spark = SparkSession .builder .appName(s"DataFrameExample with $params") .getOrCreate() // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") val df: DataFrame = spark.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") // Show statistical summary of labels. val labelSummary = df.describe("label") labelSummary.show() // Convert features column to an RDD of vectors. val features = df.select("features").rdd.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(Vectors.fromML(feat)), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") // Save the records in a parquet file. val tmpDir = Utils.createTempDir() val outputDir = new File(tmpDir, "dataframe").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) println(s"Schema from Parquet:") newDF.printSchema() spark.stop() } } // scalastyle:on println
Example 124
Source File: SummaryStatisticsExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} // $example off$ object SummaryStatisticsExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SummaryStatisticsExample") val sc = new SparkContext(conf) // $example on$ val observations = sc.parallelize( Seq( Vectors.dense(1.0, 10.0, 100.0), Vectors.dense(2.0, 20.0, 200.0), Vectors.dense(3.0, 30.0, 300.0) ) ) // Compute column summary statistics. val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) println(summary.mean) // a dense vector containing the mean value for each column println(summary.variance) // column-wise variance println(summary.numNonzeros) // number of nonzeros in each column // $example off$ sc.stop() } } // scalastyle:on println
Example 125
Source File: PCAOnSourceVectorExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD // $example off$ object PCAOnSourceVectorExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnSourceVectorExample") val sc = new SparkContext(conf) // $example on$ val data: RDD[LabeledPoint] = sc.parallelize(Seq( new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)), new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)))) // Compute the top 5 principal components. val pca = new PCA(5).fit(data.map(_.features)) // Project vectors to the linear space spanned by the top 5 principal // components, keeping the label val projected = data.map(p => p.copy(features = pca.transform(p.features))) // $example off$ val collect = projected.collect() println("Projected vector of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 126
Source File: PCAOnRowMatrixExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object PCAOnRowMatrixExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnRowMatrixExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 4 principal components. // Principal components are stored in a local dense matrix. val pc: Matrix = mat.computePrincipalComponents(4) // Project the rows to the linear space spanned by the top 4 principal components. val projected: RowMatrix = mat.multiply(pc) // $example off$ val collect = projected.rows.collect() println("Projected Row Matrix of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 127
Source File: TallSkinnyPCA.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnyPCA { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnyPCA <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnyPCA") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) println("Principal components are:\n" + pc) sc.stop() } } // scalastyle:on println
Example 128
Source File: GaussianMixtureExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object GaussianMixtureExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("GaussianMixtureExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/gmm_data.txt") val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using GaussianMixture val gmm = new GaussianMixture().setK(2).run(parsedData) // Save and load model gmm.save(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") val sameModel = GaussianMixtureModel.load(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") // output parameters of max-likelihood model for (i <- 0 until gmm.k) { println("weight=%f\nmu=%s\nsigma=\n%s\n" format (gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma)) } // $example off$ sc.stop() } } // scalastyle:on println
Example 129
Source File: PCAExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} // $example off$ @deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0") object PCAExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAExample") val sc = new SparkContext(conf) // $example on$ val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) val numIterations = 100 val model = LinearRegressionWithSGD.train(training, numIterations) val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) val valuesAndPreds = test.map { point => val score = model.predict(point.features) (score, point.label) } val valuesAndPreds_pca = test_pca.map { point => val score = model_pca.predict(point.features) (score, point.label) } val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() println("Mean Squared Error = " + MSE) println("PCA Mean Squared Error = " + MSE_pca) // $example off$ sc.stop() } } // scalastyle:on println
Example 130
Source File: DenseKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 131
Source File: CosineSimilarity.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} object CosineSimilarity { case class Params(inputFile: String = null, threshold: Double = 0.1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("CosineSimilarity") { head("CosineSimilarity: an example app.") opt[Double]("threshold") .required() .text(s"threshold similarity: to tradeoff computation vs quality estimate") .action((x, c) => c.copy(threshold = x)) arg[String]("<inputFile>") .required() .text(s"input file, one row per line, space-separated") .action((x, c) => c.copy(inputFile = x)) note( """ |For example, the following command runs this app on a dataset: | | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \ | examplesjar.jar \ | --threshold 0.1 data/mllib/sample_svm_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName("CosineSimilarity") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(params.inputFile).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) }.cache() val mat = new RowMatrix(rows) // Compute similar columns perfectly, with brute force. val exact = mat.columnSimilarities() // Compute similar columns with estimation using DIMSUM val approx = mat.columnSimilarities(params.threshold) val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) } val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) } val MAE = exactEntries.leftOuterJoin(approxEntries).values.map { case (u, Some(v)) => math.abs(u - v) case (u, None) => math.abs(u) }.mean() println(s"Average absolute error in estimate is: $MAE") sc.stop() } } // scalastyle:on println
Example 132
Source File: ElementwiseProductExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.ElementwiseProduct import org.apache.spark.mllib.linalg.Vectors // $example off$ object ElementwiseProductExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("ElementwiseProductExample") val sc = new SparkContext(conf) // $example on$ // Create some vector data; also works for sparse vectors val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))) val transformingVector = Vectors.dense(0.0, 1.0, 2.0) val transformer = new ElementwiseProduct(transformingVector) // Batch transform and per-row transform give the same results: val transformedData = transformer.transform(data) val transformedData2 = data.map(x => transformer.transform(x)) // $example off$ println("transformedData: ") transformedData.foreach(x => println(x)) println("transformedData2: ") transformedData2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 133
Source File: SVDExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") } } // scalastyle:on println
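A shorter sketch of the same call keeping only the top two singular values (hypothetical data, SparkContext sc assumed), which is the more common usage when the full decomposition is not needed:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val rows = sc.parallelize(Seq(
  Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
  Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)))
val mat = new RowMatrix(rows)

// Keep only the top 2 singular values; skip U when only s and V are needed.
val svd = mat.computeSVD(2, computeU = false)
println(s"Singular values: ${svd.s}")
println(s"V:\n${svd.V}")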
Example 134
Source File: TallSkinnySVD.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnySVD { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnySVD <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnySVD") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println("Singular values are " + svd.s) sc.stop() } } // scalastyle:on println
Example 135
Source File: StandardScalerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils // $example off$ object StandardScalerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("StandardScalerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
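Mean centering cannot be represented sparsely, which appears to be why the example densifies the features explicitly before applying scaler2. A small sketch of the two scalers on dense data (hypothetical values, SparkContext sc assumed):

import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Vectors

val features = sc.parallelize(Seq(
  Vectors.dense(1.0, 0.0, 3.0),
  Vectors.dense(2.0, 0.0, 5.0)))

// withStd only: pure scaling. withMean and withStd: centering plus scaling, dense output.
val stdOnly = new StandardScaler().fit(features)
val meanAndStd = new StandardScaler(withMean = true, withStd = true).fit(features)

println(stdOnly.transform(Vectors.dense(1.0, 0.0, 3.0)))
println(meanAndStd.transform(Vectors.dense(1.0, 0.0, 3.0)))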
Example 136
Source File: KMeansExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object KMeansExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("KMeansExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/kmeans_data.txt") val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using KMeans val numClusters = 2 val numIterations = 20 val clusters = KMeans.train(parsedData, numClusters, numIterations) // Evaluate clustering by computing Within Set Sum of Squared Errors val WSSSE = clusters.computeCost(parsedData) println("Within Set Sum of Squared Errors = " + WSSSE) // Save and load model clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel") val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel") // $example off$ sc.stop() } } // scalastyle:on println
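Once trained, the model can score new points directly. A short sketch of the calls that usually follow the example above, reusing its clusters model (the query point is hypothetical):

import org.apache.spark.mllib.linalg.Vectors

// Assign a new observation to its nearest cluster and inspect the centers.
val point = Vectors.dense(0.2, 0.2, 0.2)
val clusterId = clusters.predict(point)
println(s"Point $point belongs to cluster $clusterId")
clusters.clusterCenters.foreach(println)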
Example 137
Source File: MultivariateSummarizer.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils object MultivariateSummarizer { case class Params(input: String = "data/mllib/sample_linear_regression_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("MultivariateSummarizer") { head("MultivariateSummarizer: an example app for MultivariateOnlineSummarizer") opt[String]("input") .text(s"input path to labeled examples, default: ${defaultParams.input}") .action((x, c) => c.copy(input = x)) note( """ |For example, the following command runs this app on a synthetic dataset: | | bin/spark-submit --class org.apache.spark.examples.mllib.MultivariateSummarizer \ | examples/target/scala-*/spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Summarize labels val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(Vectors.dense(lp.label)), (sum1, sum2) => sum1.merge(sum2)) // Summarize features val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(lp.features), (sum1, sum2) => sum1.merge(sum2)) println() println(s"Summary statistics") println(s"\tLabel\tFeatures") println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}") println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}") println( s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}") println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}") println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}") println() sc.stop() } } // scalastyle:on println
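MultivariateOnlineSummarizer can also be driven locally without an RDD, which makes the aggregate call above easier to follow. A minimal sketch with hypothetical vectors:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

// add() folds in one vector at a time; merge() combines partial summaries, as in the aggregate above.
val s1 = new MultivariateOnlineSummarizer()
  .add(Vectors.dense(1.0, 10.0))
  .add(Vectors.dense(2.0, 20.0))
val s2 = new MultivariateOnlineSummarizer().add(Vectors.dense(3.0, 30.0))

val merged = s1.merge(s2)
println(merged.mean)
println(merged.variance)
println(merged.numNonzeros)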
Example 138
Source File: LinearRegressionWithSGDExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD // $example off$ @deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0") object LinearRegressionWithSGDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/ridge-data/lpsa.data") val parsedData = data.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // Building the model val numIterations = 100 val stepSize = 0.00000001 val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) // Evaluate model on training examples and compute training error val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() println("training Mean Squared Error = " + MSE) // Save and load model model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel") val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel") // $example off$ sc.stop() } } // scalastyle:on println
Example 139
Source File: StreamingLinearRegressionExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD // $example off$ import org.apache.spark.streaming._ object StreamingLinearRegressionExample { def main(args: Array[String]): Unit = { if (args.length != 2) { System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>") System.exit(1) } val conf = new SparkConf().setAppName("StreamingLinearRegressionExample") val ssc = new StreamingContext(conf, Seconds(1)) // $example on$ val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache() val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val numFeatures = 3 val model = new StreamingLinearRegressionWithSGD() .setInitialWeights(Vectors.zeros(numFeatures)) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ ssc.stop() } } // scalastyle:on println
Example 140
Source File: BisectingKMeansExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib // scalastyle:off println import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.BisectingKMeans import org.apache.spark.mllib.linalg.{Vector, Vectors} // $example off$ object BisectingKMeansExample { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample") val sc = new SparkContext(sparkConf) // $example on$ // Loads and parses data def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble)) val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache() // Clustering the data into 6 clusters by BisectingKMeans. val bkm = new BisectingKMeans().setK(6) val model = bkm.run(data) // Show the compute cost and the cluster centers println(s"Compute Cost: ${model.computeCost(data)}") model.clusterCenters.zipWithIndex.foreach { case (center, idx) => println(s"Cluster Center ${idx}: ${center}") } // $example off$ sc.stop() } } // scalastyle:on println
Example 141
Source File: StreamingKMeansExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} // $example off$ object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } // $example on$ val conf = new SparkConf().setAppName("StreamingKMeansExample") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(args(3).toInt) .setDecayFactor(1.0) .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ } } // scalastyle:on println
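Because the training stream is parsed with Vectors.parse, the files dropped into the training directory must contain one vector per line in MLlib's text format. A short sketch of the two accepted forms:

import org.apache.spark.mllib.linalg.Vectors

// Dense: a bracketed list of values. Sparse: (size, [indices], [values]).
val dense = Vectors.parse("[1.0,2.0,3.0]")
val sparse = Vectors.parse("(3,[0,2],[1.0,3.0])")
println(dense)
println(sparse)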
Example 142
Source File: Normalizer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
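The snippet above omits the surrounding class declaration (the transform shown lives in Spark's mllib Normalizer, parameterized by the norm degree p). A brief usage sketch of the public API:

import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.linalg.Vectors

// p = 2 (the default) rescales each vector to unit Euclidean length;
// sparse vectors keep their index array, as the transform above shows.
val l2 = new Normalizer()
val l1 = new Normalizer(p = 1.0)

println(l2.transform(Vectors.dense(3.0, 4.0)))                      // [0.6, 0.8]
println(l1.transform(Vectors.sparse(3, Seq((0, 2.0), (2, 2.0)))))   // stored values become 0.5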
Example 143
Source File: GaussianMixtureModelWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import scala.collection.JavaConverters import org.apache.spark.SparkContext import org.apache.spark.mllib.clustering.GaussianMixtureModel import org.apache.spark.mllib.linalg.{Vector, Vectors} private[python] class GaussianMixtureModelWrapper(model: GaussianMixtureModel) { val gaussians: Array[Byte] = { val modelGaussians = model.gaussians.map { gaussian => Array[Any](gaussian.mu, gaussian.sigma) } SerDe.dumps(JavaConverters.seqAsJavaListConverter(modelGaussians).asJava) } def predictSoft(point: Vector): Vector = { Vectors.dense(model.predictSoft(point)) } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 144
Source File: Word2VecModelWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import java.util.{List => JList, Map => JMap} import scala.collection.JavaConverters._ import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.feature.Word2VecModel import org.apache.spark.mllib.linalg.{Vector, Vectors} private[python] class Word2VecModelWrapper(model: Word2VecModel) { def findSynonyms(vector: Vector, num: Int): JList[Object] = { prepareResult(model.findSynonyms(vector, num)) } private def prepareResult(result: Array[(String, Double)]) = { val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } def getVectors: JMap[String, JList[Float]] = { model.getVectors.map { case (k, v) => (k, v.toList.asJava) }.asJava } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 145
Source File: MatrixFactorizationModelWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating} import org.apache.spark.rdd.RDD private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel) extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) { def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] = predict(SerDe.asTupleRDD(userAndProducts.rdd)) def getUserFeatures: RDD[Array[Any]] = { SerDe.fromTuple2RDD(userFeatures.map { case (user, feature) => (user, Vectors.dense(feature)) }.asInstanceOf[RDD[(Any, Any)]]) } def getProductFeatures: RDD[Array[Any]] = { SerDe.fromTuple2RDD(productFeatures.map { case (product, feature) => (product, Vectors.dense(feature)) }.asInstanceOf[RDD[(Any, Any)]]) } def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = { SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]]) } def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = { SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]]) } }
Example 146
Source File: SpearmanCorrelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
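This internal object is normally reached through Statistics.corr with the "spearman" method name rather than called directly. A minimal sketch (hypothetical data, SparkContext sc assumed):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

val observations = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 30.0, 200.0),
  Vectors.dense(3.0, 20.0, 300.0)))

// "pearson" is the default; "spearman" routes to the rank-based computation above.
val rho = Statistics.corr(observations, "spearman")
println(rho)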
Example 147
Source File: LogisticRegressionDataGenerator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object LogisticRegressionDataGenerator { @Since("0.8.0") def generateLogisticRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } @Since("0.8.0") def main(args: Array[String]) { if (args.length != 5) { // scalastyle:off println println("Usage: LogisticRegressionGenerator " + "<master> <output_dir> <num_examples> <num_features> <num_partitions>") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val eps = 3 val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator") val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts) data.saveAsTextFile(outputPath) sc.stop() } }
Example 148
Source File: SVMDataGenerator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object SVMDataGenerator { @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 149
Source File: LabeledPoint.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = { LabeledPoint(point.label, Vectors.fromML(point.features)) } }
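The parse method above accepts both the current text format and the pre-1.0 dense format. A short sketch of each (hypothetical strings):

import org.apache.spark.mllib.regression.LabeledPoint

// Current format: (label, vector), where the vector uses Vectors.parse notation.
val current = LabeledPoint.parse("(1.0,[1.0,0.0,3.0])")
// Pre-1.0 dense format: label, then space-separated feature values.
val legacy = LabeledPoint.parse("1.0,1.0 0.0 3.0")
println(current)
println(legacy)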
Example 150
Source File: ChiSqSelectorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { test("ChiSqSelector transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = Seq(LabeledPoint(0.0, Vectors.dense(Array(8.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSeq assert(filteredData === preFilteredData) } test("ChiSqSelector by fpr transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2) val preFilteredData = Seq(LabeledPoint(0.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(2.0, Vectors.dense(Array(9.0)))) val model: ChiSqSelectorModel = new ChiSqSelector().setSelectorType("fpr") .setFpr(0.1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSeq assert(filteredData === preFilteredData) } test("model load / save") { val model = ChiSqSelectorSuite.createModel() val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString try { model.save(sc, path) val sameModel = ChiSqSelectorModel.load(sc, path) ChiSqSelectorSuite.checkEqual(model, sameModel) } finally { Utils.deleteRecursively(tempDir) } } } object ChiSqSelectorSuite extends SparkFunSuite { def createModel(): ChiSqSelectorModel = { val arr = Array(1, 2, 3, 4) new ChiSqSelectorModel(arr) } def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = { assert(a.selectedFeatures.deep == b.selectedFeatures.deep) } }
Example 151
Source File: ElementwiseProductSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 152
Source File: IDFSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 153
Source File: PCASuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } }
Example 154
Source File: HashingTFSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext { test("hashing tf on a single doc") { val hashingTF = new HashingTF(1000) val doc = "a a b b c d".split(" ") val n = hashingTF.numFeatures val termFreqs = Seq( (hashingTF.indexOf("a"), 2.0), (hashingTF.indexOf("b"), 2.0), (hashingTF.indexOf("c"), 1.0), (hashingTF.indexOf("d"), 1.0)) assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n), "index must be in range [0, #features)") assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing") val expected = Vectors.sparse(n, termFreqs) assert(hashingTF.transform(doc) === expected) } test("hashing tf on an RDD") { val hashingTF = new HashingTF val localDocs: Seq[Seq[String]] = Seq( "a a b b b c d".split(" "), "a b c d a b c".split(" "), "c b a c b a a".split(" ")) val docs = sc.parallelize(localDocs, 2) assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet) } test("applying binary term freqs") { val hashingTF = new HashingTF(100).setBinary(true) val doc = "a a b c c c".split(" ") val n = hashingTF.numFeatures val expected = Vectors.sparse(n, Seq( (hashingTF.indexOf("a"), 1.0), (hashingTF.indexOf("b"), 1.0), (hashingTF.indexOf("c"), 1.0))) assert(hashingTF.transform(doc) ~== expected absTol 1e-14) } }
Example 155
Source File: EnsembleTestHelper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 156
Source File: PythonMLLibAPISuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors} import org.apache.spark.mllib.recommendation.Rating import org.apache.spark.mllib.regression.LabeledPoint class PythonMLLibAPISuite extends SparkFunSuite { SerDe.initialize() test("pickle vector") { val vectors = Seq( Vectors.dense(Array.empty[Double]), Vectors.dense(0.0), Vectors.dense(0.0, -2.0), Vectors.sparse(0, Array.empty[Int], Array.empty[Double]), Vectors.sparse(1, Array.empty[Int], Array.empty[Double]), Vectors.sparse(2, Array(1), Array(-2.0))) vectors.foreach { v => val u = SerDe.loads(SerDe.dumps(v)) assert(u.getClass === v.getClass) assert(u === v) } } test("pickle labeled point") { val points = Seq( LabeledPoint(0.0, Vectors.dense(Array.empty[Double])), LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)), LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])), LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])), LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0)))) points.foreach { p => val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint] assert(q.label === p.label) assert(q.features.getClass === p.features.getClass) assert(q.features === p.features) } } test("pickle double") { for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) { val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double] // We use `equals` here for comparison because we cannot use `==` for NaN assert(x.equals(deser)) } } test("pickle matrix") { val values = Array[Double](0, 1.2, 3, 4.56, 7, 8) val matrix = Matrices.dense(2, 3, values) val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix] assert(matrix === nm) // Test conversion for empty matrix val empty = Array.empty[Double] val emptyMatrix = Matrices.dense(0, 0, empty) val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4)) val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix] assert(sm.toArray === nsm.toArray) val smt = new SparseMatrix( 3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9), isTransposed = true) val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix] assert(smt.toArray === nsmt.toArray) } test("pickle rating") { val rat = new Rating(1, 2, 3.0) val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating] assert(rat == rat2) // Test name of class only occur once val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray val bytes = SerDe.dumps(rats) assert(bytes.toString.split("Rating").length == 1) assert(bytes.length / 10 < 25) // 25 bytes per rating } }
Example 157
Source File: MultivariateGaussianSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Matrices, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) val mu = Vectors.dense(0.0) val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) } test("SPARK-11302") { val x = Vectors.dense(629, 640, 1.7188, 618.19) val mu = Vectors.dense( 1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697) val sigma = Matrices.dense(4, 4, Array( 166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053, 169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484, 12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373, 164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207)) val dist = new MultivariateGaussian(mu, sigma) // Agrees with R's dmvnorm: 7.154782e-05 assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9) } }
Example 158
Source File: KMeansPMMLModelExportSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export import org.dmg.pmml.ClusteringModel import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.clustering.KMeansModel import org.apache.spark.mllib.linalg.Vectors class KMeansPMMLModelExportSuite extends SparkFunSuite { test("KMeansPMMLModelExport generate PMML format") { val clusterCenters = Array( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0)) val kmeansModel = new KMeansModel(clusterCenters) val modelExport = PMMLModelExportFactory.createPMMLModelExport(kmeansModel) // assert that the PMML format is as expected assert(modelExport.isInstanceOf[PMMLModelExport]) val pmml = modelExport.asInstanceOf[PMMLModelExport].getPmml assert(pmml.getHeader.getDescription === "k-means clustering") // check that the number of fields match the single vector size assert(pmml.getDataDictionary.getNumberOfFields === clusterCenters(0).size) // This verify that there is a model attached to the pmml object and the model is a clustering // one. It also verifies that the pmml model has the same number of clusters of the spark model. val pmmlClusteringModel = pmml.getModels.get(0).asInstanceOf[ClusteringModel] assert(pmmlClusteringModel.getNumberOfClusters === clusterCenters.length) } }
Example 159
Source File: PMMLModelExportFactorySuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel} import org.apache.spark.mllib.clustering.KMeansModel import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel} import org.apache.spark.mllib.util.LinearDataGenerator class PMMLModelExportFactorySuite extends SparkFunSuite { test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") { val clusterCenters = Array( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0)) val kmeansModel = new KMeansModel(clusterCenters) val modelExport = PMMLModelExportFactory.createPMMLModelExport(kmeansModel) assert(modelExport.isInstanceOf[KMeansPMMLModelExport]) } test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a " + "LinearRegressionModel, RidgeRegressionModel or LassoModel") { val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17) val linearRegressionModel = new LinearRegressionModel(linearInput(0).features, linearInput(0).label) val linearModelExport = PMMLModelExportFactory.createPMMLModelExport(linearRegressionModel) assert(linearModelExport.isInstanceOf[GeneralizedLinearPMMLModelExport]) val ridgeRegressionModel = new RidgeRegressionModel(linearInput(0).features, linearInput(0).label) val ridgeModelExport = PMMLModelExportFactory.createPMMLModelExport(ridgeRegressionModel) assert(ridgeModelExport.isInstanceOf[GeneralizedLinearPMMLModelExport]) val lassoModel = new LassoModel(linearInput(0).features, linearInput(0).label) val lassoModelExport = PMMLModelExportFactory.createPMMLModelExport(lassoModel) assert(lassoModelExport.isInstanceOf[GeneralizedLinearPMMLModelExport]) } test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport " + "when passing a LogisticRegressionModel or SVMModel") { val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17) val logisticRegressionModel = new LogisticRegressionModel(linearInput(0).features, linearInput(0).label) val logisticRegressionModelExport = PMMLModelExportFactory.createPMMLModelExport(logisticRegressionModel) assert(logisticRegressionModelExport.isInstanceOf[BinaryClassificationPMMLModelExport]) val svmModel = new SVMModel(linearInput(0).features, linearInput(0).label) val svmModelExport = PMMLModelExportFactory.createPMMLModelExport(svmModel) assert(svmModelExport.isInstanceOf[BinaryClassificationPMMLModelExport]) } test("PMMLModelExportFactory throw IllegalArgumentException " + "when passing a Multinomial Logistic Regression") { val multiclassLogisticRegressionModel = new LogisticRegressionModel( weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0, numFeatures = 2, numClasses = 3) intercept[IllegalArgumentException] { PMMLModelExportFactory.createPMMLModelExport(multiclassLogisticRegressionModel) } } test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") { val invalidModel = new Object intercept[IllegalArgumentException] { PMMLModelExportFactory.createPMMLModelExport(invalidModel) } } }
Example 160
Source File: CoordinateMatrixSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { val m = 5 val n = 4 var mat: CoordinateMatrix = _ override def beforeAll() { super.beforeAll() val entries = sc.parallelize(Seq( (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0), (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map { case (i, j, value) => MatrixEntry(i, j, value) } mat = new CoordinateMatrix(entries) } test("size") { assert(mat.numRows() === m) assert(mat.numCols() === n) } test("empty entries") { val entries = sc.parallelize(Seq[MatrixEntry](), 1) val emptyMat = new CoordinateMatrix(entries) intercept[RuntimeException] { emptyMat.numCols() } intercept[RuntimeException] { emptyMat.numRows() } } test("toBreeze") { val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(mat.toBreeze() === expected) } test("transpose") { val transposed = mat.transpose() assert(mat.toBreeze().t === transposed.toBreeze()) } test("toIndexedRowMatrix") { val indexedRowMatrix = mat.toIndexedRowMatrix() val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(indexedRowMatrix.toBreeze() === expected) } test("toRowMatrix") { val rowMatrix = mat.toRowMatrix() val rows = rowMatrix.rows.collect().toSet val expected = Set( Vectors.dense(1.0, 2.0, 0.0, 0.0), Vectors.dense(0.0, 3.0, 4.0, 0.0), Vectors.dense(0.0, 0.0, 5.0, 6.0), Vectors.dense(7.0, 0.0, 0.0, 8.0), Vectors.dense(0.0, 9.0, 0.0, 0.0)) assert(rows === expected) } test("toBlockMatrix") { val blockMat = mat.toBlockMatrix(2, 2) assert(blockMat.numRows() === m) assert(blockMat.numCols() === n) assert(blockMat.toBreeze() === mat.toBreeze()) intercept[IllegalArgumentException] { mat.toBlockMatrix(-1, 2) } intercept[IllegalArgumentException] { mat.toBlockMatrix(2, 0) } } }
Example 161
Source File: LabeledPointSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.Vectors class LabeledPointSuite extends SparkFunSuite { test("parse labeled points") { val points = Seq( LabeledPoint(1.0, Vectors.dense(1.0, 0.0)), LabeledPoint(0.0, Vectors.sparse(2, Array(1), Array(-1.0)))) points.foreach { p => assert(p === LabeledPoint.parse(p.toString)) } } test("parse labeled points with whitespaces") { val point = LabeledPoint.parse("(0.0, [1.0, 2.0])") assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0))) } test("parse labeled points with v0.9 format") { val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0") assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0))) } test("conversions between new ml LabeledPoint and mllib LabeledPoint") { val points: Seq[LabeledPoint] = Seq( LabeledPoint(1.0, Vectors.dense(1.0, 0.0)), LabeledPoint(0.0, Vectors.sparse(2, Array(1), Array(-1.0)))) val newPoints: Seq[NewLabeledPoint] = points.map(_.asML) points.zip(newPoints).foreach { case (p1, p2) => assert(p1 === LabeledPoint.fromML(p2)) } } }
Example 162
Source File: RidgeRegressionSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.{LinearDataGenerator, LocalClusterSparkContext, MLlibTestSparkContext} import org.apache.spark.util.Utils private object RidgeRegressionSuite { val model = new RidgeRegressionModel(weights = Vectors.dense(0.1, 0.2, 0.3), intercept = 0.5) } class RidgeRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { def predictionError(predictions: Seq[Double], input: Seq[LabeledPoint]): Double = { predictions.zip(input).map { case (prediction, expected) => (prediction - expected.label) * (prediction - expected.label) }.sum / predictions.size } test("ridge regression can help avoid overfitting") { // For small number of examples and large variance of error distribution, // ridge regression should give smaller generalization error that linear regression. val numExamples = 50 val numFeatures = 20 // Pick weights as random values distributed uniformly in [-0.5, 0.5] val random = new Random(42) val w = Array.fill(numFeatures)(random.nextDouble() - 0.5) // Use half of data for training and other half for validation val data = LinearDataGenerator.generateLinearInput(3.0, w, 2 * numExamples, 42, 10.0) val testData = data.take(numExamples) val validationData = data.takeRight(numExamples) val testRDD = sc.parallelize(testData, 2).cache() val validationRDD = sc.parallelize(validationData, 2).cache() // First run without regularization. val linearReg = new LinearRegressionWithSGD() linearReg.optimizer.setNumIterations(200) .setStepSize(1.0) val linearModel = linearReg.run(testRDD) val linearErr = predictionError( linearModel.predict(validationRDD.map(_.features)).collect(), validationData) val ridgeReg = new RidgeRegressionWithSGD() ridgeReg.optimizer.setNumIterations(200) .setRegParam(0.1) .setStepSize(1.0) val ridgeModel = ridgeReg.run(testRDD) val ridgeErr = predictionError( ridgeModel.predict(validationRDD.map(_.features)).collect(), validationData) // Ridge validation error should be lower than linear regression. assert(ridgeErr < linearErr, "ridgeError (" + ridgeErr + ") was not less than linearError(" + linearErr + ")") } test("model save/load") { val model = RidgeRegressionSuite.model val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString // Save model, load it back, and compare. try { model.save(sc, path) val sameModel = RidgeRegressionModel.load(sc, path) assert(model.weights == sameModel.weights) assert(model.intercept == sameModel.intercept) } finally { Utils.deleteRecursively(tempDir) } } } class RidgeRegressionClusterSuite extends SparkFunSuite with LocalClusterSparkContext { test("task size should be small in both training and prediction") { val m = 4 val n = 200000 val points = sc.parallelize(0 until m, 2).mapPartitionsWithIndex { (idx, iter) => val random = new Random(idx) iter.map(i => LabeledPoint(1.0, Vectors.dense(Array.fill(n)(random.nextDouble())))) }.cache() // If we serialize data directly in the task closure, the size of the serialized task would be // greater than 1MB and hence Spark would throw an error. val model = RidgeRegressionWithSGD.train(points, 2) val predictions = model.predict(points.map(_.features)) } }
Example 163
Source File: HoltWintersBestModelEvaluation.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.UberHoltWintersModel import eleflow.uberdata.enums.SupportedAlgorithm import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.evaluation.TimeSeriesEvaluator import org.apache.spark.ml.param.{ParamMap, ParamPair} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.Row import scala.reflect.ClassTag abstract class HoltWintersBestModelEvaluation[L, M <: ForecastBaseModel[M]]( implicit kt: ClassTag[L], ord: Ordering[L] = null ) extends BestModelFinder[L, M] with HoltWintersParams { protected def holtWintersEvaluation( row: Row, model: UberHoltWintersModel, broadcastEvaluator: Broadcast[TimeSeriesEvaluator[L]], id: L ): (UberHoltWintersModel, ModelParamEvaluation[L]) = { val features = row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol)) log.warn( s"Evaluating forecast for id $id, with parameters " + s"alpha ${model.alpha}, beta ${model.beta} and gamma ${model.gamma}" ) val expectedResult = row.getAs[org.apache.spark.ml.linalg.Vector](partialValidationCol) val forecastToBeValidated = Vectors.dense(new Array[Double]($(nFutures))) model.forecast(org.apache.spark.mllib.linalg.Vectors.fromML(features), forecastToBeValidated).toArray val toBeValidated = expectedResult.toArray.zip(forecastToBeValidated.toArray) val metric = broadcastEvaluator.value.evaluate(toBeValidated) val metricName = broadcastEvaluator.value.getMetricName val params = ParamMap().put( ParamPair(gamma, model.gamma), ParamPair(beta, model.beta), ParamPair(alpha, model.alpha) ) (model, new ModelParamEvaluation[L]( id, metric, params, Some(metricName), SupportedAlgorithm.HoltWinters )) } }
Example 164
Source File: QuadraticRenyiEntropy.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.prototype import breeze.linalg.DenseVector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import io.github.mandar2812.dynaml.kernels.DensityKernel override def entropy(data: List[DenseVector[Double]]): Double = { val dim = data.head.length val root_two: breeze.linalg.Vector[Double] = DenseVector.fill(dim, sqrt(2)) val product = for(i <- data.view; j <- data.view) yield (i, j) -1*log_e(product.map((couple) => { val point1: DenseVector[Double] = couple._1 / sqrt(2.0) val point2: DenseVector[Double] = couple._2 / sqrt(2.0) density.eval(point1 - point2) }).sum) } override def entropy[K](data: RDD[(K, LabeledPoint)]): Double = { val dim = data.first()._2.features.size -1*log_e(data.cartesian(data).map((couple) =>{ val point1: DenseVector[Double] = DenseVector(couple._1._2.features.toArray) / sqrt(2.0) val point2: DenseVector[Double] = DenseVector(couple._2._2.features.toArray) / sqrt(2.0) density.eval(point1 - point2) }).reduce((a,b) => a + b)) } def entropyDifference(entropy: Double, data: List[DenseVector[Double]], add: DenseVector[Double], remove: DenseVector[Double]): Double = { val dim = data.head.length val expEntropy = math.exp(-1.0*entropy) val product1 = for(i <- data.view) yield (remove, i) val subtractEnt = 2*product1.map((couple) => { density.eval((couple._1 - couple._2) / sqrt(2.0)) }).sum - density.eval(DenseVector.zeros(dim)) val product2 = for(i <- data.view) yield (add, i) val addEnt = 2*product2.map((couple) => { density.eval((couple._1 - couple._2) / sqrt(2.0)) }).sum - 2*density.eval((add - remove) / sqrt(2.0)) + density.eval(DenseVector.zeros(dim)) -1.0*log_e(expEntropy + addEnt - subtractEnt) - entropy } }
Example 165
Source File: SparkLogisticGLM.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.lm //Breeze Imports import breeze.linalg.DenseVector import breeze.numerics.sigmoid import breeze.stats.distributions.Gaussian import io.github.mandar2812.dynaml.optimization.ProbitGradient import org.apache.spark.mllib.linalg.Vectors //DynaML Imports import io.github.mandar2812.dynaml.optimization.{ GradientDescentSpark, LogisticGradient, RegularizedOptimizer, SquaredL2Updater} //Spark Imports import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD class SparkProbitGLM( data: RDD[(DenseVector[Double], Double)], numPoints: Long, map: (DenseVector[Double]) => DenseVector[Double] = identity[DenseVector[Double]]) extends SparkLogisticGLM(data, numPoints, map) { private val standardGaussian = new Gaussian(0, 1.0) override val h: (Double) => Double = (x: Double) => standardGaussian.cdf(x) override protected val optimizer: RegularizedOptimizer[ DenseVector[Double], DenseVector[Double], Double, RDD[LabeledPoint]] = new GradientDescentSpark(new ProbitGradient, new SquaredL2Updater) }
Example 166
Source File: Test_example_CNN.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package tests import org.apache.log4j.{ Level, Logger } import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.storage.StorageLevel import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.linalg.{ Vector, Vectors } import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.regression.LabeledPoint import breeze.linalg.{ Matrix => BM, CSCMatrix => BSM, DenseMatrix => BDM, Vector => BV, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy, svd => brzSvd, max => Bmax, min => Bmin, sum => Bsum } import scala.collection.mutable.ArrayBuffer import CNN.CNN object Test_example_CNN { def main(args: Array[String]) { //1 Set up the Spark context val conf = new SparkConf().setAppName("CNNtest") val sc = new SparkContext(conf) //2 Load the sample data Logger.getRootLogger.setLevel(Level.WARN) val data_path = "/deeplearn/train_d3.txt" val examples = sc.textFile(data_path).cache() val train_d1 = examples.map { line => val f1 = line.split("\t") val f = f1.map(f => f.toDouble) val y = f.slice(0, 10) val x = f.slice(10, f.length) (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0) } val train_d = train_d1.map(f => (f._1, f._2)) //3 Set the training parameters and build the model // opts: training options passed to CNNtrain (batch size, number of epochs, validation fraction) val opts = Array(50.0, 1.0, 0.0) train_d.cache val numExamples = train_d.count() println(s"numExamples = $numExamples.") val CNNmodel = new CNN(). setMapsize(new BDM(1, 2, Array(28.0, 28.0))). setTypes(Array("i", "c", "s", "c", "s")). setLayer(5). setOnum(10). setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)). setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)). setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)). setAlpha(1.0). CNNtrain(train_d, opts) //4 Test the model val CNNforecast = CNNmodel.predict(train_d) val CNNerror = CNNmodel.Loss(CNNforecast) println(s"NNerror = $CNNerror.") val printf1 = CNNforecast.map(f => (f.label.data, f.predict_label.data)).take(200) println("Predicted values") for (i <- 0 until printf1.length) { val outi = printf1(i)._2.mkString("\t") println(outi) } } }
Example 167
Source File: NegativeCorrelationExample.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.Statistics import org.apache.spark.sql.SparkSession object NegativeCorrelationExample { def getSparkSession(): SparkSession = { val spark = SparkSession.builder().master("local").getOrCreate() spark.sparkContext.setLogLevel("ERROR") spark } def main(args: Array[String]): Unit = { val spark = getSparkSession() val data = spark.sparkContext.parallelize( Seq( Vectors.dense(0.0, 1.0, 100.0), Vectors.dense(-10.0, 10.0, 200.0), Vectors.dense(-20.0, 100.0, 300.0), Vectors.dense(-30.0, 1000.0, 400.0), Vectors.dense(-40.0, 10000.0, 500.0), Vectors.dense(-50.0, 100000.0, 600.0), Vectors.dense(-60.0, 1000000.0, 700.0), Vectors.dense(-70.0, 10000000.0, 800.0), Vectors.dense(-80.0, 100000000.0, 900.0), Vectors.dense(-90.0, 1000000000.0, 1000.0) ) ) val corr = Statistics.corr(data) println(s"Correlation:\n${corr}") spark.stop() } }
Example 168
Source File: PositiveCorrelationExample.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example import org.apache.spark.sql.SparkSession import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.Statistics object PositiveCorrelationExample { def getSparkSession(): SparkSession = { val spark = SparkSession.builder().master("local").getOrCreate() spark.sparkContext.setLogLevel("ERROR") spark } def main(args: Array[String]): Unit = { val spark = getSparkSession() val data = spark.sparkContext.parallelize( Seq( Vectors.dense(0.0, 1.0, 100.0), Vectors.dense(10.0, 10.0, 200.0), Vectors.dense(20.0, 100.0, 300.0), Vectors.dense(30.0, 1000.0, 400.0), Vectors.dense(40.0, 10000.0, 500.0), Vectors.dense(50.0, 100000.0, 600.0), Vectors.dense(60.0, 1000000.0, 700.0), Vectors.dense(70.0, 10000000.0, 800.0), Vectors.dense(80.0, 100000000.0, 900.0), Vectors.dense(90.0, 1000000000.0, 1000.0) ) ) val summary = Statistics.colStats(data) // Compute column summary statistics println( s"""Summary: ${summary.count} // number of records ${summary.mean} // mean value for each column ${summary.min} // column-wise min ${summary.max} // column-wise max ${summary.normL1} // column-wise norm L1 ${summary.normL2} // column-wise Euclidean magnitude ${summary.variance} // column-wise variance ${summary.numNonzeros} // column-wise count of non-zero values """.stripMargin) val corr = Statistics.corr(data) println(s"Correlation:\n${corr}") spark.stop() } }
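Examples 167 and 168 compute a full correlation matrix over an RDD of vectors. Statistics.corr can also correlate two RDD[Double] series directly and accepts a method name; a minimal sketch under those assumptions (object name and data are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.stat.Statistics

object SeriesCorrelationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    // Two series of equal length
    val x = spark.sparkContext.parallelize(Seq(1.0, 2.0, 3.0, 4.0, 5.0))
    val y = spark.sparkContext.parallelize(Seq(10.0, 8.0, 6.0, 4.0, 2.0))

    val pearson = Statistics.corr(x, y)              // default method is "pearson"
    val spearman = Statistics.corr(x, y, "spearman")
    println(s"Pearson: $pearson, Spearman: $spearman")

    spark.stop()
  }
}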
Example 169
Source File: HandsOnKMeanStreaming.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example import org.apache.spark._ import org.apache.spark.streaming._ import org.apache.spark.mllib.clustering.StreamingKMeans object HandsOnKMeanStreaming { def main(args: Array[String]): Unit = { val conf = new SparkConf().setMaster("local[2]").setAppName("HandsOnKMeanStreaming") val ssc = new StreamingContext(conf, Seconds(10)) val model = new StreamingKMeans(). setK(4). // number of clusters is 4 setDecayFactor(1.0). // decay factor (the forgetfulness of the previous centroids) setRandomCenters(3, 0.0) // 3 dimensions and 0 weight import org.apache.spark.mllib.linalg.Vectors val trainingData = ssc.textFileStream("file:/tmp/k-means-train-data").map(Vectors.parse).cache() trainingData.print() import org.apache.spark.mllib.regression.LabeledPoint val testData = ssc.textFileStream("file:/tmp/k-means-test-data").map(LabeledPoint.parse) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTerminationOrTimeout(1000*60*3) // Wait for the computation to terminate (3 minutes) } }
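The streaming k-means example reads its training stream with Vectors.parse and its test stream with LabeledPoint.parse, so the files dropped into the watched directories must match those string formats. A small local sketch of what the two parsers accept (the literal strings are illustrative):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object StreamingInputFormatSketch {
  def main(args: Array[String]): Unit = {
    // Training files for StreamingKMeans: one dense vector per line, e.g. "[1.0,2.0,3.0]"
    val v = Vectors.parse("[1.0,2.0,3.0]")
    // Test files: one labelled point per line, e.g. "(1.0,[1.0,2.0,3.0])"
    val lp = LabeledPoint.parse("(1.0,[1.0,2.0,3.0])")
    println(v)
    println(s"${lp.label} -> ${lp.features}")
  }
}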
Example 170
Source File: HandsOnLinRegStreaming.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example import org.apache.spark._ import org.apache.spark.streaming._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD object HandsOnLinRegStreaming { def main(args: Array[String]): Unit = { val conf = new SparkConf().setMaster("local[2]").setAppName("HandsOnLinRegStreaming") val ssc = new StreamingContext(conf, Seconds(10)) val numFeatures = 3 val model = new StreamingLinearRegressionWithSGD().setInitialWeights(Vectors.zeros(numFeatures)) val trainingData = ssc.textFileStream("file:/tmp/lin-reg-train-data").map(LabeledPoint.parse).cache() trainingData.print() // output training data for debug purpose val testData = ssc.textFileStream("file:/tmp/lin-reg-test-data").map(LabeledPoint.parse) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTerminationOrTimeout(1000*60*3) // Wait for the computation to terminate (3 minutes) } }
Example 171
Source File: LinearRegExample.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example import org.apache.spark.sql.SparkSession object LinearRegExample { val homeDir = System.getProperty("user.home") def main(args: Array[String]): Unit = { // 1. Set Spark session val spark = SparkSession.builder().master("local").getOrCreate() // 2. Set logging level to WARNING spark.sparkContext.setLogLevel("WARN") // 3. Import necessary classes from Spark MLLib package that are needed for linear regression import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD // 4. Load the data val data = spark.sparkContext.textFile(s"${homeDir}/lpsa.data") // 5. Parse the data into LabeledPoint and cache val parsedData = data.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // 6. Build the model by setting number of iterations, step size val numIterations = 100 val stepSize = 0.00000001 val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) // 7. Evaluate model on training examples and compute training error val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() println(s"training Mean Squared Error $MSE") // 8. Save the model model.save(spark.sparkContext, s"${homeDir}/LinearRegressionWithSGDModel") // 9. Load the saved model val sameModel = LinearRegressionModel.load(spark.sparkContext, s"${homeDir}/LinearRegressionWithSGDModel") // 10. Output the model println(sameModel) } }
Example 172
Source File: MnistExample.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{KMeans, ScalableKMeans, SparseKMeans} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql.SparkSession object MnistExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val spark = SparkSession.builder.appName("svm").master("local[8]").getOrCreate() val trainRDD = spark.sparkContext.textFile("data/mnist/mnist_train.csv", 8) .map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => Vectors.dense(arr.slice(1, 785))) val model = new KMeans() .setK(10) .setInitializationMode("random") .setMaxIterations(10) .run(trainRDD) println("final clusters:") println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) } }
Example 173
Source File: KMeanTest.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector} import scala.util.Random //spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9 //guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15 object ScalableKMeanTest { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}") val sc = new SparkContext(conf) val k = args(0).toInt val dimension = args(1).toInt val recordNum = args(2).toInt val sparsity = args(3).toDouble val iterations = args(4).toInt val means = args(5) val parNumber = args(6).toInt val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => { val ran = new Random() val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray val vec: Vector = new SparseVector(dimension, indexArr, valueArr) vec }).cache() println(args.mkString(", ")) println(data.count() + " records generated") val st = System.nanoTime() val model = if(means == "my") { println("running scalable kmeans") val model = new ScalableKMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } else { println("running mllib kmeans") val model = new KMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } println((System.nanoTime() - st) / 1e9 + " seconds cost") println("final clusters: " + model.clusterCenters.length) println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) sc.stop() } }
Example 174
Source File: lda-script.scala From practical-data-science-with-hadoop-and-spark with Apache License 2.0 | 5 votes |
import collection.JavaConversions._ import scala.collection.mutable import opennlp.tools.tokenize.SimpleTokenizer import opennlp.tools.stemmer.PorterStemmer import org.apache.spark.rdd._ import org.apache.spark.mllib.clustering.{OnlineLDAOptimizer, DistributedLDAModel, LDA} import org.apache.spark.mllib.linalg.{Vector, SparseVector, Vectors} import org.apache.spark.mllib.feature.IDF // add openNLP jar to the Spark Context sc.addJar("opennlp-tools-1.6.0.jar") // Load documents from text files, 1 element (text string) per file val corpus = sc.wholeTextFiles("ohsumed/C*", 20).map(x => x._2) // read stop words from file val stopwordFile = "stop-words.txt" val st_words = sc.textFile(stopwordFile).collect() .flatMap(_.stripMargin.split("\\s+")).map(_.toLowerCase).toSet val stopwords = sc.broadcast(st_words) val minWordLength = 3 val tokenized: RDD[(Long, Array[String])] = corpus.zipWithIndex().map { case (text,id) => val tokenizer = SimpleTokenizer.INSTANCE val stemmer = new PorterStemmer() val tokens = tokenizer.tokenize(text) val words = tokens.filter(w => (w.length >= minWordLength) && (!stopwords.value.contains(w))) .map(w => stemmer.stem(w)) id -> words }.filter(_._2.length > 0) tokenized.cache() val numDocs = tokenized.count() val wordCounts: RDD[(String, Long)] = tokenized.flatMap { case (_, tokens) => tokens.map(_ -> 1L) }.reduceByKey(_ + _) wordCounts.cache() val fullVocabSize = wordCounts.count() val vSize = 10000 val (vocab: Map[String, Int], selectedTokenCount: Long) = { val sortedWC: Array[(String,Long)] = {wordCounts.sortBy(_._2, ascending=false) .take(vSize)} (sortedWC.map(_._1).zipWithIndex.toMap, sortedWC.map(_._2).sum) } val documents = tokenized.map { case (id, tokens) => // Filter tokens by vocabulary, and create word count vector representation of document. val wc = new mutable.HashMap[Int, Int]() tokens.foreach { term => if (vocab.contains(term)) { val termIndex = vocab(term) wc(termIndex) = wc.getOrElse(termIndex, 0) + 1 } } val indices = wc.keys.toArray.sorted val values = indices.map(i => wc(i).toDouble) val sb = Vectors.sparse(vocab.size, indices, values) (id, sb) } val vocabArray = new Array[String](vocab.size) vocab.foreach { case (term, i) => vocabArray(i) = term } val tf = documents.map { case (id, vec) => vec }.cache() val idfVals = new IDF().fit(tf).idf.toArray val tfidfDocs: RDD[(Long, Vector)] = documents.map { case (id, vec) => val indices = vec.asInstanceOf[SparseVector].indices val counts = new mutable.HashMap[Int, Double]() for (idx <- indices) { counts(idx) = vec(idx) * idfVals(idx) } (id, Vectors.sparse(vocab.size, counts.toSeq)) } val numTopics = 5 val numIterations = 50 val lda = new LDA().setK(numTopics).setMaxIterations(numIterations).setOptimizer("online") val ldaModel = lda.run(tfidfDocs) val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 5) topicIndices.foreach { case (terms, termWeights) => println("TOPIC:") terms.zip(termWeights).foreach { case (term, weight) => println(s"${vocabArray(term.toInt)}\t$weight") } println() }
Example 175
Source File: MatrixUtilSuite.scala From spark-timeseries with Apache License 2.0 | 5 votes |
package com.cloudera.sparkts import com.cloudera.sparkts.MatrixUtil._ import org.apache.spark.mllib.linalg.{Matrices, Vectors} import org.scalatest._ class MatrixUtilSuite extends FunSuite with ShouldMatchers { test("modifying toBreeze version modifies original tensor") { val vec = Vectors.dense(1.0, 2.0, 3.0) val breezeVec = toBreeze(vec) breezeVec(1) = 4.0 vec(1) should be (4.0) val mat = Matrices.zeros(3, 4) val breezeMat = toBreeze(mat) breezeMat(0, 1) = 2.0 mat(0, 1) should be (2.0) } }
Example 176
Source File: LocalDBSCANArcherySuite.scala From dbscan-on-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.clustering.dbscan import java.net.URI import scala.io.Source import org.scalatest.FunSuite import org.scalatest.Matchers import org.apache.spark.mllib.linalg.Vectors class LocalDBSCANArcherySuite extends FunSuite with Matchers { private val dataFile = "labeled_data.csv" test("should cluster") { val labeled: Map[DBSCANPoint, Double] = new LocalDBSCANArchery(eps = 0.3F, minPoints = 10) .fit(getRawData(dataFile)) .map(l => (l, l.cluster.toDouble)) .toMap val expected: Map[DBSCANPoint, Double] = getExpectedData(dataFile).toMap labeled.foreach { case (key, value) => { val t = expected(key) if (t != value) { println(s"expected: $t but got $value for $key") } } } labeled should equal(expected) } def getExpectedData(file: String): Iterator[(DBSCANPoint, Double)] = { Source .fromFile(getFile(file)) .getLines() .map(s => { val vector = Vectors.dense(s.split(',').map(_.toDouble)) val point = DBSCANPoint(vector) (point, vector(2)) }) } def getRawData(file: String): Iterable[DBSCANPoint] = { Source .fromFile(getFile(file)) .getLines() .map(s => DBSCANPoint(Vectors.dense(s.split(',').map(_.toDouble)))) .toIterable } def getFile(filename: String): URI = { getClass.getClassLoader.getResource(filename).toURI } }
Example 177
Source File: printMatrix.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.{SparkContext, SparkConf} import breeze.linalg.{DenseMatrix => BDM, kron} object printMatrix { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/mnist/mnist_train.csv", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if(v > 200) 1.0 else 0)))(0))) val lines2 = sc.textFile("dataset/train.format", 8) val data2 = lines2.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => (arr(784), Example.Vector2Tensor(Vectors.dense(arr.slice(0, 784)))(0))) data2.take(10).foreach(record =>{ println("label: " + record._1) val intm = new BDM[Int](28, 28, record._2.toArray.map(d => d.toInt)) val str = intm.toString(1000, 1000).replace('0', '.').replace('1', '*') println(str) }) } }
Example 178
Source File: Example.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.{SparkConf, SparkContext} import breeze.linalg.{DenseMatrix => BDM, _} object Example { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/train.format", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => (arr(784), Vector2Tensor(Vectors.dense(arr.slice(0, 784))))) val topology = new CNNTopology topology.addLayer(CNNLayer.buildConvolutionLayer(1, 6, new Scale(5, 5))) topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvolutionLayer(6, 12, new Scale(5, 5))) topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvolutionLayer(12, 12, new Scale(4, 4))) val cnn: CNN = new CNN(topology).setMaxIterations(5).setMiniBatchSize(16) val start = System.nanoTime() cnn.trainOneByOne(data) println("Training time: " + (System.nanoTime() - start) / 1e9) val right = data.map(record =>{ val result = cnn.predict(record._2) if(result == record._1) 1 else 0 }).sum() println(s"Predicting precision: $right " + right.toDouble/(data.count())) // val testData = sc.textFile("dataset/mnist/mnist_test.csv", 8) // .map(line => line.split(",")).map(arr => arr.map(_.toDouble)) // .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if(v > 200) 1.0 else 0))))) val rightM = data.map(record =>{ val result = cnn.predict(record._2) if(result == record._1) 1 else 0 }).sum() println(s"Mnist Full Predicting precision: $rightM " + rightM.toDouble/(data.count())) } def Vector2Tensor(record: Vector): Array[BDM[Double]] = { val mapSize = new Scale(28, 28) val m = new BDM[Double](mapSize.x, mapSize.y) var i: Int = 0 while (i < mapSize.x) { var j: Int = 0 while (j < mapSize.y) { m(i, j) = record(mapSize.x * i + j) j += 1 } i += 1 } Array(m) } }
Example 179
Source File: Driver.scala From mCNN with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import org.apache.log4j.{Logger, Level} import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.{SparkContext, SparkConf} object CNNDriver { def main(args: Array[String]) { val myLayers = new Array[Layer](8) myLayers(0) = new ConvolutionalLayer(1, 6, kernelSize = new MapSize(5, 5), inputMapSize = new MapSize(28, 28)) myLayers(1) = new FunctionalLayer(new SigmoidFunction()) myLayers(2) = new MeanPoolingLayer(new MapSize(2, 2), new MapSize(24, 24)) myLayers(3) = new ConvolutionalLayer(6, 12, new MapSize(5, 5), new MapSize(12, 12)) myLayers(4) = new FunctionalLayer(new SigmoidFunction()) myLayers(5) = new MeanPoolingLayer(new MapSize(2, 2), new MapSize(8, 8)) myLayers(6) = new ConvolutionalLayer(12, 12, new MapSize(4, 4), new MapSize(4, 4)) myLayers(7) = new FunctionalLayer(new SigmoidFunction()) val topology = FeedForwardTopology(myLayers) Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/train.format", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => { val target = new Array[Double](12) target(arr(784).toInt) = 1 val in = Vector2BDM(Vectors.dense(arr.slice(0, 784))) (Vectors.fromBreeze(in.toDenseVector), Vectors.dense(target)) }).cache() val feedForwardTrainer = new FeedForwardTrainer(topology, 784, 12) feedForwardTrainer.setStackSize(4) // CNN does not benefit from the stacked data // .LBFGSOptimizer.setNumIterations(20) .SGDOptimizer .setMiniBatchFraction(0.002) .setConvergenceTol(0) .setNumIterations(1000) .setUpdater(new CNNUpdater(0.85)) for(iter <- 1 to 1000){ val start = System.nanoTime() val mlpModel = feedForwardTrainer.train(data) feedForwardTrainer.setWeights(mlpModel.weights()) println(s"Training time $iter: " + (System.nanoTime() - start) / 1e9) // predict val right = data.filter(v => mlpModel.predict(v._1).argmax == v._2.argmax).count() val precision = right.toDouble / data.count() println(s"right: $right, count: ${data.count()}, precision: $precision") } } def Vector2BDM(record: Vector): BDM[Double] = { val mapSize = new MapSize(28, 28) val m = new BDM[Double](mapSize.x, mapSize.y) var i: Int = 0 while (i < mapSize.x) { var j: Int = 0 while (j < mapSize.y) { m(i, j) = record(mapSize.x * i + j) j += 1 } i += 1 } m } }
Example 180
Source File: CNNUpdater.scala From mCNN with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import breeze.linalg.{*, DenseMatrix => BDM, DenseVector => BDV, Vector => BV, axpy => Baxpy, sum => Bsum} import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.optimization.Updater private[ann] class CNNUpdater(alpha: Double) extends Updater { override def compute( weightsOld: Vector, gradient: Vector, stepSize: Double, iter: Int, regParam: Double): (Vector, Double) = { val thisIterStepSize = stepSize val brzWeights: BV[Double] = weightsOld.toBreeze.toDenseVector Baxpy(-thisIterStepSize, gradient.toBreeze * alpha, brzWeights) (Vectors.fromBreeze(brzWeights), 0) } }
Example 181
Source File: MnistCSVDriver.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.{SparkConf, SparkContext} object MnistCSVDriver { def main(args: Array[String]) { val topology = new CNNTopology topology.addLayer(CNNLayer.buildConvolutionLayer(new Scale(28, 28))) topology.addLayer(CNNLayer.buildConvLayer(6, new Scale(5, 5))) topology.addLayer(CNNLayer.buildSampLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvLayer(12, new Scale(5, 5))) topology.addLayer(CNNLayer.buildSampLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvLayer(12, new Scale(4, 4))) val cnn: CNN = new CNN(topology).setMaxIterations(500000).setMiniBatchSize(16) Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/mnist/mnist_train.csv", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => new LabeledPoint(arr(0), Vectors.dense(arr.slice(1, 785).map(v => if(v > 0) 1.0 else 0)))) val start = System.nanoTime() cnn.trainOneByOne(data) println("Training time: " + (System.nanoTime() - start) / 1e9) } }
Example 182
Source File: RichIndexedRowMatrixSuite.scala From hail with MIT License | 5 votes |
package is.hail.utils import breeze.linalg.{DenseMatrix => BDM, _} import is.hail.{HailSuite, TestUtils} import is.hail.linalg.BlockMatrix import is.hail.linalg.BlockMatrix.ops._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{DistributedMatrix, IndexedRow, IndexedRowMatrix} import org.apache.spark.rdd.RDD import org.testng.annotations.Test class RichIndexedRowMatrixSuite extends HailSuite { private def convertDistributedMatrixToBreeze(sparkMatrix: DistributedMatrix): Matrix[Double] = { val breezeConverter = sparkMatrix.getClass.getMethod("toBreeze") breezeConverter.invoke(sparkMatrix).asInstanceOf[Matrix[Double]] } @Test def testToBlockMatrixDense() { val nRows = 9L val nCols = 6L val data = Seq( (0L, Vectors.dense(0.0, 1.0, 2.0, 1.0, 3.0, 4.0)), (1L, Vectors.dense(3.0, 4.0, 5.0, 1.0, 1.0, 1.0)), (3L, Vectors.dense(9.0, 0.0, 1.0, 1.0, 1.0, 1.0)), (4L, Vectors.dense(9.0, 0.0, 1.0, 1.0, 1.0, 1.0)), (5L, Vectors.dense(9.0, 0.0, 1.0, 1.0, 1.0, 1.0)), (6L, Vectors.dense(1.0, 2.0, 3.0, 1.0, 1.0, 1.0)), (7L, Vectors.dense(4.0, 5.0, 6.0, 1.0, 1.0, 1.0)), (8L, Vectors.dense(7.0, 8.0, 9.0, 1.0, 1.0, 1.0)) ).map(IndexedRow.tupled) val indexedRows: RDD[IndexedRow] = sc.parallelize(data) val irm = new IndexedRowMatrix(indexedRows) for { blockSize <- Seq(1, 2, 3, 4, 6, 7, 9, 10) } { val blockMat = irm.toHailBlockMatrix(blockSize) assert(blockMat.nRows === nRows) assert(blockMat.nCols === nCols) assert(blockMat.toBreezeMatrix() === convertDistributedMatrixToBreeze(irm)) } intercept[IllegalArgumentException] { irm.toHailBlockMatrix(-1) } intercept[IllegalArgumentException] { irm.toHailBlockMatrix(0) } } @Test def emptyBlocks() { val nRows = 9 val nCols = 2 val data = Seq( (3L, Vectors.dense(1.0, 2.0)), (4L, Vectors.dense(1.0, 2.0)), (5L, Vectors.dense(1.0, 2.0)), (8L, Vectors.dense(1.0, 2.0)) ).map(IndexedRow.tupled) val irm = new IndexedRowMatrix(sc.parallelize(data)) val m = irm.toHailBlockMatrix(2) assert(m.nRows == nRows) assert(m.nCols == nCols) assert(m.toBreezeMatrix() == convertDistributedMatrixToBreeze(irm)) assert(m.blocks.count() == 5) (m.dot(m.T)).toBreezeMatrix() // assert no exception assert(m.mapWithIndex { case (i, j, v) => i + 10 * j + v }.toBreezeMatrix() === new BDM[Double](nRows, nCols, Array[Double]( 0.0, 1.0, 2.0, 4.0, 5.0, 6.0, 6.0, 7.0, 9.0, 10.0, 11.0, 12.0, 15.0, 16.0, 17.0, 16.0, 17.0, 20.0 ))) } }
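The suite above tests a Hail-specific toHailBlockMatrix conversion; Spark MLlib's IndexedRowMatrix offers an analogous built-in toBlockMatrix. A minimal sketch on a toy matrix (all names, sizes and block dimensions are illustrative):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix}

object IndexedRowMatrixSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("IndexedRowMatrixSketch").setMaster("local[2]"))

    // Row indices may have gaps; missing rows are treated as zero rows
    val rows = sc.parallelize(Seq(
      IndexedRow(0L, Vectors.dense(1.0, 2.0, 3.0)),
      IndexedRow(2L, Vectors.dense(4.0, 5.0, 6.0))))

    val irm = new IndexedRowMatrix(rows)
    println(s"${irm.numRows()} x ${irm.numCols()}")

    // Spark's built-in block-matrix conversion (the suite above tests a Hail-specific variant)
    val blockMat = irm.toBlockMatrix(2, 2)
    println(blockMat.toLocalMatrix())

    sc.stop()
  }
}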
Example 183
Source File: DataFrameExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml import java.io.File import scopt.OptionParser import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.util.Utils object DataFrameExample { case class Params(input: String = "data/mllib/sample_libsvm_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") .text(s"input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val spark = SparkSession .builder .appName(s"DataFrameExample with $params") .getOrCreate() // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") val df: DataFrame = spark.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") // Show statistical summary of labels. val labelSummary = df.describe("label") labelSummary.show() // Convert features column to an RDD of vectors. val features = df.select("features").rdd.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(Vectors.fromML(feat)), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") // Save the records in a parquet file. val tmpDir = Utils.createTempDir() val outputDir = new File(tmpDir, "dataframe").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) println(s"Schema from Parquet:") newDF.printSchema() spark.stop() } } // scalastyle:on println
Example 184
Source File: SummaryStatisticsExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} // $example off$ object SummaryStatisticsExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SummaryStatisticsExample") val sc = new SparkContext(conf) // $example on$ val observations = sc.parallelize( Seq( Vectors.dense(1.0, 10.0, 100.0), Vectors.dense(2.0, 20.0, 200.0), Vectors.dense(3.0, 30.0, 300.0) ) ) // Compute column summary statistics. val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) println(summary.mean) // a dense vector containing the mean value for each column println(summary.variance) // column-wise variance println(summary.numNonzeros) // number of nonzeros in each column // $example off$ sc.stop() } } // scalastyle:on println
Example 185
Source File: PCAOnSourceVectorExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD // $example off$ object PCAOnSourceVectorExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnSourceVectorExample") val sc = new SparkContext(conf) // $example on$ val data: RDD[LabeledPoint] = sc.parallelize(Seq( new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)), new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)))) // Compute the top 5 principal components. val pca = new PCA(5).fit(data.map(_.features)) // Project vectors to the linear space spanned by the top 5 principal // components, keeping the label val projected = data.map(p => p.copy(features = pca.transform(p.features))) // $example off$ val collect = projected.collect() println("Projected vector of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 186
Source File: PCAOnRowMatrixExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object PCAOnRowMatrixExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnRowMatrixExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 4 principal components. // Principal components are stored in a local dense matrix. val pc: Matrix = mat.computePrincipalComponents(4) // Project the rows to the linear space spanned by the top 4 principal components. val projected: RowMatrix = mat.multiply(pc) // $example off$ val collect = projected.rows.collect() println("Projected Row Matrix of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
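Example 186 only keeps the projected rows; RowMatrix can additionally report how much variance each component captures via computePrincipalComponentsAndExplainedVariance. A minimal sketch, assuming a Spark version that provides that method (2.0+) and using illustrative toy data:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

object PCAExplainedVarianceSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("PCAExplainedVarianceSketch").setMaster("local[2]"))

    val rows = sc.parallelize(Seq(
      Vectors.dense(1.0, 2.0, 3.0),
      Vectors.dense(2.0, 4.0, 5.9),
      Vectors.dense(3.0, 6.1, 9.0)))

    val mat = new RowMatrix(rows)

    // Returns the principal components plus the fraction of variance each one explains
    val (pc, explained) = mat.computePrincipalComponentsAndExplainedVariance(2)
    println(s"Principal components:\n$pc")
    println(s"Explained variance: $explained")

    sc.stop()
  }
}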
Example 187
Source File: TallSkinnyPCA.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnyPCA { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnyPCA <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnyPCA") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) println("Principal components are:\n" + pc) sc.stop() } } // scalastyle:on println
Example 188
Source File: GaussianMixtureExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object GaussianMixtureExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("GaussianMixtureExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/gmm_data.txt") val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using GaussianMixture val gmm = new GaussianMixture().setK(2).run(parsedData) // Save and load model gmm.save(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") val sameModel = GaussianMixtureModel.load(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") // output parameters of max-likelihood model for (i <- 0 until gmm.k) { println("weight=%f\nmu=%s\nsigma=\n%s\n" format (gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma)) } // $example off$ sc.stop() } } // scalastyle:on println
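The Gaussian mixture example prints the fitted parameters; the trained model can also assign points, either hard with predict or soft with predictSoft, which returns per-component membership probabilities. A minimal sketch assuming the same gmm_data.txt input as above:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.GaussianMixture
import org.apache.spark.mllib.linalg.Vectors

object GaussianMixtureSoftAssignSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("GaussianMixtureSoftAssignSketch"))

    val parsedData = sc.textFile("data/mllib/gmm_data.txt")
      .map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache()

    val gmm = new GaussianMixture().setK(2).run(parsedData)

    // Hard assignments: index of the most likely Gaussian for each point
    val hard = gmm.predict(parsedData)
    // Soft assignments: membership probability for every Gaussian
    val soft = gmm.predictSoft(parsedData)

    hard.zip(soft).take(5).foreach { case (cluster, probs) =>
      println(s"cluster $cluster, probabilities ${probs.mkString(", ")}")
    }

    sc.stop()
  }
}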
Example 189
Source File: PCAExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} // $example off$ @deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0") object PCAExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAExample") val sc = new SparkContext(conf) // $example on$ val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) val numIterations = 100 val model = LinearRegressionWithSGD.train(training, numIterations) val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) val valuesAndPreds = test.map { point => val score = model.predict(point.features) (score, point.label) } val valuesAndPreds_pca = test_pca.map { point => val score = model_pca.predict(point.features) (score, point.label) } val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() println("Mean Squared Error = " + MSE) println("PCA Mean Squared Error = " + MSE_pca) // $example off$ sc.stop() } } // scalastyle:on println
Example 190
Source File: DenseKMeans.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 191
Source File: CosineSimilarity.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} object CosineSimilarity { case class Params(inputFile: String = null, threshold: Double = 0.1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("CosineSimilarity") { head("CosineSimilarity: an example app.") opt[Double]("threshold") .required() .text(s"threshold similarity: to tradeoff computation vs quality estimate") .action((x, c) => c.copy(threshold = x)) arg[String]("<inputFile>") .required() .text(s"input file, one row per line, space-separated") .action((x, c) => c.copy(inputFile = x)) note( """ |For example, the following command runs this app on a dataset: | | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \ | examplesjar.jar \ | --threshold 0.1 data/mllib/sample_svm_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName("CosineSimilarity") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(params.inputFile).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) }.cache() val mat = new RowMatrix(rows) // Compute similar columns perfectly, with brute force. val exact = mat.columnSimilarities() // Compute similar columns with estimation using DIMSUM val approx = mat.columnSimilarities(params.threshold) val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) } val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) } val MAE = exactEntries.leftOuterJoin(approxEntries).values.map { case (u, Some(v)) => math.abs(u - v) case (u, None) => math.abs(u) }.mean() println(s"Average absolute error in estimate is: $MAE") sc.stop() } } // scalastyle:on println
Example 192
Source File: ElementwiseProductExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.ElementwiseProduct import org.apache.spark.mllib.linalg.Vectors // $example off$ object ElementwiseProductExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("ElementwiseProductExample") val sc = new SparkContext(conf) // $example on$ // Create some vector data; also works for sparse vectors val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))) val transformingVector = Vectors.dense(0.0, 1.0, 2.0) val transformer = new ElementwiseProduct(transformingVector) // Batch transform and per-row transform give the same results: val transformedData = transformer.transform(data) val transformedData2 = data.map(x => transformer.transform(x)) // $example off$ println("transformedData: ") transformedData.foreach(x => println(x)) println("transformedData2: ") transformedData2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 193
Source File: SVDExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") } } // scalastyle:on println
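The SVD example prints U, s and V separately; multiplying them back together yields a rank-k approximation of the original matrix, a common sanity check on the decomposition. A minimal sketch reusing the same three-row toy data, with k = 2 chosen purely for illustration:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix

object SVDReconstructionSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SVDReconstructionSketch").setMaster("local[2]"))

    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
    val mat = new RowMatrix(sc.parallelize(data, 2))

    // Keep the top 2 singular values/vectors
    val svd = mat.computeSVD(2, computeU = true)

    // Rank-2 approximation: U * diag(s) * V^T, still distributed as a RowMatrix
    val approx = svd.U.multiply(Matrices.diag(svd.s)).multiply(svd.V.transpose)
    approx.rows.collect().foreach(println)

    sc.stop()
  }
}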
Example 194
Source File: TallSkinnySVD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnySVD { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnySVD <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnySVD") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println("Singular values are " + svd.s) sc.stop() } } // scalastyle:on println
Example 195
Source File: StandardScalerExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils // $example off$ object StandardScalerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("StandardScalerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 196
Source File: KMeansExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object KMeansExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("KMeansExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/kmeans_data.txt") val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using KMeans val numClusters = 2 val numIterations = 20 val clusters = KMeans.train(parsedData, numClusters, numIterations) // Evaluate clustering by computing Within Set Sum of Squared Errors val WSSSE = clusters.computeCost(parsedData) println("Within Set Sum of Squared Errors = " + WSSSE) // Save and load model clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel") val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel") // $example off$ sc.stop() } } // scalastyle:on println
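Besides computing the WSSSE, the trained KMeansModel can score new observations with predict. A minimal sketch on synthetic two-dimensional points (all values are illustrative):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object KMeansPredictSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("KMeansPredictSketch").setMaster("local[2]"))

    val points = sc.parallelize(Seq(
      Vectors.dense(0.0, 0.0), Vectors.dense(0.1, 0.1),
      Vectors.dense(9.0, 9.0), Vectors.dense(9.1, 9.1)))

    val model = KMeans.train(points, 2, 20)

    // Assign a new observation to the nearest learned centroid
    val cluster = model.predict(Vectors.dense(8.5, 9.3))
    println(s"Assigned to cluster $cluster")

    sc.stop()
  }
}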
Example 197
Source File: MultivariateSummarizer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils /* Note: this listing is an excerpt; the object declaration, the Params case class and the OptionParser setup that precede this usage note are elided. */ spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Summarize labels val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(Vectors.dense(lp.label)), (sum1, sum2) => sum1.merge(sum2)) // Summarize features val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(lp.features), (sum1, sum2) => sum1.merge(sum2)) println() println(s"Summary statistics") println(s"\tLabel\tFeatures") println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}") println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}") println( s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}") println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}") println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}") println() sc.stop() } } // scalastyle:on println
Example 198
Source File: LinearRegressionWithSGDExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD // $example off$ @deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0") object LinearRegressionWithSGDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/ridge-data/lpsa.data") val parsedData = data.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // Building the model val numIterations = 100 val stepSize = 0.00000001 val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) // Evaluate model on training examples and compute training error val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() println("training Mean Squared Error = " + MSE) // Save and load model model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel") val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel") // $example off$ sc.stop() } } // scalastyle:on println
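As the deprecation note indicates, the DataFrame-based ml.regression.LinearRegression is the suggested replacement for LinearRegressionWithSGD. A minimal sketch of the equivalent workflow (the input path and hyperparameters are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.LinearRegression

object LinearRegressionDFSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("LinearRegressionDFSketch").getOrCreate()

    // libsvm input yields a DataFrame with "label" and "features" columns
    val training = spark.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")

    val lr = new LinearRegression()
      .setMaxIter(100)
      .setRegParam(0.01)

    val model = lr.fit(training)
    println(s"Coefficients: ${model.coefficients}, intercept: ${model.intercept}")
    println(s"RMSE: ${model.summary.rootMeanSquaredError}")

    spark.stop()
  }
}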
Example 199
Source File: StreamingLinearRegressionExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD // $example off$ import org.apache.spark.streaming._ object StreamingLinearRegressionExample { def main(args: Array[String]): Unit = { if (args.length != 2) { System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>") System.exit(1) } val conf = new SparkConf().setAppName("StreamingLinearRegressionExample") val ssc = new StreamingContext(conf, Seconds(1)) // $example on$ val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache() val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val numFeatures = 3 val model = new StreamingLinearRegressionWithSGD() .setInitialWeights(Vectors.zeros(numFeatures)) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ ssc.stop() } } // scalastyle:on println
Example 200
Source File: BisectingKMeansExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib // scalastyle:off println import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.BisectingKMeans import org.apache.spark.mllib.linalg.{Vector, Vectors} // $example off$ object BisectingKMeansExample { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample") val sc = new SparkContext(sparkConf) // $example on$ // Loads and parses data def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble)) val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache() // Clustering the data into 6 clusters by BisectingKMeans. val bkm = new BisectingKMeans().setK(6) val model = bkm.run(data) // Show the compute cost and the cluster centers println(s"Compute Cost: ${model.computeCost(data)}") model.clusterCenters.zipWithIndex.foreach { case (center, idx) => println(s"Cluster Center ${idx}: ${center}") } // $example off$ sc.stop() } } // scalastyle:on println