org.apache.spark.mllib.linalg.distributed.RowMatrix Scala Examples
The following examples show how to use org.apache.spark.mllib.linalg.distributed.RowMatrix.
Each example notes the original project and source file above the code, so you can refer back to the full source.
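Before the examples, here is a minimal sketch of the core RowMatrix API that they exercise, assuming an existing SparkContext named sc; the data and variable names are illustrative only.

import org.apache.spark.mllib.linalg.{Matrix, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix

// Build a RowMatrix from an RDD[Vector] (sc is assumed to be a live SparkContext).
val rows = sc.parallelize(Seq(
  Vectors.dense(1.0, 2.0, 3.0),
  Vectors.dense(4.0, 5.0, 6.0),
  Vectors.dense(7.0, 8.0, 9.0)))
val mat = new RowMatrix(rows)

// Dimensions are derived lazily from the underlying RDD.
println(s"${mat.numRows()} x ${mat.numCols()}")

// Top-k principal components, returned as a local dense matrix.
val pc: Matrix = mat.computePrincipalComponents(2)

// Truncated SVD; computeU = true also materializes the U factor as a RowMatrix.
val svd = mat.computeSVD(2, computeU = true)
println(s"Singular values: ${svd.s}")

// Exact pairwise cosine similarities between columns (brute force).
val sims = mat.columnSimilarities()

The examples below use these same calls against data loaded from text files, random generators, or in-memory collections.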
Example 1
Source File: TallSkinnyPCA.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix object TallSkinnyPCA { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnyPCA <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnyPCA") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) println(s"Principal components are:\n $pc") sc.stop() } } // scalastyle:on println
Example 2
Source File: TallSkinnySVD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnySVD { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnySVD <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnySVD") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println("Singular values are " + svd.s) sc.stop() } } // scalastyle:on println
Example 3
Source File: PearsonCorrelation.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD private[stat] object PearsonCorrelation extends Correlation with Logging { def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i += 1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i += 1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
Example 4
Source File: PCASuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class PCASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new PCA) val mat = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix] val explainedVariance = Vectors.dense(0.5, 0.5).asInstanceOf[DenseVector] val model = new PCAModel("pca", mat, explainedVariance) ParamsSuite.checkParams(model) } test("pca") { val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) val dataRDD = sc.parallelize(data, 2) val mat = new RowMatrix(dataRDD.map(OldVectors.fromML)) val pc = mat.computePrincipalComponents(3) val expected = mat.multiply(pc).rows.map(_.asML) val df = dataRDD.zip(expected).toDF("features", "expected") val pca = new PCA() .setInputCol("features") .setOutputCol("pca_features") .setK(3) .fit(df) // copied model must have the same parent. MLTestingUtils.checkCopy(pca) pca.transform(df).select("pca_features", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("PCA read/write") { val t = new PCA() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setK(3) testDefaultReadWrite(t) } test("PCAModel read/write") { val instance = new PCAModel("myPCAModel", Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix], Vectors.dense(0.5, 0.5).asInstanceOf[DenseVector]) val newInstance = testDefaultReadWrite(instance) assert(newInstance.pc === instance.pc) } }
Example 5
Source File: PCASuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } }
Example 6
Source File: TallSkinnyPCA.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnyPCA { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnyPCA <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnyPCA") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) println("Principal components are:\n" + pc) sc.stop() } }
Example 7
Source File: CosineSimilarity.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.SparkContext._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} import org.apache.spark.{SparkConf, SparkContext} object CosineSimilarity { case class Params(inputFile: String = null, threshold: Double = 0.1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("CosineSimilarity") { head("CosineSimilarity: an example app.") opt[Double]("threshold") .required() .text(s"threshold similarity: to tradeoff computation vs quality estimate") .action((x, c) => c.copy(threshold = x)) arg[String]("<inputFile>") .required() .text(s"input file, one row per line, space-separated") .action((x, c) => c.copy(inputFile = x)) note( """ |For example, the following command runs this app on a dataset: | | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \ | examplesjar.jar \ | --threshold 0.1 data/mllib/sample_svm_data.txt """.stripMargin) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { System.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName("CosineSimilarity") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(params.inputFile).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) }.cache() val mat = new RowMatrix(rows) // Compute similar columns perfectly, with brute force. val exact = mat.columnSimilarities() // Compute similar columns with estimation using DIMSUM val approx = mat.columnSimilarities(params.threshold) val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) } val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) } val MAE = exactEntries.leftOuterJoin(approxEntries).values.map { case (u, Some(v)) => math.abs(u - v) case (u, None) => math.abs(u) }.mean() println(s"Average absolute error in estimate is: $MAE") sc.stop() } }
Example 8
Source File: TallSkinnySVD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnySVD { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnySVD <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnySVD") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println("Singular values are " + svd.s) sc.stop() } }
Example 9
Source File: PearsonCorrelation.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD private[stat] object PearsonCorrelation extends Correlation with Logging { def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i += 1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i += 1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
Example 10
Source File: PCASuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val pc = mat.computePrincipalComponents(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() assert(pca_transform.toSet === mat_multiply.toSet) } }
Example 11
Source File: IndexRowMatrixDemo.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix import org.apache.spark.SparkConf import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.distributed.IndexedRow import org.apache.spark.mllib.linalg.Vectors object IndexRowMatrixDemo { def main(args: Array[String]) { val conf = new SparkConf().setAppName("test").setMaster("local") val sc = new SparkContext(conf) // Define an implicit conversion function implicit def double2long(x: Double) = x.toLong // The first element of each row becomes the IndexedRow index; the remaining elements become the vector. // f.take(1)(0) picks the first element, which is implicitly converted to Long. val rdd1 = sc.parallelize( Array( Array(1.0, 2.0, 3.0, 4.0), Array(2.0, 3.0, 4.0, 5.0), Array(3.0, 4.0, 5.0, 6.0))).map(f => IndexedRow(f.take(1)(0), Vectors.dense(f.drop(1)))) // An IndexedRowMatrix is distributed by row and keyed by a row index; it is backed by an RDD of indexed rows, so each row is a (Long index, local vector) pair. val indexRowMatrix = new IndexedRowMatrix(rdd1) // Compute the Gramian matrix. var gramianMatrix: Matrix = indexRowMatrix.computeGramianMatrix() // Convert to a RowMatrix. var rowMatrix: RowMatrix = indexRowMatrix.toRowMatrix() // Other methods such as computeSVD and multiply work the same way as on RowMatrix. } }
Example 12
Source File: TallSkinnySVD.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnySVD { def main(args: Array[String]) { val conf = new SparkConf().setAppName("TallSkinnySVD").setMaster("local") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile("../data/mllib/kmeans_data.txt").map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println("Singular values are " + svd.s) sc.stop() } } // scalastyle:on println
Example 13
Source File: PearsonCorrelation.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD private[stat] object PearsonCorrelation extends Correlation with Logging { def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i += 1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i += 1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
Example 14
Source File: PCASuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt // fit() learns the principal components from the input RDD. val pca = new PCA(k).fit(dataRDD) // Build a distributed RowMatrix from the same data. val mat = new RowMatrix(dataRDD) // Compute the principal components, reducing the dimensionality to k. val pc = mat.computePrincipalComponents(k) // transform() projects the input vectors onto the principal components. val pca_transform = pca.transform(dataRDD).collect() // Multiply the row matrix by the principal components. val mat_multiply = mat.multiply(pc).rows.collect() assert(pca_transform.toSet === mat_multiply.toSet) } }
Example 15
Source File: PCAOnRowMatrixExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object PCAOnRowMatrixExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnRowMatrixExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val rows = sc.parallelize(data) val mat: RowMatrix = new RowMatrix(rows) // Compute the top 4 principal components. // Principal components are stored in a local dense matrix. val pc: Matrix = mat.computePrincipalComponents(4) // Project the rows to the linear space spanned by the top 4 principal components. val projected: RowMatrix = mat.multiply(pc) // $example off$ val collect = projected.rows.collect() println("Projected Row Matrix of principal component:") collect.foreach { vector => println(vector) } sc.stop() } } // scalastyle:on println
Example 16
Source File: SVDExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") } } // scalastyle:on println
Example 17
Source File: CosineSimilarity.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} object CosineSimilarity { case class Params(inputFile: String = null, threshold: Double = 0.1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("CosineSimilarity") { head("CosineSimilarity: an example app.") opt[Double]("threshold") .required() .text(s"threshold similarity: to tradeoff computation vs quality estimate") .action((x, c) => c.copy(threshold = x)) arg[String]("<inputFile>") .required() .text(s"input file, one row per line, space-separated") .action((x, c) => c.copy(inputFile = x)) note( """ |For example, the following command runs this app on a dataset: | | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \ | examplesjar.jar \ | --threshold 0.1 data/mllib/sample_svm_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName("CosineSimilarity") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(params.inputFile).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) }.cache() val mat = new RowMatrix(rows) // Compute similar columns perfectly, with brute force. val exact = mat.columnSimilarities() // Compute similar columns with estimation using DIMSUM val approx = mat.columnSimilarities(params.threshold) val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) } val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) } val MAE = exactEntries.leftOuterJoin(approxEntries).values.map { case (u, Some(v)) => math.abs(u - v) case (u, None) => math.abs(u) }.mean() println(s"Average absolute error in estimate is: $MAE") sc.stop() } } // scalastyle:on println
Example 18
Source File: SVDExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val rows = sc.parallelize(data) val mat: RowMatrix = new RowMatrix(rows) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") sc.stop() } } // scalastyle:on println
Example 19
Source File: TallSkinnySVD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix object TallSkinnySVD { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnySVD <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnySVD") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println(s"Singular values are ${svd.s}") sc.stop() } } // scalastyle:on println
Example 20
Source File: PearsonCorrelation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD private[stat] object PearsonCorrelation extends Correlation with Logging { def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i += 1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i += 1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
Example 21
Source File: PCASuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.sql.Row class PCASuite extends MLTest with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new PCA) val mat = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix] val explainedVariance = Vectors.dense(0.5, 0.5).asInstanceOf[DenseVector] val model = new PCAModel("pca", mat, explainedVariance) ParamsSuite.checkParams(model) } test("pca") { val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) val dataRDD = sc.parallelize(data, 2) val mat = new RowMatrix(dataRDD.map(OldVectors.fromML)) val pc = mat.computePrincipalComponents(3) val expected = mat.multiply(pc).rows.map(_.asML) val df = dataRDD.zip(expected).toDF("features", "expected") val pca = new PCA() .setInputCol("features") .setOutputCol("pca_features") .setK(3) val pcaModel = pca.fit(df) MLTestingUtils.checkCopyAndUids(pca, pcaModel) testTransformer[(Vector, Vector)](df, pcaModel, "pca_features", "expected") { case Row(result: Vector, expected: Vector) => assert(result ~== expected absTol 1e-5, "Transformed vector is different with expected vector.") } } test("PCA read/write") { val t = new PCA() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setK(3) testDefaultReadWrite(t) } test("PCAModel read/write") { val instance = new PCAModel("myPCAModel", Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix], Vectors.dense(0.5, 0.5).asInstanceOf[DenseVector]) val newInstance = testDefaultReadWrite(instance) assert(newInstance.pc === instance.pc) } }
Example 22
Source File: PCASuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } test("memory cost computation") { assert(PCAUtil.memoryCost(10, 100) < Int.MaxValue) // check overflowing assert(PCAUtil.memoryCost(40000, 60000) > Int.MaxValue) } }
Example 23
Source File: TallSkinnyPCA.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnyPCA { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnyPCA <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnyPCA") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) println("Principal components are:\n" + pc) sc.stop() } } // scalastyle:on println
Example 24
Source File: CosineSimilarity.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.SparkContext._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} import org.apache.spark.{SparkConf, SparkContext} object CosineSimilarity { case class Params(inputFile: String = null, threshold: Double = 0.1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("CosineSimilarity") { head("CosineSimilarity: an example app.") opt[Double]("threshold") .required() .text(s"threshold similarity: to tradeoff computation vs quality estimate") .action((x, c) => c.copy(threshold = x)) arg[String]("<inputFile>") .required() .text(s"input file, one row per line, space-separated") .action((x, c) => c.copy(inputFile = x)) note( """ |For example, the following command runs this app on a dataset: | | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \ | examplesjar.jar \ | --threshold 0.1 data/mllib/sample_svm_data.txt """.stripMargin) } parser.parse(args, defaultParams).map { params => run(params) } getOrElse { System.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName("CosineSimilarity") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(params.inputFile).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) }.cache() val mat = new RowMatrix(rows) // Compute similar columns perfectly, with brute force. val exact = mat.columnSimilarities() // Compute similar columns with estimation using DIMSUM val approx = mat.columnSimilarities(params.threshold) val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) } val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) } val MAE = exactEntries.leftOuterJoin(approxEntries).values.map { case (u, Some(v)) => math.abs(u - v) case (u, None) => math.abs(u) }.mean() println(s"Average absolute error in estimate is: $MAE") sc.stop() } } // scalastyle:on println
Example 25
Source File: TallSkinnySVD.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnySVD { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnySVD <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnySVD") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println("Singular values are " + svd.s) sc.stop() } } // scalastyle:on println
Example 26
Source File: PearsonCorrelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD private[stat] object PearsonCorrelation extends Correlation with Logging { def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i += 1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i += 1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
Example 27
Source File: PCASuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.mllib.feature.{PCAModel => OldPCAModel} import org.apache.spark.sql.Row class PCASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new PCA) val mat = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix] val model = new PCAModel("pca", mat) ParamsSuite.checkParams(model) } test("pca") { val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) val dataRDD = sc.parallelize(data, 2) val mat = new RowMatrix(dataRDD) val pc = mat.computePrincipalComponents(3) val expected = mat.multiply(pc).rows val df = sqlContext.createDataFrame(dataRDD.zip(expected)).toDF("features", "expected") val pca = new PCA() .setInputCol("features") .setOutputCol("pca_features") .setK(3) .fit(df) // copied model must have the same parent. MLTestingUtils.checkCopy(pca) pca.transform(df).select("pca_features", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("PCA read/write") { val t = new PCA() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setK(3) testDefaultReadWrite(t) } test("PCAModel read/write") { val instance = new PCAModel("myPCAModel", Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix]) val newInstance = testDefaultReadWrite(instance) assert(newInstance.pc === instance.pc) } }
Example 28
Source File: PCASuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val pc = mat.computePrincipalComponents(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() assert(pca_transform.toSet === mat_multiply.toSet) } }
Example 29
Source File: SVDDataGenerator.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import com.intel.hibench.sparkbench.common.IOCommon import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{Vectors,Vector} import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.random.RandomRDDs object SVDDataGenerator { def generateDistributedRowMatrix( sc: SparkContext, m: Long, n: Int, numPartitions: Int, seed: Long = System.currentTimeMillis()): RDD[Vector] = { val data: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, m, n, numPartitions, seed) data } def main(args: Array[String]) { val conf = new SparkConf().setAppName("SVDDataGenerator") val sc = new SparkContext(conf) var outputPath = "" var numExamples: Int = 200000 var numFeatures: Int = 20 val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism) val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism") .getOrElse((parallel / 2).toString).toInt if (args.length == 3) { outputPath = args(0) numExamples = args(1).toInt numFeatures = args(2).toInt println(s"Output Path: $outputPath") println(s"Num of Examples: $numExamples") println(s"Num of Features: $numFeatures") } else { System.err.println( s"Usage: $SVDDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>" ) System.exit(1) } val data = generateDistributedRowMatrix(sc, numExamples, numFeatures, numPartitions) data.saveAsObjectFile(outputPath) sc.stop() } }
Example 30
Source File: SVDExample.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD import scopt.OptionParser object SVDExample { case class Params( numFeatures: Int = 0, numSingularValues: Int = 0, computeU: Boolean = true, maxResultSize: String = "1g", dataPath: String = null ) def main(args: Array[String]): Unit = { val defaultParams = Params() val parser = new OptionParser[Params]("SVD") { head("SVD: an example of SVD for matrix decomposition.") opt[Int]("numFeatures") .text(s"numFeatures, default: ${defaultParams.numFeatures}") .action((x,c) => c.copy(numFeatures = x)) opt[Int]("numSingularValues") .text(s"numSingularValues, default: ${defaultParams.numSingularValues}") .action((x,c) => c.copy(numSingularValues = x)) opt[Boolean]("computeU") .text(s"computeU, default: ${defaultParams.computeU}") .action((x,c) => c.copy(computeU = x)) opt[String]("maxResultSize") .text(s"maxResultSize, default: ${defaultParams.maxResultSize}") .action((x,c) => c.copy(maxResultSize = x)) arg[String]("<dataPath>") .required() .text("data path of SVD") .action((x,c) => c.copy(dataPath = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf() .setAppName(s"SVD with $params") .set("spark.driver.maxResultSize", params.maxResultSize) .set("spark.shuffle.compress", "false") .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec") .set("spark.smartCompress", "false") val sc = new SparkContext(conf) val dataPath = params.dataPath val numFeatures = params.numFeatures val numSingularValues = params.numSingularValues val computeU = params.computeU val data: RDD[Vector] = sc.objectFile(dataPath) val mat: RowMatrix = new RowMatrix(data) val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(numSingularValues, computeU) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. sc.stop() } }
Example 31
Source File: SparkMatrix.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package linalg.matrix import org.apache.spark.ml.linalg.Matrix import org.apache.spark.ml.linalg.Matrices import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.distributed.IndexedRow import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.MatrixEntry object SparkMatrix { def main(args: Array[String]) { val dMatrix: Matrix = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0)) println("dMatrix: \n" + dMatrix) val sMatrixOne: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5, 6, 7)) println("sMatrixOne: \n" + sMatrixOne) val sMatrixTwo: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(5, 6, 7)) println("sMatrixTwo: \n" + sMatrixTwo) val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp") val sc = new SparkContext(spConfig) val denseData = Seq( Vectors.dense(0.0, 1.0, 2.1), Vectors.dense(3.0, 2.0, 4.0), Vectors.dense(5.0, 7.0, 8.0), Vectors.dense(9.0, 0.0, 1.1) ) val sparseData = Seq( Vectors.sparse(3, Seq((1, 1.0), (2, 2.1))), Vectors.sparse(3, Seq((0, 3.0), (1, 2.0), (2, 4.0))), Vectors.sparse(3, Seq((0, 5.0), (1, 7.0), (2, 8.0))), Vectors.sparse(3, Seq((0, 9.0), (2, 1.0))) ) val denseMat = new RowMatrix(sc.parallelize(denseData, 2)) val sparseMat = new RowMatrix(sc.parallelize(sparseData, 2)) println("Dense Matrix - Num of Rows :" + denseMat.numRows()) println("Dense Matrix - Num of Cols:" + denseMat.numCols()) println("Sparse Matrix - Num of Rows :" + sparseMat.numRows()) println("Sparse Matrix - Num of Cols:" + sparseMat.numCols()) val data = Seq( (0L, Vectors.dense(0.0, 1.0, 2.0)), (1L, Vectors.dense(3.0, 4.0, 5.0)), (3L, Vectors.dense(9.0, 0.0, 1.0)) ).map(x => IndexedRow(x._1, x._2)) val indexedRows: RDD[IndexedRow] = sc.parallelize(data, 2) val indexedRowsMat = new IndexedRowMatrix(indexedRows) println("Indexed Row Matrix - No of Rows: " + indexedRowsMat.numRows()) println("Indexed Row Matrix - No of Cols: " + indexedRowsMat.numCols()) val entries = sc.parallelize(Seq( (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0), (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map { case (i, j, value) => MatrixEntry(i, j, value) } val coordinateMat = new CoordinateMatrix(entries) println("Coordinate Matrix - No of Rows: " + coordinateMat.numRows()) println("Coordinate Matrix - No of Cols: " + coordinateMat.numCols()) sc.stop() } }
Example 32
Source File: TallSkinnyPCA.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnyPCA { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnyPCA <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnyPCA") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) println("Principal components are:\n" + pc) sc.stop() } } // scalastyle:on println
Example 33
Source File: CosineSimilarity.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} object CosineSimilarity { case class Params(inputFile: String = null, threshold: Double = 0.1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("CosineSimilarity") { head("CosineSimilarity: an example app.") opt[Double]("threshold") .required() .text(s"threshold similarity: to tradeoff computation vs quality estimate") .action((x, c) => c.copy(threshold = x)) arg[String]("<inputFile>") .required() .text(s"input file, one row per line, space-separated") .action((x, c) => c.copy(inputFile = x)) note( """ |For example, the following command runs this app on a dataset: | | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \ | examplesjar.jar \ | --threshold 0.1 data/mllib/sample_svm_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName("CosineSimilarity") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(params.inputFile).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) }.cache() val mat = new RowMatrix(rows) // Compute similar columns perfectly, with brute force. val exact = mat.columnSimilarities() // Compute similar columns with estimation using DIMSUM val approx = mat.columnSimilarities(params.threshold) val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) } val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) } val MAE = exactEntries.leftOuterJoin(approxEntries).values.map { case (u, Some(v)) => math.abs(u - v) case (u, None) => math.abs(u) }.mean() println(s"Average absolute error in estimate is: $MAE") sc.stop() } } // scalastyle:on println
Example 34
Source File: SVDExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") } } // scalastyle:on println
Example 35
Source File: TallSkinnySVD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnySVD { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnySVD <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnySVD") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println("Singular values are " + svd.s) sc.stop() } } // scalastyle:on println
Example 36
Source File: PearsonCorrelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD private[stat] object PearsonCorrelation extends Correlation with Logging { def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i += 1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i += 1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
Example 37
Source File: PCASuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class PCASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new PCA) val mat = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix] val explainedVariance = Vectors.dense(0.5, 0.5).asInstanceOf[DenseVector] val model = new PCAModel("pca", mat, explainedVariance) ParamsSuite.checkParams(model) } test("pca") { val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) val dataRDD = sc.parallelize(data, 2) val mat = new RowMatrix(dataRDD.map(OldVectors.fromML)) val pc = mat.computePrincipalComponents(3) val expected = mat.multiply(pc).rows.map(_.asML) val df = dataRDD.zip(expected).toDF("features", "expected") val pca = new PCA() .setInputCol("features") .setOutputCol("pca_features") .setK(3) .fit(df) // copied model must have the same parent. MLTestingUtils.checkCopy(pca) pca.transform(df).select("pca_features", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("PCA read/write") { val t = new PCA() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setK(3) testDefaultReadWrite(t) } test("PCAModel read/write") { val instance = new PCAModel("myPCAModel", Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix], Vectors.dense(0.5, 0.5).asInstanceOf[DenseVector]) val newInstance = testDefaultReadWrite(instance) assert(newInstance.pc === instance.pc) } }
Example 38
Source File: PCASuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } }
Example 39
Source File: X2P.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne import breeze.linalg.DenseVector import org.apache.spark.mllib.X2PHelper._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry, RowMatrix} import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ import org.slf4j.LoggerFactory object X2P { private def logger = LoggerFactory.getLogger(X2P.getClass) def apply(x: RowMatrix, tol: Double = 1e-5, perplexity: Double = 30.0): CoordinateMatrix = { require(tol >= 0, "Tolerance must be non-negative") require(perplexity > 0, "Perplexity must be positive") val mu = (3 * perplexity).toInt //TODO: Expose this as parameter val logU = Math.log(perplexity) val norms = x.rows.map(Vectors.norm(_, 2.0)) norms.persist() val rowsWithNorm = x.rows.zip(norms).map{ case (v, norm) => VectorWithNorm(v, norm) } val neighbors = rowsWithNorm.zipWithIndex() .cartesian(rowsWithNorm.zipWithIndex()) .flatMap { case ((u, i), (v, j)) => if(i < j) { val dist = fastSquaredDistance(u, v) Seq((i, (j, dist)), (j, (i, dist))) } else Seq.empty } .topByKey(mu)(Ordering.by(e => -e._2)) val p_betas = neighbors.map { case (i, arr) => var betamin = Double.NegativeInfinity var betamax = Double.PositiveInfinity var beta = 1.0 val d = DenseVector(arr.map(_._2)) var (h, p) = Hbeta(d, beta) //logInfo("data was " + d.toArray.toList) //logInfo("array P was " + p.toList) // Evaluate whether the perplexity is within tolerance def Hdiff = h - logU var tries = 0 while (Math.abs(Hdiff) > tol && tries < 50) { //If not, increase or decrease precision if (Hdiff > 0) { betamin = beta beta = if (betamax.isInfinite) beta * 2 else (beta + betamax) / 2 } else { betamax = beta beta = if (betamin.isInfinite) beta / 2 else (beta + betamin) / 2 } // Recompute the values val HP = Hbeta(d, beta) h = HP._1 p = HP._2 tries = tries + 1 } //logInfo("array P is " + p.toList) (arr.map(_._1).zip(p.toArray).map { case (j, v) => MatrixEntry(i, j, v) }, beta) } logger.info("Mean value of sigma: " + p_betas.map(x => math.sqrt(1 / x._2)).mean) new CoordinateMatrix(p_betas.flatMap(_._1)) } }
Example 40
Source File: BHTSNE.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne.impl import breeze.linalg._ import breeze.stats.distributions.Rand import com.github.saurfang.spark.tsne.tree.SPTree import com.github.saurfang.spark.tsne.{TSNEGradient, TSNEHelper, TSNEParam, X2P} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.storage.StorageLevel import org.slf4j.LoggerFactory import scala.util.Random object BHTSNE { private def logger = LoggerFactory.getLogger(BHTSNE.getClass) def tsne( input: RowMatrix, noDims: Int = 2, maxIterations: Int = 1000, perplexity: Double = 30, theta: Double = 0.5, reportLoss: Int => Boolean = {i => i % 10 == 0}, callback: (Int, DenseMatrix[Double], Option[Double]) => Unit = {case _ => }, seed: Long = Random.nextLong() ): DenseMatrix[Double] = { if(input.rows.getStorageLevel == StorageLevel.NONE) { logger.warn("Input is not persisted and performance could be bad") } Rand.generator.setSeed(seed) val tsneParam = TSNEParam() import tsneParam._ val n = input.numRows().toInt val Y: DenseMatrix[Double] = DenseMatrix.rand(n, noDims, Rand.gaussian(0, 1)) :/ 1e4 val iY = DenseMatrix.zeros[Double](n, noDims) val gains = DenseMatrix.ones[Double](n, noDims) // approximate p_{j|i} val p_ji = X2P(input, 1e-5, perplexity) val P = TSNEHelper.computeP(p_ji, n).glom() .map(rows => rows.map { case (i, data) => (i, data.map(_._1).toSeq, DenseVector(data.map(_._2 * exaggeration_factor).toArray)) }) .cache() var iteration = 1 while(iteration <= maxIterations) { val bcY = P.context.broadcast(Y) val bcTree = P.context.broadcast(SPTree(Y)) val initialValue = (DenseMatrix.zeros[Double](n, noDims), DenseMatrix.zeros[Double](n, noDims), 0.0) val (posF, negF, sumQ) = P.treeAggregate(initialValue)( seqOp = (c, v) => { // c: (pos, neg, sumQ), v: Array[(i, Seq(j), vec(Distance))] TSNEGradient.computeEdgeForces(v, bcY.value, c._1) val q = TSNEGradient.computeNonEdgeForces(bcTree.value, bcY.value, theta, c._2, v.map(_._1): _*) (c._1, c._2, c._3 + q) }, combOp = (c1, c2) => { // c: (grad, loss) (c1._1 + c2._1, c1._2 + c2._2, c1._3 + c2._3) }) val dY: DenseMatrix[Double] = posF :- (negF :/ sumQ) TSNEHelper.update(Y, dY, iY, gains, iteration, tsneParam) if(reportLoss(iteration)) { val loss = P.treeAggregate(0.0)( seqOp = (c, v) => { TSNEGradient.computeLoss(v, bcY.value, sumQ) }, combOp = _ + _ ) logger.debug(s"Iteration $iteration finished with $loss") callback(iteration, Y.copy, Some(loss)) } else { logger.debug(s"Iteration $iteration finished") callback(iteration, Y.copy, None) } bcY.destroy() bcTree.destroy() //undo early exaggeration if(iteration == early_exaggeration) { P.foreach { rows => rows.foreach { case (_, _, vec) => vec.foreachPair { case (i, v) => vec.update(i, v / exaggeration_factor) } } } } iteration += 1 } Y } }
Example 41
Source File: SimpleTSNE.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne.impl import breeze.linalg._ import breeze.stats.distributions.Rand import com.github.saurfang.spark.tsne.{TSNEGradient, TSNEHelper, TSNEParam, X2P} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.storage.StorageLevel import org.slf4j.LoggerFactory import scala.util.Random object SimpleTSNE { private def logger = LoggerFactory.getLogger(SimpleTSNE.getClass) def tsne( input: RowMatrix, noDims: Int = 2, maxIterations: Int = 1000, perplexity: Double = 30, callback: (Int, DenseMatrix[Double], Option[Double]) => Unit = {case _ => }, seed: Long = Random.nextLong()): DenseMatrix[Double] = { if(input.rows.getStorageLevel == StorageLevel.NONE) { logger.warn("Input is not persisted and performance could be bad") } Rand.generator.setSeed(seed) val tsneParam = TSNEParam() import tsneParam._ val n = input.numRows().toInt val Y: DenseMatrix[Double] = DenseMatrix.rand(n, noDims, Rand.gaussian(0, 1)) val iY = DenseMatrix.zeros[Double](n, noDims) val gains = DenseMatrix.ones[Double](n, noDims) // approximate p_{j|i} val p_ji = X2P(input, 1e-5, perplexity) val P = TSNEHelper.computeP(p_ji, n).glom().cache() var iteration = 1 while(iteration <= maxIterations) { val bcY = P.context.broadcast(Y) val numerator = P.map{ arr => TSNEGradient.computeNumerator(bcY.value, arr.map(_._1): _*) }.cache() val bcNumerator = P.context.broadcast({ numerator.treeAggregate(0.0)(seqOp = (x, v) => x + sum(v), combOp = _ + _) }) val (dY, loss) = P.zip(numerator).treeAggregate((DenseMatrix.zeros[Double](n, noDims), 0.0))( seqOp = (c, v) => { // c: (grad, loss), v: (Array[(i, Iterable(j, Distance))], numerator) val l = TSNEGradient.compute(v._1, bcY.value, v._2, bcNumerator.value, c._1, iteration <= early_exaggeration) (c._1, c._2 + l) }, combOp = (c1, c2) => { // c: (grad, loss) (c1._1 + c2._1, c1._2 + c2._2) }) bcY.destroy() bcNumerator.destroy() numerator.unpersist() TSNEHelper.update(Y, dY, iY, gains, iteration, tsneParam) logger.debug(s"Iteration $iteration finished with $loss") callback(iteration, Y.copy, Some(loss)) iteration += 1 } Y } }
Example 42
Source File: X2PSuite.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne import org.apache.spark.SharedSparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.scalatest.{FunSuite, Matchers} class X2PSuite extends FunSuite with SharedSparkContext with Matchers { test("Test X2P against tsne.jl implementation") { val input = new RowMatrix( sc.parallelize(Seq(1 to 3, 4 to 6, 7 to 9, 10 to 12)) .map(x => Vectors.dense(x.map(_.toDouble).toArray)) ) val output = X2P(input, 1e-5, 2).toRowMatrix().rows.collect().map(_.toArray.toList) println(output.toList) //output shouldBe List(List(0, .5, .5), List(.5, 0, .5), List(.5, .5, .0)) } }
Example 43
Source File: MNIST.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne.examples import java.io.{BufferedWriter, OutputStreamWriter} import com.github.saurfang.spark.tsne.impl._ import com.github.saurfang.spark.tsne.tree.SPTree import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.{SparkConf, SparkContext} import org.slf4j.LoggerFactory object MNIST { private def logger = LoggerFactory.getLogger(MNIST.getClass) def main (args: Array[String]) { val conf = new SparkConf() .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .registerKryoClasses(Array(classOf[SPTree])) val sc = new SparkContext(conf) val hadoopConf = sc.hadoopConfiguration val fs = FileSystem.get(hadoopConf) val dataset = sc.textFile("data/MNIST/mnist.csv.gz") .zipWithIndex() .filter(_._2 < 6000) .sortBy(_._2, true, 60) .map(_._1) .map(_.split(",")) .map(x => (x.head.toInt, x.tail.map(_.toDouble))) .cache() //logInfo(dataset.collect.map(_._2.toList).toList.toString) //val features = dataset.map(x => Vectors.dense(x._2)) //val scaler = new StandardScaler(true, true).fit(features) //val scaledData = scaler.transform(features) // .map(v => Vectors.dense(v.toArray.map(x => if(x.isNaN || x.isInfinite) 0.0 else x))) // .cache() val data = dataset.flatMap(_._2) val mean = data.mean() val std = data.stdev() val scaledData = dataset.map(x => Vectors.dense(x._2.map(v => (v - mean) / std))).cache() val labels = dataset.map(_._1).collect() val matrix = new RowMatrix(scaledData) val pcaMatrix = matrix.multiply(matrix.computePrincipalComponents(50)) pcaMatrix.rows.cache() val costWriter = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(s".tmp/MNIST/cost.txt"), true))) //SimpleTSNE.tsne(pcaMatrix, perplexity = 20, maxIterations = 200) BHTSNE.tsne(pcaMatrix, maxIterations = 500, callback = { //LBFGSTSNE.tsne(pcaMatrix, perplexity = 10, maxNumIterations = 500, numCorrections = 10, convergenceTol = 1e-8) case (i, y, loss) => if(loss.isDefined) logger.info(s"$i iteration finished with loss $loss") val os = fs.create(new Path(s".tmp/MNIST/result${"%05d".format(i)}.csv"), true) val writer = new BufferedWriter(new OutputStreamWriter(os)) try { (0 until y.rows).foreach { row => writer.write(labels(row).toString) writer.write(y(row, ::).inner.toArray.mkString(",", ",", "\n")) } if(loss.isDefined) costWriter.write(loss.get + "\n") } finally { writer.close() } }) costWriter.close() sc.stop() } }
Example 44
Source File: T9-4DataTypes.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Matrices import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix import org.apache.spark.mllib.linalg.distributed.IndexedRow import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix import org.apache.spark.mllib.linalg.distributed.MatrixEntry import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object DataTypesApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: DataTypesApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) val denseV = substream.map(f => Vectors.dense(f.slice(1, 5))) denseV.print() val sparseV = substream.map(f => f.slice(1, 5).toList).map(f => f.zipWithIndex.map { case (s, i) => (i, s) }) .map(f => f.filter(v => v._2 != 0)).map(l => Vectors.sparse(4, l)) sparseV.print() val labeledP = substream.map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) labeledP.print() val denseM = substream.map(f => Matrices.dense(3, 16, f.slice(3, 19) ++ f.slice(20, 36) ++ f.slice(37, 53))) denseM.print() denseV.foreachRDD(rdd => { val rowM = new RowMatrix(rdd) println(rowM) }) denseV.foreachRDD(rdd => { val iRdd = rdd.zipWithIndex.map(v => new IndexedRow(v._2, v._1)) val iRowM = new IndexedRowMatrix(iRdd) println(iRowM) }) substream.foreachRDD(rdd => { val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) val cRowM = new CoordinateMatrix(entries) println(cRowM) }) substream.foreachRDD(rdd => { val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) val blockM = new CoordinateMatrix(entries).toBlockMatrix println(blockM) }) ssc.start() ssc.awaitTermination() } }
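The streaming example above cycles through MLlib's distributed matrix types. A compact, non-streaming sketch of the same type conversions (assuming a SparkContext sc; the toy values are illustrative only) may be easier to follow:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry, RowMatrix}

val rows = sc.parallelize(Seq(
  Vectors.dense(1.0, 2.0, 3.0),
  Vectors.dense(4.0, 5.0, 6.0)))

val rowMat = new RowMatrix(rows)                          // rows without meaningful indices
val indexedMat = new IndexedRowMatrix(
  rows.zipWithIndex.map { case (v, i) => IndexedRow(i, v) })
val coordMat = new CoordinateMatrix(sc.parallelize(Seq(
  MatrixEntry(0, 0, 1.0), MatrixEntry(1, 2, 6.0))))       // (row, col, value) triples
val blockMat = coordMat.toBlockMatrix()                   // block-partitioned form for matrix algebra

println((rowMat.numRows(), rowMat.numCols()))
println((indexedMat.numRows(), indexedMat.numCols()))
println((blockMat.numRows(), blockMat.numCols()))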
Example 45
Source File: activity _model.scala From Spark_Personas with MIT License | 5 votes |
// Normalization val result = h.sql("select max(visit_times) from model_input_active_t") // maximum visit count val max_visit_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble val result = h.sql("select min(visit_times) from model_input_active_t") // minimum visit count val min_visit_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble val region_visit_times = if ((max_visit_times - min_visit_times) == 0) 1 else (max_visit_times - min_visit_times) val result = h.sql("select max(last_online_time) from model_input_active_t") // maximum days since last login val max_last_online_time = result.collect()(0).get(0).asInstanceOf[Int].toDouble val result = h.sql("select min(last_online_time) from model_input_active_t") // minimum days since last login val min_last_online_time = result.collect()(0).get(0).asInstanceOf[Int].toDouble val region_last_online_time = if ((max_last_online_time - min_last_online_time) == 0) 1 else (max_last_online_time - min_last_online_time) val result = h.sql("select max(pay_times) from model_input_active_t") // maximum payment count val max_pay_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble val result = h.sql("select min(pay_times) from model_input_active_t") // minimum payment count val min_pay_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble val region_pay_times = if ((max_pay_times - min_pay_times) == 0) 1 else (max_pay_times - min_pay_times) val result = h.sql("select max(comment_times) from model_input_active_t") // maximum inquiry/comment count val max_comment_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble val result = h.sql("select min(comment_times) from model_input_active_t") // minimum inquiry/comment count val min_comment_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble val region_comment_times = if ((max_comment_times - min_comment_times) == 0) 1 else (max_comment_times - min_comment_times) val result = h.sql("select max(stay_time) from model_input_active_t") // maximum stay time val max_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble val result = h.sql("select min(stay_time) from model_input_active_t") // minimum stay time val min_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble val region_stay_time = if ((max_stay_time - min_stay_time) == 0) 1 else (max_stay_time - min_stay_time) val result = h.sql("select max(visit_day_times) from model_input_active_t") // maximum number of login days val max_visit_day_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble val result = h.sql("select min(visit_day_times) from model_input_active_t") // minimum number of login days val min_visit_day_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble val region_visit_day_times = if ((max_visit_day_times - min_visit_day_times) == 0) 1 else (max_visit_day_times - min_visit_day_times) // Weights: visit_times: 0.2, visit_targetpage_percen: 0.1, last_online_time: 0.1, pay_times: 0.2, comment_times: 0.2, stay_time: 0.1, visit_day_times: 0.1 val normalization = h.sql("select t1.cookie , ((t1.visit_times- "+min_visit_times+")*0.2/"+region_visit_times+") as visit_times, t1.visit_targetpage_percen*0.1, ((t1.last_online_time- "+min_last_online_time+")*0.1/"+region_last_online_time+") as last_online_time, ((t1.pay_times- "+min_pay_times+")*0.2/"+region_pay_times+") as pay_times, ((t1.comment_times- "+min_comment_times+")*0.2/"+region_comment_times+") as comment_times, ((t1.stay_time- "+min_stay_time+")*0.1/"+region_stay_time+") as stay_time, ((t1.visit_day_times- "+min_visit_day_times+")*0.1/"+region_visit_day_times+") as visit_day_times from model_input_active_t t1")
import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // Convert the DataFrame to Vectors: there is no direct API for this, so the DataFrame is converted to an RDD and Vectors.dense assembles the column values. val data = normalization.rdd.map(line => Vectors.dense(line.get(1).toString.toDouble, line.get(2).toString.toDouble, line.get(3).toString.toDouble, line.get(4).toString.toDouble, line.get(5).toString.toDouble, line.get(6).toString.toDouble, line.get(7).toString.toDouble)) val rm = new RowMatrix(data) val pc = rm.computePrincipalComponents(1) val mx = rm.multiply(pc) // To be continued (the original example stops here)
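The example ends before the projected scores are used. A hedged continuation (assuming the normalization DataFrame and the projected matrix mx above, and assuming the cookie column can be rendered as a string) could pair each cookie with its single principal-component score:

val cookies = normalization.rdd.map(_.get(0).toString)
val scores = mx.rows.map(_(0))   // the single principal-component coordinate per row
// zip is safe here because both RDDs derive from normalization.rdd through map-only transformations.
cookies.zip(scores).take(10).foreach { case (cookie, score) => println(s"$cookie\t$score") }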
Example 46
Source File: PCAOnRowMatrixExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object PCAOnRowMatrixExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnRowMatrixExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 4 principal components. // Principal components are stored in a local dense matrix. val pc: Matrix = mat.computePrincipalComponents(4) // Project the rows to the linear space spanned by the top 4 principal components. val projected: RowMatrix = mat.multiply(pc) // $example off$ val collect = projected.rows.collect() println("Projected Row Matrix of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
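A small variant of the call above (not in the original example) also returns the fraction of variance explained by each component, which is often useful when choosing k. It is a drop-in replacement for computePrincipalComponents(4) inside the example's main method:

val (pcs, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(4)
println(s"Explained variance per component: $explainedVariance")
val projectedRows: RowMatrix = mat.multiply(pcs)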
Example 47
Source File: SparkSVDExampleOne.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package linalg.svd import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition, Vector, Vectors} object SparkSVDExampleOne { def main(args: Array[String]) { val denseData = Seq( Vectors.dense(0.0, 1.0, 2.0, 1.0, 5.0, 3.3, 2.1), Vectors.dense(3.0, 4.0, 5.0, 3.1, 4.5, 5.1, 3.3), Vectors.dense(6.0, 7.0, 8.0, 2.1, 6.0, 6.7, 6.8), Vectors.dense(9.0, 0.0, 1.0, 3.4, 4.3, 1.0, 1.0) ) val spConfig = (new SparkConf).setMaster("local").setAppName("SparkSVDDemo") val sc = new SparkContext(spConfig) val mat: RowMatrix = new RowMatrix(sc.parallelize(denseData, 2)) // Compute the top 20 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(7, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. println("U:" + U) println("s:" + s) println("V:" + V) sc.stop() } }
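Because the example requests computeU = true and keeps all 7 singular values, the factors can be multiplied back together as a sanity check. A hedged continuation of the example's main method, reusing its U, s and V values:

import org.apache.spark.mllib.linalg.Matrices

// A ≈ U * diag(s) * V^T; with the full rank kept, this should reproduce the input rows up to rounding error.
val reconstructed: RowMatrix = U.multiply(Matrices.diag(s)).multiply(V.transpose)
reconstructed.rows.collect().foreach(println)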
Example 48
Source File: PCAOnRowMatrixExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object PCAOnRowMatrixExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnRowMatrixExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 4 principal components. // Principal components are stored in a local dense matrix. val pc: Matrix = mat.computePrincipalComponents(4) // Project the rows to the linear space spanned by the top 4 principal components. val projected: RowMatrix = mat.multiply(pc) // $example off$ val collect = projected.rows.collect() println("Projected Row Matrix of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 49
Source File: TallSkinnyPCA.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnyPCA { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnyPCA <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnyPCA") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) println("Principal components are:\n" + pc) sc.stop() } } // scalastyle:on println
Example 50
Source File: CosineSimilarity.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} object CosineSimilarity { case class Params(inputFile: String = null, threshold: Double = 0.1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("CosineSimilarity") { head("CosineSimilarity: an example app.") opt[Double]("threshold") .required() .text(s"threshold similarity: to tradeoff computation vs quality estimate") .action((x, c) => c.copy(threshold = x)) arg[String]("<inputFile>") .required() .text(s"input file, one row per line, space-separated") .action((x, c) => c.copy(inputFile = x)) note( """ |For example, the following command runs this app on a dataset: | | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \ | examplesjar.jar \ | --threshold 0.1 data/mllib/sample_svm_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName("CosineSimilarity") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(params.inputFile).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) }.cache() val mat = new RowMatrix(rows) // Compute similar columns perfectly, with brute force. val exact = mat.columnSimilarities() // Compute similar columns with estimation using DIMSUM val approx = mat.columnSimilarities(params.threshold) val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) } val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) } val MAE = exactEntries.leftOuterJoin(approxEntries).values.map { case (u, Some(v)) => math.abs(u - v) case (u, None) => math.abs(u) }.mean() println(s"Average absolute error in estimate is: $MAE") sc.stop() } } // scalastyle:on println
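The app reads its matrix from a file; for quick experimentation, the same two calls can be exercised on an in-memory toy matrix. A sketch, assuming a SparkContext sc:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val toy = new RowMatrix(sc.parallelize(Seq(
  Vectors.dense(1.0, 0.0, 1.0),
  Vectors.dense(0.0, 1.0, 1.0),
  Vectors.dense(1.0, 1.0, 0.0))))

val exact = toy.columnSimilarities()      // brute-force cosine similarity between columns
val approx = toy.columnSimilarities(0.1)  // DIMSUM sampling; pairs with similarity below the threshold may be missed
exact.entries.collect().foreach(println)
approx.entries.collect().foreach(println)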
Example 51
Source File: SVDExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") } } // scalastyle:on println
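For dimensionality reduction rather than factor inspection, the rows can be projected onto the first k right singular vectors. A sketch continuing the example above (reusing its mat and V values; slicing the first k columns out of the column-major array is an assumption about MLlib's dense-matrix layout):

import org.apache.spark.mllib.linalg.DenseMatrix

val k = 2
// Take the first k columns of V (MLlib dense matrices store values in column-major order).
val Vk = new DenseMatrix(V.numRows, k, V.toArray.take(V.numRows * k))
val reduced: RowMatrix = mat.multiply(Vk)
reduced.rows.collect().foreach(println)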
Example 52
Source File: TallSkinnySVD.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnySVD { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnySVD <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnySVD") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println("Singular values are " + svd.s) sc.stop() } } // scalastyle:on println
Example 53
Source File: PearsonCorrelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i +=1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i +=1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
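PearsonCorrelation is an internal helper; user code normally goes through Statistics.corr, which dispatches to the Pearson implementation shown here. A minimal sketch, assuming a SparkContext sc:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

val observations = sc.parallelize(Seq(
  Vectors.dense(1.0, 2.0, 3.0),
  Vectors.dense(2.0, 4.0, 5.9),
  Vectors.dense(3.0, 6.0, 9.1)))

val pearson = Statistics.corr(observations, "pearson")  // column-by-column Pearson correlation matrix
println(pearson)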
Example 54
Source File: PCASuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class PCASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new PCA) val mat = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix] val explainedVariance = Vectors.dense(0.5, 0.5).asInstanceOf[DenseVector] val model = new PCAModel("pca", mat, explainedVariance) ParamsSuite.checkParams(model) } test("pca") { val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) val dataRDD = sc.parallelize(data, 2) val mat = new RowMatrix(dataRDD.map(OldVectors.fromML)) val pc = mat.computePrincipalComponents(3) val expected = mat.multiply(pc).rows.map(_.asML) val df = dataRDD.zip(expected).toDF("features", "expected") val pca = new PCA() .setInputCol("features") .setOutputCol("pca_features") .setK(3) .fit(df) // copied model must have the same parent. MLTestingUtils.checkCopy(pca) pca.transform(df).select("pca_features", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.") } } test("PCA read/write") { val t = new PCA() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setK(3) testDefaultReadWrite(t) } test("PCAModel read/write") { val instance = new PCAModel("myPCAModel", Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)).asInstanceOf[DenseMatrix], Vectors.dense(0.5, 0.5).asInstanceOf[DenseVector]) val newInstance = testDefaultReadWrite(instance) assert(newInstance.pc === instance.pc) } }
Example 55
Source File: PCASuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } }
Example 56
Source File: Test_example_CNN.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package tests import org.apache.log4j.{ Level, Logger } import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.storage.StorageLevel import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.linalg.{ Vector, Vectors } import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.regression.LabeledPoint import breeze.linalg.{ Matrix => BM, CSCMatrix => BSM, DenseMatrix => BDM, Vector => BV, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy, svd => brzSvd, max => Bmax, min => Bmin, sum => Bsum } import scala.collection.mutable.ArrayBuffer import CNN.CNN object Test_example_CNN { def main(args: Array[String]) { //1 Set up the Spark context val conf = new SparkConf().setAppName("CNNtest") val sc = new SparkContext(conf) //2 Load the training data Logger.getRootLogger.setLevel(Level.WARN) val data_path = "/deeplearn/train_d3.txt" val examples = sc.textFile(data_path).cache() val train_d1 = examples.map { line => val f1 = line.split("\t") val f = f1.map(f => f.toDouble) val y = f.slice(0, 10) val x = f.slice(10, f.length) (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0) } val train_d = train_d1.map(f => (f._1, f._2)) //3 Set the training parameters and train the model // opts: training options (likely batch size, number of epochs, validation fraction) val opts = Array(50.0, 1.0, 0.0) train_d.cache val numExamples = train_d.count() println(s"numExamples = $numExamples.") val CNNmodel = new CNN(). setMapsize(new BDM(1, 2, Array(28.0, 28.0))). setTypes(Array("i", "c", "s", "c", "s")). setLayer(5). setOnum(10). setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)). setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)). setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)). setAlpha(1.0). CNNtrain(train_d, opts) //4 Evaluate the model val CNNforecast = CNNmodel.predict(train_d) val CNNerror = CNNmodel.Loss(CNNforecast) println(s"NNerror = $CNNerror.") val printf1 = CNNforecast.map(f => (f.label.data, f.predict_label.data)).take(200) println("Predicted values:") for (i <- 0 until printf1.length) { val outi = printf1(i)._2.mkString("\t") println(outi) } } }
Example 57
Source File: PCAOnRowMatrixExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object PCAOnRowMatrixExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnRowMatrixExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 4 principal components. // Principal components are stored in a local dense matrix. val pc: Matrix = mat.computePrincipalComponents(4) // Project the rows to the linear space spanned by the top 4 principal components. val projected: RowMatrix = mat.multiply(pc) // $example off$ val collect = projected.rows.collect() println("Projected Row Matrix of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 58
Source File: TallSkinnyPCA.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnyPCA { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnyPCA <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnyPCA") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) println("Principal components are:\n" + pc) sc.stop() } } // scalastyle:on println
Example 59
Source File: CosineSimilarity.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} object CosineSimilarity { case class Params(inputFile: String = null, threshold: Double = 0.1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("CosineSimilarity") { head("CosineSimilarity: an example app.") opt[Double]("threshold") .required() .text(s"threshold similarity: to tradeoff computation vs quality estimate") .action((x, c) => c.copy(threshold = x)) arg[String]("<inputFile>") .required() .text(s"input file, one row per line, space-separated") .action((x, c) => c.copy(inputFile = x)) note( """ |For example, the following command runs this app on a dataset: | | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \ | examplesjar.jar \ | --threshold 0.1 data/mllib/sample_svm_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName("CosineSimilarity") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(params.inputFile).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) }.cache() val mat = new RowMatrix(rows) // Compute similar columns perfectly, with brute force. val exact = mat.columnSimilarities() // Compute similar columns with estimation using DIMSUM val approx = mat.columnSimilarities(params.threshold) val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) } val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) } val MAE = exactEntries.leftOuterJoin(approxEntries).values.map { case (u, Some(v)) => math.abs(u - v) case (u, None) => math.abs(u) }.mean() println(s"Average absolute error in estimate is: $MAE") sc.stop() } } // scalastyle:on println