org.apache.spark.mllib.linalg.Matrix Scala Examples
The following examples show how to use org.apache.spark.mllib.linalg.Matrix.
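Before the project examples, here is a minimal, self-contained sketch of the local Matrix API itself (dense and sparse construction plus matrix-vector multiplication); the values are arbitrary illustrations and are not taken from any of the projects below.

import org.apache.spark.mllib.linalg.{DenseVector, Matrices, Matrix}

object MatrixBasics {
  def main(args: Array[String]): Unit = {
    // A 3x2 dense matrix, supplied in column-major order:
    // 1.0  2.0
    // 3.0  4.0
    // 5.0  6.0
    val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
    println(s"numRows=${dm.numRows}, numCols=${dm.numCols}")
    println(dm)

    // A 3x2 sparse matrix in CSC form with entries (0, 0) = 9.0 and (2, 1) = 8.0
    val sm: Matrix = Matrices.sparse(3, 2, Array(0, 1, 2), Array(0, 2), Array(9.0, 8.0))
    println(sm)

    // Matrix-vector multiplication with a local dense vector
    println(dm.multiply(new DenseVector(Array(1.0, 2.0))))
  }
}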
Example 1
Source File: SpearmanCorrelation.scala From spark1.52 with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Logging
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) =>
      vec.toArray.view.zipWithIndex.map { case (v, j) =>
        ((j, v), uid)
      }
    }
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = cachedUids.map { uid =>
          (uid, (preCol, averageRank))
        }
        cachedUids.clear()
        output
      }
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
          output
        } else {
          cachedUids += uid
          Iterator.empty
        }
      } ++ flush()
    }
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
      Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray)
    }
    PearsonCorrelation.computeCorrelationMatrix(groupedRanks)
  }
}
Example 2
Source File: SpearmanCorrelation.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) =>
      vec.toArray.view.zipWithIndex.map { case (v, j) =>
        ((j, v), uid)
      }
    }
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = cachedUids.map { uid =>
          (uid, (preCol, averageRank))
        }
        cachedUids.clear()
        output
      }
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
          output
        } else {
          cachedUids += uid
          Iterator.empty
        }
      } ++ flush()
    }
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
      Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray)
    }
    PearsonCorrelation.computeCorrelationMatrix(groupedRanks)
  }
}
Example 3
Source File: PearsonCorrelation.scala From iolap with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i += 1
    }

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      while (i < j) {
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          containNaN = true
          Double.NaN
        } else {
          cov(i, j) / (sigma * cov(i, i))
        }
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      }
      j += 1
    }

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i += 1
    }

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")
    }

    Matrices.fromBreeze(cov)
  }

  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = {
    math.abs(value) <= threshold
  }
}
Example 4
Source File: SpearmanCorrelation.scala From iolap with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Logging
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) =>
      vec.toArray.view.zipWithIndex.map { case (v, j) =>
        ((j, v), uid)
      }
    }
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = cachedUids.map { uid =>
          (uid, (preCol, averageRank))
        }
        cachedUids.clear()
        output
      }
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
          output
        } else {
          cachedUids += uid
          Iterator.empty
        }
      } ++ flush()
    }
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
      Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray)
    }
    PearsonCorrelation.computeCorrelationMatrix(groupedRanks)
  }
}
Example 5
Source File: BasicStatistics.scala From spark1.52 with Apache License 2.0

package org.apache.spark.examples.mllib

import org.apache.spark.mllib.stat.MultivariateStatisticalSummary
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Matrix

    val sc: SparkContext = null

    val seriesX: RDD[Double] = null // a series
    // must have the same number of partitions and cardinality as seriesX
    val seriesY: RDD[Double] = null

    // Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
    println("pearson:" + correlation)

    // Note that each Vector is a row and not a column.
    val data: RDD[Vector] = null

    // Calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    val correlMatrix: Matrix = Statistics.corr(data, "pearson")
    println("correlMatrix:" + correlMatrix.toString())
  }
}
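The snippet above leaves sc and the input RDDs as null placeholders. A runnable sketch of the same Statistics.corr calls with concrete data might look like the following; the local master and the sample values are assumptions added for illustration, not part of the original example.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

object BasicStatisticsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("BasicStatisticsSketch").setMaster("local[2]"))

    // Two series with the same number of partitions and cardinality.
    val seriesX: RDD[Double] = sc.parallelize(Array(1.0, 2.0, 3.0, 4.0, 5.0), 2)
    val seriesY: RDD[Double] = sc.parallelize(Array(2.0, 4.0, 6.0, 8.0, 11.0), 2)
    println("pearson: " + Statistics.corr(seriesX, seriesY, "pearson"))

    // Each Vector is one row of the input matrix.
    val data: RDD[Vector] = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0, 100.0),
      Vectors.dense(2.0, 20.0, 200.0),
      Vectors.dense(5.0, 33.0, 366.0)))
    val correlMatrix: Matrix = Statistics.corr(data, "pearson")
    println("correlMatrix:\n" + correlMatrix.toString)

    sc.stop()
  }
}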
Example 6
Source File: ChiSqLearning.scala From spark1.52 with Apache License 2.0

package org.apache.spark.examples.mllib

import org.apache.spark.mllib.linalg.{ Matrix, Matrices, Vectors }
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.{ SparkConf, SparkContext }

object ChiSqLearning {
  def main(args: Array[String]) {
    val vd = Vectors.dense(1, 2, 3, 4, 5)
    val vdResult = Statistics.chiSqTest(vd)
    println(vd)
    println(vdResult)
    println("-------------------------------")
    val mtx = Matrices.dense(3, 2, Array(1, 3, 5, 2, 4, 6))
    val mtxResult = Statistics.chiSqTest(mtx)
    println(mtx)
    println(mtxResult)
    // prints: the test method, degrees of freedom, the test statistic, and the p-value
    // (the probability of drawing a wrong conclusion)
    println("-------------------------------")
    val mtx2 = Matrices.dense(2, 2, Array(19.0, 34, 24, 10.0))
    printChiSqTest(mtx2)
    printChiSqTest(Matrices.dense(2, 2, Array(26.0, 36, 7, 2.0)))
    // val mtxResult2 = Statistics.chiSqTest(mtx2)
    // println(mtx2)
    // println(mtxResult2)
  }

  def printChiSqTest(matrix: Matrix): Unit = {
    println("-------------------------------")
    val mtxResult2 = Statistics.chiSqTest(matrix)
    println(matrix)
    println(mtxResult2)
  }
}
Example 7
Source File: CorrelationDemo.scala From spark1.52 with Apache License 2.0

package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

    // use the Spearman correlation coefficient
    // execution result:
    // correlation5: Double = 0.9428571428571412
    val correlation5: Double = Statistics.corr(rdd4, rdd5, "spearman")
    println("spearman:" + correlation5)
    // Compared with the result above, the correlation rises from the Pearson value of 0.6915716600436548
    // to 0.9428571428571412. Because it relies on ranks, Spearman correlation analysis is also called
    // Spearman rank correlation (or the rank-difference method). Note, however, that the Spearman method
    // involves sorting values into ranks, and in a distributed environment that sort can incur a large
    // amount of network I/O, so the algorithm is not especially efficient.
  }
}
Example 8
Source File: IndexRowMatrixDemo.scala From spark1.52 with Apache License 2.0

package org.apache.spark.examples.mllib

import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix
import org.apache.spark.SparkConf
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.mllib.linalg.Vectors

object IndexRowMatrixDemo {
  def main(args: Array[String]) {
    //val sparkConf = new SparkConf().setMast("local[2]").setAppName("SparkHdfsLR")
    val conf = new SparkConf().setAppName("test").setMaster("local")
    val sc = new SparkContext(conf)

    // define an implicit conversion function
    implicit def double2long(x: Double) = x.toLong

    // The first element of each row becomes the IndexedRow index; the remaining elements map to the vector.
    // f.take(1)(0) fetches the first element, which is implicitly converted to Long.
    val rdd1 = sc.parallelize(
      Array(
        Array(1.0, 2.0, 3.0, 4.0),
        Array(2.0, 3.0, 4.0, 5.0),
        Array(3.0, 4.0, 5.0, 6.0))).map(f => IndexedRow(f.take(1)(0), Vectors.dense(f.drop(1))))

    // An IndexedRowMatrix is distributed by row and carries row indices; it is backed by an RDD of
    // indexed rows, so each row is represented by a Long index plus a local vector.
    val indexRowMatrix = new IndexedRowMatrix(rdd1)

    // compute the Gramian matrix
    var gramianMatrix: Matrix = indexRowMatrix.computeGramianMatrix()

    // convert to a RowMatrix
    var rowMatrix: RowMatrix = indexRowMatrix.toRowMatrix()

    // Other methods, such as computeSVD for singular values and multiply for matrix multiplication,
    // are used the same way as on RowMatrix.
  }
}
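The example computes the Gramian matrix and converts to a RowMatrix but never inspects the results. A minimal sketch of the follow-on calls it mentions is shown below; the helper name, the printing, and the choice of two singular values are assumptions for illustration.

import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition}
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix

// Assumes `indexRowMatrix` is the IndexedRowMatrix built in the example above.
def inspect(indexRowMatrix: IndexedRowMatrix): Unit = {
  val gramian: Matrix = indexRowMatrix.computeGramianMatrix()
  println("Gramian matrix:\n" + gramian)

  // Top-2 singular values and vectors; the U factor is itself an IndexedRowMatrix.
  val svd: SingularValueDecomposition[IndexedRowMatrix, Matrix] =
    indexRowMatrix.computeSVD(2, computeU = true)
  println("Singular values: " + svd.s)
  println("V factor:\n" + svd.V)
}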
Example 9
Source File: GaussianMixtureModelWrapper.scala From spark1.52 with Apache License 2.0

package org.apache.spark.mllib.api.python

import java.util.{List => JList}

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{Vector, Vectors, Matrix}
import org.apache.spark.mllib.clustering.GaussianMixtureModel

  val gaussians: JList[Object] = {
    val modelGaussians = model.gaussians
    var i = 0
    var mu = ArrayBuffer.empty[Vector]
    var sigma = ArrayBuffer.empty[Matrix]
    while (i < k) {
      mu += modelGaussians(i).mu
      sigma += modelGaussians(i).sigma
      i += 1
    }
    List(mu.toArray, sigma.toArray).map(_.asInstanceOf[Object]).asJava
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
}
Example 10
Source File: PearsonCorrelation.scala From spark1.52 with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i += 1
    }

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      while (i < j) {
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          containNaN = true
          Double.NaN
        } else {
          cov(i, j) / (sigma * cov(i, i))
        }
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      }
      j += 1
    }

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i += 1
    }

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")
    }

    Matrices.fromBreeze(cov)
  }

  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = {
    math.abs(value) <= threshold
  }
}
Example 11
Source File: PearsonCorrelation.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i += 1
    }

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      while (i < j) {
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          containNaN = true
          Double.NaN
        } else {
          cov(i, j) / (sigma * cov(i, i))
        }
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      }
      j += 1
    }

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i += 1
    }

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")
    }

    Matrices.fromBreeze(cov)
  }

  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = {
    math.abs(value) <= threshold
  }
}
Example 12
Source File: PCAOnRowMatrixExample.scala From Spark-2.3.1 with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// $example off$

object PCAOnRowMatrixExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAOnRowMatrixExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))

    val rows = sc.parallelize(data)

    val mat: RowMatrix = new RowMatrix(rows)

    // Compute the top 4 principal components.
    // Principal components are stored in a local dense matrix.
    val pc: Matrix = mat.computePrincipalComponents(4)

    // Project the rows to the linear space spanned by the top 4 principal components.
    val projected: RowMatrix = mat.multiply(pc)
    // $example off$
    val collect = projected.rows.collect()
    println("Projected Row Matrix of principal component:")
    collect.foreach { vector => println(vector) }

    sc.stop()
  }
}
// scalastyle:on println
Example 13
Source File: SVDExample.scala From Spark-2.3.1 with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.SingularValueDecomposition
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// $example off$

object SVDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SVDExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))

    val rows = sc.parallelize(data)

    val mat: RowMatrix = new RowMatrix(rows)

    // Compute the top 5 singular values and corresponding singular vectors.
    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true)
    val U: RowMatrix = svd.U  // The U factor is a RowMatrix.
    val s: Vector = svd.s     // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V     // The V factor is a local dense matrix.
    // $example off$
    val collect = U.rows.collect()
    println("U factor is:")
    collect.foreach { vector => println(vector) }
    println(s"Singular values are: $s")
    println(s"V factor is:\n$V")

    sc.stop()
  }
}
// scalastyle:on println
Example 14
Source File: PearsonCorrelation.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i += 1
    }

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      while (i < j) {
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          containNaN = true
          Double.NaN
        } else {
          cov(i, j) / (sigma * cov(i, i))
        }
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      }
      j += 1
    }

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i += 1
    }

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")
    }

    Matrices.fromBreeze(cov)
  }

  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = {
    math.abs(value) <= threshold
  }
}
Example 15
Source File: SpearmanCorrelation.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) =>
      vec.toArray.view.zipWithIndex.map { case (v, j) =>
        ((j, v), uid)
      }
    }
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      val cachedUids = ArrayBuffer.empty[Long]
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = cachedUids.map { uid =>
          (uid, (preCol, averageRank))
        }
        cachedUids.clear()
        output
      }
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
          output
        } else {
          cachedUids += uid
          Iterator.empty
        }
      } ++ flush()
    }
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
      Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray)
    }
    PearsonCorrelation.computeCorrelationMatrix(groupedRanks)
  }
}
Example 16
Source File: PearsonCorrelation.scala From BigDatalog with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.toBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i += 1
    }

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      while (i < j) {
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          containNaN = true
          Double.NaN
        } else {
          cov(i, j) / (sigma * cov(i, i))
        }
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      }
      j += 1
    }

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i += 1
    }

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")
    }

    Matrices.fromBreeze(cov)
  }

  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = {
    math.abs(value) <= threshold
  }
}
Example 17
Source File: SpearmanCorrelation.scala From BigDatalog with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Logging
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) =>
      vec.toArray.view.zipWithIndex.map { case (v, j) =>
        ((j, v), uid)
      }
    }
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = cachedUids.map { uid =>
          (uid, (preCol, averageRank))
        }
        cachedUids.clear()
        output
      }
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
          output
        } else {
          cachedUids += uid
          Iterator.empty
        }
      } ++ flush()
    }
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
      Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray)
    }
    PearsonCorrelation.computeCorrelationMatrix(groupedRanks)
  }
}
Example 18
Source File: SVDExample.scala From Swallow with Apache License 2.0

package com.intel.hibench.sparkbench.ml

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.SingularValueDecomposition
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

import scopt.OptionParser

object SVDExample {

  case class Params(
    numFeatures: Int = 0,
    numSingularValues: Int = 0,
    computeU: Boolean = true,
    maxResultSize: String = "1g",
    dataPath: String = null
  )

  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("SVD") {
      head("SVD: an example of SVD for matrix decomposition.")
      opt[Int]("numFeatures")
        .text(s"numFeatures, default: ${defaultParams.numFeatures}")
        .action((x, c) => c.copy(numFeatures = x))
      opt[Int]("numSingularValues")
        .text(s"numSingularValues, default: ${defaultParams.numSingularValues}")
        .action((x, c) => c.copy(numSingularValues = x))
      opt[Boolean]("computeU")
        .text(s"computeU, default: ${defaultParams.computeU}")
        .action((x, c) => c.copy(computeU = x))
      opt[String]("maxResultSize")
        .text(s"maxResultSize, default: ${defaultParams.maxResultSize}")
        .action((x, c) => c.copy(maxResultSize = x))
      arg[String]("<dataPath>")
        .required()
        .text("data path of SVD")
        .action((x, c) => c.copy(dataPath = x))
    }
    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf()
      .setAppName(s"SVD with $params")
      .set("spark.driver.maxResultSize", params.maxResultSize)
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    val dataPath = params.dataPath
    val numFeatures = params.numFeatures
    val numSingularValues = params.numSingularValues
    val computeU = params.computeU

    val data: RDD[Vector] = sc.objectFile(dataPath)
    val mat: RowMatrix = new RowMatrix(data)

    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(numSingularValues, computeU)
    val U: RowMatrix = svd.U  // The U factor is a RowMatrix.
    val s: Vector = svd.s     // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V     // The V factor is a local dense matrix.

    sc.stop()
  }
}
Example 19
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0

package com.cloudera.sa.taxi360.etl.machinelearning.kudu

import com.cloudera.sa.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MlLibOnKudu {
  def main(args: Array[String]): Unit = {

    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<taxiTable> " +
        "<numOfCenters> " +
        "<numOfIterations> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val taxiTable = args(2)
    val numOfCenters = args(3).toInt
    val numOfIterations = args(4).toInt

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val sqlContext = new SQLContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> taxiTable,
      "kudu.master" -> kuduMaster)

    sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load.
      registerTempTable("ny_taxi_trip_tmp")

    //Vector
    val vectorRDD: RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => {
      val taxiTrip = NyTaxiYellowTripBuilder.build(r)
      generateVectorOnly(taxiTrip)
    })

    println("--Running KMeans")
    val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations)
    println(" > vector centers:")
    clusters.clusterCenters.foreach(v => println(" >> " + v))

    println("--Running corr")
    val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson")
    println(" > corr: " + correlMatrix.toString)

    println("--Running colStats")
    val colStats = Statistics.colStats(vectorRDD)
    println(" > max: " + colStats.max)
    println(" > count: " + colStats.count)
    println(" > mean: " + colStats.mean)
    println(" > min: " + colStats.min)
    println(" > normL1: " + colStats.normL1)
    println(" > normL2: " + colStats.normL2)
    println(" > numNonZeros: " + colStats.numNonzeros)
    println(" > variance: " + colStats.variance)

    //Labeled Points
  }
Example 20
Source File: SparkSVDExampleOne.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

package linalg.svd

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition, Vector, Vectors}

object SparkSVDExampleOne {

  def main(args: Array[String]) {
    val denseData = Seq(
      Vectors.dense(0.0, 1.0, 2.0, 1.0, 5.0, 3.3, 2.1),
      Vectors.dense(3.0, 4.0, 5.0, 3.1, 4.5, 5.1, 3.3),
      Vectors.dense(6.0, 7.0, 8.0, 2.1, 6.0, 6.7, 6.8),
      Vectors.dense(9.0, 0.0, 1.0, 3.4, 4.3, 1.0, 1.0)
    )
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkSVDDemo")
    val sc = new SparkContext(spConfig)
    val mat: RowMatrix = new RowMatrix(sc.parallelize(denseData, 2))

    // Compute the top 7 singular values and corresponding singular vectors.
    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(7, computeU = true)
    val U: RowMatrix = svd.U // The U factor is a RowMatrix.
    val s: Vector = svd.s // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V // The V factor is a local dense matrix.
    println("U:" + U)
    println("s:" + s)
    println("V:" + V)
    sc.stop()
  }
}
Example 21
Source File: SVDExample.scala From drizzle-spark with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.SingularValueDecomposition
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// $example off$

object SVDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SVDExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))

    val dataRDD = sc.parallelize(data, 2)

    val mat: RowMatrix = new RowMatrix(dataRDD)

    // Compute the top 5 singular values and corresponding singular vectors.
    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true)
    val U: RowMatrix = svd.U  // The U factor is a RowMatrix.
    val s: Vector = svd.s     // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V     // The V factor is a local dense matrix.
    // $example off$
    val collect = U.rows.collect()
    println("U factor is:")
    collect.foreach { vector => println(vector) }
    println(s"Singular values are: $s")
    println(s"V factor is:\n$V")
  }
}
// scalastyle:on println
Example 22
Source File: MultivariateGaussian.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.mllib.stat.distribution

import breeze.linalg.{diag, eigSym, max, DenseMatrix => DBM, DenseVector => DBV, Vector => BV}

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
import org.apache.spark.mllib.util.MLUtils

  private def calculateCovarianceConstants: (DBM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(sigma.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = MLUtils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u.t, -0.5 * (mu.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      case uex: UnsupportedOperationException =>
        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
    }
  }
}
Example 23
Source File: PearsonCorrelation.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i += 1
    }

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      while (i < j) {
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          containNaN = true
          Double.NaN
        } else {
          cov(i, j) / (sigma * cov(i, i))
        }
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      }
      j += 1
    }

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i += 1
    }

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")
    }

    Matrices.fromBreeze(cov)
  }

  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = {
    math.abs(value) <= threshold
  }
}
Example 24
Source File: SpearmanCorrelation.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) =>
      vec.toArray.view.zipWithIndex.map { case (v, j) =>
        ((j, v), uid)
      }
    }
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = cachedUids.map { uid =>
          (uid, (preCol, averageRank))
        }
        cachedUids.clear()
        output
      }
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
          output
        } else {
          cachedUids += uid
          Iterator.empty
        }
      } ++ flush()
    }
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
      Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray)
    }
    PearsonCorrelation.computeCorrelationMatrix(groupedRanks)
  }
}
Example 25
Source File: Util.scala From spark-lp with Apache License 2.0

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.mllib.linalg.{Matrices, Matrix}

  // Expands a packed upper-triangular array (column-major) into a full symmetric n x n matrix.
  def triuToFull(U: Array[Double], n: Int): Matrix = {
    val G = new BDM[Double](n, n)

    var row = 0
    var col = 0
    var idx = 0
    var value = 0.0
    while (col < n) {
      row = 0
      while (row < col) {
        value = U(idx)
        G(row, col) = value
        G(col, row) = value
        idx += 1
        row += 1
      }
      G(col, col) = U(idx)
      idx += 1
      col += 1
    }
    Matrices.dense(n, n, G.data)
  }
}
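A brief usage sketch for triuToFull, assuming it is in scope; the packed values are an illustrative assumption, laid out column by column as (0,0), (0,1), (1,1), (0,2), (1,2), (2,2).

// Packed upper triangle of a symmetric 3x3 matrix.
val packed = Array(1.0, 2.0, 4.0, 3.0, 5.0, 6.0)
val full = triuToFull(packed, 3)
println(full)
// Reconstructed symmetric matrix:
// 1.0  2.0  3.0
// 2.0  4.0  5.0
// 3.0  5.0  6.0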
Example 26
Source File: LocalLDAModel.scala From spark-ml-serving with Apache License 2.0

package io.hydrosphere.spark_ml_serving.clustering

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.{DataUtils, ParamUtils}
import org.apache.spark.ml.clustering.{LocalLDAModel => SparkLocalLDA}
import org.apache.spark.mllib.clustering.{LocalLDAModel => OldSparkLocalLDA}
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
import org.apache.spark.sql.SparkSession
import DataUtils._

import scala.reflect.runtime.universe

class LocalLDAModel(override val sparkTransformer: SparkLocalLDA)
  extends LocalTransformer[SparkLocalLDA] {

  lazy val oldModel: OldSparkLocalLDA = {
    val mirror = universe.runtimeMirror(sparkTransformer.getClass.getClassLoader)
    val parentTerm = universe.typeOf[SparkLocalLDA].decl(universe.TermName("oldLocalModel")).asTerm
    mirror.reflect(sparkTransformer).reflectField(parentTerm).get.asInstanceOf[OldSparkLocalLDA]
  }

  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getFeaturesCol) match {
      case Some(column) =>
        val newData = column.data.mapToMlLibVectors.map(oldModel.topicDistribution(_).toList)
        localData.withColumn(
          LocalDataColumn(
            sparkTransformer.getTopicDistributionCol,
            newData
          )
        )
      case None => localData
    }
  }
}

object LocalLDAModel
  extends SimpleModelLoader[SparkLocalLDA]
  with TypedTransformerConverter[SparkLocalLDA] {

  override def build(metadata: Metadata, data: LocalData): SparkLocalLDA = {
    val topics = DataUtils.constructMatrix(
      data.column("topicsMatrix").get.data.head.asInstanceOf[Map[String, Any]]
    )
    val gammaShape = data.column("gammaShape").get.data.head.asInstanceOf[java.lang.Double]
    val topicConcentration =
      data.column("topicConcentration").get.data.head.asInstanceOf[java.lang.Double]
    val docConcentration = DataUtils.constructVector(
      data.column("docConcentration").get.data.head.asInstanceOf[Map[String, Any]]
    )
    val vocabSize = data.column("vocabSize").get.data.head.asInstanceOf[java.lang.Integer]

    val oldLdaCtor = classOf[OldSparkLocalLDA].getDeclaredConstructor(
      classOf[Matrix],
      classOf[Vector],
      classOf[Double],
      classOf[Double]
    )
    val oldLDA = oldLdaCtor.newInstance(
      Matrices.fromML(topics),
      Vectors.fromML(docConcentration),
      topicConcentration,
      gammaShape
    )

    val ldaCtor = classOf[SparkLocalLDA].getDeclaredConstructor(
      classOf[String],
      classOf[Int],
      classOf[OldSparkLocalLDA],
      classOf[SparkSession]
    )
    val lda = ldaCtor.newInstance(metadata.uid, vocabSize, oldLDA, null)

    ParamUtils.set(lda, lda.optimizer, metadata)
    ParamUtils.set(lda, lda.keepLastCheckpoint, metadata)
    ParamUtils.set(lda, lda.seed, metadata)
    ParamUtils.set(lda, lda.featuresCol, metadata)
    ParamUtils.set(lda, lda.learningDecay, metadata)
    ParamUtils.set(lda, lda.checkpointInterval, metadata)
    ParamUtils.set(lda, lda.learningOffset, metadata)
    ParamUtils.set(lda, lda.maxIter, metadata)
    ParamUtils.set(lda, lda.k, metadata)
    lda
  }

  override implicit def toLocal(sparkTransformer: SparkLocalLDA): LocalTransformer[SparkLocalLDA] =
    new LocalLDAModel(sparkTransformer)
}
Example 27
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0

package com.hadooparchitecturebook.taxi360.etl.machinelearning.kudu

import com.hadooparchitecturebook.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MlLibOnKudu {
  def main(args: Array[String]): Unit = {

    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<taxiTable> " +
        "<numOfCenters> " +
        "<numOfIterations> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val taxiTable = args(2)
    val numOfCenters = args(3).toInt
    val numOfIterations = args(4).toInt

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val sqlContext = new SQLContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> taxiTable,
      "kudu.master" -> kuduMaster)

    sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load.
      registerTempTable("ny_taxi_trip_tmp")

    //Vector
    val vectorRDD: RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => {
      val taxiTrip = NyTaxiYellowTripBuilder.build(r)
      generateVectorOnly(taxiTrip)
    })

    println("--Running KMeans")
    val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations)
    println(" > vector centers:")
    clusters.clusterCenters.foreach(v => println(" >> " + v))

    println("--Running corr")
    val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson")
    println(" > corr: " + correlMatrix.toString)

    println("--Running colStats")
    val colStats = Statistics.colStats(vectorRDD)
    println(" > max: " + colStats.max)
    println(" > count: " + colStats.count)
    println(" > mean: " + colStats.mean)
    println(" > min: " + colStats.min)
    println(" > normL1: " + colStats.normL1)
    println(" > normL2: " + colStats.normL2)
    println(" > numNonZeros: " + colStats.numNonzeros)
    println(" > variance: " + colStats.variance)

    //Labeled Points
  }
Example 28
Source File: activity _model.scala From Spark_Personas with MIT License

// Normalization
val result = h.sql("select max(visit_times) from model_input_active_t")      // maximum number of visits
val max_visit_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(visit_times) from model_input_active_t")      // minimum number of visits
val min_visit_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_visit_times = if ((max_visit_times - min_visit_times) == 0) 1 else (max_visit_times - min_visit_times)

val result = h.sql("select max(last_online_time) from model_input_active_t") // most days since last login
val max_last_online_time = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(last_online_time) from model_input_active_t") // fewest days since last login
val min_last_online_time = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_last_online_time = if ((max_last_online_time - min_last_online_time) == 0) 1 else (max_last_online_time - min_last_online_time)

val result = h.sql("select max(pay_times) from model_input_active_t")        // maximum number of payments
val max_pay_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(pay_times) from model_input_active_t")        // minimum number of payments
val min_pay_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_pay_times = if ((max_pay_times - min_pay_times) == 0) 1 else (max_pay_times - min_pay_times)

val result = h.sql("select max(comment_times) from model_input_active_t")    // maximum number of inquiries/comments
val max_comment_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(comment_times) from model_input_active_t")    // minimum number of inquiries/comments
val min_comment_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_comment_times = if ((max_comment_times - min_comment_times) == 0) 1 else (max_comment_times - min_comment_times)

val result = h.sql("select max(stay_time) from model_input_active_t")        // maximum stay time
val max_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val result = h.sql("select min(stay_time) from model_input_active_t")        // minimum stay time
val min_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val region_stay_time = if ((max_stay_time - min_stay_time) == 0) 1 else (max_stay_time - min_stay_time)

val result = h.sql("select max(visit_day_times) from model_input_active_t")  // maximum number of login days
val max_visit_day_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(visit_day_times) from model_input_active_t")  // minimum number of login days
val min_visit_day_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_visit_day_times = if ((max_visit_day_times - min_visit_day_times) == 0) 1 else (max_visit_day_times - min_visit_day_times)

// Weights: visit_times 0.2, visit_targetpage_percen 0.1, last_online_time 0.1, pay_times 0.2,
//          comment_times 0.2, stay_time 0.1, visit_day_times 0.1
val normalization = h.sql("select t1.cookie , ((t1.visit_times- " + min_visit_times + ")*0.2/" + region_visit_times + ") as visit_times, t1.visit_targetpage_percen*0.1, ((t1.last_online_time- " + min_last_online_time + ")*0.1/" + region_last_online_time + ") as last_online_time, ((t1.pay_times- " + min_pay_times + ")*0.2/" + region_pay_times + ") as pay_times, ((t1.comment_times- " + min_comment_times + ")*0.2/" + region_comment_times + ") as comment_times, ((t1.stay_time- " + min_stay_time + ")*0.1/" + region_stay_time + ") as stay_time, ((t1.visit_day_times- " + min_visit_day_times + ")*0.1/" + region_visit_day_times + ") as visit_day_times from model_input_active_t t1")

import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

// Convert the DataFrame to Vectors. There is no direct API for this, so the approach is to convert
// the DataFrame to an RDD and call Vectors.dense on each row to assemble the vectors.
val data = normalization.rdd.map(line => Vectors.dense(
  line.get(1).toString.asInstanceOf[String].toDouble,
  line.get(2).toString.asInstanceOf[String].toDouble,
  line.get(3).toString.asInstanceOf[String].toDouble,
  line.get(4).toString.asInstanceOf[String].toDouble,
  line.get(5).toString.asInstanceOf[String].toDouble,
  line.get(6).toString.asInstanceOf[String].toDouble,
  line.get(7).toString.asInstanceOf[String].toDouble))
val rm = new RowMatrix(data)
val pc = rm.computePrincipalComponents(1)
val mx = rm.multiply(pc)
// to be continued
Example 29
Source File: PCAOnRowMatrixExample.scala From drizzle-spark with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// $example off$

object PCAOnRowMatrixExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAOnRowMatrixExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))

    val dataRDD = sc.parallelize(data, 2)

    val mat: RowMatrix = new RowMatrix(dataRDD)

    // Compute the top 4 principal components.
    // Principal components are stored in a local dense matrix.
    val pc: Matrix = mat.computePrincipalComponents(4)

    // Project the rows to the linear space spanned by the top 4 principal components.
    val projected: RowMatrix = mat.multiply(pc)
    // $example off$
    val collect = projected.rows.collect()
    println("Projected Row Matrix of principal component:")
    collect.foreach { vector => println(vector) }
  }
}
// scalastyle:on println
Example 30
Source File: PCAOnRowMatrixExample.scala From sparkoscope with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// $example off$

object PCAOnRowMatrixExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAOnRowMatrixExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))

    val dataRDD = sc.parallelize(data, 2)

    val mat: RowMatrix = new RowMatrix(dataRDD)

    // Compute the top 4 principal components.
    // Principal components are stored in a local dense matrix.
    val pc: Matrix = mat.computePrincipalComponents(4)

    // Project the rows to the linear space spanned by the top 4 principal components.
    val projected: RowMatrix = mat.multiply(pc)
    // $example off$
    val collect = projected.rows.collect()
    println("Projected Row Matrix of principal component:")
    collect.foreach { vector => println(vector) }
  }
}
// scalastyle:on println
Example 31
Source File: SVDExample.scala From sparkoscope with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.SingularValueDecomposition
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// $example off$

object SVDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SVDExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))

    val dataRDD = sc.parallelize(data, 2)

    val mat: RowMatrix = new RowMatrix(dataRDD)

    // Compute the top 5 singular values and corresponding singular vectors.
    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true)
    val U: RowMatrix = svd.U  // The U factor is a RowMatrix.
    val s: Vector = svd.s     // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V     // The V factor is a local dense matrix.
    // $example off$
    val collect = U.rows.collect()
    println("U factor is:")
    collect.foreach { vector => println(vector) }
    println(s"Singular values are: $s")
    println(s"V factor is:\n$V")
  }
}
// scalastyle:on println
Example 32
Source File: PearsonCorrelation.scala From sparkoscope with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

  def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = {
    val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]]
    val n = cov.cols

    // Compute the standard deviation on the diagonals first
    var i = 0
    while (i < n) {
      // TODO remove once covariance numerical issue resolved.
      cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i))
      i += 1
    }

    // Loop through columns since cov is column major
    var j = 0
    var sigma = 0.0
    var containNaN = false
    while (j < n) {
      sigma = cov(j, j)
      i = 0
      while (i < j) {
        val corr = if (sigma == 0.0 || cov(i, i) == 0.0) {
          containNaN = true
          Double.NaN
        } else {
          cov(i, j) / (sigma * cov(i, i))
        }
        cov(i, j) = corr
        cov(j, i) = corr
        i += 1
      }
      j += 1
    }

    // put 1.0 on the diagonals
    i = 0
    while (i < n) {
      cov(i, i) = 1.0
      i += 1
    }

    if (containNaN) {
      logWarning("Pearson correlation matrix contains NaN values.")
    }

    Matrices.fromBreeze(cov)
  }

  private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = {
    math.abs(value) <= threshold
  }
}
Example 33
Source File: SpearmanCorrelation.scala From sparkoscope with Apache License 2.0

package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

  override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
    // ((columnIndex, value), rowUid)
    val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) =>
      vec.toArray.view.zipWithIndex.map { case (v, j) =>
        ((j, v), uid)
      }
    }
    // global sort by (columnIndex, value)
    val sorted = colBased.sortByKey()
    // assign global ranks (using average ranks for tied values)
    val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
      var preCol = -1
      var preVal = Double.NaN
      var startRank = -1.0
      var cachedUids = ArrayBuffer.empty[Long]
      val flush: () => Iterable[(Long, (Int, Double))] = () => {
        val averageRank = startRank + (cachedUids.size - 1) / 2.0
        val output = cachedUids.map { uid =>
          (uid, (preCol, averageRank))
        }
        cachedUids.clear()
        output
      }
      iter.flatMap { case (((j, v), uid), rank) =>
        // If we see a new value or cachedUids is too big, we flush ids with their average rank.
        if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
          val output = flush()
          preCol = j
          preVal = v
          startRank = rank
          cachedUids += uid
          output
        } else {
          cachedUids += uid
          Iterator.empty
        }
      } ++ flush()
    }
    // Replace values in the input matrix by their ranks compared with values in the same column.
    // Note that shifting all ranks in a column by a constant value doesn't affect result.
    val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
      // sort by column index and then convert values to a vector
      Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray)
    }
    PearsonCorrelation.computeCorrelationMatrix(groupedRanks)
  }
}
Example 34
Source File: ApspResult.scala From spark-all-pairs-shortest-path with Apache License 2.0

import java.io.Serializable

import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.BlockMatrix
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel

class ApspResult (
    var size: Long,
    var distMatrix: BlockMatrix)
  extends Serializable with Logging {

  validateResult(distMatrix)

  private def validateResult(result: BlockMatrix): Unit = {
    require(result.numRows == result.numCols,
      "The shortest distance matrix is not square.")
    require(size == result.numRows,
      s"The size of the shortest distance matrix does not match $size.")
    if (result.blocks.getStorageLevel == StorageLevel.NONE) {
      logWarning("The APSP result is not cached. Lookup could be slow")
    }
  }

  def lookupDist(srcId: Long, dstId: Long): Double = {
    val sizePerBlock = distMatrix.rowsPerBlock
    val rowBlockId = (srcId / sizePerBlock).toInt
    val colBlockId = (dstId / sizePerBlock).toInt
    val block = distMatrix.blocks
      .filter { case ((i, j), _) => (i == rowBlockId) & (j == colBlockId) }
      .first._2
    block.toArray((dstId % sizePerBlock).toInt * block.numRows + (srcId % sizePerBlock).toInt)
  }

  def toLocal(): Matrix = {
    distMatrix.toLocalMatrix()
  }
}
Example 35
Source File: APSPSpec.scala From spark-all-pairs-shortest-path with Apache License 2.0

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import org.scalatest.{Outcome, FlatSpec}
import AllPairsShortestPath._
import breeze.linalg.{DenseMatrix => BDM}

class APSPSpec extends FlatSpec {

  val conf = new SparkConf()
    .setAppName("AllPairsShortestPath")
    .setMaster("local[4]")
    .set("spark.driver.allowMultipleContexts", "true")
  val sc = new SparkContext(conf)

  override def withFixture(test: NoArgTest): Outcome = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)
    try {
      test() // invoke the test function
    }
  }

  def fourByFourBlockMatrx = {
    val entries = sc.parallelize(Array(
      (0, 1, 20), (0, 2, 4), (0, 3, 2),
      (1, 0, 2), (1, 2, 1), (1, 3, 3),
      (2, 0, 1), (2, 1, 6), (2, 3, 5),
      (3, 0, 4), (3, 1, 2), (3, 2, 2))).map { case (i, j, v) => MatrixEntry(i, j, v) }
    val coordMat = new CoordinateMatrix(entries)
    val matA = coordMat.toBlockMatrix(2, 2).cache()
    matA
  }

  def ApspPartitioner = {
    GridPartitioner(fourByFourBlockMatrx.numRowBlocks, fourByFourBlockMatrx.numColBlocks,
      fourByFourBlockMatrx.blocks.partitions.length)
  }

  def toBreeze(A: Matrix): BDM[Double] = {
    new BDM[Double](A.numRows, A.numCols, A.toArray)
  }

  "The sample 4x4 Block Matrix" should "be valid" in {
    fourByFourBlockMatrx.validate()
  }

  it should "match our APSP matrix" in {
    println(fourByFourBlockMatrx.toLocalMatrix())
    val result = new DistributedBlockFW
    val observed = toBreeze(result.compute(fourByFourBlockMatrx).toLocal())
    val expected = BDM(
      (0.0, 4.0, 4.0, 2.0),
      (2.0, 0.0, 1.0, 3.0),
      (1.0, 5.0, 0.0, 3.0),
      (3.0, 2.0, 2.0, 0.0)
    )
    assert(observed === expected)
  }
}
Example 36
Source File: PCAOnRowMatrixExample.scala From multi-tenancy-spark with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// $example off$

object PCAOnRowMatrixExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAOnRowMatrixExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))

    val dataRDD = sc.parallelize(data, 2)

    val mat: RowMatrix = new RowMatrix(dataRDD)

    // Compute the top 4 principal components.
    // Principal components are stored in a local dense matrix.
    val pc: Matrix = mat.computePrincipalComponents(4)

    // Project the rows to the linear space spanned by the top 4 principal components.
    val projected: RowMatrix = mat.multiply(pc)
    // $example off$
    val collect = projected.rows.collect()
    println("Projected Row Matrix of principal component:")
    collect.foreach { vector => println(vector) }
  }
}
// scalastyle:on println
Example 37
Source File: SVDExample.scala From multi-tenancy-spark with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.SingularValueDecomposition
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// $example off$

object SVDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SVDExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))

    val dataRDD = sc.parallelize(data, 2)

    val mat: RowMatrix = new RowMatrix(dataRDD)

    // Compute the top 5 singular values and corresponding singular vectors.
    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true)
    val U: RowMatrix = svd.U  // The U factor is a RowMatrix.
    val s: Vector = svd.s     // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V     // The V factor is a local dense matrix.
    // $example off$
    val collect = U.rows.collect()
    println("U factor is:")
    collect.foreach { vector => println(vector) }
    println(s"Singular values are: $s")
    println(s"V factor is:\n$V")
  }
}
// scalastyle:on println