org.apache.spark.mllib.linalg.distributed.CoordinateMatrix Scala Examples
The following examples show how to use org.apache.spark.mllib.linalg.distributed.CoordinateMatrix.
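Before the examples, here is a minimal, self-contained sketch (object and variable names are illustrative, not from any of the projects below) showing how a CoordinateMatrix is built from an RDD of MatrixEntry values and converted to the other distributed matrix types; the constructor and conversion methods shown are part of the standard MLlib API.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}

object CoordinateMatrixPrimer {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("CoordinateMatrixPrimer"))

    // A CoordinateMatrix is backed by an RDD of (row, col, value) entries,
    // which makes it a natural fit for very large, sparse matrices.
    val entries = sc.parallelize(Seq(
      MatrixEntry(0, 0, 1.0),
      MatrixEntry(1, 2, 2.5),
      MatrixEntry(3, 1, -4.0)
    ))
    val mat = new CoordinateMatrix(entries)

    // Dimensions are inferred from the largest row and column indices.
    println(s"dimensions: ${mat.numRows()} x ${mat.numCols()}")

    // Conversions to the other distributed matrix representations.
    val rowMat = mat.toRowMatrix()
    val indexedRowMat = mat.toIndexedRowMatrix()
    val blockMat = mat.toBlockMatrix(2, 2) // 2x2 blocks

    println(s"block matrix blocks: ${blockMat.blocks.count()}")
    sc.stop()
  }
}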
Example 1
Source File: TSNEHelper.scala From spark-tsne with Apache License 2.0
package com.github.saurfang.spark.tsne

import breeze.linalg._
import breeze.stats._
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.rdd.RDD

object TSNEHelper {
  // p_ij = (p_{i|j} + p_{j|i}) / 2n
  def computeP(p_ji: CoordinateMatrix, n: Int): RDD[(Int, Iterable[(Int, Double)])] = {
    p_ji.entries
      .flatMap(e => Seq(
        ((e.i.toInt, e.j.toInt), e.value),
        ((e.j.toInt, e.i.toInt), e.value)
      ))
      .reduceByKey(_ + _) // p + p'
      .map { case ((i, j), v) => (i, (j, math.max(v / 2 / n, 1e-12))) } // p / 2n
      .groupByKey()
  }

  def update(Y: DenseMatrix[Double],
             dY: DenseMatrix[Double],
             iY: DenseMatrix[Double],
             gains: DenseMatrix[Double],
             iteration: Int,
             param: TSNEParam): DenseMatrix[Double] = {
    import param._
    val momentum = if (iteration <= t_momentum) initial_momentum else final_momentum
    gains.foreachPair {
      case ((i, j), old_gain) =>
        val new_gain = math.max(min_gain,
          if ((dY(i, j) > 0.0) != (iY(i, j) > 0.0))
            old_gain + 0.2
          else
            old_gain * 0.8
        )
        gains.update(i, j, new_gain)

        val new_iY = momentum * iY(i, j) - eta * new_gain * dY(i, j)
        iY.update(i, j, new_iY)

        Y.update(i, j, Y(i, j) + new_iY) // Y += iY
    }
    val t_Y: DenseVector[Double] = mean(Y(::, *)).t
    val y_sub = Y(*, ::)
    Y := y_sub - t_Y
  }
}
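For context, a brief usage sketch of the computeP helper shown above: it symmetrizes the conditional affinities p_{j|i} into p_ij = (p_{i|j} + p_{j|i}) / 2n and groups the result by row index. The toy affinity values and object name here are made up for illustration.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import com.github.saurfang.spark.tsne.TSNEHelper

object ComputePExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("ComputePExample"))

    // Asymmetric conditional affinities p_{j|i} for n = 3 points, stored as a CoordinateMatrix.
    val n = 3
    val p_ji = new CoordinateMatrix(sc.parallelize(Seq(
      MatrixEntry(0, 1, 0.8), MatrixEntry(1, 0, 0.4),
      MatrixEntry(1, 2, 0.6), MatrixEntry(2, 1, 0.2)
    )))

    // Symmetrize: p_ij = (p_{i|j} + p_{j|i}) / 2n, grouped by row index i.
    val p = TSNEHelper.computeP(p_ji, n)
    p.collect().foreach { case (i, row) => println(s"$i -> ${row.toList}") }

    sc.stop()
  }
}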
Example 2
Source File: X2P.scala From spark-tsne with Apache License 2.0
package com.github.saurfang.spark.tsne

import breeze.linalg.DenseVector
import org.apache.spark.mllib.X2PHelper._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry, RowMatrix}
import org.apache.spark.mllib.rdd.MLPairRDDFunctions._
import org.slf4j.LoggerFactory

object X2P {
  private def logger = LoggerFactory.getLogger(X2P.getClass)

  def apply(x: RowMatrix, tol: Double = 1e-5, perplexity: Double = 30.0): CoordinateMatrix = {
    require(tol >= 0, "Tolerance must be non-negative")
    require(perplexity > 0, "Perplexity must be positive")

    val mu = (3 * perplexity).toInt //TODO: Expose this as parameter
    val logU = Math.log(perplexity)
    val norms = x.rows.map(Vectors.norm(_, 2.0))
    norms.persist()
    val rowsWithNorm = x.rows.zip(norms).map { case (v, norm) => VectorWithNorm(v, norm) }
    val neighbors = rowsWithNorm.zipWithIndex()
      .cartesian(rowsWithNorm.zipWithIndex())
      .flatMap {
        case ((u, i), (v, j)) =>
          if (i < j) {
            val dist = fastSquaredDistance(u, v)
            Seq((i, (j, dist)), (j, (i, dist)))
          } else Seq.empty
      }
      .topByKey(mu)(Ordering.by(e => -e._2))

    val p_betas = neighbors.map {
      case (i, arr) =>
        var betamin = Double.NegativeInfinity
        var betamax = Double.PositiveInfinity
        var beta = 1.0

        val d = DenseVector(arr.map(_._2))
        var (h, p) = Hbeta(d, beta)

        //logInfo("data was " + d.toArray.toList)
        //logInfo("array P was " + p.toList)

        // Evaluate whether the perplexity is within tolerance
        def Hdiff = h - logU
        var tries = 0
        while (Math.abs(Hdiff) > tol && tries < 50) {
          //If not, increase or decrease precision
          if (Hdiff > 0) {
            betamin = beta
            beta = if (betamax.isInfinite) beta * 2 else (beta + betamax) / 2
          } else {
            betamax = beta
            beta = if (betamin.isInfinite) beta / 2 else (beta + betamin) / 2
          }

          // Recompute the values
          val HP = Hbeta(d, beta)
          h = HP._1
          p = HP._2
          tries = tries + 1
        }
        //logInfo("array P is " + p.toList)

        (arr.map(_._1).zip(p.toArray).map { case (j, v) => MatrixEntry(i, j, v) }, beta)
    }

    logger.info("Mean value of sigma: " + p_betas.map(x => math.sqrt(1 / x._2)).mean)

    new CoordinateMatrix(p_betas.flatMap(_._1))
  }
}
Example 3
Source File: AmplificationsTest.scala From spark-tda with Apache License 2.0
package org.apache.spark.mllib.linalg.distributed.impl

import org.scalacheck.Gen.{choose, oneOf, listOfN}
import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Prop.forAllNoShrink
import org.scalatest.Matchers
import org.scalatest.prop.GeneratorDrivenPropertyChecks
import org.apache.spark.mllib.linalg.{DenseVector, CosineDistance, JaccardDistance}
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix

class AmplificationsTest
    extends ImplPropSpec
    with GeneratorDrivenPropertyChecks
    with Matchers {

  property(
    "or-construction generates the correct number of indexed rows for the given data points") {
    forAllNoShrink(simhashBucketsGen) {
      case (buckets, numVectors) =>
        val or = ORConstruction(CosineDistance)
        val sim = new CoordinateMatrix(or(buckets, numVectors)).toIndexedRowMatrix.rows
          .collect()
        sim.size === numVectors
        sim.forall(s => s.vector.size <= numVectors)
    }
  }

  property(
    "band or-construction generates the correct number of indexed rows for the given data points") {
    forAllNoShrink(minhashBucketsGen) {
      case (buckets, numVectors, numBands) =>
        val bor = BandORConstruction(JaccardDistance, numBands)
        val sim = new CoordinateMatrix(bor(buckets, numVectors)).toIndexedRowMatrix.rows
          .collect()
        sim.size === numVectors
        sim.forall(s => s.vector.size <= numVectors)
    }
  }
}
Example 4
Source File: T9-4DataTypes.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object DataTypesApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: DataTypesApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => f.map(f => f.toDouble))

    val denseV = substream.map(f => Vectors.dense(f.slice(1, 5)))
    denseV.print()

    val sparseV = substream.map(f => f.slice(1, 5).toList)
      .map(f => f.zipWithIndex.map { case (s, i) => (i, s) })
      .map(f => f.filter(v => v._2 != 0))
      .map(l => Vectors.sparse(l.size, l))
    sparseV.print()

    val labeledP = substream.map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))
    labeledP.print()

    val denseM = substream.map(f =>
      Matrices.dense(3, 16, f.slice(3, 19) ++ f.slice(20, 36) ++ f.slice(37, 53)))
    denseM.print()

    denseV.foreachRDD(rdd => {
      val rowM = new RowMatrix(rdd)
      println(rowM)
    })

    denseV.foreachRDD(rdd => {
      val iRdd = rdd.zipWithIndex.map(v => new IndexedRow(v._2, v._1))
      val iRowM = new IndexedRowMatrix(iRdd)
      println(iRowM)
    })

    substream.foreachRDD(rdd => {
      val entries = rdd.zipWithIndex.flatMap(v =>
          List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList)))
        .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x)
      val cRowM = new CoordinateMatrix(entries)
      println(cRowM)
    })

    substream.foreachRDD(rdd => {
      val entries = rdd.zipWithIndex.flatMap(v =>
          List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList)))
        .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x)
      val blockM = new CoordinateMatrix(entries).toBlockMatrix
      println(blockM)
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 5
Source File: SparkMatrix.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package linalg.matrix

import org.apache.spark.ml.linalg.Matrix
import org.apache.spark.ml.linalg.Matrices
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry

object SparkMatrix {

  def main(args: Array[String]) {

    val dMatrix: Matrix = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    println("dMatrix: \n" + dMatrix)

    val sMatrixOne: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5, 6, 7))
    println("sMatrixOne: \n" + sMatrixOne)

    val sMatrixTwo: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(5, 6, 7))
    println("sMatrixTwo: \n" + sMatrixTwo)

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)

    val denseData = Seq(
      Vectors.dense(0.0, 1.0, 2.1),
      Vectors.dense(3.0, 2.0, 4.0),
      Vectors.dense(5.0, 7.0, 8.0),
      Vectors.dense(9.0, 0.0, 1.1)
    )
    val sparseData = Seq(
      Vectors.sparse(3, Seq((1, 1.0), (2, 2.1))),
      Vectors.sparse(3, Seq((0, 3.0), (1, 2.0), (2, 4.0))),
      Vectors.sparse(3, Seq((0, 5.0), (1, 7.0), (2, 8.0))),
      Vectors.sparse(3, Seq((0, 9.0), (2, 1.0)))
    )

    val denseMat = new RowMatrix(sc.parallelize(denseData, 2))
    val sparseMat = new RowMatrix(sc.parallelize(sparseData, 2))

    println("Dense Matrix - Num of Rows :" + denseMat.numRows())
    println("Dense Matrix - Num of Cols:" + denseMat.numCols())
    println("Sparse Matrix - Num of Rows :" + sparseMat.numRows())
    println("Sparse Matrix - Num of Cols:" + sparseMat.numCols())

    val data = Seq(
      (0L, Vectors.dense(0.0, 1.0, 2.0)),
      (1L, Vectors.dense(3.0, 4.0, 5.0)),
      (3L, Vectors.dense(9.0, 0.0, 1.0))
    ).map(x => IndexedRow(x._1, x._2))
    val indexedRows: RDD[IndexedRow] = sc.parallelize(data, 2)
    val indexedRowsMat = new IndexedRowMatrix(indexedRows)
    println("Indexed Row Matrix - No of Rows: " + indexedRowsMat.numRows())
    println("Indexed Row Matrix - No of Cols: " + indexedRowsMat.numCols())

    val entries = sc.parallelize(Seq(
      (0, 0, 1.0),
      (0, 1, 2.0),
      (1, 1, 3.0),
      (1, 2, 4.0),
      (2, 2, 5.0),
      (2, 3, 6.0),
      (3, 0, 7.0),
      (3, 3, 8.0),
      (4, 1, 9.0)), 3).map { case (i, j, value) =>
      MatrixEntry(i, j, value)
    }
    val coordinateMat = new CoordinateMatrix(entries)
    println("Coordinate Matrix - No of Rows: " + coordinateMat.numRows())
    println("Coordinate Matrix - No of Cols: " + coordinateMat.numCols())

    sc.stop()
  }
}
Example 6
Source File: APSPSpec.scala From spark-all-pairs-shortest-path with Apache License 2.0
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import org.scalatest.{Outcome, FlatSpec}
import AllPairsShortestPath._
import breeze.linalg.{DenseMatrix => BDM}

class APSPSpec extends FlatSpec {

  val conf = new SparkConf()
    .setAppName("AllPairsShortestPath")
    .setMaster("local[4]")
    .set("spark.driver.allowMultipleContexts", "true")
  val sc = new SparkContext(conf)

  override def withFixture(test: NoArgTest): Outcome = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)
    try {
      test() // invoke the test function
    }
  }

  def fourByFourBlockMatrx = {
    val entries = sc.parallelize(Array(
      (0, 1, 20), (0, 2, 4), (0, 3, 2),
      (1, 0, 2), (1, 2, 1), (1, 3, 3),
      (2, 0, 1), (2, 1, 6), (2, 3, 5),
      (3, 0, 4), (3, 1, 2), (3, 2, 2))).map { case (i, j, v) => MatrixEntry(i, j, v) }
    val coordMat = new CoordinateMatrix(entries)
    val matA = coordMat.toBlockMatrix(2, 2).cache()
    matA
  }

  def ApspPartitioner = {
    GridPartitioner(fourByFourBlockMatrx.numRowBlocks, fourByFourBlockMatrx.numColBlocks,
      fourByFourBlockMatrx.blocks.partitions.length)
  }

  def toBreeze(A: Matrix): BDM[Double] = {
    new BDM[Double](A.numRows, A.numCols, A.toArray)
  }

  "The sample 4x4 Block Matrix" should "be valid" in {
    fourByFourBlockMatrx.validate()
  }

  it should "match our APSP matrix" in {
    println(fourByFourBlockMatrx.toLocalMatrix())
    val result = new DistributedBlockFW
    val observed = toBreeze(result.compute(fourByFourBlockMatrx).toLocal())
    val expected = BDM(
      (0.0, 4.0, 4.0, 2.0),
      (2.0, 0.0, 1.0, 3.0),
      (1.0, 5.0, 0.0, 3.0),
      (3.0, 2.0, 2.0, 0.0)
    )
    assert(observed === expected)
  }
}
Example 7
Source File: LocalMinPlus.scala From spark-all-pairs-shortest-path with Apache License 2.0
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import org.scalatest.FlatSpec
import breeze.linalg.{DenseMatrix => BDM, DenseVector, min, Matrix => BM}
import AllPairsShortestPath._

class LocalMinPlus extends FlatSpec {

  def localMinPlus(A: BDM[Double], B: BDM[Double]): BDM[Double] = {
    require(A.cols == B.rows, " Num cols of A does not match the num rows of B")
    val k = A.cols
    val onesA = DenseVector.ones[Double](B.cols)
    val onesB = DenseVector.ones[Double](A.rows)
    var AMinPlusB = A(::, 0) * onesA.t + onesB * B(0, ::)
    if (k > 1) {
      for (i <- 1 until k) {
        val a = A(::, i)
        val b = B(i, ::)
        val aPlusb = a * onesA.t + onesB * b
        AMinPlusB = min(aPlusb, AMinPlusB)
      }
    }
    AMinPlusB
  }

  def fourByFourBlockMatrx = {
    BDM(
      (0.0, 20.0, 4.0, 2.0),
      (2.0, 0.0, 1.0, 3.0),
      (1.0, 6.0, 0.0, 5.0),
      (4.0, 2.0, 2.0, 0.0)
    )
  }

  def fourByFourMinPlusProduct = {
    BDM(
      (0.0, 2.0, 1.0, 2.0),
      (2.0, 0.0, 1.0, 2.0),
      (1.0, 1.0, 0.0, 2.0),
      (2.0, 2.0, 2.0, 0.0)
    )
  }

  "The minPlus product of the sample 4x4 matrix with itself" should "be correct" in {
    assert(localMinPlus(fourByFourBlockMatrx, fourByFourBlockMatrx.t) === fourByFourMinPlusProduct)
  }
}
Example 8
Source File: QueryNearestNeighbours.scala From cosine-lsh-join-spark with MIT License
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry}

class QueryNearestNeighbours(
  distance: VectorDistance,
  threshold: Double,
  queryFraction: Double,
  catalogFraction: Double
) extends QueryJoiner with Serializable {

  def join(queryMatrix: IndexedRowMatrix, catalogMatrix: IndexedRowMatrix): CoordinateMatrix = {
    val sampledQueries = queryMatrix.rows.sample(false, queryFraction)
    val sampledCatalog = catalogMatrix.rows.sample(false, catalogFraction)
    val joined = sampledQueries.cartesian(sampledCatalog)

    val neighbours = joined.map {
      case ((query: IndexedRow), (catalogEntry: IndexedRow)) =>
        new MatrixEntry(query.index, catalogEntry.index, distance(query.vector, catalogEntry.vector))
    }.filter(_.value >= threshold)

    new CoordinateMatrix(neighbours)
  }
}
Example 9
Source File: NearestNeighbours.scala From cosine-lsh-join-spark with MIT License
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry}

class NearestNeighbours(
  distance: VectorDistance,
  threshold: Double,
  fraction: Double) extends Joiner with Serializable {

  def join(inputMatrix: IndexedRowMatrix): CoordinateMatrix = {
    val rows = inputMatrix.rows
    val sampledRows = rows.sample(false, fraction)
    sampledRows.cache()
    val joined = sampledRows.cartesian(rows)
    val similarity = joined.map {
      case ((rowA: IndexedRow), (rowB: IndexedRow)) =>
        ((rowA.index, rowB.index), distance(rowA.vector, rowB.vector))
    }
    val neighbours = similarity.filter {
      case ((indexA: Long, indexB: Long), similarity) =>
        similarity >= threshold &&
          indexA < indexB // make upper triangular and remove self similarities
    }
    val resultRows = neighbours.map {
      case ((indexA: Long, indexB: Long), similarity) =>
        MatrixEntry(indexA, indexB, similarity)
    }
    new CoordinateMatrix(resultRows)
  }
}
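For orientation, here is a hedged usage sketch (not part of the repository) of how this brute-force joiner could be wired up. It assumes a VectorDistance implementation named Cosine exists in com.soundcloud.lsh, which may differ in the actual library; the input rows are made up.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix}
import com.soundcloud.lsh.{Cosine, NearestNeighbours}

object NearestNeighboursExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("NearestNeighboursExample"))

    // Toy input: three indexed rows; in practice this would be a large feature matrix.
    val rows = sc.parallelize(Seq(
      IndexedRow(0L, Vectors.dense(1.0, 0.0, 0.0)),
      IndexedRow(1L, Vectors.dense(0.9, 0.1, 0.0)),
      IndexedRow(2L, Vectors.dense(0.0, 0.0, 1.0))
    ))
    val matrix = new IndexedRowMatrix(rows)

    // Brute-force joiner: sample all rows, keep pairs with similarity >= 0.5.
    val joiner = new NearestNeighbours(Cosine, threshold = 0.5, fraction = 1.0)
    val similarities = joiner.join(matrix) // upper-triangular CoordinateMatrix of (i, j, similarity)

    similarities.entries.collect().foreach(println)
    sc.stop()
  }
}

Because the joiner builds a cartesian product of the sampled rows against all rows, it is only practical for small inputs or small sampling fractions; the LSH-based joiners in the same project exist to avoid exactly this cost.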
Example 10
Source File: QueryHamming.scala From cosine-lsh-join-spark with MIT License
package com.soundcloud.lsh

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.rdd.RDD

class QueryHamming(minCosineSimilarity: Double,
                   dimensions: Int,
                   resultSize: Int,
                   broadcastCatalog: Boolean = true)
  extends QueryJoiner with Serializable {

  override def join(queryMatrix: IndexedRowMatrix, catalogMatrix: IndexedRowMatrix): CoordinateMatrix = {
    val numFeatures = queryMatrix.numCols().toInt
    val randomMatrix = localRandomMatrix(dimensions, numFeatures)
    val querySignatures = matrixToBitSetSparse(queryMatrix, randomMatrix)
    val catalogSignatures = matrixToBitSetSparse(catalogMatrix, randomMatrix)

    var rddSignatures: RDD[SparseSignature] = null
    var broadcastSignatures: Broadcast[Array[SparseSignature]] = null

    if (broadcastCatalog) {
      rddSignatures = querySignatures
      broadcastSignatures = querySignatures.sparkContext.broadcast(catalogSignatures.collect)
    } else {
      rddSignatures = catalogSignatures
      broadcastSignatures = catalogSignatures.sparkContext.broadcast(querySignatures.collect)
    }

    val approximated = rddSignatures.mapPartitions { rddSignatureIterator =>
      val signaturesBC = broadcastSignatures.value
      rddSignatureIterator.flatMap { rddSignature =>
        signaturesBC.map { broadCastSignature =>
          val approximatedCosine =
            hammingToCosine(hamming(rddSignature.bitSet, broadCastSignature.bitSet), dimensions)
          if (broadcastCatalog)
            new MatrixEntry(rddSignature.index, broadCastSignature.index, approximatedCosine)
          else
            new MatrixEntry(broadCastSignature.index, rddSignature.index, approximatedCosine)
        }.filter(_.value >= minCosineSimilarity).sortBy(-_.value).take(resultSize)
      }
    }
    broadcastSignatures.unpersist(true)

    new CoordinateMatrix(approximated)
  }
}