org.apache.spark.mllib.linalg.distributed.MatrixEntry Scala Examples

Example 1
Source File: CosineSimilarity.scala (from Spark-2.3.1, Apache License 2.0; 5 votes)
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix}

/**
 * Example app that computes the column similarities of a [[RowMatrix]] twice, once exactly and
 * once with a threshold-based estimate, and prints an error figure comparing the two results.
 *
 * NOTE(review): this listing looks truncated (closing braces and several expressions are not
 * visible), so the comments below describe only what is shown.
 */
object CosineSimilarity {
  /** Command-line parameters: the input file path and the similarity threshold (default 0.1). */
  case class Params(inputFile: String = null, threshold: Double = 0.1)
    extends AbstractParams[Params]

  /** Parses `args` into [[Params]]; calls `run` on `Some(params)`, otherwise exits with status 1. */
  def main(args: Array[String]) {
    // Parsing starts from the case-class defaults.
    val defaultParams = Params()

    // NOTE(review): the option/argument declarations that the `.text`/`.action` pairs below
    // would normally chain from are not visible in this listing.
    val parser = new OptionParser[Params]("CosineSimilarity") {
      head("CosineSimilarity: an example app.")
        .text(s"threshold similarity: to tradeoff computation vs quality estimate")
        .action((x, c) => c.copy(threshold = x))
        .text(s"input file, one row per line, space-separated")
        .action((x, c) => c.copy(inputFile = x))
      // NOTE(review): the `|`-prefixed lines look like the body of a usage note whose
      // enclosing string literal is missing from this listing.
          |For example, the following command runs this app on a dataset:
          | ./bin/spark-submit  --class org.apache.spark.examples.mllib.CosineSimilarity \
          | examplesjar.jar \
          | --threshold 0.1 data/mllib/sample_svm_data.txt

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)

  /** Creates a SparkContext, loads the input rows and compares exact vs. estimated similarities. */
  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName("CosineSimilarity")
    val sc = new SparkContext(conf)

    // Load and parse the data file.
    // Each line is split on single spaces and every token is parsed as a Double.
    // NOTE(review): the result expression and closing brace of this lambda are not visible.
    val rows = sc.textFile(params.inputFile).map { line =>
      val values = line.split(' ').map(_.toDouble)
    val mat = new RowMatrix(rows)

    // Compute similar columns perfectly, with brute force.
    val exact = mat.columnSimilarities()

    // Compute similar columns with estimation using DIMSUM
    val approx = mat.columnSimilarities(params.threshold)

    // Both blocks turn a MatrixEntry(i, j, value) into an ((i, j), value) pair.
    // NOTE(review): the receivers these pattern-matching blocks are applied to are not visible.
    val exactEntries = { case MatrixEntry(i, j, u) => ((i, j), u) }
    val approxEntries = { case MatrixEntry(i, j, v) => ((i, j), v) }
    // Left outer join on the (i, j) key: exact entries without an estimate arrive as None.
    val MAE = exactEntries.leftOuterJoin(approxEntries) {
      case (u, Some(v)) =>
        math.abs(u - v)
      // NOTE(review): the value produced for the unmatched case is not visible in this listing.
      case (u, None) =>

    println(s"Average absolute error in estimate is: $MAE")

// scalastyle:on println 
Example 2
Source File: QueryNearestNeighboursTest.scala (from cosine-lsh-join-spark, MIT License; 5 votes)
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.scalatest.{FunSuite, Matchers}

/**
 * Checks that [[QueryNearestNeighbours]] with the `Cosine` distance returns the expected
 * query-to-catalog matches for a small hand-built data set.
 *
 * NOTE(review): closing brackets and some bodies are not visible in this listing.
 */
class QueryNearestNeighboursTest
  extends FunSuite
    with SparkLocalContext
    with Matchers {

  // Helper that takes a variable number of doubles and returns a Vector.
  // NOTE(review): the body is not visible in this listing.
  def denseVector(input: Double*): Vector = {

  test("nearest neighbours cosine") {
    // Two query vectors.
    val queryVectorA = denseVector(1.0, 1.0)
    val queryVectorB = denseVector(-1.0, 1.0)

    // Four catalog vectors; A and B repeat the two query vectors exactly.
    val catalogVectorA = denseVector(1.0, 1.0)
    val catalogVectorB = denseVector(-1.0, 1.0)
    val catalogVectorC = denseVector(-1.0, 0.5)
    val catalogVectorD = denseVector(1.0, 0.5)

    // Row indices 0..1 for queries and 0..3 for the catalog; these indices reappear as the
    // (i, j) coordinates of the expected entries.
    val queryRows = Seq(
      IndexedRow(0, queryVectorA),
      IndexedRow(1, queryVectorB)

    val catalogRows = Seq(
      IndexedRow(0, catalogVectorA),
      IndexedRow(1, catalogVectorB),
      IndexedRow(2, catalogVectorC),
      IndexedRow(3, catalogVectorD)

    val queryMatrix = new IndexedRowMatrix(sc.parallelize(queryRows))
    val catalogMatrix = new IndexedRowMatrix(sc.parallelize(catalogRows))

    // NOTE(review): the positional arguments presumably are (distance, threshold,
    // queryFraction, catalogFraction); confirm against the QueryNearestNeighbours constructor.
    val queryNearestNeighbour = new QueryNearestNeighbours(Cosine, 0.4, 1.0, 1.0)
    // Each query is expected to match exactly two catalog rows: 0 -> {0, 3} and 1 -> {1, 2}.
    val expected = Seq(
      MatrixEntry(0, 0, Cosine(queryVectorA, catalogVectorA)),
      MatrixEntry(0, 3, Cosine(queryVectorA, catalogVectorD)),
      MatrixEntry(1, 1, Cosine(queryVectorB, catalogVectorB)),
      MatrixEntry(1, 2, Cosine(queryVectorB, catalogVectorC))

    // The collected entries are compared directly, without any sorting step.
    val got = queryNearestNeighbour.join(queryMatrix, catalogMatrix).entries.collect
    got should be(expected)
Example 3
Source File: QueryHammingTest.scala (from cosine-lsh-join-spark, MIT License; 5 votes)
package com.soundcloud.lsh

import com.soundcloud.TestHelper
import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.scalatest.{FunSuite, Matchers}

/**
 * Runs [[QueryHamming]] over a small query/catalog data set, once with the last constructor
 * flag set to `true` and once with `false`, and compares the sorted result with the same
 * expected entries under an implicit `MatrixEquality(0.02)`.
 *
 * NOTE(review): closing brackets and some bodies are not visible in this listing.
 */
class QueryHammingTest
  extends FunSuite
    with SparkLocalContext
    with Matchers
    with TestHelper {

  // Helper that takes a variable number of doubles and returns a Vector.
  // NOTE(review): the body is not visible in this listing.
  def denseVector(input: Double*): Vector = {

  // Fixtures shared by both tests below.
  val queryVectorA = denseVector(1.0, 1.0)
  val queryVectorB = denseVector(-1.0, 1.0)

  val catalogVectorA = denseVector(1.0, 1.0)
  val catalogVectorB = denseVector(-1.0, 1.0)
  val catalogVectorC = denseVector(-1.0, 0.5)
  val catalogVectorD = denseVector(1.0, 0.5)

  val queryRows = Seq(
    IndexedRow(0, queryVectorA),
    IndexedRow(1, queryVectorB)

  val catalogRows = Seq(
    IndexedRow(0, catalogVectorA),
    IndexedRow(1, catalogVectorB),
    IndexedRow(2, catalogVectorC),
    IndexedRow(3, catalogVectorD)

  // Expected values are the `Cosine` results for each pair, listed in ascending (i, j) order,
  // which is the order the tests sort the actual result into.
  val expected = Array(
    MatrixEntry(0, 0, Cosine(queryVectorA, catalogVectorA)),
    MatrixEntry(0, 3, Cosine(queryVectorA, catalogVectorD)),
    MatrixEntry(1, 1, Cosine(queryVectorB, catalogVectorB)),
    MatrixEntry(1, 2, Cosine(queryVectorB, catalogVectorC))

  test("broadcast catalog") {
    val queryMatrix = new IndexedRowMatrix(sc.parallelize(queryRows))
    val catalogMatrix = new IndexedRowMatrix(sc.parallelize(catalogRows))

    // NOTE(review): the arguments presumably are (minCosineSimilarity, dimensions, resultSize,
    // broadcastCatalog); confirm against the QueryHamming constructor.
    val queryNearestNeighbour = new QueryHamming(0.1, 10000, 2, true)
    val got = queryNearestNeighbour.join(queryMatrix, catalogMatrix).entries.collect

    // NOTE(review): 0.02 is presumably the tolerance used when comparing entry values; the
    // MatrixEquality definition is not visible here.
    implicit val equality = new MatrixEquality(0.02)
    got.sortBy(t => (t.i, t.j)) should equal(expected)

  test("broadcast query") {
    val queryMatrix = new IndexedRowMatrix(sc.parallelize(queryRows))
    val catalogMatrix = new IndexedRowMatrix(sc.parallelize(catalogRows))

    // Same setup as the previous test except for the final flag, which is false here.
    val queryNearestNeighbour = new QueryHamming(0.1, 10000, 2, false)
    val got = queryNearestNeighbour.join(queryMatrix, catalogMatrix).entries.collect

    implicit val equality = new MatrixEquality(0.02)
    got.sortBy(t => (t.i, t.j)) should equal(expected)

Example 4
Source File: NearestNeighboursTest.scala (from cosine-lsh-join-spark, MIT License; 5 votes)
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.scalatest.{FunSuite, Matchers}

/**
 * Checks the entries produced by [[NearestNeighbours]] with the `Cosine` distance on four
 * two-dimensional unit vectors.
 *
 * NOTE(review): closing brackets and some bodies are not visible in this listing.
 */
class NearestNeighboursTest
  extends FunSuite
  with SparkLocalContext
  with Matchers {

  // Helper that takes a variable number of doubles and returns a Vector.
  // NOTE(review): the body is not visible in this listing.
  def denseVector(input: Double*): Vector = {

  test("nearest neighbours cosine") {
    // vecA and vecD are identical; vecC is the negation of vecA.
    val vecA = denseVector(1.0, 0.0)
    val vecB = denseVector(0.0, 1.0)
    val vecC = denseVector(-1.0, 0.0)
    val vecD = denseVector(1.0, 0.0)

    val rows = Seq(
      IndexedRow(0, vecA),
      IndexedRow(1, vecB),
      IndexedRow(2, vecC),
      IndexedRow(3, vecD)
    val indexedMatrix = new IndexedRowMatrix(sc.parallelize(rows))

    // NOTE(review): the arguments presumably are (distance, threshold, fraction); confirm
    // against the NearestNeighbours constructor.
    val nearestNeighbour = new NearestNeighbours(Cosine, 0.0, 1.0)
    val got = nearestNeighbour.join(indexedMatrix)

    // Only pairs with i < j are listed, and the pairs (0, 2) and (2, 3) are absent,
    // presumably because opposite vectors score below the 0.0 threshold.
    val expected = Seq(
      MatrixEntry(0, 1, 0.0),
      MatrixEntry(0, 3, 1.0),
      MatrixEntry(1, 2, 0.0),
      MatrixEntry(1, 3, 0.0)
    val gotEntries = got.entries.collect().toSeq
    gotEntries should be(expected)

Example 5
Source File: QueryHamming.scala (from cosine-lsh-join-spark, MIT License; 5 votes)
package com.soundcloud.lsh

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.rdd.RDD

/**
 * Query joiner that approximates cosine similarity between query and catalog rows from the
 * hamming distance of their bit-set signatures.
 *
 * One side is collected and broadcast while the other stays distributed; which one is chosen
 * by `broadcastCatalog`. For every distributed signature the candidates are filtered by
 * `minCosineSimilarity`, sorted by descending value and cut to `resultSize`.
 *
 * NOTE(review): several lines and closing braces are missing from this listing; helpers such
 * as `localRandomMatrix`, `matrixToBitSetSparse`, `hamming` and `hammingToCosine` are defined
 * elsewhere.
 *
 * @param minCosineSimilarity lower bound on the approximated value for an entry to be kept
 * @param dimensions          passed to `localRandomMatrix` and `hammingToCosine`
 * @param resultSize          maximum number of entries kept per distributed signature
 * @param broadcastCatalog    when true the catalog signatures are broadcast, otherwise the
 *                            query signatures are
 */
class QueryHamming(minCosineSimilarity: Double,
                   dimensions: Int,
                   resultSize: Int,
                   broadcastCatalog: Boolean = true) extends QueryJoiner with Serializable {

  /** Builds the matrix of approximated similarities between query rows and catalog rows. */
  override def join(queryMatrix: IndexedRowMatrix, catalogMatrix: IndexedRowMatrix): CoordinateMatrix = {
    val numFeatures = queryMatrix.numCols().toInt

    // The same random matrix is used for both sides so that their signatures are comparable.
    val randomMatrix = localRandomMatrix(dimensions, numFeatures)
    val querySignatures = matrixToBitSetSparse(queryMatrix, randomMatrix)
    val catalogSignatures = matrixToBitSetSparse(catalogMatrix, randomMatrix)

    // Assigned in the branch below: one side stays an RDD, the other is collected and broadcast.
    var rddSignatures: RDD[SparseSignature] = null
    var broadcastSignatures: Broadcast[Array[SparseSignature]] = null

    if (broadcastCatalog) {
      rddSignatures = querySignatures
      broadcastSignatures = querySignatures.sparkContext.broadcast(catalogSignatures.collect)
    } else {
      rddSignatures = catalogSignatures
      broadcastSignatures = catalogSignatures.sparkContext.broadcast(querySignatures.collect)

    val approximated = rddSignatures.mapPartitions {
      rddSignatureIterator =>
        // Read the broadcast value once per partition.
        val signaturesBC = broadcastSignatures.value
        rddSignatureIterator.flatMap {
          rddSignature =>
              // NOTE(review): the call that iterates over the broadcast signatures is not
              // visible in this listing.
              broadCastSignature =>
                val approximatedCosine = hammingToCosine(hamming(rddSignature.bitSet, broadCastSignature.bitSet), dimensions)

                // The first entry puts the RDD-side index first, the second puts it last.
                // NOTE(review): the `else` between the two entries is missing here.
                if (broadcastCatalog)
                  new MatrixEntry(rddSignature.index, broadCastSignature.index, approximatedCosine)
                  new MatrixEntry(broadCastSignature.index, rddSignature.index, approximatedCosine)
            // Keep entries at or above the minimum, best first, at most resultSize of them.
            }.filter(_.value >= minCosineSimilarity).sortBy(-_.value).take(resultSize)

    new CoordinateMatrix(approximated)

Example 6
Source File: NearestNeighbours.scala (from cosine-lsh-join-spark, MIT License; 5 votes)
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry}

/**
 * Brute-force joiner: pairs a sample of the input rows with all rows via a cartesian product,
 * scores each pair with `distance`, and keeps pairs scoring at least `threshold` whose first
 * index is smaller than the second.
 *
 * NOTE(review): closing braces and the receivers of two pattern-matching blocks are missing
 * from this listing.
 *
 * @param distance  function applied to the two row vectors of each pair
 * @param threshold minimum score for a pair to be kept
 * @param fraction  sampling fraction passed to `sample` (sampling is without replacement)
 */
class NearestNeighbours(
                         distance: VectorDistance,
                         threshold: Double,
                         fraction: Double) extends Joiner with Serializable {

  /** Returns a [[CoordinateMatrix]] whose entries are the retained (indexA, indexB, score) triples. */
  def join(inputMatrix: IndexedRowMatrix): CoordinateMatrix = {
    val rows = inputMatrix.rows
    val sampledRows = rows.sample(false, fraction)

    // Every sampled row is paired with every row of the full input.
    val joined = sampledRows.cartesian(rows)

    // Maps each row pair to ((indexA, indexB), score).
    // NOTE(review): the receiver of this block (presumably a map over `joined`) is not visible.
    val similarity = {
      case ((rowA: IndexedRow), (rowB: IndexedRow)) =>
        ((rowA.index, rowB.index), distance(rowA.vector, rowB.vector))

    val neighbours = similarity.filter {
      case ((indexA: Long, indexB: Long), similarity) =>
        similarity >= threshold &&
          indexA < indexB // make upper triangular and remove self similarities

    // Converts the retained pairs into MatrixEntry values.
    // NOTE(review): the receiver of this block (presumably a map over `neighbours`) is not visible.
    val resultRows = {
      case ((indexA: Long, indexB: Long), similarity) =>
        MatrixEntry(indexA, indexB, similarity)

    new CoordinateMatrix(resultRows)
Example 7
Source File: QueryNearestNeighbours.scala (from cosine-lsh-join-spark, MIT License; 5 votes)
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry}

/**
 * Brute-force query joiner: pairs sampled query rows with sampled catalog rows via a cartesian
 * product, scores each pair with `distance`, and keeps entries whose value is at least
 * `threshold`.
 *
 * NOTE(review): closing braces and the receiver of the pattern-matching block are missing
 * from this listing.
 *
 * @param distance        function applied to the query vector and the catalog vector
 * @param threshold       minimum value for an entry to be kept
 * @param queryFraction   sampling fraction for the query rows (without replacement)
 * @param catalogFraction sampling fraction for the catalog rows (without replacement)
 */
class QueryNearestNeighbours(
                              distance: VectorDistance,
                              threshold: Double,
                              queryFraction: Double,
                              catalogFraction: Double
                            ) extends QueryJoiner with Serializable {

  /** Returns a [[CoordinateMatrix]] of (query index, catalog index, score) entries. */
  def join(queryMatrix: IndexedRowMatrix, catalogMatrix: IndexedRowMatrix): CoordinateMatrix = {
    val sampledQueries = queryMatrix.rows.sample(false, queryFraction)
    val sampledCatalog = catalogMatrix.rows.sample(false, catalogFraction)

    // Every sampled query row is paired with every sampled catalog row.
    val joined = sampledQueries.cartesian(sampledCatalog)

    // NOTE(review): the receiver of this block (presumably a map over `joined`) is not visible.
    val neighbours = { case ((query: IndexedRow), (catalogEntry: IndexedRow)) =>
      new MatrixEntry(query.index, catalogEntry.index, distance(query.vector, catalogEntry.vector))
    }.filter(_.value >= threshold)

    new CoordinateMatrix(neighbours)
Example 8
Source File: ReadingWritingData.scala (from Spark-RSVD, Apache License 2.0; 5 votes)
package com.criteo.rsvd

import java.nio.ByteBuffer

import com.esotericsoftware.kryo.Kryo
import com.typesafe.scalalogging.slf4j.StrictLogging
import de.javakaffee.kryoserializers.UnmodifiableCollectionsSerializer
import org.apache.hadoop.fs.{FileSystem, Path}
import{BytesWritable, NullWritable}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
import org.apache.spark.{SparkConf, SparkContext}

import scala.reflect.ClassTag

object ReadingWritingData extends StrictLogging {

  /**
   * Sums the lengths of all files matching `inputPathPattern` and returns the total in whole
   * megabytes (two integer divisions by 1024, so the result is truncated).
   *
   * NOTE(review): Hadoop documents that `globStatus` can return null for a non-glob path that
   * does not exist; confirm whether callers guarantee the path exists.
   * NOTE(review): the closing brace of this method is not visible in this listing.
   */
  def getInputDataSizeMB(inputPathPattern: String, sc: SparkContext): Int = {
    // The file system is resolved from the Hadoop configuration of the SparkContext.
    val fs = FileSystem.get(sc.hadoopConfiguration)
    val path = new Path(inputPathPattern)
    (fs.globStatus(path).map(f => f.getLen).sum / 1024 / 1024).toInt

  // Declared to return an RDD of MatrixEntry for the given input path; the only visible step
  // is a call to getInputDataSizeMB on a string built from inputPath.
  // NOTE(review): the body is garbled in this listing: the text after the opening brace is a
  // fragment of a message string, the following line ends inside an unterminated string
  // literal, and the rest of the method is missing.
  def loadMatrixEntries(inputPath: String,
                        singlePartitionSizeMB: Int,
                        sc: SparkContext): RDD[MatrixEntry] = {"Input matrix path: $inputPath")
    val inputDataSizeMB = getInputDataSizeMB(inputPath + "
  /**
   * Builds an RDD of `T` from a file at `path`, using a [[KryoSerializer]] created from the
   * configuration of the SparkContext.
   *
   * NOTE(review): the expression that reads the file and most of the per-record logic are
   * missing from this listing; only the outline below is visible.
   *
   * @param minPartitionsOpt optional minimum partition count; falls back to
   *                         `sc.defaultMinPartitions` when empty
   */
  def makeRddFromKryoFile[T: ClassTag](
      sc: SparkContext,
      path: String,
      minPartitionsOpt: Option[Int] = None): RDD[T] = {
    val minPartitions = minPartitionsOpt.getOrElse(sc.defaultMinPartitions)
    val serializer = new KryoSerializer(sc.getConf)
      // NOTE(review): the receiver of this chained call is not visible.
      .mapPartitions { it =>
        // One serializer instance is created per partition.
        val instance = serializer.newInstance()
        it.flatMap {
          // The first element of each pair is ignored; only `v` is bound.
          case (_, v) =>

  // KryoRegistrator implementation for this project.
  // NOTE(review): the body of registerClasses is not visible in this listing, so the set of
  // registered classes cannot be documented here.
  object RandomizedSVDKryoRegistrator extends KryoRegistrator {

    def registerClasses(kryo: Kryo): Unit = {

  // Takes a SparkConf and returns a SparkConf.
  // NOTE(review): the right-hand side of this definition is not visible in this listing.
  def appendBasicRegistratorToSparkConf(sparkConf: SparkConf): SparkConf =

  /**
   * Adds `registratorName` to the `spark.kryo.registrator` setting of `sparkConf`.
   *
   * When the setting is unset or empty it is set to `registratorName`; otherwise the name is
   * appended to the existing value after a comma, so earlier registrators are preserved.
   *
   * NOTE(review): the closing braces of this method are not visible in this listing.
   */
  def appendRegistratorToSparkConf(sparkConf: SparkConf,
                                   registratorName: String): SparkConf = {
    // An empty string is used as the default so that unset and empty are handled the same way.
    val oldValue = sparkConf.get("spark.kryo.registrator", "")
    if (oldValue == "") {
      sparkConf.set("spark.kryo.registrator", registratorName)
    } else {
      sparkConf.set("spark.kryo.registrator", oldValue + "," + registratorName)

Example 9
Source File: CosineSimilarity.scala (from BigDatalog, Apache License 2.0; 5 votes)
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.SparkContext._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Example app that computes the column similarities of a [[RowMatrix]] twice, once exactly and
 * once with a threshold-based estimate, and prints an error figure comparing the two results.
 *
 * NOTE(review): this listing looks truncated (closing braces and several expressions are not
 * visible), so the comments below describe only what is shown.
 */
object CosineSimilarity {
  /** Command-line parameters: the input file path and the similarity threshold (default 0.1). */
  case class Params(inputFile: String = null, threshold: Double = 0.1)
    extends AbstractParams[Params]

  /** Parses `args` into [[Params]] using the scopt parser defined below. */
  def main(args: Array[String]) {
    // Parsing starts from the case-class defaults.
    val defaultParams = Params()

    // NOTE(review): the option/argument declarations that the `.text`/`.action` pairs below
    // would normally chain from are not visible in this listing.
    val parser = new OptionParser[Params]("CosineSimilarity") {
      head("CosineSimilarity: an example app.")
        .text(s"threshold similarity: to tradeoff computation vs quality estimate")
        .action((x, c) => c.copy(threshold = x))
        .text(s"input file, one row per line, space-separated")
        .action((x, c) => c.copy(inputFile = x))
      // NOTE(review): the `|`-prefixed lines look like the body of a usage note whose
      // enclosing string literal is missing from this listing.
          |For example, the following command runs this app on a dataset:
          | ./bin/spark-submit  --class org.apache.spark.examples.mllib.CosineSimilarity \
          | examplesjar.jar \
          | --threshold 0.1 data/mllib/sample_svm_data.txt

    // NOTE(review): the bodies of both the success branch and the getOrElse fallback are not
    // visible in this listing.
    parser.parse(args, defaultParams).map { params =>
    } getOrElse {

  /** Creates a SparkContext, loads the input rows and compares exact vs. estimated similarities. */
  def run(params: Params) {
    val conf = new SparkConf().setAppName("CosineSimilarity")
    val sc = new SparkContext(conf)

    // Load and parse the data file.
    // Each line is split on single spaces and every token is parsed as a Double.
    // NOTE(review): the result expression and closing brace of this lambda are not visible.
    val rows = sc.textFile(params.inputFile).map { line =>
      val values = line.split(' ').map(_.toDouble)
    val mat = new RowMatrix(rows)

    // Compute similar columns perfectly, with brute force.
    val exact = mat.columnSimilarities()

    // Compute similar columns with estimation using DIMSUM
    val approx = mat.columnSimilarities(params.threshold)

    // Both blocks turn a MatrixEntry(i, j, value) into an ((i, j), value) pair.
    // NOTE(review): the receivers these pattern-matching blocks are applied to are not visible.
    val exactEntries = { case MatrixEntry(i, j, u) => ((i, j), u) }
    val approxEntries = { case MatrixEntry(i, j, v) => ((i, j), v) }
    // Left outer join on the (i, j) key: exact entries without an estimate arrive as None.
    val MAE = exactEntries.leftOuterJoin(approxEntries) {
      case (u, Some(v)) =>
        math.abs(u - v)
      // NOTE(review): the value produced for the unmatched case is not visible in this listing.
      case (u, None) =>

    println(s"Average absolute error in estimate is: $MAE")

// scalastyle:on println 
Example 10
Source File: CosineSimilarity.scala (from drizzle-spark, Apache License 2.0; 5 votes)
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix}

/**
 * Example app that computes the column similarities of a [[RowMatrix]] twice, once exactly and
 * once with a threshold-based estimate, and prints an error figure comparing the two results.
 *
 * NOTE(review): this listing looks truncated (closing braces and several expressions are not
 * visible), so the comments below describe only what is shown.
 */
object CosineSimilarity {
  /** Command-line parameters: the input file path and the similarity threshold (default 0.1). */
  case class Params(inputFile: String = null, threshold: Double = 0.1)
    extends AbstractParams[Params]

  /** Parses `args` into [[Params]]; calls `run` on `Some(params)`, otherwise exits with status 1. */
  def main(args: Array[String]) {
    // Parsing starts from the case-class defaults.
    val defaultParams = Params()

    // NOTE(review): the option/argument declarations that the `.text`/`.action` pairs below
    // would normally chain from are not visible in this listing.
    val parser = new OptionParser[Params]("CosineSimilarity") {
      head("CosineSimilarity: an example app.")
        .text(s"threshold similarity: to tradeoff computation vs quality estimate")
        .action((x, c) => c.copy(threshold = x))
        .text(s"input file, one row per line, space-separated")
        .action((x, c) => c.copy(inputFile = x))
      // NOTE(review): the `|`-prefixed lines look like the body of a usage note whose
      // enclosing string literal is missing from this listing.
          |For example, the following command runs this app on a dataset:
          | ./bin/spark-submit  --class org.apache.spark.examples.mllib.CosineSimilarity \
          | examplesjar.jar \
          | --threshold 0.1 data/mllib/sample_svm_data.txt

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)

  /** Creates a SparkContext, loads the input rows and compares exact vs. estimated similarities. */
  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName("CosineSimilarity")
    val sc = new SparkContext(conf)

    // Load and parse the data file.
    // Each line is split on single spaces and every token is parsed as a Double.
    // NOTE(review): the result expression and closing brace of this lambda are not visible.
    val rows = sc.textFile(params.inputFile).map { line =>
      val values = line.split(' ').map(_.toDouble)
    val mat = new RowMatrix(rows)

    // Compute similar columns perfectly, with brute force.
    val exact = mat.columnSimilarities()

    // Compute similar columns with estimation using DIMSUM
    val approx = mat.columnSimilarities(params.threshold)

    // Both blocks turn a MatrixEntry(i, j, value) into an ((i, j), value) pair.
    // NOTE(review): the receivers these pattern-matching blocks are applied to are not visible.
    val exactEntries = { case MatrixEntry(i, j, u) => ((i, j), u) }
    val approxEntries = { case MatrixEntry(i, j, v) => ((i, j), v) }
    // Left outer join on the (i, j) key: exact entries without an estimate arrive as None.
    val MAE = exactEntries.leftOuterJoin(approxEntries) {
      case (u, Some(v)) =>
        math.abs(u - v)
      // NOTE(review): the value produced for the unmatched case is not visible in this listing.
      case (u, None) =>

    println(s"Average absolute error in estimate is: $MAE")

// scalastyle:on println 
Example 11
Source File: CosineSimilarity.scala (from iolap, Apache License 2.0; 5 votes)
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.SparkContext._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Example app that computes the column similarities of a [[RowMatrix]] twice, once exactly and
 * once with a threshold-based estimate, and prints an error figure comparing the two results.
 *
 * NOTE(review): this listing looks truncated (closing braces and several expressions are not
 * visible), so the comments below describe only what is shown.
 */
object CosineSimilarity {
  /** Command-line parameters: the input file path and the similarity threshold (default 0.1). */
  case class Params(inputFile: String = null, threshold: Double = 0.1)
    extends AbstractParams[Params]

  /** Parses `args` into [[Params]] using the scopt parser defined below. */
  def main(args: Array[String]) {
    // Parsing starts from the case-class defaults.
    val defaultParams = Params()

    // NOTE(review): the option/argument declarations that the `.text`/`.action` pairs below
    // would normally chain from are not visible in this listing.
    val parser = new OptionParser[Params]("CosineSimilarity") {
      head("CosineSimilarity: an example app.")
        .text(s"threshold similarity: to tradeoff computation vs quality estimate")
        .action((x, c) => c.copy(threshold = x))
        .text(s"input file, one row per line, space-separated")
        .action((x, c) => c.copy(inputFile = x))
      // NOTE(review): the `|`-prefixed lines look like the body of a usage note whose
      // enclosing string literal is missing from this listing.
          |For example, the following command runs this app on a dataset:
          | ./bin/spark-submit  --class org.apache.spark.examples.mllib.CosineSimilarity \
          | examplesjar.jar \
          | --threshold 0.1 data/mllib/sample_svm_data.txt

    // NOTE(review): the bodies of both the success branch and the getOrElse fallback are not
    // visible in this listing.
    parser.parse(args, defaultParams).map { params =>
    } getOrElse {

  /** Creates a SparkContext, loads the input rows and compares exact vs. estimated similarities. */
  def run(params: Params) {
    val conf = new SparkConf().setAppName("CosineSimilarity")
    val sc = new SparkContext(conf)

    // Load and parse the data file.
    // Each line is split on single spaces and every token is parsed as a Double.
    // NOTE(review): the result expression and closing brace of this lambda are not visible.
    val rows = sc.textFile(params.inputFile).map { line =>
      val values = line.split(' ').map(_.toDouble)
    val mat = new RowMatrix(rows)

    // Compute similar columns perfectly, with brute force.
    val exact = mat.columnSimilarities()

    // Compute similar columns with estimation using DIMSUM
    val approx = mat.columnSimilarities(params.threshold)

    // Both blocks turn a MatrixEntry(i, j, value) into an ((i, j), value) pair.
    // NOTE(review): the receivers these pattern-matching blocks are applied to are not visible.
    val exactEntries = { case MatrixEntry(i, j, u) => ((i, j), u) }
    val approxEntries = { case MatrixEntry(i, j, v) => ((i, j), v) }
    // Left outer join on the (i, j) key: exact entries without an estimate arrive as None.
    val MAE = exactEntries.leftOuterJoin(approxEntries) {
      case (u, Some(v)) =>
        math.abs(u - v)
      // NOTE(review): the value produced for the unmatched case is not visible in this listing.
      case (u, None) =>

    println(s"Average absolute error in estimate is: $MAE")

Example 12
Source File: CosineSimilarity.scala (from multi-tenancy-spark, Apache License 2.0; 5 votes)
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix}

/**
 * Example app that computes the column similarities of a [[RowMatrix]] twice, once exactly and
 * once with a threshold-based estimate, and prints an error figure comparing the two results.
 *
 * NOTE(review): this listing looks truncated (closing braces and several expressions are not
 * visible), so the comments below describe only what is shown.
 */
object CosineSimilarity {
  /** Command-line parameters: the input file path and the similarity threshold (default 0.1). */
  case class Params(inputFile: String = null, threshold: Double = 0.1)
    extends AbstractParams[Params]

  /** Parses `args` into [[Params]]; calls `run` on `Some(params)`, otherwise exits with status 1. */
  def main(args: Array[String]) {
    // Parsing starts from the case-class defaults.
    val defaultParams = Params()

    // NOTE(review): the option/argument declarations that the `.text`/`.action` pairs below
    // would normally chain from are not visible in this listing.
    val parser = new OptionParser[Params]("CosineSimilarity") {
      head("CosineSimilarity: an example app.")
        .text(s"threshold similarity: to tradeoff computation vs quality estimate")
        .action((x, c) => c.copy(threshold = x))
        .text(s"input file, one row per line, space-separated")
        .action((x, c) => c.copy(inputFile = x))
      // NOTE(review): the `|`-prefixed lines look like the body of a usage note whose
      // enclosing string literal is missing from this listing.
          |For example, the following command runs this app on a dataset:
          | ./bin/spark-submit  --class org.apache.spark.examples.mllib.CosineSimilarity \
          | examplesjar.jar \
          | --threshold 0.1 data/mllib/sample_svm_data.txt

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)

  /** Creates a SparkContext, loads the input rows and compares exact vs. estimated similarities. */
  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName("CosineSimilarity")
    val sc = new SparkContext(conf)

    // Load and parse the data file.
    // Each line is split on single spaces and every token is parsed as a Double.
    // NOTE(review): the result expression and closing brace of this lambda are not visible.
    val rows = sc.textFile(params.inputFile).map { line =>
      val values = line.split(' ').map(_.toDouble)
    val mat = new RowMatrix(rows)

    // Compute similar columns perfectly, with brute force.
    val exact = mat.columnSimilarities()

    // Compute similar columns with estimation using DIMSUM
    val approx = mat.columnSimilarities(params.threshold)

    // Both blocks turn a MatrixEntry(i, j, value) into an ((i, j), value) pair.
    // NOTE(review): the receivers these pattern-matching blocks are applied to are not visible.
    val exactEntries = { case MatrixEntry(i, j, u) => ((i, j), u) }
    val approxEntries = { case MatrixEntry(i, j, v) => ((i, j), v) }
    // Left outer join on the (i, j) key: exact entries without an estimate arrive as None.
    val MAE = exactEntries.leftOuterJoin(approxEntries) {
      case (u, Some(v)) =>
        math.abs(u - v)
      // NOTE(review): the value produced for the unmatched case is not visible in this listing.
      case (u, None) =>

    println(s"Average absolute error in estimate is: $MAE")

// scalastyle:on println 
Example 13
Source File: LocalMinPlus.scala (from spark-all-pairs-shortest-path, Apache License 2.0; 5 votes)
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import org.scalatest.{FlatSpec}
import breeze.linalg.{DenseMatrix => BDM, DenseVector, min, Matrix =>BM}
import AllPairsShortestPath._

/**
 * Spec for a local (Breeze) min-plus matrix product, checked on a 4x4 sample matrix.
 *
 * NOTE(review): closing braces, the return expression of `localMinPlus` and the constructor
 * calls around the matrix literals are missing from this listing.
 */
class LocalMinPlus extends FlatSpec {

  /**
   * Min-plus product of `A` and `B`: entry (r, c) is built up as the minimum over `i` of
   * `A(r, i) + B(i, c)`.
   *
   * Requires the number of columns of `A` to equal the number of rows of `B`.
   */
  def localMinPlus(A: BDM[Double], B: BDM[Double]): BDM[Double] = {
    require(A.cols == B.rows, " Num cols of A does not match the num rows of B")
    val k = A.cols
    // Vectors of ones used to expand a column of A and a row of B to full matrices.
    val onesA = DenseVector.ones[Double](B.cols)
    val onesB = DenseVector.ones[Double](A.rows)
    // Start with the i = 0 term: A(r, 0) + B(0, c) for every (r, c).
    var AMinPlusB = A(::, 0) * onesA.t + onesB * B(0, ::)
    if (k > 1) {
      for (i <- 1 until k) {
        val a = A(::, i)
        val b = B(i, ::)
        val aPlusb = a * onesA.t + onesB * b
        // Element-wise minimum of the running result and the i-th term.
        AMinPlusB = min(aPlusb, AMinPlusB)

  // Sample 4x4 input matrix.
  // NOTE(review): the constructor call wrapping these rows is not visible in this listing.
  def fourByFourBlockMatrx = {
      (0.0, 20.0, 4.0, 2.0),
      (2.0, 0.0, 1.0, 3.0),
      (1.0, 6.0, 0.0, 5.0),
      (4.0, 2.0, 2.0, 0.0)

  // Expected result used by the assertion below.
  // NOTE(review): the constructor call wrapping these rows is not visible in this listing.
  def fourByFourMinPlusProduct = {
      (0.0,  2.0,  1.0,  2.0),
      (2.0,  0.0,  1.0,  2.0),
      (1.0,  1.0,  0.0,  2.0),
      (2.0,  2.0,  2.0,  0.0)

  // Note that the second operand is the transpose of the sample matrix.
  "The minPlus product of the sample 4x4 matrix with itself" should "be correct" in {
    assert(localMinPlus(fourByFourBlockMatrx, fourByFourBlockMatrx.t) === fourByFourMinPlusProduct)
Example 14
Source File: APSPSpec.scala (from spark-all-pairs-shortest-path, Apache License 2.0; 5 votes)
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import org.scalatest.{Outcome, FlatSpec}
import AllPairsShortestPath._
import breeze.linalg.{DenseMatrix => BDM}

/**
 * Spec that runs `DistributedBlockFW` on a 4x4 sample block matrix and compares the result
 * with a hand-written expected matrix.
 *
 * NOTE(review): closing braces and several bodies are missing from this listing.
 */
class APSPSpec extends FlatSpec {

  // Local Spark context with four worker threads; multiple contexts are explicitly allowed.
  val conf = new SparkConf().setAppName("AllPairsShortestPath").setMaster("local[4]").set("spark.driver.allowMultipleContexts", "true")
  val sc = new SparkContext(conf)

  // NOTE(review): only the start of the try block is visible; whatever follows the test
  // invocation (for example cleanup) is missing from this listing.
  override def withFixture(test: NoArgTest) : Outcome = {
    try {
      test() // invoke the test function

  // Builds the sample matrix from (row, column, value) triples, converts it to a block matrix
  // with 2x2 blocks and caches it. No diagonal entries are listed.
  // This is a def, so the matrix is rebuilt on every reference.
  // NOTE(review): the expression returned by this method is not visible in this listing.
  def fourByFourBlockMatrx = {
    val entries = sc.parallelize(Array(
      (0, 1, 20), (0, 2, 4), (0, 3, 2),
      (1, 0, 2), (1, 2, 1), (1, 3, 3), (2, 0, 1),
      (2, 1, 6), (2, 3, 5), (3, 0, 4), (3, 1, 2), (3, 2, 2))).map { case (i, j, v) => MatrixEntry(i, j, v) }
    val coordMat = new CoordinateMatrix(entries)
    val matA = coordMat.toBlockMatrix(2, 2).cache()

  // Grid partitioner sized from the block counts and partition count of the sample matrix.
  def ApspPartitioner = {
    GridPartitioner(fourByFourBlockMatrx.numRowBlocks, fourByFourBlockMatrx.numColBlocks, fourByFourBlockMatrx.blocks.partitions.length)

  // Wraps the values of a Spark Matrix in a Breeze dense matrix with the same dimensions.
  def toBreeze(A: Matrix): BDM[Double] = {
    new BDM[Double](A.numRows, A.numCols, A.toArray)

  // NOTE(review): the body of this test is not visible in this listing.
  "The sample 4x4 Block Matrix" should "be valid" in {

  it should "match our APSP matrix" in {
    val result = new DistributedBlockFW
    // The distributed result is pulled to the driver and converted to Breeze for comparison.
    val observed = toBreeze(result.compute(fourByFourBlockMatrx).toLocal())
    val expected = BDM(
      (0.0, 4.0, 4.0, 2.0),
      (2.0, 0.0, 1.0, 3.0),
      (1.0, 5.0, 0.0, 3.0),
      (3.0, 2.0, 2.0, 0.0)
    assert(observed === expected)
Example 15
Source File: CosineSimilarity.scala (from sparkoscope, Apache License 2.0; 5 votes)
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix}

/**
 * Example app that computes the column similarities of a [[RowMatrix]] twice, once exactly and
 * once with a threshold-based estimate, and prints an error figure comparing the two results.
 *
 * NOTE(review): this listing looks truncated (closing braces and several expressions are not
 * visible), so the comments below describe only what is shown.
 */
object CosineSimilarity {
  /** Command-line parameters: the input file path and the similarity threshold (default 0.1). */
  case class Params(inputFile: String = null, threshold: Double = 0.1)
    extends AbstractParams[Params]

  /** Parses `args` into [[Params]]; calls `run` on `Some(params)`, otherwise exits with status 1. */
  def main(args: Array[String]) {
    // Parsing starts from the case-class defaults.
    val defaultParams = Params()

    // NOTE(review): the option/argument declarations that the `.text`/`.action` pairs below
    // would normally chain from are not visible in this listing.
    val parser = new OptionParser[Params]("CosineSimilarity") {
      head("CosineSimilarity: an example app.")
        .text(s"threshold similarity: to tradeoff computation vs quality estimate")
        .action((x, c) => c.copy(threshold = x))
        .text(s"input file, one row per line, space-separated")
        .action((x, c) => c.copy(inputFile = x))
      // NOTE(review): the `|`-prefixed lines look like the body of a usage note whose
      // enclosing string literal is missing from this listing.
          |For example, the following command runs this app on a dataset:
          | ./bin/spark-submit  --class org.apache.spark.examples.mllib.CosineSimilarity \
          | examplesjar.jar \
          | --threshold 0.1 data/mllib/sample_svm_data.txt

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)

  /** Creates a SparkContext, loads the input rows and compares exact vs. estimated similarities. */
  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName("CosineSimilarity")
    val sc = new SparkContext(conf)

    // Load and parse the data file.
    // Each line is split on single spaces and every token is parsed as a Double.
    // NOTE(review): the result expression and closing brace of this lambda are not visible.
    val rows = sc.textFile(params.inputFile).map { line =>
      val values = line.split(' ').map(_.toDouble)
    val mat = new RowMatrix(rows)

    // Compute similar columns perfectly, with brute force.
    val exact = mat.columnSimilarities()

    // Compute similar columns with estimation using DIMSUM
    val approx = mat.columnSimilarities(params.threshold)

    // Both blocks turn a MatrixEntry(i, j, value) into an ((i, j), value) pair.
    // NOTE(review): the receivers these pattern-matching blocks are applied to are not visible.
    val exactEntries = { case MatrixEntry(i, j, u) => ((i, j), u) }
    val approxEntries = { case MatrixEntry(i, j, v) => ((i, j), v) }
    // Left outer join on the (i, j) key: exact entries without an estimate arrive as None.
    val MAE = exactEntries.leftOuterJoin(approxEntries) {
      case (u, Some(v)) =>
        math.abs(u - v)
      // NOTE(review): the value produced for the unmatched case is not visible in this listing.
      case (u, None) =>

    println(s"Average absolute error in estimate is: $MAE")

// scalastyle:on println 
Example 16
Source File: SparkMatrix.scala (from Machine-Learning-with-Spark-Second-Edition, MIT License; 5 votes)
package linalg.matrix

import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry

/**
 * Walk-through of the MLlib matrix types: local dense and sparse matrices, then the
 * distributed RowMatrix, IndexedRowMatrix and CoordinateMatrix, printing the dimensions of
 * each.
 *
 * NOTE(review): closing brackets are missing from this listing, and `Matrix` / `Matrices` are
 * not among the imports shown for this example.
 */
object SparkMatrix {

  def main(args: Array[String]) {

    // Local 2x2 dense matrix built from four values.
    val dMatrix: Matrix = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    println("dMatrix: \n" + dMatrix)

    // Two local 3x2 sparse matrices that differ only in their second index array.
    val sMatrixOne: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5, 6, 7))
    println("sMatrixOne: \n" + sMatrixOne)

    val sMatrixTwo: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(5, 6, 7))
    println("sMatrixTwo: \n" + sMatrixTwo)

    // Local-mode Spark context for the distributed examples below.
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)
    // Four rows of three values each, once as dense and once as sparse vectors.
    val denseData = Seq(
      Vectors.dense(0.0, 1.0, 2.1),
      Vectors.dense(3.0, 2.0, 4.0),
      Vectors.dense(5.0, 7.0, 8.0),
      Vectors.dense(9.0, 0.0, 1.1)
    val sparseData = Seq(
      Vectors.sparse(3, Seq((1, 1.0), (2, 2.1))),
      Vectors.sparse(3, Seq((0, 3.0), (1, 2.0), (2, 4.0))),
      Vectors.sparse(3, Seq((0, 5.0), (1, 7.0), (2, 8.0))),
      Vectors.sparse(3, Seq((0, 9.0), (2, 1.0)))

    // Both data sets are distributed over two partitions.
    val denseMat = new RowMatrix(sc.parallelize(denseData, 2))
    val sparseMat = new RowMatrix(sc.parallelize(sparseData, 2))

    println("Dense Matrix - Num of Rows :" + denseMat.numRows())
    println("Dense Matrix - Num of Cols:" + denseMat.numCols())
    println("Sparse Matrix - Num of Rows :" + sparseMat.numRows())
    println("Sparse Matrix - Num of Cols:" + sparseMat.numCols())

    // Indexed rows with explicit indices 0, 1 and 3 (index 2 is not present).
    val data = Seq(
      (0L, Vectors.dense(0.0, 1.0, 2.0)),
      (1L, Vectors.dense(3.0, 4.0, 5.0)),
      (3L, Vectors.dense(9.0, 0.0, 1.0))
    ).map(x => IndexedRow(x._1, x._2))
    val indexedRows: RDD[IndexedRow] = sc.parallelize(data, 2)
    val indexedRowsMat = new IndexedRowMatrix(indexedRows)
    println("Indexed Row Matrix - No of Rows: " + indexedRowsMat.numRows())
    println("Indexed Row Matrix - No of Cols: " + indexedRowsMat.numCols())

    // Nine (row, column, value) triples over three partitions, each turned into a MatrixEntry.
    val entries = sc.parallelize(Seq(
      (0, 0, 1.0),
      (0, 1, 2.0),
      (1, 1, 3.0),
      (1, 2, 4.0),
      (2, 2, 5.0),
      (2, 3, 6.0),
      (3, 0, 7.0),
      (3, 3, 8.0),
      (4, 1, 9.0)), 3).map { case (i, j, value) =>
      MatrixEntry(i, j, value)
    val coordinateMat = new CoordinateMatrix(entries)
    println("Coordinate Matrix - No of Rows: " + coordinateMat.numRows())
    println("Coordinate Matrix - No of Cols: " + coordinateMat.numCols())



Example 17
Source File: T9-4DataTypes.scala (from prosparkstreaming, Apache License 2.0; 5 votes)
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

/**
 * Streaming example that reads space-separated records from a socket and builds several MLlib
 * data types from them (dense and sparse vectors, labeled points, local matrices and the
 * distributed matrix types).
 *
 * NOTE(review): this listing is heavily garbled: closing brackets are missing and several
 * expressions have lost their receiver or part of a lambda. The comments below describe only
 * what can still be read.
 */
object DataTypesApp {

  def main(args: Array[String]) {
    // Exactly four arguments are expected.
    // NOTE(review): the call that prints the usage string and the exit are not visible.
    if (args.length != 4) {
        "Usage: DataTypesApp <appname> <batchInterval> <hostname> <port>")
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()

    // The batch interval argument is interpreted as a number of seconds.
    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    // Read text lines from the socket, split them on single spaces, and drop records whose
    // second field is the string 0; the last step converts fields with toDouble.
    val substream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => => f.toDouble))

    // NOTE(review): the receivers of the following definitions are missing from this listing.
    // Visible parts: fields 1 to 4 form the vectors, field 0 is the label of the labeled
    // point, and three 16-element slices (3-18, 20-35, 37-52) fill the 3x16 dense matrix.
    val denseV = => Vectors.dense(f.slice(1, 5)))
    val sparseV = => f.slice(1, 5).toList).map(f => { case (s, i) => (i, s) })
      .map(f => f.filter(v => v._2 != 0)).map(l => Vectors.sparse(l.size, l))
    val labeledP = => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))
    val denseM = => Matrices.dense(3, 16, f.slice(3, 19) ++ f.slice(20, 36) ++ f.slice(37, 53)))
    // A RowMatrix is built from the dense vectors of each batch.
    denseV.foreachRDD(rdd => {
      val rowM = new RowMatrix(rdd)
    // An IndexedRowMatrix is built from indexed rows of each batch.
    denseV.foreachRDD(rdd => {
      val iRdd = => new IndexedRow(v._2, v._1))
      val iRowM = new IndexedRowMatrix(iRdd)
    // A CoordinateMatrix is built from MatrixEntry values derived from the offsets 3, 20, 37.
    substream.foreachRDD(rdd => {
      val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37) => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList)))
        .map(v => => new MatrixEntry(v._1, v._2, d))).flatMap(x => x)
      val cRowM = new CoordinateMatrix(entries)
    // Same entries as above, converted to a block matrix.
    substream.foreachRDD(rdd => {
      val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37) => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList)))
        .map(v => => new MatrixEntry(v._1, v._2, d))).flatMap(x => x)
      val blockM = new CoordinateMatrix(entries).toBlockMatrix


Example 18
Source File: X2P.scala (from spark-tsne, Apache License 2.0; 5 votes)
package com.github.saurfang.spark.tsne

import breeze.linalg.DenseVector
import org.apache.spark.mllib.X2PHelper._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry, RowMatrix}
import org.apache.spark.mllib.rdd.MLPairRDDFunctions._
import org.slf4j.LoggerFactory

/**
 * Converts the rows of a [[RowMatrix]] into a [[CoordinateMatrix]] of per-pair values `p`,
 * tuning one `beta` per row with a bisection-style search driven by `perplexity`.
 *
 * NOTE(review): this listing is garbled: closing braces are missing and several expressions
 * have lost their receiver. `VectorWithNorm`, `fastSquaredDistance` and `Hbeta` are defined
 * elsewhere.
 */
object X2P {

  private def logger = LoggerFactory.getLogger(X2P.getClass)

  /**
   * @param x          input rows
   * @param tol        tolerance on the difference between `h` and `log(perplexity)`; must be
   *                   non-negative
   * @param perplexity target perplexity; must be positive
   * @return matrix built from the MatrixEntry(i, j, v) values computed for every row `i`
   */
  def apply(x: RowMatrix, tol: Double = 1e-5, perplexity: Double = 30.0): CoordinateMatrix = {
    require(tol >= 0, "Tolerance must be non-negative")
    require(perplexity > 0, "Perplexity must be positive")

    // Number of entries kept per key by topByKey below.
    val mu = (3 * perplexity).toInt //TODO: Expose this as parameter
    val logU = Math.log(perplexity)
    // NOTE(review): the receivers of the next two expressions are missing from this listing.
    val norms =, 2.0))
    val rowsWithNorm ={ case (v, norm) => VectorWithNorm(v, norm) }
    // For each pair with i < j the squared distance is computed once and emitted for both
    // directions; pairs with i >= j produce nothing.
    // NOTE(review): the step that forms the pairs of indexed rows is not visible.
    val neighbors = rowsWithNorm.zipWithIndex()
      .flatMap {
      case ((u, i), (v, j)) =>
        if(i < j) {
          val dist = fastSquaredDistance(u, v)
          Seq((i, (j, dist)), (j, (i, dist)))
        } else Seq.empty
      .topByKey(mu)( => -e._2))

    // NOTE(review): the receiver of this block (presumably a map over `neighbors`) is missing.
    val p_betas = {
        case (i, arr) =>
          // Search bounds for beta start unbounded on both sides.
          var betamin = Double.NegativeInfinity
          var betamax = Double.PositiveInfinity
          var beta = 1.0

          // NOTE(review): the argument of DenseVector is missing from this listing.
          val d = DenseVector(
          var (h, p) = Hbeta(d, beta)

          //logInfo("data was " + d.toArray.toList)
          //logInfo("array P was " + p.toList)

          // Evaluate whether the perplexity is within tolerance
          // Hdiff is a def, so it is re-evaluated with the current h on every use.
          def Hdiff = h - logU
          var tries = 0
          // At most 50 adjustments of beta are attempted.
          while (Math.abs(Hdiff) > tol && tries < 50) {
            //If not, increase or decrease precision
            if (Hdiff > 0) {
              // Raise beta: double it while no upper bound is known, otherwise bisect.
              betamin = beta
              beta = if (betamax.isInfinite) beta * 2 else (beta + betamax) / 2
            } else {
              // Lower beta: halve it while no lower bound is known, otherwise bisect.
              betamax = beta
              beta = if (betamin.isInfinite) beta / 2 else (beta + betamin) / 2

            // Recompute the values
            val HP = Hbeta(d, beta)
            h = HP._1
            p = HP._2
            tries = tries + 1

          //logInfo("array P is " + p.toList)

          // Result for row i: its matrix entries paired with the final beta.
          // NOTE(review): the collection the entries are mapped from is missing here.
          ( { case (j, v) => MatrixEntry(i, j, v) }, beta)
      // NOTE(review): the logging call around this message is garbled; the visible part
      // averages sqrt(1 / beta).
      }"Mean value of sigma: " + => math.sqrt(1 / x._2)).mean)
    // Only the entries are kept; the beta values are dropped.
    new CoordinateMatrix(p_betas.flatMap(_._1))