org.apache.spark.graphx.VertexRDD Scala Examples
The following examples show how to use org.apache.spark.graphx.VertexRDD.
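A VertexRDD[VD] is the vertex view of a GraphX graph: an RDD of (VertexId, VD) pairs backed by an index that makes vertex joins and aggregations efficient. Before the full examples, here is a minimal, self-contained sketch of where a VertexRDD comes from (the graph data and app name are illustrative, not taken from the examples below):

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexRDD}

// build a tiny two-vertex graph and inspect its vertex view
val sc = new SparkContext("local[*]", "VertexRDDIntro")
val vertices = sc.parallelize(Seq((1L, "alice"), (2L, "bob")))
val edges = sc.parallelize(Seq(Edge(1L, 2L, 0.5)))
val graph = Graph(vertices, edges)

// graph.vertices is a VertexRDD[String]; it behaves like an RDD[(VertexId, String)]
val verts: VertexRDD[String] = graph.vertices

// derived vertex properties such as degrees are also exposed as VertexRDDs
val degrees: VertexRDD[Int] = graph.degrees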
Example 1
Source File: PipeClusteringStrongestPath.scala From sddf with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.clustering

import scala.Iterator

import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexRDD

import de.unihamburg.vsis.sddf.reading.Tuple

class PipeClusteringStrongestPath extends PipeClusteringTransitiveClosure {

  override def manipulateGraph(graph: Graph[Tuple, Double]): Graph[_, Double] = {
    val cGraph = graph.mapVertices((vid, tuple) => (vid, Double.MinPositiveValue))

    // attach the maximum adjacent edge attribute to each vertex
    val verticesMaxEdgeAttributes: VertexRDD[Double] = cGraph.mapReduceTriplets(
      edge => Iterator((edge.dstId, edge.attr), (edge.srcId, edge.attr)),
      (a: Double, b: Double) => math.max(a, b)
    )

    // join the resulting vertex attributes with the graph
    val maxGraph: Graph[(Tuple, Double), Double] =
      graph.outerJoinVertices(verticesMaxEdgeAttributes)((id, tuple, simOpt) =>
        simOpt match {
          case Some(sim) => (tuple, sim)
          case None      => (tuple, 0D)
        }
      )

    // keep an edge only if it is the strongest edge at its source or destination,
    // i.e. drop edges weaker than the maximum at both endpoints
    val resultGraph = maxGraph.subgraph(edge =>
      !(edge.attr < edge.srcAttr._2 && edge.attr < edge.dstAttr._2)
    )
    resultGraph
  }
}

object PipeClusteringStrongestPath {
  def apply() = new PipeClusteringStrongestPath()
}
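Note that mapReduceTriplets, used above, was deprecated in Spark 1.2 and removed in Spark 2.0. A minimal sketch of the same max-edge aggregation written against its replacement, aggregateMessages, assuming the same cGraph as in the example:

// send each edge's attribute to both endpoints and keep the per-vertex maximum
val verticesMaxEdgeAttributes: VertexRDD[Double] = cGraph.aggregateMessages[Double](
  ctx => {
    ctx.sendToDst(ctx.attr)
    ctx.sendToSrc(ctx.attr)
  },
  math.max
)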
Example 2
Source File: SocialPageRankJob.scala From spark-graphx with GNU General Public License v3.0
package com.github.graphx.pagerank

import com.github.graphx.pregel.social.SocialGraph
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.graphx.VertexRDD

object SocialPageRankJob {

  // Dynamic PageRank: iterate until the ranks converge below the given tolerance.
  // This helper was referenced but missing from the listing; the body below is
  // reconstructed from the standard GraphX API.
  def ranks(socialGraph: SocialGraph, tolerance: Double): VertexRDD[Double] =
    socialGraph.graph.pageRank(tolerance).vertices

  // Static PageRank: a fixed number of iterations; the tolerance argument is unused.
  def static(socialGraph: SocialGraph, tolerance: Double): VertexRDD[Double] =
    socialGraph.graph.staticPageRank(numIter = 20).vertices

  def handleResult(socialGraph: SocialGraph, ranks: VertexRDD[Double]) = {
    socialGraph.verts.join(ranks).map {
      case (_, (username, rank)) => (username, rank)
    }.sortBy({ case (_, rank) => rank }, ascending = false).take(10)
  }

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val sc = new SparkContext("local[*]", "PageRank")
    val socialGraph: SocialGraph = new SocialGraph(sc)
    val TOLERANCE: Double = 0.0001

    import scala.compat.Platform.{EOL => D}
    val topUsersDynamically = handleResult(socialGraph, ranks(socialGraph, TOLERANCE)).mkString(D)
    val topUsersIterative = handleResult(socialGraph, static(socialGraph, TOLERANCE)).mkString(D)

    println(s"Top 10 users in the network, dynamic PageRank converged to tolerance $TOLERANCE:$D$topUsersDynamically")
    println(s"Top 10 users in the network, static (fixed-iteration) PageRank:$D$topUsersIterative")
    sc.stop()
  }
}
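GraphX provides both variants used here: pageRank(tol) is the dynamic version, iterating until every vertex's rank changes by less than the tolerance between iterations, while staticPageRank(numIter) always runs a fixed number of iterations regardless of convergence, which is why the tolerance argument to static above goes unused.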
Example 3
Source File: EdmondsBCAggregator.scala From sparkling-graph with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.measures.vertex.betweenness.edmonds

import ml.sparkling.graph.operators.measures.vertex.betweenness.edmonds.struct.EdmondsVertex
import org.apache.spark.graphx.{VertexRDD, _}

class EdmondsBCAggregator[ED] extends Serializable {

  def aggregate(graph: Graph[EdmondsVertex, ED], source: VertexId) = {
    val maxDepth = graph.vertices.aggregate(0)(
      { case (depth, (vId, vData)) => Math.max(vData.depth, depth) },
      Math.max
    )

    var g = graph
    var oldGraph: Option[Graph[EdmondsVertex, ED]] = None
    var messages = aggregateMessages(g, maxDepth).cache
    messages.count // materialize the cached messages

    // walk the BFS layers back from the deepest level towards the source
    for (i <- (1 until maxDepth).reverse) {
      oldGraph = Some(g)
      g = applyMessages(g, messages).cache
      val oldMessages = messages
      messages = aggregateMessages(g, i).cache
      messages.count // materialize before unpersisting the previous iteration
      oldMessages.unpersist(false)
      oldGraph.foreach(_.unpersistVertices(false))
      oldGraph.foreach(_.edges.unpersist(false))
    }
    messages.unpersist(false)
    g
  }

  private def aggregateMessages(graph: Graph[EdmondsVertex, ED], depth: Int) =
    graph.aggregateMessages[Double](
      edgeContext => {
        val sender = createAndSendMessage(edgeContext.toEdgeTriplet, depth) _
        sender(edgeContext.srcId, edgeContext.sendToDst)
        sender(edgeContext.dstId, edgeContext.sendToSrc)
      },
      _ + _
    )

  private def createAndSendMessage(triplet: EdgeTriplet[EdmondsVertex, ED], depth: Int)(source: VertexId, f: (Double) => Unit) = {
    val attr = triplet.vertexAttr(source)
    // only vertices on the current BFS layer send messages
    if (attr.depth == depth) sendMessage(produceMessage(triplet)(source), f)
  }

  private def produceMessage(triplet: EdgeTriplet[EdmondsVertex, ED])(source: VertexId) = {
    val attr = triplet.vertexAttr(source)
    val otherAttr = triplet.otherVertexAttr(source)
    val delta = (otherAttr.sigma.toDouble / attr.sigma.toDouble) * (1.0 + attr.delta)
    // deliver the dependency contribution only to actual predecessors
    if (attr.preds.contains(triplet.otherVertexId(source))) Some(delta) else None
  }

  private def sendMessage(message: Option[Double], f: (Double) => Unit) =
    message.foreach(f)

  private def applyMessages(graph: Graph[EdmondsVertex, ED], messages: VertexRDD[Double]) =
    graph.ops.joinVertices(messages)((vertexId, attr, delta) =>
      EdmondsVertex(attr.preds, attr.sigma, attr.depth, delta, delta)
    )
}
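The aggregate loop above is the backward sweep of a Brandes-style betweenness computation: a preceding forward phase is assumed to have labelled each EdmondsVertex with its BFS depth, shortest-path count σ, and predecessor list, and the dependencies δ are then accumulated layer by layer from the deepest level back toward the source via δ(v) = Σ_{w : v ∈ pred(w)} (σ(v) / σ(w)) · (1 + δ(w)), which is exactly the quantity produceMessage computes and applyMessages stores.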
Example 4
Source File: Modularity.scala From sparkling-graph with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.measures.graph

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.ComponentID
import ml.sparkling.graph.api.operators.measures.{VertexDependentGraphMeasure, GraphIndependentMeasure}
import org.apache.spark.graphx.{EdgeTriplet, VertexRDD, Graph}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

object Modularity extends VertexDependentGraphMeasure[Double, ComponentID] {

  def compute[V <: ComponentID : ClassTag, E: ClassTag](graph: Graph[V, E]): Double = {
    val edgesNum = graph.numEdges.toDouble
    // per community: (edges inside the community, edges leaving the community)
    val edgesCounts: RDD[(V, (Int, Int))] = graph.triplets.flatMap(triplet => {
      if (triplet.srcAttr == triplet.dstAttr) {
        // an internal edge is counted once for each endpoint
        Iterator((triplet.srcAttr, (1, 0)), (triplet.srcAttr, (1, 0)))
      } else {
        Iterator((triplet.srcAttr, (0, 1)), (triplet.dstAttr, (0, 1)))
      }
    })
    edgesCounts.aggregateByKey((0, 0))(
      (agg: (Int, Int), data: (Int, Int)) => (agg, data) match {
        case ((a1, b1), (a2, b2)) => (a1 + a2, b1 + b2)
      },
      (agg1: (Int, Int), agg2: (Int, Int)) => (agg1, agg2) match {
        case ((a1, b1), (a2, b2)) => (a1 + a2, b1 + b2)
      }
    ).treeAggregate(0.0)(
      (agg: Double, data: (V, (Int, Int))) => data match {
        case (_, (edgesFull, edgesSome)) =>
          agg + (edgesFull / (2.0 * edgesNum)) - Math.pow((edgesSome + edgesFull) / (2.0 * edgesNum), 2)
      },
      (agg1, agg2) => agg1 + agg2
    )
  }
}
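For reference, the quantity computed above is the standard modularity. With m total edges, e_c internal edges of community c, and d_c the total degree of c, each internal edge contributes (1, 0) twice and each boundary edge contributes (0, 1) to each of its two communities, so edgesFull = 2·e_c and edgesFull + edgesSome = d_c, and the final aggregation evaluates

Q = Σ_c [ e_c / m − (d_c / (2m))² ]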
Example 5
Source File: BetweennessEdmonds$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.measures.vertex.betweenness.edmonds

import java.nio.file.Files

import ml.sparkling.graph.operators.MeasureTest
import org.apache.commons.io.FileUtils
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Graph, VertexRDD}

class BetweennessEdmonds$Test(implicit sc: SparkContext) extends MeasureTest {
  val tempDir = Files.createTempDirectory("spark-checkpoint")

  override def beforeAll() = {
    sc.setCheckpointDir(tempDir.toAbsolutePath.toString)
  }

  override def afterAll() = {
    FileUtils.deleteDirectory(tempDir.toFile)
  }

  "Edmonds betweenness centrality for random graph" should "be correctly calculated" in {
    Given("graph")
    val filePath = getClass.getResource("/graphs/graph_ER_15")
    val graph: Graph[Int, Int] = loadGraph(filePath.toString)

    When("Computes betweenness")
    val result = EdmondsBC.computeBC(graph)

    Then("Should calculate betweenness correctly")
    val bcFile = getClass.getResource("/graphs/graph_ER_15_bc")
    val bcCorrectValues = sc.textFile(bcFile.getPath)
      .filter(_.nonEmpty)
      .map(l => { val t = l.split("\t", 2); (t(0).toInt, t(1).toDouble) })
      .sortBy({ case (vId, data) => vId })
      .map({ case (vId, data) => data }).collect()
    val bcValues = result.sortBy({ case (vId, data) => vId })
      .map({ case (vId, data) => data }).collect()
    bcCorrectValues.zip(bcValues).foreach({ case (a, b) => a should be(b +- 1e-5) })
    result.unpersist(false)
  }
}
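A note on the scaffolding: the test registers a Spark checkpoint directory in beforeAll before running EdmondsBC.computeBC. Iterative GraphX jobs like this one commonly checkpoint intermediate RDDs to truncate lineage chains that would otherwise grow with each iteration, and such computations fail at the first checkpoint call if no checkpoint directory has been configured.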