org.apache.spark.graphx.Graph Scala Examples
The following examples show how to use org.apache.spark.graphx.Graph.
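As a quick orientation before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) that builds a small property graph with the standard GraphX API and runs connected components on it; the object name and the toy data are made up for illustration.

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object MinimalGraphExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("MinimalGraphExample").getOrCreate()
    val sc = spark.sparkContext

    // Vertices carry a String attribute, edges an Int attribute.
    val vertices: RDD[(VertexId, String)] =
      sc.parallelize(Seq((1L, "a"), (2L, "b"), (3L, "c")))
    val edges: RDD[Edge[Int]] =
      sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 1)))

    // Build the property graph; the default attribute is used for vertices
    // that appear in edges but are missing from the vertex RDD.
    val graph: Graph[String, Int] = Graph(vertices, edges, defaultVertexAttr = "missing")

    println(s"vertices = ${graph.numVertices}, edges = ${graph.numEdges}")
    println(graph.connectedComponents().vertices.collect().mkString(", "))

    spark.stop()
  }
}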
Example 1
Source File: OperatorsDSL.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 6 votes |
package ml.sparkling.graph.operators import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection._ import ml.sparkling.graph.api.operators.measures.{EdgeMeasure, VertexMeasureConfiguration} import ml.sparkling.graph.operators.algorithms.coarsening.labelpropagation.LPCoarsening import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN._ import ml.sparkling.graph.operators.algorithms.link.BasicLinkPredictor import ml.sparkling.graph.operators.measures.edge.{AdamicAdar, CommonNeighbours} import ml.sparkling.graph.operators.measures.vertex.{Degree, NeighborhoodConnectivity, VertexEmbeddedness} import ml.sparkling.graph.operators.measures.vertex.clustering.LocalClustering import ml.sparkling.graph.operators.measures.graph.{FreemanCentrality, Modularity} import ml.sparkling.graph.operators.partitioning.CommunityBasedPartitioning._ import ml.sparkling.graph.operators.measures.vertex.closenes.Closeness import ml.sparkling.graph.operators.measures.vertex.eigenvector.EigenvectorCentrality import ml.sparkling.graph.operators.measures.vertex.hits.Hits import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import scala.reflect.ClassTag object OperatorsDSL { implicit class ModularityDSL[E:ClassTag](graph:Graph[ComponentID,E]){ def modularity()=Modularity.compute(graph) } implicit class DSL[VD:ClassTag ,ED:ClassTag](graph:Graph[VD,ED]){ def PSCAN(epsilon:Double=0.1)= computeConnectedComponents(graph,epsilon) def LPCoarse(treatAsUndirected:Boolean=false)=LPCoarsening.coarse(graph,treatAsUndirected = treatAsUndirected) def closenessCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= Closeness.compute(graph,vertexMeasureConfiguration) def eigenvectorCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= EigenvectorCentrality.compute(graph,vertexMeasureConfiguration) def hits(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= Hits.compute(graph,vertexMeasureConfiguration) def degreeCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= Degree.compute(graph,vertexMeasureConfiguration) def neighborhoodConnectivity(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= NeighborhoodConnectivity.compute(graph,vertexMeasureConfiguration) def vertexEmbeddedness(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= VertexEmbeddedness.compute(graph,vertexMeasureConfiguration) def localClustering(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= LocalClustering.compute(graph,vertexMeasureConfiguration) def freemanCentrality()=FreemanCentrality.compute(graph) def partitionBy(communityDetectionMethod:CommunityDetectionMethod[VD,ED])(implicit sc:SparkContext)= partitionGraphBy(graph,communityDetectionMethod) def partitionBy(communityDetectionMethod:CommunityDetectionAlgorithm,numParts:Int= -1)(implicit sc:SparkContext)= partitionGraphUsing(graph,communityDetectionMethod,numParts) def adamicAdar(treatAsUndirected:Boolean=false)={ AdamicAdar.computeWithPreprocessing(graph,treatAsUndirected) } def commonNeighbours(treatAsUndirected:Boolean=false)={ 
CommonNeighbours.computeWithPreprocessing(graph,treatAsUndirected) } def predictLinks[EV: ClassTag, EO: ClassTag]( edgeMeasure: EdgeMeasure[EO, EV],threshold: EO,treatAsUndirected:Boolean=false)(implicit num: Numeric[EO]) = { BasicLinkPredictor.predictLinks(graph, edgeMeasure, threshold, treatAsUndirected) } } }
Example 2
Source File: GraphGeneration.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 6 votes |
package com.github.maxpumperla.ml_spark.graphs import org.apache.spark.graphx.lib.TriangleCount import org.apache.spark.graphx.util.GraphGenerators import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object GraphGeneration extends App { val conf = new SparkConf() .setAppName("Graph generation") .setMaster("local[4]") val sc = new SparkContext(conf) val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt") val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map { line => val field = line.split(" ") (field(0).toLong, field(1).toLong) } val edgeTupleGraph = Graph.fromEdgeTuples( rawEdges=rawEdges, defaultValue="") val gridGraph = GraphGenerators.gridGraph(sc, 5, 5) val starGraph = GraphGenerators.starGraph(sc, 11) val logNormalGraph = GraphGenerators.logNormalGraph( sc, numVertices = 20, mu=1, sigma = 3 ) logNormalGraph.outDegrees.map(_._2).collect().sorted val actorGraph = GraphLoader.edgeListFile( sc, "./ca-hollywood-2009.txt", true ).partitionBy(PartitionStrategy.RandomVertexCut) actorGraph.edges.count() val actorComponents = actorGraph.connectedComponents().cache actorComponents.vertices.map(_._2).distinct().count val clusterSizes =actorComponents.vertices.map( v => (v._2, 1)).reduceByKey(_ + _) clusterSizes.map(_._2).max clusterSizes.map(_._2).min val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt") val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5) strongComponents.vertices.map(_._2).distinct().count val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges() val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut) actorGraph.triangleCount() val triangles = TriangleCount.runPreCanonicalized(partitionedGraph) actorGraph.staticPageRank(10) val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001) actorPrGraph.vertices.reduce((v1, v2) => { if (v1._2 > v2._2) v1 else v2 }) actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println) actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10) actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count }
Example 3
Source File: LocalClustering.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 6 votes |
package ml.sparkling.graph.operators.measures.vertex.clustering

import it.unimi.dsi.fastutil.longs.LongOpenHashSet
import ml.sparkling.graph.api.operators.measures.{VertexMeasure, VertexMeasureConfiguration}
import ml.sparkling.graph.operators.measures.utils.CollectionsUtils._
import ml.sparkling.graph.operators.measures.utils.{CollectionsUtils, NeighboursUtils}
import ml.sparkling.graph.operators.predicates.AllPathPredicate
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

// The original listing starts at the overridden method; the enclosing object declaration
// below is assumed from how LocalClustering is used in the other examples.
object LocalClustering extends VertexMeasure[Double] {

  override def compute[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED],
                                                   vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED])
                                                  (implicit num: Numeric[ED]) = {
    val firstLevelNeighboursGraph =
      NeighboursUtils.getWithNeighbours(graph, vertexMeasureConfiguration.treatAsUndirected, AllPathPredicate)
    val localClusteringSums = firstLevelNeighboursGraph.aggregateMessages[Double](
      sendMsg = edgeContext => {
        def messageCreator = (neighbours1: LongOpenHashSet, neighbours2: LongOpenHashSet) => {
          intersectSize(neighbours1, neighbours2)
        }
        val message = messageCreator(edgeContext.srcAttr, edgeContext.dstAttr)
        edgeContext.sendToSrc(message)
        if (vertexMeasureConfiguration.treatAsUndirected) {
          edgeContext.sendToDst(message)
        }
      },
      mergeMsg = (a, b) => a + b)
    firstLevelNeighboursGraph
      .outerJoinVertices(localClusteringSums)((vId, oldValue, newValue) => (newValue.getOrElse(0d), oldValue))
      .mapVertices {
        case (vId, (sum, neighbours)) =>
          val possibleConnections = neighbours.size * (neighbours.size - 1)
          if (possibleConnections == 0) 0d else sum / possibleConnections
      }
  }
}
Example 4
Source File: GraphFromGraphML$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.loaders.graphml import ml.sparkling.graph.api.loaders.GraphLoading.LoadGraph import ml.sparkling.graph.loaders.LoaderTest import ml.sparkling.graph.loaders.graphml.GraphFromGraphML.{GraphML, GraphProperties} import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph class GraphFromGraphML$Test(implicit sc:SparkContext) extends LoaderTest { "GraphML with standard format" should "be loaded by default" in{ Given("XML in GraphML format path") val filePath = getClass.getResource("/simpleGraphML.xml").toString When("Loads graph") val graph = LoadGraph.from(GraphML(filePath)).load() Then("Graph should be loaded correctly") graph.vertices.count() should equal(2) graph.edges.count() should equal(1) } "GraphML with standard format and multiple edges" should "be loaded by default" in{ Given("XML in GraphML format path") val filePath = getClass.getResource("/simpleGraphML2.xml").toString When("Loads graph") val graph = LoadGraph.from(GraphML(filePath)).load() Then("Graph should be loaded correctly") graph.vertices.count() should equal(3) graph.edges.count() should equal(2) } "GraphML with vertices attributes" should "be loaded by default" in{ Given("XML in GraphML format path") val filePath = getClass.getResource("/withValuesGraphML.xml").toString When("Loads graph") val graph: Graph[GraphProperties, GraphProperties] = LoadGraph.from(GraphML(filePath)).load() Then("Graph should be loaded correctly") graph.vertices.count() should equal(4) graph.edges.count() should equal(2) graph.vertices.map{ case (vId,properites)=>(vId,properites("name").asInstanceOf[String]) }.collect().sorted should equal(List((0l,"name0"),(1l,"name1"),(2l,"name2"),(3l,"name3"))) graph.vertices.flatMap{ case (vId,properites)=>properites.get("type").asInstanceOf[Option[String]].map((vId,_)) }.collect().sorted should equal(List((0l,"type0"))) } }
Example 5
Source File: Neo4jGraphFrame.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark.dataframe import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.apache.spark.sql.SQLContext import org.neo4j.spark.Neo4jGraph import org.neo4j.spark.cypher.CypherHelpers._ object Neo4jGraphFrame { def apply(sqlContext: SQLContext, src: (String, String), edge: (String, String), dst: (String, String)) = { def nodeStmt(s: (String, String)) = s"MATCH (n:${s._1.quote}) RETURN id(n) as id, n.${s._2.quote} as prop" val edgeProp = if (edge._2 == null) "" else s", r.${edge._2.quote} as prop" val edgeStmt = s"MATCH (n:${src._1.quote})-[r:${edge._1.quote}]->(m:${dst._1.quote}) RETURN id(n) as src, id(m) as dst" + edgeProp val vertices1 = Neo4jDataFrame(sqlContext, nodeStmt(src), Seq.empty, ("id", "integer"), ("prop", "string")) val vertices2 = Neo4jDataFrame(sqlContext, nodeStmt(dst), Seq.empty, ("id", "integer"), ("prop", "string")) val schema = Seq(("src", "integer"), ("dst", "integer")) ++ (if (edge._2 != null) Some("prop", "string") else None) val edges = Neo4jDataFrame(sqlContext, edgeStmt, Seq.empty, schema: _*) org.graphframes.GraphFrame(vertices1.union(vertices2).distinct(), edges) } def fromGraphX(sc: SparkContext, label1: String, rels: Seq[String], label2: String) = { val g: Graph[Any, Int] = Neo4jGraph.loadGraph(sc, label1, rels, label2) org.graphframes.GraphFrame.fromGraphX(g) } def fromEdges(sqlContext: SQLContext, label1: String, rels: Seq[String], label2: String) = { val relTypes = rels.map(_.quote).mkString("|") val edgeStmt = s"MATCH (n:${label1.quote})-[r:$relTypes]->(m:${label2.quote}) RETURN id(n) as src, id(m) as dst" val edges = Neo4jDataFrame(sqlContext, edgeStmt, Seq.empty, ("src", "integer"), ("dst", "integer")) org.graphframes.GraphFrame.fromEdges(edges) } }
Example 6
Source File: LoadDsl.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark.dsl

import org.apache.spark.graphx.Graph
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.graphframes.GraphFrame

import scala.reflect.ClassTag

trait LoadDsl {
  def loadRdd[T: ClassTag]: RDD[T]
  def loadRowRdd: RDD[Row]
  def loadNodeRdds: RDD[Row]
  def loadRelRdd: RDD[Row]
  def loadGraph[VD: ClassTag, ED: ClassTag]: Graph[VD, ED]
  def loadGraphFrame[VD: ClassTag, ED: ClassTag]: GraphFrame
  def loadDataFrame: DataFrame
  def loadDataFrame(schema: (String, String)*): DataFrame
}
Example 7
Source File: PairwiseBPSuite.scala From sandpiper with Apache License 2.0 | 5 votes |
package sparkle.graph import org.apache.spark.graphx.{Edge, Graph} import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import sparkle.util.LocalSparkContext class PairwiseBPSuite extends FunSuite with LocalSparkContext { test("Pairwise BP test") { // test from the lectures EECS course 6.869, Bill Freeman and Antonio Torralba. // Chapter 7.3.5 Numerical example. withSpark { sc => val vertices: RDD[(Long, PVertex)] = sc.parallelize(Seq( (1L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))), (2L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))), (3L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))), (4L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 0.0).map(math.log))))) ) val edges = sc.parallelize(Seq( Edge(1L, 2L, PEdge(Factor(Array(2, 2), Array(1.0, 0.9, 0.9, 1.0).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0)))), Edge(2L, 3L, PEdge(Factor(Array(2, 2), Array(0.1, 1.0, 1.0, 0.1).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0)))), Edge(2L, 4L, PEdge(Factor(Array(2, 2), Array(1.0, 0.1, 0.1, 1.0).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0)))) )) val graph = Graph(vertices, edges) val bpGraph = PairwiseBP(graph) val trueProbabilities = Seq( 1L -> (1.0 / 2.09 * 1.09, 1.0 / 2.09 * 1.0), 2L -> (1.0 / 1.1 * 1.0, 1.0 / 1.1 * 0.1), 3L -> (1.0 / 1.21 * 0.2, 1.0 / 1.21 * 1.01), 4L -> (1.0, 0.0)).sortBy { case (vid, _) => vid } val calculatedProbabilities = bpGraph.vertices.collect().sortBy { case (vid, _) => vid } val eps = 10e-5 calculatedProbabilities.zip(trueProbabilities).foreach { case ((_, vertex), (_, (trueP0, trueP1))) => assert(trueP0 - vertex.belief.exp().cloneValues(0) < eps && trueP1 - vertex.belief.exp().cloneValues(1) < eps) } } } test("Pariwise BP test with file") { withSpark { sc => val graph = PairwiseBP.loadPairwiseGraph(sc, "data/vertex4.txt", "data/edge4.txt") val bpGraph = PairwiseBP(graph) val trueProbabilities = Seq( 1L -> (1.0 / 2.09 * 1.09, 1.0 / 2.09 * 1.0), 2L -> (1.0 / 1.1 * 1.0, 1.0 / 1.1 * 0.1), 3L -> (1.0 / 1.21 * 0.2, 1.0 / 1.21 * 1.01), 4L -> (1.0, 0.0)).sortBy { case (vid, _) => vid } val calculatedProbabilities = bpGraph.vertices.collect().sortBy { case (vid, _) => vid } val eps = 10e-5 calculatedProbabilities.zip(trueProbabilities).foreach { case ((_, vertex), (_, (trueP0, trueP1))) => assert(trueP0 - vertex.belief.exp().cloneValues(0) < eps && trueP1 - vertex.belief.exp().cloneValues(1) < eps) } } } }
Example 8
Source File: FastUnfolding.scala From fastunfolding with Apache License 2.0 | 5 votes |
package com.soteradefense.dga.graphx.louvain import org.apache.spark.SparkContext import org.apache.spark.graphx.{VertexId, PartitionStrategy, TripletFields, Graph} import scala.reflect.ClassTag class FastUnfolding(outputdir: String, minProgress: Int = 1, progressCounter: Int = 1) { var qValues = Array[(Int, Double)]() def saveLevel(sc: SparkContext, level: Int, q: Double, graph: Graph[MyVertexState, Long]) = { graph.vertices.saveAsTextFile(s"${outputdir}/level_${level}_vertices") graph.edges.saveAsTextFile(s"${outputdir}/level_${level}_edges") //graph.vertices.map( {case (id,v) => ""+id+","+v.internalWeight+","+v.community }).saveAsTextFile(outputdir+"/level_"+level+"_vertices") //graph.edges.mapValues({case e=>""+e.srcId+","+e.dstId+","+e.attr}).saveAsTextFile(outputdir+"/level_"+level+"_edges") qValues = qValues :+ ((level, q)) println(s"qValue: $q") // overwrite the q values at each level sc.parallelize(qValues, 1).saveAsTextFile(s"${outputdir}/qvalues") } def run[VD: ClassTag](sc: SparkContext, graph: Graph[VD, Long]) = { val initialGraph = createGraph(graph) val graphWeight = initialGraph.vertices.map( vertex => { vertex._2.nodeWeight } ).reduce(_ + _) val broadcastGraphWeight = sc.broadcast(graphWeight) val initialModularity = initialGraph.vertices.map( vertex => { vertex._2.in / (2 * graphWeight) - vertex._2.tot * vertex._2.tot / (graphWeight * graphWeight) } ).reduce(_ + _) var level = -1 var halt = false while(!halt) { level += 1 println(s"Starting level ${level}") val (currentQ, currentGraph, passes) = runFastUnfolding(sc, initialGraph, minProgress, progressCounter) } } def runFastUnfolding(sc: SparkContext, graph: Graph[MyVertexState, Long], minProgress: Int, progressCounter: Int) = { val cachedGraph = graph.cache() } def createGraph[VD: ClassTag](graph: Graph[VD, Long]): Graph[MyVertexState, Long] = { val nodeWeights = graph.aggregateMessages[Long]( cxt => { cxt.sendToSrc(cxt.attr) cxt.sendToDst(cxt.attr) }, (a, b) => a + b, TripletFields.EdgeOnly ) nodeWeights.foreach(result => println(s"nodeweight: ${result._1}, ${result._2}")) val louvainGraph = graph.outerJoinVertices(nodeWeights)((vid, data, weightOption) => { val weight = weightOption.getOrElse(0L) val state = new MyVertexState() state.community = vid state.changed = false state.tot = weight state.in = 0 state.nodeWeight = weight state }).partitionBy(PartitionStrategy.EdgePartition2D) louvainGraph } }
Example 9
Source File: PeriodicGraphCheckpointer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx.util

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.PeriodicCheckpointer

// The original listing starts in the middle of persist(); the class header and the opening
// of persist() below are assumed from the Spark source this example is excerpted from.
private[spark] class PeriodicGraphCheckpointer[VD, ED](
    checkpointInterval: Int,
    sc: SparkContext)
  extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) {

  override protected def persist(data: Graph[VD, ED]): Unit = {
    if (data.vertices.getStorageLevel == StorageLevel.NONE) {
      data.vertices.cache()
    }
    if (data.edges.getStorageLevel == StorageLevel.NONE) {
      data.edges.cache()
    }
  }

  override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false)

  override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = {
    data.getCheckpointFiles
  }
}
Example 10
Source File: SSSPExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.graphx

// $example on$
import org.apache.spark.graphx.{Graph, VertexId}
import org.apache.spark.graphx.util.GraphGenerators
// $example off$
import org.apache.spark.sql.SparkSession

object SSSPExample {
  def main(args: Array[String]): Unit = {
    // Creates a SparkSession.
    val spark = SparkSession
      .builder
      .appName(s"${this.getClass.getSimpleName}")
      .getOrCreate()
    val sc = spark.sparkContext

    // $example on$
    // A graph with edge attributes containing distances
    val graph: Graph[Long, Double] =
      GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble)
    val sourceId: VertexId = 42 // The ultimate source
    // Initialize the graph such that all vertices except the root have distance infinity.
    val initialGraph = graph.mapVertices((id, _) =>
      if (id == sourceId) 0.0 else Double.PositiveInfinity)
    val sssp = initialGraph.pregel(Double.PositiveInfinity)(
      (id, dist, newDist) => math.min(dist, newDist), // Vertex Program
      triplet => { // Send Message
        if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
          Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
        } else {
          Iterator.empty
        }
      },
      (a, b) => math.min(a, b) // Merge Message
    )
    println(sssp.vertices.collect.mkString("\n"))
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 11
Source File: RingGenerator.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.generators.ring import ml.sparkling.graph.api.generators.{GraphGenerator, GraphGeneratorConfiguration} import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.apache.spark.rdd.RDD object RingGenerator extends GraphGenerator[RingGeneratorConfiguration,Int,Int]{ override def generate(configuration: RingGeneratorConfiguration)(implicit ctx:SparkContext): Graph[Int, Int] = { val vertexTuples: RDD[(Long, Long)] =ctx .parallelize((0l to configuration.numberOfNodes-1)) .flatMap(vId=>{ val nextId=(vId+1) % configuration.numberOfNodes val previousId=if(vId-1 < 0) {configuration.numberOfNodes-1} else {vId-1} (vId,nextId) :: {if(configuration.undirected) List((vId,previousId)) else Nil} } ) Graph.fromEdgeTuples(vertexTuples,1) } } case class RingGeneratorConfiguration(val numberOfNodes:Long, val undirected:Boolean=false) extends GraphGeneratorConfiguration;
Example 12
Source File: Neo4jGraphScalaTSE.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.junit.Assert._ import org.junit._ import scala.collection.JavaConverters._ object Neo4jGraphScalaTSE { } class Neo4jGraphScalaTSE extends SparkConnectorScalaBaseTSE { val FIXTURE: String = "CREATE (s:A {a:0})-[r:REL {foo:'bar'}]->(t:B {b:1}) RETURN id(s) AS source, id(t) AS target" private var source: Long = _ private var target: Long = _ @Before @throws[Exception] def setUp { val map = SparkConnectorScalaSuiteIT.session().run(FIXTURE).single() .asMap() source = map.get("source").asInstanceOf[Long] target = map.get("target").asInstanceOf[Long] } private def assertGraph(graph: Graph[_, _], expectedNodes: Long, expectedRels: Long) = { assertEquals(expectedNodes, graph.vertices.count) assertEquals(expectedRels, graph.edges.count) } @Test def runCypherQueryWithParams { val data = List(Map("id"->1,"name"->"Test").asJava).asJava Executor.execute(sc, "UNWIND $data as row CREATE (n:Test {id:row.id}) SET n.name = row.name", Map(("data",data))) } @Test def runMatrixQuery { val graph = Neo4jGraph.loadGraph(sc, "A", Seq.empty, "B") assertGraph(graph, 2, 1) } @Test def saveGraph { val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(source,target,42L))) val graph = Graph.fromEdges(edges,-1) assertGraph(graph, 2, 1) Neo4jGraph.saveGraph(sc,graph,null,("REL","test")) assertEquals(42L, SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL]->(:B) RETURN rel.test as prop").single().get("prop").asLong()) } @Test def saveGraphMerge { val edges : RDD[Edge[Long]] = sc.makeRDD(Seq(Edge(source,target,42L))) val graph = Graph.fromEdges(edges,13L) assertGraph(graph, 2, 1) Neo4jGraph.saveGraph(sc,graph,"value",("FOOBAR","test"),Option("Foo","id"),Option("Bar","id"),merge = true) assertEquals(Map("fid"->source,"bid"->target,"rv"->42L,"fv"->13L,"bv"->13L).asJava,SparkConnectorScalaSuiteIT.session().run("MATCH (foo:Foo)-[rel:FOOBAR]->(bar:Bar) RETURN {fid: foo.id, fv:foo.value, rv:rel.test,bid:bar.id,bv:bar.value} as data").single().get("data").asMap()) } @Test def saveGraphByNodeLabel { val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(0,1,42L))) val graph = Graph.fromEdges(edges,-1) assertGraph(graph, 2, 1) Neo4jGraph.saveGraph(sc,graph,null,("REL","test"),Option(("A","a")),Option(("B","b"))) assertEquals(42L,SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL]->(:B) RETURN rel.test as prop").single().get("prop").asLong()) } @Test def mergeGraphByNodeLabel { val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(source,target,42L))) val graph = Graph.fromEdges(edges,-1) assertGraph(graph, 2, 1) Neo4jGraph.saveGraph(sc,graph,null,("REL2","test"),merge = true) assertEquals(42L,SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL2]->(:B) RETURN rel.test as prop").single().get("prop").asLong()) } @Test def saveGraphNodes { val nodes : RDD[(VertexId, Long)] = sc.makeRDD(Seq((source,10L),(target,20L))) val edges : RDD[Edge[Long]] = sc.makeRDD(Seq()) val graph = Graph[Long,Long](nodes,edges,-1) assertGraph(graph, 2, 0) Neo4jGraph.saveGraph(sc,graph,"prop") assertEquals(10L,SparkConnectorScalaSuiteIT.session().run(s"MATCH (a:A) WHERE id(a) = $source RETURN a.prop as prop").single().get("prop").asLong()) assertEquals(20L,SparkConnectorScalaSuiteIT.session().run(s"MATCH (b:B) WHERE id(b) = $target RETURN b.prop as prop").single().get("prop").asLong()) } }
Example 13
Source File: GraphProviders.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.loaders.csv.providers import ml.sparkling.graph.loaders.csv.types.Types import ml.sparkling.graph.loaders.csv.types.Types.ToVertexId import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.storage.StorageLevel import org.apache.spark.sql.SparkSession; import scala.reflect.ClassTag object GraphProviders { val defaultStorageLevel=StorageLevel.MEMORY_ONLY def simpleGraphBuilder[VD: ClassTag, ED: ClassTag](defaultVertex: Option[VD]=None, vertexProvider: Row => Seq[(VertexId, VD)], edgeProvider: Row => Seq[Edge[ED]], edgeStorageLevel: StorageLevel = defaultStorageLevel, vertexStorageLevel: StorageLevel =defaultStorageLevel) (dataFrame: DataFrame): Graph[VD, ED] = { def mapRows[MT: ClassTag](mappingFunction: (Row) => Seq[MT]): RDD[MT] = { dataFrame.rdd.mapPartitionsWithIndex((id, rowIterator) => { rowIterator.flatMap { case row => mappingFunction(row) } }) } val vertices: RDD[(VertexId, VD)] = mapRows(vertexProvider) val edges: RDD[Edge[ED]] = mapRows(edgeProvider) defaultVertex match{ case None => Graph(vertices,edges,edgeStorageLevel=edgeStorageLevel,vertexStorageLevel=vertexStorageLevel) case Some(defaultVertexValue)=> Graph(vertices,edges,defaultVertexValue,edgeStorageLevel,vertexStorageLevel) } } def indexedGraphBuilder[VD:ClassTag, ED: ClassTag](defaultVertex: Option[VD]=None, vertexProvider: (Row, ToVertexId[VD]) => Seq[(VertexId, VD)], edgeProvider: (Row, ToVertexId[VD]) => Seq[Edge[ED]], columnsToIndex: Seq[Int], edgeStorageLevel: StorageLevel = defaultStorageLevel, vertexStorageLevel: StorageLevel = defaultStorageLevel) (dataFrame: DataFrame): Graph[VD, ED] = { val index = dataFrame.rdd.flatMap(row => columnsToIndex.map(row(_))).distinct().zipWithUniqueId().collect().toMap def extractIdFromIndex(vertex: VD) = index(vertex) simpleGraphBuilder(defaultVertex, vertexProvider(_: Row, extractIdFromIndex _), edgeProvider(_: Row, extractIdFromIndex _), edgeStorageLevel, vertexStorageLevel)(dataFrame) } }
Example 14
Source File: GraphMLLoader.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.loaders.graphml import com.databricks.spark.xml._ import ml.sparkling.graph.loaders.graphml.GraphMLFormat._ import ml.sparkling.graph.loaders.graphml.GraphMLTypes.TypeHandler import org.apache.spark.SparkContext import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext, SparkSession} import scala.collection.mutable import scala.util.Try def loadGraphFromML(path: String)(implicit sc: SparkContext): Graph[ValuesMap, ValuesMap] = { val sparkSession=SparkSession.builder().getOrCreate(); val graphDataFrame = sparkSession.sqlContext.read .format("com.databricks.spark.xml") .option("attributePrefix","@") .option("valueTag","#VALUE") .option("rowTag",graphTag).load(path).rdd val keys =sparkSession.sqlContext.read .format("com.databricks.spark.xml") .option("attributePrefix","@") .option("valueTag","#VALUE") .option("rowTag",graphMLTag).load(path).rdd .flatMap(r => Try(r.getAs[mutable.WrappedArray[Row]](keyTag).toArray).getOrElse(Array.empty)) val nodesKeys = keys .filter(r => r.getAs[String](forAttribute) == nodeTag) val edgeKeys = keys .filter(r => r.getAs[String](forAttribute) == edgeTag) val nodeAttrHandlers = createAttrHandlersFor(nodesKeys) val edgeAttrHandlers = createAttrHandlersFor(edgeKeys) val verticesWithData = graphDataFrame.flatMap(r => r.getAs[Any](nodeTag) match { case data: mutable.WrappedArray[Row@unchecked] => data.array case data: Row => Array(data) }) val verticesIndex = verticesWithData.map(r => r.getAs[String](idAttribute)).zipWithUniqueId().collect().toMap val vertices: RDD[(VertexId, Map[String, Any])] = verticesWithData .map( r => (verticesIndex(r.getAs[String](idAttribute)), extractAttributesMap(nodeAttrHandlers, r)) ) val edgesRows = graphDataFrame.flatMap(r => r.getAs[Any](edgeTag) match { case data: mutable.WrappedArray[Row@unchecked] => data.array case data: Row => Array(data) }) .map(r => Edge( verticesIndex(r.getAs[String](sourceAttribute)), verticesIndex(r.getAs[String](targetAttribute)), extractAttributesMap(edgeAttrHandlers, r) )) Graph(vertices, edgesRows) } def extractAttributesMap(attrHandlers: Map[String, GraphMLAttribute], r: Row): Map[String, Any] = { Try(r.getAs[mutable.WrappedArray[Row]](dataTag)).toOption.map( _.map(r => { val attribute = attrHandlers(r.getAs[String](keyAttribute)) (attribute.name, attribute.handler(r.getAs[String](tagValue))) }).toMap ).getOrElse(Map.empty) + ("id" -> r.getAs[String](idAttribute)) } def createAttrHandlersFor(keys: RDD[Row]): Map[String, GraphMLAttribute] = { keys .map(r => (r.getAs[String](idAttribute), GraphMLAttribute(r.getAs[String](nameAttribute), GraphMLTypes(r.getAs[String](typeAttribute))))) .collect().toMap } }
Example 15
Source File: PSCAN$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.algorithms.community.pscan import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.ComponentID import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ import org.apache.spark.graphx.util.GraphGenerators class PSCAN$Test (implicit sc:SparkContext) extends MeasureTest { "Components for full graph" should " be computed" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes components") val components: Graph[ComponentID, Int] = PSCAN.computeConnectedComponents(graph) Then("Should compute components correctly") components.vertices.map{case (vId,cId)=>cId}.distinct().collect().size should equal (1) graph.unpersist(true) } "Components for full graph" should " be computed using DSL" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes components") val components: Graph[ComponentID, Int] =graph.PSCAN() Then("Should compute components correctly") components.vertices.map{case (vId,cId)=>cId}.distinct().collect().size should equal (1) graph.unpersist(true) } "Components for ring graph" should " be computed" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes components") val components: Graph[ComponentID, Int] = PSCAN.computeConnectedComponents(graph) Then("Should compute components correctly") components.vertices.map{case (vId,cId)=>cId}.distinct().collect().size should equal (5) graph.unpersist(true) } "Components for 3 component graph" should " be computed" in{ Given("graph") val filePath = getClass.getResource("/graphs/coarsening_to_3") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes components") val components: Graph[ComponentID, Int] = PSCAN.computeConnectedComponents(graph) Then("Should compute components correctly") components.vertices.map{case (vId,cId)=>cId}.distinct().collect().size should equal (3) graph.unpersist(true) } "Dynamic components detection for 3 component graph" should " be computed" in{ Given("graph") val filePath = getClass.getResource("/graphs/coarsening_to_3") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes components") val (_,numberOfComponents)= PSCAN.computeConnectedComponentsUsing(graph,3) Then("Should compute components correctly") numberOfComponents should equal (3) graph.unpersist(true) } "Dynamic components detection for RMAT graph" should " be computed" in{ for(x<- 0 to 10){ Given("graph") val graph:Graph[Int,Int]=GraphGenerators.rmatGraph(sc,33,132) When("Computes components") val (_,numberOfComponents)= PSCAN.computeConnectedComponentsUsing(graph,24) Then("Should compute components correctly") numberOfComponents should equal (24l +- 5l) graph.unpersist(true) } } "Dynamic components detection for random graph" should " be computed" in{ Given("graph") val graph:Graph[Int,Int]=GraphGenerators.rmatGraph(sc,1000,10000) When("Computes components") val (_,numberOfComponents)= PSCAN.computeConnectedComponentsUsing(graph,24) Then("Should compute components correctly") numberOfComponents should equal (24l +- 5l) graph.unpersist(true) } }
Example 16
Source File: BasicLinkPredictor$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.algorithms.link import ml.sparkling.graph.operators.MeasureTest import ml.sparkling.graph.operators.measures.edge.CommonNeighbours import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class BasicLinkPredictor$Test (implicit sc:SparkContext) extends MeasureTest { "In open triad" should " propose to close it" in{ Given("graph") val filePath = getClass.getResource("/graphs/3_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes new links") val links = BasicLinkPredictor.predictLinks(graph,CommonNeighbours,0,true) Then("Should compute links correctly") links.collect() should equal(Array((1,3))) graph.unpersist(true) } "In open 4 nodes graph" should " propose to close it fully" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_open") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes new links") val links = graph.predictLinks(CommonNeighbours,1,true) Then("Should compute links correctly") links.collect().toSet should equal(Set((1,3),(2,4))) graph.unpersist(true) } }
Example 17
Source File: MeasureTest.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Graph, GraphLoader}
import org.scalatest._

abstract class MeasureTest(implicit sc: SparkContext)
  extends FlatSpec with BeforeAndAfterAll with GivenWhenThen with Matchers {

  def time[T](str: String)(thunk: => T): (T, Long) = {
    logger.info(s"$str...")
    val t1 = System.currentTimeMillis
    val x = thunk
    val t2 = System.currentTimeMillis
    val diff = t2 - t1
    logger.info(s"$diff ms")
    (x, diff)
  }

  val logger = Logger.getLogger(this.getClass)

  def loadGraph(file: String) = {
    val out: Graph[Int, Int] = GraphLoader.edgeListFile(sc, file.toString)
    out.vertices.setName(s"Graph vertices ${file}")
    out.edges.setName(s"Graph edges ${file}")
    out.triplets.setName(s"Graph triplets ${file}")
    out
  }
}
Example 18
Source File: Modularity$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.graph import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ import org.apache.spark.graphx.util.GraphGenerators class Modularity$Test (implicit sc:SparkContext) extends MeasureTest{ "Modularity for star graph in one community" should "be 0" in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) val graphComponents=graph.PSCAN(epsilon = 0) When("Computes Modularity") val result=Modularity.compute(graphComponents) Then("Should calculate Modularity") result should be (0) graph.unpersist(true) } "Modularity for ring graph in one community" should "be 0" in{ Given("graph") val graph=GraphGenerators.gridGraph(sc,5,5).mapEdges((_)=>1).mapVertices((_,_)=>1) val graphComponents=graph.PSCAN(epsilon = 0) When("Computes Modularity") val result=Modularity.compute(graphComponents) Then("Should calculate Modularity") result should be (0) graph.unpersist(true) } "Modularity for ring graph in one node communities" should "be -0.041875" in{ Given("graph") val graph=GraphGenerators.gridGraph(sc,5,5) val graphComponents=graph.PSCAN(epsilon = 1) When("Computes Modularity") val result=Modularity.compute(graphComponents) Then("Should calculate Modularity") result should be (-0.041875 +- 0.000000001) graph.unpersist(true) } "Modularity for star graph in one community" should "be 0 when calculated using DSL" in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) val graphComponents=graph.PSCAN(epsilon = 0) When("Computes Modularity") val result=graphComponents.modularity() Then("Should calculate Modularity") result should be (0) graph.unpersist(true) } "Modularity for all single components" should "be -1 " in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) val graphComponents=graph.PSCAN(epsilon=1) When("Computes Modularity") val result=graphComponents.modularity() Then("Should calculate Modularity") result should be (-0.3 +- 0.000000001) graph.unpersist(true) } }
Example 19
Source File: FreemanCentrality$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.graph import ml.sparkling.graph.operators.MeasureTest import ml.sparkling.graph.operators.OperatorsDSL._ import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph class FreemanCentrality$Test (implicit sc:SparkContext) extends MeasureTest { "Freeman Centrality for star graph" should "be 1" in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Freemans Centrality") val result=FreemanCentrality.compute(graph) Then("Should calculate Freemans Centrality") result should be (1) graph.unpersist(true) } "Freeman Centrality for star graph" should "be 1 when calculated using DSL" in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Freemans Centrality") val result=graph.freemanCentrality() Then("Should calculate Freemans Centrality") result should be (1) graph.unpersist(true) } "Freeman Centrality for 5 node line graph" should "be 0.167" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Freemans Centrality") val result=FreemanCentrality.compute(graph) Then("Should calculate Freemans Centrality") result should be (0.16666666 +- 1e-5) graph.unpersist(true) } }
Example 20
Source File: Hits$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.hits import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class Hits$Test(implicit sc:SparkContext) extends MeasureTest { "Hits for line graph" should "be correctly calculated" in { Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes Hits") val result = Hits.computeBasic(graph) Then("Should calculate hits correctly") result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array( (0.25,0d), (0.25,0.25),(0.25,0.25),(0.25,0.25),(0d,0.25) )).foreach { case ((a,b),(c,d)) => { a should be (c +- 1e-5) b should be (d +- 1e-5) } } graph.unpersist(true) } "Hits for line graph" should "be correctly calculated using DSL" in { Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes Hits") val result = graph.hits() Then("Should calculate hits correctly") result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array( (0.25,0d), (0.25,0.25),(0.25,0.25),(0.25,0.25),(0d,0.25) )).foreach { case ((a,b),(c,d)) => { a should be (c +- 1e-5) b should be (d +- 1e-5) } } graph.unpersist(true) } "Hits for full 4 node directed graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Hits") val result=Hits.computeBasic(graph) Then("Should calculate Hits correctly") result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array( (0.44504187450168503,0.19806226306818242), (0.19806226497496957,0.4450418674109515), (1.9336832073590722e-13,0.3568958695205176), (0.35689586676523016,3.484376742610991e-13) )).foreach { case ((a,b),(c,d)) => { a should be (c +- 1e-5) b should be (d +- 1e-5) } } graph.unpersist(true) } }
Example 21
Source File: Neo4jPersistence.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.persistence import java.io.{File, PrintWriter} import edu.msstate.dasi.csb.model.{EdgeData, VertexData} import edu.msstate.dasi.csb.util.Util import org.apache.hadoop.fs.FileUtil import org.apache.spark.graphx.Graph object Neo4jPersistence extends GraphPersistence { private val vertices_suffix = "_nodes" private val edges_suffix = "_relationships" def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite :Boolean = false): Unit = { val verticesPath = graphName + vertices_suffix val verticesTmpPath = "__" + verticesPath val edgesPath = graphName + edges_suffix val edgesTmpPath = "__" + edgesPath if (overwrite) { FileUtil.fullyDelete(new File(verticesPath + "-header")) FileUtil.fullyDelete(new File(verticesPath)) FileUtil.fullyDelete(new File(edgesPath + "-header")) FileUtil.fullyDelete(new File(edgesPath)) } val nodeHeader = s"name:ID($graphName),:LABEL\n" val nodeHeaderWriter = new PrintWriter(new File(verticesPath + "-header")) nodeHeaderWriter.write(nodeHeader) nodeHeaderWriter.close() graph.vertices.map { case (id, _) => s"$id,$graphName" }.saveAsTextFile(verticesTmpPath) Util.merge(verticesTmpPath, verticesPath) FileUtil.fullyDelete(new File(verticesTmpPath)) val relationshipHeader = s":START_ID($graphName),:END_ID($graphName),:TYPE,${EdgeData.neo4jCsvHeader}\n" val relHeaderWriter = new PrintWriter(new File(edgesPath + "-header")) relHeaderWriter.write(relationshipHeader) relHeaderWriter.close() graph.edges.map(edge => edge.attr match { case edgeData: EdgeData => s"${edge.srcId},${edge.dstId},EDGE,${edgeData.toCsv}" case _ => s"${edge.srcId},${edge.dstId},EDGE" } ).saveAsTextFile(edgesTmpPath) Util.merge(edgesTmpPath, edgesPath) FileUtil.fullyDelete(new File(edgesTmpPath)) } }
Example 22
Source File: ClosenessCentrality.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.spark

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.{EdgeDirection, Graph, VertexId}

import scala.collection.mutable
import scala.reflect.ClassTag

// Excerpt: the enclosing class declaration (which provides `vertex`) is omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  getClosenessOfVert(vertex, graph)
}

private class DistanceNodePair(var distance: Long, var totalPairs: Long) extends Comparable[DistanceNodePair] {
  override def compareTo(dp: DistanceNodePair): Int = (this.distance - dp.distance).toInt
}

private class NodeVisitCounter extends java.io.Serializable {
  var totalPairs: Long = _
  var levelSize: mutable.HashMap[Long, Long] = _ // first is distance, second is pairs at that distance
}

private def BFSNode[VD: ClassTag, ED: ClassTag](nID: Long, graph: Graph[VD, ED]): NodeVisitCounter = {
  val q = new mutable.Queue[Long]()
  q.enqueue(nID)
  val visited = new mutable.HashSet[VertexId]()
  val levelSize = new mutable.HashMap[Long, Long]()
  visited.add(nID)
  var totalPairs: Long = 0
  val visitCounter = new NodeVisitCounter()
  var level = 0

  while (q.nonEmpty) {
    val size = q.size
    totalPairs += size
    if (level != 0) {
      levelSize.put(level, size)
    }
    val list: Array[Long] = new Array[Long](size)
    for (x <- 0 until size) {
      list(x) = q.dequeue()
    }
    var children: Array[VertexId] = null
    if (list.length > 0) {
      for (x <- list) {
        val node: VertexId = x
        if (graph.collectNeighborIds(EdgeDirection.Out).lookup(node).nonEmpty) {
          children = graph.collectNeighborIds(EdgeDirection.Out).lookup(node).head
          // children = hashmap.value.get(x).head
          for (c: Long <- children) {
            // val childNode = graph.vertices.lookup(c) //hashmap.value.get(c).head
            if (!visited.contains(c)) {
              q.enqueue(c)
              visited.add(c)
            }
          }
        }
      }
    }
    level += 1
  }

  totalPairs -= 1
  visitCounter.levelSize = levelSize
  visitCounter.totalPairs = totalPairs
  visitCounter
}

private def getClosenessOfVert[VD: ClassTag, ED: ClassTag](vertex: VertexId, graph: Graph[VD, ED]): Double = {
  val visitCenter = BFSNode(vertex, graph)

  var denominator: Long = 0L
  for (x <- visitCenter.levelSize.keySet) {
    denominator += visitCenter.levelSize.get(x).head * x
  }
  if (denominator == 0) return -1

  val count = graph.vertices.count().toDouble
  count / denominator
}
Example 23
Source File: SSSP.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.spark

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.{Graph, VertexId}

import scala.reflect.ClassTag

// Excerpt: the enclosing class declaration (which provides `src`) is omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  for (dst <- graph.vertices.keys.toLocalIterator) {
    bfs(graph, src, dst)
  }
}

private def bfs[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], src: VertexId, dst: VertexId): Unit = {
  // if (src == dst) return List(src)
  if (src == dst) return

  // The attribute of each vertex is (dist from src, id of vertex with dist-1)
  var g: Graph[(Int, VertexId), ED] =
    graph.mapVertices((id, _) => (if (id == src) 0 else Int.MaxValue, 0L)).cache()

  // Traverse forward from src
  var dstAttr = (Int.MaxValue, 0L)
  while (dstAttr._1 == Int.MaxValue) {
    val msgs = g.aggregateMessages[(Int, VertexId)](
      e =>
        if (e.srcAttr._1 != Int.MaxValue && e.srcAttr._1 + 1 < e.dstAttr._1) {
          e.sendToDst((e.srcAttr._1 + 1, e.srcId))
        },
      (a, b) => if (a._1 < b._1) a else b).cache()

    // if (msgs.count == 0) return List.empty
    if (msgs.count == 0) return

    g = g.ops.joinVertices(msgs) { (_, oldAttr, newAttr) =>
      if (newAttr._1 < oldAttr._1) newAttr else oldAttr
    }.cache()

    dstAttr = g.vertices.filter(_._1 == dst).first()._2
  }

  // Traverse backward from dst and collect the path
  var path: List[VertexId] = dstAttr._2 :: dst :: Nil
  while (path.head != src) {
    path = g.vertices.filter(_._1 == path.head).first()._2._2 :: path
  }
  // path
}
Example 24
package edu.msstate.dasi.csb.workload.spark

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.{Graph, VertexId}

import scala.reflect.ClassTag

// Excerpt: the source-file header and the enclosing class declaration (which provides `src`
// and `dst`) are omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  // if (src == dst) return List(src)
  if (src == dst) return

  // The attribute of each vertex is (dist from src, id of vertex with dist-1)
  var g: Graph[(Int, VertexId), ED] =
    graph.mapVertices((id, _) => (if (id == src) 0 else Int.MaxValue, 0L)).cache()

  // Traverse forward from src
  var dstAttr = (Int.MaxValue, 0L)
  while (dstAttr._1 == Int.MaxValue) {
    val msgs = g.aggregateMessages[(Int, VertexId)](
      e =>
        if (e.srcAttr._1 != Int.MaxValue && e.srcAttr._1 + 1 < e.dstAttr._1) {
          e.sendToDst((e.srcAttr._1 + 1, e.srcId))
        },
      (a, b) => if (a._1 < b._1) a else b).cache()

    // if (msgs.count == 0) return List.empty
    if (msgs.count == 0) return

    g = g.ops.joinVertices(msgs) { (_, oldAttr, newAttr) =>
      if (newAttr._1 < oldAttr._1) newAttr else oldAttr
    }.cache()

    dstAttr = g.vertices.filter(_._1 == dst).first()._2
  }

  // Traverse backward from dst and collect the path
  var path: List[VertexId] = dstAttr._2 :: dst :: Nil
  while (path.head != src) {
    path = g.vertices.filter(_._1 == path.head).first()._2._2 :: path
  }
  // path
}
Example 25
Source File: ConnectedComponents.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.neo4j

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

// Excerpt: the enclosing class declaration (which provides `engine`) is omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  val query = "MATCH (n) WITH COLLECT(n) as nodes " +
    "RETURN REDUCE(graphs = [], n in nodes | " +
    "case when " +
    "ANY (g in graphs WHERE shortestPath( (n)-[*]-(g) ) ) " +
    "then graphs " +
    "else graphs + [n]" +
    "end );"

  engine.run(query)
}
Example 26
Source File: BetweennessCentrality.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.neo4j

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

// Excerpt: the enclosing class declaration (which provides `engine` and `hops`) is omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  val query = s"MATCH (n), pthroughn = shortestPath((a)-[*..$hops]->(b)) " +
    "WHERE n IN nodes(pthroughn) AND n <> a AND n <> b AND a <> b " +
    "WITH n,a,b,count(pthroughn) AS sumn " +
    s"MATCH p = shortestPath((a)-[*..$hops]->(b)) " +
    "WITH n, a, b, tofloat(sumn)/ tofloat(count(p)) AS fraction " +
    "RETURN n, sum(fraction);"

  engine.run(query)
}
Example 27
Source File: StronglyConnectedComponents.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.neo4j

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

// Excerpt: the enclosing class declaration (which provides `engine`) is omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  val query = "MATCH (n) " +
    "WITH COLLECT(n) as nodes " +
    "RETURN REDUCE(graphs = [], n in nodes | " +
    "case when " +
    "ANY (g in graphs WHERE (shortestPath( (n)-[*]->(g) ) AND shortestPath( (n)<-[*]-(g) ) ) ) " +
    "then graphs " +
    "else graphs + [n] " +
    "end ) "

  engine.run(query)
}
Example 28
Source File: PageRank.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.neo4j

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

// Excerpt: the enclosing class declaration (which provides `engine`) is omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  val query = "MATCH (a) " +
    "set a.pagerank = 0.0 " +
    "WITH collect(distinct a) AS nodes,count(a) as num_nodes " +
    "UNWIND nodes AS a " +
    "MATCH (a)-[r]-(b) " +
    "WITH a,collect(r) AS rels, count(r) AS num_rels, 1.0/num_nodes AS rank " +
    "UNWIND rels AS rel " +
    "SET endnode(rel).pagerank = " +
    "CASE " +
    "WHEN num_rels > 0 AND id(startnode(rel)) = id(a) THEN " +
    "endnode(rel).pagerank + rank/(num_rels) " +
    "ELSE endnode(rel).pagerank " +
    "END " +
    ",startnode(rel).pagerank = " +
    "CASE " +
    "WHEN num_rels > 0 AND id(endnode(rel)) = id(a) THEN " +
    "startnode(rel).pagerank + rank/(num_rels) " +
    "ELSE startnode(rel).pagerank " +
    "END " +
    "WITH collect(distinct a) AS a,rank " +
    "RETURN a"

  engine.run(query)
}
Example 29
Source File: SparkPersistence.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.persistence import java.io.File import edu.msstate.dasi.csb.model.{EdgeData, VertexData} import edu.msstate.dasi.csb.sc import edu.msstate.dasi.csb.util.Util import org.apache.hadoop.fs.FileUtil import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.storage.StorageLevel object SparkPersistence extends GraphPersistence { private val vertices_suffix = "_vertices" private val edges_suffix = "_edges" def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite: Boolean = false): Unit = { val verticesPath = graphName + vertices_suffix val verticesTmpPath = "__" + verticesPath val edgesPath = graphName + edges_suffix val edgesTmpPath = "__" + edgesPath if (overwrite) { FileUtil.fullyDelete(new File(verticesPath)) FileUtil.fullyDelete(new File(edgesPath)) } graph.vertices.saveAsTextFile(verticesTmpPath) Util.merge(verticesTmpPath, verticesPath) FileUtil.fullyDelete(new File(verticesTmpPath)) graph.edges.saveAsTextFile(edgesTmpPath) Util.merge(edgesTmpPath, edgesPath) FileUtil.fullyDelete(new File(edgesTmpPath)) } }
Example 30
Source File: LocalClustering$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.clustering import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class LocalClustering$Test(implicit sc:SparkContext) extends MeasureTest { "Local clustering for line graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes local clustering") val localClustering=LocalClustering.compute(graph) Then("Should calculate local clustering correctly") val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById should equal (Array( (1,0.0), (2,0.0), (3,0.0), (4,0.0), (5,0.0) )) graph.unpersist(true) } "Local clustering for line graph" should "be correctly calculated using DSL" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes local clustering") val localClustering=graph.localClustering() Then("Should calculate local clustering correctly") val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById should equal (Array( (1,0.0), (2,0.0), (3,0.0), (4,0.0), (5,0.0) )) graph.unpersist(true) } "Local clustering for full directed graph " should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes local clustering") val localClustering=LocalClustering.compute(graph) Then("Should calculate local clustering correctly") val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById should equal (Array( (1,0.5), (2,0d), (3,0d), (4,0.5) )) graph.unpersist(true) } "Local clustering for full undirected graph " should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes local clustering") val localClustering=LocalClustering.compute(graph,VertexMeasureConfiguration[Int,Int](true)) Then("Should calculate local clustering correctly") val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById should equal (Array( (1,1), (2,1), (3,1), (4,1) )) graph.unpersist(true) } "Local clustering for full directed graph " should "be correctly calculated using iterative approach" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes local clustering") val localClustering=LocalClustering.compute(graph) val localClusteringIterative=LocalClustering.compute(graph,VertexMeasureConfiguration[Int,Int]((g:Graph[Int,Int])=>1l)) Then("Should calculate local clustering correctly") val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById should equal (localClusteringIterative.vertices.collect().sortBy{case (vId,data)=>vId}) graph.unpersist(true) } }
Example 31
Source File: GraphSynth.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.data.synth

import edu.msstate.dasi.csb.data.distributions.DataDistributions
import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import edu.msstate.dasi.csb.sc
import edu.msstate.dasi.csb.util.Util
import org.apache.spark.graphx.Graph

// Excerpt: the enclosing trait/class declaration and the genGraph/genProperties members it
// relies on are omitted in the original listing.

def synthesize(seed: Graph[VertexData, EdgeData], seedDists: DataDistributions,
               withProperties: Boolean): Graph[VertexData, EdgeData] = {
  var synth = null.asInstanceOf[Graph[VertexData, EdgeData]]

  Util.time("Gen Graph", {
    synth = genGraph(seed, seedDists)
    println("Vertices #: " + synth.numVertices + ", Edges #: " + synth.numEdges)
  })

  if (withProperties) {
    Util.time("Gen Properties", {
      synth = genProperties(synth, seedDists)
      println("Vertices #: " + synth.numVertices + ", Edges #: " + synth.numEdges)
    })
  }

  synth
}
Example 32
Source File: PageRank.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.graphx import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object PageRank { def main(args: Array[String]): Unit = { val spark = SparkSession .builder() .appName("PageRank") .getOrCreate() val sc = spark.sparkContext // build vertices val users: RDD[(VertexId, Array[String])] = sc.parallelize(List( "1,BarackObama,Barack Obama", "2,ladygaga,Goddess of Love", "3,jeresig,John Resig", "4,justinbieber,Justin Bieber", "6,matei_zaharia,Matei Zaharia", "7,odersky,Martin Odersky", "8,anonsys" ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail))) // build edges val followers: RDD[Edge[Double]] = sc.parallelize(Array( Edge(2L, 1L, 1.0), Edge(4L, 1L, 1.0), Edge(1L, 2L, 1.0), Edge(6L, 3L, 1.0), Edge(7L, 3L, 1.0), Edge(7L, 6L, 1.0), Edge(6L, 7L, 1.0), Edge(3L, 7L, 1.0) )) // build graph val followerGraph: Graph[Array[String], Double] = Graph(users, followers) // restrict the graph to users with usernames and names val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2) // compute PageRank val pageRankGraph = subgraph.pageRank(0.001) // get attributes of the top pagerank users val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) { case (uid, attrList, Some(pr)) => (pr, attrList.toList) case (uid, attrList, None) => (0.0, attrList.toList) } println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n")) } }
Example 33
Source File: PageRank.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.graphx import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object PageRank { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PageRank") val sc = new SparkContext(conf) // build vertices val users: RDD[(VertexId, Array[String])] = sc.parallelize(List( "1,BarackObama,Barack Obama", "2,ladygaga,Goddess of Love", "3,jeresig,John Resig", "4,justinbieber,Justin Bieber", "6,matei_zaharia,Matei Zaharia", "7,odersky,Martin Odersky", "8,anonsys" ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail))) // build edges val followers: RDD[Edge[Double]] = sc.parallelize(Array( Edge(2L, 1L, 1.0), Edge(4L, 1L, 1.0), Edge(1L, 2L, 1.0), Edge(6L, 3L, 1.0), Edge(7L, 3L, 1.0), Edge(7L, 6L, 1.0), Edge(6L, 7L, 1.0), Edge(3L, 7L, 1.0) )) // build graph val followerGraph: Graph[Array[String], Double] = Graph(users, followers) // restrict the graph to users with usernames and names val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2) // compute PageRank val pageRankGraph = subgraph.pageRank(0.001) // get attributes of the top pagerank users val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) { case (uid, attrList, Some(pr)) => (pr, attrList.toList) case (uid, attrList, None) => (0.0, attrList.toList) } println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n")) } }
Example 34
Source File: GodwinTest.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.timeseries.graph import io.gzet.test.SparkFunSuite import org.apache.log4j.{Logger, Level} import org.apache.spark.graphx.{Graph, Edge} import org.apache.spark.rdd.RDD import scala.io.Source class GodwinTest extends SparkFunSuite { Logger.getLogger("akka").setLevel(Level.OFF) Logger.getLogger("org").setLevel(Level.OFF) def buildEdges() = { Source.fromInputStream(getClass.getResourceAsStream("/edges.csv")).getLines().drop(1).map(s => { val Array(source, target, weight) = s.split(",") Edge(source.toLong, target.toLong, weight.toDouble) }).toList } localTest("Test Random Walks") { sc => val edges: RDD[Edge[Double]] = sc.parallelize(buildEdges(), 1) val godwin = new Godwin(Seq(16)) val walks = godwin.randomWalks(Graph.fromEdges(edges, 0L), 4).collect().sortBy(_._2) println(walks.map(_._1).mkString(" -> ")) walks.last._1 should be(16) } }
Example 35
Source File: GzetCommunitiesTest.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.community import io.gzet.community.clustering.wcc.WCCDetection import io.gzet.test.SparkFunSuite import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx.{Graph, Edge} import scala.io.Source class GzetCommunitiesTest extends SparkFunSuite { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) localTest("WCC communities") { spark => val lines = Source.fromInputStream(getClass.getResourceAsStream("/local-edges.csv")).getLines().zipWithIndex.filter(_._2 > 0).map(_._1).toSeq val sc = spark.sparkContext val edges = sc.parallelize(lines).map({ line => val a = line.split(",").map(_.toLong).sorted Edge(a.head, a.last, 1L) }).distinct() val graph = Graph.fromEdges(edges, 0L) graph.triplets.take(2).foreach(println) val communities = new WCCDetection(1).run(graph, sc) communities.map(_._2 -> 1).reduceByKey(_+_).collectAsMap() should be(Map(5L -> 5, 15L -> 6, 21L -> 5)) } }
Example 36
Source File: StoryBatchDedup.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.story import io.gzet.story.model.{Content, Article} import org.apache.spark.graphx.{Graph, Edge} import org.apache.spark.{Logging, SparkConf, SparkContext} import io.gzet.story.util.SimhashUtils._ import com.datastax.spark.connector._ object StoryBatchDedup extends SimpleConfig with Logging { def main(args: Array[String]): Unit = { val sparkConf = new SparkConf().setAppName("Story Extractor") val sc = new SparkContext(sparkConf) val simhashRDD = sc.cassandraTable[Article]("gzet", "articles").zipWithIndex().map({ case (a, id) => ((id, Content(a.url, a.title, a.body)), a.hash) }) Set(0) val duplicateTupleRDD = simhashRDD.flatMap({ case ((id, content), simhash) => searchmasks.map({ mask => (simhash ^ mask, id) }) }).groupByKey() val edgeRDD = duplicateTupleRDD.values.flatMap({ it => val list = it.toList for (x <- list; y <- list) yield (x, y) }).filter({ case (x, y) => x != y }).distinct().map({case (x, y) => Edge(x, y, 0) }) val duplicateRDD = Graph.fromEdges(edgeRDD, 0L) .connectedComponents() .vertices .join(simhashRDD.keys) .values duplicateRDD.sortBy(_._1).collect().foreach({ case (story, content) => println(story + "\t" + content.title) }) } }
Example 37
Source File: PlaylistBuilder.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.recommender import com.datastax.spark.connector._ import com.typesafe.config.Config import io.gzet.recommender.Config._ import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import spark.jobserver._ object PlaylistBuilder extends SparkJob with NamedRddSupport { override def runJob(sc: SparkContext, conf: Config): Any = { val recordRDD = sc.cassandraTable[Record](KEYSPACE, TABLE_RECORD) val hashRDD = sc.cassandraTable[Hash](KEYSPACE, TABLE_HASH) val minSimilarityB = sc.broadcast(MIN_SIMILARITY) val songIdsB = sc.broadcast(recordRDD.map(r => (r.id, r.name)).collectAsMap()) implicit class Crossable[X](xs: Traversable[X]) { def cross[Y](ys: Traversable[Y]) = for { x <- xs; y <- ys } yield (x, y) } val songHashRDD = hashRDD flatMap { hash => hash.songs map { song => ((hash, song), 1) } } val songTfRDD = songHashRDD map { case ((hash, songId), count) => (songId, count) } reduceByKey(_+_) val songTfB = sc.broadcast(songTfRDD.collectAsMap()) val crossSongRDD = songHashRDD.keys.groupByKey().values flatMap { songIds => songIds cross songIds filter { case (from, to) => from != to } map(_ -> 1) } reduceByKey(_+_) map { case ((from, to), count) => val weight = count.toDouble / songTfB.value.getOrElse(from, 1) org.apache.spark.graphx.Edge(from, to, weight) } filter { edge => edge.attr > minSimilarityB.value } val graph = Graph.fromEdges(crossSongRDD, 0L) val prGraph = graph.pageRank(TOLERANCE, TELEPORT) val edges = prGraph.edges.map({ edge => (edge.srcId, (edge.dstId, edge.attr)) }).groupByKey().map({case (srcId, it) => val dst = it.toList val dstIds = dst.map(_._1.toString) val weights = dst.map(_._2.toString) Edge(srcId, dstIds, weights) }) val vertices = prGraph.vertices.mapPartitions({ vertices => val songIds = songIdsB.value vertices map { case (vId, pr) => Node(vId, songIds.getOrElse(vId, "UNKNOWN"), pr) } }) edges.saveAsCassandraTable(KEYSPACE, TABLE_EDGE) vertices.saveAsCassandraTable(KEYSPACE, TABLE_NODE) this.namedRdds.update(RDD_EDGE, edges) this.namedRdds.update(RDD_NODE, vertices) } override def validate(sc: SparkContext, config: Config): SparkJobValidation = { SparkJobValid } }
Example 38
Source File: PersonalizedPlaylistBuilder.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.recommender import com.typesafe.config.Config import io.gzet.recommender.Config._ import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import spark.jobserver._ object PersonalizedPlaylistBuilder extends SparkJob with NamedRddSupport { override def runJob(sc: SparkContext, conf: Config): Any = { val id = conf.getLong("song.id") val edges = this.namedRdds.get[Edge](RDD_EDGE).get val nodes = this.namedRdds.get[Node](RDD_NODE).get val edgeRDD = edges.flatMap({e => e.targets.zip(e.weights).map({case (target, weight) => org.apache.spark.graphx.Edge(e.source, target.toLong, weight.toDouble) }) }) val songIdsB = sc.broadcast(nodes.map(n => (n.id, n.name)).collectAsMap()) val graph = Graph.fromEdges(edgeRDD, 0L) graph.cache() val prGraph = graph.personalizedPageRank(id, TOLERANCE, TELEPORT) prGraph.vertices.mapPartitions({ it => val songIds = songIdsB.value it map { case (vId, pr) => (vId, songIds.getOrElse(vId, "UNKNOWN"), pr) } }).sortBy(_._3, ascending = false).map(v => List(v._1, v._3, v._2).mkString(",")).collect() } override def validate(sc: SparkContext, config: Config): SparkJobValidation = { if(!config.hasPath("song.id")) return SparkJobInvalid("Missing parameter [song.id]") if(this.namedRdds.get[Edge](RDD_EDGE).isEmpty) return SparkJobInvalid("Missing RDD [edges]") if(this.namedRdds.get[Edge](RDD_NODE).isEmpty) return SparkJobInvalid("Missing RDD [nodes]") SparkJobValid } }
Example 39
Source File: EmployeeRelationship.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.graphx import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.rdd.RDD import org.apache.spark.graphx.{ Edge, Graph } object EmployeeRelationship { def main(args: Array[String]): Unit = { // vertex format: vertex_id, data val vertexArray = Array( (1L, ("John", "Software Developer")), (2L, ("Robert", "Technical Leader")), (3L, ("Charlie", "Software Architect")), (4L, ("David", "Software Developer")), (5L, ("Edward", "Software Development Manager")), (6L, ("Francesca", "Software Development Manager"))) // edge format: from_vertex_id, to_vertex_id, data val edgeArray = Array( Edge(2L, 1L, "Technical Mentor"), Edge(2L, 4L, "Technical Mentor"), Edge(3L, 2L, "Collaborator"), Edge(6L, 3L, "Team Member"), Edge(4L, 1L, "Peers"), Edge(5L, 2L, "Team Member"), Edge(5L, 3L, "Team Member"), Edge(5L, 6L, "Peers")) val sc = new SparkContext(new SparkConf().setAppName("EmployeeRelationshipJob")) val vertexRDD: RDD[(Long, (String, String))] = sc.parallelize(vertexArray) val edgeRDD: RDD[Edge[String]] = sc.parallelize(edgeArray) val graph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD) // Vanilla query println(">>> Showing the names of people who are Software Developers") graph.vertices.filter { case (id, (name, designation)) => designation.equals("Software Developer") } .collect() .foreach { case (id, (name, designation)) => println(s"... Name: $name, Designation: $designation") } // Connection analysis println(">>> People connected to Robert (Technical Leader) -> ") graph.triplets.filter(_.srcId == 2).collect() .foreach { item => println("... " + item.dstAttr._1 + ", " + item.dstAttr._2) } println(">>> Robert (Technical Leader) connected to -> ") graph.triplets.filter(_.dstId == 2).collect() .foreach { item => println("... " + item.srcAttr._1 + ", " + item.srcAttr._2) } println(">>> Technical Mentoring Analysis -> ") graph.triplets.filter(_.attr.equals("Technical Mentor")).collect() .foreach { item => println("... " + item.srcAttr._1 + " mentoring " + item.dstAttr._1) } } }
Example 40
Source File: SSSPExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.graphx // $example on$ import org.apache.spark.graphx.{Graph, VertexId} import org.apache.spark.graphx.util.GraphGenerators // $example off$ import org.apache.spark.sql.SparkSession object SSSPExample { def main(args: Array[String]): Unit = { // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") .getOrCreate() val sc = spark.sparkContext // $example on$ // A graph with edge attributes containing distances val graph: Graph[Long, Double] = GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble) val sourceId: VertexId = 42 // The ultimate source // Initialize the graph such that all vertices except the root have distance infinity. val initialGraph = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity) val sssp = initialGraph.pregel(Double.PositiveInfinity)( (id, dist, newDist) => math.min(dist, newDist), // Vertex Program triplet => { // Send Message if (triplet.srcAttr + triplet.attr < triplet.dstAttr) { Iterator((triplet.dstId, triplet.srcAttr + triplet.attr)) } else { Iterator.empty } }, (a, b) => math.min(a, b) // Merge Message ) println(sssp.vertices.collect.mkString("\n")) // $example off$ spark.stop() } } // scalastyle:on println
Example 41
Source File: FindInfluencer.scala From spark-graphx-twitter with Apache License 2.0 | 5 votes |
package com.knoldus.spark.graphx.example import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object FindInfluencer { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("Twittter Influencer").setMaster("local[*]") val sparkContext = new SparkContext(conf) sparkContext.setLogLevel("ERROR") val twitterData = sparkContext.textFile("src/main/resources/twitter-graph-data.txt") val followeeVertices: RDD[(VertexId, String)] = twitterData.map(_.split(",")).map { arr => val user = arr(0).replace("((", "") val id = arr(1).replace(")", "") (id.toLong, user) } val followerVertices: RDD[(VertexId, String)] = twitterData.map(_.split(",")).map { arr => val user = arr(2).replace("(", "") val id = arr(3).replace("))", "") (id.toLong, user) } val vertices = followeeVertices.union(followerVertices) val edges: RDD[Edge[String]] = twitterData.map(_.split(",")).map { arr => val followeeId = arr(1).replace(")", "").toLong val followerId = arr(3).replace("))", "").toLong Edge(followeeId, followerId, "follow") } val defaultUser = ("") val graph = Graph(vertices, edges, defaultUser) val subGraph = graph.pregel("", 2, EdgeDirection.In)((_, attr, msg) => attr + "," + msg, triplet => Iterator((triplet.srcId, triplet.dstAttr)), (a, b) => (a + "," + b)) val lengthRDD = subGraph.vertices.map(vertex => (vertex._1, vertex._2.split(",").distinct.length - 2)).max()(new Ordering[Tuple2[VertexId, Int]]() { override def compare(x: (VertexId, Int), y: (VertexId, Int)): Int = Ordering[Int].compare(x._2, y._2) }) val userId = graph.vertices.filter(_._1 == lengthRDD._1).map(_._2).collect().head println(userId + " has maximum influence on network with " + lengthRDD._2 + " influencers.") sparkContext.stop() } }
Example 42
Source File: ZombieExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.graph import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, _} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object ZombieExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val vertexJsonFile = args(0) val edgeJsonFile = args(1) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex] val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge] val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => { (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive)) }) val edgeRdd = edgeDs.rdd.map(r => { new Edge[String](r.src, r.dst, r.edge_type) }) val defaultUser = new ZombieStats(false, 0) val graph = Graph(vectorRdd, edgeRdd, defaultUser) val zombieResults = graph.pregel[Long](0, 30, EdgeDirection.Either)( (vertexId, zombieState, message) => { if (message > 0 && !zombieState.isZombie) { new ZombieStats(true, message) } else { zombieState } }, triplet => { if (triplet.srcAttr.isZombie && !triplet.dstAttr.isZombie) { Iterator((triplet.dstId, triplet.srcAttr.lengthOfLife + 1l)) } else if (triplet.dstAttr.isZombie && !triplet.srcAttr.isZombie) { Iterator((triplet.srcId, triplet.dstAttr.lengthOfLife + 1l)) } else { Iterator.empty } }, (a, b) => Math.min(a, b)) println("ZombieBite") zombieResults.vertices.collect().sortBy(r => r._1).foreach(r => { println("vertexId:" + r._1 + ",ZombieStat:" + r._2) }) sparkSession.stop() } } case class ZombieStats (isZombie:Boolean, lengthOfLife:Long)
Example 43
Source File: PeriodicGraphCheckpointer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.impl import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.apache.spark.storage.StorageLevel private[mllib] class PeriodicGraphCheckpointer[VD, ED]( checkpointInterval: Int, sc: SparkContext) extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) { override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint() override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed override protected def persist(data: Graph[VD, ED]): Unit = { if (data.vertices.getStorageLevel == StorageLevel.NONE) { data.vertices.persist() } if (data.edges.getStorageLevel == StorageLevel.NONE) { data.edges.persist() } } override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false) override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = { data.getCheckpointFiles } }
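This helper is typically driven from an iterative graph algorithm: each new graph version is handed to update, and the checkpointer decides when to persist, checkpoint, and unpersist older versions. Below is a rough sketch of that pattern; it assumes the update and deleteAllCheckpoints methods inherited from PeriodicCheckpointer, a made-up checkpoint directory, and placement inside an org.apache.spark.mllib subpackage because the class is private[mllib]:

package org.apache.spark.mllib.example // assumed placement, since the class is private[mllib]

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.mllib.impl.PeriodicGraphCheckpointer

object GraphCheckpointerSketch {
  def iterate(sc: SparkContext, initial: Graph[Double, Double], iterations: Int): Graph[Double, Double] = {
    sc.setCheckpointDir("/tmp/graph-checkpoints") // assumed checkpoint location
    val checkpointer = new PeriodicGraphCheckpointer[Double, Double](3, sc) // checkpoint every 3rd update
    var graph = initial
    checkpointer.update(graph) // persist (and periodically checkpoint) the current graph
    for (_ <- 1 to iterations) {
      graph = graph.mapVertices((_, value) => value + 1.0) // placeholder per-iteration transformation
      checkpointer.update(graph)
    }
    checkpointer.deleteAllCheckpoints() // remove checkpoint files that are still on disk
    graph
  }
}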
Example 44
Source File: SSSPExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.graphx // $example on$ import org.apache.spark.graphx.{Graph, VertexId} import org.apache.spark.graphx.util.GraphGenerators // $example off$ import org.apache.spark.sql.SparkSession object SSSPExample { def main(args: Array[String]): Unit = { // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") .getOrCreate() val sc = spark.sparkContext // $example on$ // A graph with edge attributes containing distances val graph: Graph[Long, Double] = GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble) val sourceId: VertexId = 42 // The ultimate source // Initialize the graph such that all vertices except the root have distance infinity. val initialGraph = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity) val sssp = initialGraph.pregel(Double.PositiveInfinity)( (id, dist, newDist) => math.min(dist, newDist), // Vertex Program triplet => { // Send Message if (triplet.srcAttr + triplet.attr < triplet.dstAttr) { Iterator((triplet.dstId, triplet.srcAttr + triplet.attr)) } else { Iterator.empty } }, (a, b) => math.min(a, b) // Merge Message ) println(sssp.vertices.collect.mkString("\n")) // $example off$ spark.stop() } } // scalastyle:on println
Example 45
Source File: L10-9Graph.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.Graph.graphToGraphOps import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object UserRankApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: UserRankApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .foreachRDD(rdd => { val edges = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]]) }) .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0))) val vertices = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String]) }) .map(r => (r.hashCode.toLong, r)) val tolerance = 0.0001 val graph = Graph(vertices, edges, "defaultUser") .subgraph(vpred = (id, idStr) => idStr != "defaultUser") val pr = graph.pageRank(tolerance).cache graph.outerJoinVertices(pr.vertices) { (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs) }.vertices.top(10) { Ordering.by(_._2._1) }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1))) }) ssc.start() ssc.awaitTermination() } }
Example 46
Source File: EdgeAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7 import org.apache.spark.SparkContext import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class EdgeAPI extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("Should use Edge API") { //given val users: RDD[(VertexId, (String))] = spark.parallelize(Array( (1L, "a"), (2L, "b"), (3L, "c"), (4L, "d") )) val relationships = spark.parallelize(Array( Edge(1L, 2L, "friend"), Edge(1L, 3L, "friend"), Edge(2L, 4L, "wife") )) val graph = Graph(users, relationships) //when val res = graph.mapEdges(e => e.attr.toUpperCase) println(res.edges.collect().toList) } }
Example 47
Source File: VertexAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7 import org.apache.spark.SparkContext import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class VertexAPI extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("Should use Vertex API") { //given val users: RDD[(VertexId, (String))] = spark.parallelize(Array( (1L, "a"), (2L, "b"), (3L, "c"), (4L, "d") )) val relationships = spark.parallelize(Array( Edge(1L, 2L, "friend"), Edge(1L, 3L, "friend"), Edge(2L, 4L, "wife") )) val graph = Graph(users, relationships) //when val res = graph.mapVertices((_, att) => att.toUpperCase()) res.vertices.collect().toList } }
Example 48
Source File: PeriodicGraphCheckpointer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.impl import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.apache.spark.storage.StorageLevel private[mllib] class PeriodicGraphCheckpointer[VD, ED]( checkpointInterval: Int, sc: SparkContext) extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) { override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint() override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed override protected def persist(data: Graph[VD, ED]): Unit = { if (data.vertices.getStorageLevel == StorageLevel.NONE) { data.vertices.persist() } if (data.edges.getStorageLevel == StorageLevel.NONE) { data.edges.persist() } } override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false) override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = { data.getCheckpointFiles } }
Example 49
Source File: LocalRunner.scala From spark-betweenness with Apache License 2.0 | 5 votes |
package com.centrality.kBC import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.VertexId import org.apache.spark.rdd.RDD object MainRunner { def main(args: Array[String]) { // Create spark context val appName="kBC" val sparkMode="local" val conf = new SparkConf().setAppName(appName).setMaster(sparkMode); val sc = new SparkContext(conf); // Create sample graph // // Create an RDD for vertices val users: RDD[(VertexId, (String, String))] = sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), (5L, ("franklin", "prof")), (2L, ("istoica", "prof")))) // Create an RDD for edges val relationships: RDD[Edge[String]] = sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"), Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"))) // Define a default user in case there are relationship with missing user val defaultUser = ("John Doe", "Missing") // Build the initial Graph val graph = Graph(users, relationships, defaultUser) val kBCGraph = KBetweenness.run(graph, 3) } }
Example 50
Source File: GraphFramesExample.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.graphs import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} //import org.graphframes._ object GraphFramesExample extends App { val conf = new SparkConf() .setAppName("RDD graph") .setMaster("local[4]") val sc = new SparkContext(conf) val vertices: RDD[(VertexId, String)] = sc.parallelize( Array((1L, "Anne"), (2L, "Bernie"), (3L, "Chris"), (4L, "Don"), (5L, "Edgar"))) val edges: RDD[Edge[String]] = sc.parallelize( Array(Edge(1L, 2L, "likes"), Edge(2L, 3L, "trusts"), Edge(3L, 4L, "believes"), Edge(4L, 5L, "worships"), Edge(1L, 3L, "loves"), Edge(4L, 1L, "dislikes"))) val friendGraph: Graph[String, String] = Graph(vertices, edges) // val friendGraphFrame = GraphFrame.fromGraphX(friendGraph) // // friendGraphFrame.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3)").filter( // "e1.attr = 'trusts' OR v3.attr = 'Chris'" // ).collect.foreach(println) }
Example 51
Source File: Gephi.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.utils import org.apache.spark.graphx.Graph object Gephi { def toGexf[VD, ED](g: Graph[VD, ED]): String = { val header = """<?xml version="1.0" encoding="UTF-8"?> |<gexf xmlns="http://www.gexf.net/1.2draft" version="1.2"> | <meta> | <description>A gephi graph in GEXF format</description> | </meta> | <graph mode="static" defaultedgetype="directed"> """.stripMargin val vertices = "<nodes>\n" + g.vertices.map( v => s"""<node id=\"${v._1}\" label=\"${v._2}\"/>\n""" ).collect.mkString + "</nodes>\n" val edges = "<edges>\n" + g.edges.map( e => s"""<edge source=\"${e.srcId}\" target=\"${e.dstId}\" label=\"${e.attr}\"/>\n""" ).collect.mkString + "</edges>\n" val footer = "</graph>\n</gexf>" header + vertices + edges + footer } }
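Since toGexf collects all vertices and edges to the driver into a single string, it is only practical for small graphs. A rough usage sketch (graph contents and output file name are illustrative):

import java.io.PrintWriter
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import com.github.maxpumperla.ml_spark.utils.Gephi

object GexfExportSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("gexf-export-sketch").setMaster("local[2]"))
    val vertices = sc.parallelize(Seq((1L, "Anne"), (2L, "Bernie"), (3L, "Chris")))
    val edges = sc.parallelize(Seq(Edge(1L, 2L, "likes"), Edge(2L, 3L, "trusts")))
    val graph: Graph[String, String] = Graph(vertices, edges)
    // Collects vertices and edges to the driver and writes them out as a GEXF document
    val writer = new PrintWriter("friend-graph.gexf") // assumed output path
    try writer.write(Gephi.toGexf(graph)) finally writer.close()
    sc.stop()
  }
}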
Example 52
Source File: CCGraphXDriver.scala From connected-component with MIT License | 5 votes |
package com.kwartile.lib.cc import org.apache.spark.graphx.{Edge, Graph} import org.apache.spark.{SparkConf, SparkContext} import scala.annotation.tailrec object CCGraphXDriver { @tailrec private def buildEdges(node: Long, neighbors:List[Long], partialPairs: List[Edge[Int]]) : List[Edge[Int]] = { if (neighbors.length == 0) { if (partialPairs != null) List(Edge(node, node, 1)) ::: partialPairs else List(Edge(node, node, 1)) } else if (neighbors.length == 1) { val neighbor = neighbors(0) if (node > neighbor) if (partialPairs != null) List(Edge(node, neighbor, 1)) ::: partialPairs else List(Edge(node, neighbor, 1)) else if (partialPairs != null) List(Edge(neighbor, node, 1)) ::: partialPairs else List(Edge(neighbor, node, 1)) } else { val newPartialPairs = neighbors.map(neighbor => { if (node > neighbor) List(Edge(node, neighbor, 1)) else List(Edge(neighbor, node, 1)) }).flatMap(x=>x) if (partialPairs != null) buildEdges(neighbors.head, neighbors.tail, newPartialPairs ::: partialPairs) else buildEdges(neighbors.head, neighbors.tail, newPartialPairs) } } private def buildEdges(nodes:List[Long]) : List[Edge[Int]] = { buildEdges(nodes.head, nodes.tail, null.asInstanceOf[List[Edge[Int]]]) } def main(args: Array[String]) = { val sparkConf = new SparkConf().setAppName("GraphXConnectedComponent") val sc = new SparkContext(sparkConf) val cliqueFile = args(0) val cliquesRec = sc.textFile(args(0)) val cliques = cliquesRec.map(x => { val nodes = x.split("\\s+").map(y => y.toLong).toList nodes }) val edges = cliques.map(aClique => { buildEdges(aClique) }).flatMap(x=>x) val graph = Graph.fromEdges(edges, 1) val cc = graph.connectedComponents().vertices println ("Count of Connected component: " + cc.count) } }
Example 53
Source File: PipeClusteringStrongestPath.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.clustering import scala.Iterator import org.apache.spark.graphx.Graph import org.apache.spark.graphx.VertexRDD import de.unihamburg.vsis.sddf.reading.Tuple class PipeClusteringStrongestPath extends PipeClusteringTransitiveClosure { override def manipulateGraph(graph: Graph[Tuple, Double]): Graph[_, Double] = { val cGraph = graph.mapVertices((vid, tuple) => (vid, Double.MinPositiveValue)) // attach the max adjacent edge attribute to each vertice val verticesMaxEdgeAttributes: VertexRDD[Double] = cGraph.mapReduceTriplets( edge => { Iterator((edge.dstId, edge.attr), (edge.srcId, edge.attr)) }, (a: Double, b: Double) => math.max(a, b) ) // join the resulting vertice attributes with the graph val maxGraph: Graph[(Tuple, Double), Double] = graph.outerJoinVertices(verticesMaxEdgeAttributes)((id, tuple, simOpt) => simOpt match { case Some(sim) => (tuple, sim) case None => (tuple, 0D) } ) // remove edges which have a max value less then src or dst val resultGraph = maxGraph.subgraph(edge => { if (edge.attr < edge.srcAttr._2 && edge.attr < edge.dstAttr._2) { false } else { true } }) resultGraph } } object PipeClusteringStrongestPath { def apply() = new PipeClusteringStrongestPath() }
Example 54
Source File: AbstractPipeClusteringGraph.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.clustering import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.VertexId import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.similarity.aggregator.Mean import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable abstract class AbstractPipeClusteringGraph extends PipeElement[RDD[(SymPair[Tuple], Array[Double])], RDD[Set[Tuple]]] with Serializable { def cluster(graph: Graph[Tuple, Double]): RDD[Set[Tuple]] def step(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): RDD[Set[Tuple]] = { val duplicatePairsWithSimilarity = input.map( pair => (pair._1, Mean.agrSimilarity(pair._2)) ) val edges: RDD[Edge[Double]] = duplicatePairsWithSimilarity.map( pair => { Edge(pair._1._1.id, pair._1._2.id, pair._2) } ) // TODO optimize: it would be nice to build the graph only by using edge triplets // but as far as I know that's not possible val verticesNotUnique: RDD[(VertexId, Tuple)] = duplicatePairsWithSimilarity.map(_._1).flatMap( tuplePair => Seq(tuplePair._1, tuplePair._2) ).map(tuple => (tuple.id, tuple)) // delete all duplicate vertices val vertices = verticesNotUnique.distinct() // The edge type Boolean is just a workaround because no edge types are needed val graph: Graph[Tuple, Double] = Graph.apply(vertices, edges, null) cluster(graph) } }
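Only the cluster method is left abstract; step builds the similarity graph and delegates to it. As an illustration (the class name is hypothetical and the strategy deliberately simple), a subclass could cluster by connected components, ignoring the per-edge similarity values:

package de.unihamburg.vsis.sddf.clustering

import org.apache.spark.graphx.Graph
import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.reading.Tuple

// Hypothetical subclass for illustration: clusters are the connected components
// of the similarity graph, regardless of the per-edge similarity values.
class PipeClusteringConnectedComponentsSketch extends AbstractPipeClusteringGraph {
  override def cluster(graph: Graph[Tuple, Double]): RDD[Set[Tuple]] = {
    val componentIds = graph.connectedComponents().vertices // (vertexId, componentId)
    componentIds.join(graph.vertices)                       // (vertexId, (componentId, tuple))
      .map { case (_, (componentId, tuple)) => (componentId, tuple) }
      .groupByKey()
      .map { case (_, tuples) => tuples.toSet }
  }
}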
Example 55
Source File: AffinityPropagationSuite.scala From SparkAffinityPropagation with MIT License | 5 votes |
package org.viirya.spark.ml import scala.collection.mutable import org.scalatest.{BeforeAndAfterAll, FunSuite, Suite} import org.viirya.spark.ml.AffinityPropagation._ import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.graphx.{Edge, Graph} class AffinityPropagationSuite extends FunSuite with BeforeAndAfterAll { self: Suite => @transient var sc: SparkContext = _ override def beforeAll() { super.beforeAll() val conf = new SparkConf() .setMaster("local[2]") .setAppName("AffinityPropagationUnitTest") sc = new SparkContext(conf) } override def afterAll() { try { if (sc != null) { sc.stop() } sc = null } finally { super.afterAll() } } test("affinity propagation") { val similarities = Seq[(Long, Long, Double)]( (0, 1, 1.0), (1, 0, 1.0), (0, 2, 1.0), (2, 0, 1.0), (0, 3, 1.0), (3, 0, 1.0), (1, 2, 1.0), (2, 1, 1.0), (2, 3, 1.0), (3, 2, 1.0)) val expected = Array( Array(0.0, 1.0/3.0, 1.0/3.0, 1.0/3.0), Array(1.0/2.0, 0.0, 1.0/2.0, 0.0), Array(1.0/3.0, 1.0/3.0, 0.0, 1.0/3.0), Array(1.0/2.0, 0.0, 1.0/2.0, 0.0)) val s = constructGraph(sc.parallelize(similarities, 2), true, false) s.edges.collect().foreach { case Edge(i, j, x) => assert(math.abs(x.similarity - expected(i.toInt)(j.toInt)) < 1e-14) } } }
Example 56
Source File: PeriodicGraphCheckpointer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.impl import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.apache.spark.storage.StorageLevel private[mllib] class PeriodicGraphCheckpointer[VD, ED]( checkpointInterval: Int, sc: SparkContext) extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) { override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint() override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed override protected def persist(data: Graph[VD, ED]): Unit = { if (data.vertices.getStorageLevel == StorageLevel.NONE) { data.vertices.persist() } if (data.edges.getStorageLevel == StorageLevel.NONE) { data.edges.persist() } } override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false) override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = { data.getCheckpointFiles } }
Example 57
Source File: BasicLinkPredictor.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.algorithms.link import ml.sparkling.graph.api.operators.algorithms.link.MeasureBasedLnkPredictor import ml.sparkling.graph.api.operators.measures.EdgeMeasure import org.apache.spark.graphx.Graph import scala.reflect.ClassTag object BasicLinkPredictor extends MeasureBasedLnkPredictor { override def predictLinks[V: ClassTag, E: ClassTag, EV: ClassTag, EO: ClassTag](graph: Graph[V, E], edgeMeasure: EdgeMeasure[EO, EV], threshold: EO, treatAsUndirected:Boolean=false)(implicit num: Numeric[EO]) = { val preprocessedGraph=edgeMeasure.preprocess(graph,treatAsUndirected) val allPossibleEdges = preprocessedGraph.vertices.cartesian(preprocessedGraph.vertices).filter{ case ((vId1,data1),(vId2,data2))=>vId1!=vId2 } val edgesAboveThreshold=allPossibleEdges.map{ case ((vId1,data1),(vId2,data2))=>(edgeMeasure.computeValue(data1,data2,treatAsUndirected),(vId1,vId2)) }.filter(t=>num.gt(t._1,threshold)).map(t=>(t._2,0)) val exsistingEdgesTuples=graph.edges.map(e=>((e.srcId,e.dstId),0)) val newEdges=edgesAboveThreshold.leftOuterJoin(exsistingEdgesTuples).filter{ case (k,(_,option))=>option.isEmpty }.map(_._1) if(treatAsUndirected){ newEdges.map{ case (vId1,vId2)=>(Math.min(vId1,vId2),Math.max(vId1,vId2)) }.distinct() }else{ newEdges } } }
Example 58
Source File: BetweennessEdmonds$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.betweenness.edmonds import java.nio.file.Files import ml.sparkling.graph.operators.MeasureTest import org.apache.commons.io.FileUtils import org.apache.spark.SparkContext import org.apache.spark.graphx.{Graph, VertexRDD} class BetweennessEdmonds$Test(implicit sc: SparkContext) extends MeasureTest { val tempDir = Files.createTempDirectory("spark-checkpoint") override def beforeAll() = { sc.setCheckpointDir(tempDir.toAbsolutePath.toString) } override def afterAll() = { FileUtils.deleteDirectory(tempDir.toFile) } "Edmonds betweenness centrality for random graph" should "be correctly calculated" in { Given("graph") val filePath = getClass.getResource("/graphs/graph_ER_15") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes betweenness") val result = EdmondsBC.computeBC(graph) Then("Should calculate betweenness correctly") val bcFile = getClass.getResource("/graphs/graph_ER_15_bc") val bcCorrectValues = sc.textFile(bcFile.getPath) .filter(_.nonEmpty) .map(l => { val t = l.split("\t", 2); (t(0).toInt, t(1).toDouble) }) .sortBy({ case (vId, data) => vId }) .map({ case (vId, data) => data}).collect() val bcValues = result.sortBy({ case (vId, data) => vId }) .map({ case (vId, data) => data }).collect() bcCorrectValues.zip(bcValues).foreach({ case (a, b) => a should be(b +- 1e-5) }) result.unpersist(false) } }
Example 59
Source File: SSSPExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.graphx // $example on$ import org.apache.spark.graphx.{Graph, VertexId} import org.apache.spark.graphx.util.GraphGenerators // $example off$ import org.apache.spark.sql.SparkSession object SSSPExample { def main(args: Array[String]): Unit = { // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") .getOrCreate() val sc = spark.sparkContext // $example on$ // A graph with edge attributes containing distances val graph: Graph[Long, Double] = GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble) val sourceId: VertexId = 42 // The ultimate source // Initialize the graph such that all vertices except the root have distance infinity. val initialGraph = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity) val sssp = initialGraph.pregel(Double.PositiveInfinity)( (id, dist, newDist) => math.min(dist, newDist), // Vertex Program triplet => { // Send Message if (triplet.srcAttr + triplet.attr < triplet.dstAttr) { Iterator((triplet.dstId, triplet.srcAttr + triplet.attr)) } else { Iterator.empty } }, (a, b) => math.min(a, b) // Merge Message ) println(sssp.vertices.collect.mkString("\n")) // $example off$ spark.stop() } } // scalastyle:on println
Example 60
Source File: EigenvectorCentrality$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.eigenvector import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ import scala.util.Random class EigenvectorCentrality$Test(implicit sc:SparkContext) extends MeasureTest { "Eigenvector for line graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes eigenvector") val result=EigenvectorCentrality.compute(graph) Then("Should calculate eigenvector correctly") result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array( 0d, 0d, 0d, 0d, 0d )).foreach{case (a,b)=>{a should be (b +- 1e-5 )}} graph.unpersist(true) } "Eigenvector for line graph" should "be correctly calculated using DSL" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes eigenvector") val result=graph.eigenvectorCentrality() Then("Should calculate eigenvector correctly") result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array( 0d, 0d, 0d, 0d, 0d )).foreach{case (a,b)=>{a should be (b +- 1e-5 )}} graph.unpersist(true) } "Eigenvector for full 4 node directed graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes eigenvector") val result=EigenvectorCentrality.compute(graph) Then("Should calculate eigenvector correctly") result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array( 0.32128186442503776, 0.5515795539542094, 0.6256715148839718, 0.44841176915201825 )).foreach{case (a,b)=>{a should be (b +- 1e-5 )}} graph.unpersist(true) } "Eigenvector for full 4 node undirected graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes eigenvector") val result=EigenvectorCentrality.compute(graph,VertexMeasureConfiguration[Int,Int](true)) Then("Should calculate eigenvector correctly") result.vertices.collect().sortBy{case (vId,data)=>vId} should equal (Array( (1,0.5), (2,0.5), (3,0.5), (4,0.5) )) graph.unpersist(true) } "Eigenvector " should " take edge weight into account" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) val graphWeighted=graph.mapEdges(edge=>{ 1.0/(edge.srcId+edge.dstId) }) When("Computes eigenvector") val resultUnweighted=EigenvectorCentrality.compute(graph,VertexMeasureConfiguration[Int,Int](true)) val resultWeighted=EigenvectorCentrality.compute(graphWeighted,VertexMeasureConfiguration[Int,Double](true)) Then("Should calculate eigenvector correctly") resultUnweighted.vertices.collect().sortBy{case (vId,data)=>vId} should not equal ( resultWeighted.vertices.collect().sortBy{case (vId,data)=>vId}) graph.unpersist(true) resultUnweighted.unpersist(true) resultWeighted.unpersist(true) } }
Example 61
Source File: AdamicAdar$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.edge import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class AdamicAdar$Test(implicit sc:SparkContext) extends MeasureTest { "Adamic/Adar for star graph" should "be 0 for each node" in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Adamic/Adar") val result=AdamicAdar.computeWithPreprocessing(graph) Then("Should calculate Adamic/Adar") val resultValues=result.edges.map(_.attr).distinct().collect() resultValues(0) should equal(0) resultValues.size should equal(1) graph.unpersist(true) } "Adamic/Adar for full graph using DSL" should "be 1.8205 for each node" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Adamic/Adar") val result=graph.adamicAdar(true) Then("Should calculate Adamic/Adar") val resultValues=result.edges.map(_.attr).distinct().collect() resultValues(0) should equal(1.82047 +- 1e-5) resultValues.size should equal(1) graph.unpersist(true) } }
Example 62
Source File: CommonNeighbours$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.edge import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class CommonNeighbours$Test (implicit sc:SparkContext) extends MeasureTest { "Common neighbours for star graph" should "be 0 for each node" in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes common neighbours") val result=CommonNeighbours.computeWithPreprocessing(graph) Then("Should calculate common neighbours") val resultValues=result.edges.map(_.attr).distinct().collect() resultValues(0) should equal(0) resultValues.size should equal(1) } "Common neighbours for full graph using DSL" should "be 2 for each node" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes common neighbours") val result=graph.commonNeighbours(true) Then("Should calculate common neighbours") val resultValues=result.edges.map(_.attr).distinct().collect() resultValues(0) should equal(2) resultValues.size should equal(1) } }
Example 63
Source File: NeighborhoodConnectivity$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration import ml.sparkling.graph.operators.MeasureTest import ml.sparkling.graph.operators.measures.vertex.NeighborhoodConnectivity import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class NeighborhoodConnectivity$Test(implicit sc:SparkContext) extends MeasureTest { "Neighbor connectivity for directed line graph" should "be correctly calculated" in { Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes Neighbor connectivity ") val result = NeighborhoodConnectivity.compute(graph) Then("Should calculate Neighbor connectivity correctly") val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById .map{case (vId,data)=>data} should equal (Array( 1d,1d,1d,0d,0d )) graph.unpersist(true) } "Neighbor connectivity for directed line graph" should "be correctly calculated when using DSL" in { Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes Neighbor connectivity ") val result = graph.neighborhoodConnectivity() Then("Should calculate Neighbor connectivity correctly") val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById .map{case (vId,data)=>data} should equal (Array( 1d,1d,1d,0d,0d )) graph.unpersist(true) } "Neighbor connectivity for undirected line graph" should "be correctly calculated" in { Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes Neighbor connectivity ") val result = NeighborhoodConnectivity.compute(graph,VertexMeasureConfiguration[Int,Int](true)) Then("Should calculate Neighbor connectivity correctly") val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById .map{case (vId,data)=>data} should equal (Array( 2d,1.5,2d,1.5,2d )) graph.unpersist(true) } "Neighbor connectivity for full 4 node directed graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Neighbor connectivity") val result=NeighborhoodConnectivity.compute(graph) Then("Should calculate Neighbor connectivity correctly") val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById .map{case (vId,data)=>data} should equal (Array( 1d,1d,2d,1.5 )) graph.unpersist(true) } "Neighbor connectivity for full 4 node undirected graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Neighbor connectivity") val result=NeighborhoodConnectivity.compute(graph,VertexMeasureConfiguration[Int,Int](true)) Then("Should calculate Neighbor connectivity correctly") val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById .map{case (vId,data)=>data} should equal (Array( 3d,3d,3d,3d )) graph.unpersist(true) } }
Example 64
Source File: CommunityBasedPartitioning$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.partitioning import ml.sparkling.graph.loaders.csv.CSVLoader import ml.sparkling.graph.operators.MeasureTest import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class CommunityBasedPartitioning$Test(implicit sc:SparkContext) extends MeasureTest { "One component graph " should " have one partition" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Partition using PSCAN") val partitionedGraph: Graph[Int, Int] = CommunityBasedPartitioning.partitionGraphUsing(graph,PSCAN) Then("Should compute partitions correctly") partitionedGraph.edges.partitions.size should equal (1) graph.unpersist(false) } "One component graph " should " have one partition when calculated using DSL" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Partition using PSCAN") val partitionedGraph: Graph[Int, Int] =graph.partitionBy(PSCAN,1) Then("Should compute partitions correctly") partitionedGraph.edges.partitions.size should equal (1) graph.unpersist(false) } "Five component graph " should " have five partitions" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Partition using PSCAN") val partitionedGraph: Graph[Int, Int] = CommunityBasedPartitioning.partitionGraphUsing(graph,PSCAN,5) Then("Should compute partitions correctly") partitionedGraph.edges.partitions.size should equal (5) graph.unpersist(false) } "Three component graph " should " have five partitions" in{ Given("graph") val filePath = getClass.getResource("/graphs/coarsening_to_3") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Partition using PSCAN") val partitionedGraph: Graph[Int, Int] = CommunityBasedPartitioning.partitionGraphUsing(graph,PSCAN,3) Then("Should compute partitions correctly") partitionedGraph.edges.partitions.size should equal (3) graph.unpersist(false) } "Change of community method parammeters" should " be possible" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Partition using PSCAN") val partitionedGraph: Graph[Int, Int] = CommunityBasedPartitioning.partitionGraphBy(graph,PSCAN.computeConnectedComponents(_,epsilon = 0),1) Then("Should compute partitions correctly") partitionedGraph.edges.partitions.size should equal (1) graph.unpersist(false) } }
Example 65
Source File: PSCANConnectedComponents.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.algorithms.community.pscan import org.apache.spark.graphx.{EdgeTriplet, Graph, Pregel, VertexId} class PSCANConnectedComponents(minWeight:Double) extends Serializable{ def run[VD,ED](graph:Graph[VertexId,Double], maxIterations:Int=Int.MaxValue):Graph[VertexId,Double]={ val initialMessage = Long.MaxValue Pregel(graph, initialMessage,maxIterations = maxIterations)( vprog = (_, attr, msg) => math.min(attr, msg), sendMsg = sendMessage, mergeMsg = (a, b) => math.min(a, b)) } def sendMessage(edge: EdgeTriplet[VertexId, Double]): Iterator[(VertexId, VertexId)] = { if(edge.attr > minWeight){ if(edge.srcAttr<edge.dstAttr){ Iterator((edge.dstId,edge.srcAttr)) }else if(edge.dstAttr<edge.srcAttr){ Iterator((edge.srcId,edge.dstAttr)) }else{ Iterator.empty } }else{ Iterator.empty } } }
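run expects a graph whose vertices already carry a component label (typically each vertex seeded with its own id) and whose edges carry weights; only edges with weight above minWeight propagate labels. A rough sketch with arbitrary weights and threshold:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import ml.sparkling.graph.operators.algorithms.community.pscan.PSCANConnectedComponents

object PSCANComponentsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("pscan-cc-sketch").setMaster("local[2]"))
    // Two weighted edges; only edges whose weight exceeds minWeight propagate labels
    val edges = sc.parallelize(Seq(Edge(1L, 2L, 0.9), Edge(3L, 4L, 0.1)))
    // Seed every vertex with its own id as the initial component label
    val seeded: Graph[VertexId, Double] = Graph.fromEdges(edges, 0.0).mapVertices((id, _) => id)
    val components = new PSCANConnectedComponents(minWeight = 0.5).run(seeded)
    // With these weights: vertices 1 and 2 end up sharing a label, 3 and 4 keep their own
    components.vertices.collect().sorted.foreach(println)
    sc.stop()
  }
}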
Example 66
Source File: BetweennessHua$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.betweenness.hua import java.nio.file.Files import ml.sparkling.graph.operators.MeasureTest import ml.sparkling.graph.operators.measures.vertex.betweenness.edmonds.EdmondsBC import org.apache.commons.io.FileUtils import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.scalatest.tagobjects.Slow class BetweennessHua$Test (implicit sc: SparkContext) extends MeasureTest { val tempDir = Files.createTempDirectory("spark-checkpoint") override def beforeAll() = { sc.setCheckpointDir(tempDir.toAbsolutePath.toString) } override def afterAll() = { FileUtils.deleteDirectory(tempDir.toFile) } "Hua betweenness centrality for random graph" should "be correctly calculated" in { Given("graph") val filePath = getClass.getResource("/graphs/graph_ER_15") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes betweenness") val result = HuaBC.computeBC(graph) Then("Should calculate betweenness correctly") val bcFile = getClass.getResource("/graphs/graph_ER_15_bc") val bcCorrectValues = sc.textFile(bcFile.getPath) .filter(_.nonEmpty) .map(l => { val t = l.split("\t", 2); (t(0).toInt, t(1).toDouble) }) .sortBy({ case (vId, data) => vId }) .map({ case (vId, data) => data}).collect() val bcValues = result.sortBy({ case (vId, data) => vId }) .map({ case (vId, data) => data }).collect() bcCorrectValues.zip(bcValues).foreach({ case (a, b) => a should be(b +- 1e-5) }) result.unpersist(false) } "Hua betweenness centrality for random graph" should "take no longer then Edmonds" taggedAs(Slow) in { Given("graph") val filePath = getClass.getResource("/graphs/graph_ER_15") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("computes betwenness centrality") val (_, edmondsTime) = time("Edmonds algorithm for betweenness centrality")(EdmondsBC.computeBC(graph)) val (_, huaTime) = time("Hua algorithm for betweenness centrality")(HuaBC.computeBC(graph)) Then("Hua algorithm should be faster") huaTime should be <= edmondsTime } }
Example 67
Source File: Modularity.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.graph import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.ComponentID import ml.sparkling.graph.api.operators.measures.{VertexDependentGraphMeasure, GraphIndependentMeasure} import org.apache.spark.graphx.{EdgeTriplet, VertexRDD, Graph} import org.apache.spark.rdd.RDD import scala.reflect.ClassTag object Modularity extends VertexDependentGraphMeasure[Double,ComponentID]{ def compute[V<:ComponentID:ClassTag,E:ClassTag](graph: Graph[V, E]): Double = { val edgesNum=graph.numEdges.toDouble; val edgesCounts: RDD[(V, (Int, Int))] = graph.triplets.flatMap(triplet => { if (triplet.srcAttr == triplet.dstAttr) { Iterator((triplet.srcAttr, (1, 0)),(triplet.srcAttr, (1, 0))) } else { Iterator((triplet.srcAttr, (0, 1)),(triplet.dstAttr,(0,1))) } }) edgesCounts.aggregateByKey((0,0))( (agg:(Int,Int),data:(Int,Int))=> (agg,data) match{ case ((a1,b1),(a2,b2))=>(a1+a2,b1+b2) }, (agg1:(Int,Int),agg2:(Int,Int))=>{ (agg1,agg2) match{ case ((a1,b1),(a2,b2))=>(a1+a2,b1+b2) } } ).treeAggregate(0.0)( (agg:Double,data:(V,(Int,Int)))=>{ data match{ case (_,(edgesFull,edgesSome))=> agg+(edgesFull/(2.0*edgesNum))-Math.pow((edgesSome+edgesFull)/(2.0*edgesNum),2) } }, (agg1,agg2)=>agg1+agg2 ) } }
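compute expects the vertex attribute to already hold each vertex's community label and returns a single score for the whole graph. A rough sketch, assuming ComponentID is the library's Long vertex-id alias:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import ml.sparkling.graph.operators.measures.graph.Modularity

object ModularitySketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("modularity-sketch").setMaster("local[2]"))
    // Vertex attribute is the community id of that vertex (assuming ComponentID is a Long alias)
    val vertices = sc.parallelize(Seq((1L, 1L), (2L, 1L), (3L, 2L), (4L, 2L)))
    // Two intra-community edges and one edge crossing the two communities
    val edges = sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(3L, 4L, 1), Edge(2L, 3L, 1)))
    val graph: Graph[Long, Int] = Graph(vertices, edges)
    println(s"Modularity: ${Modularity.compute(graph)}")
    sc.stop()
  }
}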
Example 68
Source File: CommunityBasedPartitioning.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.partitioning

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.{CommunityDetectionAlgorithm, CommunityDetectionMethod, ComponentID}
import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.DefaultPartitionOperator
import org.apache.log4j.Logger
import org.apache.spark.{Partitioner, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, PartitionID, PartitionStrategy, VertexId}

import scala.reflect.ClassTag

object CommunityBasedPartitioning {

  @transient
  val logger = Logger.getLogger(CommunityBasedPartitioning.getClass())

  def partitionGraphBy[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], communityDetectionMethod: CommunityDetectionMethod[VD, ED], numParts: Int = -1)(implicit sc: SparkContext): Graph[VD, ED] = {
    val numberOfPartitions = if (numParts == -1) sc.defaultParallelism else numParts
    val communities: Graph[ComponentID, ED] = communityDetectionMethod(graph)
    val numberOfCommunities = communities.vertices.values.countApproxDistinct()
    val (coarsedVertexMap, coarsedNumberOfPartitions) = ParallelPartitioningUtils.coarsePartitions(numberOfPartitions, numberOfCommunities, communities.vertices)
    val strategy = ByComponentIdPartitionStrategy(coarsedVertexMap, coarsedNumberOfPartitions, DefaultPartitionOperator)
    logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries and ${coarsedNumberOfPartitions} partitions")
    val out = graph.partitionBy(strategy, numberOfCommunities.toInt).cache()
    out.edges.foreachPartition((_) => {})
    out.vertices.foreachPartition((_) => {})
    out
  }

  def partitionGraphUsing[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], communityDetectionMethod: CommunityDetectionAlgorithm, numParts: Int = -1)(implicit sc: SparkContext): Graph[VD, ED] = {
    partitionGraphBy(graph, communityDetectionMethod.detectCommunities[VD, ED](_), numParts)
  }
}
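A minimal sketch of driving partitionGraphBy with a hand-rolled community method. It assumes CommunityDetectionMethod[VD, ED] is a plain function Graph[VD, ED] => Graph[ComponentID, ED] (that is how it is applied above) and that ComponentID is the GraphX VertexId, so plain connected components can stand in for a real community detector; the toy graph is illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import ml.sparkling.graph.operators.partitioning.CommunityBasedPartitioning

object CommunityPartitioningSketch {
  def main(args: Array[String]): Unit = {
    implicit val sc: SparkContext = new SparkContext(new SparkConf().setAppName("community-partitioning-sketch").setMaster("local[4]"))
    val edges = sc.parallelize(Seq(
      Edge(1L, 2L, 1), Edge(2L, 3L, 1),      // component A
      Edge(10L, 11L, 1), Edge(11L, 12L, 1))) // component B
    val graph: Graph[Int, Int] = Graph.fromEdges(edges, defaultValue = 1)
    // Each connected component is treated as one "community".
    val partitioned = CommunityBasedPartitioning.partitionGraphBy(
      graph, (g: Graph[Int, Int]) => g.connectedComponents(), numParts = 2)
    println(partitioned.edges.partitions.length)
    sc.stop()
  }
}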
Example 69
Source File: PSCANBasedPartitioning.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.partitioning

import java.util.UUID

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.ComponentID
import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN
import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.{DefaultPartitionOperator, logger}
import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Graph, VertexId}

import scala.collection.mutable
import scala.reflect.ClassTag

object PSCANBasedPartitioning {

  @transient
  val logger = Logger.getLogger(PSCANBasedPartitioning.getClass())

  def partitionGraphBy[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], numberOfPartitions: Int, maxIterations: Int = Int.MaxValue)(implicit sc: SparkContext): Graph[VD, ED] = {
    val (numberOfCommunities: VertexId, coarsedVertexMap: Map[VertexId, Int], coarsedNumberOfPartitions: Int, strategy: ByComponentIdPartitionStrategy) =
      buildPartitioningStrategy(graph, numberOfPartitions, maxIterations = maxIterations)
    logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries and ${coarsedNumberOfPartitions} partitions (before ${numberOfCommunities})")
    val out = graph.partitionBy(strategy, numberOfPartitions).cache()
    out.edges.foreachPartition((_) => {})
    out.triplets.foreachPartition((_) => {})
    out.vertices.foreachPartition((_) => {})
    out
  }

  def buildPartitioningStrategy[ED: ClassTag, VD: ClassTag](graph: Graph[VD, ED], numberOfPartitions: Int, maxIterations: Int = Int.MaxValue)(implicit sc: SparkContext) = {
    val (numberOfCommunities: VertexId, coarsedVertexMap: Map[VertexId, Int], coarsedNumberOfPartitions: Int) =
      precomputePartitions(graph, numberOfPartitions, maxIterations = maxIterations)
    logger.info(s"Requested $numberOfPartitions partitions, computed $coarsedNumberOfPartitions")
    val strategy = ByComponentIdPartitionStrategy(coarsedVertexMap, numberOfPartitions, DefaultPartitionOperator)
    (numberOfCommunities, coarsedVertexMap, coarsedNumberOfPartitions, strategy)
  }

  def precomputePartitions[ED: ClassTag, VD: ClassTag](graph: Graph[VD, ED], numberOfPartitions: Int, maxIterations: Int = Int.MaxValue)(implicit sc: SparkContext) = {
    logger.info("Computing components using PSCAN")
    val (communities, numberOfCommunities): (Graph[ComponentID, ED], VertexId) =
      PSCAN.computeConnectedComponentsUsing(graph, numberOfPartitions, maxIterations = maxIterations)
    val computationData = communities.vertices.map(t => t).localCheckpoint()
    logger.info("Components computed!")
    val (coarsedVertexMap, coarsedNumberOfPartitions) = ParallelPartitioningUtils.coarsePartitions(numberOfPartitions, numberOfCommunities, computationData)
    (numberOfCommunities, coarsedVertexMap, coarsedNumberOfPartitions)
  }
}
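A minimal sketch of the entry point above; only the partitionGraphBy call itself is taken from the source, while the toy graph and Spark setup are illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import ml.sparkling.graph.operators.partitioning.PSCANBasedPartitioning

object PSCANPartitioningSketch {
  def main(args: Array[String]): Unit = {
    implicit val sc: SparkContext = new SparkContext(new SparkConf().setAppName("pscan-partitioning-sketch").setMaster("local[4]"))
    // A small ring-of-clusters graph stands in for real data.
    val edges = sc.parallelize((1L to 100L).map(i => Edge(i, i % 10, 1)))
    val graph: Graph[Int, Int] = Graph.fromEdges(edges, defaultValue = 1)
    // Ask for 4 partitions; communities found by PSCAN are coarsened down to that number.
    val partitioned = PSCANBasedPartitioning.partitionGraphBy(graph, numberOfPartitions = 4)
    println(partitioned.edges.partitions.length)
    sc.stop()
  }
}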
Example 70
Source File: VertexMeasureConfigurationTest.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.api.operators.measures

import ml.sparkling.graph.api.operators.IterativeComputation.BucketSizeProvider
import org.apache.spark.graphx.Graph
import org.scalatest.{FlatSpec, GivenWhenThen}

class VertexMeasureConfigurationTest extends FlatSpec with GivenWhenThen {

  "Creation without parameters" should "be possible" in {
    VertexMeasureConfiguration()
  }

  "Creation with undirected flag" should "be possible" in {
    Given("Directed flag")
    val flag = false
    When("Configuration creation")
    VertexMeasureConfiguration(treatAsUndirected = flag)
  }

  "Creation with bucket size provider" should "be possible" in {
    Given("Bucket size provider")
    val provider: BucketSizeProvider[Long, Long] = (g: Graph[Long, Long]) => 1L
    When("Configuration creation")
    VertexMeasureConfiguration(bucketSizeProvider = provider)
  }

  "Creation with bucket size provider and directed flag" should "be possible" in {
    Given("Bucket size provider")
    val provider: BucketSizeProvider[Long, Long] = (g: Graph[Long, Long]) => 1L
    When("Configuration creation")
    VertexMeasureConfiguration(false, provider)
  }
}
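A minimal sketch of building such a configuration the way the CSV examples later on this page do (undirected flag plus a bucket-size provider); the adaptive provider below is illustrative, and the measure it would eventually be passed to is left out.

import org.apache.spark.graphx.Graph
import ml.sparkling.graph.api.operators.IterativeComputation.BucketSizeProvider
import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration

object VertexMeasureConfigurationSketch {
  // Process vertices in buckets of roughly one tenth of the graph, but at least 100 at a time.
  val adaptiveProvider: BucketSizeProvider[String, Double] =
    (g: Graph[String, Double]) => math.max(100L, g.numVertices / 10)

  val undirectedConfig = VertexMeasureConfiguration[String, Double](true, adaptiveProvider)
  val defaultConfig = VertexMeasureConfiguration[String, Double]()
}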
Example 71
Source File: GraphLoading.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.api.loaders

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

object GraphLoading {

  trait GraphLoader[VD, ED] {
    def load(parameters: List[Parameter])(implicit sc: SparkContext): Graph[VD, ED]
  }

  trait TypedGraphLoader[VD2, ED2] extends GraphLoader[VD2, ED2] {
    def load[VD: ClassTag, ED: ClassTag](parameters: List[Parameter])(implicit sc: SparkContext): Graph[VD, ED]
  }

  trait FromPathLoader[VD, ED] {
    def apply(path: String): GraphLoader[VD, ED]
  }

  object LoadGraph {
    def from[VD: ClassTag, ED: ClassTag](graphLoader: GraphLoader[VD, ED]): GraphLoaderConfigurator[VD, ED] = {
      GraphLoaderConfigurator(List.empty, graphLoader)
    }
  }

  case class GraphLoaderConfigurator[VD: ClassTag, ED: ClassTag](parameters: List[Parameter], loader: GraphLoader[_, _]) {
    def using(parameter: Parameter) = {
      GraphLoaderConfigurator[VD, ED](parameter :: parameters, loader)
    }

    def load[VD: ClassTag, ED: ClassTag]()(implicit sc: SparkContext): Graph[VD, ED] = {
      loader match {
        case typed: TypedGraphLoader[_, _] => typed.load[VD, ED](parameters)
        case normal: GraphLoader[VD @unchecked, ED @unchecked] => normal.load(parameters)
      }
    }
  }

  trait Parameter

  trait WithValueParameter[V] extends Parameter {
    def value: V
  }
}
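A minimal sketch of plugging a custom loader into this API. Only the GraphLoading types come from the source; the Path parameter and EdgeListLoader below are hypothetical and simply delegate to GraphX's built-in edge-list reader.

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Graph, GraphLoader => GraphXLoader}
import ml.sparkling.graph.api.loaders.GraphLoading.{GraphLoader, LoadGraph, Parameter, WithValueParameter}

// Hypothetical parameter carrying the edge-list path.
case class Path(value: String) extends WithValueParameter[String]

// Hypothetical loader that delegates to GraphX's edge-list reader.
case object EdgeListLoader extends GraphLoader[Int, Int] {
  def load(parameters: List[Parameter])(implicit sc: SparkContext): Graph[Int, Int] = {
    val path = parameters.collectFirst { case Path(p) => p }
      .getOrElse(throw new IllegalArgumentException("Path parameter is required"))
    GraphXLoader.edgeListFile(sc, path)
  }
}

object GraphLoadingSketch {
  def load(path: String)(implicit sc: SparkContext): Graph[Int, Int] =
    LoadGraph.from(EdgeListLoader).using(Path(path)).load[Int, Int]()
}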
Example 72
Source File: ShortestPathLengthsFromCSV.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.examples

import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes
import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes._
import ml.sparkling.graph.operators.algorithms.shortestpaths.ShortestPathsAlgorithm
import ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors.fastutils.FastUtilWithDistance.DataMap
import ml.sparkling.graph.operators.predicates.AllPathPredicate
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, VertexId}

import scala.collection.JavaConversions._

object ShortestPathLengthsFromCSV extends ExampleApp {

  def body() = {
    val shortestPaths =
      if (bucketSize == -1L)
        ShortestPathsAlgorithm.computeShortestPathsLengths(partitionedGraph, AllPathPredicate, treatAsUndirected)
      else
        ShortestPathsAlgorithm.computeShortestPathsLengthsIterative(partitionedGraph, (g: Graph[_, _]) => bucketSize, treatAsUndirected)
    val size: Broadcast[VertexId] = ctx.broadcast(partitionedGraph.numVertices)
    partitionedGraph.outerJoinVertices(shortestPaths.vertices)(Util.dataTransformFunction(size) _).vertices.values.saveAsTextFile(out)
    ctx.stop()
  }
}

private object Util {
  def dataTransformFunction(size: Broadcast[VertexId])(vId: VertexId, oldValue: String, pathsOption: Option[_ >: DataMap <: JMap[JLong, JDouble]]) = {
    pathsOption.flatMap((paths) => {
      var entries = paths.entrySet().toList.sortBy(_.getKey)
      val out = new StringBuilder()
      out ++= s"${oldValue},"
      var a = 0L
      while (a < size.value) {
        if (entries.size > 0 && a == entries.head.getKey) {
          out ++= s"${entries.head.getValue},"
          entries = entries.drop(1)
        } else {
          out ++= "0,"
        }
        a += 1L
      }
      out.setLength(out.length - 1)
      Option(out.toString())
    }).getOrElse(oldValue)
  }
}
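A minimal sketch of the core call in body() above, without the ExampleApp scaffolding. The graph's types (String vertices, Double edge weights) mirror what the example feeds in, and the result vertices are assumed to carry a java.util.Map of target-vertex id to path length, per the Util signature; the toy graph is illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import ml.sparkling.graph.operators.algorithms.shortestpaths.ShortestPathsAlgorithm
import ml.sparkling.graph.operators.predicates.AllPathPredicate

object ShortestPathsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("shortest-paths-sketch").setMaster("local[2]"))
    // Weighted path graph 1 - 2 - 3 - 4.
    val edges = sc.parallelize(Seq(Edge(1L, 2L, 1.0), Edge(2L, 3L, 2.0), Edge(3L, 4L, 1.0)))
    val graph: Graph[String, Double] = Graph.fromEdges(edges, defaultValue = "v")
    // All paths, treating edges as undirected (third argument, as in the example above).
    val shortestPaths = ShortestPathsAlgorithm.computeShortestPathsLengths(graph, AllPathPredicate, true)
    shortestPaths.vertices.collect().foreach { case (id, lengths) => println(s"$id -> $lengths") }
    sc.stop()
  }
}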
Example 73
Source File: GraphDescriptionFromCSV.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.examples

import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration
import ml.sparkling.graph.experiments.describe.GraphDescriptor._
import org.apache.log4j.Logger
import org.apache.spark.graphx.Graph

object GraphDescriptionFromCSV extends ExampleApp {

  def body() = {
    val configuration =
      if (bucketSize == -1L) {
        val graphSize = 1000L
        logger.info(s"BUCKET SIZE WILL BE EQUAL TO 1000!!")
        VertexMeasureConfiguration[String, Double](treatAsUndirected, (g: Graph[String, Double]) => graphSize)
      } else
        VertexMeasureConfiguration[String, Double](treatAsUndirected, (g: Graph[String, Double]) => bucketSize)
    val groupedGraph = partitionedGraph.groupEdges((a, b) => a)
    groupedGraph.describeGraphToDirectory(out, configuration)
    ctx.stop()
  }
}
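A minimal sketch of the describe step on its own; the implicit syntax comes from the GraphDescriptor._ import shown above, while the toy graph, output path, and any extra implicits the method may need are assumptions of this sketch.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph, PartitionStrategy}
import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration
import ml.sparkling.graph.experiments.describe.GraphDescriptor._

object GraphDescriptionSketch {
  def main(args: Array[String]): Unit = {
    implicit val sc: SparkContext = new SparkContext(new SparkConf().setAppName("describe-sketch").setMaster("local[2]"))
    val edges = sc.parallelize(Seq(Edge(1L, 2L, 1.0), Edge(2L, 3L, 1.0), Edge(1L, 3L, 1.0)))
    val graph: Graph[String, Double] = Graph.fromEdges(edges, defaultValue = "v")
    // Partition, then collapse parallel edges, as the example above does on its partitioned graph.
    val grouped = graph.partitionBy(PartitionStrategy.RandomVertexCut).groupEdges((a, b) => a)
    val configuration = VertexMeasureConfiguration[String, Double](true, (g: Graph[String, Double]) => 1000L)
    grouped.describeGraphToDirectory("/tmp/graph-description", configuration)
    sc.stop()
  }
}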