org.apache.spark.graphx.Graph Scala Examples
The following examples show how to use org.apache.spark.graphx.Graph.
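As a quick orientation before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) that builds a small property graph with the standard GraphX API and runs connected components on it; the object name and the toy data are made up for illustration.

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object MinimalGraphExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("MinimalGraphExample").getOrCreate()
    val sc = spark.sparkContext

    // Vertices carry a String attribute, edges an Int attribute.
    val vertices: RDD[(VertexId, String)] =
      sc.parallelize(Seq((1L, "a"), (2L, "b"), (3L, "c")))
    val edges: RDD[Edge[Int]] =
      sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 1)))

    // Build the property graph; the default attribute is used for vertices
    // that appear in edges but are missing from the vertex RDD.
    val graph: Graph[String, Int] = Graph(vertices, edges, defaultVertexAttr = "missing")

    println(s"vertices = ${graph.numVertices}, edges = ${graph.numEdges}")
    println(graph.connectedComponents().vertices.collect().mkString(", "))

    spark.stop()
  }
}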
Example 1
Source File: OperatorsDSL.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 6 votes |
package ml.sparkling.graph.operators import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection._ import ml.sparkling.graph.api.operators.measures.{EdgeMeasure, VertexMeasureConfiguration} import ml.sparkling.graph.operators.algorithms.coarsening.labelpropagation.LPCoarsening import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN._ import ml.sparkling.graph.operators.algorithms.link.BasicLinkPredictor import ml.sparkling.graph.operators.measures.edge.{AdamicAdar, CommonNeighbours} import ml.sparkling.graph.operators.measures.vertex.{Degree, NeighborhoodConnectivity, VertexEmbeddedness} import ml.sparkling.graph.operators.measures.vertex.clustering.LocalClustering import ml.sparkling.graph.operators.measures.graph.{FreemanCentrality, Modularity} import ml.sparkling.graph.operators.partitioning.CommunityBasedPartitioning._ import ml.sparkling.graph.operators.measures.vertex.closenes.Closeness import ml.sparkling.graph.operators.measures.vertex.eigenvector.EigenvectorCentrality import ml.sparkling.graph.operators.measures.vertex.hits.Hits import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import scala.reflect.ClassTag object OperatorsDSL { implicit class ModularityDSL[E:ClassTag](graph:Graph[ComponentID,E]){ def modularity()=Modularity.compute(graph) } implicit class DSL[VD:ClassTag ,ED:ClassTag](graph:Graph[VD,ED]){ def PSCAN(epsilon:Double=0.1)= computeConnectedComponents(graph,epsilon) def LPCoarse(treatAsUndirected:Boolean=false)=LPCoarsening.coarse(graph,treatAsUndirected = treatAsUndirected) def closenessCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= Closeness.compute(graph,vertexMeasureConfiguration) def eigenvectorCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= EigenvectorCentrality.compute(graph,vertexMeasureConfiguration) def hits(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= Hits.compute(graph,vertexMeasureConfiguration) def degreeCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= Degree.compute(graph,vertexMeasureConfiguration) def neighborhoodConnectivity(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= NeighborhoodConnectivity.compute(graph,vertexMeasureConfiguration) def vertexEmbeddedness(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= VertexEmbeddedness.compute(graph,vertexMeasureConfiguration) def localClustering(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])= LocalClustering.compute(graph,vertexMeasureConfiguration) def freemanCentrality()=FreemanCentrality.compute(graph) def partitionBy(communityDetectionMethod:CommunityDetectionMethod[VD,ED])(implicit sc:SparkContext)= partitionGraphBy(graph,communityDetectionMethod) def partitionBy(communityDetectionMethod:CommunityDetectionAlgorithm,numParts:Int= -1)(implicit sc:SparkContext)= partitionGraphUsing(graph,communityDetectionMethod,numParts) def adamicAdar(treatAsUndirected:Boolean=false)={ AdamicAdar.computeWithPreprocessing(graph,treatAsUndirected) } def commonNeighbours(treatAsUndirected:Boolean=false)={ 
CommonNeighbours.computeWithPreprocessing(graph,treatAsUndirected) } def predictLinks[EV: ClassTag, EO: ClassTag]( edgeMeasure: EdgeMeasure[EO, EV],threshold: EO,treatAsUndirected:Boolean=false)(implicit num: Numeric[EO]) = { BasicLinkPredictor.predictLinks(graph, edgeMeasure, threshold, treatAsUndirected) } } }
Example 2
Source File: GraphGeneration.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 6 votes |
package com.github.maxpumperla.ml_spark.graphs import org.apache.spark.graphx.lib.TriangleCount import org.apache.spark.graphx.util.GraphGenerators import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object GraphGeneration extends App { val conf = new SparkConf() .setAppName("Graph generation") .setMaster("local[4]") val sc = new SparkContext(conf) val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt") val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map { line => val field = line.split(" ") (field(0).toLong, field(1).toLong) } val edgeTupleGraph = Graph.fromEdgeTuples( rawEdges=rawEdges, defaultValue="") val gridGraph = GraphGenerators.gridGraph(sc, 5, 5) val starGraph = GraphGenerators.starGraph(sc, 11) val logNormalGraph = GraphGenerators.logNormalGraph( sc, numVertices = 20, mu=1, sigma = 3 ) logNormalGraph.outDegrees.map(_._2).collect().sorted val actorGraph = GraphLoader.edgeListFile( sc, "./ca-hollywood-2009.txt", true ).partitionBy(PartitionStrategy.RandomVertexCut) actorGraph.edges.count() val actorComponents = actorGraph.connectedComponents().cache actorComponents.vertices.map(_._2).distinct().count val clusterSizes =actorComponents.vertices.map( v => (v._2, 1)).reduceByKey(_ + _) clusterSizes.map(_._2).max clusterSizes.map(_._2).min val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt") val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5) strongComponents.vertices.map(_._2).distinct().count val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges() val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut) actorGraph.triangleCount() val triangles = TriangleCount.runPreCanonicalized(partitionedGraph) actorGraph.staticPageRank(10) val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001) actorPrGraph.vertices.reduce((v1, v2) => { if (v1._2 > v2._2) v1 else v2 }) actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println) actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10) actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count }
Example 3
Source File: LocalClustering.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 6 votes |
package ml.sparkling.graph.operators.measures.vertex.clustering

import it.unimi.dsi.fastutil.longs.LongOpenHashSet
import ml.sparkling.graph.api.operators.measures.{VertexMeasure, VertexMeasureConfiguration}
import ml.sparkling.graph.operators.measures.utils.CollectionsUtils._
import ml.sparkling.graph.operators.measures.utils.{CollectionsUtils, NeighboursUtils}
import ml.sparkling.graph.operators.predicates.AllPathPredicate
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

// The original listing starts at the overridden method; the enclosing object declaration
// below is assumed from how LocalClustering is used in the other examples.
object LocalClustering extends VertexMeasure[Double] {

  override def compute[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED],
                                                   vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED])
                                                  (implicit num: Numeric[ED]) = {
    val firstLevelNeighboursGraph =
      NeighboursUtils.getWithNeighbours(graph, vertexMeasureConfiguration.treatAsUndirected, AllPathPredicate)
    val localClusteringSums = firstLevelNeighboursGraph.aggregateMessages[Double](
      sendMsg = edgeContext => {
        def messageCreator = (neighbours1: LongOpenHashSet, neighbours2: LongOpenHashSet) => {
          intersectSize(neighbours1, neighbours2)
        }
        val message = messageCreator(edgeContext.srcAttr, edgeContext.dstAttr)
        edgeContext.sendToSrc(message)
        if (vertexMeasureConfiguration.treatAsUndirected) {
          edgeContext.sendToDst(message)
        }
      },
      mergeMsg = (a, b) => a + b)
    firstLevelNeighboursGraph
      .outerJoinVertices(localClusteringSums)((vId, oldValue, newValue) => (newValue.getOrElse(0d), oldValue))
      .mapVertices {
        case (vId, (sum, neighbours)) =>
          val possibleConnections = neighbours.size * (neighbours.size - 1)
          if (possibleConnections == 0) 0d else sum / possibleConnections
      }
  }
}
Example 4
Source File: GraphFromGraphML$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.loaders.graphml import ml.sparkling.graph.api.loaders.GraphLoading.LoadGraph import ml.sparkling.graph.loaders.LoaderTest import ml.sparkling.graph.loaders.graphml.GraphFromGraphML.{GraphML, GraphProperties} import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph class GraphFromGraphML$Test(implicit sc:SparkContext) extends LoaderTest { "GraphML with standard format" should "be loaded by default" in{ Given("XML in GraphML format path") val filePath = getClass.getResource("/simpleGraphML.xml").toString When("Loads graph") val graph = LoadGraph.from(GraphML(filePath)).load() Then("Graph should be loaded correctly") graph.vertices.count() should equal(2) graph.edges.count() should equal(1) } "GraphML with standard format and multiple edges" should "be loaded by default" in{ Given("XML in GraphML format path") val filePath = getClass.getResource("/simpleGraphML2.xml").toString When("Loads graph") val graph = LoadGraph.from(GraphML(filePath)).load() Then("Graph should be loaded correctly") graph.vertices.count() should equal(3) graph.edges.count() should equal(2) } "GraphML with vertices attributes" should "be loaded by default" in{ Given("XML in GraphML format path") val filePath = getClass.getResource("/withValuesGraphML.xml").toString When("Loads graph") val graph: Graph[GraphProperties, GraphProperties] = LoadGraph.from(GraphML(filePath)).load() Then("Graph should be loaded correctly") graph.vertices.count() should equal(4) graph.edges.count() should equal(2) graph.vertices.map{ case (vId,properites)=>(vId,properites("name").asInstanceOf[String]) }.collect().sorted should equal(List((0l,"name0"),(1l,"name1"),(2l,"name2"),(3l,"name3"))) graph.vertices.flatMap{ case (vId,properites)=>properites.get("type").asInstanceOf[Option[String]].map((vId,_)) }.collect().sorted should equal(List((0l,"type0"))) } }
Example 5
Source File: Neo4jGraphFrame.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark.dataframe import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.apache.spark.sql.SQLContext import org.neo4j.spark.Neo4jGraph import org.neo4j.spark.cypher.CypherHelpers._ object Neo4jGraphFrame { def apply(sqlContext: SQLContext, src: (String, String), edge: (String, String), dst: (String, String)) = { def nodeStmt(s: (String, String)) = s"MATCH (n:${s._1.quote}) RETURN id(n) as id, n.${s._2.quote} as prop" val edgeProp = if (edge._2 == null) "" else s", r.${edge._2.quote} as prop" val edgeStmt = s"MATCH (n:${src._1.quote})-[r:${edge._1.quote}]->(m:${dst._1.quote}) RETURN id(n) as src, id(m) as dst" + edgeProp val vertices1 = Neo4jDataFrame(sqlContext, nodeStmt(src), Seq.empty, ("id", "integer"), ("prop", "string")) val vertices2 = Neo4jDataFrame(sqlContext, nodeStmt(dst), Seq.empty, ("id", "integer"), ("prop", "string")) val schema = Seq(("src", "integer"), ("dst", "integer")) ++ (if (edge._2 != null) Some("prop", "string") else None) val edges = Neo4jDataFrame(sqlContext, edgeStmt, Seq.empty, schema: _*) org.graphframes.GraphFrame(vertices1.union(vertices2).distinct(), edges) } def fromGraphX(sc: SparkContext, label1: String, rels: Seq[String], label2: String) = { val g: Graph[Any, Int] = Neo4jGraph.loadGraph(sc, label1, rels, label2) org.graphframes.GraphFrame.fromGraphX(g) } def fromEdges(sqlContext: SQLContext, label1: String, rels: Seq[String], label2: String) = { val relTypes = rels.map(_.quote).mkString("|") val edgeStmt = s"MATCH (n:${label1.quote})-[r:$relTypes]->(m:${label2.quote}) RETURN id(n) as src, id(m) as dst" val edges = Neo4jDataFrame(sqlContext, edgeStmt, Seq.empty, ("src", "integer"), ("dst", "integer")) org.graphframes.GraphFrame.fromEdges(edges) } }
Example 6
Source File: LoadDsl.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark.dsl

import org.apache.spark.graphx.Graph
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.graphframes.GraphFrame

import scala.reflect.ClassTag

trait LoadDsl {
  def loadRdd[T: ClassTag]: RDD[T]
  def loadRowRdd: RDD[Row]
  def loadNodeRdds: RDD[Row]
  def loadRelRdd: RDD[Row]
  def loadGraph[VD: ClassTag, ED: ClassTag]: Graph[VD, ED]
  def loadGraphFrame[VD: ClassTag, ED: ClassTag]: GraphFrame
  def loadDataFrame: DataFrame
  def loadDataFrame(schema: (String, String)*): DataFrame
}
Example 7
Source File: PairwiseBPSuite.scala From sandpiper with Apache License 2.0 | 5 votes |
package sparkle.graph import org.apache.spark.graphx.{Edge, Graph} import org.apache.spark.rdd.RDD import org.scalatest.FunSuite import sparkle.util.LocalSparkContext class PairwiseBPSuite extends FunSuite with LocalSparkContext { test("Pairwise BP test") { // test from the lectures EECS course 6.869, Bill Freeman and Antonio Torralba. // Chapter 7.3.5 Numerical example. withSpark { sc => val vertices: RDD[(Long, PVertex)] = sc.parallelize(Seq( (1L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))), (2L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))), (3L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))), (4L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 0.0).map(math.log))))) ) val edges = sc.parallelize(Seq( Edge(1L, 2L, PEdge(Factor(Array(2, 2), Array(1.0, 0.9, 0.9, 1.0).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0)))), Edge(2L, 3L, PEdge(Factor(Array(2, 2), Array(0.1, 1.0, 1.0, 0.1).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0)))), Edge(2L, 4L, PEdge(Factor(Array(2, 2), Array(1.0, 0.1, 0.1, 1.0).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0)))) )) val graph = Graph(vertices, edges) val bpGraph = PairwiseBP(graph) val trueProbabilities = Seq( 1L -> (1.0 / 2.09 * 1.09, 1.0 / 2.09 * 1.0), 2L -> (1.0 / 1.1 * 1.0, 1.0 / 1.1 * 0.1), 3L -> (1.0 / 1.21 * 0.2, 1.0 / 1.21 * 1.01), 4L -> (1.0, 0.0)).sortBy { case (vid, _) => vid } val calculatedProbabilities = bpGraph.vertices.collect().sortBy { case (vid, _) => vid } val eps = 10e-5 calculatedProbabilities.zip(trueProbabilities).foreach { case ((_, vertex), (_, (trueP0, trueP1))) => assert(trueP0 - vertex.belief.exp().cloneValues(0) < eps && trueP1 - vertex.belief.exp().cloneValues(1) < eps) } } } test("Pariwise BP test with file") { withSpark { sc => val graph = PairwiseBP.loadPairwiseGraph(sc, "data/vertex4.txt", "data/edge4.txt") val bpGraph = PairwiseBP(graph) val trueProbabilities = Seq( 1L -> (1.0 / 2.09 * 1.09, 1.0 / 2.09 * 1.0), 2L -> (1.0 / 1.1 * 1.0, 1.0 / 1.1 * 0.1), 3L -> (1.0 / 1.21 * 0.2, 1.0 / 1.21 * 1.01), 4L -> (1.0, 0.0)).sortBy { case (vid, _) => vid } val calculatedProbabilities = bpGraph.vertices.collect().sortBy { case (vid, _) => vid } val eps = 10e-5 calculatedProbabilities.zip(trueProbabilities).foreach { case ((_, vertex), (_, (trueP0, trueP1))) => assert(trueP0 - vertex.belief.exp().cloneValues(0) < eps && trueP1 - vertex.belief.exp().cloneValues(1) < eps) } } } }
Example 8
Source File: FastUnfolding.scala From fastunfolding with Apache License 2.0 | 5 votes |
package com.soteradefense.dga.graphx.louvain import org.apache.spark.SparkContext import org.apache.spark.graphx.{VertexId, PartitionStrategy, TripletFields, Graph} import scala.reflect.ClassTag class FastUnfolding(outputdir: String, minProgress: Int = 1, progressCounter: Int = 1) { var qValues = Array[(Int, Double)]() def saveLevel(sc: SparkContext, level: Int, q: Double, graph: Graph[MyVertexState, Long]) = { graph.vertices.saveAsTextFile(s"${outputdir}/level_${level}_vertices") graph.edges.saveAsTextFile(s"${outputdir}/level_${level}_edges") //graph.vertices.map( {case (id,v) => ""+id+","+v.internalWeight+","+v.community }).saveAsTextFile(outputdir+"/level_"+level+"_vertices") //graph.edges.mapValues({case e=>""+e.srcId+","+e.dstId+","+e.attr}).saveAsTextFile(outputdir+"/level_"+level+"_edges") qValues = qValues :+ ((level, q)) println(s"qValue: $q") // overwrite the q values at each level sc.parallelize(qValues, 1).saveAsTextFile(s"${outputdir}/qvalues") } def run[VD: ClassTag](sc: SparkContext, graph: Graph[VD, Long]) = { val initialGraph = createGraph(graph) val graphWeight = initialGraph.vertices.map( vertex => { vertex._2.nodeWeight } ).reduce(_ + _) val broadcastGraphWeight = sc.broadcast(graphWeight) val initialModularity = initialGraph.vertices.map( vertex => { vertex._2.in / (2 * graphWeight) - vertex._2.tot * vertex._2.tot / (graphWeight * graphWeight) } ).reduce(_ + _) var level = -1 var halt = false while(!halt) { level += 1 println(s"Starting level ${level}") val (currentQ, currentGraph, passes) = runFastUnfolding(sc, initialGraph, minProgress, progressCounter) } } def runFastUnfolding(sc: SparkContext, graph: Graph[MyVertexState, Long], minProgress: Int, progressCounter: Int) = { val cachedGraph = graph.cache() } def createGraph[VD: ClassTag](graph: Graph[VD, Long]): Graph[MyVertexState, Long] = { val nodeWeights = graph.aggregateMessages[Long]( cxt => { cxt.sendToSrc(cxt.attr) cxt.sendToDst(cxt.attr) }, (a, b) => a + b, TripletFields.EdgeOnly ) nodeWeights.foreach(result => println(s"nodeweight: ${result._1}, ${result._2}")) val louvainGraph = graph.outerJoinVertices(nodeWeights)((vid, data, weightOption) => { val weight = weightOption.getOrElse(0L) val state = new MyVertexState() state.community = vid state.changed = false state.tot = weight state.in = 0 state.nodeWeight = weight state }).partitionBy(PartitionStrategy.EdgePartition2D) louvainGraph } }
Example 9
Source File: PeriodicGraphCheckpointer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx.util

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.PeriodicCheckpointer

// The original listing starts in the middle of persist(); the class header and the opening
// of persist() below are assumed from the Spark source this example is excerpted from.
private[spark] class PeriodicGraphCheckpointer[VD, ED](
    checkpointInterval: Int,
    sc: SparkContext)
  extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) {

  override protected def persist(data: Graph[VD, ED]): Unit = {
    if (data.vertices.getStorageLevel == StorageLevel.NONE) {
      data.vertices.cache()
    }
    if (data.edges.getStorageLevel == StorageLevel.NONE) {
      data.edges.cache()
    }
  }

  override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false)

  override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = {
    data.getCheckpointFiles
  }
}
Example 10
Source File: SSSPExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.graphx

// $example on$
import org.apache.spark.graphx.{Graph, VertexId}
import org.apache.spark.graphx.util.GraphGenerators
// $example off$
import org.apache.spark.sql.SparkSession

object SSSPExample {
  def main(args: Array[String]): Unit = {
    // Creates a SparkSession.
    val spark = SparkSession
      .builder
      .appName(s"${this.getClass.getSimpleName}")
      .getOrCreate()
    val sc = spark.sparkContext

    // $example on$
    // A graph with edge attributes containing distances
    val graph: Graph[Long, Double] =
      GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble)
    val sourceId: VertexId = 42 // The ultimate source
    // Initialize the graph such that all vertices except the root have distance infinity.
    val initialGraph = graph.mapVertices((id, _) =>
      if (id == sourceId) 0.0 else Double.PositiveInfinity)
    val sssp = initialGraph.pregel(Double.PositiveInfinity)(
      (id, dist, newDist) => math.min(dist, newDist), // Vertex Program
      triplet => { // Send Message
        if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
          Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
        } else {
          Iterator.empty
        }
      },
      (a, b) => math.min(a, b) // Merge Message
    )
    println(sssp.vertices.collect.mkString("\n"))
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 11
Source File: RingGenerator.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.generators.ring import ml.sparkling.graph.api.generators.{GraphGenerator, GraphGeneratorConfiguration} import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.apache.spark.rdd.RDD object RingGenerator extends GraphGenerator[RingGeneratorConfiguration,Int,Int]{ override def generate(configuration: RingGeneratorConfiguration)(implicit ctx:SparkContext): Graph[Int, Int] = { val vertexTuples: RDD[(Long, Long)] =ctx .parallelize((0l to configuration.numberOfNodes-1)) .flatMap(vId=>{ val nextId=(vId+1) % configuration.numberOfNodes val previousId=if(vId-1 < 0) {configuration.numberOfNodes-1} else {vId-1} (vId,nextId) :: {if(configuration.undirected) List((vId,previousId)) else Nil} } ) Graph.fromEdgeTuples(vertexTuples,1) } } case class RingGeneratorConfiguration(val numberOfNodes:Long, val undirected:Boolean=false) extends GraphGeneratorConfiguration;
Example 12
Source File: Neo4jGraphScalaTSE.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.junit.Assert._ import org.junit._ import scala.collection.JavaConverters._ object Neo4jGraphScalaTSE { } class Neo4jGraphScalaTSE extends SparkConnectorScalaBaseTSE { val FIXTURE: String = "CREATE (s:A {a:0})-[r:REL {foo:'bar'}]->(t:B {b:1}) RETURN id(s) AS source, id(t) AS target" private var source: Long = _ private var target: Long = _ @Before @throws[Exception] def setUp { val map = SparkConnectorScalaSuiteIT.session().run(FIXTURE).single() .asMap() source = map.get("source").asInstanceOf[Long] target = map.get("target").asInstanceOf[Long] } private def assertGraph(graph: Graph[_, _], expectedNodes: Long, expectedRels: Long) = { assertEquals(expectedNodes, graph.vertices.count) assertEquals(expectedRels, graph.edges.count) } @Test def runCypherQueryWithParams { val data = List(Map("id"->1,"name"->"Test").asJava).asJava Executor.execute(sc, "UNWIND $data as row CREATE (n:Test {id:row.id}) SET n.name = row.name", Map(("data",data))) } @Test def runMatrixQuery { val graph = Neo4jGraph.loadGraph(sc, "A", Seq.empty, "B") assertGraph(graph, 2, 1) } @Test def saveGraph { val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(source,target,42L))) val graph = Graph.fromEdges(edges,-1) assertGraph(graph, 2, 1) Neo4jGraph.saveGraph(sc,graph,null,("REL","test")) assertEquals(42L, SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL]->(:B) RETURN rel.test as prop").single().get("prop").asLong()) } @Test def saveGraphMerge { val edges : RDD[Edge[Long]] = sc.makeRDD(Seq(Edge(source,target,42L))) val graph = Graph.fromEdges(edges,13L) assertGraph(graph, 2, 1) Neo4jGraph.saveGraph(sc,graph,"value",("FOOBAR","test"),Option("Foo","id"),Option("Bar","id"),merge = true) assertEquals(Map("fid"->source,"bid"->target,"rv"->42L,"fv"->13L,"bv"->13L).asJava,SparkConnectorScalaSuiteIT.session().run("MATCH (foo:Foo)-[rel:FOOBAR]->(bar:Bar) RETURN {fid: foo.id, fv:foo.value, rv:rel.test,bid:bar.id,bv:bar.value} as data").single().get("data").asMap()) } @Test def saveGraphByNodeLabel { val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(0,1,42L))) val graph = Graph.fromEdges(edges,-1) assertGraph(graph, 2, 1) Neo4jGraph.saveGraph(sc,graph,null,("REL","test"),Option(("A","a")),Option(("B","b"))) assertEquals(42L,SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL]->(:B) RETURN rel.test as prop").single().get("prop").asLong()) } @Test def mergeGraphByNodeLabel { val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(source,target,42L))) val graph = Graph.fromEdges(edges,-1) assertGraph(graph, 2, 1) Neo4jGraph.saveGraph(sc,graph,null,("REL2","test"),merge = true) assertEquals(42L,SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL2]->(:B) RETURN rel.test as prop").single().get("prop").asLong()) } @Test def saveGraphNodes { val nodes : RDD[(VertexId, Long)] = sc.makeRDD(Seq((source,10L),(target,20L))) val edges : RDD[Edge[Long]] = sc.makeRDD(Seq()) val graph = Graph[Long,Long](nodes,edges,-1) assertGraph(graph, 2, 0) Neo4jGraph.saveGraph(sc,graph,"prop") assertEquals(10L,SparkConnectorScalaSuiteIT.session().run(s"MATCH (a:A) WHERE id(a) = $source RETURN a.prop as prop").single().get("prop").asLong()) assertEquals(20L,SparkConnectorScalaSuiteIT.session().run(s"MATCH (b:B) WHERE id(b) = $target RETURN b.prop as prop").single().get("prop").asLong()) } }
Example 13
Source File: GraphProviders.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.loaders.csv.providers import ml.sparkling.graph.loaders.csv.types.Types import ml.sparkling.graph.loaders.csv.types.Types.ToVertexId import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.storage.StorageLevel import org.apache.spark.sql.SparkSession; import scala.reflect.ClassTag object GraphProviders { val defaultStorageLevel=StorageLevel.MEMORY_ONLY def simpleGraphBuilder[VD: ClassTag, ED: ClassTag](defaultVertex: Option[VD]=None, vertexProvider: Row => Seq[(VertexId, VD)], edgeProvider: Row => Seq[Edge[ED]], edgeStorageLevel: StorageLevel = defaultStorageLevel, vertexStorageLevel: StorageLevel =defaultStorageLevel) (dataFrame: DataFrame): Graph[VD, ED] = { def mapRows[MT: ClassTag](mappingFunction: (Row) => Seq[MT]): RDD[MT] = { dataFrame.rdd.mapPartitionsWithIndex((id, rowIterator) => { rowIterator.flatMap { case row => mappingFunction(row) } }) } val vertices: RDD[(VertexId, VD)] = mapRows(vertexProvider) val edges: RDD[Edge[ED]] = mapRows(edgeProvider) defaultVertex match{ case None => Graph(vertices,edges,edgeStorageLevel=edgeStorageLevel,vertexStorageLevel=vertexStorageLevel) case Some(defaultVertexValue)=> Graph(vertices,edges,defaultVertexValue,edgeStorageLevel,vertexStorageLevel) } } def indexedGraphBuilder[VD:ClassTag, ED: ClassTag](defaultVertex: Option[VD]=None, vertexProvider: (Row, ToVertexId[VD]) => Seq[(VertexId, VD)], edgeProvider: (Row, ToVertexId[VD]) => Seq[Edge[ED]], columnsToIndex: Seq[Int], edgeStorageLevel: StorageLevel = defaultStorageLevel, vertexStorageLevel: StorageLevel = defaultStorageLevel) (dataFrame: DataFrame): Graph[VD, ED] = { val index = dataFrame.rdd.flatMap(row => columnsToIndex.map(row(_))).distinct().zipWithUniqueId().collect().toMap def extractIdFromIndex(vertex: VD) = index(vertex) simpleGraphBuilder(defaultVertex, vertexProvider(_: Row, extractIdFromIndex _), edgeProvider(_: Row, extractIdFromIndex _), edgeStorageLevel, vertexStorageLevel)(dataFrame) } }
Example 14
Source File: GraphMLLoader.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.loaders.graphml import com.databricks.spark.xml._ import ml.sparkling.graph.loaders.graphml.GraphMLFormat._ import ml.sparkling.graph.loaders.graphml.GraphMLTypes.TypeHandler import org.apache.spark.SparkContext import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext, SparkSession} import scala.collection.mutable import scala.util.Try def loadGraphFromML(path: String)(implicit sc: SparkContext): Graph[ValuesMap, ValuesMap] = { val sparkSession=SparkSession.builder().getOrCreate(); val graphDataFrame = sparkSession.sqlContext.read .format("com.databricks.spark.xml") .option("attributePrefix","@") .option("valueTag","#VALUE") .option("rowTag",graphTag).load(path).rdd val keys =sparkSession.sqlContext.read .format("com.databricks.spark.xml") .option("attributePrefix","@") .option("valueTag","#VALUE") .option("rowTag",graphMLTag).load(path).rdd .flatMap(r => Try(r.getAs[mutable.WrappedArray[Row]](keyTag).toArray).getOrElse(Array.empty)) val nodesKeys = keys .filter(r => r.getAs[String](forAttribute) == nodeTag) val edgeKeys = keys .filter(r => r.getAs[String](forAttribute) == edgeTag) val nodeAttrHandlers = createAttrHandlersFor(nodesKeys) val edgeAttrHandlers = createAttrHandlersFor(edgeKeys) val verticesWithData = graphDataFrame.flatMap(r => r.getAs[Any](nodeTag) match { case data: mutable.WrappedArray[Row@unchecked] => data.array case data: Row => Array(data) }) val verticesIndex = verticesWithData.map(r => r.getAs[String](idAttribute)).zipWithUniqueId().collect().toMap val vertices: RDD[(VertexId, Map[String, Any])] = verticesWithData .map( r => (verticesIndex(r.getAs[String](idAttribute)), extractAttributesMap(nodeAttrHandlers, r)) ) val edgesRows = graphDataFrame.flatMap(r => r.getAs[Any](edgeTag) match { case data: mutable.WrappedArray[Row@unchecked] => data.array case data: Row => Array(data) }) .map(r => Edge( verticesIndex(r.getAs[String](sourceAttribute)), verticesIndex(r.getAs[String](targetAttribute)), extractAttributesMap(edgeAttrHandlers, r) )) Graph(vertices, edgesRows) } def extractAttributesMap(attrHandlers: Map[String, GraphMLAttribute], r: Row): Map[String, Any] = { Try(r.getAs[mutable.WrappedArray[Row]](dataTag)).toOption.map( _.map(r => { val attribute = attrHandlers(r.getAs[String](keyAttribute)) (attribute.name, attribute.handler(r.getAs[String](tagValue))) }).toMap ).getOrElse(Map.empty) + ("id" -> r.getAs[String](idAttribute)) } def createAttrHandlersFor(keys: RDD[Row]): Map[String, GraphMLAttribute] = { keys .map(r => (r.getAs[String](idAttribute), GraphMLAttribute(r.getAs[String](nameAttribute), GraphMLTypes(r.getAs[String](typeAttribute))))) .collect().toMap } }
Example 15
Source File: PSCAN$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.algorithms.community.pscan import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.ComponentID import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ import org.apache.spark.graphx.util.GraphGenerators class PSCAN$Test (implicit sc:SparkContext) extends MeasureTest { "Components for full graph" should " be computed" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes components") val components: Graph[ComponentID, Int] = PSCAN.computeConnectedComponents(graph) Then("Should compute components correctly") components.vertices.map{case (vId,cId)=>cId}.distinct().collect().size should equal (1) graph.unpersist(true) } "Components for full graph" should " be computed using DSL" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes components") val components: Graph[ComponentID, Int] =graph.PSCAN() Then("Should compute components correctly") components.vertices.map{case (vId,cId)=>cId}.distinct().collect().size should equal (1) graph.unpersist(true) } "Components for ring graph" should " be computed" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes components") val components: Graph[ComponentID, Int] = PSCAN.computeConnectedComponents(graph) Then("Should compute components correctly") components.vertices.map{case (vId,cId)=>cId}.distinct().collect().size should equal (5) graph.unpersist(true) } "Components for 3 component graph" should " be computed" in{ Given("graph") val filePath = getClass.getResource("/graphs/coarsening_to_3") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes components") val components: Graph[ComponentID, Int] = PSCAN.computeConnectedComponents(graph) Then("Should compute components correctly") components.vertices.map{case (vId,cId)=>cId}.distinct().collect().size should equal (3) graph.unpersist(true) } "Dynamic components detection for 3 component graph" should " be computed" in{ Given("graph") val filePath = getClass.getResource("/graphs/coarsening_to_3") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes components") val (_,numberOfComponents)= PSCAN.computeConnectedComponentsUsing(graph,3) Then("Should compute components correctly") numberOfComponents should equal (3) graph.unpersist(true) } "Dynamic components detection for RMAT graph" should " be computed" in{ for(x<- 0 to 10){ Given("graph") val graph:Graph[Int,Int]=GraphGenerators.rmatGraph(sc,33,132) When("Computes components") val (_,numberOfComponents)= PSCAN.computeConnectedComponentsUsing(graph,24) Then("Should compute components correctly") numberOfComponents should equal (24l +- 5l) graph.unpersist(true) } } "Dynamic components detection for random graph" should " be computed" in{ Given("graph") val graph:Graph[Int,Int]=GraphGenerators.rmatGraph(sc,1000,10000) When("Computes components") val (_,numberOfComponents)= PSCAN.computeConnectedComponentsUsing(graph,24) Then("Should compute components correctly") numberOfComponents should equal (24l +- 5l) graph.unpersist(true) } }
Example 16
Source File: BasicLinkPredictor$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.algorithms.link import ml.sparkling.graph.operators.MeasureTest import ml.sparkling.graph.operators.measures.edge.CommonNeighbours import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class BasicLinkPredictor$Test (implicit sc:SparkContext) extends MeasureTest { "In open triad" should " propose to close it" in{ Given("graph") val filePath = getClass.getResource("/graphs/3_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes new links") val links = BasicLinkPredictor.predictLinks(graph,CommonNeighbours,0,true) Then("Should compute links correctly") links.collect() should equal(Array((1,3))) graph.unpersist(true) } "In open 4 nodes graph" should " propose to close it fully" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_open") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes new links") val links = graph.predictLinks(CommonNeighbours,1,true) Then("Should compute links correctly") links.collect().toSet should equal(Set((1,3),(2,4))) graph.unpersist(true) } }
Example 17
Source File: MeasureTest.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Graph, GraphLoader}
import org.scalatest._

abstract class MeasureTest(implicit sc: SparkContext)
  extends FlatSpec with BeforeAndAfterAll with GivenWhenThen with Matchers {

  def time[T](str: String)(thunk: => T): (T, Long) = {
    logger.info(s"$str...")
    val t1 = System.currentTimeMillis
    val x = thunk
    val t2 = System.currentTimeMillis
    val diff = t2 - t1
    logger.info(s"$diff ms")
    (x, diff)
  }

  val logger = Logger.getLogger(this.getClass)

  def loadGraph(file: String) = {
    val out: Graph[Int, Int] = GraphLoader.edgeListFile(sc, file.toString)
    out.vertices.setName(s"Graph vertices ${file}")
    out.edges.setName(s"Graph edges ${file}")
    out.triplets.setName(s"Graph triplets ${file}")
    out
  }
}
Example 18
Source File: Modularity$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.graph import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ import org.apache.spark.graphx.util.GraphGenerators class Modularity$Test (implicit sc:SparkContext) extends MeasureTest{ "Modularity for star graph in one community" should "be 0" in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) val graphComponents=graph.PSCAN(epsilon = 0) When("Computes Modularity") val result=Modularity.compute(graphComponents) Then("Should calculate Modularity") result should be (0) graph.unpersist(true) } "Modularity for ring graph in one community" should "be 0" in{ Given("graph") val graph=GraphGenerators.gridGraph(sc,5,5).mapEdges((_)=>1).mapVertices((_,_)=>1) val graphComponents=graph.PSCAN(epsilon = 0) When("Computes Modularity") val result=Modularity.compute(graphComponents) Then("Should calculate Modularity") result should be (0) graph.unpersist(true) } "Modularity for ring graph in one node communities" should "be -0.041875" in{ Given("graph") val graph=GraphGenerators.gridGraph(sc,5,5) val graphComponents=graph.PSCAN(epsilon = 1) When("Computes Modularity") val result=Modularity.compute(graphComponents) Then("Should calculate Modularity") result should be (-0.041875 +- 0.000000001) graph.unpersist(true) } "Modularity for star graph in one community" should "be 0 when calculated using DSL" in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) val graphComponents=graph.PSCAN(epsilon = 0) When("Computes Modularity") val result=graphComponents.modularity() Then("Should calculate Modularity") result should be (0) graph.unpersist(true) } "Modularity for all single components" should "be -1 " in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) val graphComponents=graph.PSCAN(epsilon=1) When("Computes Modularity") val result=graphComponents.modularity() Then("Should calculate Modularity") result should be (-0.3 +- 0.000000001) graph.unpersist(true) } }
Example 19
Source File: FreemanCentrality$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.graph import ml.sparkling.graph.operators.MeasureTest import ml.sparkling.graph.operators.OperatorsDSL._ import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph class FreemanCentrality$Test (implicit sc:SparkContext) extends MeasureTest { "Freeman Centrality for star graph" should "be 1" in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Freemans Centrality") val result=FreemanCentrality.compute(graph) Then("Should calculate Freemans Centrality") result should be (1) graph.unpersist(true) } "Freeman Centrality for star graph" should "be 1 when calculated using DSL" in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Freemans Centrality") val result=graph.freemanCentrality() Then("Should calculate Freemans Centrality") result should be (1) graph.unpersist(true) } "Freeman Centrality for 5 node line graph" should "be 0.167" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Freemans Centrality") val result=FreemanCentrality.compute(graph) Then("Should calculate Freemans Centrality") result should be (0.16666666 +- 1e-5) graph.unpersist(true) } }
Example 20
Source File: Hits$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.hits import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class Hits$Test(implicit sc:SparkContext) extends MeasureTest { "Hits for line graph" should "be correctly calculated" in { Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes Hits") val result = Hits.computeBasic(graph) Then("Should calculate hits correctly") result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array( (0.25,0d), (0.25,0.25),(0.25,0.25),(0.25,0.25),(0d,0.25) )).foreach { case ((a,b),(c,d)) => { a should be (c +- 1e-5) b should be (d +- 1e-5) } } graph.unpersist(true) } "Hits for line graph" should "be correctly calculated using DSL" in { Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes Hits") val result = graph.hits() Then("Should calculate hits correctly") result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array( (0.25,0d), (0.25,0.25),(0.25,0.25),(0.25,0.25),(0d,0.25) )).foreach { case ((a,b),(c,d)) => { a should be (c +- 1e-5) b should be (d +- 1e-5) } } graph.unpersist(true) } "Hits for full 4 node directed graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Hits") val result=Hits.computeBasic(graph) Then("Should calculate Hits correctly") result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array( (0.44504187450168503,0.19806226306818242), (0.19806226497496957,0.4450418674109515), (1.9336832073590722e-13,0.3568958695205176), (0.35689586676523016,3.484376742610991e-13) )).foreach { case ((a,b),(c,d)) => { a should be (c +- 1e-5) b should be (d +- 1e-5) } } graph.unpersist(true) } }
Example 21
Source File: Neo4jPersistence.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.persistence import java.io.{File, PrintWriter} import edu.msstate.dasi.csb.model.{EdgeData, VertexData} import edu.msstate.dasi.csb.util.Util import org.apache.hadoop.fs.FileUtil import org.apache.spark.graphx.Graph object Neo4jPersistence extends GraphPersistence { private val vertices_suffix = "_nodes" private val edges_suffix = "_relationships" def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite :Boolean = false): Unit = { val verticesPath = graphName + vertices_suffix val verticesTmpPath = "__" + verticesPath val edgesPath = graphName + edges_suffix val edgesTmpPath = "__" + edgesPath if (overwrite) { FileUtil.fullyDelete(new File(verticesPath + "-header")) FileUtil.fullyDelete(new File(verticesPath)) FileUtil.fullyDelete(new File(edgesPath + "-header")) FileUtil.fullyDelete(new File(edgesPath)) } val nodeHeader = s"name:ID($graphName),:LABEL\n" val nodeHeaderWriter = new PrintWriter(new File(verticesPath + "-header")) nodeHeaderWriter.write(nodeHeader) nodeHeaderWriter.close() graph.vertices.map { case (id, _) => s"$id,$graphName" }.saveAsTextFile(verticesTmpPath) Util.merge(verticesTmpPath, verticesPath) FileUtil.fullyDelete(new File(verticesTmpPath)) val relationshipHeader = s":START_ID($graphName),:END_ID($graphName),:TYPE,${EdgeData.neo4jCsvHeader}\n" val relHeaderWriter = new PrintWriter(new File(edgesPath + "-header")) relHeaderWriter.write(relationshipHeader) relHeaderWriter.close() graph.edges.map(edge => edge.attr match { case edgeData: EdgeData => s"${edge.srcId},${edge.dstId},EDGE,${edgeData.toCsv}" case _ => s"${edge.srcId},${edge.dstId},EDGE" } ).saveAsTextFile(edgesTmpPath) Util.merge(edgesTmpPath, edgesPath) FileUtil.fullyDelete(new File(edgesTmpPath)) } }
Example 22
Source File: ClosenessCentrality.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.spark

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.{EdgeDirection, Graph, VertexId}

import scala.collection.mutable
import scala.reflect.ClassTag

// Excerpt: the enclosing class declaration (which provides `vertex`) is omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  getClosenessOfVert(vertex, graph)
}

private class DistanceNodePair(var distance: Long, var totalPairs: Long) extends Comparable[DistanceNodePair] {
  override def compareTo(dp: DistanceNodePair): Int = (this.distance - dp.distance).toInt
}

private class NodeVisitCounter extends java.io.Serializable {
  var totalPairs: Long = _
  var levelSize: mutable.HashMap[Long, Long] = _ // first is distance, second is pairs at that distance
}

private def BFSNode[VD: ClassTag, ED: ClassTag](nID: Long, graph: Graph[VD, ED]): NodeVisitCounter = {
  val q = new mutable.Queue[Long]()
  q.enqueue(nID)
  val visited = new mutable.HashSet[VertexId]()
  val levelSize = new mutable.HashMap[Long, Long]()
  visited.add(nID)
  var totalPairs: Long = 0
  val visitCounter = new NodeVisitCounter()
  var level = 0

  while (q.nonEmpty) {
    val size = q.size
    totalPairs += size
    if (level != 0) {
      levelSize.put(level, size)
    }
    val list: Array[Long] = new Array[Long](size)
    for (x <- 0 until size) {
      list(x) = q.dequeue()
    }
    var children: Array[VertexId] = null
    if (list.length > 0) {
      for (x <- list) {
        val node: VertexId = x
        if (graph.collectNeighborIds(EdgeDirection.Out).lookup(node).nonEmpty) {
          children = graph.collectNeighborIds(EdgeDirection.Out).lookup(node).head
          // children = hashmap.value.get(x).head
          for (c: Long <- children) {
            // val childNode = graph.vertices.lookup(c) //hashmap.value.get(c).head
            if (!visited.contains(c)) {
              q.enqueue(c)
              visited.add(c)
            }
          }
        }
      }
    }
    level += 1
  }

  totalPairs -= 1
  visitCounter.levelSize = levelSize
  visitCounter.totalPairs = totalPairs
  visitCounter
}

private def getClosenessOfVert[VD: ClassTag, ED: ClassTag](vertex: VertexId, graph: Graph[VD, ED]): Double = {
  val visitCenter = BFSNode(vertex, graph)

  var denominator: Long = 0L
  for (x <- visitCenter.levelSize.keySet) {
    denominator += visitCenter.levelSize.get(x).head * x
  }
  if (denominator == 0) return -1

  val count = graph.vertices.count().toDouble
  count / denominator
}
Example 23
Source File: SSSP.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.spark

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.{Graph, VertexId}

import scala.reflect.ClassTag

// Excerpt: the enclosing class declaration (which provides `src`) is omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  for (dst <- graph.vertices.keys.toLocalIterator) {
    bfs(graph, src, dst)
  }
}

private def bfs[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], src: VertexId, dst: VertexId): Unit = {
  // if (src == dst) return List(src)
  if (src == dst) return

  // The attribute of each vertex is (dist from src, id of vertex with dist-1)
  var g: Graph[(Int, VertexId), ED] =
    graph.mapVertices((id, _) => (if (id == src) 0 else Int.MaxValue, 0L)).cache()

  // Traverse forward from src
  var dstAttr = (Int.MaxValue, 0L)
  while (dstAttr._1 == Int.MaxValue) {
    val msgs = g.aggregateMessages[(Int, VertexId)](
      e =>
        if (e.srcAttr._1 != Int.MaxValue && e.srcAttr._1 + 1 < e.dstAttr._1) {
          e.sendToDst((e.srcAttr._1 + 1, e.srcId))
        },
      (a, b) => if (a._1 < b._1) a else b).cache()

    // if (msgs.count == 0) return List.empty
    if (msgs.count == 0) return

    g = g.ops.joinVertices(msgs) { (_, oldAttr, newAttr) =>
      if (newAttr._1 < oldAttr._1) newAttr else oldAttr
    }.cache()

    dstAttr = g.vertices.filter(_._1 == dst).first()._2
  }

  // Traverse backward from dst and collect the path
  var path: List[VertexId] = dstAttr._2 :: dst :: Nil
  while (path.head != src) {
    path = g.vertices.filter(_._1 == path.head).first()._2._2 :: path
  }
  // path
}
Example 24
package edu.msstate.dasi.csb.workload.spark

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.{Graph, VertexId}

import scala.reflect.ClassTag

// Excerpt: the source-file header and the enclosing class declaration (which provides `src`
// and `dst`) are omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  // if (src == dst) return List(src)
  if (src == dst) return

  // The attribute of each vertex is (dist from src, id of vertex with dist-1)
  var g: Graph[(Int, VertexId), ED] =
    graph.mapVertices((id, _) => (if (id == src) 0 else Int.MaxValue, 0L)).cache()

  // Traverse forward from src
  var dstAttr = (Int.MaxValue, 0L)
  while (dstAttr._1 == Int.MaxValue) {
    val msgs = g.aggregateMessages[(Int, VertexId)](
      e =>
        if (e.srcAttr._1 != Int.MaxValue && e.srcAttr._1 + 1 < e.dstAttr._1) {
          e.sendToDst((e.srcAttr._1 + 1, e.srcId))
        },
      (a, b) => if (a._1 < b._1) a else b).cache()

    // if (msgs.count == 0) return List.empty
    if (msgs.count == 0) return

    g = g.ops.joinVertices(msgs) { (_, oldAttr, newAttr) =>
      if (newAttr._1 < oldAttr._1) newAttr else oldAttr
    }.cache()

    dstAttr = g.vertices.filter(_._1 == dst).first()._2
  }

  // Traverse backward from dst and collect the path
  var path: List[VertexId] = dstAttr._2 :: dst :: Nil
  while (path.head != src) {
    path = g.vertices.filter(_._1 == path.head).first()._2._2 :: path
  }
  // path
}
Example 25
Source File: ConnectedComponents.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.neo4j

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

// Excerpt: the enclosing class declaration (which provides `engine`) is omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  val query = "MATCH (n) WITH COLLECT(n) as nodes " +
    "RETURN REDUCE(graphs = [], n in nodes | " +
    "case when " +
    "ANY (g in graphs WHERE shortestPath( (n)-[*]-(g) ) ) " +
    "then graphs " +
    "else graphs + [n]" +
    "end );"

  engine.run(query)
}
Example 26
Source File: BetweennessCentrality.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.neo4j

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

// Excerpt: the enclosing class declaration (which provides `engine` and `hops`) is omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  val query = s"MATCH (n), pthroughn = shortestPath((a)-[*..$hops]->(b)) " +
    "WHERE n IN nodes(pthroughn) AND n <> a AND n <> b AND a <> b " +
    "WITH n,a,b,count(pthroughn) AS sumn " +
    s"MATCH p = shortestPath((a)-[*..$hops]->(b)) " +
    "WITH n, a, b, tofloat(sumn)/ tofloat(count(p)) AS fraction " +
    "RETURN n, sum(fraction);"

  engine.run(query)
}
Example 27
Source File: StronglyConnectedComponents.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.neo4j

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

// Excerpt: the enclosing class declaration (which provides `engine`) is omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  val query = "MATCH (n) " +
    "WITH COLLECT(n) as nodes " +
    "RETURN REDUCE(graphs = [], n in nodes | " +
    "case when " +
    "ANY (g in graphs WHERE (shortestPath( (n)-[*]->(g) ) AND shortestPath( (n)<-[*]-(g) ) ) ) " +
    "then graphs " +
    "else graphs + [n] " +
    "end ) "

  engine.run(query)
}
Example 28
Source File: PageRank.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.neo4j

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

// Excerpt: the enclosing class declaration (which provides `engine`) is omitted in the original listing.

def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
  val query = "MATCH (a) " +
    "set a.pagerank = 0.0 " +
    "WITH collect(distinct a) AS nodes,count(a) as num_nodes " +
    "UNWIND nodes AS a " +
    "MATCH (a)-[r]-(b) " +
    "WITH a,collect(r) AS rels, count(r) AS num_rels, 1.0/num_nodes AS rank " +
    "UNWIND rels AS rel " +
    "SET endnode(rel).pagerank = " +
    "CASE " +
    "WHEN num_rels > 0 AND id(startnode(rel)) = id(a) THEN " +
    "endnode(rel).pagerank + rank/(num_rels) " +
    "ELSE endnode(rel).pagerank " +
    "END " +
    ",startnode(rel).pagerank = " +
    "CASE " +
    "WHEN num_rels > 0 AND id(endnode(rel)) = id(a) THEN " +
    "startnode(rel).pagerank + rank/(num_rels) " +
    "ELSE startnode(rel).pagerank " +
    "END " +
    "WITH collect(distinct a) AS a,rank " +
    "RETURN a"

  engine.run(query)
}
Example 29
Source File: SparkPersistence.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.persistence import java.io.File import edu.msstate.dasi.csb.model.{EdgeData, VertexData} import edu.msstate.dasi.csb.sc import edu.msstate.dasi.csb.util.Util import org.apache.hadoop.fs.FileUtil import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.storage.StorageLevel object SparkPersistence extends GraphPersistence { private val vertices_suffix = "_vertices" private val edges_suffix = "_edges" def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite: Boolean = false): Unit = { val verticesPath = graphName + vertices_suffix val verticesTmpPath = "__" + verticesPath val edgesPath = graphName + edges_suffix val edgesTmpPath = "__" + edgesPath if (overwrite) { FileUtil.fullyDelete(new File(verticesPath)) FileUtil.fullyDelete(new File(edgesPath)) } graph.vertices.saveAsTextFile(verticesTmpPath) Util.merge(verticesTmpPath, verticesPath) FileUtil.fullyDelete(new File(verticesTmpPath)) graph.edges.saveAsTextFile(edgesTmpPath) Util.merge(edgesTmpPath, edgesPath) FileUtil.fullyDelete(new File(edgesTmpPath)) } }
Example 30
Source File: LocalClustering$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.clustering import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class LocalClustering$Test(implicit sc:SparkContext) extends MeasureTest { "Local clustering for line graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes local clustering") val localClustering=LocalClustering.compute(graph) Then("Should calculate local clustering correctly") val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById should equal (Array( (1,0.0), (2,0.0), (3,0.0), (4,0.0), (5,0.0) )) graph.unpersist(true) } "Local clustering for line graph" should "be correctly calculated using DSL" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes local clustering") val localClustering=graph.localClustering() Then("Should calculate local clustering correctly") val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById should equal (Array( (1,0.0), (2,0.0), (3,0.0), (4,0.0), (5,0.0) )) graph.unpersist(true) } "Local clustering for full directed graph " should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes local clustering") val localClustering=LocalClustering.compute(graph) Then("Should calculate local clustering correctly") val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById should equal (Array( (1,0.5), (2,0d), (3,0d), (4,0.5) )) graph.unpersist(true) } "Local clustering for full undirected graph " should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes local clustering") val localClustering=LocalClustering.compute(graph,VertexMeasureConfiguration[Int,Int](true)) Then("Should calculate local clustering correctly") val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById should equal (Array( (1,1), (2,1), (3,1), (4,1) )) graph.unpersist(true) } "Local clustering for full directed graph " should "be correctly calculated using iterative approach" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes local clustering") val localClustering=LocalClustering.compute(graph) val localClusteringIterative=LocalClustering.compute(graph,VertexMeasureConfiguration[Int,Int]((g:Graph[Int,Int])=>1l)) Then("Should calculate local clustering correctly") val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById should equal (localClusteringIterative.vertices.collect().sortBy{case (vId,data)=>vId}) graph.unpersist(true) } }
Example 31
Source File: GraphSynth.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.data.synth

import edu.msstate.dasi.csb.data.distributions.DataDistributions
import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import edu.msstate.dasi.csb.sc
import edu.msstate.dasi.csb.util.Util
import org.apache.spark.graphx.Graph

// Excerpt: the enclosing trait/class declaration and the genGraph/genProperties members it
// relies on are omitted in the original listing.

def synthesize(seed: Graph[VertexData, EdgeData], seedDists: DataDistributions,
               withProperties: Boolean): Graph[VertexData, EdgeData] = {
  var synth = null.asInstanceOf[Graph[VertexData, EdgeData]]

  Util.time("Gen Graph", {
    synth = genGraph(seed, seedDists)
    println("Vertices #: " + synth.numVertices + ", Edges #: " + synth.numEdges)
  })

  if (withProperties) {
    Util.time("Gen Properties", {
      synth = genProperties(synth, seedDists)
      println("Vertices #: " + synth.numVertices + ", Edges #: " + synth.numEdges)
    })
  }

  synth
}
Example 32
Source File: PageRank.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.graphx import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object PageRank { def main(args: Array[String]): Unit = { val spark = SparkSession .builder() .appName("PageRank") .getOrCreate() val sc = spark.sparkContext // build vertices val users: RDD[(VertexId, Array[String])] = sc.parallelize(List( "1,BarackObama,Barack Obama", "2,ladygaga,Goddess of Love", "3,jeresig,John Resig", "4,justinbieber,Justin Bieber", "6,matei_zaharia,Matei Zaharia", "7,odersky,Martin Odersky", "8,anonsys" ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail))) // build edges val followers: RDD[Edge[Double]] = sc.parallelize(Array( Edge(2L, 1L, 1.0), Edge(4L, 1L, 1.0), Edge(1L, 2L, 1.0), Edge(6L, 3L, 1.0), Edge(7L, 3L, 1.0), Edge(7L, 6L, 1.0), Edge(6L, 7L, 1.0), Edge(3L, 7L, 1.0) )) // build graph val followerGraph: Graph[Array[String], Double] = Graph(users, followers) // restrict the graph to users with usernames and names val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2) // compute PageRank val pageRankGraph = subgraph.pageRank(0.001) // get attributes of the top pagerank users val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) { case (uid, attrList, Some(pr)) => (pr, attrList.toList) case (uid, attrList, None) => (0.0, attrList.toList) } println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n")) } }
Example 33
Source File: PageRank.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.graphx import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object PageRank { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PageRank") val sc = new SparkContext(conf) // build vertices val users: RDD[(VertexId, Array[String])] = sc.parallelize(List( "1,BarackObama,Barack Obama", "2,ladygaga,Goddess of Love", "3,jeresig,John Resig", "4,justinbieber,Justin Bieber", "6,matei_zaharia,Matei Zaharia", "7,odersky,Martin Odersky", "8,anonsys" ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail))) // build edges val followers: RDD[Edge[Double]] = sc.parallelize(Array( Edge(2L, 1L, 1.0), Edge(4L, 1L, 1.0), Edge(1L, 2L, 1.0), Edge(6L, 3L, 1.0), Edge(7L, 3L, 1.0), Edge(7L, 6L, 1.0), Edge(6L, 7L, 1.0), Edge(3L, 7L, 1.0) )) // build graph val followerGraph: Graph[Array[String], Double] = Graph(users, followers) // restrict the graph to users with usernames and names val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2) // compute PageRank val pageRankGraph = subgraph.pageRank(0.001) // get attributes of the top pagerank users val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) { case (uid, attrList, Some(pr)) => (pr, attrList.toList) case (uid, attrList, None) => (0.0, attrList.toList) } println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n")) } }
Example 34
Source File: GodwinTest.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.timeseries.graph import io.gzet.test.SparkFunSuite import org.apache.log4j.{Logger, Level} import org.apache.spark.graphx.{Graph, Edge} import org.apache.spark.rdd.RDD import scala.io.Source class GodwinTest extends SparkFunSuite { Logger.getLogger("akka").setLevel(Level.OFF) Logger.getLogger("org").setLevel(Level.OFF) def buildEdges() = { Source.fromInputStream(getClass.getResourceAsStream("/edges.csv")).getLines().drop(1).map(s => { val Array(source, target, weight) = s.split(",") Edge(source.toLong, target.toLong, weight.toDouble) }).toList } localTest("Test Random Walks") { sc => val edges: RDD[Edge[Double]] = sc.parallelize(buildEdges(), 1) val godwin = new Godwin(Seq(16)) val walks = godwin.randomWalks(Graph.fromEdges(edges, 0L), 4).collect().sortBy(_._2) println(walks.map(_._1).mkString(" -> ")) walks.last._1 should be(16) } }
Example 35
Source File: GzetCommunitiesTest.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.community import io.gzet.community.clustering.wcc.WCCDetection import io.gzet.test.SparkFunSuite import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx.{Graph, Edge} import scala.io.Source class GzetCommunitiesTest extends SparkFunSuite { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) localTest("WCC communities") { spark => val lines = Source.fromInputStream(getClass.getResourceAsStream("/local-edges.csv")).getLines().zipWithIndex.filter(_._2 > 0).map(_._1).toSeq val sc = spark.sparkContext val edges = sc.parallelize(lines).map({ line => val a = line.split(",").map(_.toLong).sorted Edge(a.head, a.last, 1L) }).distinct() val graph = Graph.fromEdges(edges, 0L) graph.triplets.take(2).foreach(println) val communities = new WCCDetection(1).run(graph, sc) communities.map(_._2 -> 1).reduceByKey(_+_).collectAsMap() should be(Map(5L -> 5, 15L -> 6, 21L -> 5)) } }
Example 36
Source File: StoryBatchDedup.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.story import io.gzet.story.model.{Content, Article} import org.apache.spark.graphx.{Graph, Edge} import org.apache.spark.{Logging, SparkConf, SparkContext} import io.gzet.story.util.SimhashUtils._ import com.datastax.spark.connector._ object StoryBatchDedup extends SimpleConfig with Logging { def main(args: Array[String]): Unit = { val sparkConf = new SparkConf().setAppName("Story Extractor") val sc = new SparkContext(sparkConf) val simhashRDD = sc.cassandraTable[Article]("gzet", "articles").zipWithIndex().map({ case (a, id) => ((id, Content(a.url, a.title, a.body)), a.hash) }) Set(0) val duplicateTupleRDD = simhashRDD.flatMap({ case ((id, content), simhash) => searchmasks.map({ mask => (simhash ^ mask, id) }) }).groupByKey() val edgeRDD = duplicateTupleRDD.values.flatMap({ it => val list = it.toList for (x <- list; y <- list) yield (x, y) }).filter({ case (x, y) => x != y }).distinct().map({case (x, y) => Edge(x, y, 0) }) val duplicateRDD = Graph.fromEdges(edgeRDD, 0L) .connectedComponents() .vertices .join(simhashRDD.keys) .values duplicateRDD.sortBy(_._1).collect().foreach({ case (story, content) => println(story + "\t" + content.title) }) } }
Example 37
Source File: PlaylistBuilder.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.recommender import com.datastax.spark.connector._ import com.typesafe.config.Config import io.gzet.recommender.Config._ import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import spark.jobserver._ object PlaylistBuilder extends SparkJob with NamedRddSupport { override def runJob(sc: SparkContext, conf: Config): Any = { val recordRDD = sc.cassandraTable[Record](KEYSPACE, TABLE_RECORD) val hashRDD = sc.cassandraTable[Hash](KEYSPACE, TABLE_HASH) val minSimilarityB = sc.broadcast(MIN_SIMILARITY) val songIdsB = sc.broadcast(recordRDD.map(r => (r.id, r.name)).collectAsMap()) implicit class Crossable[X](xs: Traversable[X]) { def cross[Y](ys: Traversable[Y]) = for { x <- xs; y <- ys } yield (x, y) } val songHashRDD = hashRDD flatMap { hash => hash.songs map { song => ((hash, song), 1) } } val songTfRDD = songHashRDD map { case ((hash, songId), count) => (songId, count) } reduceByKey(_+_) val songTfB = sc.broadcast(songTfRDD.collectAsMap()) val crossSongRDD = songHashRDD.keys.groupByKey().values flatMap { songIds => songIds cross songIds filter { case (from, to) => from != to } map(_ -> 1) } reduceByKey(_+_) map { case ((from, to), count) => val weight = count.toDouble / songTfB.value.getOrElse(from, 1) org.apache.spark.graphx.Edge(from, to, weight) } filter { edge => edge.attr > minSimilarityB.value } val graph = Graph.fromEdges(crossSongRDD, 0L) val prGraph = graph.pageRank(TOLERANCE, TELEPORT) val edges = prGraph.edges.map({ edge => (edge.srcId, (edge.dstId, edge.attr)) }).groupByKey().map({case (srcId, it) => val dst = it.toList val dstIds = dst.map(_._1.toString) val weights = dst.map(_._2.toString) Edge(srcId, dstIds, weights) }) val vertices = prGraph.vertices.mapPartitions({ vertices => val songIds = songIdsB.value vertices map { case (vId, pr) => Node(vId, songIds.getOrElse(vId, "UNKNOWN"), pr) } }) edges.saveAsCassandraTable(KEYSPACE, TABLE_EDGE) vertices.saveAsCassandraTable(KEYSPACE, TABLE_NODE) this.namedRdds.update(RDD_EDGE, edges) this.namedRdds.update(RDD_NODE, vertices) } override def validate(sc: SparkContext, config: Config): SparkJobValidation = { SparkJobValid } }
Example 38
Source File: PersonalizedPlaylistBuilder.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.recommender import com.typesafe.config.Config import io.gzet.recommender.Config._ import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import spark.jobserver._ object PersonalizedPlaylistBuilder extends SparkJob with NamedRddSupport { override def runJob(sc: SparkContext, conf: Config): Any = { val id = conf.getLong("song.id") val edges = this.namedRdds.get[Edge](RDD_EDGE).get val nodes = this.namedRdds.get[Node](RDD_NODE).get val edgeRDD = edges.flatMap({e => e.targets.zip(e.weights).map({case (target, weight) => org.apache.spark.graphx.Edge(e.source, target.toLong, weight.toDouble) }) }) val songIdsB = sc.broadcast(nodes.map(n => (n.id, n.name)).collectAsMap()) val graph = Graph.fromEdges(edgeRDD, 0L) graph.cache() val prGraph = graph.personalizedPageRank(id, TOLERANCE, TELEPORT) prGraph.vertices.mapPartitions({ it => val songIds = songIdsB.value it map { case (vId, pr) => (vId, songIds.getOrElse(vId, "UNKNOWN"), pr) } }).sortBy(_._3, ascending = false).map(v => List(v._1, v._3, v._2).mkString(",")).collect() } override def validate(sc: SparkContext, config: Config): SparkJobValidation = { if(!config.hasPath("song.id")) return SparkJobInvalid("Missing parameter [song.id]") if(this.namedRdds.get[Edge](RDD_EDGE).isEmpty) return SparkJobInvalid("Missing RDD [edges]") if(this.namedRdds.get[Edge](RDD_NODE).isEmpty) return SparkJobInvalid("Missing RDD [nodes]") SparkJobValid } }
Example 39
Source File: EmployeeRelationship.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.graphx import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.rdd.RDD import org.apache.spark.graphx.{ Edge, Graph } object EmployeeRelationship { def main(args: Array[String]): Unit = { // vertex format: vertex_id, data val vertexArray = Array( (1L, ("John", "Software Developer")), (2L, ("Robert", "Technical Leader")), (3L, ("Charlie", "Software Architect")), (4L, ("David", "Software Developer")), (5L, ("Edward", "Software Development Manager")), (6L, ("Francesca", "Software Development Manager"))) // edge format: from_vertex_id, to_vertex_id, data val edgeArray = Array( Edge(2L, 1L, "Technical Mentor"), Edge(2L, 4L, "Technical Mentor"), Edge(3L, 2L, "Collaborator"), Edge(6L, 3L, "Team Member"), Edge(4L, 1L, "Peers"), Edge(5L, 2L, "Team Member"), Edge(5L, 3L, "Team Member"), Edge(5L, 6L, "Peers")) val sc = new SparkContext(new SparkConf().setAppName("EmployeeRelationshipJob")) val vertexRDD: RDD[(Long, (String, String))] = sc.parallelize(vertexArray) val edgeRDD: RDD[Edge[String]] = sc.parallelize(edgeArray) val graph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD) // Vanilla query println(">>> Showing the names of people who are Software Developers") graph.vertices.filter { case (id, (name, designation)) => designation.equals("Software Developer") } .collect() .foreach { case (id, (name, designation)) => println(s"... Name: $name, Designation: $designation") } // Connection analysis println(">>> People connected to Robert (Technical Leader) -> ") graph.triplets.filter(_.srcId == 2).collect() .foreach { item => println("... " + item.dstAttr._1 + ", " + item.dstAttr._2) } println(">>> Robert (Technical Leader) connected to -> ") graph.triplets.filter(_.dstId == 2).collect() .foreach { item => println("... " + item.srcAttr._1 + ", " + item.srcAttr._2) } println(">>> Technical Mentoring Analysis -> ") graph.triplets.filter(_.attr.equals("Technical Mentor")).collect() .foreach { item => println("... " + item.srcAttr._1 + " mentoring " + item.dstAttr._1) } } }
Example 40
Source File: SSSPExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.graphx // $example on$ import org.apache.spark.graphx.{Graph, VertexId} import org.apache.spark.graphx.util.GraphGenerators // $example off$ import org.apache.spark.sql.SparkSession object SSSPExample { def main(args: Array[String]): Unit = { // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") .getOrCreate() val sc = spark.sparkContext // $example on$ // A graph with edge attributes containing distances val graph: Graph[Long, Double] = GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble) val sourceId: VertexId = 42 // The ultimate source // Initialize the graph such that all vertices except the root have distance infinity. val initialGraph = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity) val sssp = initialGraph.pregel(Double.PositiveInfinity)( (id, dist, newDist) => math.min(dist, newDist), // Vertex Program triplet => { // Send Message if (triplet.srcAttr + triplet.attr < triplet.dstAttr) { Iterator((triplet.dstId, triplet.srcAttr + triplet.attr)) } else { Iterator.empty } }, (a, b) => math.min(a, b) // Merge Message ) println(sssp.vertices.collect.mkString("\n")) // $example off$ spark.stop() } } // scalastyle:on println
Example 41
Source File: FindInfluencer.scala From spark-graphx-twitter with Apache License 2.0 | 5 votes |
package com.knoldus.spark.graphx.example import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} object FindInfluencer { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("Twittter Influencer").setMaster("local[*]") val sparkContext = new SparkContext(conf) sparkContext.setLogLevel("ERROR") val twitterData = sparkContext.textFile("src/main/resources/twitter-graph-data.txt") val followeeVertices: RDD[(VertexId, String)] = twitterData.map(_.split(",")).map { arr => val user = arr(0).replace("((", "") val id = arr(1).replace(")", "") (id.toLong, user) } val followerVertices: RDD[(VertexId, String)] = twitterData.map(_.split(",")).map { arr => val user = arr(2).replace("(", "") val id = arr(3).replace("))", "") (id.toLong, user) } val vertices = followeeVertices.union(followerVertices) val edges: RDD[Edge[String]] = twitterData.map(_.split(",")).map { arr => val followeeId = arr(1).replace(")", "").toLong val followerId = arr(3).replace("))", "").toLong Edge(followeeId, followerId, "follow") } val defaultUser = ("") val graph = Graph(vertices, edges, defaultUser) val subGraph = graph.pregel("", 2, EdgeDirection.In)((_, attr, msg) => attr + "," + msg, triplet => Iterator((triplet.srcId, triplet.dstAttr)), (a, b) => (a + "," + b)) val lengthRDD = subGraph.vertices.map(vertex => (vertex._1, vertex._2.split(",").distinct.length - 2)).max()(new Ordering[Tuple2[VertexId, Int]]() { override def compare(x: (VertexId, Int), y: (VertexId, Int)): Int = Ordering[Int].compare(x._2, y._2) }) val userId = graph.vertices.filter(_._1 == lengthRDD._1).map(_._2).collect().head println(userId + " has maximum influence on network with " + lengthRDD._2 + " influencers.") sparkContext.stop() } }
Example 42
Source File: ZombieExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.graph import org.apache.log4j.{Level, Logger} import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, _} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object ZombieExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val vertexJsonFile = args(0) val edgeJsonFile = args(1) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") import sparkSession.implicits._ val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex] val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge] val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => { (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive)) }) val edgeRdd = edgeDs.rdd.map(r => { new Edge[String](r.src, r.dst, r.edge_type) }) val defaultUser = new ZombieStats(false, 0) val graph = Graph(vectorRdd, edgeRdd, defaultUser) val zombieResults = graph.pregel[Long](0, 30, EdgeDirection.Either)( (vertexId, zombieState, message) => { if (message > 0 && !zombieState.isZombie) { new ZombieStats(true, message) } else { zombieState } }, triplet => { if (triplet.srcAttr.isZombie && !triplet.dstAttr.isZombie) { Iterator((triplet.dstId, triplet.srcAttr.lengthOfLife + 1l)) } else if (triplet.dstAttr.isZombie && !triplet.srcAttr.isZombie) { Iterator((triplet.srcId, triplet.dstAttr.lengthOfLife + 1l)) } else { Iterator.empty } }, (a, b) => Math.min(a, b)) println("ZombieBite") zombieResults.vertices.collect().sortBy(r => r._1).foreach(r => { println("vertexId:" + r._1 + ",ZombieStat:" + r._2) }) sparkSession.stop() } } case class ZombieStats (isZombie:Boolean, lengthOfLife:Long)
Example 43
Source File: PeriodicGraphCheckpointer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.impl import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.apache.spark.storage.StorageLevel private[mllib] class PeriodicGraphCheckpointer[VD, ED]( checkpointInterval: Int, sc: SparkContext) extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) { override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint() override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed override protected def persist(data: Graph[VD, ED]): Unit = { if (data.vertices.getStorageLevel == StorageLevel.NONE) { data.vertices.persist() } if (data.edges.getStorageLevel == StorageLevel.NONE) { data.edges.persist() } } override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false) override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = { data.getCheckpointFiles } }
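This helper is typically driven from an iterative graph algorithm: each new graph version is handed to update, and the checkpointer decides when to persist, checkpoint, and unpersist older versions. Below is a rough sketch of that pattern; it assumes the update and deleteAllCheckpoints methods inherited from PeriodicCheckpointer, a made-up checkpoint directory, and placement inside an org.apache.spark.mllib subpackage because the class is private[mllib]:

package org.apache.spark.mllib.example // assumed placement, since the class is private[mllib]

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.mllib.impl.PeriodicGraphCheckpointer

object GraphCheckpointerSketch {
  def iterate(sc: SparkContext, initial: Graph[Double, Double], iterations: Int): Graph[Double, Double] = {
    sc.setCheckpointDir("/tmp/graph-checkpoints") // assumed checkpoint location
    val checkpointer = new PeriodicGraphCheckpointer[Double, Double](3, sc) // checkpoint every 3rd update
    var graph = initial
    checkpointer.update(graph) // persist (and periodically checkpoint) the current graph
    for (_ <- 1 to iterations) {
      graph = graph.mapVertices((_, value) => value + 1.0) // placeholder per-iteration transformation
      checkpointer.update(graph)
    }
    checkpointer.deleteAllCheckpoints() // remove checkpoint files that are still on disk
    graph
  }
}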
Example 44
Source File: SSSPExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.graphx // $example on$ import org.apache.spark.graphx.{Graph, VertexId} import org.apache.spark.graphx.util.GraphGenerators // $example off$ import org.apache.spark.sql.SparkSession object SSSPExample { def main(args: Array[String]): Unit = { // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") .getOrCreate() val sc = spark.sparkContext // $example on$ // A graph with edge attributes containing distances val graph: Graph[Long, Double] = GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble) val sourceId: VertexId = 42 // The ultimate source // Initialize the graph such that all vertices except the root have distance infinity. val initialGraph = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity) val sssp = initialGraph.pregel(Double.PositiveInfinity)( (id, dist, newDist) => math.min(dist, newDist), // Vertex Program triplet => { // Send Message if (triplet.srcAttr + triplet.attr < triplet.dstAttr) { Iterator((triplet.dstId, triplet.srcAttr + triplet.attr)) } else { Iterator.empty } }, (a, b) => math.min(a, b) // Merge Message ) println(sssp.vertices.collect.mkString("\n")) // $example off$ spark.stop() } } // scalastyle:on println
Example 45
Source File: L10-9Graph.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.Graph.graphToGraphOps import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object UserRankApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: UserRankApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .foreachRDD(rdd => { val edges = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]]) }) .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0))) val vertices = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String]) }) .map(r => (r.hashCode.toLong, r)) val tolerance = 0.0001 val graph = Graph(vertices, edges, "defaultUser") .subgraph(vpred = (id, idStr) => idStr != "defaultUser") val pr = graph.pageRank(tolerance).cache graph.outerJoinVertices(pr.vertices) { (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs) }.vertices.top(10) { Ordering.by(_._2._1) }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1))) }) ssc.start() ssc.awaitTermination() } }
Example 46
Source File: EdgeAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7 import org.apache.spark.SparkContext import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class EdgeAPI extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("Should use Edge API") { //given val users: RDD[(VertexId, (String))] = spark.parallelize(Array( (1L, "a"), (2L, "b"), (3L, "c"), (4L, "d") )) val relationships = spark.parallelize(Array( Edge(1L, 2L, "friend"), Edge(1L, 3L, "friend"), Edge(2L, 4L, "wife") )) val graph = Graph(users, relationships) //when val res = graph.mapEdges(e => e.attr.toUpperCase) println(res.edges.collect().toList) } }
Example 47
Source File: VertexAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7 import org.apache.spark.SparkContext import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite class VertexAPI extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("Should use Vertex API") { //given val users: RDD[(VertexId, (String))] = spark.parallelize(Array( (1L, "a"), (2L, "b"), (3L, "c"), (4L, "d") )) val relationships = spark.parallelize(Array( Edge(1L, 2L, "friend"), Edge(1L, 3L, "friend"), Edge(2L, 4L, "wife") )) val graph = Graph(users, relationships) //when val res = graph.mapVertices((_, att) => att.toUpperCase()) res.vertices.collect().toList } }
Example 48
Source File: PeriodicGraphCheckpointer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.impl import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.apache.spark.storage.StorageLevel private[mllib] class PeriodicGraphCheckpointer[VD, ED]( checkpointInterval: Int, sc: SparkContext) extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) { override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint() override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed override protected def persist(data: Graph[VD, ED]): Unit = { if (data.vertices.getStorageLevel == StorageLevel.NONE) { data.vertices.persist() } if (data.edges.getStorageLevel == StorageLevel.NONE) { data.edges.persist() } } override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false) override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = { data.getCheckpointFiles } }
Example 49
Source File: LocalRunner.scala From spark-betweenness with Apache License 2.0 | 5 votes |
package com.centrality.kBC import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.VertexId import org.apache.spark.rdd.RDD object MainRunner { def main(args: Array[String]) { // Create spark context val appName="kBC" val sparkMode="local" val conf = new SparkConf().setAppName(appName).setMaster(sparkMode); val sc = new SparkContext(conf); // Create sample graph // // Create an RDD for vertices val users: RDD[(VertexId, (String, String))] = sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), (5L, ("franklin", "prof")), (2L, ("istoica", "prof")))) // Create an RDD for edges val relationships: RDD[Edge[String]] = sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"), Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"))) // Define a default user in case there are relationship with missing user val defaultUser = ("John Doe", "Missing") // Build the initial Graph val graph = Graph(users, relationships, defaultUser) val kBCGraph = KBetweenness.run(graph, 3) } }
Example 50
Source File: GraphFramesExample.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.graphs import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} //import org.graphframes._ object GraphFramesExample extends App { val conf = new SparkConf() .setAppName("RDD graph") .setMaster("local[4]") val sc = new SparkContext(conf) val vertices: RDD[(VertexId, String)] = sc.parallelize( Array((1L, "Anne"), (2L, "Bernie"), (3L, "Chris"), (4L, "Don"), (5L, "Edgar"))) val edges: RDD[Edge[String]] = sc.parallelize( Array(Edge(1L, 2L, "likes"), Edge(2L, 3L, "trusts"), Edge(3L, 4L, "believes"), Edge(4L, 5L, "worships"), Edge(1L, 3L, "loves"), Edge(4L, 1L, "dislikes"))) val friendGraph: Graph[String, String] = Graph(vertices, edges) // val friendGraphFrame = GraphFrame.fromGraphX(friendGraph) // // friendGraphFrame.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3)").filter( // "e1.attr = 'trusts' OR v3.attr = 'Chris'" // ).collect.foreach(println) }
Example 51
Source File: Gephi.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.utils import org.apache.spark.graphx.Graph object Gephi { def toGexf[VD, ED](g: Graph[VD, ED]): String = { val header = """<?xml version="1.0" encoding="UTF-8"?> |<gexf xmlns="http://www.gexf.net/1.2draft" version="1.2"> | <meta> | <description>A gephi graph in GEXF format</description> | </meta> | <graph mode="static" defaultedgetype="directed"> """.stripMargin val vertices = "<nodes>\n" + g.vertices.map( v => s"""<node id=\"${v._1}\" label=\"${v._2}\"/>\n""" ).collect.mkString + "</nodes>\n" val edges = "<edges>\n" + g.edges.map( e => s"""<edge source=\"${e.srcId}\" target=\"${e.dstId}\" label=\"${e.attr}\"/>\n""" ).collect.mkString + "</edges>\n" val footer = "</graph>\n</gexf>" header + vertices + edges + footer } }
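Since toGexf collects all vertices and edges to the driver into a single string, it is only practical for small graphs. A rough usage sketch (graph contents and output file name are illustrative):

import java.io.PrintWriter
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import com.github.maxpumperla.ml_spark.utils.Gephi

object GexfExportSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("gexf-export-sketch").setMaster("local[2]"))
    val vertices = sc.parallelize(Seq((1L, "Anne"), (2L, "Bernie"), (3L, "Chris")))
    val edges = sc.parallelize(Seq(Edge(1L, 2L, "likes"), Edge(2L, 3L, "trusts")))
    val graph: Graph[String, String] = Graph(vertices, edges)
    // Collects vertices and edges to the driver and writes them out as a GEXF document
    val writer = new PrintWriter("friend-graph.gexf") // assumed output path
    try writer.write(Gephi.toGexf(graph)) finally writer.close()
    sc.stop()
  }
}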
Example 52
Source File: CCGraphXDriver.scala From connected-component with MIT License | 5 votes |
package com.kwartile.lib.cc import org.apache.spark.graphx.{Edge, Graph} import org.apache.spark.{SparkConf, SparkContext} import scala.annotation.tailrec object CCGraphXDriver { @tailrec private def buildEdges(node: Long, neighbors:List[Long], partialPairs: List[Edge[Int]]) : List[Edge[Int]] = { if (neighbors.length == 0) { if (partialPairs != null) List(Edge(node, node, 1)) ::: partialPairs else List(Edge(node, node, 1)) } else if (neighbors.length == 1) { val neighbor = neighbors(0) if (node > neighbor) if (partialPairs != null) List(Edge(node, neighbor, 1)) ::: partialPairs else List(Edge(node, neighbor, 1)) else if (partialPairs != null) List(Edge(neighbor, node, 1)) ::: partialPairs else List(Edge(neighbor, node, 1)) } else { val newPartialPairs = neighbors.map(neighbor => { if (node > neighbor) List(Edge(node, neighbor, 1)) else List(Edge(neighbor, node, 1)) }).flatMap(x=>x) if (partialPairs != null) buildEdges(neighbors.head, neighbors.tail, newPartialPairs ::: partialPairs) else buildEdges(neighbors.head, neighbors.tail, newPartialPairs) } } private def buildEdges(nodes:List[Long]) : List[Edge[Int]] = { buildEdges(nodes.head, nodes.tail, null.asInstanceOf[List[Edge[Int]]]) } def main(args: Array[String]) = { val sparkConf = new SparkConf().setAppName("GraphXConnectedComponent") val sc = new SparkContext(sparkConf) val cliqueFile = args(0) val cliquesRec = sc.textFile(args(0)) val cliques = cliquesRec.map(x => { val nodes = x.split("\\s+").map(y => y.toLong).toList nodes }) val edges = cliques.map(aClique => { buildEdges(aClique) }).flatMap(x=>x) val graph = Graph.fromEdges(edges, 1) val cc = graph.connectedComponents().vertices println ("Count of Connected component: " + cc.count) } }
Example 53
Source File: PipeClusteringStrongestPath.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.clustering import scala.Iterator import org.apache.spark.graphx.Graph import org.apache.spark.graphx.VertexRDD import de.unihamburg.vsis.sddf.reading.Tuple class PipeClusteringStrongestPath extends PipeClusteringTransitiveClosure { override def manipulateGraph(graph: Graph[Tuple, Double]): Graph[_, Double] = { val cGraph = graph.mapVertices((vid, tuple) => (vid, Double.MinPositiveValue)) // attach the max adjacent edge attribute to each vertice val verticesMaxEdgeAttributes: VertexRDD[Double] = cGraph.mapReduceTriplets( edge => { Iterator((edge.dstId, edge.attr), (edge.srcId, edge.attr)) }, (a: Double, b: Double) => math.max(a, b) ) // join the resulting vertice attributes with the graph val maxGraph: Graph[(Tuple, Double), Double] = graph.outerJoinVertices(verticesMaxEdgeAttributes)((id, tuple, simOpt) => simOpt match { case Some(sim) => (tuple, sim) case None => (tuple, 0D) } ) // remove edges which have a max value less then src or dst val resultGraph = maxGraph.subgraph(edge => { if (edge.attr < edge.srcAttr._2 && edge.attr < edge.dstAttr._2) { false } else { true } }) resultGraph } } object PipeClusteringStrongestPath { def apply() = new PipeClusteringStrongestPath() }
Example 54
Source File: AbstractPipeClusteringGraph.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.clustering import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.VertexId import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.similarity.aggregator.Mean import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable abstract class AbstractPipeClusteringGraph extends PipeElement[RDD[(SymPair[Tuple], Array[Double])], RDD[Set[Tuple]]] with Serializable { def cluster(graph: Graph[Tuple, Double]): RDD[Set[Tuple]] def step(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): RDD[Set[Tuple]] = { val duplicatePairsWithSimilarity = input.map( pair => (pair._1, Mean.agrSimilarity(pair._2)) ) val edges: RDD[Edge[Double]] = duplicatePairsWithSimilarity.map( pair => { Edge(pair._1._1.id, pair._1._2.id, pair._2) } ) // TODO optimize: it would be nice to build the graph only by using edge triplets // but as far as I know that's not possible val verticesNotUnique: RDD[(VertexId, Tuple)] = duplicatePairsWithSimilarity.map(_._1).flatMap( tuplePair => Seq(tuplePair._1, tuplePair._2) ).map(tuple => (tuple.id, tuple)) // delete all duplicate vertices val vertices = verticesNotUnique.distinct() // The edge type Boolean is just a workaround because no edge types are needed val graph: Graph[Tuple, Double] = Graph.apply(vertices, edges, null) cluster(graph) } }
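Only the cluster method is left abstract; step builds the similarity graph and delegates to it. As an illustration (the class name is hypothetical and the strategy deliberately simple), a subclass could cluster by connected components, ignoring the per-edge similarity values:

package de.unihamburg.vsis.sddf.clustering

import org.apache.spark.graphx.Graph
import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.reading.Tuple

// Hypothetical subclass for illustration: clusters are the connected components
// of the similarity graph, regardless of the per-edge similarity values.
class PipeClusteringConnectedComponentsSketch extends AbstractPipeClusteringGraph {
  override def cluster(graph: Graph[Tuple, Double]): RDD[Set[Tuple]] = {
    val componentIds = graph.connectedComponents().vertices // (vertexId, componentId)
    componentIds.join(graph.vertices)                       // (vertexId, (componentId, tuple))
      .map { case (_, (componentId, tuple)) => (componentId, tuple) }
      .groupByKey()
      .map { case (_, tuples) => tuples.toSet }
  }
}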
Example 55
Source File: AffinityPropagationSuite.scala From SparkAffinityPropagation with MIT License | 5 votes |
package org.viirya.spark.ml import scala.collection.mutable import org.scalatest.{BeforeAndAfterAll, FunSuite, Suite} import org.viirya.spark.ml.AffinityPropagation._ import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.graphx.{Edge, Graph} class AffinityPropagationSuite extends FunSuite with BeforeAndAfterAll { self: Suite => @transient var sc: SparkContext = _ override def beforeAll() { super.beforeAll() val conf = new SparkConf() .setMaster("local[2]") .setAppName("AffinityPropagationUnitTest") sc = new SparkContext(conf) } override def afterAll() { try { if (sc != null) { sc.stop() } sc = null } finally { super.afterAll() } } test("affinity propagation") { val similarities = Seq[(Long, Long, Double)]( (0, 1, 1.0), (1, 0, 1.0), (0, 2, 1.0), (2, 0, 1.0), (0, 3, 1.0), (3, 0, 1.0), (1, 2, 1.0), (2, 1, 1.0), (2, 3, 1.0), (3, 2, 1.0)) val expected = Array( Array(0.0, 1.0/3.0, 1.0/3.0, 1.0/3.0), Array(1.0/2.0, 0.0, 1.0/2.0, 0.0), Array(1.0/3.0, 1.0/3.0, 0.0, 1.0/3.0), Array(1.0/2.0, 0.0, 1.0/2.0, 0.0)) val s = constructGraph(sc.parallelize(similarities, 2), true, false) s.edges.collect().foreach { case Edge(i, j, x) => assert(math.abs(x.similarity - expected(i.toInt)(j.toInt)) < 1e-14) } } }
Example 56
Source File: PeriodicGraphCheckpointer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.impl import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.apache.spark.storage.StorageLevel private[mllib] class PeriodicGraphCheckpointer[VD, ED]( checkpointInterval: Int, sc: SparkContext) extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) { override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint() override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed override protected def persist(data: Graph[VD, ED]): Unit = { if (data.vertices.getStorageLevel == StorageLevel.NONE) { data.vertices.persist() } if (data.edges.getStorageLevel == StorageLevel.NONE) { data.edges.persist() } } override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false) override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = { data.getCheckpointFiles } }
Example 57
Source File: BasicLinkPredictor.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.algorithms.link import ml.sparkling.graph.api.operators.algorithms.link.MeasureBasedLnkPredictor import ml.sparkling.graph.api.operators.measures.EdgeMeasure import org.apache.spark.graphx.Graph import scala.reflect.ClassTag object BasicLinkPredictor extends MeasureBasedLnkPredictor { override def predictLinks[V: ClassTag, E: ClassTag, EV: ClassTag, EO: ClassTag](graph: Graph[V, E], edgeMeasure: EdgeMeasure[EO, EV], threshold: EO, treatAsUndirected:Boolean=false)(implicit num: Numeric[EO]) = { val preprocessedGraph=edgeMeasure.preprocess(graph,treatAsUndirected) val allPossibleEdges = preprocessedGraph.vertices.cartesian(preprocessedGraph.vertices).filter{ case ((vId1,data1),(vId2,data2))=>vId1!=vId2 } val edgesAboveThreshold=allPossibleEdges.map{ case ((vId1,data1),(vId2,data2))=>(edgeMeasure.computeValue(data1,data2,treatAsUndirected),(vId1,vId2)) }.filter(t=>num.gt(t._1,threshold)).map(t=>(t._2,0)) val exsistingEdgesTuples=graph.edges.map(e=>((e.srcId,e.dstId),0)) val newEdges=edgesAboveThreshold.leftOuterJoin(exsistingEdgesTuples).filter{ case (k,(_,option))=>option.isEmpty }.map(_._1) if(treatAsUndirected){ newEdges.map{ case (vId1,vId2)=>(Math.min(vId1,vId2),Math.max(vId1,vId2)) }.distinct() }else{ newEdges } } }
Example 58
Source File: BetweennessEdmonds$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.betweenness.edmonds import java.nio.file.Files import ml.sparkling.graph.operators.MeasureTest import org.apache.commons.io.FileUtils import org.apache.spark.SparkContext import org.apache.spark.graphx.{Graph, VertexRDD} class BetweennessEdmonds$Test(implicit sc: SparkContext) extends MeasureTest { val tempDir = Files.createTempDirectory("spark-checkpoint") override def beforeAll() = { sc.setCheckpointDir(tempDir.toAbsolutePath.toString) } override def afterAll() = { FileUtils.deleteDirectory(tempDir.toFile) } "Edmonds betweenness centrality for random graph" should "be correctly calculated" in { Given("graph") val filePath = getClass.getResource("/graphs/graph_ER_15") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes betweenness") val result = EdmondsBC.computeBC(graph) Then("Should calculate betweenness correctly") val bcFile = getClass.getResource("/graphs/graph_ER_15_bc") val bcCorrectValues = sc.textFile(bcFile.getPath) .filter(_.nonEmpty) .map(l => { val t = l.split("\t", 2); (t(0).toInt, t(1).toDouble) }) .sortBy({ case (vId, data) => vId }) .map({ case (vId, data) => data}).collect() val bcValues = result.sortBy({ case (vId, data) => vId }) .map({ case (vId, data) => data }).collect() bcCorrectValues.zip(bcValues).foreach({ case (a, b) => a should be(b +- 1e-5) }) result.unpersist(false) } }
Example 59
Source File: SSSPExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.graphx // $example on$ import org.apache.spark.graphx.{Graph, VertexId} import org.apache.spark.graphx.util.GraphGenerators // $example off$ import org.apache.spark.sql.SparkSession object SSSPExample { def main(args: Array[String]): Unit = { // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") .getOrCreate() val sc = spark.sparkContext // $example on$ // A graph with edge attributes containing distances val graph: Graph[Long, Double] = GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble) val sourceId: VertexId = 42 // The ultimate source // Initialize the graph such that all vertices except the root have distance infinity. val initialGraph = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity) val sssp = initialGraph.pregel(Double.PositiveInfinity)( (id, dist, newDist) => math.min(dist, newDist), // Vertex Program triplet => { // Send Message if (triplet.srcAttr + triplet.attr < triplet.dstAttr) { Iterator((triplet.dstId, triplet.srcAttr + triplet.attr)) } else { Iterator.empty } }, (a, b) => math.min(a, b) // Merge Message ) println(sssp.vertices.collect.mkString("\n")) // $example off$ spark.stop() } } // scalastyle:on println
Example 60
Source File: EigenvectorCentrality$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.eigenvector import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ import scala.util.Random class EigenvectorCentrality$Test(implicit sc:SparkContext) extends MeasureTest { "Eigenvector for line graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes eigenvector") val result=EigenvectorCentrality.compute(graph) Then("Should calculate eigenvector correctly") result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array( 0d, 0d, 0d, 0d, 0d )).foreach{case (a,b)=>{a should be (b +- 1e-5 )}} graph.unpersist(true) } "Eigenvector for line graph" should "be correctly calculated using DSL" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes eigenvector") val result=graph.eigenvectorCentrality() Then("Should calculate eigenvector correctly") result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array( 0d, 0d, 0d, 0d, 0d )).foreach{case (a,b)=>{a should be (b +- 1e-5 )}} graph.unpersist(true) } "Eigenvector for full 4 node directed graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes eigenvector") val result=EigenvectorCentrality.compute(graph) Then("Should calculate eigenvector correctly") result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array( 0.32128186442503776, 0.5515795539542094, 0.6256715148839718, 0.44841176915201825 )).foreach{case (a,b)=>{a should be (b +- 1e-5 )}} graph.unpersist(true) } "Eigenvector for full 4 node undirected graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes eigenvector") val result=EigenvectorCentrality.compute(graph,VertexMeasureConfiguration[Int,Int](true)) Then("Should calculate eigenvector correctly") result.vertices.collect().sortBy{case (vId,data)=>vId} should equal (Array( (1,0.5), (2,0.5), (3,0.5), (4,0.5) )) graph.unpersist(true) } "Eigenvector " should " take edge weight into account" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) val graphWeighted=graph.mapEdges(edge=>{ 1.0/(edge.srcId+edge.dstId) }) When("Computes eigenvector") val resultUnweighted=EigenvectorCentrality.compute(graph,VertexMeasureConfiguration[Int,Int](true)) val resultWeighted=EigenvectorCentrality.compute(graphWeighted,VertexMeasureConfiguration[Int,Double](true)) Then("Should calculate eigenvector correctly") resultUnweighted.vertices.collect().sortBy{case (vId,data)=>vId} should not equal ( resultWeighted.vertices.collect().sortBy{case (vId,data)=>vId}) graph.unpersist(true) resultUnweighted.unpersist(true) resultWeighted.unpersist(true) } }
Example 61
Source File: AdamicAdar$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.edge import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class AdamicAdar$Test(implicit sc:SparkContext) extends MeasureTest { "Adamic/Adar for star graph" should "be 0 for each node" in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Adamic/Adar") val result=AdamicAdar.computeWithPreprocessing(graph) Then("Should calculate Adamic/Adar") val resultValues=result.edges.map(_.attr).distinct().collect() resultValues(0) should equal(0) resultValues.size should equal(1) graph.unpersist(true) } "Adamic/Adar for full graph using DSL" should "be 1.8205 for each node" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Adamic/Adar") val result=graph.adamicAdar(true) Then("Should calculate Adamic/Adar") val resultValues=result.edges.map(_.attr).distinct().collect() resultValues(0) should equal(1.82047 +- 1e-5) resultValues.size should equal(1) graph.unpersist(true) } }
Example 62
Source File: CommonNeighbours$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.edge import ml.sparkling.graph.operators.MeasureTest import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class CommonNeighbours$Test (implicit sc:SparkContext) extends MeasureTest { "Common neighbours for star graph" should "be 0 for each node" in{ Given("graph") val filePath = getClass.getResource("/graphs/6_nodes_star") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes common neighbours") val result=CommonNeighbours.computeWithPreprocessing(graph) Then("Should calculate common neighbours") val resultValues=result.edges.map(_.attr).distinct().collect() resultValues(0) should equal(0) resultValues.size should equal(1) } "Common neighbours for full graph using DSL" should "be 2 for each node" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes common neighbours") val result=graph.commonNeighbours(true) Then("Should calculate common neighbours") val resultValues=result.edges.map(_.attr).distinct().collect() resultValues(0) should equal(2) resultValues.size should equal(1) } }
Example 63
Source File: NeighborhoodConnectivity$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration import ml.sparkling.graph.operators.MeasureTest import ml.sparkling.graph.operators.measures.vertex.NeighborhoodConnectivity import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class NeighborhoodConnectivity$Test(implicit sc:SparkContext) extends MeasureTest { "Neighbor connectivity for directed line graph" should "be correctly calculated" in { Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes Neighbor connectivity ") val result = NeighborhoodConnectivity.compute(graph) Then("Should calculate Neighbor connectivity correctly") val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById .map{case (vId,data)=>data} should equal (Array( 1d,1d,1d,0d,0d )) graph.unpersist(true) } "Neighbor connectivity for directed line graph" should "be correctly calculated when using DSL" in { Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes Neighbor connectivity ") val result = graph.neighborhoodConnectivity() Then("Should calculate Neighbor connectivity correctly") val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById .map{case (vId,data)=>data} should equal (Array( 1d,1d,1d,0d,0d )) graph.unpersist(true) } "Neighbor connectivity for undirected line graph" should "be correctly calculated" in { Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes Neighbor connectivity ") val result = NeighborhoodConnectivity.compute(graph,VertexMeasureConfiguration[Int,Int](true)) Then("Should calculate Neighbor connectivity correctly") val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById .map{case (vId,data)=>data} should equal (Array( 2d,1.5,2d,1.5,2d )) graph.unpersist(true) } "Neighbor connectivity for full 4 node directed graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Neighbor connectivity") val result=NeighborhoodConnectivity.compute(graph) Then("Should calculate Neighbor connectivity correctly") val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById .map{case (vId,data)=>data} should equal (Array( 1d,1d,2d,1.5 )) graph.unpersist(true) } "Neighbor connectivity for full 4 node undirected graph" should "be correctly calculated" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Computes Neighbor connectivity") val result=NeighborhoodConnectivity.compute(graph,VertexMeasureConfiguration[Int,Int](true)) Then("Should calculate Neighbor connectivity correctly") val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId} verticesSortedById .map{case (vId,data)=>data} should equal (Array( 3d,3d,3d,3d )) graph.unpersist(true) } }
Example 64
Source File: CommunityBasedPartitioning$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.partitioning import ml.sparkling.graph.loaders.csv.CSVLoader import ml.sparkling.graph.operators.MeasureTest import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import ml.sparkling.graph.operators.OperatorsDSL._ class CommunityBasedPartitioning$Test(implicit sc:SparkContext) extends MeasureTest { "One component graph " should " have one partition" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Partition using PSCAN") val partitionedGraph: Graph[Int, Int] = CommunityBasedPartitioning.partitionGraphUsing(graph,PSCAN) Then("Should compute partitions correctly") partitionedGraph.edges.partitions.size should equal (1) graph.unpersist(false) } "One component graph " should " have one partition when calculated using DSL" in{ Given("graph") val filePath = getClass.getResource("/graphs/4_nodes_full") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Partition using PSCAN") val partitionedGraph: Graph[Int, Int] =graph.partitionBy(PSCAN,1) Then("Should compute partitions correctly") partitionedGraph.edges.partitions.size should equal (1) graph.unpersist(false) } "Five component graph " should " have five partitions" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Partition using PSCAN") val partitionedGraph: Graph[Int, Int] = CommunityBasedPartitioning.partitionGraphUsing(graph,PSCAN,5) Then("Should compute partitions correctly") partitionedGraph.edges.partitions.size should equal (5) graph.unpersist(false) } "Three component graph " should " have five partitions" in{ Given("graph") val filePath = getClass.getResource("/graphs/coarsening_to_3") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Partition using PSCAN") val partitionedGraph: Graph[Int, Int] = CommunityBasedPartitioning.partitionGraphUsing(graph,PSCAN,3) Then("Should compute partitions correctly") partitionedGraph.edges.partitions.size should equal (3) graph.unpersist(false) } "Change of community method parammeters" should " be possible" in{ Given("graph") val filePath = getClass.getResource("/graphs/5_nodes_directed") val graph:Graph[Int,Int]=loadGraph(filePath.toString) When("Partition using PSCAN") val partitionedGraph: Graph[Int, Int] = CommunityBasedPartitioning.partitionGraphBy(graph,PSCAN.computeConnectedComponents(_,epsilon = 0),1) Then("Should compute partitions correctly") partitionedGraph.edges.partitions.size should equal (1) graph.unpersist(false) } }
Example 65
Source File: PSCANConnectedComponents.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.algorithms.community.pscan import org.apache.spark.graphx.{EdgeTriplet, Graph, Pregel, VertexId} class PSCANConnectedComponents(minWeight:Double) extends Serializable{ def run[VD,ED](graph:Graph[VertexId,Double], maxIterations:Int=Int.MaxValue):Graph[VertexId,Double]={ val initialMessage = Long.MaxValue Pregel(graph, initialMessage,maxIterations = maxIterations)( vprog = (_, attr, msg) => math.min(attr, msg), sendMsg = sendMessage, mergeMsg = (a, b) => math.min(a, b)) } def sendMessage(edge: EdgeTriplet[VertexId, Double]): Iterator[(VertexId, VertexId)] = { if(edge.attr > minWeight){ if(edge.srcAttr<edge.dstAttr){ Iterator((edge.dstId,edge.srcAttr)) }else if(edge.dstAttr<edge.srcAttr){ Iterator((edge.srcId,edge.dstAttr)) }else{ Iterator.empty } }else{ Iterator.empty } } }
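run expects a graph whose vertices already carry a component label (typically each vertex seeded with its own id) and whose edges carry weights; only edges with weight above minWeight propagate labels. A rough sketch with arbitrary weights and threshold:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import ml.sparkling.graph.operators.algorithms.community.pscan.PSCANConnectedComponents

object PSCANComponentsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("pscan-cc-sketch").setMaster("local[2]"))
    // Two weighted edges; only edges whose weight exceeds minWeight propagate labels
    val edges = sc.parallelize(Seq(Edge(1L, 2L, 0.9), Edge(3L, 4L, 0.1)))
    // Seed every vertex with its own id as the initial component label
    val seeded: Graph[VertexId, Double] = Graph.fromEdges(edges, 0.0).mapVertices((id, _) => id)
    val components = new PSCANConnectedComponents(minWeight = 0.5).run(seeded)
    // With these weights: vertices 1 and 2 end up sharing a label, 3 and 4 keep their own
    components.vertices.collect().sorted.foreach(println)
    sc.stop()
  }
}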
Example 66
Source File: BetweennessHua$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.betweenness.hua import java.nio.file.Files import ml.sparkling.graph.operators.MeasureTest import ml.sparkling.graph.operators.measures.vertex.betweenness.edmonds.EdmondsBC import org.apache.commons.io.FileUtils import org.apache.spark.SparkContext import org.apache.spark.graphx.Graph import org.scalatest.tagobjects.Slow class BetweennessHua$Test (implicit sc: SparkContext) extends MeasureTest { val tempDir = Files.createTempDirectory("spark-checkpoint") override def beforeAll() = { sc.setCheckpointDir(tempDir.toAbsolutePath.toString) } override def afterAll() = { FileUtils.deleteDirectory(tempDir.toFile) } "Hua betweenness centrality for random graph" should "be correctly calculated" in { Given("graph") val filePath = getClass.getResource("/graphs/graph_ER_15") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("Computes betweenness") val result = HuaBC.computeBC(graph) Then("Should calculate betweenness correctly") val bcFile = getClass.getResource("/graphs/graph_ER_15_bc") val bcCorrectValues = sc.textFile(bcFile.getPath) .filter(_.nonEmpty) .map(l => { val t = l.split("\t", 2); (t(0).toInt, t(1).toDouble) }) .sortBy({ case (vId, data) => vId }) .map({ case (vId, data) => data}).collect() val bcValues = result.sortBy({ case (vId, data) => vId }) .map({ case (vId, data) => data }).collect() bcCorrectValues.zip(bcValues).foreach({ case (a, b) => a should be(b +- 1e-5) }) result.unpersist(false) } "Hua betweenness centrality for random graph" should "take no longer then Edmonds" taggedAs(Slow) in { Given("graph") val filePath = getClass.getResource("/graphs/graph_ER_15") val graph: Graph[Int, Int] = loadGraph(filePath.toString) When("computes betwenness centrality") val (_, edmondsTime) = time("Edmonds algorithm for betweenness centrality")(EdmondsBC.computeBC(graph)) val (_, huaTime) = time("Hua algorithm for betweenness centrality")(HuaBC.computeBC(graph)) Then("Hua algorithm should be faster") huaTime should be <= edmondsTime } }
Example 67
Source File: Modularity.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.graph import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.ComponentID import ml.sparkling.graph.api.operators.measures.{VertexDependentGraphMeasure, GraphIndependentMeasure} import org.apache.spark.graphx.{EdgeTriplet, VertexRDD, Graph} import org.apache.spark.rdd.RDD import scala.reflect.ClassTag object Modularity extends VertexDependentGraphMeasure[Double,ComponentID]{ def compute[V<:ComponentID:ClassTag,E:ClassTag](graph: Graph[V, E]): Double = { val edgesNum=graph.numEdges.toDouble; val edgesCounts: RDD[(V, (Int, Int))] = graph.triplets.flatMap(triplet => { if (triplet.srcAttr == triplet.dstAttr) { Iterator((triplet.srcAttr, (1, 0)),(triplet.srcAttr, (1, 0))) } else { Iterator((triplet.srcAttr, (0, 1)),(triplet.dstAttr,(0,1))) } }) edgesCounts.aggregateByKey((0,0))( (agg:(Int,Int),data:(Int,Int))=> (agg,data) match{ case ((a1,b1),(a2,b2))=>(a1+a2,b1+b2) }, (agg1:(Int,Int),agg2:(Int,Int))=>{ (agg1,agg2) match{ case ((a1,b1),(a2,b2))=>(a1+a2,b1+b2) } } ).treeAggregate(0.0)( (agg:Double,data:(V,(Int,Int)))=>{ data match{ case (_,(edgesFull,edgesSome))=> agg+(edgesFull/(2.0*edgesNum))-Math.pow((edgesSome+edgesFull)/(2.0*edgesNum),2) } }, (agg1,agg2)=>agg1+agg2 ) } }
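compute expects the vertex attribute to already hold each vertex's community label and returns a single score for the whole graph. A rough sketch, assuming ComponentID is the library's Long vertex-id alias:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import ml.sparkling.graph.operators.measures.graph.Modularity

object ModularitySketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("modularity-sketch").setMaster("local[2]"))
    // Vertex attribute is the community id of that vertex (assuming ComponentID is a Long alias)
    val vertices = sc.parallelize(Seq((1L, 1L), (2L, 1L), (3L, 2L), (4L, 2L)))
    // Two intra-community edges and one edge crossing the two communities
    val edges = sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(3L, 4L, 1), Edge(2L, 3L, 1)))
    val graph: Graph[Long, Int] = Graph(vertices, edges)
    println(s"Modularity: ${Modularity.compute(graph)}")
    sc.stop()
  }
}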
Example 68
Source File: CommunityBasedPartitioning.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.partitioning

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.{CommunityDetectionAlgorithm, CommunityDetectionMethod, ComponentID}
import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.DefaultPartitionOperator
import org.apache.log4j.Logger
import org.apache.spark.{Partitioner, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, PartitionID, PartitionStrategy, VertexId}

import scala.reflect.ClassTag

object CommunityBasedPartitioning {

  @transient
  val logger = Logger.getLogger(CommunityBasedPartitioning.getClass())

  def partitionGraphBy[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], communityDetectionMethod: CommunityDetectionMethod[VD, ED], numParts: Int = -1)(implicit sc: SparkContext): Graph[VD, ED] = {
    val numberOfPartitions = if (numParts == -1) sc.defaultParallelism else numParts
    val communities: Graph[ComponentID, ED] = communityDetectionMethod(graph)
    val numberOfCommunities = communities.vertices.values.countApproxDistinct()
    val (coarsedVertexMap, coarsedNumberOfPartitions) = ParallelPartitioningUtils.coarsePartitions(numberOfPartitions, numberOfCommunities, communities.vertices)
    val strategy = ByComponentIdPartitionStrategy(coarsedVertexMap, coarsedNumberOfPartitions, DefaultPartitionOperator)
    logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries and ${coarsedNumberOfPartitions} partitions")
    val out = graph.partitionBy(strategy, numberOfCommunities.toInt).cache()
    out.edges.foreachPartition((_) => {})
    out.vertices.foreachPartition((_) => {})
    out
  }

  def partitionGraphUsing[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], communityDetectionMethod: CommunityDetectionAlgorithm, numParts: Int = -1)(implicit sc: SparkContext): Graph[VD, ED] = {
    partitionGraphBy(graph, communityDetectionMethod.detectCommunities[VD, ED](_), numParts)
  }
}
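A minimal sketch of driving partitionGraphBy with a hand-rolled community method. It assumes CommunityDetectionMethod[VD, ED] is a plain function Graph[VD, ED] => Graph[ComponentID, ED] (that is how it is applied above) and that ComponentID is the GraphX VertexId, so plain connected components can stand in for a real community detector; the toy graph is illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import ml.sparkling.graph.operators.partitioning.CommunityBasedPartitioning

object CommunityPartitioningSketch {
  def main(args: Array[String]): Unit = {
    implicit val sc: SparkContext = new SparkContext(new SparkConf().setAppName("community-partitioning-sketch").setMaster("local[4]"))
    val edges = sc.parallelize(Seq(
      Edge(1L, 2L, 1), Edge(2L, 3L, 1),      // component A
      Edge(10L, 11L, 1), Edge(11L, 12L, 1))) // component B
    val graph: Graph[Int, Int] = Graph.fromEdges(edges, defaultValue = 1)
    // Each connected component is treated as one "community".
    val partitioned = CommunityBasedPartitioning.partitionGraphBy(
      graph, (g: Graph[Int, Int]) => g.connectedComponents(), numParts = 2)
    println(partitioned.edges.partitions.length)
    sc.stop()
  }
}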
Example 69
Source File: PSCANBasedPartitioning.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.partitioning

import java.util.UUID

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.ComponentID
import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN
import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.{DefaultPartitionOperator, logger}
import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Graph, VertexId}

import scala.collection.mutable
import scala.reflect.ClassTag

object PSCANBasedPartitioning {

  @transient
  val logger = Logger.getLogger(PSCANBasedPartitioning.getClass())

  def partitionGraphBy[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], numberOfPartitions: Int, maxIterations: Int = Int.MaxValue)(implicit sc: SparkContext): Graph[VD, ED] = {
    val (numberOfCommunities: VertexId, coarsedVertexMap: Map[VertexId, Int], coarsedNumberOfPartitions: Int, strategy: ByComponentIdPartitionStrategy) =
      buildPartitioningStrategy(graph, numberOfPartitions, maxIterations = maxIterations)
    logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries and ${coarsedNumberOfPartitions} partitions (before ${numberOfCommunities})")
    val out = graph.partitionBy(strategy, numberOfPartitions).cache()
    out.edges.foreachPartition((_) => {})
    out.triplets.foreachPartition((_) => {})
    out.vertices.foreachPartition((_) => {})
    out
  }

  def buildPartitioningStrategy[ED: ClassTag, VD: ClassTag](graph: Graph[VD, ED], numberOfPartitions: Int, maxIterations: Int = Int.MaxValue)(implicit sc: SparkContext) = {
    val (numberOfCommunities: VertexId, coarsedVertexMap: Map[VertexId, Int], coarsedNumberOfPartitions: Int) =
      precomputePartitions(graph, numberOfPartitions, maxIterations = maxIterations)
    logger.info(s"Requested $numberOfPartitions partitions, computed $coarsedNumberOfPartitions")
    val strategy = ByComponentIdPartitionStrategy(coarsedVertexMap, numberOfPartitions, DefaultPartitionOperator)
    (numberOfCommunities, coarsedVertexMap, coarsedNumberOfPartitions, strategy)
  }

  def precomputePartitions[ED: ClassTag, VD: ClassTag](graph: Graph[VD, ED], numberOfPartitions: Int, maxIterations: Int = Int.MaxValue)(implicit sc: SparkContext) = {
    logger.info("Computing components using PSCAN")
    val (communities, numberOfCommunities): (Graph[ComponentID, ED], VertexId) =
      PSCAN.computeConnectedComponentsUsing(graph, numberOfPartitions, maxIterations = maxIterations)
    val computationData = communities.vertices.map(t => t).localCheckpoint()
    logger.info("Components computed!")
    val (coarsedVertexMap, coarsedNumberOfPartitions) = ParallelPartitioningUtils.coarsePartitions(numberOfPartitions, numberOfCommunities, computationData)
    (numberOfCommunities, coarsedVertexMap, coarsedNumberOfPartitions)
  }
}
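A minimal sketch of the entry point above; only the partitionGraphBy call itself is taken from the source, while the toy graph and Spark setup are illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import ml.sparkling.graph.operators.partitioning.PSCANBasedPartitioning

object PSCANPartitioningSketch {
  def main(args: Array[String]): Unit = {
    implicit val sc: SparkContext = new SparkContext(new SparkConf().setAppName("pscan-partitioning-sketch").setMaster("local[4]"))
    // A small ring-of-clusters graph stands in for real data.
    val edges = sc.parallelize((1L to 100L).map(i => Edge(i, i % 10, 1)))
    val graph: Graph[Int, Int] = Graph.fromEdges(edges, defaultValue = 1)
    // Ask for 4 partitions; communities found by PSCAN are coarsened down to that number.
    val partitioned = PSCANBasedPartitioning.partitionGraphBy(graph, numberOfPartitions = 4)
    println(partitioned.edges.partitions.length)
    sc.stop()
  }
}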
Example 70
Source File: VertexMeasureConfigurationTest.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.api.operators.measures

import ml.sparkling.graph.api.operators.IterativeComputation.BucketSizeProvider
import org.apache.spark.graphx.Graph
import org.scalatest.{FlatSpec, GivenWhenThen}

class VertexMeasureConfigurationTest extends FlatSpec with GivenWhenThen {

  "Creation without parameters" should "be possible" in {
    VertexMeasureConfiguration()
  }

  "Creation with undirected flag" should "be possible" in {
    Given("Directed flag")
    val flag = false
    When("Configuration creation")
    VertexMeasureConfiguration(treatAsUndirected = flag)
  }

  "Creation with bucket size provider" should "be possible" in {
    Given("Bucket size provider")
    val provider: BucketSizeProvider[Long, Long] = (g: Graph[Long, Long]) => 1L
    When("Configuration creation")
    VertexMeasureConfiguration(bucketSizeProvider = provider)
  }

  "Creation with bucket size provider and directed flag" should "be possible" in {
    Given("Bucket size provider")
    val provider: BucketSizeProvider[Long, Long] = (g: Graph[Long, Long]) => 1L
    When("Configuration creation")
    VertexMeasureConfiguration(false, provider)
  }
}
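A minimal sketch of building such a configuration the way the CSV examples later on this page do (undirected flag plus a bucket-size provider); the adaptive provider below is illustrative, and the measure it would eventually be passed to is left out.

import org.apache.spark.graphx.Graph
import ml.sparkling.graph.api.operators.IterativeComputation.BucketSizeProvider
import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration

object VertexMeasureConfigurationSketch {
  // Process vertices in buckets of roughly one tenth of the graph, but at least 100 at a time.
  val adaptiveProvider: BucketSizeProvider[String, Double] =
    (g: Graph[String, Double]) => math.max(100L, g.numVertices / 10)

  val undirectedConfig = VertexMeasureConfiguration[String, Double](true, adaptiveProvider)
  val defaultConfig = VertexMeasureConfiguration[String, Double]()
}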
Example 71
Source File: GraphLoading.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.api.loaders

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag

object GraphLoading {

  trait GraphLoader[VD, ED] {
    def load(parameters: List[Parameter])(implicit sc: SparkContext): Graph[VD, ED]
  }

  trait TypedGraphLoader[VD2, ED2] extends GraphLoader[VD2, ED2] {
    def load[VD: ClassTag, ED: ClassTag](parameters: List[Parameter])(implicit sc: SparkContext): Graph[VD, ED]
  }

  trait FromPathLoader[VD, ED] {
    def apply(path: String): GraphLoader[VD, ED]
  }

  object LoadGraph {
    def from[VD: ClassTag, ED: ClassTag](graphLoader: GraphLoader[VD, ED]): GraphLoaderConfigurator[VD, ED] = {
      GraphLoaderConfigurator(List.empty, graphLoader)
    }
  }

  case class GraphLoaderConfigurator[VD: ClassTag, ED: ClassTag](parameters: List[Parameter], loader: GraphLoader[_, _]) {
    def using(parameter: Parameter) = {
      GraphLoaderConfigurator[VD, ED](parameter :: parameters, loader)
    }

    def load[VD: ClassTag, ED: ClassTag]()(implicit sc: SparkContext): Graph[VD, ED] = {
      loader match {
        case typed: TypedGraphLoader[_, _] => typed.load[VD, ED](parameters)
        case normal: GraphLoader[VD @unchecked, ED @unchecked] => normal.load(parameters)
      }
    }
  }

  trait Parameter

  trait WithValueParameter[V] extends Parameter {
    def value: V
  }
}
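A minimal sketch of plugging a custom loader into this API. Only the GraphLoading types come from the source; the Path parameter and EdgeListLoader below are hypothetical and simply delegate to GraphX's built-in edge-list reader.

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Graph, GraphLoader => GraphXLoader}
import ml.sparkling.graph.api.loaders.GraphLoading.{GraphLoader, LoadGraph, Parameter, WithValueParameter}

// Hypothetical parameter carrying the edge-list path.
case class Path(value: String) extends WithValueParameter[String]

// Hypothetical loader that delegates to GraphX's edge-list reader.
case object EdgeListLoader extends GraphLoader[Int, Int] {
  def load(parameters: List[Parameter])(implicit sc: SparkContext): Graph[Int, Int] = {
    val path = parameters.collectFirst { case Path(p) => p }
      .getOrElse(throw new IllegalArgumentException("Path parameter is required"))
    GraphXLoader.edgeListFile(sc, path)
  }
}

object GraphLoadingSketch {
  def load(path: String)(implicit sc: SparkContext): Graph[Int, Int] =
    LoadGraph.from(EdgeListLoader).using(Path(path)).load[Int, Int]()
}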
Example 72
Source File: ShortestPathLengthsFromCSV.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.examples

import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes
import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes._
import ml.sparkling.graph.operators.algorithms.shortestpaths.ShortestPathsAlgorithm
import ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors.fastutils.FastUtilWithDistance.DataMap
import ml.sparkling.graph.operators.predicates.AllPathPredicate
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, VertexId}

import scala.collection.JavaConversions._

object ShortestPathLengthsFromCSV extends ExampleApp {

  def body() = {
    val shortestPaths =
      if (bucketSize == -1L)
        ShortestPathsAlgorithm.computeShortestPathsLengths(partitionedGraph, AllPathPredicate, treatAsUndirected)
      else
        ShortestPathsAlgorithm.computeShortestPathsLengthsIterative(partitionedGraph, (g: Graph[_, _]) => bucketSize, treatAsUndirected)
    val size: Broadcast[VertexId] = ctx.broadcast(partitionedGraph.numVertices)
    partitionedGraph.outerJoinVertices(shortestPaths.vertices)(Util.dataTransformFunction(size) _).vertices.values.saveAsTextFile(out)
    ctx.stop()
  }
}

private object Util {
  def dataTransformFunction(size: Broadcast[VertexId])(vId: VertexId, oldValue: String, pathsOption: Option[_ >: DataMap <: JMap[JLong, JDouble]]) = {
    pathsOption.flatMap((paths) => {
      var entries = paths.entrySet().toList.sortBy(_.getKey)
      val out = new StringBuilder()
      out ++= s"${oldValue},"
      var a = 0L
      while (a < size.value) {
        if (entries.size > 0 && a == entries.head.getKey) {
          out ++= s"${entries.head.getValue},"
          entries = entries.drop(1)
        } else {
          out ++= "0,"
        }
        a += 1L
      }
      out.setLength(out.length - 1)
      Option(out.toString())
    }).getOrElse(oldValue)
  }
}
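A minimal sketch of the core call in body() above, without the ExampleApp scaffolding. The graph's types (String vertices, Double edge weights) mirror what the example feeds in, and the result vertices are assumed to carry a java.util.Map of target-vertex id to path length, per the Util signature; the toy graph is illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import ml.sparkling.graph.operators.algorithms.shortestpaths.ShortestPathsAlgorithm
import ml.sparkling.graph.operators.predicates.AllPathPredicate

object ShortestPathsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("shortest-paths-sketch").setMaster("local[2]"))
    // Weighted path graph 1 - 2 - 3 - 4.
    val edges = sc.parallelize(Seq(Edge(1L, 2L, 1.0), Edge(2L, 3L, 2.0), Edge(3L, 4L, 1.0)))
    val graph: Graph[String, Double] = Graph.fromEdges(edges, defaultValue = "v")
    // All paths, treating edges as undirected (third argument, as in the example above).
    val shortestPaths = ShortestPathsAlgorithm.computeShortestPathsLengths(graph, AllPathPredicate, true)
    shortestPaths.vertices.collect().foreach { case (id, lengths) => println(s"$id -> $lengths") }
    sc.stop()
  }
}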
Example 73
Source File: GraphDescriptionFromCSV.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.examples

import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration
import ml.sparkling.graph.experiments.describe.GraphDescriptor._
import org.apache.log4j.Logger
import org.apache.spark.graphx.Graph

object GraphDescriptionFromCSV extends ExampleApp {

  def body() = {
    val configuration =
      if (bucketSize == -1L) {
        val graphSize = 1000L
        logger.info(s"BUCKET SIZE WILL BE EQUAL TO 1000!!")
        VertexMeasureConfiguration[String, Double](treatAsUndirected, (g: Graph[String, Double]) => graphSize)
      } else
        VertexMeasureConfiguration[String, Double](treatAsUndirected, (g: Graph[String, Double]) => bucketSize)
    val groupedGraph = partitionedGraph.groupEdges((a, b) => a)
    groupedGraph.describeGraphToDirectory(out, configuration)
    ctx.stop()
  }
}
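A minimal sketch of the describe step on its own; the implicit syntax comes from the GraphDescriptor._ import shown above, while the toy graph, output path, and any extra implicits the method may need are assumptions of this sketch.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph, PartitionStrategy}
import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration
import ml.sparkling.graph.experiments.describe.GraphDescriptor._

object GraphDescriptionSketch {
  def main(args: Array[String]): Unit = {
    implicit val sc: SparkContext = new SparkContext(new SparkConf().setAppName("describe-sketch").setMaster("local[2]"))
    val edges = sc.parallelize(Seq(Edge(1L, 2L, 1.0), Edge(2L, 3L, 1.0), Edge(1L, 3L, 1.0)))
    val graph: Graph[String, Double] = Graph.fromEdges(edges, defaultValue = "v")
    // Partition, then collapse parallel edges, as the example above does on its partitioned graph.
    val grouped = graph.partitionBy(PartitionStrategy.RandomVertexCut).groupEdges((a, b) => a)
    val configuration = VertexMeasureConfiguration[String, Double](true, (g: Graph[String, Double]) => 1000L)
    grouped.describeGraphToDirectory("/tmp/graph-description", configuration)
    sc.stop()
  }
}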