org.apache.spark.graphx.VertexId Scala Examples
The following examples show how to use org.apache.spark.graphx.VertexId.
Each example lists its source file, the open-source project it was taken from, and that project's license.
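In GraphX, VertexId is a type alias for Long, so any 64-bit integer can serve as a vertex key. As a quick orientation before the examples, here is a minimal sketch of building a graph keyed by VertexId; the object name and the toy data are illustrative and do not come from any of the projects below.

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object VertexIdSketch extends App {
  val sc = new SparkContext(new SparkConf().setAppName("VertexId sketch").setMaster("local[2]"))

  // VertexId is an alias for Long, so plain Long literals work as vertex keys.
  val vertices: RDD[(VertexId, String)] = sc.parallelize(Seq((1L, "alice"), (2L, "bob")))
  val edges: RDD[Edge[Int]] = sc.parallelize(Seq(Edge(1L, 2L, 1)))

  val graph: Graph[String, Int] = Graph(vertices, edges)
  graph.vertices.collect().foreach { case (id: VertexId, name) => println(s"$id -> $name") }

  sc.stop()
}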
Example 1
Source File: GraphGeneration.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 6 votes |
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.lib.TriangleCount
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object GraphGeneration extends App {

  val conf = new SparkConf()
    .setAppName("Graph generation")
    .setMaster("local[4]")
  val sc = new SparkContext(conf)

  val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt")

  val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map { line =>
    val field = line.split(" ")
    (field(0).toLong, field(1).toLong)
  }
  val edgeTupleGraph = Graph.fromEdgeTuples(
    rawEdges = rawEdges, defaultValue = "")

  val gridGraph = GraphGenerators.gridGraph(sc, 5, 5)
  val starGraph = GraphGenerators.starGraph(sc, 11)
  val logNormalGraph = GraphGenerators.logNormalGraph(
    sc, numVertices = 20, mu = 1, sigma = 3
  )
  logNormalGraph.outDegrees.map(_._2).collect().sorted

  val actorGraph = GraphLoader.edgeListFile(
    sc, "./ca-hollywood-2009.txt", true
  ).partitionBy(PartitionStrategy.RandomVertexCut)
  actorGraph.edges.count()

  val actorComponents = actorGraph.connectedComponents().cache
  actorComponents.vertices.map(_._2).distinct().count

  val clusterSizes = actorComponents.vertices.map(
    v => (v._2, 1)).reduceByKey(_ + _)
  clusterSizes.map(_._2).max
  clusterSizes.map(_._2).min

  val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt")
  val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5)
  strongComponents.vertices.map(_._2).distinct().count

  val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges()
  val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut)

  actorGraph.triangleCount()
  val triangles = TriangleCount.runPreCanonicalized(partitionedGraph)

  actorGraph.staticPageRank(10)
  val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001)
  actorPrGraph.vertices.reduce((v1, v2) => {
    if (v1._2 > v2._2) v1 else v2
  })

  actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println)
  actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10)
  actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count
}
Example 2
Source File: SSSPExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.graphx

// $example on$
import org.apache.spark.graphx.{Graph, VertexId}
import org.apache.spark.graphx.util.GraphGenerators
// $example off$
import org.apache.spark.sql.SparkSession

object SSSPExample {
  def main(args: Array[String]): Unit = {
    // Creates a SparkSession.
    val spark = SparkSession
      .builder
      .appName(s"${this.getClass.getSimpleName}")
      .getOrCreate()
    val sc = spark.sparkContext

    // $example on$
    // A graph with edge attributes containing distances
    val graph: Graph[Long, Double] =
      GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble)
    val sourceId: VertexId = 42 // The ultimate source
    // Initialize the graph such that all vertices except the root have distance infinity.
    val initialGraph = graph.mapVertices((id, _) =>
      if (id == sourceId) 0.0 else Double.PositiveInfinity)
    val sssp = initialGraph.pregel(Double.PositiveInfinity)(
      (id, dist, newDist) => math.min(dist, newDist), // Vertex Program
      triplet => { // Send Message
        if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
          Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
        } else {
          Iterator.empty
        }
      },
      (a, b) => math.min(a, b) // Merge Message
    )
    println(sssp.vertices.collect.mkString("\n"))
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 3
Source File: NOInitBFSProcessor.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.betweenness.hua.processor

import ml.sparkling.graph.operators.algorithms.bfs.processor.BFSProcessor
import ml.sparkling.graph.operators.measures.vertex.betweenness.hua.struct.NOVertex
import ml.sparkling.graph.operators.measures.vertex.betweenness.hua.struct.messages.{BFSConfirmMessage, BFSExpandMessage, NOMessage}
import org.apache.spark.graphx.{EdgeTriplet, VertexId}

class NOInitBFSProcessor[ED] extends BFSProcessor[NOVertex, ED, List[NOMessage[VertexId]]] {

  override def initialMessage: List[NOMessage[VertexId]] = List.empty

  override def mergeMessages(msg1: List[NOMessage[VertexId]], msg2: List[NOMessage[VertexId]]): List[NOMessage[VertexId]] = {
    val allMessages = msg1 ++ msg2
    val expandMessageList = allMessages.filter(_.isExpand)
    val expandMessage = expandMessageList.headOption
    val succMessages = allMessages.filter(_.isConfirm)
    expandMessage match {
      case Some(m) => succMessages :+ m
      case None => succMessages
    }
  }

  override def sendMessage(triplet: EdgeTriplet[NOVertex, ED]): Iterator[(VertexId, List[NOMessage[VertexId]])] = {
    def createExpandMsg(dstId: VertexId) = {
      val dstAttr = triplet.vertexAttr(dstId)
      val srcAttr = triplet.otherVertexAttr(dstId)
      if (dstAttr.pred.isEmpty && srcAttr.pred.nonEmpty)
        Iterator((dstId, List(BFSExpandMessage(triplet.otherVertexId(dstId)))))
      else
        Iterator.empty
    }

    def createConfirmMsg(dstId: VertexId) = {
      val dstAttr = triplet.vertexAttr(dstId)
      val srcAttr = triplet.otherVertexAttr(dstId)
      if (!dstAttr.isCompleted && srcAttr.pred.exists(_ == dstId))
        Iterator((dstId, List(BFSConfirmMessage(triplet.otherVertexId(dstId)))))
      else
        Iterator.empty
    }

    val confirmMsg = createConfirmMsg(triplet.srcId) ++ createConfirmMsg(triplet.dstId)
    val expandMsg = createExpandMsg(triplet.srcId) ++ createExpandMsg(triplet.dstId)
    confirmMsg ++ expandMsg
  }
}
Example 4
Source File: NOVertex.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.betweenness.hua.struct

import ml.sparkling.graph.operators.measures.vertex.betweenness.hua.struct.messages.DFSPointer
import org.apache.spark.graphx.VertexId

class NOVertex(val vertexId: VertexId,
               val bfsMap: Map[VertexId, NOBFSVertex],
               val pred: Option[VertexId],
               val succ: Option[Array[VertexId]],
               val dfsPointer: Option[DFSPointer],
               val bc: Double) extends Serializable {

  def setParent(idParent: VertexId) =
    NOVertex(vertexId, bfsMap, Some(idParent), succ, dfsPointer, bc)

  def setPredecessorAndSuccessors(newPred: Option[VertexId], newSucc: Option[Array[VertexId]]) =
    NOVertex(vertexId, bfsMap, newPred, newSucc, dfsPointer, bc)

  val isCompleted = pred.nonEmpty && succ.nonEmpty
  val leaf = succ.isEmpty

  lazy val bfsRoot = bfsMap.contains(vertexId)
  lazy val lowestSucc = succ.getOrElse(Array.empty).sorted.headOption
  lazy val eccentricity = if (bfsMap.isEmpty) 0 else bfsMap.map({ case (id, v) => v.distance }).max

  def withDfsPointer(pointer: Option[DFSPointer]) =
    NOVertex(vertexId, bfsMap, pred, succ, pointer, bc)

  def update(bfsMap: Map[VertexId, NOBFSVertex] = bfsMap,
             succ: Option[Array[VertexId]] = succ,
             dfsPointer: Option[DFSPointer] = dfsPointer,
             bcInc: Double = 0) =
    NOVertex(vertexId, bfsMap, pred, succ, dfsPointer, bc + bcInc)
}

object NOVertex extends Serializable {
  def apply(vertexId: VertexId,
            bfsMap: Map[VertexId, NOBFSVertex] = Map.empty,
            pred: Option[VertexId] = None,
            succ: Option[Array[VertexId]] = None,
            dfsPointer: Option[DFSPointer] = None,
            bc: Double = .0): NOVertex =
    new NOVertex(vertexId, bfsMap, pred, succ, dfsPointer, bc)
}
Example 5
Source File: WithPathProcessor.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors

import org.apache.spark.graphx.VertexId

class WithPathProcessor[VD, ED]() extends PathProcessor[VD, ED, Map[VertexId, (ED, Set[List[VertexId]])]] {
  private type PathsSet = (ED, Set[List[VertexId]])
  private type PathsMap = Map[VertexId, PathsSet]

  def EMPTY_CONTAINER = Map.empty[VertexId, PathsSet]

  def getNewContainerForPaths() = {
    EMPTY_CONTAINER
  }

  def putNewPath(map: PathsMap, to: VertexId, weight: ED)(implicit num: Numeric[ED]): PathsMap = {
    (map + (to -> (weight, Set(to :: Nil)))).map(identity)
  }

  def processNewMessages(map1: PathsMap, map2: PathsMap)(implicit num: Numeric[ED]): PathsMap = {
    (map1.keySet ++ map2.keySet).map(vId => (vId, mergePathSets(map1.get(vId), map2.get(vId)))).toMap.map(identity)
  }

  def extendPathsMerging(targetVertexId: VertexId, map: PathsMap, vertexId: VertexId, distance: ED, map2: PathsMap)(implicit num: Numeric[ED]): PathsMap = {
    val extended = map.filterKeys(_ != targetVertexId).mapValues(extendPathsSet(_, vertexId, distance)).map(identity)
    processNewMessages(extended, map2)
  }

  private def extendPathsSet(pathSet: PathsSet, vertexId: VertexId, distance: ED)(implicit num: Numeric[ED]): PathsSet = {
    pathSet match {
      case (edge, set) => (num.plus(distance, edge), set.map(vertexId :: _))
    }
  }

  private def mergePathSets(pathSet1: Option[PathsSet], pathSet2: Option[PathsSet])(implicit num: Numeric[ED]): PathsSet = {
    (pathSet1 :: pathSet2 :: Nil).flatten[PathsSet].reduce[PathsSet] {
      case ((edge1, set1), (edge2, set2)) =>
        num.compare(edge1, edge2).signum match {
          case 0 => (edge1, set1 ++ set2)
          case 1 => (edge2, set2)
          case -1 => (edge1, set1)
        }
    }
  }
}
Example 6
Source File: FastUtilWithDistance.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors.fastutils import it.unimi.dsi.fastutil.longs._ import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes._ import ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors.PathProcessor import ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors.fastutils.FastUtilWithDistance.DataMap import ml.sparkling.graph.operators.utils.LoggerHolder import org.apache.spark.graphx.VertexId import scala.collection.JavaConversions._ class FastUtilWithDistance[VD, ED]() extends PathProcessor[VD, ED, DataMap] { def EMPTY_CONTAINER = new DataMap(0) def getNewContainerForPaths() = { new DataMap(64,0.25f) } def putNewPath(map: DataMap, to: VertexId, weight: ED)(implicit num: Numeric[ED]): DataMap = { val out=map.asInstanceOf[DataMap].clone() out.put(to, num.toDouble(weight)) out } def processNewMessages(map1: DataMap, map2: DataMap)(implicit num: Numeric[ED]):DataMap = { mergeMessages(map1,map2.clone()) } override def mergeMessages(map1: DataMap, map2: DataMap)(implicit num: Numeric[ED]):DataMap = { val out=map2 map1.foreach{case (key: JLong,inValue: JDouble)=>{ val longKey=key.toLong val value: Double =if(map2.containsKey(longKey)) { min(inValue,map2.get(key.toLong)) }else{ inValue } out.put(longKey, value) }} out } def min(d1:JDouble,d2:JDouble):JDouble={ if(d1<d2){ d1 }else{ d2 } } def extendPathsMerging(targetVertexId:VertexId,map: DataMap, vertexId: VertexId, distance: ED,map2: DataMap)(implicit num: Numeric[ED]):DataMap = { val out=map2.clone() val toAdd=num.toDouble(distance) map.foreach{case (key: JLong,inValue: JDouble)=>{ if(!targetVertexId.equals(key)){ val longKey=key.toLong val value: Double =if(map2.containsKey(longKey)) { min(inValue+toAdd,map2.get(longKey)) }else{ inValue+toAdd } out.put(longKey,value) } }} out } } object FastUtilWithDistance{ type DataMap=Long2DoubleOpenHashMap }
Example 7
Source File: PathProcessor.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors

import org.apache.spark.graphx.VertexId

trait PathProcessor[VD, ED, PS] extends Serializable {
  def EMPTY_CONTAINER: PS
  def getNewContainerForPaths(): PS
  def putNewPath(map: PS, to: VertexId, weight: ED)(implicit num: Numeric[ED]): PS
  def processNewMessages(map1: PS, map2: PS)(implicit num: Numeric[ED]): PS
  def mergeMessages(map1: PS, map2: PS)(implicit num: Numeric[ED]): PS = {
    processNewMessages(map1, map2)
  }
  def extendPathsMerging(targetVertexId: VertexId, map: PS, vertexId: VertexId, distance: ED, map2: PS)(implicit num: Numeric[ED]): PS
}
Example 8
Source File: SingleVertexProcessor.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors

import org.apache.spark.graphx.VertexId

class SingleVertexProcessor[VD, ED](computedVertexId: VertexId) extends PathProcessor[VD, ED, Double] {

  def EMPTY_CONTAINER = 0d

  override def getNewContainerForPaths(): Double = 0d

  override def extendPathsMerging(targetVertexId: VertexId, currentValue: Double, vertexId: VertexId, distance: ED, currentValue2: Double)(implicit num: Numeric[ED]): Double = {
    val currentExtended = {
      if (vertexId == computedVertexId || currentValue != 0)
        currentValue + num.toDouble(distance)
      else
        0.0
    }
    processNewMessages(currentExtended, currentValue2)
  }

  override def processNewMessages(map1: Double, map2: Double)(implicit num: Numeric[ED]): Double = {
    (map1, map2) match {
      case (0d, _) => map2
      case (_, 0d) => map1
      case _ => Math.min(map1, map2)
    }
  }

  override def putNewPath(map: Double, to: VertexId, weight: ED)(implicit num: Numeric[ED]): Double = {
    num.toDouble(weight)
  }
}
Example 9
Source File: PSCANConnectedComponents.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.algorithms.community.pscan

import org.apache.spark.graphx.{EdgeTriplet, Graph, Pregel, VertexId}

class PSCANConnectedComponents(minWeight: Double) extends Serializable {

  def run[VD, ED](graph: Graph[VertexId, Double], maxIterations: Int = Int.MaxValue): Graph[VertexId, Double] = {
    val initialMessage = Long.MaxValue
    Pregel(graph, initialMessage, maxIterations = maxIterations)(
      vprog = (_, attr, msg) => math.min(attr, msg),
      sendMsg = sendMessage,
      mergeMsg = (a, b) => math.min(a, b))
  }

  def sendMessage(edge: EdgeTriplet[VertexId, Double]): Iterator[(VertexId, VertexId)] = {
    if (edge.attr > minWeight) {
      if (edge.srcAttr < edge.dstAttr) {
        Iterator((edge.dstId, edge.srcAttr))
      } else if (edge.dstAttr < edge.srcAttr) {
        Iterator((edge.srcId, edge.dstAttr))
      } else {
        Iterator.empty
      }
    } else {
      Iterator.empty
    }
  }
}
Example 10
Source File: GraphMLLoader.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.loaders.graphml import com.databricks.spark.xml._ import ml.sparkling.graph.loaders.graphml.GraphMLFormat._ import ml.sparkling.graph.loaders.graphml.GraphMLTypes.TypeHandler import org.apache.spark.SparkContext import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext, SparkSession} import scala.collection.mutable import scala.util.Try def loadGraphFromML(path: String)(implicit sc: SparkContext): Graph[ValuesMap, ValuesMap] = { val sparkSession=SparkSession.builder().getOrCreate(); val graphDataFrame = sparkSession.sqlContext.read .format("com.databricks.spark.xml") .option("attributePrefix","@") .option("valueTag","#VALUE") .option("rowTag",graphTag).load(path).rdd val keys =sparkSession.sqlContext.read .format("com.databricks.spark.xml") .option("attributePrefix","@") .option("valueTag","#VALUE") .option("rowTag",graphMLTag).load(path).rdd .flatMap(r => Try(r.getAs[mutable.WrappedArray[Row]](keyTag).toArray).getOrElse(Array.empty)) val nodesKeys = keys .filter(r => r.getAs[String](forAttribute) == nodeTag) val edgeKeys = keys .filter(r => r.getAs[String](forAttribute) == edgeTag) val nodeAttrHandlers = createAttrHandlersFor(nodesKeys) val edgeAttrHandlers = createAttrHandlersFor(edgeKeys) val verticesWithData = graphDataFrame.flatMap(r => r.getAs[Any](nodeTag) match { case data: mutable.WrappedArray[Row@unchecked] => data.array case data: Row => Array(data) }) val verticesIndex = verticesWithData.map(r => r.getAs[String](idAttribute)).zipWithUniqueId().collect().toMap val vertices: RDD[(VertexId, Map[String, Any])] = verticesWithData .map( r => (verticesIndex(r.getAs[String](idAttribute)), extractAttributesMap(nodeAttrHandlers, r)) ) val edgesRows = graphDataFrame.flatMap(r => r.getAs[Any](edgeTag) match { case data: mutable.WrappedArray[Row@unchecked] => data.array case data: Row => Array(data) }) .map(r => Edge( verticesIndex(r.getAs[String](sourceAttribute)), verticesIndex(r.getAs[String](targetAttribute)), extractAttributesMap(edgeAttrHandlers, r) )) Graph(vertices, edgesRows) } def extractAttributesMap(attrHandlers: Map[String, GraphMLAttribute], r: Row): Map[String, Any] = { Try(r.getAs[mutable.WrappedArray[Row]](dataTag)).toOption.map( _.map(r => { val attribute = attrHandlers(r.getAs[String](keyAttribute)) (attribute.name, attribute.handler(r.getAs[String](tagValue))) }).toMap ).getOrElse(Map.empty) + ("id" -> r.getAs[String](idAttribute)) } def createAttrHandlersFor(keys: RDD[Row]): Map[String, GraphMLAttribute] = { keys .map(r => (r.getAs[String](idAttribute), GraphMLAttribute(r.getAs[String](nameAttribute), GraphMLTypes(r.getAs[String](typeAttribute))))) .collect().toMap } }
Example 11
Source File: GraphProviders.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.loaders.csv.providers import ml.sparkling.graph.loaders.csv.types.Types import ml.sparkling.graph.loaders.csv.types.Types.ToVertexId import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.storage.StorageLevel import org.apache.spark.sql.SparkSession; import scala.reflect.ClassTag object GraphProviders { val defaultStorageLevel=StorageLevel.MEMORY_ONLY def simpleGraphBuilder[VD: ClassTag, ED: ClassTag](defaultVertex: Option[VD]=None, vertexProvider: Row => Seq[(VertexId, VD)], edgeProvider: Row => Seq[Edge[ED]], edgeStorageLevel: StorageLevel = defaultStorageLevel, vertexStorageLevel: StorageLevel =defaultStorageLevel) (dataFrame: DataFrame): Graph[VD, ED] = { def mapRows[MT: ClassTag](mappingFunction: (Row) => Seq[MT]): RDD[MT] = { dataFrame.rdd.mapPartitionsWithIndex((id, rowIterator) => { rowIterator.flatMap { case row => mappingFunction(row) } }) } val vertices: RDD[(VertexId, VD)] = mapRows(vertexProvider) val edges: RDD[Edge[ED]] = mapRows(edgeProvider) defaultVertex match{ case None => Graph(vertices,edges,edgeStorageLevel=edgeStorageLevel,vertexStorageLevel=vertexStorageLevel) case Some(defaultVertexValue)=> Graph(vertices,edges,defaultVertexValue,edgeStorageLevel,vertexStorageLevel) } } def indexedGraphBuilder[VD:ClassTag, ED: ClassTag](defaultVertex: Option[VD]=None, vertexProvider: (Row, ToVertexId[VD]) => Seq[(VertexId, VD)], edgeProvider: (Row, ToVertexId[VD]) => Seq[Edge[ED]], columnsToIndex: Seq[Int], edgeStorageLevel: StorageLevel = defaultStorageLevel, vertexStorageLevel: StorageLevel = defaultStorageLevel) (dataFrame: DataFrame): Graph[VD, ED] = { val index = dataFrame.rdd.flatMap(row => columnsToIndex.map(row(_))).distinct().zipWithUniqueId().collect().toMap def extractIdFromIndex(vertex: VD) = index(vertex) simpleGraphBuilder(defaultVertex, vertexProvider(_: Row, extractIdFromIndex _), edgeProvider(_: Row, extractIdFromIndex _), edgeStorageLevel, vertexStorageLevel)(dataFrame) } }
Example 12
Source File: NOInitBFSPredicate.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.betweenness.hua.predicate import ml.sparkling.graph.operators.algorithms.bfs.predicate.BFSPredicate import ml.sparkling.graph.operators.measures.vertex.betweenness.hua.struct.NOVertex import ml.sparkling.graph.operators.measures.vertex.betweenness.hua.struct.messages.NOMessage import org.apache.spark.graphx.VertexId class NOInitBFSPredicate extends BFSPredicate[NOVertex, List[NOMessage[VertexId]]] { override def getInitialData(vertexId: VertexId, attr: NOVertex): (VertexId) => NOVertex = (id: VertexId) => if (id == vertexId) attr.setParent(id) else attr override def applyMessages(vertexId: VertexId, vertex: NOVertex, message: List[NOMessage[VertexId]]): NOVertex = if (vertex.isCompleted) vertex else updateVertex(vertex, message) def updateVertex(vertex: NOVertex, messages: List[NOMessage[VertexId]]) = { val parent = extractParrent(vertex, messages) val succ = extractSuccessors(vertex, messages) vertex.setPredecessorAndSuccessors(parent, succ) } def extractParrent(vertex: NOVertex, messages: List[NOMessage[VertexId]]) = { vertex.pred match { case Some(pred) => vertex.pred case None => val expandMsg = messages.filter(_.isExpand).map(_.content) expandMsg.headOption } } def extractSuccessors(vertex: NOVertex, messages: List[NOMessage[VertexId]]) = vertex.succ match { case Some(arr) => vertex.succ case None => val confirmMsg = messages.filter(_.isConfirm).map(_.content) if (confirmMsg.nonEmpty) Some(confirmMsg.toArray) else None } }
Example 13
Source File: FastUnfolding.scala From fastunfolding with Apache License 2.0 | 5 votes |
package com.soteradefense.dga.graphx.louvain import org.apache.spark.SparkContext import org.apache.spark.graphx.{VertexId, PartitionStrategy, TripletFields, Graph} import scala.reflect.ClassTag class FastUnfolding(outputdir: String, minProgress: Int = 1, progressCounter: Int = 1) { var qValues = Array[(Int, Double)]() def saveLevel(sc: SparkContext, level: Int, q: Double, graph: Graph[MyVertexState, Long]) = { graph.vertices.saveAsTextFile(s"${outputdir}/level_${level}_vertices") graph.edges.saveAsTextFile(s"${outputdir}/level_${level}_edges") //graph.vertices.map( {case (id,v) => ""+id+","+v.internalWeight+","+v.community }).saveAsTextFile(outputdir+"/level_"+level+"_vertices") //graph.edges.mapValues({case e=>""+e.srcId+","+e.dstId+","+e.attr}).saveAsTextFile(outputdir+"/level_"+level+"_edges") qValues = qValues :+ ((level, q)) println(s"qValue: $q") // overwrite the q values at each level sc.parallelize(qValues, 1).saveAsTextFile(s"${outputdir}/qvalues") } def run[VD: ClassTag](sc: SparkContext, graph: Graph[VD, Long]) = { val initialGraph = createGraph(graph) val graphWeight = initialGraph.vertices.map( vertex => { vertex._2.nodeWeight } ).reduce(_ + _) val broadcastGraphWeight = sc.broadcast(graphWeight) val initialModularity = initialGraph.vertices.map( vertex => { vertex._2.in / (2 * graphWeight) - vertex._2.tot * vertex._2.tot / (graphWeight * graphWeight) } ).reduce(_ + _) var level = -1 var halt = false while(!halt) { level += 1 println(s"Starting level ${level}") val (currentQ, currentGraph, passes) = runFastUnfolding(sc, initialGraph, minProgress, progressCounter) } } def runFastUnfolding(sc: SparkContext, graph: Graph[MyVertexState, Long], minProgress: Int, progressCounter: Int) = { val cachedGraph = graph.cache() } def createGraph[VD: ClassTag](graph: Graph[VD, Long]): Graph[MyVertexState, Long] = { val nodeWeights = graph.aggregateMessages[Long]( cxt => { cxt.sendToSrc(cxt.attr) cxt.sendToDst(cxt.attr) }, (a, b) => a + b, TripletFields.EdgeOnly ) nodeWeights.foreach(result => println(s"nodeweight: ${result._1}, ${result._2}")) val louvainGraph = graph.outerJoinVertices(nodeWeights)((vid, data, weightOption) => { val weight = weightOption.getOrElse(0L) val state = new MyVertexState() state.community = vid state.changed = false state.tot = weight state.in = 0 state.nodeWeight = weight state }).partitionBy(PartitionStrategy.EdgePartition2D) louvainGraph } }
Example 14
Source File: Neo4jGraphScalaTSE.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.junit.Assert._ import org.junit._ import scala.collection.JavaConverters._ object Neo4jGraphScalaTSE { } class Neo4jGraphScalaTSE extends SparkConnectorScalaBaseTSE { val FIXTURE: String = "CREATE (s:A {a:0})-[r:REL {foo:'bar'}]->(t:B {b:1}) RETURN id(s) AS source, id(t) AS target" private var source: Long = _ private var target: Long = _ @Before @throws[Exception] def setUp { val map = SparkConnectorScalaSuiteIT.session().run(FIXTURE).single() .asMap() source = map.get("source").asInstanceOf[Long] target = map.get("target").asInstanceOf[Long] } private def assertGraph(graph: Graph[_, _], expectedNodes: Long, expectedRels: Long) = { assertEquals(expectedNodes, graph.vertices.count) assertEquals(expectedRels, graph.edges.count) } @Test def runCypherQueryWithParams { val data = List(Map("id"->1,"name"->"Test").asJava).asJava Executor.execute(sc, "UNWIND $data as row CREATE (n:Test {id:row.id}) SET n.name = row.name", Map(("data",data))) } @Test def runMatrixQuery { val graph = Neo4jGraph.loadGraph(sc, "A", Seq.empty, "B") assertGraph(graph, 2, 1) } @Test def saveGraph { val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(source,target,42L))) val graph = Graph.fromEdges(edges,-1) assertGraph(graph, 2, 1) Neo4jGraph.saveGraph(sc,graph,null,("REL","test")) assertEquals(42L, SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL]->(:B) RETURN rel.test as prop").single().get("prop").asLong()) } @Test def saveGraphMerge { val edges : RDD[Edge[Long]] = sc.makeRDD(Seq(Edge(source,target,42L))) val graph = Graph.fromEdges(edges,13L) assertGraph(graph, 2, 1) Neo4jGraph.saveGraph(sc,graph,"value",("FOOBAR","test"),Option("Foo","id"),Option("Bar","id"),merge = true) assertEquals(Map("fid"->source,"bid"->target,"rv"->42L,"fv"->13L,"bv"->13L).asJava,SparkConnectorScalaSuiteIT.session().run("MATCH (foo:Foo)-[rel:FOOBAR]->(bar:Bar) RETURN {fid: foo.id, fv:foo.value, rv:rel.test,bid:bar.id,bv:bar.value} as data").single().get("data").asMap()) } @Test def saveGraphByNodeLabel { val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(0,1,42L))) val graph = Graph.fromEdges(edges,-1) assertGraph(graph, 2, 1) Neo4jGraph.saveGraph(sc,graph,null,("REL","test"),Option(("A","a")),Option(("B","b"))) assertEquals(42L,SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL]->(:B) RETURN rel.test as prop").single().get("prop").asLong()) } @Test def mergeGraphByNodeLabel { val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(source,target,42L))) val graph = Graph.fromEdges(edges,-1) assertGraph(graph, 2, 1) Neo4jGraph.saveGraph(sc,graph,null,("REL2","test"),merge = true) assertEquals(42L,SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL2]->(:B) RETURN rel.test as prop").single().get("prop").asLong()) } @Test def saveGraphNodes { val nodes : RDD[(VertexId, Long)] = sc.makeRDD(Seq((source,10L),(target,20L))) val edges : RDD[Edge[Long]] = sc.makeRDD(Seq()) val graph = Graph[Long,Long](nodes,edges,-1) assertGraph(graph, 2, 0) Neo4jGraph.saveGraph(sc,graph,"prop") assertEquals(10L,SparkConnectorScalaSuiteIT.session().run(s"MATCH (a:A) WHERE id(a) = $source RETURN a.prop as prop").single().get("prop").asLong()) assertEquals(20L,SparkConnectorScalaSuiteIT.session().run(s"MATCH (b:B) WHERE id(b) = $target RETURN b.prop as prop").single().get("prop").asLong()) } }
Example 15
Source File: PageRank.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.graphx

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object PageRank {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PageRank")
    val sc = new SparkContext(conf)

    // build vertices
    val users: RDD[(VertexId, Array[String])] = sc.parallelize(List(
      "1,BarackObama,Barack Obama",
      "2,ladygaga,Goddess of Love",
      "3,jeresig,John Resig",
      "4,justinbieber,Justin Bieber",
      "6,matei_zaharia,Matei Zaharia",
      "7,odersky,Martin Odersky",
      "8,anonsys"
    ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail)))

    // build edges
    val followers: RDD[Edge[Double]] = sc.parallelize(Array(
      Edge(2L, 1L, 1.0),
      Edge(4L, 1L, 1.0),
      Edge(1L, 2L, 1.0),
      Edge(6L, 3L, 1.0),
      Edge(7L, 3L, 1.0),
      Edge(7L, 6L, 1.0),
      Edge(6L, 7L, 1.0),
      Edge(3L, 7L, 1.0)
    ))

    // build graph
    val followerGraph: Graph[Array[String], Double] = Graph(users, followers)

    // restrict the graph to users with usernames and names
    val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2)

    // compute PageRank
    val pageRankGraph = subgraph.pageRank(0.001)

    // get attributes of the top pagerank users
    val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) {
      case (uid, attrList, Some(pr)) => (pr, attrList.toList)
      case (uid, attrList, None) => (0.0, attrList.toList)
    }

    println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n"))
  }
}
Example 16
Source File: PageRank.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.graphx import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object PageRank { def main(args: Array[String]): Unit = { val spark = SparkSession .builder() .appName("PageRank") .getOrCreate() val sc = spark.sparkContext // build vertices val users: RDD[(VertexId, Array[String])] = sc.parallelize(List( "1,BarackObama,Barack Obama", "2,ladygaga,Goddess of Love", "3,jeresig,John Resig", "4,justinbieber,Justin Bieber", "6,matei_zaharia,Matei Zaharia", "7,odersky,Martin Odersky", "8,anonsys" ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail))) // build edges val followers: RDD[Edge[Double]] = sc.parallelize(Array( Edge(2L, 1L, 1.0), Edge(4L, 1L, 1.0), Edge(1L, 2L, 1.0), Edge(6L, 3L, 1.0), Edge(7L, 3L, 1.0), Edge(7L, 6L, 1.0), Edge(6L, 7L, 1.0), Edge(3L, 7L, 1.0) )) // build graph val followerGraph: Graph[Array[String], Double] = Graph(users, followers) // restrict the graph to users with usernames and names val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2) // compute PageRank val pageRankGraph = subgraph.pageRank(0.001) // get attributes of the top pagerank users val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) { case (uid, attrList, Some(pr)) => (pr, attrList.toList) case (uid, attrList, None) => (0.0, attrList.toList) } println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n")) } }
Example 17
Source File: SparkPersistence.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.persistence import java.io.File import edu.msstate.dasi.csb.model.{EdgeData, VertexData} import edu.msstate.dasi.csb.sc import edu.msstate.dasi.csb.util.Util import org.apache.hadoop.fs.FileUtil import org.apache.spark.graphx.{Edge, Graph, VertexId} import org.apache.spark.storage.StorageLevel object SparkPersistence extends GraphPersistence { private val vertices_suffix = "_vertices" private val edges_suffix = "_edges" def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite: Boolean = false): Unit = { val verticesPath = graphName + vertices_suffix val verticesTmpPath = "__" + verticesPath val edgesPath = graphName + edges_suffix val edgesTmpPath = "__" + edgesPath if (overwrite) { FileUtil.fullyDelete(new File(verticesPath)) FileUtil.fullyDelete(new File(edgesPath)) } graph.vertices.saveAsTextFile(verticesTmpPath) Util.merge(verticesTmpPath, verticesPath) FileUtil.fullyDelete(new File(verticesTmpPath)) graph.edges.saveAsTextFile(edgesTmpPath) Util.merge(edgesTmpPath, edgesPath) FileUtil.fullyDelete(new File(edgesTmpPath)) } }
Example 18
package edu.msstate.dasi.csb.workload.spark import edu.msstate.dasi.csb.workload.Workload import org.apache.spark.graphx.{Graph, VertexId} import scala.reflect.ClassTag def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = { // if (src == dst) return List(src) if (src == dst) return // The attribute of each vertex is (dist from src, id of vertex with dist-1) var g: Graph[(Int, VertexId), ED] = graph.mapVertices((id, _) => (if (id == src) 0 else Int.MaxValue, 0L)).cache() // Traverse forward from src var dstAttr = (Int.MaxValue, 0L) while (dstAttr._1 == Int.MaxValue) { val msgs = g.aggregateMessages[(Int, VertexId)](e => if (e.srcAttr._1 != Int.MaxValue && e.srcAttr._1 + 1 < e.dstAttr._1) { e.sendToDst((e.srcAttr._1 + 1, e.srcId)) }, (a, b) => if (a._1 < b._1) a else b).cache() // if (msgs.count == 0) return List.empty if (msgs.count == 0) return g = g.ops.joinVertices(msgs) { (_, oldAttr, newAttr) => if (newAttr._1 < oldAttr._1) newAttr else oldAttr }.cache() dstAttr = g.vertices.filter(_._1 == dst).first()._2 } // Traverse backward from dst and collect the path var path: List[VertexId] = dstAttr._2 :: dst :: Nil while (path.head != src) { path = g.vertices.filter(_._1 == path.head).first()._2._2 :: path } // path } }
Example 19
Source File: SSSP.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.spark import edu.msstate.dasi.csb.workload.Workload import org.apache.spark.graphx.{Graph, VertexId} import scala.reflect.ClassTag def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = { for (dst <- graph.vertices.keys.toLocalIterator) { bfs(graph, src, dst) } } private def bfs[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], src: VertexId, dst: VertexId): Unit = { // if (src == dst) return List(src) if (src == dst) return // The attribute of each vertex is (dist from src, id of vertex with dist-1) var g: Graph[(Int, VertexId), ED] = graph.mapVertices((id, _) => (if (id == src) 0 else Int.MaxValue, 0L)).cache() // Traverse forward from src var dstAttr = (Int.MaxValue, 0L) while (dstAttr._1 == Int.MaxValue) { val msgs = g.aggregateMessages[(Int, VertexId)](e => if (e.srcAttr._1 != Int.MaxValue && e.srcAttr._1 + 1 < e.dstAttr._1) { e.sendToDst((e.srcAttr._1 + 1, e.srcId)) }, (a, b) => if (a._1 < b._1) a else b).cache() // if (msgs.count == 0) return List.empty if (msgs.count == 0) return g = g.ops.joinVertices(msgs) { (_, oldAttr, newAttr) => if (newAttr._1 < oldAttr._1) newAttr else oldAttr }.cache() dstAttr = g.vertices.filter(_._1 == dst).first()._2 } // Traverse backward from dst and collect the path var path: List[VertexId] = dstAttr._2 :: dst :: Nil while (path.head != src) { path = g.vertices.filter(_._1 == path.head).first()._2._2 :: path } // path } }
Example 20
Source File: ClosenessCentrality.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.workload.spark import edu.msstate.dasi.csb.workload.Workload import org.apache.spark.graphx.{EdgeDirection, Graph, VertexId} import scala.collection.mutable import scala.reflect.ClassTag def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = { getClosenessOfVert(vertex, graph) } private class DistanceNodePair(var distance: Long, var totalPairs: Long) extends Comparable[DistanceNodePair] { override def compareTo(dp: DistanceNodePair): Int = (this.distance - dp.distance).toInt } private class NodeVisitCounter extends java.io.Serializable { var totalPairs: Long = _ var levelSize: mutable.HashMap[Long, Long] = _ //first is distance second is pair at that distance } private def BFSNode[VD: ClassTag, ED: ClassTag](nID: Long, graph: Graph[VD, ED]): NodeVisitCounter = { val q = new mutable.Queue[Long]() q.enqueue(nID) val visited = new mutable.HashSet[VertexId]() val levelSize = new mutable.HashMap[Long, Long]() visited.add(nID) var totalPairs: Long = 0 val visitCounter = new NodeVisitCounter() var level = 0 while (q.nonEmpty) { val size = q.size totalPairs += size if (level != 0) { levelSize.put(level, size) } val list: Array[Long] = new Array[Long](size) for (x <- 0 until size) { list(x) = q.dequeue() } var children: Array[VertexId] = null if (list.length > 0) { for (x <- list) { val node: VertexId = x if (graph.collectNeighborIds(EdgeDirection.Out).lookup(node).nonEmpty) { children = graph.collectNeighborIds(EdgeDirection.Out).lookup(node).head // children = hashmap.value.get(x).head for (c: Long <- children) { // val childNode = graph.vertices.lookup(c) //hashmap.value.get(c).head if (!visited.contains(c)) { q.enqueue(c) visited.add(c) } } } } } level += 1 } totalPairs -= 1 visitCounter.levelSize = levelSize visitCounter.totalPairs = totalPairs visitCounter } private def getClosenessOfVert[VD: ClassTag, ED: ClassTag](vertex: VertexId, graph: Graph[VD, ED]): Double = { val visitCenter = BFSNode(vertex, graph) var denominator: Long = 0L for (x <- visitCenter.levelSize.keySet) { denominator += visitCenter.levelSize.get(x).head * x } if (denominator == 0) return -1 val count = graph.vertices.count().toDouble count / denominator } }
Example 21
Source File: FindInfluencer.scala From spark-graphx-twitter with Apache License 2.0 | 5 votes |
package com.knoldus.spark.graphx.example

import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object FindInfluencer {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Twittter Influencer").setMaster("local[*]")
    val sparkContext = new SparkContext(conf)
    sparkContext.setLogLevel("ERROR")

    val twitterData = sparkContext.textFile("src/main/resources/twitter-graph-data.txt")

    val followeeVertices: RDD[(VertexId, String)] = twitterData.map(_.split(",")).map { arr =>
      val user = arr(0).replace("((", "")
      val id = arr(1).replace(")", "")
      (id.toLong, user)
    }

    val followerVertices: RDD[(VertexId, String)] = twitterData.map(_.split(",")).map { arr =>
      val user = arr(2).replace("(", "")
      val id = arr(3).replace("))", "")
      (id.toLong, user)
    }

    val vertices = followeeVertices.union(followerVertices)

    val edges: RDD[Edge[String]] = twitterData.map(_.split(",")).map { arr =>
      val followeeId = arr(1).replace(")", "").toLong
      val followerId = arr(3).replace("))", "").toLong
      Edge(followeeId, followerId, "follow")
    }

    val defaultUser = ("")
    val graph = Graph(vertices, edges, defaultUser)

    val subGraph = graph.pregel("", 2, EdgeDirection.In)((_, attr, msg) => attr + "," + msg,
      triplet => Iterator((triplet.srcId, triplet.dstAttr)),
      (a, b) => (a + "," + b))

    val lengthRDD = subGraph.vertices.map(vertex => (vertex._1, vertex._2.split(",").distinct.length - 2))
      .max()(new Ordering[Tuple2[VertexId, Int]]() {
        override def compare(x: (VertexId, Int), y: (VertexId, Int)): Int =
          Ordering[Int].compare(x._2, y._2)
      })

    val userId = graph.vertices.filter(_._1 == lengthRDD._1).map(_._2).collect().head
    println(userId + " has maximum influence on network with " + lengthRDD._2 + " influencers.")

    sparkContext.stop()
  }
}
Example 22
Source File: AbstractPipeClusteringGraph.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.clustering

import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.similarity.aggregator.Mean
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

abstract class AbstractPipeClusteringGraph
  extends PipeElement[RDD[(SymPair[Tuple], Array[Double])], RDD[Set[Tuple]]]
  with Serializable {

  def cluster(graph: Graph[Tuple, Double]): RDD[Set[Tuple]]

  def step(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): RDD[Set[Tuple]] = {
    val duplicatePairsWithSimilarity = input.map(
      pair => (pair._1, Mean.agrSimilarity(pair._2))
    )

    val edges: RDD[Edge[Double]] = duplicatePairsWithSimilarity.map(
      pair => Edge(pair._1._1.id, pair._1._2.id, pair._2)
    )

    // TODO optimize: it would be nice to build the graph only by using edge triplets
    // but as far as I know that's not possible
    val verticesNotUnique: RDD[(VertexId, Tuple)] = duplicatePairsWithSimilarity.map(_._1).flatMap(
      tuplePair => Seq(tuplePair._1, tuplePair._2)
    ).map(tuple => (tuple.id, tuple))

    // delete all duplicate vertices
    val vertices = verticesNotUnique.distinct()

    // The edge type Boolean is just a workaround because no edge types are needed
    val graph: Graph[Tuple, Double] = Graph.apply(vertices, edges, null)

    cluster(graph)
  }
}
Example 23
Source File: InputDataFlow.scala From spark-graphx with GNU General Public License v3.0 | 5 votes |
package com.github.graphx.pregel.social

import org.apache.spark.graphx.{Edge, VertexId}

import scala.collection.mutable.ListBuffer

object InputDataFlow {

  def parseNames(line: String): Option[(VertexId, String)] = {
    val fields = line.split('\t')
    if (fields.length > 1)
      Some(fields(0).trim().toLong, fields(1))
    else None
  }

  def makeEdges(line: String): List[Edge[Int]] = {
    var edges = new ListBuffer[Edge[Int]]()
    val fields = line.split(" ")
    val origin = fields(0)
    (1 until fields.length)
      .foreach { p => edges += Edge(origin.toLong, fields(p).toLong, 0) }
    edges.toList
  }
}
Example 24
Source File: ShortestPathProblemJob.scala From spark-graphx with GNU General Public License v3.0 | 5 votes |
package com.github.graphx.pregel.jobs.ssp

import com.github.graphx.pregel.ssp.ShortestPathProblem
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.graphx.VertexId

object ShortestPathProblemJob extends App {

  Logger.getLogger("org").setLevel(Level.ERROR)

  val sc = new SparkContext("local[*]", "ShortestPathProblemDemo")
  val ssp = new ShortestPathProblem(sc)

  val sourceIdForTest: VertexId = 3
  val sourceIdForRandom: VertexId = 75

  val testGraph = ssp.testGraph
  val resultOnTestGraph = ssp.shortestPath(testGraph, sourceIdForTest)
  println(s"Test Graph:\n${ssp.graphToString(testGraph)}\n\n" +
    s"Distances on the test graph $resultOnTestGraph\n")

  val randomGraph = ssp.randomGraph
  val resultOnRandomGraph = ssp.shortestPath(randomGraph, sourceIdForRandom)
  println(s"Distances on the random graph $resultOnRandomGraph\n")
}
Example 25
Source File: GraphFramesExample.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
//import org.graphframes._

object GraphFramesExample extends App {

  val conf = new SparkConf()
    .setAppName("RDD graph")
    .setMaster("local[4]")
  val sc = new SparkContext(conf)

  val vertices: RDD[(VertexId, String)] = sc.parallelize(
    Array((1L, "Anne"),
      (2L, "Bernie"),
      (3L, "Chris"),
      (4L, "Don"),
      (5L, "Edgar")))

  val edges: RDD[Edge[String]] = sc.parallelize(
    Array(Edge(1L, 2L, "likes"),
      Edge(2L, 3L, "trusts"),
      Edge(3L, 4L, "believes"),
      Edge(4L, 5L, "worships"),
      Edge(1L, 3L, "loves"),
      Edge(4L, 1L, "dislikes")))

  val friendGraph: Graph[String, String] = Graph(vertices, edges)

  //  val friendGraphFrame = GraphFrame.fromGraphX(friendGraph)
  //
  //  friendGraphFrame.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3)").filter(
  //    "e1.attr = 'trusts' OR v3.attr = 'Chris'"
  //  ).collect.foreach(println)
}
Example 26
Source File: LocalRunner.scala From spark-betweenness with Apache License 2.0 | 5 votes |
package com.centrality.kBC

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD

object MainRunner {

  def main(args: Array[String]) {
    // Create spark context
    val appName = "kBC"
    val sparkMode = "local"
    val conf = new SparkConf().setAppName(appName).setMaster(sparkMode);
    val sc = new SparkContext(conf);

    // Create sample graph
    //
    // Create an RDD for vertices
    val users: RDD[(VertexId, (String, String))] =
      sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
        (5L, ("franklin", "prof")), (2L, ("istoica", "prof"))))
    // Create an RDD for edges
    val relationships: RDD[Edge[String]] =
      sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"),
        Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi")))
    // Define a default user in case there are relationship with missing user
    val defaultUser = ("John Doe", "Missing")
    // Build the initial Graph
    val graph = Graph(users, relationships, defaultUser)

    val kBCGraph = KBetweenness.run(graph, 3)
  }
}
Example 27
Source File: VertexAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite

class VertexAPI extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("Should use Vertex API") {
    //given
    val users: RDD[(VertexId, (String))] =
      spark.parallelize(Array(
        (1L, "a"), (2L, "b"), (3L, "c"), (4L, "d")
      ))

    val relationships =
      spark.parallelize(Array(
        Edge(1L, 2L, "friend"),
        Edge(1L, 3L, "friend"),
        Edge(2L, 4L, "wife")
      ))

    val graph = Graph(users, relationships)

    //when
    val res = graph.mapVertices((_, att) => att.toUpperCase())

    res.vertices.collect().toList
  }
}
Example 28
Source File: EdgeAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite

class EdgeAPI extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("Should use Edge API") {
    //given
    val users: RDD[(VertexId, (String))] =
      spark.parallelize(Array(
        (1L, "a"), (2L, "b"), (3L, "c"), (4L, "d")
      ))

    val relationships =
      spark.parallelize(Array(
        Edge(1L, 2L, "friend"),
        Edge(1L, 3L, "friend"),
        Edge(2L, 4L, "wife")
      ))

    val graph = Graph(users, relationships)

    //when
    val res = graph.mapEdges(e => e.attr.toUpperCase)

    println(res.edges.collect().toList)
  }
}
Example 29
Source File: SSSPExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.graphx // $example on$ import org.apache.spark.graphx.{Graph, VertexId} import org.apache.spark.graphx.util.GraphGenerators // $example off$ import org.apache.spark.sql.SparkSession object SSSPExample { def main(args: Array[String]): Unit = { // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") .getOrCreate() val sc = spark.sparkContext // $example on$ // A graph with edge attributes containing distances val graph: Graph[Long, Double] = GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble) val sourceId: VertexId = 42 // The ultimate source // Initialize the graph such that all vertices except the root have distance infinity. val initialGraph = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity) val sssp = initialGraph.pregel(Double.PositiveInfinity)( (id, dist, newDist) => math.min(dist, newDist), // Vertex Program triplet => { // Send Message if (triplet.srcAttr + triplet.attr < triplet.dstAttr) { Iterator((triplet.dstId, triplet.srcAttr + triplet.attr)) } else { Iterator.empty } }, (a, b) => math.min(a, b) // Merge Message ) println(sssp.vertices.collect.mkString("\n")) // $example off$ spark.stop() } } // scalastyle:on println
Example 30
Source File: SSSPExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.graphx // $example on$ import org.apache.spark.graphx.{Graph, VertexId} import org.apache.spark.graphx.util.GraphGenerators // $example off$ import org.apache.spark.sql.SparkSession object SSSPExample { def main(args: Array[String]): Unit = { // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") .getOrCreate() val sc = spark.sparkContext // $example on$ // A graph with edge attributes containing distances val graph: Graph[Long, Double] = GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble) val sourceId: VertexId = 42 // The ultimate source // Initialize the graph such that all vertices except the root have distance infinity. val initialGraph = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity) val sssp = initialGraph.pregel(Double.PositiveInfinity)( (id, dist, newDist) => math.min(dist, newDist), // Vertex Program triplet => { // Send Message if (triplet.srcAttr + triplet.attr < triplet.dstAttr) { Iterator((triplet.dstId, triplet.srcAttr + triplet.attr)) } else { Iterator.empty } }, (a, b) => math.min(a, b) // Merge Message ) println(sssp.vertices.collect.mkString("\n")) // $example off$ spark.stop() } } // scalastyle:on println
Example 31
Source File: SSSPExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.graphx // $example on$ import org.apache.spark.graphx.{Graph, VertexId} import org.apache.spark.graphx.util.GraphGenerators // $example off$ import org.apache.spark.sql.SparkSession object SSSPExample { def main(args: Array[String]): Unit = { // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") .getOrCreate() val sc = spark.sparkContext // $example on$ // A graph with edge attributes containing distances val graph: Graph[Long, Double] = GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble) val sourceId: VertexId = 42 // The ultimate source // Initialize the graph such that all vertices except the root have distance infinity. val initialGraph = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity) val sssp = initialGraph.pregel(Double.PositiveInfinity)( (id, dist, newDist) => math.min(dist, newDist), // Vertex Program triplet => { // Send Message if (triplet.srcAttr + triplet.attr < triplet.dstAttr) { Iterator((triplet.dstId, triplet.srcAttr + triplet.attr)) } else { Iterator.empty } }, (a, b) => math.min(a, b) // Merge Message ) println(sssp.vertices.collect.mkString("\n")) // $example off$ spark.stop() } } // scalastyle:on println
Example 32
Source File: ShortestPathLengthsFromCSV.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.examples import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes._ import ml.sparkling.graph.operators.algorithms.shortestpaths.ShortestPathsAlgorithm import ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors.fastutils.FastUtilWithDistance.DataMap import ml.sparkling.graph.operators.predicates.AllPathPredicate import org.apache.spark.broadcast.Broadcast import org.apache.spark.graphx.{Graph, VertexId} import scala.collection.JavaConversions._ object ShortestPathLengthsFromCSV extends ExampleApp { def body()={ val shortestPaths =if(bucketSize == -1l) ShortestPathsAlgorithm.computeShortestPathsLengths(partitionedGraph,AllPathPredicate,treatAsUndirected) else ShortestPathsAlgorithm.computeShortestPathsLengthsIterative(partitionedGraph,(g:Graph[_,_])=>bucketSize,treatAsUndirected) val size: Broadcast[VertexId] =ctx.broadcast(partitionedGraph.numVertices) partitionedGraph.outerJoinVertices(shortestPaths.vertices)(Util.dataTransformFunction(size) _).vertices.values.saveAsTextFile(out) ctx.stop() } } private object Util{ def dataTransformFunction(size: Broadcast[VertexId])(vId: VertexId,oldValue: String,pathsOption: Option[_ >: DataMap <: JMap[JLong, JDouble]])={ pathsOption.flatMap((paths)=>{ var entries=paths.entrySet().toList.sortBy(_.getKey) val out=new StringBuilder() out++=s"${oldValue}," var a = 0l while (a < size.value) { if (entries.size > 0 && a == entries.head.getKey) { out ++= s"${entries.head.getValue}," entries = entries.drop(1) } else { out ++= "0," } a += 1l } out.setLength(out.length - 1) Option(out.toString()) }).getOrElse(oldValue) } }
Example 33
Source File: PSCANBasedPartitioning.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.partitioning import java.util.UUID import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.ComponentID import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.{DefaultPartitionOperator, logger} import org.apache.log4j.Logger import org.apache.spark.SparkContext import org.apache.spark.graphx.{Graph, VertexId} import scala.collection.mutable import scala.reflect.ClassTag object PSCANBasedPartitioning { @transient val logger=Logger.getLogger(PSCANBasedPartitioning.getClass()) def partitionGraphBy[VD:ClassTag,ED:ClassTag](graph:Graph[VD,ED],numberOfPartitions:Int, maxIterations:Int = Int.MaxValue)(implicit sc:SparkContext): Graph[VD, ED] ={ val (numberOfCommunities: VertexId, coarsedVertexMap: Map[VertexId, Int], coarsedNumberOfPartitions: Int, strategy: ByComponentIdPartitionStrategy) = buildPartitioningStrategy(graph, numberOfPartitions, maxIterations = maxIterations) logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries and ${coarsedNumberOfPartitions} partitions (before ${numberOfCommunities})") val out=graph.partitionBy(strategy,numberOfPartitions).cache() out.edges.foreachPartition((_)=>{}) out.triplets.foreachPartition((_)=>{}) out.vertices.foreachPartition((_)=>{}) out } def buildPartitioningStrategy[ED: ClassTag, VD: ClassTag](graph: Graph[VD, ED], numberOfPartitions: Int, maxIterations:Int = Int.MaxValue)(implicit sc:SparkContext) = { val (numberOfCommunities: VertexId, coarsedVertexMap: Map[VertexId, Int], coarsedNumberOfPartitions: Int) = precomputePartitions(graph, numberOfPartitions, maxIterations = maxIterations) logger.info(s"Requested $numberOfPartitions partitions, computed $coarsedNumberOfPartitions") val strategy = ByComponentIdPartitionStrategy(coarsedVertexMap, numberOfPartitions, DefaultPartitionOperator) (numberOfCommunities, coarsedVertexMap, coarsedNumberOfPartitions, strategy) } def precomputePartitions[ED: ClassTag, VD: ClassTag](graph: Graph[VD, ED], numberOfPartitions: Int, maxIterations:Int = Int.MaxValue)(implicit sc:SparkContext) = { logger.info("Computing components using PSCAN") val (communities, numberOfCommunities): (Graph[ComponentID, ED], VertexId) = PSCAN.computeConnectedComponentsUsing(graph, numberOfPartitions, maxIterations = maxIterations) val computationData=communities.vertices.map(t=>t).localCheckpoint() logger.info("Components computed!") val (coarsedVertexMap, coarsedNumberOfPartitions) = ParallelPartitioningUtils.coarsePartitions(numberOfPartitions, numberOfCommunities, computationData) (numberOfCommunities, coarsedVertexMap, coarsedNumberOfPartitions) } }
Example 34
Source File: CommunityBasedPartitioning.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.partitioning import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.{CommunityDetectionAlgorithm, CommunityDetectionMethod, ComponentID} import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.DefaultPartitionOperator import org.apache.log4j.Logger import org.apache.spark.{Partitioner, SparkContext} import org.apache.spark.broadcast.Broadcast import org.apache.spark.graphx.{Graph, PartitionID, PartitionStrategy, VertexId} import scala.reflect.ClassTag object CommunityBasedPartitioning { @transient val logger=Logger.getLogger(CommunityBasedPartitioning.getClass()) def partitionGraphBy[VD:ClassTag,ED:ClassTag](graph:Graph[VD,ED],communityDetectionMethod:CommunityDetectionMethod[VD,ED],numParts:Int= -1)(implicit sc:SparkContext): Graph[VD, ED] ={ val numberOfPartitions=if (numParts== -1) sc.defaultParallelism else numParts val communities: Graph[ComponentID, ED] = communityDetectionMethod(graph) val numberOfCommunities=communities.vertices.values.countApproxDistinct() val (coarsedVertexMap,coarsedNumberOfPartitions) = ParallelPartitioningUtils.coarsePartitions(numberOfPartitions,numberOfCommunities,communities.vertices) val strategy=ByComponentIdPartitionStrategy(coarsedVertexMap,coarsedNumberOfPartitions, DefaultPartitionOperator) logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries and ${coarsedNumberOfPartitions} partitions") val out=graph.partitionBy(strategy,numberOfCommunities.toInt).cache() out.edges.foreachPartition((_)=>{}) out.vertices.foreachPartition((_)=>{}) out } def partitionGraphUsing[VD:ClassTag,ED:ClassTag](graph:Graph[VD,ED],communityDetectionMethod:CommunityDetectionAlgorithm,numParts:Int= -1)(implicit sc:SparkContext): Graph[VD, ED] ={ partitionGraphBy(graph,communityDetectionMethod.detectCommunities[VD,ED](_),numParts) } }
Example 35
Source File: CFBCFlow.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.betweenness.flow.struct

import org.apache.spark.graphx.VertexId

class CFBCFlow(val src: VertexId, val dst: VertexId, val potential: Double, val completed: Boolean, val aliveThrough: Int) extends Serializable {

  def supplyValue(vertexId: VertexId) = vertexId match {
    case `src` => 1
    case `dst` => -1
    case _ => 0
  }

  val key = (src, dst)

  val removable = completed && aliveThrough <= 0

  def countdownVitality =
    if (aliveThrough > 0) CFBCFlow(src, dst, potential, completed, aliveThrough - 1) else this
}

object CFBCFlow extends Serializable {

  def apply(src: VertexId,
            dst: VertexId,
            potential: Double = 1.0,
            completed: Boolean = false,
            aliveThrough: Int = 3
           ): CFBCFlow = new CFBCFlow(src, dst, potential, completed, aliveThrough)

  def updatePotential(flow: CFBCFlow, newPotential: Double, eps: Double = 0.0) = {
    val completed = Math.abs(flow.potential - newPotential) > eps
    CFBCFlow(flow.src, flow.dst, newPotential, completed, flow.aliveThrough)
  }

  def empty(key: (VertexId, VertexId)) = key match {
    case (src, dst) => CFBCFlow(src, dst, 0.0)
  }
}
Example 36
Source File: CFBCVertex.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.betweenness.flow.struct import org.apache.spark.graphx.VertexId class CFBCVertex( val id: VertexId, val degree: Int, val bc: Double, val sampleVertices: Array[VertexId], val flows: (Array[CFBCFlow], Iterable[CFBCNeighbourFlow]), val processedFlows: Int) extends Serializable { lazy val relatedFlows = vertexFlows.filter(f => f.dst == id || f.src == id) lazy val availableSamples = sampleVertices lazy val vertexPhi = vertexFlows.count(_.src == id) lazy val flowsMap = vertexFlows.map(f => ((f.src, f.dst), f)).toMap val (vertexFlows, neighboursFlows) = flows def isFinalized(k: Int) = sampleVertices.isEmpty || processedFlows >= k def getFlow(key: (VertexId, VertexId)) = flowsMap.getOrElse(key, CFBCFlow.empty(key)) def updateBC(currentFlowing: Double) = { val newBC = (processedFlows * bc + currentFlowing) / (processedFlows + 1) new CFBCVertex(id, degree, newBC, sampleVertices, flows, processedFlows + 1) } def updateBC(currentFlowing: Seq[Double]) = { val newBC = if (currentFlowing.isEmpty) bc else (processedFlows * bc + currentFlowing.sum) / (processedFlows + currentFlowing.length) new CFBCVertex(id, degree, newBC, sampleVertices, flows, processedFlows + currentFlowing.length) } def addNewFlow(flow: CFBCFlow) = new CFBCVertex(id, degree, bc, sampleVertices.filterNot(_ == flow.dst), (vertexFlows :+ flow, neighboursFlows), processedFlows) def updateFlows(fls: Array[CFBCFlow]) = new CFBCVertex(id, degree, bc, sampleVertices, (fls, neighboursFlows), processedFlows) def removeFlows(toRemove: Seq[CFBCFlow]) = { val newFlows = vertexFlows.diff(toRemove).map(_.countdownVitality) new CFBCVertex(id, degree, bc, sampleVertices, (newFlows, neighboursFlows), processedFlows) } def applyNeighbourFlows(nbhFlows: Iterable[CFBCNeighbourFlow]) = new CFBCVertex(id, degree, bc, sampleVertices, (vertexFlows, nbhFlows), processedFlows) } object CFBCVertex extends Serializable { def apply(id: VertexId, degree: Int, bc: Double = 0.0, sampleVertices: Array[VertexId] = Array.empty, flows: (Array[CFBCFlow], Iterable[CFBCNeighbourFlow]) = (Array.empty, Iterable.empty) ): CFBCVertex = new CFBCVertex(id, degree, bc, sampleVertices, flows, 0) }
Example 37
Source File: CFBCNeighbourFlow.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.betweenness.flow.struct import org.apache.spark.graphx.VertexId class CFBCNeighbourFlow( val src: VertexId, val dst: VertexId, val sumOfPotential: Double, val sumOfDifferences: Double, val numberOfFlows: Int, val allCompleted: Boolean, val anyCompleted: Boolean) extends Serializable { val key = (src, dst) } object CFBCNeighbourFlow extends Serializable { def apply(src: VertexId, dst: VertexId, sumOfPotential: Double = .0, sumOfDifferences: Double = .0, numberOfFlows: Int = 0, allCompleted: Boolean = true, anyCompleted: Boolean = true ): CFBCNeighbourFlow = new CFBCNeighbourFlow(src, dst, sumOfPotential, sumOfDifferences, numberOfFlows, allCompleted, anyCompleted) def apply(key: (VertexId, VertexId)): CFBCNeighbourFlow = key match { case (src, dst) => apply(src, dst) } def apply(flows: Iterable[CFBCFlow], vertex: CFBCVertex): CFBCNeighbourFlow = { def aggregatePotential(vertexFlow: CFBCFlow)(acc: NeighbourFlowStats, flow: CFBCFlow) = NeighbourFlowStats.fromFlow(vertexFlow)(flow).merge(acc) def mergePotential(acc1: NeighbourFlowStats, acc2: NeighbourFlowStats) = acc1.merge(acc2) val (src, dst) = flows.headOption.map(_.key) match { case Some(k) => k case None => throw new RuntimeException("Empty flows!") } val aggregaeFunc = aggregatePotential(vertex.getFlow((src, dst))) _ val stats = flows.aggregate(NeighbourFlowStats.empty)(aggregaeFunc, mergePotential) CFBCNeighbourFlow(src, dst, stats.potential, stats.sumPotentialDiff, flows.size, stats.allCompleted, stats.anyCompleted) } class NeighbourFlowStats( val potential: Double, val sumPotentialDiff: Double, val allCompleted: Boolean, val anyCompleted: Boolean) extends Serializable { def merge(other: NeighbourFlowStats): NeighbourFlowStats = { NeighbourFlowStats( potential + other.potential, sumPotentialDiff + other.sumPotentialDiff, allCompleted && other.allCompleted, anyCompleted || other.anyCompleted) } } object NeighbourFlowStats extends Serializable { def apply(potential: Double, sumPotentialDiff: Double, allCompleted: Boolean, anyCompleted: Boolean): NeighbourFlowStats = new NeighbourFlowStats(potential, sumPotentialDiff, allCompleted, anyCompleted) def fromFlow(vertexFlow: CFBCFlow)(nbflow: CFBCFlow): NeighbourFlowStats = apply(nbflow.potential, Math.abs(nbflow.potential - vertexFlow.potential), nbflow.completed, nbflow.completed) def empty = apply(.0, .0, true, false) } }
Example 38
Source File: EdmondsMessage.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.measures.vertex.betweenness.edmonds.struct.messages

import org.apache.spark.graphx.VertexId

class EdmondsMessage(val preds: List[VertexId], val sigma: Int, val depth: Int) extends Serializable {
  def merge(other: EdmondsMessage): EdmondsMessage = {
    require(depth == other.depth)
    EdmondsMessage(preds ++ other.preds, sigma + other.sigma, depth)
  }
}

object EdmondsMessage extends Serializable {
  def apply(preds: List[VertexId], sigma: Int, depth: Int): EdmondsMessage =
    new EdmondsMessage(preds, sigma, depth)

  def empty = apply(List.empty, -1, -1)
}