org.apache.spark.graphx.Edge Scala Examples
The following examples show how to use org.apache.spark.graphx.Edge.
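Before the project-specific examples below, here is a minimal, self-contained sketch (not taken from any of the listed projects; the object name, vertex IDs, and attribute values are invented for illustration) of the pattern they all share: an Edge(srcId, dstId, attr) carries two vertex IDs plus an edge attribute, and an RDD of edges is combined with an RDD of (VertexId, attribute) pairs — or used alone via Graph.fromEdges — to build a Graph.

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object EdgeQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("EdgeQuickStart").setMaster("local[*]"))

    // Each Edge[ED] holds a source vertex id, a destination vertex id and an attribute of type ED.
    val edges: RDD[Edge[String]] = sc.parallelize(Seq(
      Edge(1L, 2L, "follows"),
      Edge(2L, 3L, "follows")))

    // Vertices are (VertexId, VD) pairs; the third argument to Graph() is the default attribute
    // assigned to any vertex that appears in an edge but not in the vertex RDD (3L here).
    val vertices: RDD[(VertexId, String)] = sc.parallelize(Seq((1L, "alice"), (2L, "bob")))
    val graph: Graph[String, String] = Graph(vertices, edges, "unknown")

    // A graph can also be built from edges alone, giving every vertex the same default attribute.
    val fromEdgesOnly: Graph[String, String] = Graph.fromEdges(edges, "unknown")

    graph.triplets.collect().foreach(t => println(t.srcAttr + " -" + t.attr + "-> " + t.dstAttr))
    println("vertices inferred from edges: " + fromEdgesOnly.vertices.count())

    sc.stop()
  }
}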
Example 1
Source File: FindInfluencer.scala From spark-graphx-twitter with Apache License 2.0 | 5 votes |
package com.knoldus.spark.graphx.example

import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object FindInfluencer {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Twittter Influencer").setMaster("local[*]")
    val sparkContext = new SparkContext(conf)
    sparkContext.setLogLevel("ERROR")

    val twitterData = sparkContext.textFile("src/main/resources/twitter-graph-data.txt")
    val followeeVertices: RDD[(VertexId, String)] = twitterData.map(_.split(",")).map { arr =>
      val user = arr(0).replace("((", "")
      val id = arr(1).replace(")", "")
      (id.toLong, user)
    }
    val followerVertices: RDD[(VertexId, String)] = twitterData.map(_.split(",")).map { arr =>
      val user = arr(2).replace("(", "")
      val id = arr(3).replace("))", "")
      (id.toLong, user)
    }
    val vertices = followeeVertices.union(followerVertices)

    val edges: RDD[Edge[String]] = twitterData.map(_.split(",")).map { arr =>
      val followeeId = arr(1).replace(")", "").toLong
      val followerId = arr(3).replace("))", "").toLong
      Edge(followeeId, followerId, "follow")
    }

    val defaultUser = ("")
    val graph = Graph(vertices, edges, defaultUser)

    val subGraph = graph.pregel("", 2, EdgeDirection.In)(
      (_, attr, msg) => attr + "," + msg,
      triplet => Iterator((triplet.srcId, triplet.dstAttr)),
      (a, b) => (a + "," + b))

    val lengthRDD = subGraph.vertices.map(vertex => (vertex._1, vertex._2.split(",").distinct.length - 2))
      .max()(new Ordering[Tuple2[VertexId, Int]]() {
        override def compare(x: (VertexId, Int), y: (VertexId, Int)): Int =
          Ordering[Int].compare(x._2, y._2)
      })

    val userId = graph.vertices.filter(_._1 == lengthRDD._1).map(_._2).collect().head
    println(userId + " has maximum influence on network with " + lengthRDD._2 + " influencers.")

    sparkContext.stop()
  }
}
Example 2
Source File: SparkPersistence.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.persistence

import java.io.File

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import edu.msstate.dasi.csb.sc
import edu.msstate.dasi.csb.util.Util
import org.apache.hadoop.fs.FileUtil
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.storage.StorageLevel

object SparkPersistence extends GraphPersistence {
  private val vertices_suffix = "_vertices"
  private val edges_suffix = "_edges"

  def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite: Boolean = false): Unit = {
    val verticesPath = graphName + vertices_suffix
    val verticesTmpPath = "__" + verticesPath
    val edgesPath = graphName + edges_suffix
    val edgesTmpPath = "__" + edgesPath

    if (overwrite) {
      FileUtil.fullyDelete(new File(verticesPath))
      FileUtil.fullyDelete(new File(edgesPath))
    }

    graph.vertices.saveAsTextFile(verticesTmpPath)
    Util.merge(verticesTmpPath, verticesPath)
    FileUtil.fullyDelete(new File(verticesTmpPath))

    graph.edges.saveAsTextFile(edgesTmpPath)
    Util.merge(edgesTmpPath, edgesPath)
    FileUtil.fullyDelete(new File(edgesTmpPath))
  }
}
Example 3
Source File: PageRank.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.graphx

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object PageRank {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("PageRank")
      .getOrCreate()
    val sc = spark.sparkContext

    // build vertices
    val users: RDD[(VertexId, Array[String])] = sc.parallelize(List(
      "1,BarackObama,Barack Obama",
      "2,ladygaga,Goddess of Love",
      "3,jeresig,John Resig",
      "4,justinbieber,Justin Bieber",
      "6,matei_zaharia,Matei Zaharia",
      "7,odersky,Martin Odersky",
      "8,anonsys"
    ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail)))

    // build edges
    val followers: RDD[Edge[Double]] = sc.parallelize(Array(
      Edge(2L, 1L, 1.0),
      Edge(4L, 1L, 1.0),
      Edge(1L, 2L, 1.0),
      Edge(6L, 3L, 1.0),
      Edge(7L, 3L, 1.0),
      Edge(7L, 6L, 1.0),
      Edge(6L, 7L, 1.0),
      Edge(3L, 7L, 1.0)
    ))

    // build graph
    val followerGraph: Graph[Array[String], Double] = Graph(users, followers)

    // restrict the graph to users with usernames and names
    val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2)

    // compute PageRank
    val pageRankGraph = subgraph.pageRank(0.001)

    // get attributes of the top pagerank users
    val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) {
      case (uid, attrList, Some(pr)) => (pr, attrList.toList)
      case (uid, attrList, None) => (0.0, attrList.toList)
    }

    println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n"))
  }
}
Example 4
Source File: PageRank.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.graphx

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object PageRank {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PageRank")
    val sc = new SparkContext(conf)

    // build vertices
    val users: RDD[(VertexId, Array[String])] = sc.parallelize(List(
      "1,BarackObama,Barack Obama",
      "2,ladygaga,Goddess of Love",
      "3,jeresig,John Resig",
      "4,justinbieber,Justin Bieber",
      "6,matei_zaharia,Matei Zaharia",
      "7,odersky,Martin Odersky",
      "8,anonsys"
    ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail)))

    // build edges
    val followers: RDD[Edge[Double]] = sc.parallelize(Array(
      Edge(2L, 1L, 1.0),
      Edge(4L, 1L, 1.0),
      Edge(1L, 2L, 1.0),
      Edge(6L, 3L, 1.0),
      Edge(7L, 3L, 1.0),
      Edge(7L, 6L, 1.0),
      Edge(6L, 7L, 1.0),
      Edge(3L, 7L, 1.0)
    ))

    // build graph
    val followerGraph: Graph[Array[String], Double] = Graph(users, followers)

    // restrict the graph to users with usernames and names
    val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2)

    // compute PageRank
    val pageRankGraph = subgraph.pageRank(0.001)

    // get attributes of the top pagerank users
    val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) {
      case (uid, attrList, Some(pr)) => (pr, attrList.toList)
      case (uid, attrList, None) => (0.0, attrList.toList)
    }

    println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n"))
  }
}
Example 5
Source File: GodwinTest.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.timeseries.graph

import io.gzet.test.SparkFunSuite
import org.apache.log4j.{Logger, Level}
import org.apache.spark.graphx.{Graph, Edge}
import org.apache.spark.rdd.RDD

import scala.io.Source

class GodwinTest extends SparkFunSuite {

  Logger.getLogger("akka").setLevel(Level.OFF)
  Logger.getLogger("org").setLevel(Level.OFF)

  def buildEdges() = {
    Source.fromInputStream(getClass.getResourceAsStream("/edges.csv")).getLines().drop(1).map(s => {
      val Array(source, target, weight) = s.split(",")
      Edge(source.toLong, target.toLong, weight.toDouble)
    }).toList
  }

  localTest("Test Random Walks") { sc =>
    val edges: RDD[Edge[Double]] = sc.parallelize(buildEdges(), 1)
    val godwin = new Godwin(Seq(16))
    val walks = godwin.randomWalks(Graph.fromEdges(edges, 0L), 4).collect().sortBy(_._2)
    println(walks.map(_._1).mkString(" -> "))
    walks.last._1 should be(16)
  }
}
Example 6
Source File: GzetCommunitiesTest.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.community

import io.gzet.community.clustering.wcc.WCCDetection
import io.gzet.test.SparkFunSuite
import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.{Graph, Edge}

import scala.io.Source

class GzetCommunitiesTest extends SparkFunSuite {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  localTest("WCC communities") { spark =>
    val lines = Source.fromInputStream(getClass.getResourceAsStream("/local-edges.csv"))
      .getLines().zipWithIndex.filter(_._2 > 0).map(_._1).toSeq

    val sc = spark.sparkContext
    val edges = sc.parallelize(lines).map({ line =>
      val a = line.split(",").map(_.toLong).sorted
      Edge(a.head, a.last, 1L)
    }).distinct()

    val graph = Graph.fromEdges(edges, 0L)
    graph.triplets.take(2).foreach(println)

    val communities = new WCCDetection(1).run(graph, sc)
    communities.map(_._2 -> 1).reduceByKey(_ + _).collectAsMap() should be(
      Map(5L -> 5, 15L -> 6, 21L -> 5)
    )
  }
}
Example 7
Source File: StoryBatchDedup.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.story

import io.gzet.story.model.{Content, Article}
import org.apache.spark.graphx.{Graph, Edge}
import org.apache.spark.{Logging, SparkConf, SparkContext}
import io.gzet.story.util.SimhashUtils._
import com.datastax.spark.connector._

object StoryBatchDedup extends SimpleConfig with Logging {

  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setAppName("Story Extractor")
    val sc = new SparkContext(sparkConf)

    val simhashRDD = sc.cassandraTable[Article]("gzet", "articles").zipWithIndex().map({ case (a, id) =>
      ((id, Content(a.url, a.title, a.body)), a.hash)
    })
    Set(0)

    val duplicateTupleRDD = simhashRDD.flatMap({ case ((id, content), simhash) =>
      searchmasks.map({ mask =>
        (simhash ^ mask, id)
      })
    }).groupByKey()

    val edgeRDD = duplicateTupleRDD.values.flatMap({ it =>
      val list = it.toList
      for (x <- list; y <- list) yield (x, y)
    }).filter({ case (x, y) =>
      x != y
    }).distinct().map({ case (x, y) =>
      Edge(x, y, 0)
    })

    val duplicateRDD = Graph.fromEdges(edgeRDD, 0L)
      .connectedComponents()
      .vertices
      .join(simhashRDD.keys)
      .values

    duplicateRDD.sortBy(_._1).collect().foreach({ case (story, content) =>
      println(story + "\t" + content.title)
    })
  }
}
Example 8
Source File: Neo4jGraphScalaTSE.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.junit.Assert._
import org.junit._

import scala.collection.JavaConverters._

object Neo4jGraphScalaTSE {
}

class Neo4jGraphScalaTSE extends SparkConnectorScalaBaseTSE {

  val FIXTURE: String = "CREATE (s:A {a:0})-[r:REL {foo:'bar'}]->(t:B {b:1}) RETURN id(s) AS source, id(t) AS target"

  private var source: Long = _
  private var target: Long = _

  @Before
  @throws[Exception]
  def setUp {
    val map = SparkConnectorScalaSuiteIT.session().run(FIXTURE).single().asMap()
    source = map.get("source").asInstanceOf[Long]
    target = map.get("target").asInstanceOf[Long]
  }

  private def assertGraph(graph: Graph[_, _], expectedNodes: Long, expectedRels: Long) = {
    assertEquals(expectedNodes, graph.vertices.count)
    assertEquals(expectedRels, graph.edges.count)
  }

  @Test
  def runCypherQueryWithParams {
    val data = List(Map("id" -> 1, "name" -> "Test").asJava).asJava
    Executor.execute(sc, "UNWIND $data as row CREATE (n:Test {id:row.id}) SET n.name = row.name", Map(("data", data)))
  }

  @Test
  def runMatrixQuery {
    val graph = Neo4jGraph.loadGraph(sc, "A", Seq.empty, "B")
    assertGraph(graph, 2, 1)
  }

  @Test
  def saveGraph {
    val edges: RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(source, target, 42L)))
    val graph = Graph.fromEdges(edges, -1)
    assertGraph(graph, 2, 1)
    Neo4jGraph.saveGraph(sc, graph, null, ("REL", "test"))
    assertEquals(42L, SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL]->(:B) RETURN rel.test as prop").single().get("prop").asLong())
  }

  @Test
  def saveGraphMerge {
    val edges: RDD[Edge[Long]] = sc.makeRDD(Seq(Edge(source, target, 42L)))
    val graph = Graph.fromEdges(edges, 13L)
    assertGraph(graph, 2, 1)
    Neo4jGraph.saveGraph(sc, graph, "value", ("FOOBAR", "test"), Option("Foo", "id"), Option("Bar", "id"), merge = true)
    assertEquals(Map("fid" -> source, "bid" -> target, "rv" -> 42L, "fv" -> 13L, "bv" -> 13L).asJava,
      SparkConnectorScalaSuiteIT.session().run("MATCH (foo:Foo)-[rel:FOOBAR]->(bar:Bar) RETURN {fid: foo.id, fv:foo.value, rv:rel.test,bid:bar.id,bv:bar.value} as data").single().get("data").asMap())
  }

  @Test
  def saveGraphByNodeLabel {
    val edges: RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(0, 1, 42L)))
    val graph = Graph.fromEdges(edges, -1)
    assertGraph(graph, 2, 1)
    Neo4jGraph.saveGraph(sc, graph, null, ("REL", "test"), Option(("A", "a")), Option(("B", "b")))
    assertEquals(42L, SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL]->(:B) RETURN rel.test as prop").single().get("prop").asLong())
  }

  @Test
  def mergeGraphByNodeLabel {
    val edges: RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(source, target, 42L)))
    val graph = Graph.fromEdges(edges, -1)
    assertGraph(graph, 2, 1)
    Neo4jGraph.saveGraph(sc, graph, null, ("REL2", "test"), merge = true)
    assertEquals(42L, SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL2]->(:B) RETURN rel.test as prop").single().get("prop").asLong())
  }

  @Test
  def saveGraphNodes {
    val nodes: RDD[(VertexId, Long)] = sc.makeRDD(Seq((source, 10L), (target, 20L)))
    val edges: RDD[Edge[Long]] = sc.makeRDD(Seq())
    val graph = Graph[Long, Long](nodes, edges, -1)
    assertGraph(graph, 2, 0)
    Neo4jGraph.saveGraph(sc, graph, "prop")
    assertEquals(10L, SparkConnectorScalaSuiteIT.session().run(s"MATCH (a:A) WHERE id(a) = $source RETURN a.prop as prop").single().get("prop").asLong())
    assertEquals(20L, SparkConnectorScalaSuiteIT.session().run(s"MATCH (b:B) WHERE id(b) = $target RETURN b.prop as prop").single().get("prop").asLong())
  }
}
Example 9
Source File: PairwiseBPSuite.scala From sandpiper with Apache License 2.0 | 5 votes |
package sparkle.graph

import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite
import sparkle.util.LocalSparkContext

class PairwiseBPSuite extends FunSuite with LocalSparkContext {

  test("Pairwise BP test") {
    // test from the lectures EECS course 6.869, Bill Freeman and Antonio Torralba.
    // Chapter 7.3.5 Numerical example.
    withSpark { sc =>
      val vertices: RDD[(Long, PVertex)] = sc.parallelize(Seq(
        (1L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))),
        (2L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))),
        (3L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))),
        (4L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 0.0).map(math.log))))))
      val edges = sc.parallelize(Seq(
        Edge(1L, 2L, PEdge(Factor(Array(2, 2), Array(1.0, 0.9, 0.9, 1.0).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0)))),
        Edge(2L, 3L, PEdge(Factor(Array(2, 2), Array(0.1, 1.0, 1.0, 0.1).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0)))),
        Edge(2L, 4L, PEdge(Factor(Array(2, 2), Array(1.0, 0.1, 0.1, 1.0).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0))))
      ))
      val graph = Graph(vertices, edges)
      val bpGraph = PairwiseBP(graph)
      val trueProbabilities = Seq(
        1L -> (1.0 / 2.09 * 1.09, 1.0 / 2.09 * 1.0),
        2L -> (1.0 / 1.1 * 1.0, 1.0 / 1.1 * 0.1),
        3L -> (1.0 / 1.21 * 0.2, 1.0 / 1.21 * 1.01),
        4L -> (1.0, 0.0)).sortBy { case (vid, _) => vid }
      val calculatedProbabilities = bpGraph.vertices.collect().sortBy { case (vid, _) => vid }
      val eps = 10e-5
      calculatedProbabilities.zip(trueProbabilities).foreach {
        case ((_, vertex), (_, (trueP0, trueP1))) =>
          assert(trueP0 - vertex.belief.exp().cloneValues(0) < eps && trueP1 - vertex.belief.exp().cloneValues(1) < eps)
      }
    }
  }

  test("Pariwise BP test with file") {
    withSpark { sc =>
      val graph = PairwiseBP.loadPairwiseGraph(sc, "data/vertex4.txt", "data/edge4.txt")
      val bpGraph = PairwiseBP(graph)
      val trueProbabilities = Seq(
        1L -> (1.0 / 2.09 * 1.09, 1.0 / 2.09 * 1.0),
        2L -> (1.0 / 1.1 * 1.0, 1.0 / 1.1 * 0.1),
        3L -> (1.0 / 1.21 * 0.2, 1.0 / 1.21 * 1.01),
        4L -> (1.0, 0.0)).sortBy { case (vid, _) => vid }
      val calculatedProbabilities = bpGraph.vertices.collect().sortBy { case (vid, _) => vid }
      val eps = 10e-5
      calculatedProbabilities.zip(trueProbabilities).foreach {
        case ((_, vertex), (_, (trueP0, trueP1))) =>
          assert(trueP0 - vertex.belief.exp().cloneValues(0) < eps && trueP1 - vertex.belief.exp().cloneValues(1) < eps)
      }
    }
  }
}
Example 10
Source File: EdgeProviders.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.loaders.csv.providers

import ml.sparkling.graph.loaders.csv.types.CSVTypes.EdgeAttributeExtractor
import ml.sparkling.graph.loaders.csv.types.Types.ToVertexId
import ml.sparkling.graph.loaders.csv.types.{CSVTypes, Types}
import ml.sparkling.graph.loaders.csv.utils.DefaultTransformers
import ml.sparkling.graph.loaders.csv.utils.DefaultTransformers.{defaultEdgeAttribute, numberToVertexId}
import org.apache.spark.graphx.Edge
import org.apache.spark.sql.Row

import scala.reflect.ClassTag

object EdgeProviders {
  type TwoColumnsMakeEdgeProvider[VD, ED] = (Int, Int, Row, ToVertexId[VD], EdgeAttributeExtractor[ED]) => Seq[Edge[ED]]

  def twoColumnsMakesEdge[VD: ClassTag, ED: ClassTag](id1: Int, id2: Int, row: Row,
                                                      columnToId: ToVertexId[VD],
                                                      edgeAttributeProvider: EdgeAttributeExtractor[ED]): Seq[Edge[ED]] = {
    Seq(Edge(columnToId(row.getAs(id1)), columnToId(row.getAs(id2)), edgeAttributeProvider(row)))
  }

  def twoColumnsMakesEdge[VD: ClassTag](id1: Int, id2: Int, row: Row): Seq[Edge[Double]] = {
    twoColumnsMakesEdge(id1, id2, row, numberToVertexId _, defaultEdgeAttribute _)
  }
}
Example 11
Source File: GraphProviders.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.loaders.csv.providers

import ml.sparkling.graph.loaders.csv.types.Types
import ml.sparkling.graph.loaders.csv.types.Types.ToVertexId
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SparkSession

import scala.reflect.ClassTag

object GraphProviders {
  val defaultStorageLevel = StorageLevel.MEMORY_ONLY

  def simpleGraphBuilder[VD: ClassTag, ED: ClassTag](defaultVertex: Option[VD] = None,
                                                     vertexProvider: Row => Seq[(VertexId, VD)],
                                                     edgeProvider: Row => Seq[Edge[ED]],
                                                     edgeStorageLevel: StorageLevel = defaultStorageLevel,
                                                     vertexStorageLevel: StorageLevel = defaultStorageLevel)
                                                    (dataFrame: DataFrame): Graph[VD, ED] = {

    def mapRows[MT: ClassTag](mappingFunction: (Row) => Seq[MT]): RDD[MT] = {
      dataFrame.rdd.mapPartitionsWithIndex((id, rowIterator) => {
        rowIterator.flatMap { case row => mappingFunction(row) }
      })
    }

    val vertices: RDD[(VertexId, VD)] = mapRows(vertexProvider)
    val edges: RDD[Edge[ED]] = mapRows(edgeProvider)

    defaultVertex match {
      case None => Graph(vertices, edges, edgeStorageLevel = edgeStorageLevel, vertexStorageLevel = vertexStorageLevel)
      case Some(defaultVertexValue) => Graph(vertices, edges, defaultVertexValue, edgeStorageLevel, vertexStorageLevel)
    }
  }

  def indexedGraphBuilder[VD: ClassTag, ED: ClassTag](defaultVertex: Option[VD] = None,
                                                      vertexProvider: (Row, ToVertexId[VD]) => Seq[(VertexId, VD)],
                                                      edgeProvider: (Row, ToVertexId[VD]) => Seq[Edge[ED]],
                                                      columnsToIndex: Seq[Int],
                                                      edgeStorageLevel: StorageLevel = defaultStorageLevel,
                                                      vertexStorageLevel: StorageLevel = defaultStorageLevel)
                                                     (dataFrame: DataFrame): Graph[VD, ED] = {
    val index = dataFrame.rdd.flatMap(row => columnsToIndex.map(row(_))).distinct().zipWithUniqueId().collect().toMap

    def extractIdFromIndex(vertex: VD) = index(vertex)

    simpleGraphBuilder(defaultVertex,
      vertexProvider(_: Row, extractIdFromIndex _),
      edgeProvider(_: Row, extractIdFromIndex _),
      edgeStorageLevel,
      vertexStorageLevel)(dataFrame)
  }
}
Example 12
Source File: GraphMLLoader.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.loaders.graphml

import com.databricks.spark.xml._
import ml.sparkling.graph.loaders.graphml.GraphMLFormat._
import ml.sparkling.graph.loaders.graphml.GraphMLTypes.TypeHandler
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext, SparkSession}

import scala.collection.mutable
import scala.util.Try

  def loadGraphFromML(path: String)(implicit sc: SparkContext): Graph[ValuesMap, ValuesMap] = {
    val sparkSession = SparkSession.builder().getOrCreate()

    val graphDataFrame = sparkSession.sqlContext.read
      .format("com.databricks.spark.xml")
      .option("attributePrefix", "@")
      .option("valueTag", "#VALUE")
      .option("rowTag", graphTag).load(path).rdd

    val keys = sparkSession.sqlContext.read
      .format("com.databricks.spark.xml")
      .option("attributePrefix", "@")
      .option("valueTag", "#VALUE")
      .option("rowTag", graphMLTag).load(path).rdd
      .flatMap(r => Try(r.getAs[mutable.WrappedArray[Row]](keyTag).toArray).getOrElse(Array.empty))

    val nodesKeys = keys
      .filter(r => r.getAs[String](forAttribute) == nodeTag)
    val edgeKeys = keys
      .filter(r => r.getAs[String](forAttribute) == edgeTag)

    val nodeAttrHandlers = createAttrHandlersFor(nodesKeys)
    val edgeAttrHandlers = createAttrHandlersFor(edgeKeys)

    val verticesWithData = graphDataFrame.flatMap(r => r.getAs[Any](nodeTag) match {
      case data: mutable.WrappedArray[Row@unchecked] => data.array
      case data: Row => Array(data)
    })

    val verticesIndex = verticesWithData.map(r => r.getAs[String](idAttribute)).zipWithUniqueId().collect().toMap

    val vertices: RDD[(VertexId, Map[String, Any])] = verticesWithData
      .map(r => (verticesIndex(r.getAs[String](idAttribute)), extractAttributesMap(nodeAttrHandlers, r)))

    val edgesRows = graphDataFrame.flatMap(r => r.getAs[Any](edgeTag) match {
      case data: mutable.WrappedArray[Row@unchecked] => data.array
      case data: Row => Array(data)
    }).map(r => Edge(
      verticesIndex(r.getAs[String](sourceAttribute)),
      verticesIndex(r.getAs[String](targetAttribute)),
      extractAttributesMap(edgeAttrHandlers, r)
    ))

    Graph(vertices, edgesRows)
  }

  def extractAttributesMap(attrHandlers: Map[String, GraphMLAttribute], r: Row): Map[String, Any] = {
    Try(r.getAs[mutable.WrappedArray[Row]](dataTag)).toOption.map(
      _.map(r => {
        val attribute = attrHandlers(r.getAs[String](keyAttribute))
        (attribute.name, attribute.handler(r.getAs[String](tagValue)))
      }).toMap
    ).getOrElse(Map.empty) + ("id" -> r.getAs[String](idAttribute))
  }

  def createAttrHandlersFor(keys: RDD[Row]): Map[String, GraphMLAttribute] = {
    keys
      .map(r => (r.getAs[String](idAttribute), GraphMLAttribute(r.getAs[String](nameAttribute), GraphMLTypes(r.getAs[String](typeAttribute)))))
      .collect().toMap
  }
}
Example 13
Source File: AffinityPropagationSuite.scala From SparkAffinityPropagation with MIT License | 5 votes |
package org.viirya.spark.ml

import scala.collection.mutable

import org.scalatest.{BeforeAndAfterAll, FunSuite, Suite}

import org.viirya.spark.ml.AffinityPropagation._

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}

class AffinityPropagationSuite extends FunSuite with BeforeAndAfterAll { self: Suite =>
  @transient var sc: SparkContext = _

  override def beforeAll() {
    super.beforeAll()
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("AffinityPropagationUnitTest")
    sc = new SparkContext(conf)
  }

  override def afterAll() {
    try {
      if (sc != null) {
        sc.stop()
      }
      sc = null
    } finally {
      super.afterAll()
    }
  }

  test("affinity propagation") {
    val similarities = Seq[(Long, Long, Double)](
      (0, 1, 1.0), (1, 0, 1.0), (0, 2, 1.0), (2, 0, 1.0), (0, 3, 1.0),
      (3, 0, 1.0), (1, 2, 1.0), (2, 1, 1.0), (2, 3, 1.0), (3, 2, 1.0))
    val expected = Array(
      Array(0.0, 1.0/3.0, 1.0/3.0, 1.0/3.0),
      Array(1.0/2.0, 0.0, 1.0/2.0, 0.0),
      Array(1.0/3.0, 1.0/3.0, 0.0, 1.0/3.0),
      Array(1.0/2.0, 0.0, 1.0/2.0, 0.0))
    val s = constructGraph(sc.parallelize(similarities, 2), true, false)
    s.edges.collect().foreach { case Edge(i, j, x) =>
      assert(math.abs(x.similarity - expected(i.toInt)(j.toInt)) < 1e-14)
    }
  }
}
Example 14
Source File: ZombieExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.graph

import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, _}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object ZombieExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {
    val vertexJsonFile = args(0)
    val edgeJsonFile = args(1)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex]
    val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge]

    val vectorRdd: RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => {
      (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive))
    })

    val edgeRdd = edgeDs.rdd.map(r => {
      new Edge[String](r.src, r.dst, r.edge_type)
    })

    val defaultUser = new ZombieStats(false, 0)

    val graph = Graph(vectorRdd, edgeRdd, defaultUser)

    val zombieResults = graph.pregel[Long](0, 30, EdgeDirection.Either)(
      (vertexId, zombieState, message) => {
        if (message > 0 && !zombieState.isZombie) {
          new ZombieStats(true, message)
        } else {
          zombieState
        }
      }, triplet => {
        if (triplet.srcAttr.isZombie && !triplet.dstAttr.isZombie) {
          Iterator((triplet.dstId, triplet.srcAttr.lengthOfLife + 1l))
        } else if (triplet.dstAttr.isZombie && !triplet.srcAttr.isZombie) {
          Iterator((triplet.srcId, triplet.dstAttr.lengthOfLife + 1l))
        } else {
          Iterator.empty
        }
      }, (a, b) => Math.min(a, b))

    println("ZombieBite")
    zombieResults.vertices.collect().sortBy(r => r._1).foreach(r => {
      println("vertexId:" + r._1 + ",ZobmieStat:" + r._2)
    })

    sparkSession.stop()
  }
}

case class ZombieStats(isZombie: Boolean, lengthOfLife: Long)
Example 15
Source File: L10-9Graph.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.Graph.graphToGraphOps
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object UserRankApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: UserRankApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    ssc.socketTextStream(hostname, port.toInt)
      .map(r => {
        implicit val formats = DefaultFormats
        parse(r)
      })
      .foreachRDD(rdd => {
        val edges = rdd.map(jvalue => {
          implicit val formats = DefaultFormats
          ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]])
        })
          .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0)))

        val vertices = rdd.map(jvalue => {
          implicit val formats = DefaultFormats
          ((jvalue \ "user_id").extract[String])
        })
          .map(r => (r.hashCode.toLong, r))

        val tolerance = 0.0001
        val graph = Graph(vertices, edges, "defaultUser")
          .subgraph(vpred = (id, idStr) => idStr != "defaultUser")
        val pr = graph.pageRank(tolerance).cache

        graph.outerJoinVertices(pr.vertices) {
          (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs)
        }.vertices.top(10) {
          Ordering.by(_._2._1)
        }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1)))
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 16
Source File: EdgeAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite

class EdgeAPI extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("Should use Edge API") {
    //given
    val users: RDD[(VertexId, (String))] =
      spark.parallelize(Array(
        (1L, "a"), (2L, "b"), (3L, "c"), (4L, "d")
      ))

    val relationships =
      spark.parallelize(Array(
        Edge(1L, 2L, "friend"),
        Edge(1L, 3L, "friend"),
        Edge(2L, 4L, "wife")
      ))

    val graph = Graph(users, relationships)

    //when
    val res = graph.mapEdges(e => e.attr.toUpperCase)

    println(res.edges.collect().toList)
  }
}
Example 17
Source File: VertexAPI.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_7

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite

class VertexAPI extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("Should use Vertex API") {
    //given
    val users: RDD[(VertexId, (String))] =
      spark.parallelize(Array(
        (1L, "a"), (2L, "b"), (3L, "c"), (4L, "d")
      ))

    val relationships =
      spark.parallelize(Array(
        Edge(1L, 2L, "friend"),
        Edge(1L, 3L, "friend"),
        Edge(2L, 4L, "wife")
      ))

    val graph = Graph(users, relationships)

    //when
    val res = graph.mapVertices((_, att) => att.toUpperCase())

    res.vertices.collect().toList
  }
}
Example 18
Source File: EmployeeRelationship.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.graphx

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.graphx.{Edge, Graph}

object EmployeeRelationship {
  def main(args: Array[String]): Unit = {
    // vertex format: vertex_id, data
    val vertexArray = Array(
      (1L, ("John", "Software Developer")),
      (2L, ("Robert", "Technical Leader")),
      (3L, ("Charlie", "Software Architect")),
      (4L, ("David", "Software Developer")),
      (5L, ("Edward", "Software Development Manager")),
      (6L, ("Francesca", "Software Development Manager")))

    // edge format: from_vertex_id, to_vertex_id, data
    val edgeArray = Array(
      Edge(2L, 1L, "Technical Mentor"),
      Edge(2L, 4L, "Technical Mentor"),
      Edge(3L, 2L, "Collaborator"),
      Edge(6L, 3L, "Team Member"),
      Edge(4L, 1L, "Peers"),
      Edge(5L, 2L, "Team Member"),
      Edge(5L, 3L, "Team Member"),
      Edge(5L, 6L, "Peers"))

    val sc = new SparkContext(new SparkConf().setAppName("EmployeeRelationshipJob"))

    val vertexRDD: RDD[(Long, (String, String))] = sc.parallelize(vertexArray)

    val edgeRDD: RDD[Edge[String]] = sc.parallelize(edgeArray)

    val graph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD)

    // Vanilla query
    println(">>> Showing the names of people who are Software Developers")
    graph.vertices.filter { case (id, (name, designation)) => designation.equals("Software Developer") }
      .collect()
      .foreach { case (id, (name, designation)) => println(s"... Name: $name, Designation: $designation") }

    // Connection analysis
    println(">>> People connected to Robert (Technical Leader) -> ")
    graph.triplets.filter(_.srcId == 2).collect()
      .foreach { item => println("... " + item.dstAttr._1 + ", " + item.dstAttr._2) }

    println(">>> Robert (Technical Leader) connected to -> ")
    graph.triplets.filter(_.dstId == 2).collect()
      .foreach { item => println("... " + item.srcAttr._1 + ", " + item.srcAttr._2) }

    println(">>> Technical Mentoring Analysis -> ")
    graph.triplets.filter(_.attr.equals("Technical Mentor")).collect()
      .foreach { item => println("... " + item.srcAttr._1 + " mentoring " + item.dstAttr._1) }
  }
}
Example 19
Source File: LocalRunner.scala From spark-betweenness with Apache License 2.0 | 5 votes |
package com.centrality.kBC

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD

object MainRunner {
  def main(args: Array[String]) {
    // Create spark context
    val appName = "kBC"
    val sparkMode = "local"
    val conf = new SparkConf().setAppName(appName).setMaster(sparkMode)
    val sc = new SparkContext(conf)

    // Create sample graph
    //
    // Create an RDD for vertices
    val users: RDD[(VertexId, (String, String))] =
      sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
        (5L, ("franklin", "prof")), (2L, ("istoica", "prof"))))

    // Create an RDD for edges
    val relationships: RDD[Edge[String]] =
      sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"),
        Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi")))

    // Define a default user in case there are relationship with missing user
    val defaultUser = ("John Doe", "Missing")

    // Build the initial Graph
    val graph = Graph(users, relationships, defaultUser)

    val kBCGraph = KBetweenness.run(graph, 3)
  }
}
Example 20
Source File: GraphFramesExample.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
//import org.graphframes._

object GraphFramesExample extends App {

  val conf = new SparkConf()
    .setAppName("RDD graph")
    .setMaster("local[4]")
  val sc = new SparkContext(conf)

  val vertices: RDD[(VertexId, String)] = sc.parallelize(
    Array((1L, "Anne"),
      (2L, "Bernie"),
      (3L, "Chris"),
      (4L, "Don"),
      (5L, "Edgar")))

  val edges: RDD[Edge[String]] = sc.parallelize(
    Array(Edge(1L, 2L, "likes"),
      Edge(2L, 3L, "trusts"),
      Edge(3L, 4L, "believes"),
      Edge(4L, 5L, "worships"),
      Edge(1L, 3L, "loves"),
      Edge(4L, 1L, "dislikes")))

  val friendGraph: Graph[String, String] = Graph(vertices, edges)

  // val friendGraphFrame = GraphFrame.fromGraphX(friendGraph)
  //
  // friendGraphFrame.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3)").filter(
  //   "e1.attr = 'trusts' OR v3.attr = 'Chris'"
  // ).collect.foreach(println)
}
Example 21
Source File: CCGraphXDriver.scala From connected-component with MIT License | 5 votes |
package com.kwartile.lib.cc

import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.{SparkConf, SparkContext}

import scala.annotation.tailrec

object CCGraphXDriver {

  @tailrec
  private def buildEdges(node: Long, neighbors: List[Long], partialPairs: List[Edge[Int]]): List[Edge[Int]] = {
    if (neighbors.length == 0) {
      if (partialPairs != null) List(Edge(node, node, 1)) ::: partialPairs
      else List(Edge(node, node, 1))
    } else if (neighbors.length == 1) {
      val neighbor = neighbors(0)
      if (node > neighbor)
        if (partialPairs != null) List(Edge(node, neighbor, 1)) ::: partialPairs
        else List(Edge(node, neighbor, 1))
      else
        if (partialPairs != null) List(Edge(neighbor, node, 1)) ::: partialPairs
        else List(Edge(neighbor, node, 1))
    } else {
      val newPartialPairs = neighbors.map(neighbor => {
        if (node > neighbor) List(Edge(node, neighbor, 1))
        else List(Edge(neighbor, node, 1))
      }).flatMap(x => x)

      if (partialPairs != null) buildEdges(neighbors.head, neighbors.tail, newPartialPairs ::: partialPairs)
      else buildEdges(neighbors.head, neighbors.tail, newPartialPairs)
    }
  }

  private def buildEdges(nodes: List[Long]): List[Edge[Int]] = {
    buildEdges(nodes.head, nodes.tail, null.asInstanceOf[List[Edge[Int]]])
  }

  def main(args: Array[String]) = {
    val sparkConf = new SparkConf().setAppName("GraphXConnectedComponent")
    val sc = new SparkContext(sparkConf)

    val cliqueFile = args(0)
    val cliquesRec = sc.textFile(args(0))
    val cliques = cliquesRec.map(x => {
      val nodes = x.split("\\s+").map(y => y.toLong).toList
      nodes
    })

    val edges = cliques.map(aClique => {
      buildEdges(aClique)
    }).flatMap(x => x)

    val graph = Graph.fromEdges(edges, 1)
    val cc = graph.connectedComponents().vertices
    println("Count of Connected component: " + cc.count)
  }
}
Example 22
Source File: InputDataFlow.scala From spark-graphx with GNU General Public License v3.0 | 5 votes |
package com.github.graphx.pregel.social

import org.apache.spark.graphx.{Edge, VertexId}

import scala.collection.mutable.ListBuffer

object InputDataFlow {

  def parseNames(line: String): Option[(VertexId, String)] = {
    val fields = line.split('\t')
    if (fields.length > 1)
      Some(fields(0).trim().toLong, fields(1))
    else None
  }

  def makeEdges(line: String): List[Edge[Int]] = {
    var edges = new ListBuffer[Edge[Int]]()
    val fields = line.split(" ")
    val origin = fields(0)
    (1 until fields.length)
      .foreach { p => edges += Edge(origin.toLong, fields(p).toLong, 0) }
    edges.toList
  }
}
Example 23
Source File: AbstractPipeClusteringGraph.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.clustering

import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.similarity.aggregator.Mean
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

abstract class AbstractPipeClusteringGraph
  extends PipeElement[RDD[(SymPair[Tuple], Array[Double])], RDD[Set[Tuple]]]
  with Serializable {

  def cluster(graph: Graph[Tuple, Double]): RDD[Set[Tuple]]

  def step(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): RDD[Set[Tuple]] = {

    val duplicatePairsWithSimilarity = input.map(
      pair => (pair._1, Mean.agrSimilarity(pair._2))
    )

    val edges: RDD[Edge[Double]] = duplicatePairsWithSimilarity.map(
      pair => { Edge(pair._1._1.id, pair._1._2.id, pair._2) }
    )

    // TODO optimize: it would be nice to build the graph only by using edge triplets
    // but as far as I know that's not possible
    val verticesNotUnique: RDD[(VertexId, Tuple)] = duplicatePairsWithSimilarity.map(_._1).flatMap(
      tuplePair => Seq(tuplePair._1, tuplePair._2)
    ).map(tuple => (tuple.id, tuple))

    // delete all duplicate vertices
    val vertices = verticesNotUnique.distinct()

    // The edge type Boolean is just a workaround because no edge types are needed
    val graph: Graph[Tuple, Double] = Graph.apply(vertices, edges, null)

    cluster(graph)
  }
}