org.apache.spark.graphx.GraphLoader Scala Examples
The following examples show how to use org.apache.spark.graphx.GraphLoader.
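All of the examples below revolve around GraphLoader.edgeListFile, which parses a whitespace-separated edge list into a Graph[Int, Int]. As a quick orientation, here is a minimal sketch of that call; the object name, the local master setting, and the path "edges.txt" are placeholders for illustration, not taken from any of the projects below.

import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy}
import org.apache.spark.{SparkConf, SparkContext}

object GraphLoaderSketch extends App {
  // Local SparkContext purely for illustration.
  val sc = new SparkContext(
    new SparkConf().setAppName("GraphLoaderSketch").setMaster("local[*]"))

  // Each input line is "srcId dstId"; canonicalOrientation = true reorients
  // every edge so that srcId < dstId, which the triangle-count routines expect.
  val graph: Graph[Int, Int] = GraphLoader
    .edgeListFile(sc, "edges.txt", canonicalOrientation = true)
    .partitionBy(PartitionStrategy.RandomVertexCut)

  println(s"vertices: ${graph.numVertices}, edges: ${graph.numEdges}")
  sc.stop()
}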
Example 1
Source File: GraphGeneration.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.lib.TriangleCount
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object GraphGeneration extends App {

  val conf = new SparkConf()
    .setAppName("Graph generation")
    .setMaster("local[4]")
  val sc = new SparkContext(conf)

  val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt")

  val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map { line =>
    val field = line.split(" ")
    (field(0).toLong, field(1).toLong)
  }
  val edgeTupleGraph = Graph.fromEdgeTuples(
    rawEdges = rawEdges, defaultValue = "")

  val gridGraph = GraphGenerators.gridGraph(sc, 5, 5)
  val starGraph = GraphGenerators.starGraph(sc, 11)
  val logNormalGraph = GraphGenerators.logNormalGraph(
    sc, numVertices = 20, mu = 1, sigma = 3
  )
  logNormalGraph.outDegrees.map(_._2).collect().sorted

  val actorGraph = GraphLoader.edgeListFile(
    sc, "./ca-hollywood-2009.txt", true
  ).partitionBy(PartitionStrategy.RandomVertexCut)
  actorGraph.edges.count()

  val actorComponents = actorGraph.connectedComponents().cache
  actorComponents.vertices.map(_._2).distinct().count
  val clusterSizes = actorComponents.vertices.map(
    v => (v._2, 1)).reduceByKey(_ + _)
  clusterSizes.map(_._2).max
  clusterSizes.map(_._2).min

  val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt")
  val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5)
  strongComponents.vertices.map(_._2).distinct().count

  val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges()
  val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut)

  actorGraph.triangleCount()
  val triangles = TriangleCount.runPreCanonicalized(partitionedGraph)

  actorGraph.staticPageRank(10)
  val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001)
  actorPrGraph.vertices.reduce((v1, v2) => {
    if (v1._2 > v2._2) v1 else v2
  })

  actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println)
  actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10)
  actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count
}
Example 2
Source File: LoadGraph.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.graphx

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.graphx.{GraphLoader, PartitionStrategy}

class LoadGraph extends ConfigurableStop {

  val authorEmail: String = "[email protected]"
  val description: String = "Load data and construct a graphx"
  val inportList: List[String] = List(Port.DefaultPort)

  var edgePort: String = "edges"
  var vertexPort: String = "vertex"
  val outportList: List[String] = List(edgePort, vertexPort)

  var dataPath: String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val sc = spark.sparkContext
    import spark.sqlContext.implicits._

    var graph = GraphLoader
      .edgeListFile(sc, dataPath, true)
      .partitionBy(PartitionStrategy.RandomVertexCut)

    // TODO: can not transfer EdgeRdd to Dataset
    out.write(edgePort, graph.edges.toDF())
    out.write(vertexPort, graph.vertices.toDF())
  }

  def initialize(ctx: ProcessContext): Unit = {
  }

  def setProperties(map: Map[String, Any]): Unit = {
    dataPath = MapUtil.get(map, "dataPath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val dataPath = new PropertyDescriptor()
      .name("dataPath")
      .displayName("Data_Path")
      .defaultValue("")
      .allowableValues(Set(""))
      .required(true)
      .example("hdfs://192.168.3.138:8020/work/test/test.csv")
    descriptor = dataPath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/graphx/LoadGraph.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.GraphX.toString)
  }
}
Example 3
Source File: kBCDriver.scala From spark-betweenness with Apache License 2.0
package com.centrality.kBC

import java.util.Calendar

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.GraphLoader
import org.apache.spark.graphx.PartitionStrategy

object kBCDriver {
  def main(args: Array[String]) {
    // Create spark context
    val appName = "kBCDriver"
    val conf = new SparkConf().setAppName(appName) //.setMaster(master)
    val sc = new SparkContext(conf)

    // Graph partition params
    val DEFAULT_K = 2
    val DEFAULT_EDGE_PARTITIONS = 60
    val DEFAULT_CANONICAL_ORIENTATION = true
    val k = args(0).toInt
    println("k : " + k)
    val canonicalOrientation = DEFAULT_CANONICAL_ORIENTATION
    val numEdgePartitions = args(1).toInt

    // Input params
    val DEFAULT_INPUT_DIR = "/tmp/input/"
    val DEFAULT_INPUT_FILE_NAME = "edge_list.txt"
    val inputDir = args(2)
    val inputFileName = args(4)
    val inputPath = inputDir + inputFileName
    println("inputPath : " + inputPath)

    // Output params
    val DEFAULT_OUTPUT_DIR = "/tmp/output/"
    val DEFAULT_V_OUTPUT_FILE = List(inputFileName, "kbc", k, "vertices").mkString("_") + ".txt"
    val DEFAULT_E_OUTPUT_FILE = List(inputFileName, "kbc", k, "edges").mkString("_") + ".txt"
    val outputDir = args(3)
    val outputVerticesFileName = sc.hadoopConfiguration.get("outputVerticesFileName", DEFAULT_V_OUTPUT_FILE)
    val outputEdgesFileName = sc.hadoopConfiguration.get("outputEdgesFileName", DEFAULT_E_OUTPUT_FILE)
    val outputVerticesPath = sc.hadoopConfiguration.get("outputVerticesPath", outputDir + outputVerticesFileName)
    val outputEdgesPath = sc.hadoopConfiguration.get("outputEdgesPath", outputDir + outputEdgesFileName)
    println("outputVerticesPath : " + outputVerticesPath)
    println("outputEdgesPath : " + outputEdgesPath)

    // Read graph
    val graph = GraphLoader.edgeListFile(sc, inputPath, canonicalOrientation, numEdgePartitions)
      .partitionBy(PartitionStrategy.EdgePartition2D)
    println(Calendar.getInstance().getTime().toString + " vertices : " + graph.vertices.count())
    println(Calendar.getInstance().getTime().toString + " edges : " + graph.edges.count())

    // Run kBC
    println(Calendar.getInstance().getTime().toString + ": start kBC")
    val kBCGraph = KBetweenness.run(graph, k)

    // Save graph to file
    println(Calendar.getInstance().getTime().toString + ": saving results ")
    kBCGraph.vertices.coalesce(1).saveAsTextFile(outputVerticesPath)
    kBCGraph.edges.coalesce(1).saveAsTextFile(outputEdgesPath)
  }
}
Example 4
Source File: PageRankTest.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License
package com.tomekl007.chapter_7

import org.apache.spark.graphx.GraphLoader
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite
import org.scalatest.Matchers._

class PageRankTest extends FunSuite {
  private val sc = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should calculate page rank using GraphX API") {
    //given
    val graph = GraphLoader.edgeListFile(sc, getClass.getResource("/pagerank/followers.txt").getPath)
    val ranks = graph.pageRank(0.0001).vertices

    val users = sc.textFile(getClass.getResource("/pagerank/users.txt").getPath).map { line =>
      val fields = line.split(",")
      (fields(0).toLong, fields(1))
    }

    //when
    val rankByUsername = users.join(ranks).map {
      case (_, (username, rank)) => (username, rank)
    }.sortBy((t) => t._2, ascending = false)
      .collect()
      .toList

    println(rankByUsername)

    //then
    rankByUsername.map(_._1) should contain theSameElementsInOrderAs List(
      "BarackObama",
      "ladygaga",
      "odersky",
      "jeresig",
      "matei_zaharia",
      "justinbieber"
    )
  }
}
Example 5
Source File: GraphDataGenTest.scala From spark-bench with Apache License 2.0
package com.ibm.sparktc.sparkbench.datageneration

import java.io.File

import com.ibm.sparktc.sparkbench.testfixtures.{BuildAndTeardownData, SparkSessionProvider}
import com.ibm.sparktc.sparkbench.utils.SparkBenchException
import org.apache.spark.graphx.GraphLoader
import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers}

class GraphDataGenTest extends FlatSpec with Matchers with BeforeAndAfterEach {
  val cool = new BuildAndTeardownData("graph-data-gen")

  val fileName = s"${cool.sparkBenchTestFolder}/${java.util.UUID.randomUUID.toString}.txt"

  var file: File = _

  override def beforeEach() {
    cool.createFolders()
    file = new File(fileName)
  }

  override def afterEach() {
    cool.deleteFolders()
  }

  "GraphDataGeneration" should "generate data correctly with all default options" in {
    val m = Map(
      "name" -> "graph-data-generator",
      "vertices" -> 100,
      "output" -> fileName
    )
    val generator = GraphDataGen(m)
    generator.doWorkload(spark = SparkSessionProvider.spark)
    val res = GraphLoader.edgeListFile(SparkSessionProvider.spark.sparkContext, fileName)

    res.vertices.count() shouldBe m("vertices")
  }

  it should "throw an error for any output format but .txt" in {
    val m1 = Map(
      "name" -> "graph-data-generator",
      "vertices" -> 100,
      "output" -> "/my-cool-file.csv"
    )
    val m2 = Map(
      "name" -> "graph-data-generator",
      "vertices" -> 100,
      "output" -> "/my-cool-file.parquet"
    )
    val m3 = Map(
      "name" -> "graph-data-generator",
      "vertices" -> 100,
      "output" -> "/my-cool-file.tsv"
    )

    a [SparkBenchException] should be thrownBy GraphDataGen(m1)
    a [SparkBenchException] should be thrownBy GraphDataGen(m2)
    a [SparkBenchException] should be thrownBy GraphDataGen(m3)
  }

  it should "throw errors when required values are missing" in {
    // Missing vertices
    val m1 = Map(
      "name" -> "graph-data-generator",
      "output" -> "/my-cool-file.csv"
    )
    // Missing output file name
    val m2 = Map(
      "name" -> "graph-data-generator",
      "vertices" -> 100
    )
    a [SparkBenchException] should be thrownBy GraphDataGen(m1)
    a [SparkBenchException] should be thrownBy GraphDataGen(m2)
  }
}
Example 6
Source File: TriangleCountingExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.graphx

// $example on$
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{GraphLoader, PartitionStrategy}
// $example off$

object TriangleCountingExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TriangleCountingExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load the edges in canonical order and partition the graph for triangle count
    val graph = GraphLoader.edgeListFile(sc, "data/graphx/followers.txt", true)
      .partitionBy(PartitionStrategy.RandomVertexCut)
    // Find the triangle count for each vertex
    val triCounts = graph.triangleCount().vertices
    // Join the triangle counts with the usernames
    val users = sc.textFile("data/graphx/users.txt").map { line =>
      val fields = line.split(",")
      (fields(0).toLong, fields(1))
    }
    val triCountByUsername = users.join(triCounts).map { case (id, (username, tc)) =>
      (username, tc)
    }
    // Print the result
    println(triCountByUsername.collect().mkString("\n"))
    // $example off$
    sc.stop()
  }
}
// scalastyle:on println
Example 7
Source File: IterativeComputation$Test.scala From sparkling-graph with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.api.operators

import org.apache.spark.graphx.GraphLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfter, FlatSpec}

class IterativeComputation$Test extends FlatSpec with BeforeAndAfter {

  val master = "local[*]"

  def appName: String = "InterativeComputationTest"

  implicit val sc: SparkContext = {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)
    new SparkContext(conf)
  }

  after {
    if (!sc.isStopped) {
      sc.stop()
    }
  }

  def loadGraph(file: String) = {
    GraphLoader.edgeListFile(sc, file.toString)
  }

  "Correct number of vertices " should "be returned" in {
    //Given("Graph")
    val graph = loadGraph(getClass.getResource("/graph").toString)
    //When("Taking size")
    val bucketSize: Long = IterativeComputation.wholeGraphBucket(graph)
    //Then("")
    assert(graph.numVertices == bucketSize)
  }
}
Example 8
Source File: MeasureTest.scala From sparkling-graph with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Graph, GraphLoader}
import org.scalatest._

abstract class MeasureTest(implicit sc: SparkContext) extends FlatSpec
  with BeforeAndAfterAll with GivenWhenThen with Matchers {

  def time[T](str: String)(thunk: => T): (T, Long) = {
    logger.info(s"$str...")
    val t1 = System.currentTimeMillis
    val x = thunk
    val t2 = System.currentTimeMillis
    val diff = t2 - t1
    logger.info(s"$diff ms")
    (x, diff)
  }

  val logger = Logger.getLogger(this.getClass)

  def loadGraph(file: String) = {
    val out: Graph[Int, Int] = GraphLoader.edgeListFile(sc, file.toString)
    out.vertices.setName(s"Graph vertices ${file}")
    out.edges.setName(s"Graph edges ${file}")
    out.triplets.setName(s"Graph triplets ${file}")
    out
  }
}