org.apache.spark.graphx.PartitionStrategy Scala Examples
The following examples show how to use org.apache.spark.graphx.PartitionStrategy.
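Before the full examples, here is a minimal sketch of the typical usage pattern: a PartitionStrategy value is passed to Graph.partitionBy to control how edges are distributed across partitions, and built-in strategies can also be looked up by name with PartitionStrategy.fromString. The app name and edge-list path below are placeholders, not taken from any of the examples.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{GraphLoader, PartitionStrategy}

object PartitionStrategyQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("PartitionStrategyQuickStart").setMaster("local[*]"))

    // Load an edge list and repartition its edges with a built-in strategy
    val graph = GraphLoader.edgeListFile(sc, "./edge_list.txt")
      .partitionBy(PartitionStrategy.EdgePartition2D)

    // Strategies can also be selected by name, e.g. when parsing CLI options
    val strategy = PartitionStrategy.fromString("RandomVertexCut")

    println(graph.edges.count())
    sc.stop()
  }
}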
Example 1
Source File: GraphGeneration.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.lib.TriangleCount
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object GraphGeneration extends App {

  val conf = new SparkConf()
    .setAppName("Graph generation")
    .setMaster("local[4]")

  val sc = new SparkContext(conf)

  val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt")

  val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map { line =>
    val field = line.split(" ")
    (field(0).toLong, field(1).toLong)
  }
  val edgeTupleGraph = Graph.fromEdgeTuples(
    rawEdges = rawEdges, defaultValue = "")

  val gridGraph = GraphGenerators.gridGraph(sc, 5, 5)
  val starGraph = GraphGenerators.starGraph(sc, 11)
  val logNormalGraph = GraphGenerators.logNormalGraph(
    sc, numVertices = 20, mu = 1, sigma = 3
  )
  logNormalGraph.outDegrees.map(_._2).collect().sorted

  val actorGraph = GraphLoader.edgeListFile(
    sc, "./ca-hollywood-2009.txt", true
  ).partitionBy(PartitionStrategy.RandomVertexCut)
  actorGraph.edges.count()

  val actorComponents = actorGraph.connectedComponents().cache
  actorComponents.vertices.map(_._2).distinct().count
  val clusterSizes = actorComponents.vertices.map(
    v => (v._2, 1)).reduceByKey(_ + _)
  clusterSizes.map(_._2).max
  clusterSizes.map(_._2).min

  val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt")
  val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5)
  strongComponents.vertices.map(_._2).distinct().count

  val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges()
  val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut)
  actorGraph.triangleCount()
  val triangles = TriangleCount.runPreCanonicalized(partitionedGraph)

  actorGraph.staticPageRank(10)
  val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001)
  actorPrGraph.vertices.reduce((v1, v2) => {
    if (v1._2 > v2._2) v1 else v2
  })

  actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println)

  actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10)

  actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count
}
Example 2
Source File: SynthBenchmark.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.graphx

import java.io.{FileOutputStream, PrintWriter}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{GraphXUtils, PartitionStrategy}
import org.apache.spark.graphx.util.GraphGenerators

object SynthBenchmark {

  def main(args: Array[String]) {
    val options = args.map { arg =>
      arg.dropWhile(_ == '-').split('=') match {
        case Array(opt, v) => (opt -> v)
        case _ => throw new IllegalArgumentException("Invalid argument: " + arg)
      }
    }

    var app = "pagerank"
    var niter = 10
    var numVertices = 100000
    var numEPart: Option[Int] = None
    var partitionStrategy: Option[PartitionStrategy] = None
    var mu: Double = 4.0
    var sigma: Double = 1.3
    var degFile: String = ""
    var seed: Int = -1

    options.foreach {
      case ("app", v) => app = v
      case ("niters", v) => niter = v.toInt
      case ("nverts", v) => numVertices = v.toInt
      case ("numEPart", v) => numEPart = Some(v.toInt)
      case ("partStrategy", v) => partitionStrategy = Some(PartitionStrategy.fromString(v))
      case ("mu", v) => mu = v.toDouble
      case ("sigma", v) => sigma = v.toDouble
      case ("degFile", v) => degFile = v
      case ("seed", v) => seed = v.toInt
      case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt)
    }

    val conf = new SparkConf()
      .setAppName(s"GraphX Synth Benchmark (nverts = $numVertices, app = $app)")
    GraphXUtils.registerKryoClasses(conf)

    val sc = new SparkContext(conf)

    // Create the graph
    println(s"Creating graph...")
    val unpartitionedGraph = GraphGenerators.logNormalGraph(sc, numVertices,
      numEPart.getOrElse(sc.defaultParallelism), mu, sigma, seed)
    // Repartition the graph
    val graph = partitionStrategy.foldLeft(unpartitionedGraph)(_.partitionBy(_)).cache()

    var startTime = System.currentTimeMillis()
    val numEdges = graph.edges.count()
    println(s"Done creating graph. Num Vertices = $numVertices, Num Edges = $numEdges")
    val loadTime = System.currentTimeMillis() - startTime

    // Collect the degree distribution (if desired)
    if (!degFile.isEmpty) {
      val fos = new FileOutputStream(degFile)
      val pos = new PrintWriter(fos)
      val hist = graph.vertices.leftJoin(graph.degrees)((id, _, optDeg) => optDeg.getOrElse(0))
        .map(p => p._2).countByValue()
      hist.foreach {
        case (deg, count) => pos.println(s"$deg \t $count")
      }
    }

    // Run PageRank
    startTime = System.currentTimeMillis()
    if (app == "pagerank") {
      println("Running PageRank")
      val totalPR = graph.staticPageRank(niter).vertices.map(_._2).sum()
      println(s"Total PageRank = $totalPR")
    } else if (app == "cc") {
      println("Running Connected Components")
      val numComponents = graph.connectedComponents.vertices.map(_._2).distinct().count()
      println(s"Number of components = $numComponents")
    }
    val runTime = System.currentTimeMillis() - startTime

    println(s"Num Vertices = $numVertices")
    println(s"Num Edges = $numEdges")
    println(s"Creation time = ${loadTime / 1000.0} seconds")
    println(s"Run time = ${runTime / 1000.0} seconds")

    sc.stop()
  }
}
// scalastyle:on println
Example 3
Source File: LoadGraph.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.graphx

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.graphx.{GraphLoader, PartitionStrategy}

class LoadGraph extends ConfigurableStop {

  val authorEmail: String = "[email protected]"
  val description: String = "Load data and construct a graphx"
  val inportList: List[String] = List(Port.DefaultPort)
  var edgePort: String = "edges"
  var vertexPort: String = "vertex"
  val outportList: List[String] = List(edgePort, vertexPort)

  var dataPath: String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val sc = spark.sparkContext
    import spark.sqlContext.implicits._

    var graph = GraphLoader
      .edgeListFile(sc, dataPath, true)
      .partitionBy(PartitionStrategy.RandomVertexCut)

    // TODO: can not transfer EdgeRdd to Dataset
    out.write(edgePort, graph.edges.toDF())
    out.write(vertexPort, graph.vertices.toDF())
  }

  def initialize(ctx: ProcessContext): Unit = {
  }

  def setProperties(map: Map[String, Any]): Unit = {
    dataPath = MapUtil.get(map, "dataPath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val dataPath = new PropertyDescriptor()
      .name("dataPath")
      .displayName("Data_Path")
      .defaultValue("")
      .allowableValues(Set(""))
      .required(true)
      .example("hdfs://192.168.3.138:8020/work/test/test.csv")
    descriptor = dataPath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/graphx/LoadGraph.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.GraphX.toString)
  }

}
Example 4
Source File: kBCDriver.scala From spark-betweenness with Apache License 2.0
package com.centrality.kBC

import java.util.Calendar

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.GraphLoader
import org.apache.spark.graphx.PartitionStrategy

object kBCDriver {
  def main(args: Array[String]) {
    // Create spark context
    val appName = "kBCDriver"
    val conf = new SparkConf().setAppName(appName) //.setMaster(master)
    val sc = new SparkContext(conf)

    // Graph partition params
    val DEFAULT_K = 2
    val DEFAULT_EDGE_PARTITIONS = 60
    val DEFAULT_CANONICAL_ORIENTATION = true
    val k = args(0).toInt
    println("k : " + k)
    val canonicalOrientation = DEFAULT_CANONICAL_ORIENTATION
    val numEdgePartitions = args(1).toInt

    // Input params
    val DEFAULT_INPUT_DIR = "/tmp/input/"
    val DEFAULT_INPUT_FILE_NAME = "edge_list.txt"
    val inputDir = args(2)
    val inputFileName = args(4)
    val inputPath = inputDir + inputFileName
    println("inputPath : " + inputPath)

    // Output params
    val DEFAULT_OUTPUT_DIR = "/tmp/output/"
    val DEFAULT_V_OUTPUT_FILE = List(inputFileName, "kbc", k, "vertices").mkString("_") + ".txt"
    val DEFAULT_E_OUTPUT_FILE = List(inputFileName, "kbc", k, "edges").mkString("_") + ".txt"
    val outputDir = args(3)
    val outputVerticesFileName = sc.hadoopConfiguration.get("outputVerticesFileName", DEFAULT_V_OUTPUT_FILE)
    val outputEdgesFileName = sc.hadoopConfiguration.get("outputEdgesFileName", DEFAULT_E_OUTPUT_FILE)
    val outputVerticesPath = sc.hadoopConfiguration.get("outputVerticesPath", outputDir + outputVerticesFileName)
    val outputEdgesPath = sc.hadoopConfiguration.get("outputEdgesPath", outputDir + outputEdgesFileName)
    println("outputVerticesPath : " + outputVerticesPath)
    println("outputEdgesPath : " + outputEdgesPath)

    // Read graph
    val graph = GraphLoader.edgeListFile(sc, inputPath, canonicalOrientation, numEdgePartitions)
      .partitionBy(PartitionStrategy.EdgePartition2D)
    println(Calendar.getInstance().getTime().toString + " vertices : " + graph.vertices.count())
    println(Calendar.getInstance().getTime().toString + " edges : " + graph.edges.count())

    // Run kBC
    println(Calendar.getInstance().getTime().toString + ": start kBC")
    val kBCGraph = KBetweenness.run(graph, k)

    // Save graph to file
    println(Calendar.getInstance().getTime().toString + ": saving results ")
    kBCGraph.vertices.coalesce(1).saveAsTextFile(outputVerticesPath)
    kBCGraph.edges.coalesce(1).saveAsTextFile(outputEdgesPath)
  }
}
Example 5
Source File: TriangleCountingExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.graphx

// $example on$
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{GraphLoader, PartitionStrategy}
// $example off$

object TriangleCountingExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TriangleCountingExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load the edges in canonical order and partition the graph for triangle count
    val graph = GraphLoader.edgeListFile(sc, "data/graphx/followers.txt", true)
      .partitionBy(PartitionStrategy.RandomVertexCut)
    // Find the triangle count for each vertex
    val triCounts = graph.triangleCount().vertices
    // Join the triangle counts with the usernames
    val users = sc.textFile("data/graphx/users.txt").map { line =>
      val fields = line.split(",")
      (fields(0).toLong, fields(1))
    }
    val triCountByUsername = users.join(triCounts).map { case (id, (username, tc)) =>
      (username, tc)
    }
    // Print the result
    println(triCountByUsername.collect().mkString("\n"))
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 6
Source File: CommunityBasedPartitioning.scala From sparkling-graph with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.partitioning

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.{CommunityDetectionAlgorithm, CommunityDetectionMethod, ComponentID}
import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.DefaultPartitionOperator
import org.apache.log4j.Logger
import org.apache.spark.{Partitioner, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, PartitionID, PartitionStrategy, VertexId}

import scala.reflect.ClassTag

object CommunityBasedPartitioning {
  @transient
  val logger = Logger.getLogger(CommunityBasedPartitioning.getClass())

  def partitionGraphBy[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED],
      communityDetectionMethod: CommunityDetectionMethod[VD, ED],
      numParts: Int = -1)(implicit sc: SparkContext): Graph[VD, ED] = {
    val numberOfPartitions = if (numParts == -1) sc.defaultParallelism else numParts
    val communities: Graph[ComponentID, ED] = communityDetectionMethod(graph)
    val numberOfCommunities = communities.vertices.values.countApproxDistinct()
    val (coarsedVertexMap, coarsedNumberOfPartitions) =
      ParallelPartitioningUtils.coarsePartitions(numberOfPartitions, numberOfCommunities, communities.vertices)
    val strategy = ByComponentIdPartitionStrategy(coarsedVertexMap, coarsedNumberOfPartitions, DefaultPartitionOperator)
    logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries and ${coarsedNumberOfPartitions} partitions")
    val out = graph.partitionBy(strategy, numberOfCommunities.toInt).cache()
    out.edges.foreachPartition((_) => {})
    out.vertices.foreachPartition((_) => {})
    out
  }

  def partitionGraphUsing[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED],
      communityDetectionMethod: CommunityDetectionAlgorithm,
      numParts: Int = -1)(implicit sc: SparkContext): Graph[VD, ED] = {
    partitionGraphBy(graph, communityDetectionMethod.detectCommunities[VD, ED](_), numParts)
  }

}
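The ByComponentIdPartitionStrategy used above is a custom strategy defined elsewhere in sparkling-graph. Independent of that project, a hand-rolled PartitionStrategy only has to implement getPartition; the sketch below is a hypothetical strategy that co-locates edges by a precomputed vertex-to-component map, and the class name and map are illustration only, not the sparkling-graph implementation.

import org.apache.spark.graphx.{PartitionID, PartitionStrategy, VertexId}

// Hypothetical custom strategy: route each edge by the component of its source vertex.
case class ComponentPartitionStrategy(vertexToComponent: Map[VertexId, Int]) extends PartitionStrategy {
  override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = {
    // Fall back to the source vertex id when it has no known component
    val component = vertexToComponent.getOrElse(src, src.toInt)
    math.abs(component) % numParts
  }
}

Such a strategy is passed to partitionBy the same way as the built-in ones, e.g. graph.partitionBy(ComponentPartitionStrategy(vertexToComponent), numParts).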
Example 7
Source File: FastUnfolding.scala From fastunfolding with Apache License 2.0
package com.soteradefense.dga.graphx.louvain

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{VertexId, PartitionStrategy, TripletFields, Graph}

import scala.reflect.ClassTag

class FastUnfolding(outputdir: String, minProgress: Int = 1, progressCounter: Int = 1) {

  var qValues = Array[(Int, Double)]()

  def saveLevel(sc: SparkContext, level: Int, q: Double, graph: Graph[MyVertexState, Long]) = {
    graph.vertices.saveAsTextFile(s"${outputdir}/level_${level}_vertices")
    graph.edges.saveAsTextFile(s"${outputdir}/level_${level}_edges")
    //graph.vertices.map( {case (id,v) => ""+id+","+v.internalWeight+","+v.community }).saveAsTextFile(outputdir+"/level_"+level+"_vertices")
    //graph.edges.mapValues({case e=>""+e.srcId+","+e.dstId+","+e.attr}).saveAsTextFile(outputdir+"/level_"+level+"_edges")
    qValues = qValues :+ ((level, q))
    println(s"qValue: $q")

    // overwrite the q values at each level
    sc.parallelize(qValues, 1).saveAsTextFile(s"${outputdir}/qvalues")
  }

  def run[VD: ClassTag](sc: SparkContext, graph: Graph[VD, Long]) = {
    val initialGraph = createGraph(graph)

    val graphWeight = initialGraph.vertices.map(
      vertex => {
        vertex._2.nodeWeight
      }
    ).reduce(_ + _)

    val broadcastGraphWeight = sc.broadcast(graphWeight)

    val initialModularity = initialGraph.vertices.map(
      vertex => {
        vertex._2.in / (2 * graphWeight) - vertex._2.tot * vertex._2.tot / (graphWeight * graphWeight)
      }
    ).reduce(_ + _)

    var level = -1
    var halt = false
    while (!halt) {
      level += 1
      println(s"Starting level ${level}")

      val (currentQ, currentGraph, passes) = runFastUnfolding(sc, initialGraph, minProgress, progressCounter)
    }
  }

  def runFastUnfolding(sc: SparkContext, graph: Graph[MyVertexState, Long], minProgress: Int, progressCounter: Int) = {
    val cachedGraph = graph.cache()
  }

  def createGraph[VD: ClassTag](graph: Graph[VD, Long]): Graph[MyVertexState, Long] = {
    val nodeWeights = graph.aggregateMessages[Long](
      cxt => {
        cxt.sendToSrc(cxt.attr)
        cxt.sendToDst(cxt.attr)
      },
      (a, b) => a + b,
      TripletFields.EdgeOnly
    )

    nodeWeights.foreach(result => println(s"nodeweight: ${result._1}, ${result._2}"))

    val louvainGraph = graph.outerJoinVertices(nodeWeights)((vid, data, weightOption) => {
      val weight = weightOption.getOrElse(0L)
      val state = new MyVertexState()
      state.community = vid
      state.changed = false
      state.tot = weight
      state.in = 0
      state.nodeWeight = weight
      state
    }).partitionBy(PartitionStrategy.EdgePartition2D)

    louvainGraph
  }
}
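The MyVertexState class referenced above is defined elsewhere in the fastunfolding project and is not shown here. A minimal sketch that would let the snippet compile is given below; it only declares the fields the example reads and writes, and the field types are assumptions, not the project's actual definition.

import org.apache.spark.graphx.VertexId

// Hypothetical sketch of the per-vertex Louvain state used above.
// Field types are assumed: community mirrors the vertex id, tot/in hold
// weighted degree sums, nodeWeight is the aggregated edge weight.
class MyVertexState extends Serializable {
  var community: VertexId = -1L
  var changed: Boolean = false
  var tot: Double = 0.0
  var in: Double = 0.0
  var nodeWeight: Long = 0L
}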