org.apache.spark.graphx.PartitionStrategy Scala Examples
The following examples show how to use org.apache.spark.graphx.PartitionStrategy.
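Before the full examples, here is a minimal sketch of the typical usage pattern: a PartitionStrategy value is passed to Graph.partitionBy to control how edges are distributed across partitions, and built-in strategies can also be looked up by name with PartitionStrategy.fromString. The app name and edge-list path below are placeholders, not taken from any of the examples.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{GraphLoader, PartitionStrategy}

object PartitionStrategyQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("PartitionStrategyQuickStart").setMaster("local[*]"))

    // Load an edge list and repartition its edges with a built-in strategy
    val graph = GraphLoader.edgeListFile(sc, "./edge_list.txt")
      .partitionBy(PartitionStrategy.EdgePartition2D)

    // Strategies can also be selected by name, e.g. when parsing CLI options
    val strategy = PartitionStrategy.fromString("RandomVertexCut")

    println(graph.edges.count())
    sc.stop()
  }
}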
Example 1
Source File: GraphGeneration.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.lib.TriangleCount
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object GraphGeneration extends App {

  val conf = new SparkConf()
    .setAppName("Graph generation")
    .setMaster("local[4]")

  val sc = new SparkContext(conf)

  val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt")

  val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map { line =>
    val field = line.split(" ")
    (field(0).toLong, field(1).toLong)
  }
  val edgeTupleGraph = Graph.fromEdgeTuples(
    rawEdges = rawEdges, defaultValue = "")

  val gridGraph = GraphGenerators.gridGraph(sc, 5, 5)
  val starGraph = GraphGenerators.starGraph(sc, 11)
  val logNormalGraph = GraphGenerators.logNormalGraph(
    sc, numVertices = 20, mu = 1, sigma = 3
  )
  logNormalGraph.outDegrees.map(_._2).collect().sorted

  val actorGraph = GraphLoader.edgeListFile(
    sc, "./ca-hollywood-2009.txt", true
  ).partitionBy(PartitionStrategy.RandomVertexCut)
  actorGraph.edges.count()

  val actorComponents = actorGraph.connectedComponents().cache
  actorComponents.vertices.map(_._2).distinct().count
  val clusterSizes = actorComponents.vertices.map(
    v => (v._2, 1)).reduceByKey(_ + _)
  clusterSizes.map(_._2).max
  clusterSizes.map(_._2).min

  val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt")
  val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5)
  strongComponents.vertices.map(_._2).distinct().count

  val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges()
  val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut)
  actorGraph.triangleCount()
  val triangles = TriangleCount.runPreCanonicalized(partitionedGraph)

  actorGraph.staticPageRank(10)
  val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001)
  actorPrGraph.vertices.reduce((v1, v2) => {
    if (v1._2 > v2._2) v1 else v2
  })

  actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println)

  actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10)

  actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count
}
Example 2
Source File: SynthBenchmark.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.graphx

import java.io.{FileOutputStream, PrintWriter}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{GraphXUtils, PartitionStrategy}
import org.apache.spark.graphx.util.GraphGenerators

object SynthBenchmark {

  def main(args: Array[String]) {
    val options = args.map { arg =>
      arg.dropWhile(_ == '-').split('=') match {
        case Array(opt, v) => (opt -> v)
        case _ => throw new IllegalArgumentException("Invalid argument: " + arg)
      }
    }

    var app = "pagerank"
    var niter = 10
    var numVertices = 100000
    var numEPart: Option[Int] = None
    var partitionStrategy: Option[PartitionStrategy] = None
    var mu: Double = 4.0
    var sigma: Double = 1.3
    var degFile: String = ""
    var seed: Int = -1

    options.foreach {
      case ("app", v) => app = v
      case ("niters", v) => niter = v.toInt
      case ("nverts", v) => numVertices = v.toInt
      case ("numEPart", v) => numEPart = Some(v.toInt)
      case ("partStrategy", v) => partitionStrategy = Some(PartitionStrategy.fromString(v))
      case ("mu", v) => mu = v.toDouble
      case ("sigma", v) => sigma = v.toDouble
      case ("degFile", v) => degFile = v
      case ("seed", v) => seed = v.toInt
      case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt)
    }

    val conf = new SparkConf()
      .setAppName(s"GraphX Synth Benchmark (nverts = $numVertices, app = $app)")
    GraphXUtils.registerKryoClasses(conf)

    val sc = new SparkContext(conf)

    // Create the graph
    println(s"Creating graph...")
    val unpartitionedGraph = GraphGenerators.logNormalGraph(sc, numVertices,
      numEPart.getOrElse(sc.defaultParallelism), mu, sigma, seed)
    // Repartition the graph
    val graph = partitionStrategy.foldLeft(unpartitionedGraph)(_.partitionBy(_)).cache()

    var startTime = System.currentTimeMillis()
    val numEdges = graph.edges.count()
    println(s"Done creating graph. Num Vertices = $numVertices, Num Edges = $numEdges")
    val loadTime = System.currentTimeMillis() - startTime

    // Collect the degree distribution (if desired)
    if (!degFile.isEmpty) {
      val fos = new FileOutputStream(degFile)
      val pos = new PrintWriter(fos)
      val hist = graph.vertices.leftJoin(graph.degrees)((id, _, optDeg) => optDeg.getOrElse(0))
        .map(p => p._2).countByValue()
      hist.foreach {
        case (deg, count) => pos.println(s"$deg \t $count")
      }
    }

    // Run PageRank
    startTime = System.currentTimeMillis()
    if (app == "pagerank") {
      println("Running PageRank")
      val totalPR = graph.staticPageRank(niter).vertices.map(_._2).sum()
      println(s"Total PageRank = $totalPR")
    } else if (app == "cc") {
      println("Running Connected Components")
      val numComponents = graph.connectedComponents.vertices.map(_._2).distinct().count()
      println(s"Number of components = $numComponents")
    }
    val runTime = System.currentTimeMillis() - startTime

    println(s"Num Vertices = $numVertices")
    println(s"Num Edges = $numEdges")
    println(s"Creation time = ${loadTime / 1000.0} seconds")
    println(s"Run time = ${runTime / 1000.0} seconds")

    sc.stop()
  }
}
// scalastyle:on println
Example 3
Source File: LoadGraph.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.graphx

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.graphx.{GraphLoader, PartitionStrategy}

class LoadGraph extends ConfigurableStop {

  val authorEmail: String = "[email protected]"
  val description: String = "Load data and construct a graphx"
  val inportList: List[String] = List(Port.DefaultPort)
  var edgePort: String = "edges"
  var vertexPort: String = "vertex"
  val outportList: List[String] = List(edgePort, vertexPort)

  var dataPath: String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val sc = spark.sparkContext
    import spark.sqlContext.implicits._

    var graph = GraphLoader
      .edgeListFile(sc, dataPath, true)
      .partitionBy(PartitionStrategy.RandomVertexCut)

    // TODO: can not transfer EdgeRdd to Dataset
    out.write(edgePort, graph.edges.toDF())
    out.write(vertexPort, graph.vertices.toDF())
  }

  def initialize(ctx: ProcessContext): Unit = {
  }

  def setProperties(map: Map[String, Any]): Unit = {
    dataPath = MapUtil.get(map, "dataPath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val dataPath = new PropertyDescriptor()
      .name("dataPath")
      .displayName("Data_Path")
      .defaultValue("")
      .allowableValues(Set(""))
      .required(true)
      .example("hdfs://192.168.3.138:8020/work/test/test.csv")
    descriptor = dataPath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/graphx/LoadGraph.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.GraphX.toString)
  }

}
Example 4
Source File: kBCDriver.scala From spark-betweenness with Apache License 2.0
package com.centrality.kBC

import java.util.Calendar

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.GraphLoader
import org.apache.spark.graphx.PartitionStrategy

object kBCDriver {
  def main(args: Array[String]) {
    // Create spark context
    val appName = "kBCDriver"
    val conf = new SparkConf().setAppName(appName) //.setMaster(master)
    val sc = new SparkContext(conf)

    // Graph partition params
    val DEFAULT_K = 2
    val DEFAULT_EDGE_PARTITIONS = 60
    val DEFAULT_CANONICAL_ORIENTATION = true
    val k = args(0).toInt
    println("k : " + k)
    val canonicalOrientation = DEFAULT_CANONICAL_ORIENTATION
    val numEdgePartitions = args(1).toInt

    // Input params
    val DEFAULT_INPUT_DIR = "/tmp/input/"
    val DEFAULT_INPUT_FILE_NAME = "edge_list.txt"
    val inputDir = args(2)
    val inputFileName = args(4)
    val inputPath = inputDir + inputFileName
    println("inputPath : " + inputPath)

    // Output params
    val DEFAULT_OUTPUT_DIR = "/tmp/output/"
    val DEFAULT_V_OUTPUT_FILE = List(inputFileName, "kbc", k, "vertices").mkString("_") + ".txt"
    val DEFAULT_E_OUTPUT_FILE = List(inputFileName, "kbc", k, "edges").mkString("_") + ".txt"
    val outputDir = args(3)
    val outputVerticesFileName = sc.hadoopConfiguration.get("outputVerticesFileName", DEFAULT_V_OUTPUT_FILE)
    val outputEdgesFileName = sc.hadoopConfiguration.get("outputEdgesFileName", DEFAULT_E_OUTPUT_FILE)
    val outputVerticesPath = sc.hadoopConfiguration.get("outputVerticesPath", outputDir + outputVerticesFileName)
    val outputEdgesPath = sc.hadoopConfiguration.get("outputEdgesPath", outputDir + outputEdgesFileName)
    println("outputVerticesPath : " + outputVerticesPath)
    println("outputEdgesPath : " + outputEdgesPath)

    // Read graph
    val graph = GraphLoader.edgeListFile(sc, inputPath, canonicalOrientation, numEdgePartitions)
      .partitionBy(PartitionStrategy.EdgePartition2D)
    println(Calendar.getInstance().getTime().toString + " vertices : " + graph.vertices.count())
    println(Calendar.getInstance().getTime().toString + " edges : " + graph.edges.count())

    // Run kBC
    println(Calendar.getInstance().getTime().toString + ": start kBC")
    val kBCGraph = KBetweenness.run(graph, k)

    // Save graph to file
    println(Calendar.getInstance().getTime().toString + ": saving results ")
    kBCGraph.vertices.coalesce(1).saveAsTextFile(outputVerticesPath)
    kBCGraph.edges.coalesce(1).saveAsTextFile(outputEdgesPath)
  }
}
Example 5
Source File: TriangleCountingExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.graphx

// $example on$
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{GraphLoader, PartitionStrategy}
// $example off$

object TriangleCountingExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TriangleCountingExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load the edges in canonical order and partition the graph for triangle count
    val graph = GraphLoader.edgeListFile(sc, "data/graphx/followers.txt", true)
      .partitionBy(PartitionStrategy.RandomVertexCut)
    // Find the triangle count for each vertex
    val triCounts = graph.triangleCount().vertices
    // Join the triangle counts with the usernames
    val users = sc.textFile("data/graphx/users.txt").map { line =>
      val fields = line.split(",")
      (fields(0).toLong, fields(1))
    }
    val triCountByUsername = users.join(triCounts).map { case (id, (username, tc)) =>
      (username, tc)
    }
    // Print the result
    println(triCountByUsername.collect().mkString("\n"))
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 6
Source File: CommunityBasedPartitioning.scala From sparkling-graph with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.partitioning

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.{CommunityDetectionAlgorithm, CommunityDetectionMethod, ComponentID}
import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.DefaultPartitionOperator
import org.apache.log4j.Logger
import org.apache.spark.{Partitioner, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, PartitionID, PartitionStrategy, VertexId}

import scala.reflect.ClassTag

object CommunityBasedPartitioning {
  @transient
  val logger = Logger.getLogger(CommunityBasedPartitioning.getClass())

  def partitionGraphBy[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED],
      communityDetectionMethod: CommunityDetectionMethod[VD, ED],
      numParts: Int = -1)(implicit sc: SparkContext): Graph[VD, ED] = {
    val numberOfPartitions = if (numParts == -1) sc.defaultParallelism else numParts
    val communities: Graph[ComponentID, ED] = communityDetectionMethod(graph)
    val numberOfCommunities = communities.vertices.values.countApproxDistinct()
    val (coarsedVertexMap, coarsedNumberOfPartitions) =
      ParallelPartitioningUtils.coarsePartitions(numberOfPartitions, numberOfCommunities, communities.vertices)
    val strategy = ByComponentIdPartitionStrategy(coarsedVertexMap, coarsedNumberOfPartitions, DefaultPartitionOperator)
    logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries and ${coarsedNumberOfPartitions} partitions")
    val out = graph.partitionBy(strategy, numberOfCommunities.toInt).cache()
    out.edges.foreachPartition((_) => {})
    out.vertices.foreachPartition((_) => {})
    out
  }

  def partitionGraphUsing[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED],
      communityDetectionMethod: CommunityDetectionAlgorithm,
      numParts: Int = -1)(implicit sc: SparkContext): Graph[VD, ED] = {
    partitionGraphBy(graph, communityDetectionMethod.detectCommunities[VD, ED](_), numParts)
  }

}
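The ByComponentIdPartitionStrategy used above is a custom strategy defined elsewhere in sparkling-graph. Independent of that project, a hand-rolled PartitionStrategy only has to implement getPartition; the sketch below is a hypothetical strategy that co-locates edges by a precomputed vertex-to-component map, and the class name and map are illustration only, not the sparkling-graph implementation.

import org.apache.spark.graphx.{PartitionID, PartitionStrategy, VertexId}

// Hypothetical custom strategy: route each edge by the component of its source vertex.
case class ComponentPartitionStrategy(vertexToComponent: Map[VertexId, Int]) extends PartitionStrategy {
  override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = {
    // Fall back to the source vertex id when it has no known component
    val component = vertexToComponent.getOrElse(src, src.toInt)
    math.abs(component) % numParts
  }
}

Such a strategy is passed to partitionBy the same way as the built-in ones, e.g. graph.partitionBy(ComponentPartitionStrategy(vertexToComponent), numParts).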
Example 7
Source File: FastUnfolding.scala From fastunfolding with Apache License 2.0
package com.soteradefense.dga.graphx.louvain

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{VertexId, PartitionStrategy, TripletFields, Graph}

import scala.reflect.ClassTag

class FastUnfolding(outputdir: String, minProgress: Int = 1, progressCounter: Int = 1) {

  var qValues = Array[(Int, Double)]()

  def saveLevel(sc: SparkContext, level: Int, q: Double, graph: Graph[MyVertexState, Long]) = {
    graph.vertices.saveAsTextFile(s"${outputdir}/level_${level}_vertices")
    graph.edges.saveAsTextFile(s"${outputdir}/level_${level}_edges")
    //graph.vertices.map( {case (id,v) => ""+id+","+v.internalWeight+","+v.community }).saveAsTextFile(outputdir+"/level_"+level+"_vertices")
    //graph.edges.mapValues({case e=>""+e.srcId+","+e.dstId+","+e.attr}).saveAsTextFile(outputdir+"/level_"+level+"_edges")
    qValues = qValues :+ ((level, q))
    println(s"qValue: $q")

    // overwrite the q values at each level
    sc.parallelize(qValues, 1).saveAsTextFile(s"${outputdir}/qvalues")
  }

  def run[VD: ClassTag](sc: SparkContext, graph: Graph[VD, Long]) = {
    val initialGraph = createGraph(graph)

    val graphWeight = initialGraph.vertices.map(
      vertex => {
        vertex._2.nodeWeight
      }
    ).reduce(_ + _)

    val broadcastGraphWeight = sc.broadcast(graphWeight)

    val initialModularity = initialGraph.vertices.map(
      vertex => {
        vertex._2.in / (2 * graphWeight) - vertex._2.tot * vertex._2.tot / (graphWeight * graphWeight)
      }
    ).reduce(_ + _)

    var level = -1
    var halt = false
    while (!halt) {
      level += 1
      println(s"Starting level ${level}")

      val (currentQ, currentGraph, passes) = runFastUnfolding(sc, initialGraph, minProgress, progressCounter)
    }
  }

  def runFastUnfolding(sc: SparkContext, graph: Graph[MyVertexState, Long], minProgress: Int, progressCounter: Int) = {
    val cachedGraph = graph.cache()
  }

  def createGraph[VD: ClassTag](graph: Graph[VD, Long]): Graph[MyVertexState, Long] = {
    val nodeWeights = graph.aggregateMessages[Long](
      cxt => {
        cxt.sendToSrc(cxt.attr)
        cxt.sendToDst(cxt.attr)
      },
      (a, b) => a + b,
      TripletFields.EdgeOnly
    )

    nodeWeights.foreach(result => println(s"nodeweight: ${result._1}, ${result._2}"))

    val louvainGraph = graph.outerJoinVertices(nodeWeights)((vid, data, weightOption) => {
      val weight = weightOption.getOrElse(0L)
      val state = new MyVertexState()
      state.community = vid
      state.changed = false
      state.tot = weight
      state.in = 0
      state.nodeWeight = weight
      state
    }).partitionBy(PartitionStrategy.EdgePartition2D)

    louvainGraph
  }
}
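The MyVertexState class referenced above is defined elsewhere in the fastunfolding project and is not shown here. A minimal sketch that would let the snippet compile is given below; it only declares the fields the example reads and writes, and the field types are assumptions, not the project's actual definition.

import org.apache.spark.graphx.VertexId

// Hypothetical sketch of the per-vertex Louvain state used above.
// Field types are assumed: community mirrors the vertex id, tot/in hold
// weighted degree sums, nodeWeight is the aggregated edge weight.
class MyVertexState extends Serializable {
  var community: VertexId = -1L
  var changed: Boolean = false
  var tot: Double = 0.0
  var in: Double = 0.0
  var nodeWeight: Long = 0L
}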