org.apache.spark.graphx.GraphLoader Scala Examples

The following examples show how to use org.apache.spark.graphx.GraphLoader. Each example names its original project, source file, and license in the heading above the code.
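All of the examples center on GraphLoader.edgeListFile, which parses a plain-text edge list (one whitespace-separated "srcId dstId" pair per line; lines starting with # are skipped) into a Graph[Int, Int] whose vertex and edge attributes default to 1. The snippet below is a minimal sketch of that pattern, assuming a local master and a hypothetical ./edge_list.txt in the working directory; it is not taken from any of the projects listed here.

import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy}
import org.apache.spark.{SparkConf, SparkContext}

object GraphLoaderSketch extends App {
  val sc = new SparkContext(
    new SparkConf().setAppName("GraphLoaderSketch").setMaster("local[*]"))

  // edge_list.txt: one "srcId dstId" pair per line; lines starting with # are ignored
  val graph: Graph[Int, Int] = GraphLoader.edgeListFile(
    sc,
    "./edge_list.txt",            // hypothetical path
    canonicalOrientation = true   // reorient edges so that srcId < dstId
  ).partitionBy(PartitionStrategy.RandomVertexCut)

  println(s"vertices: ${graph.numVertices}, edges: ${graph.numEdges}")
  sc.stop()
}

The canonicalOrientation flag and the explicit PartitionStrategy mirror the settings used in several of the examples that follow.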
Example 1
Source File: GraphGeneration.scala    From Mastering-Machine-Learning-with-Spark-2.x   with MIT License
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.lib.TriangleCount
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}


object GraphGeneration extends App {

  val conf = new SparkConf()
    .setAppName("Graph generation")
    .setMaster("local[4]")
  val sc = new SparkContext(conf)

  val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt")

  val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map {
    line =>
      val field = line.split(" ")
      (field(0).toLong, field(1).toLong)
  }
  val edgeTupleGraph = Graph.fromEdgeTuples(
    rawEdges = rawEdges, defaultValue = "")

  val gridGraph = GraphGenerators.gridGraph(sc, 5, 5)
  val starGraph = GraphGenerators.starGraph(sc, 11)
  val logNormalGraph = GraphGenerators.logNormalGraph(
    sc, numVertices = 20, mu = 1, sigma = 3
  )
  logNormalGraph.outDegrees.map(_._2).collect().sorted

  val actorGraph = GraphLoader.edgeListFile(
    sc, "./ca-hollywood-2009.txt", true
  ).partitionBy(PartitionStrategy.RandomVertexCut)
  actorGraph.edges.count()

  val actorComponents = actorGraph.connectedComponents().cache
  actorComponents.vertices.map(_._2).distinct().count

  val clusterSizes = actorComponents.vertices.map(
    v => (v._2, 1)).reduceByKey(_ + _)
  clusterSizes.map(_._2).max
  clusterSizes.map(_._2).min

  val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt")
  val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5)
  strongComponents.vertices.map(_._2).distinct().count

  val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges()
  val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut)

  actorGraph.triangleCount()
  val triangles = TriangleCount.runPreCanonicalized(partitionedGraph)

  actorGraph.staticPageRank(10)
  val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001)
  actorPrGraph.vertices.reduce((v1, v2) => {
    if (v1._2 > v2._2) v1 else v2
  })

  actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println)

  actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10)

  actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count

} 
Example 2
Source File: LoadGraph.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.graphx

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.graphx.{GraphLoader, PartitionStrategy}
class LoadGraph extends ConfigurableStop {

  val authorEmail: String = "[email protected]"
  val description: String = "Load data and construct a GraphX graph"
  val inportList: List[String] = List(Port.DefaultPort)


  var edgePort : String = "edges"
  var vertexPort : String = "vertex"
  val outportList: List[String] = List(edgePort,vertexPort)


  var dataPath:String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val sc = spark.sparkContext

    import spark.sqlContext.implicits._
    val graph = GraphLoader
      .edgeListFile(sc, dataPath, true)
      .partitionBy(PartitionStrategy.RandomVertexCut)
    // TODO: cannot transfer EdgeRDD to Dataset directly
    out.write(edgePort, graph.edges.toDF())
    out.write(vertexPort, graph.vertices.toDF())

  }

  def initialize(ctx: ProcessContext): Unit = {

  }

  def setProperties(map : Map[String, Any]): Unit = {
    dataPath = MapUtil.get(map,"dataPath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val dataPath = new PropertyDescriptor()
      .name("dataPath")
      .displayName("Data_Path")
      .defaultValue("")
      .allowableValues(Set(""))
      .required(true)
      .example("hdfs://192.168.3.138:8020/work/test/test.csv")
    descriptor = dataPath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/graphx/LoadGraph.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.GraphX.toString)
  }

} 
Example 3
Source File: kBCDriver.scala    From spark-betweenness   with Apache License 2.0
package com.centrality.kBC

import java.util.Calendar

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.GraphLoader
import org.apache.spark.graphx.PartitionStrategy

object kBCDriver 
{
  def main(args: Array[String])
  {
    // Create spark context
    val appName = "kBCDriver"
    val conf = new SparkConf().setAppName(appName) //.setMaster(master)
    val sc = new SparkContext(conf)

    // Graph partition params
    val DEFAULT_K = 2
    val DEFAULT_EDGE_PARTITIONS = 60
    val DEFAULT_CANONICAL_ORIENTATION = true
    val k = args(0).toInt
    println("k : " + k)
    val canonicalOrientation = DEFAULT_CANONICAL_ORIENTATION
    val numEdgePartitions = args(1).toInt
    
    // Input params
    val DEFAULT_INPUT_DIR = "/tmp/input/"
    val DEFAULT_INPUT_FILE_NAME = "edge_list.txt"
    val inputDir = args(2)
    val inputFileName = args(4)
    val inputPath = inputDir + inputFileName
    println("inputPath : " + inputPath)
    
    // Output params
    val DEFAULT_OUTPUT_DIR = "/tmp/output/"
    val DEFAULT_V_OUTPUT_FILE = List(inputFileName, "kbc", k, "vertices").mkString("_") + ".txt"
    val DEFAULT_E_OUTPUT_FILE = List(inputFileName, "kbc", k, "edges").mkString("_") + ".txt"
    val outputDir = args(3)
    val outputVerticesFileName = sc.hadoopConfiguration.get("outputVerticesFileName", DEFAULT_V_OUTPUT_FILE)
    val outputEdgesFileName = sc.hadoopConfiguration.get("outputEdgesFileName", DEFAULT_E_OUTPUT_FILE)
    val outputVerticesPath = sc.hadoopConfiguration.get("outputVerticesPath", outputDir+outputVerticesFileName)
    val outputEdgesPath = sc.hadoopConfiguration.get("outputEdgesPath", outputDir+outputEdgesFileName)
    println("outputVerticesPath : " + outputVerticesPath)
    println("outputEdgesPath : " + outputEdgesPath)
    
    // Read graph
    val graph = GraphLoader.edgeListFile(sc, inputPath, canonicalOrientation, numEdgePartitions)
      .partitionBy(PartitionStrategy.EdgePartition2D)
    println(Calendar.getInstance().getTime().toString + " vertices : " + graph.vertices.count())
    println(Calendar.getInstance().getTime().toString + " edges : " + graph.edges.count())
    
    // Run kBC
    println(Calendar.getInstance().getTime().toString + ": start kBC")
    val kBCGraph = KBetweenness.run(graph, k)
    
    // Save graph to file
    println(Calendar.getInstance().getTime().toString + ": saving results ") 
    kBCGraph.vertices.coalesce(1).saveAsTextFile(outputVerticesPath)
    kBCGraph.edges.coalesce(1).saveAsTextFile(outputEdgesPath)
  }
} 
Example 4
Source File: PageRankTest.scala    From Hands-On-Big-Data-Analytics-with-PySpark   with MIT License
package com.tomekl007.chapter_7

import org.apache.spark.graphx.GraphLoader
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite
import org.scalatest.Matchers._

class PageRankTest extends FunSuite {
  private val sc = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should calculate page rank using GraphX API") {
    //given
    val graph = GraphLoader.edgeListFile(sc, getClass.getResource("/pagerank/followers.txt").getPath)
    val ranks = graph.pageRank(0.0001).vertices

    val users = sc.textFile(getClass.getResource("/pagerank/users.txt").getPath).map { line =>
      val fields = line.split(",")
      (fields(0).toLong, fields(1))
    }

    //when
    val rankByUsername = users.join(ranks).map {
      case (_, (username, rank)) => (username, rank)
    }.sortBy((t) => t._2, ascending = false)
      .collect()
      .toList

    println(rankByUsername)
    //then
    rankByUsername.map(_._1) should contain theSameElementsInOrderAs List(
      "BarackObama",
      "ladygaga",
      "odersky",
      "jeresig",
      "matei_zaharia",
      "justinbieber"
    )
  }


} 
Example 5
Source File: GraphDataGenTest.scala    From spark-bench   with Apache License 2.0
package com.ibm.sparktc.sparkbench.datageneration

import java.io.File

import com.ibm.sparktc.sparkbench.testfixtures.{BuildAndTeardownData, SparkSessionProvider}
import com.ibm.sparktc.sparkbench.utils.SparkBenchException
import org.apache.spark.graphx.GraphLoader
import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers}

class GraphDataGenTest extends FlatSpec with Matchers with BeforeAndAfterEach {
  val cool = new BuildAndTeardownData("graph-data-gen")

  val fileName = s"${cool.sparkBenchTestFolder}/${java.util.UUID.randomUUID.toString}.txt"

  var file: File = _

  override def beforeEach() {
    cool.createFolders()
    file = new File(fileName)
  }

  override def afterEach() {
    cool.deleteFolders()
  }

  "GraphDataGeneration" should "generate data correctly with all default options" in {

    val m = Map(
      "name" -> "graph-data-generator",
      "vertices" -> 100,
      "output" -> fileName
    )
    val generator = GraphDataGen(m)
    generator.doWorkload(spark = SparkSessionProvider.spark)
    val res = GraphLoader.edgeListFile(SparkSessionProvider.spark.sparkContext, fileName)

    res.vertices.count() shouldBe m("vertices")
  }

  it should "throw an error for any output format but .txt" in {
    val m1 = Map(
      "name" -> "graph-data-generator",
      "vertices" -> 100,
      "output" -> "/my-cool-file.csv"
    )
    val m2 = Map(
      "name" -> "graph-data-generator",
      "vertices" -> 100,
      "output" -> "/my-cool-file.parquet"
    )
    val m3 = Map(
      "name" -> "graph-data-generator",
      "vertices" -> 100,
      "output" -> "/my-cool-file.tsv"
    )

    a [SparkBenchException] should be thrownBy GraphDataGen(m1)
    a [SparkBenchException] should be thrownBy GraphDataGen(m2)
    a [SparkBenchException] should be thrownBy GraphDataGen(m3)
  }

  it should "throw errors when required values are missing" in {
    // Missing vertices
    val m1 = Map(
      "name" -> "graph-data-generator",
      "output" -> "/my-cool-file.csv"
    )
    // Missing output file name
    val m2 = Map(
      "name" -> "graph-data-generator",
      "vertices" -> 100
    )
    a [SparkBenchException] should be thrownBy GraphDataGen(m1)
    a [SparkBenchException] should be thrownBy GraphDataGen(m2)
  }
} 
Example 6
Source File: TriangleCountingExample.scala    From spark1.52   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.graphx

// $example on$
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{GraphLoader, PartitionStrategy}
// $example off$

object TriangleCountingExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TriangleCountingExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load the edges in canonical order and partition the graph for triangle count
    val graph = GraphLoader.edgeListFile(sc, "data/graphx/followers.txt", true)
      .partitionBy(PartitionStrategy.RandomVertexCut)
    // Find the triangle count for each vertex
    val triCounts = graph.triangleCount().vertices
    // Join the triangle counts with the usernames
    val users = sc.textFile("data/graphx/users.txt").map { line =>
      val fields = line.split(",")
      (fields(0).toLong, fields(1))
    }
    val triCountByUsername = users.join(triCounts).map { case (id, (username, tc)) =>
      (username, tc)
    }
    // Print the result
    println(triCountByUsername.collect().mkString("\n"))
    // $example off$
    sc.stop()
  }
}
// scalastyle:on println 
Example 7
Source File: IterativeComputation$Test.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.api.operators

import org.apache.spark.graphx.GraphLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfter, FlatSpec}


class IterativeComputation$Test extends FlatSpec with BeforeAndAfter {

  val master = "local[*]"
  def appName: String = "IterativeComputationTest"

  implicit val sc: SparkContext = {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)
    new SparkContext(conf)
  }

  after {
    if(!sc.isStopped){
      sc.stop()
    }
  }

  def loadGraph(file: String) = {
    GraphLoader.edgeListFile(sc, file.toString)
  }

  "Correct number of vertices " should "be returned" in{
    //Given("Graph")
    val graph=loadGraph(getClass.getResource("/graph").toString)
    //When("Taking size")
    val bucketSize: Long = IterativeComputation.wholeGraphBucket(graph)
    //Then("")
    assert(graph.numVertices==bucketSize)
  }

} 
Example 8
Source File: MeasureTest.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Graph, GraphLoader}
import org.scalatest._


abstract class MeasureTest(implicit sc: SparkContext) extends FlatSpec with BeforeAndAfterAll with GivenWhenThen with Matchers {
  def time[T](str: String)(thunk: => T): (T, Long) = {
    logger.info(s"$str...")
    val t1 = System.currentTimeMillis
    val x = thunk
    val t2 = System.currentTimeMillis
    val diff = t2 - t1
    logger.info(s"$diff ms")
    (x, diff)
  }

  val logger = Logger.getLogger(this.getClass)

  def loadGraph(file: String) = {
    val out: Graph[Int, Int] = GraphLoader.edgeListFile(sc, file.toString)
    out.vertices.setName(s"Graph vertices ${file}")
    out.edges.setName(s"Graph edges ${file}")
    out.triplets.setName(s"Graph triplets ${file}")
    out
  }

}