org.apache.spark.HashPartitioner Scala Examples
The following examples show how to use org.apache.spark.HashPartitioner.
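Before the project examples, here is a minimal, self-contained sketch of the basic API. HashPartitioner itself ships with Spark core; the sample data, application name, and partition counts below are made up for illustration.

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object HashPartitionerBasics {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("HashPartitionerBasics").setMaster("local[2]"))

    // HashPartitioner only applies to key-value RDDs: each record is sent to
    // partition nonNegativeMod(key.hashCode, numPartitions); null keys go to partition 0.
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3), ("c", 4)))
    val partitioned = pairs.partitionBy(new HashPartitioner(4))

    println(partitioned.partitioner.isDefined) // true
    println(partitioned.getNumPartitions)      // 4

    // Passing an equal partitioner to a shuffle operation lets Spark reuse the
    // existing layout instead of shuffling the data again.
    val counts = partitioned.reduceByKey(new HashPartitioner(4), _ + _)
    counts.collect().foreach(println)

    sc.stop()
  }
}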
Example 1
Source File: EdgeRDDImpl.scala From sparkoscope with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{HashPartitioner, OneToOneDependency} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 2
Source File: PregelNWeight.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.graph.nweight import scala.collection.JavaConversions._ import org.apache.spark.SparkContext import org.apache.spark.HashPartitioner import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.GraphImpl import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap object PregelNWeight extends Serializable{ def sendMsg(edge: EdgeTriplet[SizedPriorityQueue, Double]) = { val m = new Long2DoubleOpenHashMap() val w1 = edge.attr val id = edge.srcId edge.dstAttr.foreach{ case (target, wn) => if (target != id) m.put(target, wn*w1) } Iterator((id, m)) } def mergMsg(c1: Long2DoubleOpenHashMap, c2: Long2DoubleOpenHashMap) = { c2.long2DoubleEntrySet() .fastIterator() .foreach(pair => c1.put(pair.getLongKey(), c1.get(pair.getLongKey()) + pair.getDoubleValue())) c1 } def vProg(id: VertexId, vdata: SizedPriorityQueue, msg: Long2DoubleOpenHashMap) = { vdata.clear() if (msg.size > 0) { msg.long2DoubleEntrySet().fastIterator().foreach { pair => val src = pair.getLongKey() val wn = pair.getDoubleValue() vdata.enqueue((src, wn)) } vdata } else { vdata.enqueue((id, 1)) vdata } } def nweight(sc: SparkContext, input: String, output: String, step: Int, maxDegree: Int, numPartitions: Int, storageLevel: StorageLevel) { //val start1 = System.currentTimeMillis val part = new HashPartitioner(numPartitions) val edges = sc.textFile(input, numPartitions).flatMap { line => val fields = line.split("\\s+", 2) val src = fields(0).trim.toLong fields(1).split("[,\\s]+").filter(_.isEmpty() == false).map { pairStr => val pair = pairStr.split(":") val (dest, weight) = (pair(0).trim.toLong, pair(1).toDouble) (src, Edge(src, dest, weight)) } }.partitionBy(part).map(_._2) var g = GraphImpl(edges, new SizedPriorityQueue(maxDegree), storageLevel, storageLevel).cache() g = Pregel(g, new Long2DoubleOpenHashMap, step, EdgeDirection.In)( vProg, sendMsg, mergMsg) g.vertices.map { case (vid, vdata) => var s = new StringBuilder s.append(vid) vdata.foreach { r => s.append(' ') s.append(r._1) s.append(':') s.append(r._2) } s.toString }.saveAsTextFile(output) } }
Example 3
Source File: GraphxNWeight.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.graph.nweight import scala.collection.JavaConversions._ import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.HashPartitioner import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.GraphImpl import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap object GraphxNWeight extends Serializable{ def mapF(edge: EdgeContext[SizedPriorityQueue, Double, Long2DoubleOpenHashMap]) = { val theMap = new Long2DoubleOpenHashMap() val edgeAttribute = edge.attr val id = edge.srcId edge.dstAttr.foreach{ case (target, wn) => if (target != id) theMap.put(target, wn * edgeAttribute) } edge.sendToSrc(theMap) } def reduceF(c1: Long2DoubleOpenHashMap, c2: Long2DoubleOpenHashMap) = { c2.long2DoubleEntrySet() .fastIterator() .foreach(pair => c1.put(pair.getLongKey(), c1.get(pair.getLongKey()) + pair.getDoubleValue())) c1 } def updateF(id: VertexId, vdata: SizedPriorityQueue, msg: Option[Long2DoubleOpenHashMap]) = { vdata.clear() val weightMap = msg.orNull if (weightMap != null) { weightMap.long2DoubleEntrySet().fastIterator().foreach { pair => val src = pair.getLongKey() val wn = pair.getDoubleValue() vdata.enqueue((src, wn)) } } vdata } def nweight(sc: SparkContext, input: String, output: String, step: Int, maxDegree: Int, numPartitions: Int, storageLevel: StorageLevel) { //val start1 = System.currentTimeMillis val part = new HashPartitioner(numPartitions) val edges = sc.textFile(input, numPartitions).flatMap { line => val fields = line.split("\\s+", 2) val src = fields(0).trim.toLong fields(1).split("[,\\s]+").filter(_.isEmpty() == false).map { pairStr => val pair = pairStr.split(":") val (dest, weight) = (pair(0).trim.toLong, pair(1).toDouble) (src, Edge(src, dest, weight)) } }.partitionBy(part).map(_._2) val vertices = edges.map { e => (e.srcId, (e.dstId, e.attr)) }.groupByKey(part).map { case (id, seq) => val vdata = new SizedPriorityQueue(maxDegree) seq.foreach(vdata.enqueue) (id, vdata) } var g = GraphImpl(vertices, edges, new SizedPriorityQueue(maxDegree), storageLevel, storageLevel).cache() var msg: RDD[(VertexId, Long2DoubleOpenHashMap)] = null for (i <- 2 to step) { msg = g.aggregateMessages(mapF, reduceF) g = g.outerJoinVertices(msg)(updateF).persist(storageLevel) } g.vertices.map { case (vid, vdata) => var s = new StringBuilder s.append(vid) vdata.foreach { r => s.append(' ') s.append(r._1) s.append(':') s.append(r._2) } s.toString }.saveAsTextFile(output) } }
Example 4
Source File: DBHPartitioner.scala From zen with Apache License 2.0
package com.github.cloudml.zen.ml.partitioner

import scala.reflect.ClassTag

import com.github.cloudml.zen.ml.clustering.LDADefines._
import org.apache.spark.HashPartitioner
import org.apache.spark.graphx2._
import org.apache.spark.graphx2.impl.GraphImpl
import org.apache.spark.storage.StorageLevel

// NOTE: the class declaration is missing from the scraped snippet; the header below is
// reconstructed from its usage (new DBHPartitioner(numPartitions, 0), the `threshold`
// field, and the comparison against numPartitions), not copied from the zen source.
class DBHPartitioner(partitions: Int, val threshold: Int = 0)
  extends HashPartitioner(partitions) {

  // Degree-Based Hashing: key an edge by its lower-degree endpoint, unless both
  // endpoint degrees are below the threshold.
  def getKey(et: EdgeTriplet[Int, _]): Long = {
    val srcId = et.srcId
    val dstId = et.dstId
    val srcDeg = et.srcAttr
    val dstDeg = et.dstAttr
    val maxDeg = math.max(srcDeg, dstDeg)
    val minDegId = if (maxDeg == srcDeg) dstId else srcId
    val maxDegId = if (maxDeg == srcDeg) srcId else dstId
    if (maxDeg < threshold) {
      maxDegId
    } else {
      minDegId
    }
  }

  override def equals(other: Any): Boolean = other match {
    case dbh: DBHPartitioner =>
      dbh.numPartitions == numPartitions
    case _ =>
      false
  }
}

object DBHPartitioner {
  def partitionByDBH[VD: ClassTag, ED: ClassTag](input: Graph[VD, ED],
    storageLevel: StorageLevel): Graph[VD, ED] = {
    val edges = input.edges
    val conf = edges.context.getConf
    val numPartitions = conf.getInt(cs_numPartitions, edges.partitions.length)
    val dbh = new DBHPartitioner(numPartitions, 0)
    val degGraph = GraphImpl(input.degrees, edges)
    val newEdges = degGraph.triplets.mapPartitions(_.map(et =>
      (dbh.getKey(et), Edge(et.srcId, et.dstId, et.attr))
    )).partitionBy(dbh).map(_._2)
    GraphImpl(input.vertices, newEdges, null.asInstanceOf[VD], storageLevel, storageLevel)
  }
}
Example 5
Source File: EdgeDstPartitioner.scala From zen with Apache License 2.0
package com.github.cloudml.zen.ml.partitioner

import scala.reflect.ClassTag

import com.github.cloudml.zen.ml.clustering.LDADefines._
import org.apache.spark.HashPartitioner
import org.apache.spark.graphx2._
import org.apache.spark.graphx2.impl.GraphImpl
import org.apache.spark.storage.StorageLevel

class EdgeDstPartitioner(val partitions: Int) extends HashPartitioner(partitions) {
  @inline def getKey(et: EdgeTriplet[_, _]): Long = et.dstId

  override def equals(other: Any): Boolean = other match {
    case edp: EdgeDstPartitioner =>
      edp.numPartitions == numPartitions
    case _ =>
      false
  }
}

object EdgeDstPartitioner {
  def partitionByEDP[VD: ClassTag, ED: ClassTag](input: Graph[VD, ED],
    storageLevel: StorageLevel): Graph[VD, ED] = {
    val edges = input.edges
    val conf = edges.context.getConf
    val numPartitions = conf.getInt(cs_numPartitions, edges.partitions.length)
    val edp = new EdgeDstPartitioner(numPartitions)
    val newEdges = input.triplets.mapPartitions(_.map(et =>
      (edp.getKey(et), Edge(et.srcId, et.dstId, et.attr))
    )).partitionBy(edp).map(_._2)
    GraphImpl(input.vertices, newEdges, null.asInstanceOf[VD], storageLevel, storageLevel)
  }
}
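The two zen partitioners above follow the same pattern: they inherit HashPartitioner's hash-modulo getPartition and only decide which value is used as the partitioning key before calling partitionBy. A stripped-down, hypothetical sketch of that pattern (the class and method names here are illustrative, not taken from the zen project):

import org.apache.spark.HashPartitioner
import org.apache.spark.rdd.RDD

// Keeps HashPartitioner's partitioning logic; equality is narrowed to this type,
// mirroring how the zen partitioners override equals.
class DstKeyedPartitioner(partitions: Int) extends HashPartitioner(partitions) {
  override def equals(other: Any): Boolean = other match {
    case p: DstKeyedPartitioner => p.numPartitions == numPartitions
    case _ => false
  }
}

object DstKeyedPartitioner {
  // Re-key (src, dst) edges by destination id, partition, then restore the original shape.
  def partitionByDst(edges: RDD[(Long, Long)], numPartitions: Int): RDD[(Long, Long)] = {
    val part = new DstKeyedPartitioner(numPartitions)
    edges.map { case (src, dst) => (dst, src) }
      .partitionBy(part)
      .map { case (dst, src) => (src, dst) }
  }
}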
Example 6
Source File: EdgeRDDImpl.scala From zen with Apache License 2.0
package org.apache.spark.graphx2.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx2._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 7
Source File: UtilSpark.scala From Clustering4Ever with Apache License 2.0
package org.clustering4ever.sparktools import scala.language.higherKinds import org.apache.spark.rdd.RDD import org.apache.spark.HashPartitioner import scala.reflect.runtime.universe.TypeTag import scala.util.Random import scala.reflect.ClassTag import scala.collection.{GenSeq, mutable} import org.clustering4ever.preprocessing.Preprocessable import org.clustering4ever.hashing.HashingScalar import org.clustering4ever.vectors.{GVector, ScalarVector} object UtilSpark { type IndexPartition = Int type HasConverged = Boolean type IsOriginalDot = Boolean final def generateDataLocalityOnHashsedDS[ O, Pz[B, C <: GVector[C]] <: Preprocessable[B, C, Pz] ]( rddToPartitioned: RDD[Pz[O, ScalarVector]], nbblocs1: Int, nbBucketRange: Int ): RDD[(IndexPartition, (Pz[O, ScalarVector], IsOriginalDot, HasConverged))] = { val isOriginalPoint = true val hasConverged = true val bucketRange = 1 to nbBucketRange val lshRDD = rddToPartitioned.map((_, isOriginalPoint, !hasConverged)) val localityPerPartitionRDD = lshRDD.mapPartitionsWithIndex{ (idx, it) => val ar = it.toList def rightNeighbourhood = ar.flatMap{ case (cz, _, _) => bucketRange.collect{ case i if(idx + i < nbblocs1) => (idx + i, (cz, !isOriginalPoint, !hasConverged)) } } def leftNeighbourhood = ar.flatMap{ case (cz, _, _) => bucketRange.collect{ case i if(idx - i >= 0) => (idx - i, (cz, !isOriginalPoint, !hasConverged)) } } val composing = if(idx == 0) ar.map((idx, _)) ::: rightNeighbourhood else if(idx == nbblocs1 - 1) ar.map((idx, _)) ::: leftNeighbourhood else ar.map((idx, _)) ::: leftNeighbourhood ::: rightNeighbourhood composing.toIterator }.partitionBy(new HashPartitioner(nbblocs1)) localityPerPartitionRDD } final def generateDataLocalityLD[ O, Pz[B, C <: GVector[C]] <: Preprocessable[B, C, Pz], Hasher <: HashingScalar ]( rddToPartitioned: RDD[Pz[O, ScalarVector]], hashing: Hasher, nbblocs1: Int, nbBucketRange: Int ): RDD[(IndexPartition, (Pz[O, ScalarVector], IsOriginalDot, HasConverged))] = { val hashedRDD = rddToPartitioned.sortBy( cz => hashing.hf(cz.v) , ascending = true, nbblocs1 ) generateDataLocalityOnHashsedDS(hashedRDD, nbblocs1, nbBucketRange) } }
Example 8
Source File: PartitionBy.scala From learning-spark with Apache License 2.0
package com.javachen.spark.examples.rdd

import org.apache.spark.{HashPartitioner, RangePartitioner, SparkContext}

object PartitionBy {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "ReduceByKeyToDriver Test")
    val data1 = Array[(String, Int)](("K", 1), ("T", 2), ("T", 3), ("W", 4), ("W", 5), ("W", 6))
    val pairs = sc.parallelize(data1, 3)
    //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
    //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
    var result = pairs.partitionBy(new RangePartitioner(2, pairs, true))
    result = pairs.partitionBy(new HashPartitioner(2))
    result.foreach(println)
  }
}
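To see where the keys from the example above actually land, the partition index can be computed directly with getPartition. A small sketch (the two-partition HashPartitioner matches the example; the printed indices are not verified here):

import org.apache.spark.HashPartitioner

object CheckKeyPlacement {
  def main(args: Array[String]): Unit = {
    val p = new HashPartitioner(2)
    // getPartition hashes the key and takes a non-negative modulo of numPartitions;
    // null keys always map to partition 0.
    Seq("K", "T", "W").foreach { k =>
      println(s"key $k -> partition ${p.getPartition(k)}")
    }
  }
}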
Example 9
Source File: EdgeRDDImpl.scala From BigDatalog with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 10
Source File: StatefulNetworkWordCount.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.HashPartitioner import org.apache.spark.streaming._ object StatefulNetworkWordCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: StatefulNetworkWordCount <hostname> <port>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val sparkConf = new SparkConf().setAppName("StatefulNetworkWordCount") // Create the context with a 1 second batch size val ssc = new StreamingContext(sparkConf, Seconds(1)) ssc.checkpoint(".") // Initial state RDD for mapWithState operation val initialRDD = ssc.sparkContext.parallelize(List(("hello", 1), ("world", 1))) // Create a ReceiverInputDStream on target ip:port and count the // words in input stream of \n delimited test (eg. generated by 'nc') val lines = ssc.socketTextStream(args(0), args(1).toInt) val words = lines.flatMap(_.split(" ")) val wordDstream = words.map(x => (x, 1)) // Update the cumulative count using mapWithState // This will give a DStream made of state (which is the cumulative count of the words) val mappingFunc = (word: String, one: Option[Int], state: State[Int]) => { val sum = one.getOrElse(0) + state.getOption.getOrElse(0) val output = (word, sum) state.update(sum) output } val stateDstream = wordDstream.mapWithState( StateSpec.function(mappingFunc).initialState(initialRDD)) stateDstream.print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 11
Source File: EdgeRDDImpl.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{HashPartitioner, OneToOneDependency} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 12
Source File: EdgeRDDImpl.scala From spark1.52 with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 13
Source File: EdgeRDDImpl.scala From iolap with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 14
Source File: StatefulNetworkWordCount.scala From iolap with Apache License 2.0
package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.HashPartitioner import org.apache.spark.streaming._ object StatefulNetworkWordCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: StatefulNetworkWordCount <hostname> <port>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val updateFunc = (values: Seq[Int], state: Option[Int]) => { val currentCount = values.sum val previousCount = state.getOrElse(0) Some(currentCount + previousCount) } val newUpdateFunc = (iterator: Iterator[(String, Seq[Int], Option[Int])]) => { iterator.flatMap(t => updateFunc(t._2, t._3).map(s => (t._1, s))) } val sparkConf = new SparkConf().setAppName("StatefulNetworkWordCount") // Create the context with a 1 second batch size val ssc = new StreamingContext(sparkConf, Seconds(1)) ssc.checkpoint(".") // Initial RDD input to updateStateByKey val initialRDD = ssc.sparkContext.parallelize(List(("hello", 1), ("world", 1))) // Create a ReceiverInputDStream on target ip:port and count the // words in input stream of \n delimited test (eg. generated by 'nc') val lines = ssc.socketTextStream(args(0), args(1).toInt) val words = lines.flatMap(_.split(" ")) val wordDstream = words.map(x => (x, 1)) // Update the cumulative count using updateStateByKey // This will give a Dstream made of state (which is the cumulative count of the words) val stateDstream = wordDstream.updateStateByKey[Int](newUpdateFunc, new HashPartitioner (ssc.sparkContext.defaultParallelism), true, initialRDD) stateDstream.print() ssc.start() ssc.awaitTermination() } }
Example 15
Source File: EdgeRDDImpl.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{HashPartitioner, OneToOneDependency} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 16
Source File: JoinableRDD.scala From cuesheet with Apache License 2.0
package com.kakao.cuesheet.convert

import org.apache.spark.HashPartitioner
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

class JoinableRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) {

  def selfJoin(numPartitions: Int = rdd.partitions.length): RDD[(K, (V, V))] = fastJoin(rdd, numPartitions)

  def fastJoin[W](other: RDD[(K, W)], numPartitions: Int = rdd.partitions.length): RDD[(K, (V, W))] = {
    val partitioner = new HashPartitioner(numPartitions)
    val grouped = rdd cogroup other
    val left = grouped.flatMap{ case (k, (vs, ws)) =>
      vs.zipWithIndex.map { case (v, idx) => ((k, idx), v) }
    }.partitionBy(partitioner)
    val right = grouped.flatMap { case (k, (vs, ws)) =>
      ws.map { w => ((k, w.hashCode()), (w, vs.size)) }
    }.partitionBy(partitioner).flatMap { case ((k, r), (w, size)) =>
      (0 until size).map(i => ((k, w), i))
    }.map { case ((k, w), idx) => ((k, idx), w) }

    (left join right).map { case ((k, idx), (v, w)) => (k, (v, w)) }
  }
}
Example 17
Source File: L3-DStreamWindowAndAction.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date import org.apache.spark.HashPartitioner object RedditWindowAndActionApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditWindowAndActionApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val checkpointPath = "/tmp" ssc.checkpoint(checkpointPath) val updateFunc = (values: Seq[Int], state: Option[Int]) => { val currentCount = values.sum val previousCount = state.getOrElse(0) Some(currentCount + previousCount) } val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString) val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5)) val windowedCounts = windowedRecs.countByValue() windowedCounts.print(10) windowedCounts.saveAsObjectFiles("subreddit", "obj") windowedCounts.saveAsTextFiles("subreddit", "txt") globalCount.saveAsHadoopFiles("subreddit", "hadoop", classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]]) globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop", classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]]) comments.foreachRDD(rdd => { LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count())) }) ssc.start() ssc.awaitTermination() } }
Example 18
Source File: L3-DStreamKeyValue.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date import org.apache.spark.HashPartitioner object RedditKeyValueApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: RedditKeyValueApp <appname> <input_path> <input_path_popular>") System.exit(1) } val Seq(appName, inputPath, inputPathPopular) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val popular = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPathPopular, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val topAuthors = comments.map(rec => ((parse(rec) \ "author").values.toString, 1)) .groupByKey() .map(r => (r._2.sum, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val topAuthors2 = comments.map(rec => ((parse(rec) \ "author").values.toString, 1)) .reduceByKey(_ + _) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val topAuthorsByAvgContent = comments.map(rec => ((parse(rec) \ "author").values.toString, (parse(rec) \ "body").values.toString.split(" ").length)) .combineByKey( (v) => (v, 1), (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), new HashPartitioner(ssc.sparkContext.defaultParallelism)) .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val keyedBySubreddit = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec)) val keyedBySubreddit2 = popular.map(rec => ({ val t = rec.split(",") (t(1).split("/")(4), t(0)) })) val commentsWithIndustry = keyedBySubreddit.join(keyedBySubreddit2) val keyedBySubredditCo = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec)) val keyedBySubredditCo2 = popular.map(rec => ({ val t = rec.split(",") (t(1).split("/")(4), t(0)) })) val commentsWithIndustryCo = keyedBySubreddit.cogroup(keyedBySubreddit2) val checkpointPath = "/tmp" ssc.checkpoint(checkpointPath) val updateFunc = (values: Seq[Int], state: Option[Int]) => { val currentCount = values.sum val previousCount = state.getOrElse(0) Some(currentCount + previousCount) } val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) ssc.start() ssc.awaitTermination() } }
Example 19
Source File: L10-2DataProc.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.spark.HashPartitioner import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.JsonAST.JNothing import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object DataProcApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: DataProcApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .filter(jvalue => { jvalue \ "attributes" \ "Wi-Fi" != JNothing }) .map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "attributes" \ "Wi-Fi").extract[String], (jvalue \ "stars").extract[Int]) }) .combineByKey( (v) => (v, 1), (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), new HashPartitioner(ssc.sparkContext.defaultParallelism)) .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) .print() ssc.start() ssc.awaitTermination() } }
Example 20
Source File: CustomRangePartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License
package com.tomekl007.chapter_5

import com.tomekl007.UserTransaction
import org.apache.spark.sql.SparkSession
import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkContext}
import org.scalatest.FunSuite

class CustomRangePartitionerTest extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should use custom range partitioner") {
    //given
    val keysWithValuesList = Array(
      UserTransaction("A", 100),
      UserTransaction("B", 4),
      UserTransaction("A", 100001),
      UserTransaction("B", 10),
      UserTransaction("C", 10)
    )
    val data = spark.parallelize(keysWithValuesList)
    val keyed = data.keyBy(_.amount)

    //when, then
    val partitioned = keyed.partitionBy(new CustomRangePartitioner(List((0, 100), (100, 10000), (10000, 1000000))))

    //then
    partitioned.collect().toList
  }
}

class CustomRangePartitioner(ranges: List[(Int, Int)]) extends Partitioner {
  override def numPartitions: Int = ranges.size

  override def getPartition(key: Any): Int = {
    if (!key.isInstanceOf[Int]) {
      throw new IllegalArgumentException("partitioner works only for Int type")
    }
    val keyInt = key.asInstanceOf[Int]
    val index = ranges.lastIndexWhere(v => keyInt >= v._1 && keyInt <= v._2)
    println(s"for key: $key return $index")
    index
  }
}
Example 21
Source File: UsePartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License
package com.tomekl007.chapter_5

import com.tomekl007.UserTransaction
import org.apache.spark.{HashPartitioner, RangePartitioner, SparkContext}
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite
import org.scalatest.Matchers._

class UsePartitioner extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should use different partitioners") {
    //given
    val keysWithValuesList = Array(
      UserTransaction("A", 100),
      UserTransaction("B", 4),
      UserTransaction("A", 100001),
      UserTransaction("B", 10),
      UserTransaction("C", 10)
    )
    val data = spark.parallelize(keysWithValuesList)
    val keyed = data.keyBy(_.userId)

    //when, then
    val partitioner = keyed.partitioner
    assert(partitioner.isEmpty)

    val hashPartitioner = keyed.partitionBy(new HashPartitioner(100))
    println(hashPartitioner)
    assert(hashPartitioner.partitioner.isDefined)

    val rangePartitioner = keyed.partitionBy(new RangePartitioner(100, keyed))
    println(rangePartitioner)
    assert(rangePartitioner.partitioner.isDefined)
  }
}
Example 22
Source File: ExecutionPlanForJoins.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License
package com.tomekl007.chapter_3

import org.apache.spark.sql.SparkSession
import org.apache.spark.{HashPartitioner, SparkContext}
import org.scalatest.FunSuite
import org.scalatest.Matchers._

class ExecutionPlanForJoins extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should use custom partitioner while join") {
    //given
    val transactions = spark.makeRDD(List((1, "bag"), (2, "dog"), (4, "car")))
    val persons = spark.makeRDD(List((1, "Tom"), (2, "Michael"), (3, "Johnny")))

    //when
    val personsDataPartitioner = transactions.partitioner match {
      case Some(p) => p
      case None => new HashPartitioner(persons.partitions.length)
    }

    val res = persons.join(transactions, personsDataPartitioner).collect().toList

    res should contain theSameElementsAs List((2, ("Michael", "dog")), (1, ("Tom", "bag")))
  }

  test("can broadcast small data set to every executor and join in-memory") {
    //given
    val smallDataSet = spark.makeRDD(List((1, "bag"), (2, "dog"), (4, "car")))
    val hugeDataSet = spark.makeRDD(List((1, "Tom"), (2, "Michael"), (3, "Johnny")))

    //when broadcast small rdd to all executors
    val smallInMemoryDataSet = spark.broadcast(smallDataSet.collectAsMap())

    //then join will not need to do shuffle
    val res = hugeDataSet.mapPartitions(iter => {
      iter.flatMap {
        case (k, v1) =>
          smallInMemoryDataSet.value.get(k) match {
            case None => Seq.empty
            case Some(v2) => Seq((k, (v1, v2)))
          }
      }
    })

    res.collect().toList should contain theSameElementsAs List((2, ("Michael", "dog")), (1, ("Tom", "bag")))
  }
}
Example 23
Source File: VectorRDDFunctions.scala From spark-vl-bfgs with Apache License 2.0
package org.apache.spark.ml.optim import scala.language.implicitConversions import org.apache.spark.HashPartitioner import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg._ class VectorRDDFunctions(self: RDD[Vector]) { def treeSum(depth: Int = 2): RDD[Vector] = { val zeroValue: Vector = null val seqOp = (s: Vector, v: Vector) => { if (s != null) { BLAS.axpy(1.0, v, s) s } else { v.copy.toDense } } val combOp = (s1: Vector, s2: Vector) => { // TODO: handle empty partitions BLAS.axpy(1.0, s2, s1) s1 } require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.") val aggregatePartition = (it: Iterator[Vector]) => it.aggregate(zeroValue)(seqOp, combOp) var partiallyAggregated = self.mapPartitions(it => Iterator(aggregatePartition(it))) var numPartitions = partiallyAggregated.partitions.length val scale = math.max(math.pow(numPartitions, 1.0 / depth), 2.0) while (numPartitions > 1) { numPartitions = math.ceil(numPartitions / scale).toInt val curNumPartitions = numPartitions partiallyAggregated = partiallyAggregated.mapPartitionsWithIndex { (i, iter) => iter.map((i % curNumPartitions, _)) }.reduceByKey(new HashPartitioner(curNumPartitions), combOp) .values } require(partiallyAggregated.partitions.length == 1) partiallyAggregated } } object VectorRDDFunctions { implicit def fromVectorRDD(rdd: RDD[Vector]): VectorRDDFunctions = new VectorRDDFunctions(rdd) }
Example 24
Source File: ContinuousCoalesceExec.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import java.util.UUID

import org.apache.spark.{HashPartitioner, SparkEnv}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeRow}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.streaming.continuous.shuffle.{ContinuousShuffleReadPartition, ContinuousShuffleReadRDD}

case class ContinuousCoalesceExec(numPartitions: Int, child: SparkPlan) extends SparkPlan {
  override def output: Seq[Attribute] = child.output

  override def children: Seq[SparkPlan] = child :: Nil

  override def outputPartitioning: Partitioning = SinglePartition

  override def doExecute(): RDD[InternalRow] = {
    assert(numPartitions == 1)
    new ContinuousCoalesceRDD(
      sparkContext,
      numPartitions,
      conf.continuousStreamingExecutorQueueSize,
      sparkContext.getLocalProperty(ContinuousExecution.EPOCH_INTERVAL_KEY).toLong,
      child.execute())
  }
}
Example 25
Source File: VRDDFunctionsSuite.scala From spark-vlbfgs with Apache License 2.0
package org.apache.spark.rdd import scala.collection.mutable import org.apache.spark.{HashPartitioner, SparkFunSuite} import org.apache.spark.ml.linalg.distributed.{DistributedVectorPartitioner, VGridPartitioner} import org.apache.spark.mllib.util.MLlibTestSparkContext class VRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { import org.apache.spark.rdd.VRDDFunctions._ override def beforeAll(): Unit = { super.beforeAll() } def testMapJoinPartitions(shuffleRdd2: Boolean): Unit = { val sc = spark.sparkContext val rdd1 = sc.parallelize(Array.tabulate(81) { idx => { val rowIdx = idx % 9 val colIdx = idx / 9 ((rowIdx, colIdx), (rowIdx, colIdx)) } }).partitionBy(VGridPartitioner(9, 9, 3, 3)).cache() rdd1.count() val rdd2 = sc.parallelize(Array.tabulate(9)(idx => (idx, idx))) .partitionBy(new DistributedVectorPartitioner(9)).cache() rdd2.count() val rddr = rdd1.mapJoinPartition(rdd2, shuffleRdd2)( (x: Int) => { val blockColIdx = x / 3 val pos = blockColIdx * 3 Array(pos, pos + 1, pos + 2) }, (p1: Int, iter1, list: Array[(Int, Iterator[(Int, Int)])]) => { Iterator((p1, list.map(tuple => (tuple._1, tuple._2.next())).mkString(","))) } ) assert(rddr.collect() === Array( (0, "(0,(0,0)),(1,(1,1)),(2,(2,2))"), (1, "(0,(0,0)),(1,(1,1)),(2,(2,2))"), (2, "(0,(0,0)),(1,(1,1)),(2,(2,2))"), (3, "(3,(3,3)),(4,(4,4)),(5,(5,5))"), (4, "(3,(3,3)),(4,(4,4)),(5,(5,5))"), (5, "(3,(3,3)),(4,(4,4)),(5,(5,5))"), (6, "(6,(6,6)),(7,(7,7)),(8,(8,8))"), (7, "(6,(6,6)),(7,(7,7)),(8,(8,8))"), (8, "(6,(6,6)),(7,(7,7)),(8,(8,8))") )) } test("mapJoinPartitions V1") { testMapJoinPartitions(false) } test("mapJoinPartitions V2") { testMapJoinPartitions(true) } test("test multiZipRDDs") { val rdd1 = sc.makeRDD(Array(1, 2, 3, 4), 2) val rddList = List(rdd1, rdd1.map(_ + 10), rdd1.map(_ + 200)) val zipped = VRDDFunctions.zipMultiRDDs(rddList) { iterList: List[Iterator[Int]] => new Iterator[Int]{ override def hasNext: Boolean = iterList.map(_.hasNext).reduce(_ && _) override def next(): Int = iterList.map(_.next()).sum } } assert(zipped.glom().map(_.toList).collect().toList === List(List(213, 216), List(219, 222))) } test("aggregateByKeyInMemory") { val rdd: RDD[(Int, Int)] = sc.makeRDD(Array( (1, 1), (2, 2), (3, 3), (1, 10), (2, 20), (3, 30) ), 3) import org.apache.spark.rdd.VPairRDDFunctions._ val res = rdd.aggregateByKeyInMemory(new mutable.HashSet[Int], new HashPartitioner(3))( (u, v) => u += v, (u1, u2) => u1 ++= u2 ).mapValues(_.toSet).collect() assert(res.sortBy(_._1) === Array( (1, Set(1, 10)), (2, Set(2, 20)), (3, Set(3, 30)) )) } }
Example 26
Source File: GroupSorted.scala From spark-sorted with Apache License 2.0
package com.tresata.spark.sorted.api.java import java.util.{ Comparator, Iterator => JIterator } import scala.reflect.ClassTag import scala.collection.JavaConverters._ import org.apache.spark.{ Partitioner, HashPartitioner } import org.apache.spark.Partitioner.defaultPartitioner import org.apache.spark.api.java.JavaPairRDD import org.apache.spark.api.java.function.{ Function => JFunction, Function2 => JFunction2, FlatMapFunction => JFlatMapFunction } import com.tresata.spark.sorted.{ GroupSorted => SGroupSorted } object GroupSorted { private case class ComparatorOrdering[T](comparator: Comparator[T]) extends Ordering[T] { def compare(x: T, y: T) = comparator.compare(x, y) } private def comparatorToOrdering[T](comparator: Comparator[T]): Ordering[T] = new ComparatorOrdering(comparator) private def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] private implicit def ordering[K]: Ordering[K] = comparatorToOrdering(NaturalComparator.get[K]) private def groupSort[K, V](javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]): SGroupSorted[K, V] = { implicit def kClassTag: ClassTag[K] = javaPairRDD.kClassTag implicit def vClassTag: ClassTag[V] = javaPairRDD.vClassTag val valueOrdering = Option(valueComparator).map(comparatorToOrdering) SGroupSorted(javaPairRDD.rdd, partitioner, valueOrdering) } } class GroupSorted[K, V] private (sGroupSorted: SGroupSorted[K, V]) extends JavaPairRDD[K, V](sGroupSorted)(GroupSorted.fakeClassTag[K], GroupSorted.fakeClassTag[V]) { def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]) = this(GroupSorted.groupSort(javaPairRDD, partitioner, valueComparator)) def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner) = this(GroupSorted.groupSort(javaPairRDD, partitioner, null)) def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int, valueComparator: Comparator[V]) = this(javaPairRDD, if (numPartitions > 0) new HashPartitioner(numPartitions) else defaultPartitioner(javaPairRDD.rdd), valueComparator) def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int) = this(javaPairRDD, numPartitions, null) def this(javaPairRDD: JavaPairRDD[K, V], valueComparator: Comparator[V]) = this(javaPairRDD, -1, valueComparator) def this(javaPairRDD: JavaPairRDD[K, V]) = this(javaPairRDD, -1, null) import GroupSorted._ override def flatMapValues[W](f: JFlatMapFunction[V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.flatMapValues(v => f.call(v).asScala)) } override def mapValues[W](f: JFunction[V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.mapValues(v => f.call(v))) } def mapKeyValuesToValues[W](f: JFunction[Tuple2[K, V], W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.mapKeyValuesToValues(kv => f.call(kv))) } def mapStreamByKey[W](f: JFunction[JIterator[V], JIterator[W]]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.mapStreamByKey(it => f.call(it.asJava).asScala)) } def foldLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.foldLeftByKey(w)((w, v) => f.call(w, v))) } def reduceLeftByKey[W >: V](f: JFunction2[W, V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new 
GroupSorted[K, W](sGroupSorted.reduceLeftByKey(f.call)) } def scanLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.scanLeftByKey(w)((w, v) => f.call(w, v))) } }
Example 27
Source File: EdgeRDDImpl.scala From graphx-algorithm with GNU General Public License v2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 28
Source File: TestJoins.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner } import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import scala.Iterator object TestJoins { def main(args: Array[String]): Unit = { val sc = new SparkContext(new SparkConf().setAppName("TestJoinJob")) val x = sc.parallelize(List((1, 2), (1, 3), (2, 3), (2, 4))).partitionBy(new HashPartitioner(2)).cache val y = sc.parallelize(List((2, 5), (2, 6))).partitionBy(new HashPartitioner(2)).cache inspectRDD(x) inspectRDD(y) println(">>> joining x with y") val joinRDD = x.join(y).cache joinRDD.collect().foreach(println) inspectRDD(joinRDD) println(">>> left outer join of x with y") val leftJoin = x.leftOuterJoin(y).cache leftJoin.collect().foreach(println) inspectRDD(leftJoin) println(">>> right outer join of x with y") val rightJoin = x.rightOuterJoin(y).cache rightJoin.collect().foreach(println) inspectRDD(rightJoin) } def inspectRDD[T](rdd: RDD[T]): Unit = { println(">>> Partition length...") rdd.mapPartitions(f => Iterator(f.length), true).foreach(println) println(">>> Partition data...") rdd.foreachPartition(f => f.foreach(println)) } }
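Because x and y are partitioned by the same HashPartitioner and cached, the three joins above are narrow dependencies: no additional shuffle is needed and the result keeps the shared partitioner. A small sketch that makes this visible (the output comments are indicative only):

import org.apache.spark.{ SparkContext, HashPartitioner }

def checkCopartitionedJoin(sc: SparkContext): Unit = {
  val part = new HashPartitioner(2)
  val x = sc.parallelize(List((1, 2), (1, 3), (2, 3), (2, 4))).partitionBy(part).cache()
  val y = sc.parallelize(List((2, 5), (2, 6))).partitionBy(part).cache()
  val joined = x.join(y)
  println(joined.partitioner)        // Some(org.apache.spark.HashPartitioner@...), reused from the inputs
  println(joined.getNumPartitions)   // 2
}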
Example 29
Source File: TestValueTransformations.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner } import org.apache.spark.rdd.PairRDDFunctions case class Customer(ID: Int, name: String) case class Item(ID: Int, name: String, price: Float) case class Order(ID: Int, item: Item, quantity: Int, var discount: Float) case class CustomerOrders(cust: Customer, order: Order, offer: Boolean) object TestValueTransformations { def main(args: Array[String]): Unit = { val sc = new SparkContext(new SparkConf().setAppName("TestCombineByKeyJob")) val rdd = sc.parallelize( List( CustomerOrders(Customer(1, "A"), Order(1, Item(1, "item_1", 20), 2, 0), false), CustomerOrders(Customer(1, "A"), Order(2, Item(2, "item_2", 10), 1, 0), false), CustomerOrders(Customer(2, "B"), Order(1, Item(1, "item_1", 20), 2, 0), true))) println(">>> List of customers availing offers") orderValuePerCustomer.foreach(println) println(">>> Total order value for customer ID = 1 is " + orderValuePerCustomer.reduceByKey(_ + _).lookup(1).toString()) } }
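The listing above calls orderValuePerCustomer without defining it; the definition was evidently lost when the example was extracted. A minimal sketch of one plausible reconstruction that fits the case classes and the surrounding reduceByKey/lookup calls; this is an assumption, not the author's original code:

import org.apache.spark.rdd.RDD

// assumed helper: (customer ID, order value) pairs derived from the CustomerOrders records
def orderValuePerCustomer(rdd: RDD[CustomerOrders]): RDD[(Int, Float)] =
  rdd.map(co => (co.cust.ID, co.order.item.price * co.order.quantity))

// with the sample data above, orderValuePerCustomer(rdd).reduceByKey(_ + _).lookup(1)
// would return Seq(50.0f), i.e. 20 * 2 + 10 * 1 for customer 1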
Example 30
Source File: RDFS3.scala From SparkSRE with Apache License 2.0 | 5 votes |
package com.hj.examples import com.hj.constant.Const import org.apache.spark.{HashPartitioner, SparkConf, SparkContext} object RDFS3 { def main(args: Array[String]): Unit = { //Arguments: input/RDFS3.in output/RDFS3.out if(args.length != 2) { System.out.println("Arguments are invalid! \nExample: <input_path> <output_path>") System.exit(1) } val inputPath = args(0) val outputPath = args(1) val conf = new SparkConf().setAppName("RDFS3.in").setMaster("local[2]") val sc = new SparkContext(conf) val lines = sc.textFile(inputPath) //"input/RDFS3.in" val triples = lines.map(x => { val arr = x.split(" ") (arr(0), arr(1), arr(2)) }) val partitioner = new HashPartitioner(2) val range = triples.filter(x => x._2.equals(Const.RDFS_RANGE)).map(x => (x._1, x._3)) val pso = triples.map(x => (x._2, (x._1, x._3))).partitionBy(partitioner) val joined = pso.join(range) val res = joined.map(x => (x._2._1._2, x._2._2)) res.foreach(x => println(x)) res.saveAsTextFile(outputPath) } }
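The join implements RDFS entailment rule rdfs3: from (p, rdfs:range, C) and (s, p, o) it derives (o, rdf:type, C). A self-contained sketch of the same pipeline with inlined sample triples; the IRI strings stand in for com.hj.constant.Const.RDFS_RANGE and the file-based input:

import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner }

object Rdfs3Sketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("Rdfs3Sketch").setMaster("local[2]"))
    val triples = sc.parallelize(Seq(
      (":hasAuthor", "rdfs:range", ":Person"),   // schema triple
      (":book1", ":hasAuthor", ":alice")))       // instance triple
    val range = triples.filter(_._2 == "rdfs:range").map(t => (t._1, t._3))
    val pso = triples.map(t => (t._2, (t._1, t._3))).partitionBy(new HashPartitioner(2))
    // joining on the predicate yields (:alice, :Person), i.e. :alice rdf:type :Person
    pso.join(range).map(x => (x._2._1._2, x._2._2)).foreach(println)
    sc.stop()
  }
}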
Example 31
Source File: RelationWithItemToItem.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.mllib import breeze.numerics.{sqrt, pow} import org.apache.spark.{HashPartitioner, SparkConf, SparkContext} object RelationWithItemToItem { def main(args: Array[String]) { val sc = new SparkContext(new SparkConf() .setAppName("Item to Item") .setMaster("local")) // number of top related items to keep val topK = 2 val userItem = sc.textFile("/rating.dat") .map(_.split("\t")).map(x =>(x(0),x(1),x(2))).distinct().cache() // compute item -> (user,rating) and item -> sqrt(sum of squared ratings) val itemUser = userItem.map(x => (x._2,(x._1,x._3.toDouble))).partitionBy(new HashPartitioner(20)) // sqrt: normalizes each item's rating values val itemPowSqrt = userItem.map(x => (x._2,pow(x._3.toDouble,2.0))).reduceByKey(_+_).mapValues(x => sqrt(x)) // compute item -> ((user,rating),sqrt(ratings)) => user -> (item,rating/sqrt(ratings)) val userItemSqrt = itemUser.join(itemPowSqrt).map(x =>{ val item = x._1 val sqrtRatings = x._2._2 val user = x._2._1._1 val rating = x._2._1._2 (user,(item,rating / sqrtRatings)) }) // compute the item-to-item relation in the user dimension => the score that connects each pair of items val itemToItem = userItemSqrt.join(userItemSqrt).map(x =>{ val item1 = x._2._1._1 val rating1 = x._2._1._2 val item2 = x._2._2._1 val rating2 = x._2._2._2 val score = rating1 * rating2 if(item1 == item2){ ((item1,item2),-1.0) }else{ ((item1,item2),score) } }) itemToItem.reduceByKey(_+_).map(x => (x._1._1,(x._1._2,x._2))).groupByKey().foreach(x => { val sourceItem = x._1 val topItem = x._2.toList.filter(_._2 > 0).sortWith(_._2 > _._2).take(topK) println(s"item = $sourceItem,topK relative item list:$topItem") }) sc.stop() } }
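Dividing each rating by the square root of the item's sum of squared ratings and then summing the pairwise products per user is exactly the cosine similarity between the two items' user-rating vectors. A plain-Scala sketch of that quantity for reference (map keys are user IDs; the data shape is illustrative):

// cosine similarity of two items' rating vectors over their common users
def cosine(ratingsA: Map[String, Double], ratingsB: Map[String, Double]): Double = {
  val normA = math.sqrt(ratingsA.values.map(r => r * r).sum)
  val normB = math.sqrt(ratingsB.values.map(r => r * r).sum)
  val dot = ratingsA.keySet.intersect(ratingsB.keySet)
    .map(user => ratingsA(user) * ratingsB(user)).sum
  if (normA == 0.0 || normB == 0.0) 0.0 else dot / (normA * normB)
}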
Example 32
Source File: EdgeRDDImpl.scala From drizzle-spark with Apache License 2.0 | 4 votes |
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{HashPartitioner, OneToOneDependency} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
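Note that innerJoin zips the two partitionsRDDs directly, so both EdgeRDDs must share the same partitioning; in practice that means both are derived from the same graph. A brief sketch under that assumption (the edge-list path and attribute values are illustrative):

import org.apache.spark.SparkContext
import org.apache.spark.graphx.GraphLoader

def innerJoinDemo(sc: SparkContext): Unit = {
  val graph = GraphLoader.edgeListFile(sc, "hdfs:///tmp/edges.txt")
  val a = graph.mapEdges(e => 2.0).edges   // same underlying edge partitioning
  val b = graph.mapEdges(e => 3.0).edges
  // attributes are combined partition by partition, with no shuffle
  val combined = a.innerJoin(b)((srcId, dstId, x, y) => x * y)
  println(combined.count())                // every edge now carries 6.0
}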