org.apache.spark.HashPartitioner Scala Examples
The following examples show how to use org.apache.spark.HashPartitioner.
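Before the project examples, here is a minimal, self-contained sketch of the basic API. HashPartitioner itself ships with Spark core; the sample data, application name, and partition counts below are made up for illustration.

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object HashPartitionerBasics {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("HashPartitionerBasics").setMaster("local[2]"))

    // HashPartitioner only applies to key-value RDDs: each record is sent to
    // partition nonNegativeMod(key.hashCode, numPartitions); null keys go to partition 0.
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3), ("c", 4)))
    val partitioned = pairs.partitionBy(new HashPartitioner(4))

    println(partitioned.partitioner.isDefined) // true
    println(partitioned.getNumPartitions)      // 4

    // Passing an equal partitioner to a shuffle operation lets Spark reuse the
    // existing layout instead of shuffling the data again.
    val counts = partitioned.reduceByKey(new HashPartitioner(4), _ + _)
    counts.collect().foreach(println)

    sc.stop()
  }
}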
Example 1
Source File: EdgeRDDImpl.scala From sparkoscope with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{HashPartitioner, OneToOneDependency} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 2
Source File: PregelNWeight.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.graph.nweight import scala.collection.JavaConversions._ import org.apache.spark.SparkContext import org.apache.spark.HashPartitioner import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.GraphImpl import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap object PregelNWeight extends Serializable{ def sendMsg(edge: EdgeTriplet[SizedPriorityQueue, Double]) = { val m = new Long2DoubleOpenHashMap() val w1 = edge.attr val id = edge.srcId edge.dstAttr.foreach{ case (target, wn) => if (target != id) m.put(target, wn*w1) } Iterator((id, m)) } def mergMsg(c1: Long2DoubleOpenHashMap, c2: Long2DoubleOpenHashMap) = { c2.long2DoubleEntrySet() .fastIterator() .foreach(pair => c1.put(pair.getLongKey(), c1.get(pair.getLongKey()) + pair.getDoubleValue())) c1 } def vProg(id: VertexId, vdata: SizedPriorityQueue, msg: Long2DoubleOpenHashMap) = { vdata.clear() if (msg.size > 0) { msg.long2DoubleEntrySet().fastIterator().foreach { pair => val src = pair.getLongKey() val wn = pair.getDoubleValue() vdata.enqueue((src, wn)) } vdata } else { vdata.enqueue((id, 1)) vdata } } def nweight(sc: SparkContext, input: String, output: String, step: Int, maxDegree: Int, numPartitions: Int, storageLevel: StorageLevel) { //val start1 = System.currentTimeMillis val part = new HashPartitioner(numPartitions) val edges = sc.textFile(input, numPartitions).flatMap { line => val fields = line.split("\\s+", 2) val src = fields(0).trim.toLong fields(1).split("[,\\s]+").filter(_.isEmpty() == false).map { pairStr => val pair = pairStr.split(":") val (dest, weight) = (pair(0).trim.toLong, pair(1).toDouble) (src, Edge(src, dest, weight)) } }.partitionBy(part).map(_._2) var g = GraphImpl(edges, new SizedPriorityQueue(maxDegree), storageLevel, storageLevel).cache() g = Pregel(g, new Long2DoubleOpenHashMap, step, EdgeDirection.In)( vProg, sendMsg, mergMsg) g.vertices.map { case (vid, vdata) => var s = new StringBuilder s.append(vid) vdata.foreach { r => s.append(' ') s.append(r._1) s.append(':') s.append(r._2) } s.toString }.saveAsTextFile(output) } }
Example 3
Source File: GraphxNWeight.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.graph.nweight import scala.collection.JavaConversions._ import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.HashPartitioner import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.GraphImpl import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap object GraphxNWeight extends Serializable{ def mapF(edge: EdgeContext[SizedPriorityQueue, Double, Long2DoubleOpenHashMap]) = { val theMap = new Long2DoubleOpenHashMap() val edgeAttribute = edge.attr val id = edge.srcId edge.dstAttr.foreach{ case (target, wn) => if (target != id) theMap.put(target, wn * edgeAttribute) } edge.sendToSrc(theMap) } def reduceF(c1: Long2DoubleOpenHashMap, c2: Long2DoubleOpenHashMap) = { c2.long2DoubleEntrySet() .fastIterator() .foreach(pair => c1.put(pair.getLongKey(), c1.get(pair.getLongKey()) + pair.getDoubleValue())) c1 } def updateF(id: VertexId, vdata: SizedPriorityQueue, msg: Option[Long2DoubleOpenHashMap]) = { vdata.clear() val weightMap = msg.orNull if (weightMap != null) { weightMap.long2DoubleEntrySet().fastIterator().foreach { pair => val src = pair.getLongKey() val wn = pair.getDoubleValue() vdata.enqueue((src, wn)) } } vdata } def nweight(sc: SparkContext, input: String, output: String, step: Int, maxDegree: Int, numPartitions: Int, storageLevel: StorageLevel) { //val start1 = System.currentTimeMillis val part = new HashPartitioner(numPartitions) val edges = sc.textFile(input, numPartitions).flatMap { line => val fields = line.split("\\s+", 2) val src = fields(0).trim.toLong fields(1).split("[,\\s]+").filter(_.isEmpty() == false).map { pairStr => val pair = pairStr.split(":") val (dest, weight) = (pair(0).trim.toLong, pair(1).toDouble) (src, Edge(src, dest, weight)) } }.partitionBy(part).map(_._2) val vertices = edges.map { e => (e.srcId, (e.dstId, e.attr)) }.groupByKey(part).map { case (id, seq) => val vdata = new SizedPriorityQueue(maxDegree) seq.foreach(vdata.enqueue) (id, vdata) } var g = GraphImpl(vertices, edges, new SizedPriorityQueue(maxDegree), storageLevel, storageLevel).cache() var msg: RDD[(VertexId, Long2DoubleOpenHashMap)] = null for (i <- 2 to step) { msg = g.aggregateMessages(mapF, reduceF) g = g.outerJoinVertices(msg)(updateF).persist(storageLevel) } g.vertices.map { case (vid, vdata) => var s = new StringBuilder s.append(vid) vdata.foreach { r => s.append(' ') s.append(r._1) s.append(':') s.append(r._2) } s.toString }.saveAsTextFile(output) } }
Example 4
Source File: DBHPartitioner.scala From zen with Apache License 2.0
package com.github.cloudml.zen.ml.partitioner

import scala.reflect.ClassTag

import com.github.cloudml.zen.ml.clustering.LDADefines._
import org.apache.spark.HashPartitioner
import org.apache.spark.graphx2._
import org.apache.spark.graphx2.impl.GraphImpl
import org.apache.spark.storage.StorageLevel

// NOTE: the class declaration is missing from the scraped snippet; the header below is
// reconstructed from its usage (new DBHPartitioner(numPartitions, 0), the `threshold`
// field, and the comparison against numPartitions), not copied from the zen source.
class DBHPartitioner(partitions: Int, val threshold: Int = 0)
  extends HashPartitioner(partitions) {

  // Degree-Based Hashing: key an edge by its lower-degree endpoint, unless both
  // endpoint degrees are below the threshold.
  def getKey(et: EdgeTriplet[Int, _]): Long = {
    val srcId = et.srcId
    val dstId = et.dstId
    val srcDeg = et.srcAttr
    val dstDeg = et.dstAttr
    val maxDeg = math.max(srcDeg, dstDeg)
    val minDegId = if (maxDeg == srcDeg) dstId else srcId
    val maxDegId = if (maxDeg == srcDeg) srcId else dstId
    if (maxDeg < threshold) {
      maxDegId
    } else {
      minDegId
    }
  }

  override def equals(other: Any): Boolean = other match {
    case dbh: DBHPartitioner =>
      dbh.numPartitions == numPartitions
    case _ =>
      false
  }
}

object DBHPartitioner {
  def partitionByDBH[VD: ClassTag, ED: ClassTag](input: Graph[VD, ED],
    storageLevel: StorageLevel): Graph[VD, ED] = {
    val edges = input.edges
    val conf = edges.context.getConf
    val numPartitions = conf.getInt(cs_numPartitions, edges.partitions.length)
    val dbh = new DBHPartitioner(numPartitions, 0)
    val degGraph = GraphImpl(input.degrees, edges)
    val newEdges = degGraph.triplets.mapPartitions(_.map(et =>
      (dbh.getKey(et), Edge(et.srcId, et.dstId, et.attr))
    )).partitionBy(dbh).map(_._2)
    GraphImpl(input.vertices, newEdges, null.asInstanceOf[VD], storageLevel, storageLevel)
  }
}
Example 5
Source File: EdgeDstPartitioner.scala From zen with Apache License 2.0
package com.github.cloudml.zen.ml.partitioner

import scala.reflect.ClassTag

import com.github.cloudml.zen.ml.clustering.LDADefines._
import org.apache.spark.HashPartitioner
import org.apache.spark.graphx2._
import org.apache.spark.graphx2.impl.GraphImpl
import org.apache.spark.storage.StorageLevel

class EdgeDstPartitioner(val partitions: Int) extends HashPartitioner(partitions) {
  @inline def getKey(et: EdgeTriplet[_, _]): Long = et.dstId

  override def equals(other: Any): Boolean = other match {
    case edp: EdgeDstPartitioner =>
      edp.numPartitions == numPartitions
    case _ =>
      false
  }
}

object EdgeDstPartitioner {
  def partitionByEDP[VD: ClassTag, ED: ClassTag](input: Graph[VD, ED],
    storageLevel: StorageLevel): Graph[VD, ED] = {
    val edges = input.edges
    val conf = edges.context.getConf
    val numPartitions = conf.getInt(cs_numPartitions, edges.partitions.length)
    val edp = new EdgeDstPartitioner(numPartitions)
    val newEdges = input.triplets.mapPartitions(_.map(et =>
      (edp.getKey(et), Edge(et.srcId, et.dstId, et.attr))
    )).partitionBy(edp).map(_._2)
    GraphImpl(input.vertices, newEdges, null.asInstanceOf[VD], storageLevel, storageLevel)
  }
}
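The two zen partitioners above follow the same pattern: they inherit HashPartitioner's hash-modulo getPartition and only decide which value is used as the partitioning key before calling partitionBy. A stripped-down, hypothetical sketch of that pattern (the class and method names here are illustrative, not taken from the zen project):

import org.apache.spark.HashPartitioner
import org.apache.spark.rdd.RDD

// Keeps HashPartitioner's partitioning logic; equality is narrowed to this type,
// mirroring how the zen partitioners override equals.
class DstKeyedPartitioner(partitions: Int) extends HashPartitioner(partitions) {
  override def equals(other: Any): Boolean = other match {
    case p: DstKeyedPartitioner => p.numPartitions == numPartitions
    case _ => false
  }
}

object DstKeyedPartitioner {
  // Re-key (src, dst) edges by destination id, partition, then restore the original shape.
  def partitionByDst(edges: RDD[(Long, Long)], numPartitions: Int): RDD[(Long, Long)] = {
    val part = new DstKeyedPartitioner(numPartitions)
    edges.map { case (src, dst) => (dst, src) }
      .partitionBy(part)
      .map { case (dst, src) => (src, dst) }
  }
}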
Example 6
Source File: EdgeRDDImpl.scala From zen with Apache License 2.0
package org.apache.spark.graphx2.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx2._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 7
Source File: UtilSpark.scala From Clustering4Ever with Apache License 2.0
package org.clustering4ever.sparktools import scala.language.higherKinds import org.apache.spark.rdd.RDD import org.apache.spark.HashPartitioner import scala.reflect.runtime.universe.TypeTag import scala.util.Random import scala.reflect.ClassTag import scala.collection.{GenSeq, mutable} import org.clustering4ever.preprocessing.Preprocessable import org.clustering4ever.hashing.HashingScalar import org.clustering4ever.vectors.{GVector, ScalarVector} object UtilSpark { type IndexPartition = Int type HasConverged = Boolean type IsOriginalDot = Boolean final def generateDataLocalityOnHashsedDS[ O, Pz[B, C <: GVector[C]] <: Preprocessable[B, C, Pz] ]( rddToPartitioned: RDD[Pz[O, ScalarVector]], nbblocs1: Int, nbBucketRange: Int ): RDD[(IndexPartition, (Pz[O, ScalarVector], IsOriginalDot, HasConverged))] = { val isOriginalPoint = true val hasConverged = true val bucketRange = 1 to nbBucketRange val lshRDD = rddToPartitioned.map((_, isOriginalPoint, !hasConverged)) val localityPerPartitionRDD = lshRDD.mapPartitionsWithIndex{ (idx, it) => val ar = it.toList def rightNeighbourhood = ar.flatMap{ case (cz, _, _) => bucketRange.collect{ case i if(idx + i < nbblocs1) => (idx + i, (cz, !isOriginalPoint, !hasConverged)) } } def leftNeighbourhood = ar.flatMap{ case (cz, _, _) => bucketRange.collect{ case i if(idx - i >= 0) => (idx - i, (cz, !isOriginalPoint, !hasConverged)) } } val composing = if(idx == 0) ar.map((idx, _)) ::: rightNeighbourhood else if(idx == nbblocs1 - 1) ar.map((idx, _)) ::: leftNeighbourhood else ar.map((idx, _)) ::: leftNeighbourhood ::: rightNeighbourhood composing.toIterator }.partitionBy(new HashPartitioner(nbblocs1)) localityPerPartitionRDD } final def generateDataLocalityLD[ O, Pz[B, C <: GVector[C]] <: Preprocessable[B, C, Pz], Hasher <: HashingScalar ]( rddToPartitioned: RDD[Pz[O, ScalarVector]], hashing: Hasher, nbblocs1: Int, nbBucketRange: Int ): RDD[(IndexPartition, (Pz[O, ScalarVector], IsOriginalDot, HasConverged))] = { val hashedRDD = rddToPartitioned.sortBy( cz => hashing.hf(cz.v) , ascending = true, nbblocs1 ) generateDataLocalityOnHashsedDS(hashedRDD, nbblocs1, nbBucketRange) } }
Example 8
Source File: PartitionBy.scala From learning-spark with Apache License 2.0
package com.javachen.spark.examples.rdd

import org.apache.spark.{HashPartitioner, RangePartitioner, SparkContext}

object PartitionBy {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "ReduceByKeyToDriver Test")
    val data1 = Array[(String, Int)](("K", 1), ("T", 2), ("T", 3), ("W", 4), ("W", 5), ("W", 6))
    val pairs = sc.parallelize(data1, 3)
    //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
    //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
    var result = pairs.partitionBy(new RangePartitioner(2, pairs, true))
    result = pairs.partitionBy(new HashPartitioner(2))
    result.foreach(println)
  }
}
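To see where the keys from the example above actually land, the partition index can be computed directly with getPartition. A small sketch (the two-partition HashPartitioner matches the example; the printed indices are not verified here):

import org.apache.spark.HashPartitioner

object CheckKeyPlacement {
  def main(args: Array[String]): Unit = {
    val p = new HashPartitioner(2)
    // getPartition hashes the key and takes a non-negative modulo of numPartitions;
    // null keys always map to partition 0.
    Seq("K", "T", "W").foreach { k =>
      println(s"key $k -> partition ${p.getPartition(k)}")
    }
  }
}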
Example 9
Source File: EdgeRDDImpl.scala From BigDatalog with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 10
Source File: StatefulNetworkWordCount.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.HashPartitioner import org.apache.spark.streaming._ object StatefulNetworkWordCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: StatefulNetworkWordCount <hostname> <port>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val sparkConf = new SparkConf().setAppName("StatefulNetworkWordCount") // Create the context with a 1 second batch size val ssc = new StreamingContext(sparkConf, Seconds(1)) ssc.checkpoint(".") // Initial state RDD for mapWithState operation val initialRDD = ssc.sparkContext.parallelize(List(("hello", 1), ("world", 1))) // Create a ReceiverInputDStream on target ip:port and count the // words in input stream of \n delimited test (eg. generated by 'nc') val lines = ssc.socketTextStream(args(0), args(1).toInt) val words = lines.flatMap(_.split(" ")) val wordDstream = words.map(x => (x, 1)) // Update the cumulative count using mapWithState // This will give a DStream made of state (which is the cumulative count of the words) val mappingFunc = (word: String, one: Option[Int], state: State[Int]) => { val sum = one.getOrElse(0) + state.getOption.getOrElse(0) val output = (word, sum) state.update(sum) output } val stateDstream = wordDstream.mapWithState( StateSpec.function(mappingFunc).initialState(initialRDD)) stateDstream.print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 11
Source File: EdgeRDDImpl.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{HashPartitioner, OneToOneDependency} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 12
Source File: EdgeRDDImpl.scala From spark1.52 with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 13
Source File: EdgeRDDImpl.scala From iolap with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 14
Source File: StatefulNetworkWordCount.scala From iolap with Apache License 2.0
package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.HashPartitioner import org.apache.spark.streaming._ object StatefulNetworkWordCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: StatefulNetworkWordCount <hostname> <port>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val updateFunc = (values: Seq[Int], state: Option[Int]) => { val currentCount = values.sum val previousCount = state.getOrElse(0) Some(currentCount + previousCount) } val newUpdateFunc = (iterator: Iterator[(String, Seq[Int], Option[Int])]) => { iterator.flatMap(t => updateFunc(t._2, t._3).map(s => (t._1, s))) } val sparkConf = new SparkConf().setAppName("StatefulNetworkWordCount") // Create the context with a 1 second batch size val ssc = new StreamingContext(sparkConf, Seconds(1)) ssc.checkpoint(".") // Initial RDD input to updateStateByKey val initialRDD = ssc.sparkContext.parallelize(List(("hello", 1), ("world", 1))) // Create a ReceiverInputDStream on target ip:port and count the // words in input stream of \n delimited test (eg. generated by 'nc') val lines = ssc.socketTextStream(args(0), args(1).toInt) val words = lines.flatMap(_.split(" ")) val wordDstream = words.map(x => (x, 1)) // Update the cumulative count using updateStateByKey // This will give a Dstream made of state (which is the cumulative count of the words) val stateDstream = wordDstream.updateStateByKey[Int](newUpdateFunc, new HashPartitioner (ssc.sparkContext.defaultParallelism), true, initialRDD) stateDstream.print() ssc.start() ssc.awaitTermination() } }
Example 15
Source File: EdgeRDDImpl.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{HashPartitioner, OneToOneDependency} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 16
Source File: JoinableRDD.scala From cuesheet with Apache License 2.0
package com.kakao.cuesheet.convert

import org.apache.spark.HashPartitioner
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

class JoinableRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) {

  def selfJoin(numPartitions: Int = rdd.partitions.length): RDD[(K, (V, V))] = fastJoin(rdd, numPartitions)

  def fastJoin[W](other: RDD[(K, W)], numPartitions: Int = rdd.partitions.length): RDD[(K, (V, W))] = {
    val partitioner = new HashPartitioner(numPartitions)
    val grouped = rdd cogroup other
    val left = grouped.flatMap{ case (k, (vs, ws)) =>
      vs.zipWithIndex.map { case (v, idx) => ((k, idx), v) }
    }.partitionBy(partitioner)
    val right = grouped.flatMap { case (k, (vs, ws)) =>
      ws.map { w => ((k, w.hashCode()), (w, vs.size)) }
    }.partitionBy(partitioner).flatMap { case ((k, r), (w, size)) =>
      (0 until size).map(i => ((k, w), i))
    }.map { case ((k, w), idx) => ((k, idx), w) }

    (left join right).map { case ((k, idx), (v, w)) => (k, (v, w)) }
  }
}
Example 17
Source File: L3-DStreamWindowAndAction.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date import org.apache.spark.HashPartitioner object RedditWindowAndActionApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditWindowAndActionApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val checkpointPath = "/tmp" ssc.checkpoint(checkpointPath) val updateFunc = (values: Seq[Int], state: Option[Int]) => { val currentCount = values.sum val previousCount = state.getOrElse(0) Some(currentCount + previousCount) } val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString) val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5)) val windowedCounts = windowedRecs.countByValue() windowedCounts.print(10) windowedCounts.saveAsObjectFiles("subreddit", "obj") windowedCounts.saveAsTextFiles("subreddit", "txt") globalCount.saveAsHadoopFiles("subreddit", "hadoop", classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]]) globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop", classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]]) comments.foreachRDD(rdd => { LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count())) }) ssc.start() ssc.awaitTermination() } }
Example 18
Source File: L3-DStreamKeyValue.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date import org.apache.spark.HashPartitioner object RedditKeyValueApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: RedditKeyValueApp <appname> <input_path> <input_path_popular>") System.exit(1) } val Seq(appName, inputPath, inputPathPopular) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val popular = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPathPopular, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val topAuthors = comments.map(rec => ((parse(rec) \ "author").values.toString, 1)) .groupByKey() .map(r => (r._2.sum, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val topAuthors2 = comments.map(rec => ((parse(rec) \ "author").values.toString, 1)) .reduceByKey(_ + _) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val topAuthorsByAvgContent = comments.map(rec => ((parse(rec) \ "author").values.toString, (parse(rec) \ "body").values.toString.split(" ").length)) .combineByKey( (v) => (v, 1), (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), new HashPartitioner(ssc.sparkContext.defaultParallelism)) .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val keyedBySubreddit = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec)) val keyedBySubreddit2 = popular.map(rec => ({ val t = rec.split(",") (t(1).split("/")(4), t(0)) })) val commentsWithIndustry = keyedBySubreddit.join(keyedBySubreddit2) val keyedBySubredditCo = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec)) val keyedBySubredditCo2 = popular.map(rec => ({ val t = rec.split(",") (t(1).split("/")(4), t(0)) })) val commentsWithIndustryCo = keyedBySubreddit.cogroup(keyedBySubreddit2) val checkpointPath = "/tmp" ssc.checkpoint(checkpointPath) val updateFunc = (values: Seq[Int], state: Option[Int]) => { val currentCount = values.sum val previousCount = state.getOrElse(0) Some(currentCount + previousCount) } val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) ssc.start() ssc.awaitTermination() } }
Example 19
Source File: L10-2DataProc.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.spark.HashPartitioner import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.JsonAST.JNothing import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object DataProcApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: DataProcApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .filter(jvalue => { jvalue \ "attributes" \ "Wi-Fi" != JNothing }) .map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "attributes" \ "Wi-Fi").extract[String], (jvalue \ "stars").extract[Int]) }) .combineByKey( (v) => (v, 1), (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), new HashPartitioner(ssc.sparkContext.defaultParallelism)) .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) .print() ssc.start() ssc.awaitTermination() } }
Example 20
Source File: CustomRangePartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License
package com.tomekl007.chapter_5

import com.tomekl007.UserTransaction
import org.apache.spark.sql.SparkSession
import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkContext}
import org.scalatest.FunSuite

class CustomRangePartitionerTest extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should use custom range partitioner") {
    //given
    val keysWithValuesList = Array(
      UserTransaction("A", 100),
      UserTransaction("B", 4),
      UserTransaction("A", 100001),
      UserTransaction("B", 10),
      UserTransaction("C", 10)
    )
    val data = spark.parallelize(keysWithValuesList)
    val keyed = data.keyBy(_.amount)

    //when, then
    val partitioned = keyed.partitionBy(new CustomRangePartitioner(List((0, 100), (100, 10000), (10000, 1000000))))

    //then
    partitioned.collect().toList
  }
}

class CustomRangePartitioner(ranges: List[(Int, Int)]) extends Partitioner {
  override def numPartitions: Int = ranges.size

  override def getPartition(key: Any): Int = {
    if (!key.isInstanceOf[Int]) {
      throw new IllegalArgumentException("partitioner works only for Int type")
    }
    val keyInt = key.asInstanceOf[Int]
    val index = ranges.lastIndexWhere(v => keyInt >= v._1 && keyInt <= v._2)
    println(s"for key: $key return $index")
    index
  }
}
Example 21
Source File: UsePartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License
package com.tomekl007.chapter_5

import com.tomekl007.UserTransaction
import org.apache.spark.{HashPartitioner, RangePartitioner, SparkContext}
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite
import org.scalatest.Matchers._

class UsePartitioner extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should use different partitioners") {
    //given
    val keysWithValuesList = Array(
      UserTransaction("A", 100),
      UserTransaction("B", 4),
      UserTransaction("A", 100001),
      UserTransaction("B", 10),
      UserTransaction("C", 10)
    )
    val data = spark.parallelize(keysWithValuesList)
    val keyed = data.keyBy(_.userId)

    //when, then
    val partitioner = keyed.partitioner
    assert(partitioner.isEmpty)

    val hashPartitioner = keyed.partitionBy(new HashPartitioner(100))
    println(hashPartitioner)
    assert(hashPartitioner.partitioner.isDefined)

    val rangePartitioner = keyed.partitionBy(new RangePartitioner(100, keyed))
    println(rangePartitioner)
    assert(rangePartitioner.partitioner.isDefined)
  }
}
Example 22
Source File: ExecutionPlanForJoins.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License
package com.tomekl007.chapter_3

import org.apache.spark.sql.SparkSession
import org.apache.spark.{HashPartitioner, SparkContext}
import org.scalatest.FunSuite
import org.scalatest.Matchers._

class ExecutionPlanForJoins extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should use custom partitioner while join") {
    //given
    val transactions = spark.makeRDD(List((1, "bag"), (2, "dog"), (4, "car")))
    val persons = spark.makeRDD(List((1, "Tom"), (2, "Michael"), (3, "Johnny")))

    //when
    val personsDataPartitioner = transactions.partitioner match {
      case Some(p) => p
      case None => new HashPartitioner(persons.partitions.length)
    }

    val res = persons.join(transactions, personsDataPartitioner).collect().toList

    res should contain theSameElementsAs List((2, ("Michael", "dog")), (1, ("Tom", "bag")))
  }

  test("can broadcast small data set to every executor and join in-memory") {
    //given
    val smallDataSet = spark.makeRDD(List((1, "bag"), (2, "dog"), (4, "car")))
    val hugeDataSet = spark.makeRDD(List((1, "Tom"), (2, "Michael"), (3, "Johnny")))

    //when broadcast small rdd to all executors
    val smallInMemoryDataSet = spark.broadcast(smallDataSet.collectAsMap())

    //then join will not need to do shuffle
    val res = hugeDataSet.mapPartitions(iter => {
      iter.flatMap {
        case (k, v1) =>
          smallInMemoryDataSet.value.get(k) match {
            case None => Seq.empty
            case Some(v2) => Seq((k, (v1, v2)))
          }
      }
    })

    res.collect().toList should contain theSameElementsAs List((2, ("Michael", "dog")), (1, ("Tom", "bag")))
  }
}
Example 23
Source File: VectorRDDFunctions.scala From spark-vl-bfgs with Apache License 2.0
package org.apache.spark.ml.optim import scala.language.implicitConversions import org.apache.spark.HashPartitioner import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg._ class VectorRDDFunctions(self: RDD[Vector]) { def treeSum(depth: Int = 2): RDD[Vector] = { val zeroValue: Vector = null val seqOp = (s: Vector, v: Vector) => { if (s != null) { BLAS.axpy(1.0, v, s) s } else { v.copy.toDense } } val combOp = (s1: Vector, s2: Vector) => { // TODO: handle empty partitions BLAS.axpy(1.0, s2, s1) s1 } require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.") val aggregatePartition = (it: Iterator[Vector]) => it.aggregate(zeroValue)(seqOp, combOp) var partiallyAggregated = self.mapPartitions(it => Iterator(aggregatePartition(it))) var numPartitions = partiallyAggregated.partitions.length val scale = math.max(math.pow(numPartitions, 1.0 / depth), 2.0) while (numPartitions > 1) { numPartitions = math.ceil(numPartitions / scale).toInt val curNumPartitions = numPartitions partiallyAggregated = partiallyAggregated.mapPartitionsWithIndex { (i, iter) => iter.map((i % curNumPartitions, _)) }.reduceByKey(new HashPartitioner(curNumPartitions), combOp) .values } require(partiallyAggregated.partitions.length == 1) partiallyAggregated } } object VectorRDDFunctions { implicit def fromVectorRDD(rdd: RDD[Vector]): VectorRDDFunctions = new VectorRDDFunctions(rdd) }
Example 24
Source File: ContinuousCoalesceExec.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import java.util.UUID

import org.apache.spark.{HashPartitioner, SparkEnv}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeRow}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.streaming.continuous.shuffle.{ContinuousShuffleReadPartition, ContinuousShuffleReadRDD}

case class ContinuousCoalesceExec(numPartitions: Int, child: SparkPlan) extends SparkPlan {
  override def output: Seq[Attribute] = child.output

  override def children: Seq[SparkPlan] = child :: Nil

  override def outputPartitioning: Partitioning = SinglePartition

  override def doExecute(): RDD[InternalRow] = {
    assert(numPartitions == 1)
    new ContinuousCoalesceRDD(
      sparkContext,
      numPartitions,
      conf.continuousStreamingExecutorQueueSize,
      sparkContext.getLocalProperty(ContinuousExecution.EPOCH_INTERVAL_KEY).toLong,
      child.execute())
  }
}
Example 25
Source File: VRDDFunctionsSuite.scala From spark-vlbfgs with Apache License 2.0
package org.apache.spark.rdd import scala.collection.mutable import org.apache.spark.{HashPartitioner, SparkFunSuite} import org.apache.spark.ml.linalg.distributed.{DistributedVectorPartitioner, VGridPartitioner} import org.apache.spark.mllib.util.MLlibTestSparkContext class VRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { import org.apache.spark.rdd.VRDDFunctions._ override def beforeAll(): Unit = { super.beforeAll() } def testMapJoinPartitions(shuffleRdd2: Boolean): Unit = { val sc = spark.sparkContext val rdd1 = sc.parallelize(Array.tabulate(81) { idx => { val rowIdx = idx % 9 val colIdx = idx / 9 ((rowIdx, colIdx), (rowIdx, colIdx)) } }).partitionBy(VGridPartitioner(9, 9, 3, 3)).cache() rdd1.count() val rdd2 = sc.parallelize(Array.tabulate(9)(idx => (idx, idx))) .partitionBy(new DistributedVectorPartitioner(9)).cache() rdd2.count() val rddr = rdd1.mapJoinPartition(rdd2, shuffleRdd2)( (x: Int) => { val blockColIdx = x / 3 val pos = blockColIdx * 3 Array(pos, pos + 1, pos + 2) }, (p1: Int, iter1, list: Array[(Int, Iterator[(Int, Int)])]) => { Iterator((p1, list.map(tuple => (tuple._1, tuple._2.next())).mkString(","))) } ) assert(rddr.collect() === Array( (0, "(0,(0,0)),(1,(1,1)),(2,(2,2))"), (1, "(0,(0,0)),(1,(1,1)),(2,(2,2))"), (2, "(0,(0,0)),(1,(1,1)),(2,(2,2))"), (3, "(3,(3,3)),(4,(4,4)),(5,(5,5))"), (4, "(3,(3,3)),(4,(4,4)),(5,(5,5))"), (5, "(3,(3,3)),(4,(4,4)),(5,(5,5))"), (6, "(6,(6,6)),(7,(7,7)),(8,(8,8))"), (7, "(6,(6,6)),(7,(7,7)),(8,(8,8))"), (8, "(6,(6,6)),(7,(7,7)),(8,(8,8))") )) } test("mapJoinPartitions V1") { testMapJoinPartitions(false) } test("mapJoinPartitions V2") { testMapJoinPartitions(true) } test("test multiZipRDDs") { val rdd1 = sc.makeRDD(Array(1, 2, 3, 4), 2) val rddList = List(rdd1, rdd1.map(_ + 10), rdd1.map(_ + 200)) val zipped = VRDDFunctions.zipMultiRDDs(rddList) { iterList: List[Iterator[Int]] => new Iterator[Int]{ override def hasNext: Boolean = iterList.map(_.hasNext).reduce(_ && _) override def next(): Int = iterList.map(_.next()).sum } } assert(zipped.glom().map(_.toList).collect().toList === List(List(213, 216), List(219, 222))) } test("aggregateByKeyInMemory") { val rdd: RDD[(Int, Int)] = sc.makeRDD(Array( (1, 1), (2, 2), (3, 3), (1, 10), (2, 20), (3, 30) ), 3) import org.apache.spark.rdd.VPairRDDFunctions._ val res = rdd.aggregateByKeyInMemory(new mutable.HashSet[Int], new HashPartitioner(3))( (u, v) => u += v, (u1, u2) => u1 ++= u2 ).mapValues(_.toSet).collect() assert(res.sortBy(_._1) === Array( (1, Set(1, 10)), (2, Set(2, 20)), (3, Set(3, 30)) )) } }
Example 26
Source File: GroupSorted.scala From spark-sorted with Apache License 2.0
package com.tresata.spark.sorted.api.java import java.util.{ Comparator, Iterator => JIterator } import scala.reflect.ClassTag import scala.collection.JavaConverters._ import org.apache.spark.{ Partitioner, HashPartitioner } import org.apache.spark.Partitioner.defaultPartitioner import org.apache.spark.api.java.JavaPairRDD import org.apache.spark.api.java.function.{ Function => JFunction, Function2 => JFunction2, FlatMapFunction => JFlatMapFunction } import com.tresata.spark.sorted.{ GroupSorted => SGroupSorted } object GroupSorted { private case class ComparatorOrdering[T](comparator: Comparator[T]) extends Ordering[T] { def compare(x: T, y: T) = comparator.compare(x, y) } private def comparatorToOrdering[T](comparator: Comparator[T]): Ordering[T] = new ComparatorOrdering(comparator) private def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] private implicit def ordering[K]: Ordering[K] = comparatorToOrdering(NaturalComparator.get[K]) private def groupSort[K, V](javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]): SGroupSorted[K, V] = { implicit def kClassTag: ClassTag[K] = javaPairRDD.kClassTag implicit def vClassTag: ClassTag[V] = javaPairRDD.vClassTag val valueOrdering = Option(valueComparator).map(comparatorToOrdering) SGroupSorted(javaPairRDD.rdd, partitioner, valueOrdering) } } class GroupSorted[K, V] private (sGroupSorted: SGroupSorted[K, V]) extends JavaPairRDD[K, V](sGroupSorted)(GroupSorted.fakeClassTag[K], GroupSorted.fakeClassTag[V]) { def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]) = this(GroupSorted.groupSort(javaPairRDD, partitioner, valueComparator)) def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner) = this(GroupSorted.groupSort(javaPairRDD, partitioner, null)) def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int, valueComparator: Comparator[V]) = this(javaPairRDD, if (numPartitions > 0) new HashPartitioner(numPartitions) else defaultPartitioner(javaPairRDD.rdd), valueComparator) def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int) = this(javaPairRDD, numPartitions, null) def this(javaPairRDD: JavaPairRDD[K, V], valueComparator: Comparator[V]) = this(javaPairRDD, -1, valueComparator) def this(javaPairRDD: JavaPairRDD[K, V]) = this(javaPairRDD, -1, null) import GroupSorted._ override def flatMapValues[W](f: JFlatMapFunction[V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.flatMapValues(v => f.call(v).asScala)) } override def mapValues[W](f: JFunction[V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.mapValues(v => f.call(v))) } def mapKeyValuesToValues[W](f: JFunction[Tuple2[K, V], W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.mapKeyValuesToValues(kv => f.call(kv))) } def mapStreamByKey[W](f: JFunction[JIterator[V], JIterator[W]]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.mapStreamByKey(it => f.call(it.asJava).asScala)) } def foldLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.foldLeftByKey(w)((w, v) => f.call(w, v))) } def reduceLeftByKey[W >: V](f: JFunction2[W, V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new 
GroupSorted[K, W](sGroupSorted.reduceLeftByKey(f.call)) } def scanLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.scanLeftByKey(w)((w, v) => f.call(w, v))) } }
Example 27
Source File: EdgeRDDImpl.scala From graphx-algorithm with GNU General Public License v2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 28
Source File: TestJoins.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner } import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import scala.Iterator object TestJoins { def main(args: Array[String]): Unit = { val sc = new SparkContext(new SparkConf().setAppName("TestJoinJob")) val x = sc.parallelize(List((1, 2), (1, 3), (2, 3), (2, 4))).partitionBy(new HashPartitioner(2)).cache val y = sc.parallelize(List((2, 5), (2, 6))).partitionBy(new HashPartitioner(2)).cache inspectRDD(x) inspectRDD(y) println(">>> joining x with y") val joinRDD = x.join(y).cache joinRDD.collect().foreach(println) inspectRDD(joinRDD) println(">>> left outer join of x with y") val leftJoin = x.leftOuterJoin(y).cache leftJoin.collect().foreach(println) inspectRDD(leftJoin) println(">>> right outer join of x with y") val rightJoin = x.rightOuterJoin(y).cache rightJoin.collect().foreach(println) inspectRDD(rightJoin) } def inspectRDD[T](rdd: RDD[T]): Unit = { println(">>> Partition length...") rdd.mapPartitions(f => Iterator(f.length), true).foreach(println) println(">>> Partition data...") rdd.foreachPartition(f => f.foreach(println)) } }
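Because x and y are partitioned by the same HashPartitioner and cached, the three joins above are narrow dependencies: no additional shuffle is needed and the result keeps the shared partitioner. A small sketch that makes this visible (the output comments are indicative only):

import org.apache.spark.{ SparkContext, HashPartitioner }

def checkCopartitionedJoin(sc: SparkContext): Unit = {
  val part = new HashPartitioner(2)
  val x = sc.parallelize(List((1, 2), (1, 3), (2, 3), (2, 4))).partitionBy(part).cache()
  val y = sc.parallelize(List((2, 5), (2, 6))).partitionBy(part).cache()
  val joined = x.join(y)
  println(joined.partitioner)        // Some(org.apache.spark.HashPartitioner@...), reused from the inputs
  println(joined.getNumPartitions)   // 2
}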
Example 29
Source File: TestValueTransformations.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner } import org.apache.spark.rdd.PairRDDFunctions case class Customer(ID: Int, name: String) case class Item(ID: Int, name: String, price: Float) case class Order(ID: Int, item: Item, quantity: Int, var discount: Float) case class CustomerOrders(cust: Customer, order: Order, offer: Boolean) object TestValueTransformations { def main(args: Array[String]): Unit = { val sc = new SparkContext(new SparkConf().setAppName("TestCombineByKeyJob")) val rdd = sc.parallelize( List( CustomerOrders(Customer(1, "A"), Order(1, Item(1, "item_1", 20), 2, 0), false), CustomerOrders(Customer(1, "A"), Order(2, Item(2, "item_2", 10), 1, 0), false), CustomerOrders(Customer(2, "B"), Order(1, Item(1, "item_1", 20), 2, 0), true))) println(">>> List of customers availing offers") orderValuePerCustomer.foreach(println) println(">>> Total order value for customer ID = 1 is " + orderValuePerCustomer.reduceByKey(_ + _).lookup(1).toString()) } }
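The listing above calls orderValuePerCustomer without defining it; the definition was evidently lost when the example was extracted. A minimal sketch of one plausible reconstruction that fits the case classes and the surrounding reduceByKey/lookup calls; this is an assumption, not the author's original code:

import org.apache.spark.rdd.RDD

// assumed helper: (customer ID, order value) pairs derived from the CustomerOrders records
def orderValuePerCustomer(rdd: RDD[CustomerOrders]): RDD[(Int, Float)] =
  rdd.map(co => (co.cust.ID, co.order.item.price * co.order.quantity))

// with the sample data above, orderValuePerCustomer(rdd).reduceByKey(_ + _).lookup(1)
// would return Seq(50.0f), i.e. 20 * 2 + 10 * 1 for customer 1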
Example 30
Source File: RDFS3.scala From SparkSRE with Apache License 2.0 | 5 votes |
package com.hj.examples import com.hj.constant.Const import org.apache.spark.{HashPartitioner, SparkConf, SparkContext} object RDFS3 { def main(args: Array[String]): Unit = { //Arguments: input/RDFS3.in output/RDFS3.out if(args.length != 2) { System.out.println("Arguments are invalid! \nExample: <input_path> <output_path>") System.exit(1) } val inputPath = args(0) val outputPath = args(1) val conf = new SparkConf().setAppName("RDFS3.in").setMaster("local[2]") val sc = new SparkContext(conf) val lines = sc.textFile(inputPath) //"input/RDFS3.in" val triples = lines.map(x => { val arr = x.split(" ") (arr(0), arr(1), arr(2)) }) val partitioner = new HashPartitioner(2) val range = triples.filter(x => x._2.equals(Const.RDFS_RANGE)).map(x => (x._1, x._3)) val pso = triples.map(x => (x._2, (x._1, x._3))).partitionBy(partitioner) val joined = pso.join(range) val res = joined.map(x => (x._2._1._2, x._2._2)) res.foreach(x => println(x)) res.saveAsTextFile(outputPath) } }
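The join implements RDFS entailment rule rdfs3: from (p, rdfs:range, C) and (s, p, o) it derives (o, rdf:type, C). A self-contained sketch of the same pipeline with inlined sample triples; the IRI strings stand in for com.hj.constant.Const.RDFS_RANGE and the file-based input:

import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner }

object Rdfs3Sketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("Rdfs3Sketch").setMaster("local[2]"))
    val triples = sc.parallelize(Seq(
      (":hasAuthor", "rdfs:range", ":Person"),   // schema triple
      (":book1", ":hasAuthor", ":alice")))       // instance triple
    val range = triples.filter(_._2 == "rdfs:range").map(t => (t._1, t._3))
    val pso = triples.map(t => (t._2, (t._1, t._3))).partitionBy(new HashPartitioner(2))
    // joining on the predicate yields (:alice, :Person), i.e. :alice rdf:type :Person
    pso.join(range).map(x => (x._2._1._2, x._2._2)).foreach(println)
    sc.stop()
  }
}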
Example 31
Source File: RelationWithItemToItem.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.mllib import breeze.numerics.{sqrt, pow} import org.apache.spark.{HashPartitioner, SparkConf, SparkContext} object RelationWithItemToItem { def main(args: Array[String]) { val sc = new SparkContext(new SparkConf() .setAppName("Item to Item") .setMaster("local")) // number of top related items to keep val topK = 2 val userItem = sc.textFile("/rating.dat") .map(_.split("\t")).map(x =>(x(0),x(1),x(2))).distinct().cache() // compute item -> (user,rating) and item -> sqrt(sum of squared ratings) val itemUser = userItem.map(x => (x._2,(x._1,x._3.toDouble))).partitionBy(new HashPartitioner(20)) // sqrt: normalizes each item's rating values val itemPowSqrt = userItem.map(x => (x._2,pow(x._3.toDouble,2.0))).reduceByKey(_+_).mapValues(x => sqrt(x)) // compute item -> ((user,rating),sqrt(ratings)) => user -> (item,rating/sqrt(ratings)) val userItemSqrt = itemUser.join(itemPowSqrt).map(x =>{ val item = x._1 val sqrtRatings = x._2._2 val user = x._2._1._1 val rating = x._2._1._2 (user,(item,rating / sqrtRatings)) }) // compute the item-to-item relation in the user dimension => the score that connects each pair of items val itemToItem = userItemSqrt.join(userItemSqrt).map(x =>{ val item1 = x._2._1._1 val rating1 = x._2._1._2 val item2 = x._2._2._1 val rating2 = x._2._2._2 val score = rating1 * rating2 if(item1 == item2){ ((item1,item2),-1.0) }else{ ((item1,item2),score) } }) itemToItem.reduceByKey(_+_).map(x => (x._1._1,(x._1._2,x._2))).groupByKey().foreach(x => { val sourceItem = x._1 val topItem = x._2.toList.filter(_._2 > 0).sortWith(_._2 > _._2).take(topK) println(s"item = $sourceItem,topK relative item list:$topItem") }) sc.stop() } }
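Dividing each rating by the square root of the item's sum of squared ratings and then summing the pairwise products per user is exactly the cosine similarity between the two items' user-rating vectors. A plain-Scala sketch of that quantity for reference (map keys are user IDs; the data shape is illustrative):

// cosine similarity of two items' rating vectors over their common users
def cosine(ratingsA: Map[String, Double], ratingsB: Map[String, Double]): Double = {
  val normA = math.sqrt(ratingsA.values.map(r => r * r).sum)
  val normB = math.sqrt(ratingsB.values.map(r => r * r).sum)
  val dot = ratingsA.keySet.intersect(ratingsB.keySet)
    .map(user => ratingsA(user) * ratingsB(user)).sum
  if (normA == 0.0 || normB == 0.0) 0.0 else dot / (normA * normB)
}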
Example 32
Source File: EdgeRDDImpl.scala From drizzle-spark with Apache License 2.0 | 4 votes |
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{HashPartitioner, OneToOneDependency} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
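Note that innerJoin zips the two partitionsRDDs directly, so both EdgeRDDs must share the same partitioning; in practice that means both are derived from the same graph. A brief sketch under that assumption (the edge-list path and attribute values are illustrative):

import org.apache.spark.SparkContext
import org.apache.spark.graphx.GraphLoader

def innerJoinDemo(sc: SparkContext): Unit = {
  val graph = GraphLoader.edgeListFile(sc, "hdfs:///tmp/edges.txt")
  val a = graph.mapEdges(e => 2.0).edges   // same underlying edge partitioning
  val b = graph.mapEdges(e => 3.0).edges
  // attributes are combined partition by partition, with no shuffle
  val combined = a.innerJoin(b)((srcId, dstId, x, y) => x * y)
  println(combined.count())                // every edge now carries 6.0
}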