org.apache.spark.Partitioner Scala Examples
The following examples show how to use org.apache.spark.Partitioner.
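Every example below boils down to the same contract: a Partitioner subclass supplies numPartitions and a total getPartition mapping from any key to an index in [0, numPartitions). As a minimal sketch of that contract (the class name ModuloPartitioner is hypothetical and not taken from any of the projects below):

import org.apache.spark.Partitioner

// Minimal custom Partitioner: hash the key and keep the result non-negative.
class ModuloPartitioner(parts: Int) extends Partitioner {
  require(parts > 0, "number of partitions must be positive")

  override def numPartitions: Int = parts

  override def getPartition(key: Any): Int = key match {
    case null => 0
    case k =>
      val mod = k.hashCode() % parts
      if (mod < 0) mod + parts else mod  // keep the index in [0, parts)
  }

  // Partitioners should define equality so Spark can detect co-partitioned RDDs.
  override def equals(other: Any): Boolean = other match {
    case m: ModuloPartitioner => m.numPartitions == numPartitions
    case _ => false
  }

  override def hashCode(): Int = numPartitions
}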
Example 1
Source File: OrderedRDDFunctions.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi

  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {
    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) => {
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      }
      case _ =>
        self
    }

    rddToFilter.filter { case (k, v) => inRange(k) }
  }
}
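A hedged usage sketch of the filterByRange API shown above (standard Spark, reached through the pair-RDD implicits); the SparkContext name sc and the sample data are placeholders:

// Sort by key so a RangePartitioner is attached, then filter by key range.
val pairs = sc.parallelize(Seq((1, "a"), (5, "b"), (9, "c"))).sortByKey()
val middle = pairs.filterByRange(2, 8)  // keeps only the (5, "b") record
middle.collect()  // partitions outside [2, 8] are pruned rather than scanned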
Example 2
Source File: CommunityBasedPartitioning.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.partitioning import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.{CommunityDetectionAlgorithm, CommunityDetectionMethod, ComponentID} import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.DefaultPartitionOperator import org.apache.log4j.Logger import org.apache.spark.{Partitioner, SparkContext} import org.apache.spark.broadcast.Broadcast import org.apache.spark.graphx.{Graph, PartitionID, PartitionStrategy, VertexId} import scala.reflect.ClassTag object CommunityBasedPartitioning { @transient val logger=Logger.getLogger(CommunityBasedPartitioning.getClass()) def partitionGraphBy[VD:ClassTag,ED:ClassTag](graph:Graph[VD,ED],communityDetectionMethod:CommunityDetectionMethod[VD,ED],numParts:Int= -1)(implicit sc:SparkContext): Graph[VD, ED] ={ val numberOfPartitions=if (numParts== -1) sc.defaultParallelism else numParts val communities: Graph[ComponentID, ED] = communityDetectionMethod(graph) val numberOfCommunities=communities.vertices.values.countApproxDistinct() val (coarsedVertexMap,coarsedNumberOfPartitions) = ParallelPartitioningUtils.coarsePartitions(numberOfPartitions,numberOfCommunities,communities.vertices) val strategy=ByComponentIdPartitionStrategy(coarsedVertexMap,coarsedNumberOfPartitions, DefaultPartitionOperator) logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries and ${coarsedNumberOfPartitions} partitions") val out=graph.partitionBy(strategy,numberOfCommunities.toInt).cache() out.edges.foreachPartition((_)=>{}) out.vertices.foreachPartition((_)=>{}) out } def partitionGraphUsing[VD:ClassTag,ED:ClassTag](graph:Graph[VD,ED],communityDetectionMethod:CommunityDetectionAlgorithm,numParts:Int= -1)(implicit sc:SparkContext): Graph[VD, ED] ={ partitionGraphBy(graph,communityDetectionMethod.detectCommunities[VD,ED](_),numParts) } }
Example 3
Source File: Parsing.scala From meetup-stream with Apache License 2.0 | 5 votes |
package util import core._ import org.joda.time.DateTime import org.json4s.DefaultFormats import org.json4s._ import org.json4s.native.JsonMethods._ import org.joda.time.DateTime import org.apache.spark.Partitioner import org.apache.spark.streaming.Seconds import scala.util.Try object Parsing { @transient implicit val formats = DefaultFormats def parseEvent(eventJson: String):Option[Event]={ Try({ val json=parse(eventJson).camelizeKeys val event=json.extract[Event] event }).toOption } def parseRsvp(rsvpJson: String)={ Try({ val json=parse(rsvpJson).camelizeKeys val member=(json \ "member").extract[Member] val event=(json \ "event").extract[MemberEvent] val response=(json \ "response").extract[String] (member, event, response) }).toOption } }
Example 4
Source File: HBasePartitioner.scala From Spark-SQL-on-HBase with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner
import org.apache.spark.util.CollectionsUtils

object HBasePartitioner {
  implicit object HBaseRawOrdering extends Ordering[HBaseRawType] {
    def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b)
  }
}

class HBasePartitioner (var splitKeys: Array[HBaseRawType]) extends Partitioner {
  import HBasePartitioner.HBaseRawOrdering

  type t = HBaseRawType

  lazy private val len = splitKeys.length

  // For pre-split table splitKeys(0) = bytes[0], to remove it,
  // otherwise partition 0 always be empty and
  // we will miss the last region's date when bulk load
  lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail

  def numPartitions = if (len == 0) 1 else len

  @transient private lazy val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t]

  def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[t]
    var partition = 0
    if (len <= 128 && len > 0) {
      // If we have less than 128 partitions naive search
      val ordering = implicitly[Ordering[t]]
      while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) {
        partition += 1
      }
    } else {
      // Determine which binary search method to use only once.
      partition = binarySearch(realSplitKeys, k)
      // binarySearch either returns the match location or -[insertion point]-1
      if (partition < 0) {
        partition = -partition - 1
      }
      if (partition > realSplitKeys.length) {
        partition = realSplitKeys.length
      }
    }
    partition
  }

  override def equals(other: Any): Boolean = other match {
    case r: HBasePartitioner =>
      r.splitKeys.sameElements(splitKeys)
    case _ =>
      false
  }

  override def hashCode(): Int = {
    val prime = 31
    var result = 1
    var i = 0
    while (i < splitKeys.length) {
      result = prime * result + splitKeys(i).hashCode
      i += 1
    }
    result = prime * result
    result
  }
}
Example 5
Source File: HBasePartitioner.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.Partitioner import org.apache.spark.util.CollectionsUtils object HBasePartitioner { implicit object HBaseRawOrdering extends Ordering[HBaseRawType] { def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b) } } class HBasePartitioner (val splitKeys: Array[HBaseRawType]) extends Partitioner { import HBasePartitioner.HBaseRawOrdering type t = HBaseRawType lazy private val len = splitKeys.length // For pre-split table splitKeys(0) = bytes[0], to remove it, // otherwise partition 0 always be empty and // we will miss the last region's date when bulk load lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail override def numPartitions = if (len == 0) 1 else len @transient private lazy val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t] override def getPartition(key: Any): Int = { val k = key.asInstanceOf[t] var partition = 0 if (len <= 128 && len > 0) { // If we have less than 128 partitions naive search val ordering = implicitly[Ordering[t]] while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) { partition += 1 } } else { // Determine which binary search method to use only once. partition = binarySearch(realSplitKeys, k) // binarySearch either returns the match location or -[insertion point]-1 if (partition < 0) { partition = -partition - 1 } if (partition > realSplitKeys.length) { partition = realSplitKeys.length } } partition } override def equals(other: Any): Boolean = other match { case r: HBasePartitioner => r.splitKeys.sameElements(splitKeys) case _ => false } override def hashCode(): Int = { val prime = 31 var result = 1 var i = 0 while (i < splitKeys.length) { result = prime * result + splitKeys(i).hashCode i += 1 } result = prime * result result } }
Example 6
Source File: MapPartitionsWithPreparationRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Partition, Partitioner, TaskContext}

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val prepared =
      if (preparedArguments.isEmpty) {
        preparePartition()
      } else {
        preparedArguments.remove(0)
      }
    val parentIterator = firstParent[T].iterator(partition, context)
    executePartition(context, partition.index, prepared, parentIterator)
  }
}
Example 7
Source File: PythonPartitioner.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import org.apache.spark.Partitioner
import org.apache.spark.util.Utils

private[spark] class PythonPartitioner(
    override val numPartitions: Int,
    val pyPartitionFunctionId: Long)
  extends Partitioner {

  override def getPartition(key: Any): Int = key match {
    case null => 0
    // we don't trust the Python partition function to return valid partition ID's so
    // let's do a modulo numPartitions in any case
    case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions)
    case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions)
  }

  override def equals(other: Any): Boolean = other match {
    case h: PythonPartitioner =>
      h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId
    case _ =>
      false
  }

  override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode
}
Example 8
Source File: ShuffledDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
import org.apache.spark.streaming.{Duration, Time}

import scala.reflect.ClassTag

private[streaming]
class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag](
    parent: DStream[(K, V)],
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiner: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true
  ) extends DStream[(K, C)] (parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[(K, C)]] = {
    parent.getOrCompute(validTime) match {
      case Some(rdd) => Some(rdd.combineByKey[C](
          createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine))
      case None => None
    }
  }
}
Example 9
Source File: BulkLoadPartitioner.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark

import java.util
import java.util.Comparator

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner

class BulkLoadPartitioner(startKeys:Array[Array[Byte]])
  extends Partitioner {

  override def numPartitions: Int = startKeys.length

  override def getPartition(key: Any): Int = {

    val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] {
      override def compare(o1: Array[Byte], o2: Array[Byte]): Int = {
        Bytes.compareTo(o1, o2)
      }
    }

    val rowKey:Array[Byte] =
      key match {
        case qualifier: KeyFamilyQualifier =>
          qualifier.rowKey
        case wrapper: ByteArrayWrapper =>
          wrapper.value
        case _ =>
          key.asInstanceOf[Array[Byte]]
      }

    val partition = util.Arrays.binarySearch(startKeys, rowKey, comparator)
    if (partition < 0) partition * -1 + -2
    else partition
  }
}
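A hedged usage sketch (not part of the SparkOnHBase project itself): shuffling and sorting rows by region start key before writing HFiles. The names sc, startKeys, and records are placeholders, and the explicit byte-array Ordering is supplied because Scala provides none for Array[Byte] by default:

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD

implicit val bytesOrdering: Ordering[Array[Byte]] = new Ordering[Array[Byte]] {
  override def compare(a: Array[Byte], b: Array[Byte]): Int = Bytes.compareTo(a, b)
}

val records: RDD[(Array[Byte], String)] =
  sc.parallelize(Seq((Bytes.toBytes("row-001"), "v1"), (Bytes.toBytes("row-042"), "v2")))

// One output partition per region start key, with rows sorted within each partition.
val sorted = records.repartitionAndSortWithinPartitions(new BulkLoadPartitioner(startKeys))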
Example 10
Source File: ShuffledDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 11
Source File: PythonPartitioner.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import org.apache.spark.Partitioner import org.apache.spark.util.Utils private[spark] class PythonPartitioner( override val numPartitions: Int, val pyPartitionFunctionId: Long) extends Partitioner { override def getPartition(key: Any): Int = key match { case null => 0 // we don't trust the Python partition function to return valid partition ID's so // let's do a modulo numPartitions in any case case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions) case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions) } override def equals(other: Any): Boolean = other match { case h: PythonPartitioner => h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId case _ => false } override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode }
Example 12
Source File: ShuffledDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ import org.apache.spark.streaming.{Duration, Time} import scala.reflect.ClassTag private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 13
Source File: OrderedRDDFunctions.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
Example 14
Source File: PythonPartitioner.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import org.apache.spark.Partitioner import org.apache.spark.util.Utils private[spark] class PythonPartitioner( override val numPartitions: Int, val pyPartitionFunctionId: Long) extends Partitioner { override def getPartition(key: Any): Int = key match { case null => 0 // we don't trust the Python partition function to return valid partition ID's so // let's do a modulo numPartitions in any case case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions) case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions) } override def equals(other: Any): Boolean = other match { case h: PythonPartitioner => h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId case _ => false } override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode }
Example 15
Source File: ShuffledDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 16
Source File: RowPartitioner.scala From hail with MIT License | 5 votes |
package is.hail.linalg

import org.apache.spark.Partitioner

object RowPartitioner {
  def findInterval(a: Array[Long], key: Long): Int = {
    var lo = 0
    var hi = a.length - 1
    while (lo <= hi) {
      val mid = (lo + hi) >>> 1
      if (key < a(mid))
        hi = mid - 1
      else
        lo = mid + 1
    }
    lo - 1
  }
}

case class RowPartitioner(partitionStarts: Array[Long]) extends Partitioner {
  override val numPartitions: Int = partitionStarts.length - 1

  override def getPartition(key: Any): Int = key match {
    case i: Long => RowPartitioner.findInterval(partitionStarts, i)
  }
}
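The binary search above returns the index of the last partition start that is less than or equal to the key, so partition i owns the half-open row range [partitionStarts(i), partitionStarts(i + 1)). A small hypothetical check of that behavior (not from the hail test suite):

val starts = Array(0L, 4L, 10L)                          // two partitions: [0, 4) and [4, 10)
assert(RowPartitioner.findInterval(starts, 0L) == 0)
assert(RowPartitioner.findInterval(starts, 5L) == 1)
assert(RowPartitioner.findInterval(starts, -1L) == -1)   // below the first start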
Example 17
Source File: RandomEqualPartitioner.scala From ScalaNetwork with GNU General Public License v2.0 | 5 votes |
package kr.ac.kaist.ir.deep.train

import org.apache.spark.Partitioner

class RandomEqualPartitioner(val numPartition: Int) extends Partitioner {
  private var nextNumber = 0

  def refreshRandom() = {
    nextNumber += 1
  }

  override def numPartitions: Int = numPartition

  override def getPartition(key: Any): Int = {
    val i = key.asInstanceOf[Long] + nextNumber
    val remain = i % numPartition
    val quotient = ((i / numPartition) * nextNumber) % numPartition
    val hash = ((remain + quotient) % numPartition).asInstanceOf[Int]
    if (hash < 0)
      hash + numPartition
    else
      hash
  }
}
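A hedged sketch of how a partitioner like this might be driven (the names sc and data are placeholders, and zipWithIndex is used only to manufacture the Long keys the partitioner expects):

val data = sc.parallelize(1 to 100)
// (Long index, value) pairs; the index becomes the partitioning key.
val keyed = data.zipWithIndex.map { case (v, idx) => (idx, v) }
val spread = keyed.partitionBy(new RandomEqualPartitioner(8)).values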
Example 18
Source File: SuperBigWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.windowing.superbig import org.apache.log4j.{Level, Logger} import org.apache.spark.Partitioner import org.apache.spark.sql.SparkSession object SuperBigWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val jsonPath = args(0) val pageSize = args(1).toInt val spark = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .getOrCreate() val jsonDf = spark.read.json(jsonPath) import spark.implicits._ val diffDs = jsonDf.flatMap(row => { val group = row.getAs[String]("group") val time = row.getAs[Long]("time") val value = row.getAs[Long]("value") val timePage = time / pageSize if (time % pageSize == 0) { //Am I on the edge of the page Seq((timePage, (time, value)), (timePage + 1, (time, value))) } else { Seq((timePage, (time, value))) } }).groupByKey(r => r._1).flatMapGroups((k, it) => { var lastValue = 0l it.toSeq. sortBy{case (page, (time, value)) => time}. map{case (page, (time, value)) => val dif = value - lastValue lastValue = value (time, value, dif) } }) diffDs.collect().foreach(r => println(" - " + r)) spark.stop() } }
Example 19
Source File: PartitionwiseWeightedSampledRDD.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.graph.utils import java.util.Random import org.apache.spark.rdd.RDD import org.apache.spark.{Partition, Partitioner, TaskContext} import scala.reflect.ClassTag import scala.util.{Random => ScalaRandom} class PartitionwiseWeightedSampledRDDPartition(val prev: Partition, val seed: Long, val fraction: Double) extends Partition with Serializable { override val index: Int = prev.index } class PartitionwiseWeightedSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[(T, Float)], sampler: WeightedRandomSampler[T, U], fractions: Map[Int, Double], preservesPartitioning: Boolean, @transient private val seed: Long = ScalaRandom.nextLong) extends RDD[U](prev) { @transient override val partitioner: Option[Partitioner] = { if (preservesPartitioning) prev.partitioner else None } override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[(T, Float)].partitions.map { x => new PartitionwiseWeightedSampledRDDPartition(x, random.nextLong(), fractions.getOrElse(x.index, 0.0)) } } override def getPreferredLocations(split: Partition): Seq[String] = { firstParent[(T, Float)].preferredLocations( split.asInstanceOf[PartitionwiseWeightedSampledRDDPartition].prev ) } override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseWeightedSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.setFraction(split.fraction) thisSampler.sample(firstParent[(T, Float)].iterator(split.prev, context)) } }
Example 20
Source File: KeyPartitioner.scala From spark3D with Apache License 2.0 | 5 votes |
package com.astrolabsoftware.spark3d.spatialPartitioning

// Spark built-in partitioner
import org.apache.spark.Partitioner

  override def getPartition(key : Any) : Int = {
    key match {
      case i:Int => key.asInstanceOf[Int]
      case l:Long => key.asInstanceOf[Long].toInt
      case _ => throw new ClassCastException("""
        Key from KeyPartitioner must be Int or Long!
        """)
    }
  }
}
Example 21
Source File: MapDPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner import org.apache.spark.{Partitioner, SparkEnv} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.util.MutablePair object MapDPartition { def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager] def apply[T](origin: RDD[(Int, (T, InternalRow))], num_partitions: Int): RDD[(Int, (T, InternalRow))] = { val rdd = if (sortBasedShuffleOn) { origin.mapPartitions {iter => iter.map(row => (row._1, (row._2._1, row._2._2.copy())))} } else { origin.mapPartitions {iter => val mutablePair = new MutablePair[Int, (T, InternalRow)]() iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy()))) } } val part = new MapDPartitioner(num_partitions) new ShuffledRDD[Int, (T, InternalRow), (T, InternalRow)](rdd, part) } } class MapDPartitioner(num_partitions: Int) extends Partitioner { def numPartitions: Int = num_partitions def getPartition(key: Any): Int = { val k = key.asInstanceOf[Int] require(k >= 0 && k < num_partitions) k } }
Example 22
Source File: RangeDPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner import org.apache.spark.util.CollectionsUtils import org.apache.spark.{Partitioner, SparkEnv} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.util.MutablePair import scala.reflect.ClassTag object RangeDPartition { def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager] def apply[K: Ordering: ClassTag, T](origin: RDD[(K, (T, InternalRow))], range_bounds: Array[K]): RDD[(K, (T, InternalRow))] = { val rdd = if (sortBasedShuffleOn) { origin.mapPartitions {iter => iter.map(row => (row._1, (row._2._1, row._2._2.copy())))} } else { origin.mapPartitions {iter => val mutablePair = new MutablePair[K, (T, InternalRow)]() iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy()))) } } val part = new RangeDPartitioner(range_bounds, ascending = true) new ShuffledRDD[K, (T, InternalRow), (T, InternalRow)](rdd, part) } } class RangeDPartitioner[K: Ordering: ClassTag](range_bounds: Array[K], ascending: Boolean) extends Partitioner { def numPartitions: Int = range_bounds.length + 1 private val binarySearch: ((Array[K], K) => Int) = CollectionsUtils.makeBinarySearch[K] def getPartition(key: Any): Int = { val k = key.asInstanceOf[K] var partition = 0 if (range_bounds.length < 128) { while (partition < range_bounds.length && Ordering[K].gt(k, range_bounds(partition))) partition += 1 } else { partition = binarySearch(range_bounds, k) if (partition < 0) partition = -partition - 1 if (partition > range_bounds.length) partition = range_bounds.length } if (ascending) partition else range_bounds.length - partition } }
Example 23
Source File: VoronoiPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner import org.apache.spark.sql.simba.spatial.Point import org.apache.spark.{Partitioner, SparkEnv} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.util.MutablePair object VoronoiPartition { def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager] def apply(origin: RDD[(Int, (Point, InternalRow))], pivot_to_group: Array[Int], num_group: Int) : RDD[(Int, (Point, InternalRow))] = { val rdd = if (sortBasedShuffleOn) { origin.mapPartitions {iter => iter.map(row => (row._1, (row._2._1, row._2._2.copy())))} } else { origin.mapPartitions {iter => val mutablePair = new MutablePair[Int, (Point, InternalRow)]() iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy()))) } } val part = new VoronoiPartitioner(pivot_to_group, num_group) new ShuffledRDD[Int, (Point, InternalRow), (Point, InternalRow)](rdd, part) } } class VoronoiPartitioner(pivot_to_group: Array[Int], num_group: Int) extends Partitioner { override def numPartitions: Int = num_group override def getPartition(key: Any): Int = { val k = key.asInstanceOf[Int] pivot_to_group(k) } }
Example 24
Source File: HashPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner import org.apache.spark.{Partitioner, SparkEnv} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.util.MutablePair object HashPartition { def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager] def apply(origin: RDD[(Any, InternalRow)], num_partitions: Int): RDD[(Any, InternalRow)] = { val rdd = if (sortBasedShuffleOn) { origin.mapPartitions {iter => iter.map(row => (row._1, row._2.copy()))} } else { origin.mapPartitions {iter => val mutablePair = new MutablePair[Any, InternalRow]() iter.map(row => mutablePair.update(row._1, row._2.copy())) } } val part = new HashPartitioner(num_partitions) new ShuffledRDD[Any, InternalRow, InternalRow](rdd, part) } } class HashPartitioner(num_partitions: Int) extends Partitioner { override def numPartitions: Int = num_partitions override def getPartition(key: Any): Int = { key.hashCode() % num_partitions } }
Example 25
Source File: SkewJoinOperations.scala From spark-skewjoin with Apache License 2.0 | 5 votes |
package com.tresata.spark.skewjoin import java.util.{ Random => JRandom } import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.Partitioner import org.apache.spark.Partitioner.defaultPartitioner import com.twitter.algebird.{ CMS, CMSHasher, CMSMonoid } case class CMSParams(eps: Double = 0.005, delta: Double = 1e-8, seed: Int = 1) { def getCMSMonoid[K: Ordering: CMSHasher]: CMSMonoid[K] = CMS.monoid[K](eps, delta, seed) } class SkewJoinOperations[K: ClassTag: Ordering: CMSHasher, V: ClassTag](rdd: RDD[(K, V)]) extends Serializable { private def getReplicationFactors(random: JRandom, replication: Int, otherReplication: Int): Seq[(Int, Int)] = { require(replication > 0 && otherReplication > 0, "replication must be positive") val rand = random.nextInt(otherReplication) (0 until replication).map(rep => (rand, rep)) } private def createRddCMS[K](rdd: RDD[K], cmsMonoid: CMSMonoid[K]): CMS[K] = rdd.map(k => cmsMonoid.create(k)).reduce(cmsMonoid.plus(_, _)) def skewCogroup[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner, skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (Iterable[V], Iterable[W]))] = { val numPartitions = partitioner.numPartitions val broadcastedLeftCMS = rdd.sparkContext.broadcast(createRddCMS[K](rdd.keys, cmsParams.getCMSMonoid[K])) val broadcastedRightCMS = rdd.sparkContext.broadcast(createRddCMS[K](other.keys, cmsParams.getCMSMonoid[K])) val rddSkewed = rdd.mapPartitions{ it => val random = new JRandom it.flatMap{ kv => val (leftReplication, rightReplication) = skewReplication.getReplications( broadcastedLeftCMS.value.frequency(kv._1).estimate, broadcastedRightCMS.value.frequency(kv._1).estimate, numPartitions) getReplicationFactors(random, leftReplication, rightReplication).map(rl =>((kv._1, rl.swap), kv._2)) } } val otherSkewed = other.mapPartitions{ it => val random = new JRandom it.flatMap{ kv => val (leftReplication, rightReplication) = skewReplication.getReplications( broadcastedLeftCMS.value.frequency(kv._1).estimate, broadcastedRightCMS.value.frequency(kv._1).estimate, numPartitions) getReplicationFactors(random, rightReplication, leftReplication).map(lr => ((kv._1, lr), kv._2)) } } rddSkewed.cogroup(otherSkewed, partitioner).map(kv => (kv._1._1, kv._2)) } def skewCogroup[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))] = skewCogroup(other, defaultPartitioner(rdd, other)) def skewJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner, skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (V, W))] = skewCogroup(other, partitioner, skewReplication, cmsParams).flatMap{ blockPair => for (v <- blockPair._2._1.iterator; w <- blockPair._2._2.iterator) yield (blockPair._1, (v, w)) } def skewJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (V, W))] = skewJoin(other, defaultPartitioner(rdd, other)) def skewLeftOuterJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner, skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (V, Option[W]))] = skewCogroup(other, partitioner, RightReplication(skewReplication), cmsParams).flatMap{ case (k, (itv, Seq())) => itv.iterator.map(v => (k, (v, None))) case (k, (itv, itw)) => for (v <- itv; w <- itw) yield (k, (v, Some(w))) } def skewLeftOuterJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))] = skewLeftOuterJoin(other, defaultPartitioner(rdd, other)) def skewRightOuterJoin[W: ClassTag](other: 
RDD[(K, W)], partitioner: Partitioner, skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (Option[V], W))] = skewCogroup(other, partitioner, LeftReplication(skewReplication), cmsParams).flatMap{ case (k, (Seq(), itw)) => itw.iterator.map(w => (k, (None, w))) case (k, (itv, itw)) => for (v <- itv; w <- itw) yield (k, (Some(v), w)) } def skewRightOuterJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (Option[V], W))] = skewRightOuterJoin(other, defaultPartitioner(rdd, other)) } trait Dsl { implicit def rddToSkewJoinOperations_e94qoy3tnt[K: ClassTag: Ordering: CMSHasher, V: ClassTag](rdd: RDD[(K, V)]): SkewJoinOperations[K, V] = new SkewJoinOperations(rdd) implicit def rddToBlockJoinOperations_7IaIe6dkih[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): BlockJoinOperations[K, V] = new BlockJoinOperations(rdd) } object Dsl extends Dsl
Example 26
Source File: BlockJoinOperations.scala From spark-skewjoin with Apache License 2.0 | 5 votes |
package com.tresata.spark.skewjoin import java.util.{ Random => JRandom } import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.Partitioner import org.apache.spark.Partitioner.defaultPartitioner class BlockJoinOperations[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) extends Serializable { // based on blockJoinWithSmaller in scalding. See com.twitter.scalding.JoinAlgorithms private def blockCogroup[W](other: RDD[(K, W)], leftReplication: Int, rightReplication: Int, partitioner: Partitioner): RDD[((K, (Int, Int)), (Iterable[V], Iterable[W]))] = { assert(leftReplication >= 1, "must specify a positive number for left replication") assert(rightReplication >= 1, "must specify a positive number for right replication") def getReplication(random: JRandom, replication: Int, otherReplication: Int) : Seq[(Int, Int)] = { val rand = random.nextInt(otherReplication) (0 until replication).map{ rep => (rand, rep) } } val rddBlocked = rdd.mapPartitions{ it => val random = new JRandom it.flatMap{ kv => getReplication(random, leftReplication, rightReplication).map{ rl => ((kv._1, rl.swap), kv._2)} } } val otherBlocked = other.mapPartitions{ it => val random = new JRandom it.flatMap{ kv => getReplication(random, rightReplication, leftReplication).map{ lr => ((kv._1, lr), kv._2)} } } rddBlocked.cogroup(otherBlocked, partitioner) } def blockRightOuterJoin[W](other: RDD[(K, W)], leftReplication: Int): RDD[(K, (Option[V], W))] = blockRightOuterJoin(other, leftReplication, defaultPartitioner(rdd, other)) }
Example 27
Source File: BBRPartitioner.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.partitioner import scala.reflect.ClassTag import com.github.cloudml.zen.ml.clustering.LDADefines._ import com.github.cloudml.zen.ml.sampler.AliasTable import com.github.cloudml.zen.ml.util.XORShiftRandom import breeze.linalg.{SparseVector => BSV} import org.apache.spark.Partitioner import org.apache.spark.graphx2._ import org.apache.spark.graphx2.impl.GraphImpl import org.apache.spark.storage.StorageLevel private[ml] class BBRPartitioner(val partitions: Int) extends Partitioner { override def numPartitions: Int = partitions def getKey(et: EdgeTriplet[Int, _]): VertexId = { if (et.srcAttr >= et.dstAttr) et.srcId else et.dstId } def getPartition(key: Any): PartitionID = { key.asInstanceOf[PartitionID] % numPartitions } override def equals(other: Any): Boolean = other match { case bbr: BBRPartitioner => bbr.numPartitions == numPartitions case _ => false } override def hashCode: Int = numPartitions } object BBRPartitioner { private[zen] def partitionByBBR[VD: ClassTag, ED: ClassTag]( input: Graph[VD, ED], storageLevel: StorageLevel): Graph[VD, ED] = { val edges = input.edges val conf = edges.context.getConf val numPartitions = conf.getInt(cs_numPartitions, edges.partitions.length) val bbr = new BBRPartitioner(numPartitions) val degGraph = GraphImpl(input.degrees, edges) val assnGraph = degGraph.mapTriplets((pid, iter) => iter.map(et => (bbr.getKey(et), Edge(et.srcId, et.dstId, et.attr))), TripletFields.All) assnGraph.persist(storageLevel) val assnVerts = assnGraph.aggregateMessages[Long](ect => { if (ect.attr._1 == ect.srcId) { ect.sendToSrc(1L) } else { ect.sendToDst(1L) } }, _ + _, TripletFields.EdgeOnly) val (kids, koccurs) = assnVerts.filter(_._2 > 0L).collect().unzip val partRdd = edges.context.parallelize(kids.zip(rearrage(koccurs, numPartitions))) val rearrGraph = assnGraph.mapVertices((_, _) => null.asInstanceOf[AliasTable[Long]]) .joinVertices(partRdd)((_, _, arr) => AliasTable.generateAlias(arr)) val newEdges = rearrGraph.triplets.mapPartitions(iter => { val gen = new XORShiftRandom() iter.map(et => { val (kid, edge) = et.attr val table = if (kid == et.srcId) et.srcAttr else et.dstAttr (table.sampleRandom(gen), edge) }) }).partitionBy(bbr).map(_._2) GraphImpl(input.vertices, newEdges, null.asInstanceOf[VD], storageLevel, storageLevel) } private def rearrage(koccurs: IndexedSeq[Long], numPartitions: Int): IndexedSeq[BSV[Long]] = { val numKeys = koccurs.length val numEdges = koccurs.sum val npp = numEdges / numPartitions val rpn = numEdges - npp * numPartitions @inline def nrpp(pi: Int): Long = npp + (if (pi < rpn) 1L else 0L) @inline def kbn(ki: Int): Long = if (ki < numKeys) koccurs(ki) else 0L val keyPartCount = koccurs.map(t => BSV.zeros[Long](numPartitions)) def put(ki: Int, krest: Long, pi: Int, prest: Long): Unit = { if (ki < numKeys) { if (krest == prest) { keyPartCount(ki)(pi) = krest put(ki + 1, kbn(ki + 1), pi + 1, nrpp(pi + 1)) } else if (krest < prest) { keyPartCount(ki)(pi) = krest put(ki + 1, kbn(ki + 1), pi, prest - krest) } else { keyPartCount(ki)(pi) = prest put(ki, krest - prest, pi + 1, nrpp(pi + 1)) } } } put(0, kbn(0), 0, nrpp(0)) keyPartCount } }
Example 28
Source File: SimpleCustomPartitioner.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.partitioning import org.apache.spark.Partitioner import org.apache.spark.sql.SparkSession object SimpleCustomPartitioner { def main(args:Array[String]): Unit = { val jsonPath = args(0) val partitions = args(1).toInt val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .getOrCreate() val jsonDf = sparkSession.read.json(jsonPath) val partitionedRdd = jsonDf.rdd.map(row => { val group = row.getAs[String]("group") val time = row.getAs[Long]("time") val value = row.getAs[Long]("value") ((group, time), value) //this a tuple with in a tuple }).repartitionAndSortWithinPartitions(new SimpleCustomPartitioner(partitions)) val pairRdd = jsonDf.rdd.map(row => { val group = row.getAs[String]("group") val time = row.getAs[Long]("time") val value = row.getAs[Long]("value") ((group, time), value) //this a tuple with in a tuple }) pairRdd.reduceByKey(_ + _, 100) pairRdd.reduceByKey(new SimpleCustomPartitioner(partitions), _ + _) partitionedRdd.collect().foreach(r => { println(r) }) sparkSession.stop() } } class SimpleCustomPartitioner(numOfParts:Int) extends Partitioner { override def numPartitions: Int = numOfParts override def getPartition(key: Any): Int = { val k = key.asInstanceOf[(String, Long)] Math.abs(k._1.hashCode) % numPartitions } }
Example 29
Source File: BulkLoadPartitioner.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import java.util import java.util.Comparator import org.apache.yetus.audience.InterfaceAudience; import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.Partitioner @InterfaceAudience.Public class BulkLoadPartitioner(startKeys:Array[Array[Byte]]) extends Partitioner { // when table not exist, startKeys = Byte[0][] override def numPartitions: Int = if (startKeys.length == 0) 1 else startKeys.length override def getPartition(key: Any): Int = { val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] { override def compare(o1: Array[Byte], o2: Array[Byte]): Int = { Bytes.compareTo(o1, o2) } } val rowKey:Array[Byte] = key match { case qualifier: KeyFamilyQualifier => qualifier.rowKey case wrapper: ByteArrayWrapper => wrapper.value case _ => key.asInstanceOf[Array[Byte]] } var partition = util.Arrays.binarySearch(startKeys, rowKey, comparator) if (partition < 0) partition = partition * -1 + -2 if (partition < 0) partition = 0 partition } }
Example 30
Source File: MetricImplicits.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.timeseries.timely import java.io.PrintStream import java.net.Socket import java.nio.charset.StandardCharsets import io.gzet.timeseries.SimpleConfig import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.{Logging, Partitioner} object MetricImplicits extends Logging with SimpleConfig { def nonNegativeMod(x: Int, mod: Int): Int = { val rawMod = x % mod rawMod + (if (rawMod < 0) mod else 0) } class MetricPartitioner(partitions: Int) extends Partitioner { require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.") override def numPartitions: Int = partitions override def getPartition(key: Any): Int = { val k = key.asInstanceOf[MetricKey] nonNegativeMod(k.metricName.hashCode, partitions) } } implicit class Metrics(rdd: RDD[Metric]) { val partitions = rdd.partitions.length val partitioner = new MetricPartitioner(partitions) def publish() = { val sSortedMetricRDD = rdd filter { metric => metric.tags.nonEmpty } map { metric => (MetricKey(metric.name, metric.time), metric) } repartitionAndSortWithinPartitions partitioner sSortedMetricRDD.values foreachPartition { it: Iterator[Metric] => val sock = new Socket(timelyHost, timelyPort) val writer = new PrintStream(sock.getOutputStream, true, StandardCharsets.UTF_8.name) it foreach { metric => writer.println(metric.toPut) } writer.flush() } } } implicit class MetricStream(stream: DStream[Metric]) { def publish() = { stream foreachRDD { rdd => rdd.publish() } } } } case class Metric(name: String, time: Long, value: Double, tags: Map[String, String], viz: Option[String] = None) { def toPut = { val vizMap = if(viz.isDefined) List("viz" -> viz.get) else List[(String, String)]() val strTags = vizMap.union(tags.toList).map({ case (k, v) => s"$k=$v" }).mkString(" ") s"put $name $time $value $strTags" } } case class MetricKey(metricName: String, metricTime: Long) object MetricKey { implicit def orderingByMetricDate[A <: MetricKey] : Ordering[A] = { Ordering.by(fk => (fk.metricName, fk.metricTime)) } }
Example 31
Source File: IDPartitioner.scala From traj-sim-spark with Apache License 2.0 | 5 votes |
package edu.utah.cs.partitioner

import org.apache.spark.Partitioner
import org.apache.spark.rdd.{RDD, ShuffledRDD}

object IDPartition {
  def apply(origin: RDD[_ <: Product2[Int, Any]], n_part: Int)
  : RDD[_ <: Product2[Int, Any]] = {
    val part = new IDPartitioner(n_part)
    val shuffled = new ShuffledRDD[Int, Any, Any](origin, part)
    shuffled
  }
}

class IDPartitioner(n_part: Int) extends Partitioner {
  override def numPartitions: Int = n_part

  override def getPartition(key: Any): Int = {
    key.asInstanceOf[Int]
  }
}
Example 32
Source File: OrderedRDDFunctions.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Logging, Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => { val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) } case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
Example 33
Source File: PythonPartitioner.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import org.apache.spark.Partitioner import org.apache.spark.util.Utils private[spark] class PythonPartitioner( override val numPartitions: Int, val pyPartitionFunctionId: Long) extends Partitioner { override def getPartition(key: Any): Int = key match { case null => 0 // we don't trust the Python partition function to return valid partition ID's so // let's do a modulo numPartitions in any case case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions) case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions) } override def equals(other: Any): Boolean = other match { case h: PythonPartitioner => h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId case _ => false } override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode }
Example 34
Source File: ShuffledDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ import org.apache.spark.streaming.{Duration, Time} import scala.reflect.ClassTag private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 35
Source File: OrderedRDDFunctions.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
Example 36
Source File: PythonPartitioner.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import org.apache.spark.Partitioner import org.apache.spark.util.Utils private[spark] class PythonPartitioner( override val numPartitions: Int, val pyPartitionFunctionId: Long) extends Partitioner { override def getPartition(key: Any): Int = key match { case null => 0 // we don't trust the Python partition function to return valid partition ID's so // let's do a modulo numPartitions in any case case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions) case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions) } override def equals(other: Any): Boolean = other match { case h: PythonPartitioner => h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId case _ => false } override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode }
Example 37
Source File: BulkLoadPartitioner.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.s2jobs.spark import java.util import java.util.Comparator import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.Partitioner class BulkLoadPartitioner(startKeys:Array[Array[Byte]]) extends Partitioner { override def numPartitions: Int = startKeys.length override def getPartition(key: Any): Int = { val rowKey:Array[Byte] = key match { case qualifier: KeyFamilyQualifier => qualifier.rowKey case _ => key.asInstanceOf[Array[Byte]] } val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] { override def compare(o1: Array[Byte], o2: Array[Byte]): Int = { Bytes.compareTo(o1, o2) } } val partition = util.Arrays.binarySearch(startKeys, rowKey, comparator) if (partition < 0) partition * -1 + -2 else partition } }
Example 38
Source File: DummyRangePartitioner.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei

import org.apache.spark.Partitioner

object DummyRangePartitioner extends Partitioner {
  override def numPartitions: Int = 2

  override def getPartition(key: Any): Int = {
    key match {
      case x: Int => if (x < 0) 0 else 1
      case _      => 0
    }
  }
}
Example 39
Source File: RDDOrderedFunctions.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.rdd import com.danielwestheide.kontextfrei.DCollectionOrderedFunctions import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import scala.reflect.ClassTag private[kontextfrei] trait RDDOrderedFunctions extends DCollectionOrderedFunctions[RDD] { this: RDDBase => override final def sortByKey[A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])(ascending: Boolean): RDD[(A, B)] = withSite(x) { _.sortByKey(ascending) } override final def sortByKeyWithNumPartitions[A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])(ascending: Boolean, numPartitions: Int): RDD[(A, B)] = withSite(x) { _.sortByKey(ascending, numPartitions) } override final def filterByRange[A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])(lower: A, upper: A): RDD[(A, B)] = withSite(x) { _.filterByRange(lower, upper) } override def repartitionAndSortWithinPartitions[ A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])( partitioner: Partitioner) : RDD[(A, B)] = withSite(x) { _.repartitionAndSortWithinPartitions(partitioner) } }
Example 40
Source File: RDDPairFunctions.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.rdd import com.danielwestheide.kontextfrei.DCollectionPairFunctions import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import scala.collection.Map import scala.reflect.ClassTag private[kontextfrei] trait RDDPairFunctions extends DCollectionPairFunctions[RDD] { this: RDDBase => override final def cogroup[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Iterable[B], Iterable[C]))] = withSite(x) { _.cogroup(y) } override final def values[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[B] = withSite(x) { _.values } override final def keys[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[A] = withSite(x) { _.keys } override final def leftOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (B, Option[C]))] = withSite(x) { _.leftOuterJoin(y) } override final def rightOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], C))] = withSite(x) { _.rightOuterJoin(y) } override final def fullOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], Option[C]))] = withSite(x) { _.fullOuterJoin(y) } override final def mapValues[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(f: B => C): RDD[(A, C)] = withSite(x) { _.mapValues(f) } override final def flatMapValues[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(f: B => TraversableOnce[C]): RDD[(A, C)] = withSite(x) { _.flatMapValues(f) } override final def reduceByKey[A: ClassTag, B: ClassTag](xs: RDD[(A, B)])( f: (B, B) => B): RDD[(A, B)] = withSite(xs) { _.reduceByKey(f) } override final def foldByKey[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)])(zeroValue: B, f: (B, B) => B): RDD[(A, B)] = withSite(xs) { _.foldByKey(zeroValue)(f) } override final def aggregateByKey[A: ClassTag, B: ClassTag, C: ClassTag]( xs: RDD[(A, B)])(zeroValue: C)(seqOp: (C, B) => C, combOp: (C, C) => C): RDD[(A, C)] = withSite(xs) { _.aggregateByKey(zeroValue)(seqOp, combOp) } override final def combineByKey[A: ClassTag, B: ClassTag, C: ClassTag]( xs: RDD[(A, B)])(createCombiner: B => C)( mergeValue: (C, B) => C, mergeCombiners: (C, C) => C): RDD[(A, C)] = withSite(xs) { _.combineByKey(createCombiner, mergeValue, mergeCombiners) } override final def countByKey[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)]): Map[A, Long] = withSite(xs) { _.countByKey() } override final def collectAsMap[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)]): Map[A, B] = withSite(xs) { _.collectAsMap() } override final def partitionBy[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)])(partitioner: Partitioner): RDD[(A, B)] = withSite(xs) { _.partitionBy(partitioner) } }
Example 41
Source File: PairSyntax.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.syntax import com.danielwestheide.kontextfrei.DCollectionOps import org.apache.spark.Partitioner import scala.collection.Map import scala.reflect.ClassTag class PairSyntax[DCollection[_], A: ClassTag, B: ClassTag]( val self: DCollectionOps[DCollection], val coll: DCollection[(A, B)]) { final def keys: DCollection[A] = self.keys(coll) final def values: DCollection[B] = self.values(coll) final def cogroup[C: ClassTag](other: DCollection[(A, C)]) : DCollection[(A, (Iterable[B], Iterable[C]))] = self.cogroup(coll)(other) final def leftOuterJoin[C: ClassTag]( other: DCollection[(A, C)]): DCollection[(A, (B, Option[C]))] = self.leftOuterJoin(coll)(other) final def rightOuterJoin[C: ClassTag]( other: DCollection[(A, C)]): DCollection[(A, (Option[B], C))] = self.rightOuterJoin(coll)(other) final def fullOuterJoin[C: ClassTag]( other: DCollection[(A, C)]): DCollection[(A, (Option[B], Option[C]))] = self.fullOuterJoin(coll)(other) final def mapValues[C: ClassTag](f: B => C): DCollection[(A, C)] = self.mapValues(coll)(f) final def flatMapValues[C: ClassTag]( f: B => TraversableOnce[C]): DCollection[(A, C)] = self.flatMapValues(coll)(f) final def reduceByKey(f: (B, B) => B): DCollection[(A, B)] = self.reduceByKey(coll)(f) final def foldByKey(zeroValue: B)(f: (B, B) => B): DCollection[(A, B)] = self.foldByKey(coll)(zeroValue, f) final def aggregateByKey[C: ClassTag](zeroValue: C)( seqOp: (C, B) => C, combOp: (C, C) => C): DCollection[(A, C)] = self.aggregateByKey(coll)(zeroValue)(seqOp, combOp) final def combineByKey[C: ClassTag]( createCombiner: B => C, mergeValue: (C, B) => C, mergeCombiners: (C, C) => C): DCollection[(A, C)] = self.combineByKey(coll)(createCombiner)(mergeValue, mergeCombiners) final def countByKey(): Map[A, Long] = self.countByKey(coll) final def collectAsMap(): Map[A, B] = self.collectAsMap(coll) final def partitionBy(partitioner: Partitioner): DCollection[(A, B)] = self.partitionBy(coll)(partitioner) }
Example 42
Source File: OrderedSyntax.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.syntax import com.danielwestheide.kontextfrei.DCollectionOps import org.apache.spark.Partitioner import scala.reflect.ClassTag class OrderedSyntax[DCollection[_], A: ClassTag: Ordering, B: ClassTag]( val self: DCollectionOps[DCollection], val coll: DCollection[(A, B)]) { final def sortByKey(ascending: Boolean): DCollection[(A, B)] = self.sortByKey(coll)(ascending) final def sortByKey(): DCollection[(A, B)] = self.sortByKey(coll)(ascending = true) final def sortByKey(ascending: Boolean = true, numPartitions: Int): DCollection[(A, B)] = self.sortByKeyWithNumPartitions(coll)(ascending, numPartitions) final def filterByRange(lower: A, upper: A): DCollection[(A, B)] = self.filterByRange(coll)(lower, upper) final def repartitionAndSortWithinPartitions( partitioner: Partitioner): DCollection[(A, B)] = self.repartitionAndSortWithinPartitions(coll)(partitioner) }
Example 43
Source File: StreamOrderedFunctions.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.stream import com.danielwestheide.kontextfrei.DCollectionOrderedFunctions import org.apache.spark.Partitioner import scala.reflect.ClassTag private[kontextfrei] trait StreamOrderedFunctions extends DCollectionOrderedFunctions[Stream] { import Ordering.Implicits._ override final def sortByKey[A: ClassTag: Ordering, B: ClassTag]( x: Stream[(A, B)])(ascending: Boolean): Stream[(A, B)] = x.sortBy(_._1)(ordering(ascending)) override final def sortByKeyWithNumPartitions[A: ClassTag: Ordering, B: ClassTag](x: Stream[(A, B)])( ascending: Boolean, numPartitions: Int): Stream[(A, B)] = x.sortBy(_._1)(ordering(ascending)) override final def filterByRange[A: ClassTag: Ordering, B: ClassTag]( x: Stream[(A, B)])(lower: A, upper: A): Stream[(A, B)] = x.filter(e => e._1 >= lower && e._1 <= upper) override def repartitionAndSortWithinPartitions[A: ClassTag: Ordering, B: ClassTag]( x: Stream[(A, B)])(partitioner: Partitioner): Stream[(A, B)] = x.sortBy(_._1)(ordering(ascending = true)) private def ordering[A](ascending: Boolean)( implicit ev: Ordering[A]): Ordering[A] = if (ascending) ev else ev.reverse }
Example 44
Source File: HBasePartitioner.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import java.io.{IOException, ObjectInputStream, ObjectOutputStream} import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.serializer.JavaSerializer import org.apache.spark.util.{CollectionsUtils, Utils} import org.apache.spark.{Partitioner, SparkEnv} object HBasePartitioner { implicit object HBaseRawOrdering extends Ordering[HBaseRawType] { def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b) } } class HBasePartitioner (var splitKeys: Array[HBaseRawType]) extends Partitioner { import HBasePartitioner.HBaseRawOrdering type t = HBaseRawType lazy private val len = splitKeys.length // For pre-split table splitKeys(0) = bytes[0], to remove it, // otherwise partition 0 always be empty and // we will miss the last region's date when bulk load lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail def numPartitions = if (len == 0) 1 else len @transient private val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t] def getPartition(key: Any): Int = { val k = key.asInstanceOf[t] var partition = 0 if (len <= 128 && len > 0) { // If we have less than 128 partitions naive search val ordering = implicitly[Ordering[t]] while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) { partition += 1 } } else { // Determine which binary search method to use only once. partition = binarySearch(realSplitKeys, k) // binarySearch either returns the match location or -[insertion point]-1 if (partition < 0) { partition = -partition - 1 } if (partition > realSplitKeys.length) { partition = realSplitKeys.length } } partition } override def equals(other: Any): Boolean = other match { case r: HBasePartitioner => r.splitKeys.sameElements(splitKeys) case _ => false } override def hashCode(): Int = { val prime = 31 var result = 1 var i = 0 while (i < splitKeys.length) { result = prime * result + splitKeys(i).hashCode i += 1 } result = prime * result result } }
Example 45
Source File: BulkLoadPartitioner.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.loader.spark

import java.util
import java.util.Comparator

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner

class BulkLoadPartitioner(startKeys: Array[Array[Byte]]) extends Partitioner {

  override def numPartitions: Int = startKeys.length

  override def getPartition(key: Any): Int = {
    val rowKey: Array[Byte] = key match {
      case qualifier: KeyFamilyQualifier => qualifier.rowKey
      case _ => key.asInstanceOf[Array[Byte]]
    }

    val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] {
      override def compare(o1: Array[Byte], o2: Array[Byte]): Int = {
        Bytes.compareTo(o1, o2)
      }
    }
    val partition = util.Arrays.binarySearch(startKeys, rowKey, comparator)
    if (partition < 0) partition * -1 + -2
    else partition
  }
}
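A hedged usage sketch with hypothetical region start keys: Arrays.binarySearch returns -(insertionPoint) - 1 for a missing key, and the expression partition * -1 + -2 maps that back to the index of the region whose start key precedes the row key.

import org.apache.hadoop.hbase.util.Bytes

val startKeys: Array[Array[Byte]] = Array(Array.emptyByteArray, Bytes.toBytes("m"))
val bulkLoadPartitioner = new BulkLoadPartitioner(startKeys)
bulkLoadPartitioner.getPartition(Bytes.toBytes("c"))   // 0, before start key "m"
bulkLoadPartitioner.getPartition(Bytes.toBytes("x"))   // 1, at or after "m"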
Example 46
Source File: TiRegionPartitioner.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.write

import java.util

import com.pingcap.tikv.key.Key
import com.pingcap.tikv.region.TiRegion
import org.apache.spark.Partitioner

class TiRegionPartitioner(regions: util.List[TiRegion], writeConcurrency: Int) extends Partitioner {
  override def getPartition(key: Any): Int = {
    val serializableKey = key.asInstanceOf[SerializableKey]
    val rawKey = Key.toRawKey(serializableKey.bytes)

    binarySearch(rawKey) % numPartitions
  }

  def binarySearch(key: Key): Int = {
    if (regions.get(0).contains(key)) {
      return 0
    }
    var l = 0
    var r = regions.size()
    while (l < r) {
      val mid = l + (r - l) / 2
      val region = regions.get(mid)
      if (Key.toRawKey(region.getEndKey).compareTo(key) <= 0) {
        l = mid + 1
      } else {
        r = mid
      }
    }
    assert(regions.get(l).contains(key))
    l
  }

  override def numPartitions: Int = if (writeConcurrency <= 0) regions.size() else writeConcurrency
}
Example 47
Source File: ColumnPartitioner.scala From MatRel with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.matfast.partitioner import org.apache.spark.{Partitioner, SparkConf} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.matfast.util.MatfastSerializer // scalastyle:off class ColumnPartitioner(partitions: Int) extends Partitioner{ require(partitions >= 0, s"Number of partitions cannot be negative but found $partitions") override val numPartitions = partitions override def getPartition(key: Any): Int = { key match { case (i: Int, j: Int) => j % partitions case (i: Int, j: Int, _: Int) => j % partitions case _ => throw new IllegalArgumentException(s"Unrecognized key: $key") } } override def equals(other: Any): Boolean = { other.isInstanceOf[ColumnPartitioner] && numPartitions == other.asInstanceOf[ColumnPartitioner].numPartitions } override def hashCode(): Int = { com.google.common.base.Objects.hashCode(partitions: java.lang.Integer) } } // scalastyle:on object ColumnPartitioner { def apply(origin: RDD[InternalRow], numPartitions: Int): RDD[((Int, Int), InternalRow)] = { val rdd = origin.map { row => val rid = row.getInt(0) val cid = row.getInt(1) val matrix = row.getStruct(2, 7) ((rid, cid), matrix) } val partitioner = new ColumnPartitioner(numPartitions) val shuffled = new ShuffledRDD[(Int, Int), InternalRow, InternalRow](rdd, partitioner) shuffled.setSerializer(new MatfastSerializer(new SparkConf(false))) shuffled } }
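A quick sketch of the routing rule: every block keyed by (rowBlockId, colBlockId) goes to partition colBlockId % numPartitions, so blocks in the same block column are co-located regardless of their row index.

val columnPartitioner = new ColumnPartitioner(4)
columnPartitioner.getPartition((0, 5))   // 1
columnPartitioner.getPartition((3, 5))   // 1, same column block, same partition
columnPartitioner.getPartition((3, 6))   // 2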
Example 48
Source File: BlockCyclicPartitioner.scala From MatRel with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.matfast.partitioner import org.apache.spark.{Partitioner, SparkConf} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.matfast.util.MatfastSerializer class BlockCyclicPartitioner(val ROW_BLKS: Int, val COL_BLKS: Int, val ROW_BLKS_PER_PARTITION: Int, val COL_BLKS_PER_PARTITION: Int) extends Partitioner{ require(ROW_BLKS > 0, s"Number of row blocks should be larger than 0, but found $ROW_BLKS") require(COL_BLKS > 0, s"Number of col blocks should be larger than 0, but found $COL_BLKS") require(ROW_BLKS_PER_PARTITION > 0, s"Number of row blocks per partition should be larger than 0, " + s"but found $ROW_BLKS_PER_PARTITION") require(COL_BLKS_PER_PARTITION > 0, s"Number of col blocks per partition should be larger than 0, " + s"but found $COL_BLKS_PER_PARTITION") private val row_partition_num = math.ceil(ROW_BLKS * 1.0 / ROW_BLKS_PER_PARTITION).toInt private val col_partition_num = math.ceil(COL_BLKS * 1.0 / COL_BLKS_PER_PARTITION).toInt private val num_row_part = ROW_BLKS / row_partition_num private val num_col_part = COL_BLKS / col_partition_num override val numPartitions: Int = row_partition_num * col_partition_num override def getPartition(key: Any): Int = { key match { case (i: Int, j : Int) => ((i % num_row_part) * col_partition_num + (j % num_col_part)) % numPartitions case (i: Int, j: Int, _: Int) => ((i % num_row_part) * col_partition_num + (j % num_col_part)) % numPartitions case _ => throw new IllegalArgumentException(s"Unrecognized key: $key") } } override def equals(obj: Any): Boolean = { obj match { case r: BlockCyclicPartitioner => (ROW_BLKS == r.ROW_BLKS) && (COL_BLKS == r.COL_BLKS) && (ROW_BLKS_PER_PARTITION == r.ROW_BLKS_PER_PARTITION) && (COL_BLKS_PER_PARTITION == r.COL_BLKS_PER_PARTITION) case _ => false } } override def hashCode(): Int = { com.google.common.base.Objects.hashCode( ROW_BLKS: java.lang.Integer, COL_BLKS: java.lang.Integer, ROW_BLKS_PER_PARTITION: java.lang.Integer, COL_BLKS_PER_PARTITION: java.lang.Integer ) } } object BlockCyclicPartitioner { def apply(origin: RDD[InternalRow], ROW_BLKS: Int, COL_BLKS: Int, ROW_BLKS_PER_PARTITION: Int, COL_BLKS_PER_PARTITION: Int): RDD[((Int, Int), InternalRow)] = { val rdd = origin.map { row => val rid = row.getInt(0) val cid = row.getInt(1) val matrix = row.getStruct(2, 7) ((rid, cid), matrix) } val partitioner = new BlockCyclicPartitioner(ROW_BLKS, COL_BLKS, ROW_BLKS_PER_PARTITION, COL_BLKS_PER_PARTITION) val shuffled = new ShuffledRDD[(Int, Int), InternalRow, InternalRow](rdd, partitioner) shuffled.setSerializer(new MatfastSerializer(new SparkConf(false))) shuffled } }
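A small worked example, assuming a 4 x 4 grid of blocks with 2 blocks per partition in each dimension: the partitioner forms a 2 x 2 grid of partitions and cycles blocks onto it.

val blockCyclic = new BlockCyclicPartitioner(
  ROW_BLKS = 4, COL_BLKS = 4, ROW_BLKS_PER_PARTITION = 2, COL_BLKS_PER_PARTITION = 2)
blockCyclic.numPartitions          // 4
blockCyclic.getPartition((0, 0))   // 0
blockCyclic.getPartition((1, 2))   // 2
blockCyclic.getPartition((3, 3))   // 3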
Example 49
Source File: IndexPartitioner.scala From MatRel with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.matfast.partitioner

import org.apache.spark.Partitioner

// scalastyle:off
class IndexPartitioner(partitions: Int) extends Partitioner {
  require(partitions >= 0, s"Number of partitions cannot be negative but found $partitions")

  override val numPartitions: Int = partitions

  override def getPartition(key: Any): Int = {
    key match {
      case (i: Int) => i
      case _ => throw new IllegalArgumentException(s"Unrecognized key: $key")
    }
  }

  override def equals(other: Any): Boolean = {
    other.isInstanceOf[IndexPartitioner] &&
      numPartitions == other.asInstanceOf[IndexPartitioner].numPartitions
  }

  override def hashCode(): Int = {
    com.google.common.base.Objects.hashCode(partitions: java.lang.Integer)
  }
}
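A minimal sketch of the contract: the integer key is used directly as the partition index, so callers must keep keys inside [0, numPartitions).

val indexPartitioner = new IndexPartitioner(3)
indexPartitioner.getPartition(2)   // 2
indexPartitioner.getPartition(0)   // 0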
Example 50
Source File: RowPartitioner.scala From MatRel with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.matfast.partitioner import org.apache.spark.{Partitioner, SparkConf} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.matfast.util.MatfastSerializer // scalastyle:off class RowPartitioner(partitions: Int) extends Partitioner{ require(partitions >= 0, s"Number of partitions cannot be negative but found $partitions") override val numPartitions = partitions override def getPartition(key: Any): Int = { key match { case (i: Int, j: Int) => i % partitions case (i: Int, j: Int, _: Int) => i % partitions case _ => throw new IllegalArgumentException(s"Unrecognized key: $key") } } override def equals(other: Any): Boolean = { other.isInstanceOf[RowPartitioner] && numPartitions == other.asInstanceOf[RowPartitioner].numPartitions } override def hashCode(): Int = { com.google.common.base.Objects.hashCode(partitions: java.lang.Integer) } } object RowPartitioner { def apply(origin: RDD[InternalRow], numPartitions: Int): RDD[((Int, Int), InternalRow)] = { val rdd = origin.map { row => val rid = row.getInt(0) val cid = row.getInt(1) val matrix = row.getStruct(2, 7) ((rid, cid), matrix) } val partitioner = new RowPartitioner(numPartitions) val shuffled = new ShuffledRDD[(Int, Int), InternalRow, InternalRow](rdd, partitioner) shuffled.setSerializer(new MatfastSerializer(new SparkConf(false))) shuffled } }
Example 51
Source File: OrderedRDDFunctions.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
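The user-facing effect, as a sketch: after sortByKey the RDD carries a RangePartitioner, so filterByRange can prune partitions that cannot contain keys in [lower, upper] before filtering the rest. Assumes an existing SparkContext sc.

val sorted = sc.parallelize(Seq(5 -> "e", 1 -> "a", 9 -> "i")).sortByKey()
sorted.filterByRange(2, 8).collect()   // Array((5, "e")); partitions outside the range are skipped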
Example 52
Source File: SubtractedRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.{HashMap => JHashMap} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.Dependency import org.apache.spark.OneToOneDependency import org.apache.spark.Partition import org.apache.spark.Partitioner import org.apache.spark.ShuffleDependency import org.apache.spark.SparkEnv import org.apache.spark.TaskContext private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag]( @transient var rdd1: RDD[_ <: Product2[K, V]], @transient var rdd2: RDD[_ <: Product2[K, W]], part: Partitioner) extends RDD[(K, V)](rdd1.context, Nil) { override def getDependencies: Seq[Dependency[_]] = { def rddDependency[T1: ClassTag, T2: ClassTag](rdd: RDD[_ <: Product2[T1, T2]]) : Dependency[_] = { if (rdd.partitioner == Some(part)) { logDebug("Adding one-to-one dependency with " + rdd) new OneToOneDependency(rdd) } else { logDebug("Adding shuffle dependency with " + rdd) new ShuffleDependency[T1, T2, Any](rdd, part) } } Seq(rddDependency[K, V](rdd1), rddDependency[K, W](rdd2)) } override def getPartitions: Array[Partition] = { val array = new Array[Partition](part.numPartitions) for (i <- 0 until array.length) { // Each CoGroupPartition will depend on rdd1 and rdd2 array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) => dependencies(j) match { case s: ShuffleDependency[_, _, _] => None case _ => Some(new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i))) } }.toArray) } array } override val partitioner = Some(part) override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = { val partition = p.asInstanceOf[CoGroupPartition] val map = new JHashMap[K, ArrayBuffer[V]] def getSeq(k: K): ArrayBuffer[V] = { val seq = map.get(k) if (seq != null) { seq } else { val seq = new ArrayBuffer[V]() map.put(k, seq) seq } } def integrate(depNum: Int, op: Product2[K, V] => Unit): Unit = { dependencies(depNum) match { case oneToOneDependency: OneToOneDependency[_] => val dependencyPartition = partition.narrowDeps(depNum).get.split oneToOneDependency.rdd.iterator(dependencyPartition, context) .asInstanceOf[Iterator[Product2[K, V]]].foreach(op) case shuffleDependency: ShuffleDependency[_, _, _] => val iter = SparkEnv.get.shuffleManager .getReader( shuffleDependency.shuffleHandle, partition.index, partition.index + 1, context) .read() iter.foreach(op) } } // the first dep is rdd1; add all values to the map integrate(0, t => getSeq(t._1) += t._2) // the second dep is rdd2; remove all of its keys integrate(1, t => map.remove(t._1)) map.asScala.iterator.map(t => t._2.iterator.map((t._1, _))).flatten } override def clearDependencies() { super.clearDependencies() rdd1 = null rdd2 = null } }
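SubtractedRDD is the machinery behind subtractByKey on pair RDDs; a sketch of the user-facing call (assumes an existing SparkContext sc):

val current  = sc.parallelize(Seq("a" -> 1, "b" -> 2, "c" -> 3))
val existing = sc.parallelize(Seq("b" -> 0))
current.subtractByKey(existing).collect()   // keeps keys "a" and "c" only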
Example 53
Source File: PythonPartitioner.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import org.apache.spark.Partitioner
import org.apache.spark.util.Utils

private[spark] class PythonPartitioner(
    override val numPartitions: Int,
    val pyPartitionFunctionId: Long)
  extends Partitioner {

  override def getPartition(key: Any): Int = key match {
    case null => 0
    // we don't trust the Python partition function to return valid partition ID's so
    // let's do a modulo numPartitions in any case
    case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions)
    case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions)
  }

  override def equals(other: Any): Boolean = other match {
    case h: PythonPartitioner =>
      h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId
    case _ =>
      false
  }

  override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode
}
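A sketch of the routing rules (the class is private[spark], so this assumes calling code compiled inside an org.apache.spark package): Long keys are taken modulo numPartitions, null maps to 0, and everything else falls back to hashCode.

val pythonPartitioner = new PythonPartitioner(numPartitions = 8, pyPartitionFunctionId = 42L)
pythonPartitioner.getPartition(10L)    // 2
pythonPartitioner.getPartition(null)   // 0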
Example 54
Source File: ShuffledDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
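ShuffledDStream is what combineByKey on a pair DStream constructs under the hood; a minimal sketch of that user-facing path, assuming an existing input DStream of lines:

import org.apache.spark.HashPartitioner
import org.apache.spark.streaming.dstream.DStream

def wordCounts(lines: DStream[String]): DStream[(String, Int)] =
  lines.flatMap(_.split(" "))
    .map(word => (word, 1))
    .combineByKey[Int](v => v, _ + _, _ + _, new HashPartitioner(2))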
Example 55
Source File: CustomPartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_3 import com.tomekl007.UserTransaction import org.apache.spark.sql.SparkSession import org.apache.spark.{Partitioner, SparkContext} import org.scalatest.FunSuite import org.scalatest.Matchers._ class CustomPartitioner extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use custom partitioner") { //given val numberOfExecutors = 2 val data = spark .parallelize(List( UserTransaction("a", 100), UserTransaction("b", 101), UserTransaction("a", 202), UserTransaction("b", 1), UserTransaction("c", 55) ) ).keyBy(_.userId) .partitionBy(new Partitioner { override def numPartitions: Int = numberOfExecutors override def getPartition(key: Any): Int = { key.hashCode % numberOfExecutors } }) println(data.partitions.length) //when val res = data.mapPartitions[Long](iter => iter.map(_._2).map(_.amount) ).collect().toList //then res should contain theSameElementsAs List(55, 100, 202, 101, 1) } }
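Note that key.hashCode % numberOfExecutors in the anonymous partitioner above can go negative for keys with negative hash codes; a sketch of a safer variant keeps the result in [0, numPartitions):

import org.apache.spark.Partitioner

class SafeHashPartitioner(override val numPartitions: Int) extends Partitioner {
  // floorMod never returns a negative index, unlike the plain % operator
  override def getPartition(key: Any): Int =
    if (key == null) 0 else Math.floorMod(key.hashCode, numPartitions)
}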
Example 56
Source File: AppleCustomPartitioner.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.partitioning

import java.util.Random

import org.apache.spark.Partitioner

class AppleCustomPartitioner(numOfParts: Int) extends Partitioner {

  override def numPartitions: Int = numOfParts

  def random = new Random()

  override def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[(String, Long)]
    val ticker = k._1
    if (ticker.equals("apple")) {
      val saltedTicker = ticker + random.nextInt(9)
      Math.abs(saltedTicker.hashCode) % numPartitions
    } else {
      Math.abs(ticker.hashCode) % numPartitions
    }
  }
}
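A usage sketch with made-up trade data: salting the hot "apple" key spreads it over several partitions instead of funnelling every apple record through one task. Assumes an existing SparkContext sc.

val trades = sc.parallelize(Seq(
  (("apple", 1L), 100.0), (("apple", 2L), 101.0), (("ibm", 1L), 50.0)))
val byTicker = trades.partitionBy(new AppleCustomPartitioner(4))
byTicker.getNumPartitions   // 4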
Example 57
Source File: SubtractedRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.Dependency import org.apache.spark.OneToOneDependency import org.apache.spark.Partition import org.apache.spark.Partitioner import org.apache.spark.ShuffleDependency import org.apache.spark.SparkEnv import org.apache.spark.TaskContext import org.apache.spark.serializer.Serializer def setSerializer(serializer: Serializer): SubtractedRDD[K, V, W] = { this.serializer = Option(serializer) this } override def getDependencies: Seq[Dependency[_]] = { Seq(rdd1, rdd2).map { rdd => if (rdd.partitioner == Some(part)) { logDebug("Adding one-to-one dependency with " + rdd) new OneToOneDependency(rdd) } else { logDebug("Adding shuffle dependency with " + rdd) new ShuffleDependency(rdd, part, serializer) } } } override def getPartitions: Array[Partition] = { val array = new Array[Partition](part.numPartitions) for (i <- 0 until array.size) { // Each CoGroupPartition will depend on rdd1 and rdd2 array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) => dependencies(j) match { case s: ShuffleDependency[_, _, _] => new ShuffleCoGroupSplitDep(s.shuffleHandle) case _ => new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i)) } }.toArray) } array } override val partitioner = Some(part) override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = { val partition = p.asInstanceOf[CoGroupPartition] val map = new JHashMap[K, ArrayBuffer[V]] def getSeq(k: K): ArrayBuffer[V] = { val seq = map.get(k) if (seq != null) { seq } else { val seq = new ArrayBuffer[V]() map.put(k, seq) seq } } def integrate(dep: CoGroupSplitDep, op: Product2[K, V] => Unit) = dep match { case NarrowCoGroupSplitDep(rdd, _, itsSplit) => rdd.iterator(itsSplit, context).asInstanceOf[Iterator[Product2[K, V]]].foreach(op) case ShuffleCoGroupSplitDep(handle) => val iter = SparkEnv.get.shuffleManager .getReader(handle, partition.index, partition.index + 1, context) .read() iter.foreach(op) } // the first dep is rdd1; add all values to the map integrate(partition.deps(0), t => getSeq(t._1) += t._2) // the second dep is rdd2; remove all of its keys integrate(partition.deps(1), t => map.remove(t._1)) map.iterator.map { t => t._2.iterator.map { (t._1, _) } }.flatten } override def clearDependencies() { super.clearDependencies() rdd1 = null rdd2 = null } }
Example 58
Source File: PythonPartitioner.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import org.apache.spark.Partitioner import org.apache.spark.util.Utils private[spark] class PythonPartitioner( override val numPartitions: Int, val pyPartitionFunctionId: Long) extends Partitioner { override def getPartition(key: Any): Int = key match { case null => 0 // we don't trust the Python partition function to return valid partition ID's so // let's do a modulo numPartitions in any case case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions) case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions) } override def equals(other: Any): Boolean = other match { case h: PythonPartitioner => h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId case _ => false } override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode }
Example 59
Source File: OrderedRDDFunctions.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
Example 60
Source File: PythonPartitioner.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import org.apache.spark.Partitioner import org.apache.spark.util.Utils private[spark] class PythonPartitioner( override val numPartitions: Int, val pyPartitionFunctionId: Long) extends Partitioner { override def getPartition(key: Any): Int = key match { case null => 0 // we don't trust the Python partition function to return valid partition ID's so // let's do a modulo numPartitions in any case case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions) case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions) } override def equals(other: Any): Boolean = other match { case h: PythonPartitioner => h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId case _ => false } override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode }
Example 61
Source File: ShuffledDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 62
Source File: CustomRangePartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_5 import com.tomekl007.UserTransaction import org.apache.spark.sql.SparkSession import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkContext} import org.scalatest.FunSuite class CustomRangePartitionerTest extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use custom range partitioner") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) val keyed = data.keyBy(_.amount) //when, then val partitioned = keyed.partitionBy(new CustomRangePartitioner(List((0,100), (100, 10000), (10000, 1000000)))) //then partitioned.collect().toList } } class CustomRangePartitioner(ranges: List[(Int,Int)]) extends Partitioner{ override def numPartitions: Int = ranges.size override def getPartition(key: Any): Int = { if(!key.isInstanceOf[Int]){ throw new IllegalArgumentException("partitioner works only for Int type") } val keyInt = key.asInstanceOf[Int] val index = ranges.lastIndexWhere(v => keyInt >= v._1 && keyInt <= v._2) println(s"for key: $key return $index") index } }
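One caveat in the partitioner above: lastIndexWhere returns -1 for a key that falls outside every range, and Spark rejects negative partition ids at shuffle time. A hypothetical guarded variant (sketch) fails fast with a clearer message:

import org.apache.spark.Partitioner

class GuardedRangePartitioner(ranges: List[(Int, Int)]) extends Partitioner {
  override def numPartitions: Int = ranges.size
  override def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[Int]
    val index = ranges.lastIndexWhere { case (lo, hi) => k >= lo && k <= hi }
    require(index >= 0, s"key $k does not fall into any configured range")
    index
  }
}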
Example 63
Source File: SavePlainText.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_4 import java.io.File import com.tomekl007.UserTransaction import org.apache.spark.sql.SparkSession import org.apache.spark.{Partitioner, SparkContext} import org.scalatest.{BeforeAndAfterEach, FunSuite} import org.scalatest.Matchers._ import scala.reflect.io.Path class SavePlainText extends FunSuite with BeforeAndAfterEach{ val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext private val FileName = "transactions.txt" override def afterEach() { val path = Path (FileName) path.deleteRecursively() } test("should save and load in plain text") { //given val rdd = spark.makeRDD(List(UserTransaction("a", 100), UserTransaction("b", 200))) //when rdd.coalesce(1).saveAsTextFile(FileName) val fromFile = spark.textFile(FileName) fromFile.collect().toList should contain theSameElementsAs List( "UserTransaction(a,100)", "UserTransaction(b,200)" //note - this is string! ) } }
Example 64
Source File: cogroup.scala From spark-tools with Apache License 2.0 | 5 votes |
package io.univalence.plumbus import org.apache.spark.Partitioner import org.apache.spark.rdd.{ CoGroupedRDD, RDD } import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{ ArrayType, StructField } import org.apache.spark.sql.{ types, DataFrame, Dataset, Encoder, KeyValueGroupedDataset, Row } import scala.reflect.ClassTag import scala.util.Try object cogroup { implicit class KVGD[K, A](val kvgd: KeyValueGroupedDataset[K, A]) { def cogroup[B](right: KeyValueGroupedDataset[K, B]): Dataset[(K, Seq[A], Seq[B])] = //Use SparkAddOn ? ??? } def apply[A, B, K](left: Dataset[A], right: Dataset[B])(keyLeft: A => K, keyRight: B => K)( implicit encA: Encoder[A], encB: Encoder[B], encC: Encoder[K], enc: Encoder[(K, Seq[A], Seq[B])], ca: ClassTag[A], ck: ClassTag[K], cb: ClassTag[B] ): Dataset[(K, Seq[A], Seq[B])] = left.sparkSession.implicits .rddToDatasetHolder( RDD .rddToPairRDDFunctions(left.rdd.keyBy(keyLeft)) .cogroup(right.rdd.keyBy(keyRight)) .map({ case (k, (ia, ib)) => (k, ia.toSeq, ib.toSeq) }) ) .toDS def cogroupDf(group: DataFrame, namedSubGroup: (String, DataFrame)*)( byKey: String, partitioner: Partitioner = Partitioner.defaultPartitioner(group.rdd, namedSubGroup.map(_._2.rdd): _*) ): Try[DataFrame] = Try { val subGroup: Seq[DataFrame] = namedSubGroup.map(_._2) val allFrames: Seq[DataFrame] = group +: subGroup val allFramesKeyed: Seq[RDD[(String, Row)]] = allFrames.map(df => { val idx = df.columns.indexOf(byKey) df.rdd.keyBy(_.get(idx).toString) }) val cogroupRdd: CoGroupedRDD[String] = new CoGroupedRDD[String](allFramesKeyed, partitioner) val rowRdd: RDD[Row] = cogroupRdd.map(x => { val rows: Array[Seq[Row]] = x._2.asInstanceOf[Array[Iterable[Row]]].map(_.toSeq) val seq = rows.head.head.toSeq ++ rows.tail new GenericRowWithSchema(seq.toArray, null).asInstanceOf[Row] }) val schema = types.StructType( group.schema.fields ++ namedSubGroup.map { case (name, df) => StructField(name, ArrayType(df.schema)) } ) group.sparkSession.createDataFrame(rowRdd, schema) } }
Example 65
Source File: RPCContinuousShuffleWriter.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous.shuffle import scala.concurrent.Future import scala.concurrent.duration.Duration import org.apache.spark.Partitioner import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.util.ThreadUtils class RPCContinuousShuffleWriter( writerId: Int, outputPartitioner: Partitioner, endpoints: Array[RpcEndpointRef]) extends ContinuousShuffleWriter { if (outputPartitioner.numPartitions != 1) { throw new IllegalArgumentException("multiple readers not yet supported") } if (outputPartitioner.numPartitions != endpoints.length) { throw new IllegalArgumentException(s"partitioner size ${outputPartitioner.numPartitions} did " + s"not match endpoint count ${endpoints.length}") } def write(epoch: Iterator[UnsafeRow]): Unit = { while (epoch.hasNext) { val row = epoch.next() endpoints(outputPartitioner.getPartition(row)).askSync[Unit](ReceiverRow(writerId, row)) } val futures = endpoints.map(_.ask[Unit](ReceiverEpochMarker(writerId))).toSeq implicit val ec = ThreadUtils.sameThread ThreadUtils.awaitResult(Future.sequence(futures), Duration.Inf) } }
Example 66
Source File: VOrderedRDDFunctionsSuite.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.rdd.VOrderedRDDFunctions._ import org.apache.spark.{Partitioner, SparkFunSuite} import org.apache.spark.mllib.util.MLlibTestSparkContext class VOrderedRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { override def beforeAll(): Unit = { super.beforeAll() } test("testGroupByKeyUsingSort") { val rdd: RDD[(Int, Int)] = sc.parallelize(Seq((1, 4), (1, 5), (1, 8), (0, 3), (0, 6), (2, 3), (3, 2)), 3) val res = rdd.groupByKeyUsingSort(new Partitioner { override def numPartitions: Int = 3 override def getPartition(key: Any): Int = key.asInstanceOf[Int] % 3 }).mapValues(_.toList).collect() assert(res === Array( (0, List(3, 6)), (3, List(2)), (1, List(4, 5, 8)), (2, List(3)) )) } }
Example 67
Source File: VOrderedRDDFunctions.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.Partitioner import org.apache.spark.internal.Logging import org.apache.spark.util.collection.CompactBuffer import scala.reflect.ClassTag class VOrderedRDDFunctions[K, V](self: RDD[(K, V)]) (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K]) extends Logging with Serializable { def groupByKeyUsingSort(partitioner: Partitioner): RDD[(K, Iterable[V])] = { self.repartitionAndSortWithinPartitions(partitioner) .mapPartitions { (iter: Iterator[(K, V)]) => new Iterator[(K, CompactBuffer[V])] { private var firstElemInNextGroup: (K, V) = null override def hasNext: Boolean = firstElemInNextGroup != null || iter.hasNext override def next(): (K, CompactBuffer[V]) = { if (firstElemInNextGroup == null) { firstElemInNextGroup = iter.next() } val key = firstElemInNextGroup._1 val group = CompactBuffer[V](firstElemInNextGroup._2) firstElemInNextGroup = null var reachNewGroup = false while (iter.hasNext && !reachNewGroup) { val currElem = iter.next() if (currElem._1 == key) { group += currElem._2 } else { firstElemInNextGroup = currElem reachNewGroup = true } } (key, group) } } } } } private[spark] object VOrderedRDDFunctions { implicit def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)])(implicit ord: Ordering[K]): VOrderedRDDFunctions[K, V] = { new VOrderedRDDFunctions(rdd) } }
Example 68
Source File: GroupSorted.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted.api.java import java.util.{ Comparator, Iterator => JIterator } import scala.reflect.ClassTag import scala.collection.JavaConverters._ import org.apache.spark.{ Partitioner, HashPartitioner } import org.apache.spark.Partitioner.defaultPartitioner import org.apache.spark.api.java.JavaPairRDD import org.apache.spark.api.java.function.{ Function => JFunction, Function2 => JFunction2, FlatMapFunction => JFlatMapFunction } import com.tresata.spark.sorted.{ GroupSorted => SGroupSorted } object GroupSorted { private case class ComparatorOrdering[T](comparator: Comparator[T]) extends Ordering[T] { def compare(x: T, y: T) = comparator.compare(x, y) } private def comparatorToOrdering[T](comparator: Comparator[T]): Ordering[T] = new ComparatorOrdering(comparator) private def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] private implicit def ordering[K]: Ordering[K] = comparatorToOrdering(NaturalComparator.get[K]) private def groupSort[K, V](javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]): SGroupSorted[K, V] = { implicit def kClassTag: ClassTag[K] = javaPairRDD.kClassTag implicit def vClassTag: ClassTag[V] = javaPairRDD.vClassTag val valueOrdering = Option(valueComparator).map(comparatorToOrdering) SGroupSorted(javaPairRDD.rdd, partitioner, valueOrdering) } } class GroupSorted[K, V] private (sGroupSorted: SGroupSorted[K, V]) extends JavaPairRDD[K, V](sGroupSorted)(GroupSorted.fakeClassTag[K], GroupSorted.fakeClassTag[V]) { def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]) = this(GroupSorted.groupSort(javaPairRDD, partitioner, valueComparator)) def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner) = this(GroupSorted.groupSort(javaPairRDD, partitioner, null)) def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int, valueComparator: Comparator[V]) = this(javaPairRDD, if (numPartitions > 0) new HashPartitioner(numPartitions) else defaultPartitioner(javaPairRDD.rdd), valueComparator) def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int) = this(javaPairRDD, numPartitions, null) def this(javaPairRDD: JavaPairRDD[K, V], valueComparator: Comparator[V]) = this(javaPairRDD, -1, valueComparator) def this(javaPairRDD: JavaPairRDD[K, V]) = this(javaPairRDD, -1, null) import GroupSorted._ override def flatMapValues[W](f: JFlatMapFunction[V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.flatMapValues(v => f.call(v).asScala)) } override def mapValues[W](f: JFunction[V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.mapValues(v => f.call(v))) } def mapKeyValuesToValues[W](f: JFunction[Tuple2[K, V], W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.mapKeyValuesToValues(kv => f.call(kv))) } def mapStreamByKey[W](f: JFunction[JIterator[V], JIterator[W]]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.mapStreamByKey(it => f.call(it.asJava).asScala)) } def foldLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.foldLeftByKey(w)((w, v) => f.call(w, v))) } def reduceLeftByKey[W >: V](f: JFunction2[W, V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new 
GroupSorted[K, W](sGroupSorted.reduceLeftByKey(f.call)) } def scanLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.scanLeftByKey(w)((w, v) => f.call(w, v))) } }
Example 69
Source File: utils.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted

import org.apache.spark.Partitioner

case class HashOrdering[A](ord: Ordering[A]) extends Ordering[A] {
  override def compare(x: A, y: A): Int = {
    val h1 = if (x == null) 0 else x.hashCode
    val h2 = if (y == null) 0 else y.hashCode
    if (h1 < h2) -1 else if (h1 > h2) 1 else ord.compare(x, y)
  }
}

private case class KeyPartitioner(partitioner: Partitioner) extends Partitioner {
  override def numPartitions: Int = partitioner.numPartitions

  override def getPartition(key: Any): Int =
    partitioner.getPartition(key.asInstanceOf[Tuple2[Any, Any]]._1)
}
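A sketch of what KeyPartitioner is for (it is package-private, so this assumes code living in com.tresata.spark.sorted): it lets a partitioner defined on keys alone be applied to (key, value) tuples by ignoring the value component.

import org.apache.spark.HashPartitioner

val byKeyOnly = KeyPartitioner(new HashPartitioner(4))
byKeyOnly.getPartition(("user-1", 42)) == new HashPartitioner(4).getPartition("user-1")   // true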
Example 70
Source File: CustomPartitionerExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition import org.apache.log4j.{Level, Logger} import org.apache.spark.Partitioner import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CustomPartitionerExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CustomPartitionerExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val defaultPartitioned = fruits.map((_, 1)).reduceByKey(_ + _) val customPartitioned = fruits.map((_, 1)).reduceByKey( new FirstLetterPartitioner(sc.defaultParallelism), _ + _) println(s"""fruits:\n ${fruits.collect().mkString(", ")}""") println() println("partitioned by default partitioner") defaultPartitioned.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() println("partitioned by first letter partitioner") customPartitioned.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) } } private[partition] class FirstLetterPartitioner(numParts: Int) extends Partitioner { override def numPartitions: Int = numParts override def getPartition(key: Any): Int = { key.toString.charAt(0).hashCode % numPartitions match { case p if p < 0 => p + numPartitions case p => p } } override def equals(other: Any): Boolean = { other match { case p: FirstLetterPartitioner => p.numPartitions == numPartitions case _ => false } } } // scalastyle:on println
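A direct check of the routing rule (the partitioner is private[partition], so this assumes code in the same package): keys that share a first letter always land in the same partition.

val firstLetter = new FirstLetterPartitioner(4)
firstLetter.getPartition("Apple") == firstLetter.getPartition("Avocado")   // true, both route on 'A'
firstLetter.getPartition("Orange")                                         // 3 with 4 partitions ('O' = 79)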
Example 71
Source File: RoutingTablePartition.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx.impl import scala.reflect.ClassTag import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.rdd.ShuffledRDD import org.apache.spark.util.collection.{BitSet, PrimitiveVector} import org.apache.spark.graphx._ import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap import org.apache.spark.graphx.impl.RoutingTablePartition.RoutingTableMessage private[graphx] object RoutingTablePartition { def foreachWithinEdgePartition (pid: PartitionID, includeSrc: Boolean, includeDst: Boolean) (f: VertexId => Unit) { val (vidsCandidate, srcVids, dstVids) = routingTable(pid) val size = vidsCandidate.length if (includeSrc && includeDst) { // Avoid checks for performance vidsCandidate.iterator.foreach(f) } else if (!includeSrc && !includeDst) { // Do nothing } else { val relevantVids = if (includeSrc) srcVids else dstVids relevantVids.iterator.foreach { i => f(vidsCandidate(i)) } } } }