org.apache.spark.Partitioner Scala Examples
The following examples show how to use org.apache.spark.Partitioner.
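Every example below boils down to the same contract: a Partitioner subclass supplies numPartitions and a total getPartition mapping from any key to an index in [0, numPartitions). As a minimal sketch of that contract (the class name ModuloPartitioner is hypothetical and not taken from any of the projects below):

import org.apache.spark.Partitioner

// Minimal custom Partitioner: hash the key and keep the result non-negative.
class ModuloPartitioner(parts: Int) extends Partitioner {
  require(parts > 0, "number of partitions must be positive")

  override def numPartitions: Int = parts

  override def getPartition(key: Any): Int = key match {
    case null => 0
    case k =>
      val mod = k.hashCode() % parts
      if (mod < 0) mod + parts else mod  // keep the index in [0, parts)
  }

  // Partitioners should define equality so Spark can detect co-partitioned RDDs.
  override def equals(other: Any): Boolean = other match {
    case m: ModuloPartitioner => m.numPartitions == numPartitions
    case _ => false
  }

  override def hashCode(): Int = numPartitions
}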
Example 1
Source File: OrderedRDDFunctions.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi

  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {
    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) => {
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      }
      case _ =>
        self
    }

    rddToFilter.filter { case (k, v) => inRange(k) }
  }
}
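A hedged usage sketch of the filterByRange API shown above (standard Spark, reached through the pair-RDD implicits); the SparkContext name sc and the sample data are placeholders:

// Sort by key so a RangePartitioner is attached, then filter by key range.
val pairs = sc.parallelize(Seq((1, "a"), (5, "b"), (9, "c"))).sortByKey()
val middle = pairs.filterByRange(2, 8)  // keeps only the (5, "b") record
middle.collect()  // partitions outside [2, 8] are pruned rather than scanned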
Example 2
Source File: CommunityBasedPartitioning.scala From sparkling-graph with BSD 2-Clause "Simplified" License | 5 votes |
package ml.sparkling.graph.operators.partitioning import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.{CommunityDetectionAlgorithm, CommunityDetectionMethod, ComponentID} import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.DefaultPartitionOperator import org.apache.log4j.Logger import org.apache.spark.{Partitioner, SparkContext} import org.apache.spark.broadcast.Broadcast import org.apache.spark.graphx.{Graph, PartitionID, PartitionStrategy, VertexId} import scala.reflect.ClassTag object CommunityBasedPartitioning { @transient val logger=Logger.getLogger(CommunityBasedPartitioning.getClass()) def partitionGraphBy[VD:ClassTag,ED:ClassTag](graph:Graph[VD,ED],communityDetectionMethod:CommunityDetectionMethod[VD,ED],numParts:Int= -1)(implicit sc:SparkContext): Graph[VD, ED] ={ val numberOfPartitions=if (numParts== -1) sc.defaultParallelism else numParts val communities: Graph[ComponentID, ED] = communityDetectionMethod(graph) val numberOfCommunities=communities.vertices.values.countApproxDistinct() val (coarsedVertexMap,coarsedNumberOfPartitions) = ParallelPartitioningUtils.coarsePartitions(numberOfPartitions,numberOfCommunities,communities.vertices) val strategy=ByComponentIdPartitionStrategy(coarsedVertexMap,coarsedNumberOfPartitions, DefaultPartitionOperator) logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries and ${coarsedNumberOfPartitions} partitions") val out=graph.partitionBy(strategy,numberOfCommunities.toInt).cache() out.edges.foreachPartition((_)=>{}) out.vertices.foreachPartition((_)=>{}) out } def partitionGraphUsing[VD:ClassTag,ED:ClassTag](graph:Graph[VD,ED],communityDetectionMethod:CommunityDetectionAlgorithm,numParts:Int= -1)(implicit sc:SparkContext): Graph[VD, ED] ={ partitionGraphBy(graph,communityDetectionMethod.detectCommunities[VD,ED](_),numParts) } }
Example 3
Source File: Parsing.scala From meetup-stream with Apache License 2.0 | 5 votes |
package util import core._ import org.joda.time.DateTime import org.json4s.DefaultFormats import org.json4s._ import org.json4s.native.JsonMethods._ import org.joda.time.DateTime import org.apache.spark.Partitioner import org.apache.spark.streaming.Seconds import scala.util.Try object Parsing { @transient implicit val formats = DefaultFormats def parseEvent(eventJson: String):Option[Event]={ Try({ val json=parse(eventJson).camelizeKeys val event=json.extract[Event] event }).toOption } def parseRsvp(rsvpJson: String)={ Try({ val json=parse(rsvpJson).camelizeKeys val member=(json \ "member").extract[Member] val event=(json \ "event").extract[MemberEvent] val response=(json \ "response").extract[String] (member, event, response) }).toOption } }
Example 4
Source File: HBasePartitioner.scala From Spark-SQL-on-HBase with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner
import org.apache.spark.util.CollectionsUtils

object HBasePartitioner {
  implicit object HBaseRawOrdering extends Ordering[HBaseRawType] {
    def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b)
  }
}

class HBasePartitioner (var splitKeys: Array[HBaseRawType]) extends Partitioner {
  import HBasePartitioner.HBaseRawOrdering

  type t = HBaseRawType

  lazy private val len = splitKeys.length

  // For pre-split table splitKeys(0) = bytes[0], to remove it,
  // otherwise partition 0 always be empty and
  // we will miss the last region's date when bulk load
  lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail

  def numPartitions = if (len == 0) 1 else len

  @transient private lazy val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t]

  def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[t]
    var partition = 0
    if (len <= 128 && len > 0) {
      // If we have less than 128 partitions naive search
      val ordering = implicitly[Ordering[t]]
      while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) {
        partition += 1
      }
    } else {
      // Determine which binary search method to use only once.
      partition = binarySearch(realSplitKeys, k)
      // binarySearch either returns the match location or -[insertion point]-1
      if (partition < 0) {
        partition = -partition - 1
      }
      if (partition > realSplitKeys.length) {
        partition = realSplitKeys.length
      }
    }
    partition
  }

  override def equals(other: Any): Boolean = other match {
    case r: HBasePartitioner =>
      r.splitKeys.sameElements(splitKeys)
    case _ =>
      false
  }

  override def hashCode(): Int = {
    val prime = 31
    var result = 1
    var i = 0
    while (i < splitKeys.length) {
      result = prime * result + splitKeys(i).hashCode
      i += 1
    }
    result = prime * result
    result
  }
}
Example 5
Source File: HBasePartitioner.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.Partitioner import org.apache.spark.util.CollectionsUtils object HBasePartitioner { implicit object HBaseRawOrdering extends Ordering[HBaseRawType] { def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b) } } class HBasePartitioner (val splitKeys: Array[HBaseRawType]) extends Partitioner { import HBasePartitioner.HBaseRawOrdering type t = HBaseRawType lazy private val len = splitKeys.length // For pre-split table splitKeys(0) = bytes[0], to remove it, // otherwise partition 0 always be empty and // we will miss the last region's date when bulk load lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail override def numPartitions = if (len == 0) 1 else len @transient private lazy val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t] override def getPartition(key: Any): Int = { val k = key.asInstanceOf[t] var partition = 0 if (len <= 128 && len > 0) { // If we have less than 128 partitions naive search val ordering = implicitly[Ordering[t]] while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) { partition += 1 } } else { // Determine which binary search method to use only once. partition = binarySearch(realSplitKeys, k) // binarySearch either returns the match location or -[insertion point]-1 if (partition < 0) { partition = -partition - 1 } if (partition > realSplitKeys.length) { partition = realSplitKeys.length } } partition } override def equals(other: Any): Boolean = other match { case r: HBasePartitioner => r.splitKeys.sameElements(splitKeys) case _ => false } override def hashCode(): Int = { val prime = 31 var result = 1 var i = 0 while (i < splitKeys.length) { result = prime * result + splitKeys(i).hashCode i += 1 } result = prime * result result } }
Example 6
Source File: MapPartitionsWithPreparationRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Partition, Partitioner, TaskContext}

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val prepared =
      if (preparedArguments.isEmpty) {
        preparePartition()
      } else {
        preparedArguments.remove(0)
      }
    val parentIterator = firstParent[T].iterator(partition, context)
    executePartition(context, partition.index, prepared, parentIterator)
  }
}
Example 7
Source File: PythonPartitioner.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import org.apache.spark.Partitioner
import org.apache.spark.util.Utils

private[spark] class PythonPartitioner(
    override val numPartitions: Int,
    val pyPartitionFunctionId: Long)
  extends Partitioner {

  override def getPartition(key: Any): Int = key match {
    case null => 0
    // we don't trust the Python partition function to return valid partition ID's so
    // let's do a modulo numPartitions in any case
    case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions)
    case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions)
  }

  override def equals(other: Any): Boolean = other match {
    case h: PythonPartitioner =>
      h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId
    case _ =>
      false
  }

  override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode
}
Example 8
Source File: ShuffledDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
import org.apache.spark.streaming.{Duration, Time}

import scala.reflect.ClassTag

private[streaming]
class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag](
    parent: DStream[(K, V)],
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiner: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true
  ) extends DStream[(K, C)] (parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[(K, C)]] = {
    parent.getOrCompute(validTime) match {
      case Some(rdd) => Some(rdd.combineByKey[C](
          createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine))
      case None => None
    }
  }
}
Example 9
Source File: BulkLoadPartitioner.scala From SparkOnHBase with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark

import java.util
import java.util.Comparator

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner

class BulkLoadPartitioner(startKeys:Array[Array[Byte]])
  extends Partitioner {

  override def numPartitions: Int = startKeys.length

  override def getPartition(key: Any): Int = {

    val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] {
      override def compare(o1: Array[Byte], o2: Array[Byte]): Int = {
        Bytes.compareTo(o1, o2)
      }
    }

    val rowKey:Array[Byte] =
      key match {
        case qualifier: KeyFamilyQualifier =>
          qualifier.rowKey
        case wrapper: ByteArrayWrapper =>
          wrapper.value
        case _ =>
          key.asInstanceOf[Array[Byte]]
      }

    val partition = util.Arrays.binarySearch(startKeys, rowKey, comparator)
    if (partition < 0) partition * -1 + -2
    else partition
  }
}
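A hedged usage sketch (not part of the SparkOnHBase project itself): shuffling and sorting rows by region start key before writing HFiles. The names sc, startKeys, and records are placeholders, and the explicit byte-array Ordering is supplied because Scala provides none for Array[Byte] by default:

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD

implicit val bytesOrdering: Ordering[Array[Byte]] = new Ordering[Array[Byte]] {
  override def compare(a: Array[Byte], b: Array[Byte]): Int = Bytes.compareTo(a, b)
}

val records: RDD[(Array[Byte], String)] =
  sc.parallelize(Seq((Bytes.toBytes("row-001"), "v1"), (Bytes.toBytes("row-042"), "v2")))

// One output partition per region start key, with rows sorted within each partition.
val sorted = records.repartitionAndSortWithinPartitions(new BulkLoadPartitioner(startKeys))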
Example 10
Source File: ShuffledDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 11
Source File: PythonPartitioner.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import org.apache.spark.Partitioner import org.apache.spark.util.Utils private[spark] class PythonPartitioner( override val numPartitions: Int, val pyPartitionFunctionId: Long) extends Partitioner { override def getPartition(key: Any): Int = key match { case null => 0 // we don't trust the Python partition function to return valid partition ID's so // let's do a modulo numPartitions in any case case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions) case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions) } override def equals(other: Any): Boolean = other match { case h: PythonPartitioner => h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId case _ => false } override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode }
Example 12
Source File: ShuffledDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ import org.apache.spark.streaming.{Duration, Time} import scala.reflect.ClassTag private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 13
Source File: OrderedRDDFunctions.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
Example 14
Source File: PythonPartitioner.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import org.apache.spark.Partitioner import org.apache.spark.util.Utils private[spark] class PythonPartitioner( override val numPartitions: Int, val pyPartitionFunctionId: Long) extends Partitioner { override def getPartition(key: Any): Int = key match { case null => 0 // we don't trust the Python partition function to return valid partition ID's so // let's do a modulo numPartitions in any case case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions) case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions) } override def equals(other: Any): Boolean = other match { case h: PythonPartitioner => h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId case _ => false } override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode }
Example 15
Source File: ShuffledDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 16
Source File: RowPartitioner.scala From hail with MIT License | 5 votes |
package is.hail.linalg

import org.apache.spark.Partitioner

object RowPartitioner {
  def findInterval(a: Array[Long], key: Long): Int = {
    var lo = 0
    var hi = a.length - 1
    while (lo <= hi) {
      val mid = (lo + hi) >>> 1
      if (key < a(mid))
        hi = mid - 1
      else
        lo = mid + 1
    }
    lo - 1
  }
}

case class RowPartitioner(partitionStarts: Array[Long]) extends Partitioner {
  override val numPartitions: Int = partitionStarts.length - 1

  override def getPartition(key: Any): Int = key match {
    case i: Long => RowPartitioner.findInterval(partitionStarts, i)
  }
}
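The binary search above returns the index of the last partition start that is less than or equal to the key, so partition i owns the half-open row range [partitionStarts(i), partitionStarts(i + 1)). A small hypothetical check of that behavior (not from the hail test suite):

val starts = Array(0L, 4L, 10L)                          // two partitions: [0, 4) and [4, 10)
assert(RowPartitioner.findInterval(starts, 0L) == 0)
assert(RowPartitioner.findInterval(starts, 5L) == 1)
assert(RowPartitioner.findInterval(starts, -1L) == -1)   // below the first start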
Example 17
Source File: RandomEqualPartitioner.scala From ScalaNetwork with GNU General Public License v2.0 | 5 votes |
package kr.ac.kaist.ir.deep.train

import org.apache.spark.Partitioner

class RandomEqualPartitioner(val numPartition: Int) extends Partitioner {
  private var nextNumber = 0

  def refreshRandom() = {
    nextNumber += 1
  }

  override def numPartitions: Int = numPartition

  override def getPartition(key: Any): Int = {
    val i = key.asInstanceOf[Long] + nextNumber
    val remain = i % numPartition
    val quotient = ((i / numPartition) * nextNumber) % numPartition
    val hash = ((remain + quotient) % numPartition).asInstanceOf[Int]
    if (hash < 0)
      hash + numPartition
    else
      hash
  }
}
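A hedged sketch of how a partitioner like this might be driven (the names sc and data are placeholders, and zipWithIndex is used only to manufacture the Long keys the partitioner expects):

val data = sc.parallelize(1 to 100)
// (Long index, value) pairs; the index becomes the partitioning key.
val keyed = data.zipWithIndex.map { case (v, idx) => (idx, v) }
val spread = keyed.partitionBy(new RandomEqualPartitioner(8)).values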
Example 18
Source File: SuperBigWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.windowing.superbig import org.apache.log4j.{Level, Logger} import org.apache.spark.Partitioner import org.apache.spark.sql.SparkSession object SuperBigWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val jsonPath = args(0) val pageSize = args(1).toInt val spark = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .getOrCreate() val jsonDf = spark.read.json(jsonPath) import spark.implicits._ val diffDs = jsonDf.flatMap(row => { val group = row.getAs[String]("group") val time = row.getAs[Long]("time") val value = row.getAs[Long]("value") val timePage = time / pageSize if (time % pageSize == 0) { //Am I on the edge of the page Seq((timePage, (time, value)), (timePage + 1, (time, value))) } else { Seq((timePage, (time, value))) } }).groupByKey(r => r._1).flatMapGroups((k, it) => { var lastValue = 0l it.toSeq. sortBy{case (page, (time, value)) => time}. map{case (page, (time, value)) => val dif = value - lastValue lastValue = value (time, value, dif) } }) diffDs.collect().foreach(r => println(" - " + r)) spark.stop() } }
Example 19
Source File: PartitionwiseWeightedSampledRDD.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.graph.utils import java.util.Random import org.apache.spark.rdd.RDD import org.apache.spark.{Partition, Partitioner, TaskContext} import scala.reflect.ClassTag import scala.util.{Random => ScalaRandom} class PartitionwiseWeightedSampledRDDPartition(val prev: Partition, val seed: Long, val fraction: Double) extends Partition with Serializable { override val index: Int = prev.index } class PartitionwiseWeightedSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[(T, Float)], sampler: WeightedRandomSampler[T, U], fractions: Map[Int, Double], preservesPartitioning: Boolean, @transient private val seed: Long = ScalaRandom.nextLong) extends RDD[U](prev) { @transient override val partitioner: Option[Partitioner] = { if (preservesPartitioning) prev.partitioner else None } override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[(T, Float)].partitions.map { x => new PartitionwiseWeightedSampledRDDPartition(x, random.nextLong(), fractions.getOrElse(x.index, 0.0)) } } override def getPreferredLocations(split: Partition): Seq[String] = { firstParent[(T, Float)].preferredLocations( split.asInstanceOf[PartitionwiseWeightedSampledRDDPartition].prev ) } override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseWeightedSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.setFraction(split.fraction) thisSampler.sample(firstParent[(T, Float)].iterator(split.prev, context)) } }
Example 20
Source File: KeyPartitioner.scala From spark3D with Apache License 2.0 | 5 votes |
package com.astrolabsoftware.spark3d.spatialPartitioning

// Spark built-in partitioner
import org.apache.spark.Partitioner

  override def getPartition(key : Any) : Int = {
    key match {
      case i:Int => key.asInstanceOf[Int]
      case l:Long => key.asInstanceOf[Long].toInt
      case _ => throw new ClassCastException("""
        Key from KeyPartitioner must be Int or Long!
        """)
    }
  }
}
Example 21
Source File: MapDPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner import org.apache.spark.{Partitioner, SparkEnv} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.util.MutablePair object MapDPartition { def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager] def apply[T](origin: RDD[(Int, (T, InternalRow))], num_partitions: Int): RDD[(Int, (T, InternalRow))] = { val rdd = if (sortBasedShuffleOn) { origin.mapPartitions {iter => iter.map(row => (row._1, (row._2._1, row._2._2.copy())))} } else { origin.mapPartitions {iter => val mutablePair = new MutablePair[Int, (T, InternalRow)]() iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy()))) } } val part = new MapDPartitioner(num_partitions) new ShuffledRDD[Int, (T, InternalRow), (T, InternalRow)](rdd, part) } } class MapDPartitioner(num_partitions: Int) extends Partitioner { def numPartitions: Int = num_partitions def getPartition(key: Any): Int = { val k = key.asInstanceOf[Int] require(k >= 0 && k < num_partitions) k } }
Example 22
Source File: RangeDPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner import org.apache.spark.util.CollectionsUtils import org.apache.spark.{Partitioner, SparkEnv} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.util.MutablePair import scala.reflect.ClassTag object RangeDPartition { def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager] def apply[K: Ordering: ClassTag, T](origin: RDD[(K, (T, InternalRow))], range_bounds: Array[K]): RDD[(K, (T, InternalRow))] = { val rdd = if (sortBasedShuffleOn) { origin.mapPartitions {iter => iter.map(row => (row._1, (row._2._1, row._2._2.copy())))} } else { origin.mapPartitions {iter => val mutablePair = new MutablePair[K, (T, InternalRow)]() iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy()))) } } val part = new RangeDPartitioner(range_bounds, ascending = true) new ShuffledRDD[K, (T, InternalRow), (T, InternalRow)](rdd, part) } } class RangeDPartitioner[K: Ordering: ClassTag](range_bounds: Array[K], ascending: Boolean) extends Partitioner { def numPartitions: Int = range_bounds.length + 1 private val binarySearch: ((Array[K], K) => Int) = CollectionsUtils.makeBinarySearch[K] def getPartition(key: Any): Int = { val k = key.asInstanceOf[K] var partition = 0 if (range_bounds.length < 128) { while (partition < range_bounds.length && Ordering[K].gt(k, range_bounds(partition))) partition += 1 } else { partition = binarySearch(range_bounds, k) if (partition < 0) partition = -partition - 1 if (partition > range_bounds.length) partition = range_bounds.length } if (ascending) partition else range_bounds.length - partition } }
Example 23
Source File: VoronoiPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner import org.apache.spark.sql.simba.spatial.Point import org.apache.spark.{Partitioner, SparkEnv} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.util.MutablePair object VoronoiPartition { def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager] def apply(origin: RDD[(Int, (Point, InternalRow))], pivot_to_group: Array[Int], num_group: Int) : RDD[(Int, (Point, InternalRow))] = { val rdd = if (sortBasedShuffleOn) { origin.mapPartitions {iter => iter.map(row => (row._1, (row._2._1, row._2._2.copy())))} } else { origin.mapPartitions {iter => val mutablePair = new MutablePair[Int, (Point, InternalRow)]() iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy()))) } } val part = new VoronoiPartitioner(pivot_to_group, num_group) new ShuffledRDD[Int, (Point, InternalRow), (Point, InternalRow)](rdd, part) } } class VoronoiPartitioner(pivot_to_group: Array[Int], num_group: Int) extends Partitioner { override def numPartitions: Int = num_group override def getPartition(key: Any): Int = { val k = key.asInstanceOf[Int] pivot_to_group(k) } }
Example 24
Source File: HashPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner import org.apache.spark.{Partitioner, SparkEnv} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.util.MutablePair object HashPartition { def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager] def apply(origin: RDD[(Any, InternalRow)], num_partitions: Int): RDD[(Any, InternalRow)] = { val rdd = if (sortBasedShuffleOn) { origin.mapPartitions {iter => iter.map(row => (row._1, row._2.copy()))} } else { origin.mapPartitions {iter => val mutablePair = new MutablePair[Any, InternalRow]() iter.map(row => mutablePair.update(row._1, row._2.copy())) } } val part = new HashPartitioner(num_partitions) new ShuffledRDD[Any, InternalRow, InternalRow](rdd, part) } } class HashPartitioner(num_partitions: Int) extends Partitioner { override def numPartitions: Int = num_partitions override def getPartition(key: Any): Int = { key.hashCode() % num_partitions } }
Example 25
Source File: SkewJoinOperations.scala From spark-skewjoin with Apache License 2.0 | 5 votes |
package com.tresata.spark.skewjoin import java.util.{ Random => JRandom } import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.Partitioner import org.apache.spark.Partitioner.defaultPartitioner import com.twitter.algebird.{ CMS, CMSHasher, CMSMonoid } case class CMSParams(eps: Double = 0.005, delta: Double = 1e-8, seed: Int = 1) { def getCMSMonoid[K: Ordering: CMSHasher]: CMSMonoid[K] = CMS.monoid[K](eps, delta, seed) } class SkewJoinOperations[K: ClassTag: Ordering: CMSHasher, V: ClassTag](rdd: RDD[(K, V)]) extends Serializable { private def getReplicationFactors(random: JRandom, replication: Int, otherReplication: Int): Seq[(Int, Int)] = { require(replication > 0 && otherReplication > 0, "replication must be positive") val rand = random.nextInt(otherReplication) (0 until replication).map(rep => (rand, rep)) } private def createRddCMS[K](rdd: RDD[K], cmsMonoid: CMSMonoid[K]): CMS[K] = rdd.map(k => cmsMonoid.create(k)).reduce(cmsMonoid.plus(_, _)) def skewCogroup[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner, skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (Iterable[V], Iterable[W]))] = { val numPartitions = partitioner.numPartitions val broadcastedLeftCMS = rdd.sparkContext.broadcast(createRddCMS[K](rdd.keys, cmsParams.getCMSMonoid[K])) val broadcastedRightCMS = rdd.sparkContext.broadcast(createRddCMS[K](other.keys, cmsParams.getCMSMonoid[K])) val rddSkewed = rdd.mapPartitions{ it => val random = new JRandom it.flatMap{ kv => val (leftReplication, rightReplication) = skewReplication.getReplications( broadcastedLeftCMS.value.frequency(kv._1).estimate, broadcastedRightCMS.value.frequency(kv._1).estimate, numPartitions) getReplicationFactors(random, leftReplication, rightReplication).map(rl =>((kv._1, rl.swap), kv._2)) } } val otherSkewed = other.mapPartitions{ it => val random = new JRandom it.flatMap{ kv => val (leftReplication, rightReplication) = skewReplication.getReplications( broadcastedLeftCMS.value.frequency(kv._1).estimate, broadcastedRightCMS.value.frequency(kv._1).estimate, numPartitions) getReplicationFactors(random, rightReplication, leftReplication).map(lr => ((kv._1, lr), kv._2)) } } rddSkewed.cogroup(otherSkewed, partitioner).map(kv => (kv._1._1, kv._2)) } def skewCogroup[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))] = skewCogroup(other, defaultPartitioner(rdd, other)) def skewJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner, skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (V, W))] = skewCogroup(other, partitioner, skewReplication, cmsParams).flatMap{ blockPair => for (v <- blockPair._2._1.iterator; w <- blockPair._2._2.iterator) yield (blockPair._1, (v, w)) } def skewJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (V, W))] = skewJoin(other, defaultPartitioner(rdd, other)) def skewLeftOuterJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner, skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (V, Option[W]))] = skewCogroup(other, partitioner, RightReplication(skewReplication), cmsParams).flatMap{ case (k, (itv, Seq())) => itv.iterator.map(v => (k, (v, None))) case (k, (itv, itw)) => for (v <- itv; w <- itw) yield (k, (v, Some(w))) } def skewLeftOuterJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))] = skewLeftOuterJoin(other, defaultPartitioner(rdd, other)) def skewRightOuterJoin[W: ClassTag](other: 
RDD[(K, W)], partitioner: Partitioner, skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (Option[V], W))] = skewCogroup(other, partitioner, LeftReplication(skewReplication), cmsParams).flatMap{ case (k, (Seq(), itw)) => itw.iterator.map(w => (k, (None, w))) case (k, (itv, itw)) => for (v <- itv; w <- itw) yield (k, (Some(v), w)) } def skewRightOuterJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (Option[V], W))] = skewRightOuterJoin(other, defaultPartitioner(rdd, other)) } trait Dsl { implicit def rddToSkewJoinOperations_e94qoy3tnt[K: ClassTag: Ordering: CMSHasher, V: ClassTag](rdd: RDD[(K, V)]): SkewJoinOperations[K, V] = new SkewJoinOperations(rdd) implicit def rddToBlockJoinOperations_7IaIe6dkih[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): BlockJoinOperations[K, V] = new BlockJoinOperations(rdd) } object Dsl extends Dsl
Example 26
Source File: BlockJoinOperations.scala From spark-skewjoin with Apache License 2.0 | 5 votes |
package com.tresata.spark.skewjoin import java.util.{ Random => JRandom } import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.Partitioner import org.apache.spark.Partitioner.defaultPartitioner class BlockJoinOperations[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) extends Serializable { // based on blockJoinWithSmaller in scalding. See com.twitter.scalding.JoinAlgorithms private def blockCogroup[W](other: RDD[(K, W)], leftReplication: Int, rightReplication: Int, partitioner: Partitioner): RDD[((K, (Int, Int)), (Iterable[V], Iterable[W]))] = { assert(leftReplication >= 1, "must specify a positive number for left replication") assert(rightReplication >= 1, "must specify a positive number for right replication") def getReplication(random: JRandom, replication: Int, otherReplication: Int) : Seq[(Int, Int)] = { val rand = random.nextInt(otherReplication) (0 until replication).map{ rep => (rand, rep) } } val rddBlocked = rdd.mapPartitions{ it => val random = new JRandom it.flatMap{ kv => getReplication(random, leftReplication, rightReplication).map{ rl => ((kv._1, rl.swap), kv._2)} } } val otherBlocked = other.mapPartitions{ it => val random = new JRandom it.flatMap{ kv => getReplication(random, rightReplication, leftReplication).map{ lr => ((kv._1, lr), kv._2)} } } rddBlocked.cogroup(otherBlocked, partitioner) } def blockRightOuterJoin[W](other: RDD[(K, W)], leftReplication: Int): RDD[(K, (Option[V], W))] = blockRightOuterJoin(other, leftReplication, defaultPartitioner(rdd, other)) }
Example 27
Source File: BBRPartitioner.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.partitioner import scala.reflect.ClassTag import com.github.cloudml.zen.ml.clustering.LDADefines._ import com.github.cloudml.zen.ml.sampler.AliasTable import com.github.cloudml.zen.ml.util.XORShiftRandom import breeze.linalg.{SparseVector => BSV} import org.apache.spark.Partitioner import org.apache.spark.graphx2._ import org.apache.spark.graphx2.impl.GraphImpl import org.apache.spark.storage.StorageLevel private[ml] class BBRPartitioner(val partitions: Int) extends Partitioner { override def numPartitions: Int = partitions def getKey(et: EdgeTriplet[Int, _]): VertexId = { if (et.srcAttr >= et.dstAttr) et.srcId else et.dstId } def getPartition(key: Any): PartitionID = { key.asInstanceOf[PartitionID] % numPartitions } override def equals(other: Any): Boolean = other match { case bbr: BBRPartitioner => bbr.numPartitions == numPartitions case _ => false } override def hashCode: Int = numPartitions } object BBRPartitioner { private[zen] def partitionByBBR[VD: ClassTag, ED: ClassTag]( input: Graph[VD, ED], storageLevel: StorageLevel): Graph[VD, ED] = { val edges = input.edges val conf = edges.context.getConf val numPartitions = conf.getInt(cs_numPartitions, edges.partitions.length) val bbr = new BBRPartitioner(numPartitions) val degGraph = GraphImpl(input.degrees, edges) val assnGraph = degGraph.mapTriplets((pid, iter) => iter.map(et => (bbr.getKey(et), Edge(et.srcId, et.dstId, et.attr))), TripletFields.All) assnGraph.persist(storageLevel) val assnVerts = assnGraph.aggregateMessages[Long](ect => { if (ect.attr._1 == ect.srcId) { ect.sendToSrc(1L) } else { ect.sendToDst(1L) } }, _ + _, TripletFields.EdgeOnly) val (kids, koccurs) = assnVerts.filter(_._2 > 0L).collect().unzip val partRdd = edges.context.parallelize(kids.zip(rearrage(koccurs, numPartitions))) val rearrGraph = assnGraph.mapVertices((_, _) => null.asInstanceOf[AliasTable[Long]]) .joinVertices(partRdd)((_, _, arr) => AliasTable.generateAlias(arr)) val newEdges = rearrGraph.triplets.mapPartitions(iter => { val gen = new XORShiftRandom() iter.map(et => { val (kid, edge) = et.attr val table = if (kid == et.srcId) et.srcAttr else et.dstAttr (table.sampleRandom(gen), edge) }) }).partitionBy(bbr).map(_._2) GraphImpl(input.vertices, newEdges, null.asInstanceOf[VD], storageLevel, storageLevel) } private def rearrage(koccurs: IndexedSeq[Long], numPartitions: Int): IndexedSeq[BSV[Long]] = { val numKeys = koccurs.length val numEdges = koccurs.sum val npp = numEdges / numPartitions val rpn = numEdges - npp * numPartitions @inline def nrpp(pi: Int): Long = npp + (if (pi < rpn) 1L else 0L) @inline def kbn(ki: Int): Long = if (ki < numKeys) koccurs(ki) else 0L val keyPartCount = koccurs.map(t => BSV.zeros[Long](numPartitions)) def put(ki: Int, krest: Long, pi: Int, prest: Long): Unit = { if (ki < numKeys) { if (krest == prest) { keyPartCount(ki)(pi) = krest put(ki + 1, kbn(ki + 1), pi + 1, nrpp(pi + 1)) } else if (krest < prest) { keyPartCount(ki)(pi) = krest put(ki + 1, kbn(ki + 1), pi, prest - krest) } else { keyPartCount(ki)(pi) = prest put(ki, krest - prest, pi + 1, nrpp(pi + 1)) } } } put(0, kbn(0), 0, nrpp(0)) keyPartCount } }
Example 28
Source File: SimpleCustomPartitioner.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.partitioning import org.apache.spark.Partitioner import org.apache.spark.sql.SparkSession object SimpleCustomPartitioner { def main(args:Array[String]): Unit = { val jsonPath = args(0) val partitions = args(1).toInt val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .getOrCreate() val jsonDf = sparkSession.read.json(jsonPath) val partitionedRdd = jsonDf.rdd.map(row => { val group = row.getAs[String]("group") val time = row.getAs[Long]("time") val value = row.getAs[Long]("value") ((group, time), value) //this a tuple with in a tuple }).repartitionAndSortWithinPartitions(new SimpleCustomPartitioner(partitions)) val pairRdd = jsonDf.rdd.map(row => { val group = row.getAs[String]("group") val time = row.getAs[Long]("time") val value = row.getAs[Long]("value") ((group, time), value) //this a tuple with in a tuple }) pairRdd.reduceByKey(_ + _, 100) pairRdd.reduceByKey(new SimpleCustomPartitioner(partitions), _ + _) partitionedRdd.collect().foreach(r => { println(r) }) sparkSession.stop() } } class SimpleCustomPartitioner(numOfParts:Int) extends Partitioner { override def numPartitions: Int = numOfParts override def getPartition(key: Any): Int = { val k = key.asInstanceOf[(String, Long)] Math.abs(k._1.hashCode) % numPartitions } }
Example 29
Source File: BulkLoadPartitioner.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import java.util import java.util.Comparator import org.apache.yetus.audience.InterfaceAudience; import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.Partitioner @InterfaceAudience.Public class BulkLoadPartitioner(startKeys:Array[Array[Byte]]) extends Partitioner { // when table not exist, startKeys = Byte[0][] override def numPartitions: Int = if (startKeys.length == 0) 1 else startKeys.length override def getPartition(key: Any): Int = { val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] { override def compare(o1: Array[Byte], o2: Array[Byte]): Int = { Bytes.compareTo(o1, o2) } } val rowKey:Array[Byte] = key match { case qualifier: KeyFamilyQualifier => qualifier.rowKey case wrapper: ByteArrayWrapper => wrapper.value case _ => key.asInstanceOf[Array[Byte]] } var partition = util.Arrays.binarySearch(startKeys, rowKey, comparator) if (partition < 0) partition = partition * -1 + -2 if (partition < 0) partition = 0 partition } }
Example 30
Source File: MetricImplicits.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.timeseries.timely import java.io.PrintStream import java.net.Socket import java.nio.charset.StandardCharsets import io.gzet.timeseries.SimpleConfig import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.{Logging, Partitioner} object MetricImplicits extends Logging with SimpleConfig { def nonNegativeMod(x: Int, mod: Int): Int = { val rawMod = x % mod rawMod + (if (rawMod < 0) mod else 0) } class MetricPartitioner(partitions: Int) extends Partitioner { require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.") override def numPartitions: Int = partitions override def getPartition(key: Any): Int = { val k = key.asInstanceOf[MetricKey] nonNegativeMod(k.metricName.hashCode, partitions) } } implicit class Metrics(rdd: RDD[Metric]) { val partitions = rdd.partitions.length val partitioner = new MetricPartitioner(partitions) def publish() = { val sSortedMetricRDD = rdd filter { metric => metric.tags.nonEmpty } map { metric => (MetricKey(metric.name, metric.time), metric) } repartitionAndSortWithinPartitions partitioner sSortedMetricRDD.values foreachPartition { it: Iterator[Metric] => val sock = new Socket(timelyHost, timelyPort) val writer = new PrintStream(sock.getOutputStream, true, StandardCharsets.UTF_8.name) it foreach { metric => writer.println(metric.toPut) } writer.flush() } } } implicit class MetricStream(stream: DStream[Metric]) { def publish() = { stream foreachRDD { rdd => rdd.publish() } } } } case class Metric(name: String, time: Long, value: Double, tags: Map[String, String], viz: Option[String] = None) { def toPut = { val vizMap = if(viz.isDefined) List("viz" -> viz.get) else List[(String, String)]() val strTags = vizMap.union(tags.toList).map({ case (k, v) => s"$k=$v" }).mkString(" ") s"put $name $time $value $strTags" } } case class MetricKey(metricName: String, metricTime: Long) object MetricKey { implicit def orderingByMetricDate[A <: MetricKey] : Ordering[A] = { Ordering.by(fk => (fk.metricName, fk.metricTime)) } }
Example 31
Source File: IDPartitioner.scala From traj-sim-spark with Apache License 2.0 | 5 votes |
package edu.utah.cs.partitioner

import org.apache.spark.Partitioner
import org.apache.spark.rdd.{RDD, ShuffledRDD}

object IDPartition {
  def apply(origin: RDD[_ <: Product2[Int, Any]], n_part: Int)
  : RDD[_ <: Product2[Int, Any]] = {
    val part = new IDPartitioner(n_part)
    val shuffled = new ShuffledRDD[Int, Any, Any](origin, part)
    shuffled
  }
}

class IDPartitioner(n_part: Int) extends Partitioner {
  override def numPartitions: Int = n_part

  override def getPartition(key: Any): Int = {
    key.asInstanceOf[Int]
  }
}
Example 32
Source File: OrderedRDDFunctions.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Logging, Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => { val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) } case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
Example 33
Source File: PythonPartitioner.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import org.apache.spark.Partitioner import org.apache.spark.util.Utils private[spark] class PythonPartitioner( override val numPartitions: Int, val pyPartitionFunctionId: Long) extends Partitioner { override def getPartition(key: Any): Int = key match { case null => 0 // we don't trust the Python partition function to return valid partition ID's so // let's do a modulo numPartitions in any case case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions) case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions) } override def equals(other: Any): Boolean = other match { case h: PythonPartitioner => h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId case _ => false } override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode }
Example 34
Source File: ShuffledDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ import org.apache.spark.streaming.{Duration, Time} import scala.reflect.ClassTag private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 35
Source File: OrderedRDDFunctions.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
Example 36
Source File: PythonPartitioner.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import org.apache.spark.Partitioner import org.apache.spark.util.Utils private[spark] class PythonPartitioner( override val numPartitions: Int, val pyPartitionFunctionId: Long) extends Partitioner { override def getPartition(key: Any): Int = key match { case null => 0 // we don't trust the Python partition function to return valid partition ID's so // let's do a modulo numPartitions in any case case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions) case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions) } override def equals(other: Any): Boolean = other match { case h: PythonPartitioner => h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId case _ => false } override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode }
Example 37
Source File: BulkLoadPartitioner.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.s2jobs.spark import java.util import java.util.Comparator import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.Partitioner class BulkLoadPartitioner(startKeys:Array[Array[Byte]]) extends Partitioner { override def numPartitions: Int = startKeys.length override def getPartition(key: Any): Int = { val rowKey:Array[Byte] = key match { case qualifier: KeyFamilyQualifier => qualifier.rowKey case _ => key.asInstanceOf[Array[Byte]] } val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] { override def compare(o1: Array[Byte], o2: Array[Byte]): Int = { Bytes.compareTo(o1, o2) } } val partition = util.Arrays.binarySearch(startKeys, rowKey, comparator) if (partition < 0) partition * -1 + -2 else partition } }
Example 38
Source File: DummyRangePartitioner.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei

import org.apache.spark.Partitioner

object DummyRangePartitioner extends Partitioner {
  override def numPartitions: Int = 2

  override def getPartition(key: Any): Int = {
    key match {
      case x: Int => if (x < 0) 0 else 1
      case _      => 0
    }
  }
}
Example 39
Source File: RDDOrderedFunctions.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.rdd import com.danielwestheide.kontextfrei.DCollectionOrderedFunctions import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import scala.reflect.ClassTag private[kontextfrei] trait RDDOrderedFunctions extends DCollectionOrderedFunctions[RDD] { this: RDDBase => override final def sortByKey[A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])(ascending: Boolean): RDD[(A, B)] = withSite(x) { _.sortByKey(ascending) } override final def sortByKeyWithNumPartitions[A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])(ascending: Boolean, numPartitions: Int): RDD[(A, B)] = withSite(x) { _.sortByKey(ascending, numPartitions) } override final def filterByRange[A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])(lower: A, upper: A): RDD[(A, B)] = withSite(x) { _.filterByRange(lower, upper) } override def repartitionAndSortWithinPartitions[ A: ClassTag: Ordering, B: ClassTag]( x: RDD[(A, B)])( partitioner: Partitioner) : RDD[(A, B)] = withSite(x) { _.repartitionAndSortWithinPartitions(partitioner) } }
Example 40
Source File: RDDPairFunctions.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.rdd import com.danielwestheide.kontextfrei.DCollectionPairFunctions import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import scala.collection.Map import scala.reflect.ClassTag private[kontextfrei] trait RDDPairFunctions extends DCollectionPairFunctions[RDD] { this: RDDBase => override final def cogroup[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Iterable[B], Iterable[C]))] = withSite(x) { _.cogroup(y) } override final def values[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[B] = withSite(x) { _.values } override final def keys[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[A] = withSite(x) { _.keys } override final def leftOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (B, Option[C]))] = withSite(x) { _.leftOuterJoin(y) } override final def rightOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], C))] = withSite(x) { _.rightOuterJoin(y) } override final def fullOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], Option[C]))] = withSite(x) { _.fullOuterJoin(y) } override final def mapValues[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(f: B => C): RDD[(A, C)] = withSite(x) { _.mapValues(f) } override final def flatMapValues[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(f: B => TraversableOnce[C]): RDD[(A, C)] = withSite(x) { _.flatMapValues(f) } override final def reduceByKey[A: ClassTag, B: ClassTag](xs: RDD[(A, B)])( f: (B, B) => B): RDD[(A, B)] = withSite(xs) { _.reduceByKey(f) } override final def foldByKey[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)])(zeroValue: B, f: (B, B) => B): RDD[(A, B)] = withSite(xs) { _.foldByKey(zeroValue)(f) } override final def aggregateByKey[A: ClassTag, B: ClassTag, C: ClassTag]( xs: RDD[(A, B)])(zeroValue: C)(seqOp: (C, B) => C, combOp: (C, C) => C): RDD[(A, C)] = withSite(xs) { _.aggregateByKey(zeroValue)(seqOp, combOp) } override final def combineByKey[A: ClassTag, B: ClassTag, C: ClassTag]( xs: RDD[(A, B)])(createCombiner: B => C)( mergeValue: (C, B) => C, mergeCombiners: (C, C) => C): RDD[(A, C)] = withSite(xs) { _.combineByKey(createCombiner, mergeValue, mergeCombiners) } override final def countByKey[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)]): Map[A, Long] = withSite(xs) { _.countByKey() } override final def collectAsMap[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)]): Map[A, B] = withSite(xs) { _.collectAsMap() } override final def partitionBy[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)])(partitioner: Partitioner): RDD[(A, B)] = withSite(xs) { _.partitionBy(partitioner) } }
Example 41
Source File: PairSyntax.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.syntax import com.danielwestheide.kontextfrei.DCollectionOps import org.apache.spark.Partitioner import scala.collection.Map import scala.reflect.ClassTag class PairSyntax[DCollection[_], A: ClassTag, B: ClassTag]( val self: DCollectionOps[DCollection], val coll: DCollection[(A, B)]) { final def keys: DCollection[A] = self.keys(coll) final def values: DCollection[B] = self.values(coll) final def cogroup[C: ClassTag](other: DCollection[(A, C)]) : DCollection[(A, (Iterable[B], Iterable[C]))] = self.cogroup(coll)(other) final def leftOuterJoin[C: ClassTag]( other: DCollection[(A, C)]): DCollection[(A, (B, Option[C]))] = self.leftOuterJoin(coll)(other) final def rightOuterJoin[C: ClassTag]( other: DCollection[(A, C)]): DCollection[(A, (Option[B], C))] = self.rightOuterJoin(coll)(other) final def fullOuterJoin[C: ClassTag]( other: DCollection[(A, C)]): DCollection[(A, (Option[B], Option[C]))] = self.fullOuterJoin(coll)(other) final def mapValues[C: ClassTag](f: B => C): DCollection[(A, C)] = self.mapValues(coll)(f) final def flatMapValues[C: ClassTag]( f: B => TraversableOnce[C]): DCollection[(A, C)] = self.flatMapValues(coll)(f) final def reduceByKey(f: (B, B) => B): DCollection[(A, B)] = self.reduceByKey(coll)(f) final def foldByKey(zeroValue: B)(f: (B, B) => B): DCollection[(A, B)] = self.foldByKey(coll)(zeroValue, f) final def aggregateByKey[C: ClassTag](zeroValue: C)( seqOp: (C, B) => C, combOp: (C, C) => C): DCollection[(A, C)] = self.aggregateByKey(coll)(zeroValue)(seqOp, combOp) final def combineByKey[C: ClassTag]( createCombiner: B => C, mergeValue: (C, B) => C, mergeCombiners: (C, C) => C): DCollection[(A, C)] = self.combineByKey(coll)(createCombiner)(mergeValue, mergeCombiners) final def countByKey(): Map[A, Long] = self.countByKey(coll) final def collectAsMap(): Map[A, B] = self.collectAsMap(coll) final def partitionBy(partitioner: Partitioner): DCollection[(A, B)] = self.partitionBy(coll)(partitioner) }
Example 42
Source File: OrderedSyntax.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.syntax import com.danielwestheide.kontextfrei.DCollectionOps import org.apache.spark.Partitioner import scala.reflect.ClassTag class OrderedSyntax[DCollection[_], A: ClassTag: Ordering, B: ClassTag]( val self: DCollectionOps[DCollection], val coll: DCollection[(A, B)]) { final def sortByKey(ascending: Boolean): DCollection[(A, B)] = self.sortByKey(coll)(ascending) final def sortByKey(): DCollection[(A, B)] = self.sortByKey(coll)(ascending = true) final def sortByKey(ascending: Boolean = true, numPartitions: Int): DCollection[(A, B)] = self.sortByKeyWithNumPartitions(coll)(ascending, numPartitions) final def filterByRange(lower: A, upper: A): DCollection[(A, B)] = self.filterByRange(coll)(lower, upper) final def repartitionAndSortWithinPartitions( partitioner: Partitioner): DCollection[(A, B)] = self.repartitionAndSortWithinPartitions(coll)(partitioner) }
Example 43
Source File: StreamOrderedFunctions.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.stream import com.danielwestheide.kontextfrei.DCollectionOrderedFunctions import org.apache.spark.Partitioner import scala.reflect.ClassTag private[kontextfrei] trait StreamOrderedFunctions extends DCollectionOrderedFunctions[Stream] { import Ordering.Implicits._ override final def sortByKey[A: ClassTag: Ordering, B: ClassTag]( x: Stream[(A, B)])(ascending: Boolean): Stream[(A, B)] = x.sortBy(_._1)(ordering(ascending)) override final def sortByKeyWithNumPartitions[A: ClassTag: Ordering, B: ClassTag](x: Stream[(A, B)])( ascending: Boolean, numPartitions: Int): Stream[(A, B)] = x.sortBy(_._1)(ordering(ascending)) override final def filterByRange[A: ClassTag: Ordering, B: ClassTag]( x: Stream[(A, B)])(lower: A, upper: A): Stream[(A, B)] = x.filter(e => e._1 >= lower && e._1 <= upper) override def repartitionAndSortWithinPartitions[A: ClassTag: Ordering, B: ClassTag]( x: Stream[(A, B)])(partitioner: Partitioner): Stream[(A, B)] = x.sortBy(_._1)(ordering(ascending = true)) private def ordering[A](ascending: Boolean)( implicit ev: Ordering[A]): Ordering[A] = if (ascending) ev else ev.reverse }
Example 44
Source File: HBasePartitioner.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import java.io.{IOException, ObjectInputStream, ObjectOutputStream} import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.serializer.JavaSerializer import org.apache.spark.util.{CollectionsUtils, Utils} import org.apache.spark.{Partitioner, SparkEnv} object HBasePartitioner { implicit object HBaseRawOrdering extends Ordering[HBaseRawType] { def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b) } } class HBasePartitioner (var splitKeys: Array[HBaseRawType]) extends Partitioner { import HBasePartitioner.HBaseRawOrdering type t = HBaseRawType lazy private val len = splitKeys.length // For pre-split table splitKeys(0) = bytes[0], to remove it, // otherwise partition 0 always be empty and // we will miss the last region's date when bulk load lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail def numPartitions = if (len == 0) 1 else len @transient private val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t] def getPartition(key: Any): Int = { val k = key.asInstanceOf[t] var partition = 0 if (len <= 128 && len > 0) { // If we have less than 128 partitions naive search val ordering = implicitly[Ordering[t]] while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) { partition += 1 } } else { // Determine which binary search method to use only once. partition = binarySearch(realSplitKeys, k) // binarySearch either returns the match location or -[insertion point]-1 if (partition < 0) { partition = -partition - 1 } if (partition > realSplitKeys.length) { partition = realSplitKeys.length } } partition } override def equals(other: Any): Boolean = other match { case r: HBasePartitioner => r.splitKeys.sameElements(splitKeys) case _ => false } override def hashCode(): Int = { val prime = 31 var result = 1 var i = 0 while (i < splitKeys.length) { result = prime * result + splitKeys(i).hashCode i += 1 } result = prime * result result } }
Example 45
Source File: BulkLoadPartitioner.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.loader.spark

import java.util
import java.util.Comparator

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner

class BulkLoadPartitioner(startKeys: Array[Array[Byte]]) extends Partitioner {

  override def numPartitions: Int = startKeys.length

  override def getPartition(key: Any): Int = {
    val rowKey: Array[Byte] = key match {
      case qualifier: KeyFamilyQualifier => qualifier.rowKey
      case _ => key.asInstanceOf[Array[Byte]]
    }

    val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] {
      override def compare(o1: Array[Byte], o2: Array[Byte]): Int = {
        Bytes.compareTo(o1, o2)
      }
    }
    val partition = util.Arrays.binarySearch(startKeys, rowKey, comparator)
    if (partition < 0) partition * -1 + -2
    else partition
  }
}
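A hedged usage sketch with hypothetical region start keys: Arrays.binarySearch returns -(insertionPoint) - 1 for a missing key, and the expression partition * -1 + -2 maps that back to the index of the region whose start key precedes the row key.

import org.apache.hadoop.hbase.util.Bytes

val startKeys: Array[Array[Byte]] = Array(Array.emptyByteArray, Bytes.toBytes("m"))
val bulkLoadPartitioner = new BulkLoadPartitioner(startKeys)
bulkLoadPartitioner.getPartition(Bytes.toBytes("c"))   // 0, before start key "m"
bulkLoadPartitioner.getPartition(Bytes.toBytes("x"))   // 1, at or after "m"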
Example 46
Source File: TiRegionPartitioner.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.write

import java.util

import com.pingcap.tikv.key.Key
import com.pingcap.tikv.region.TiRegion
import org.apache.spark.Partitioner

class TiRegionPartitioner(regions: util.List[TiRegion], writeConcurrency: Int) extends Partitioner {
  override def getPartition(key: Any): Int = {
    val serializableKey = key.asInstanceOf[SerializableKey]
    val rawKey = Key.toRawKey(serializableKey.bytes)

    binarySearch(rawKey) % numPartitions
  }

  def binarySearch(key: Key): Int = {
    if (regions.get(0).contains(key)) {
      return 0
    }
    var l = 0
    var r = regions.size()
    while (l < r) {
      val mid = l + (r - l) / 2
      val region = regions.get(mid)
      if (Key.toRawKey(region.getEndKey).compareTo(key) <= 0) {
        l = mid + 1
      } else {
        r = mid
      }
    }
    assert(regions.get(l).contains(key))
    l
  }

  override def numPartitions: Int = if (writeConcurrency <= 0) regions.size() else writeConcurrency
}
Example 47
Source File: ColumnPartitioner.scala From MatRel with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.matfast.partitioner import org.apache.spark.{Partitioner, SparkConf} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.matfast.util.MatfastSerializer // scalastyle:off class ColumnPartitioner(partitions: Int) extends Partitioner{ require(partitions >= 0, s"Number of partitions cannot be negative but found $partitions") override val numPartitions = partitions override def getPartition(key: Any): Int = { key match { case (i: Int, j: Int) => j % partitions case (i: Int, j: Int, _: Int) => j % partitions case _ => throw new IllegalArgumentException(s"Unrecognized key: $key") } } override def equals(other: Any): Boolean = { other.isInstanceOf[ColumnPartitioner] && numPartitions == other.asInstanceOf[ColumnPartitioner].numPartitions } override def hashCode(): Int = { com.google.common.base.Objects.hashCode(partitions: java.lang.Integer) } } // scalastyle:on object ColumnPartitioner { def apply(origin: RDD[InternalRow], numPartitions: Int): RDD[((Int, Int), InternalRow)] = { val rdd = origin.map { row => val rid = row.getInt(0) val cid = row.getInt(1) val matrix = row.getStruct(2, 7) ((rid, cid), matrix) } val partitioner = new ColumnPartitioner(numPartitions) val shuffled = new ShuffledRDD[(Int, Int), InternalRow, InternalRow](rdd, partitioner) shuffled.setSerializer(new MatfastSerializer(new SparkConf(false))) shuffled } }
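A quick sketch of the routing rule: every block keyed by (rowBlockId, colBlockId) goes to partition colBlockId % numPartitions, so blocks in the same block column are co-located regardless of their row index.

val columnPartitioner = new ColumnPartitioner(4)
columnPartitioner.getPartition((0, 5))   // 1
columnPartitioner.getPartition((3, 5))   // 1, same column block, same partition
columnPartitioner.getPartition((3, 6))   // 2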
Example 48
Source File: BlockCyclicPartitioner.scala From MatRel with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.matfast.partitioner import org.apache.spark.{Partitioner, SparkConf} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.matfast.util.MatfastSerializer class BlockCyclicPartitioner(val ROW_BLKS: Int, val COL_BLKS: Int, val ROW_BLKS_PER_PARTITION: Int, val COL_BLKS_PER_PARTITION: Int) extends Partitioner{ require(ROW_BLKS > 0, s"Number of row blocks should be larger than 0, but found $ROW_BLKS") require(COL_BLKS > 0, s"Number of col blocks should be larger than 0, but found $COL_BLKS") require(ROW_BLKS_PER_PARTITION > 0, s"Number of row blocks per partition should be larger than 0, " + s"but found $ROW_BLKS_PER_PARTITION") require(COL_BLKS_PER_PARTITION > 0, s"Number of col blocks per partition should be larger than 0, " + s"but found $COL_BLKS_PER_PARTITION") private val row_partition_num = math.ceil(ROW_BLKS * 1.0 / ROW_BLKS_PER_PARTITION).toInt private val col_partition_num = math.ceil(COL_BLKS * 1.0 / COL_BLKS_PER_PARTITION).toInt private val num_row_part = ROW_BLKS / row_partition_num private val num_col_part = COL_BLKS / col_partition_num override val numPartitions: Int = row_partition_num * col_partition_num override def getPartition(key: Any): Int = { key match { case (i: Int, j : Int) => ((i % num_row_part) * col_partition_num + (j % num_col_part)) % numPartitions case (i: Int, j: Int, _: Int) => ((i % num_row_part) * col_partition_num + (j % num_col_part)) % numPartitions case _ => throw new IllegalArgumentException(s"Unrecognized key: $key") } } override def equals(obj: Any): Boolean = { obj match { case r: BlockCyclicPartitioner => (ROW_BLKS == r.ROW_BLKS) && (COL_BLKS == r.COL_BLKS) && (ROW_BLKS_PER_PARTITION == r.ROW_BLKS_PER_PARTITION) && (COL_BLKS_PER_PARTITION == r.COL_BLKS_PER_PARTITION) case _ => false } } override def hashCode(): Int = { com.google.common.base.Objects.hashCode( ROW_BLKS: java.lang.Integer, COL_BLKS: java.lang.Integer, ROW_BLKS_PER_PARTITION: java.lang.Integer, COL_BLKS_PER_PARTITION: java.lang.Integer ) } } object BlockCyclicPartitioner { def apply(origin: RDD[InternalRow], ROW_BLKS: Int, COL_BLKS: Int, ROW_BLKS_PER_PARTITION: Int, COL_BLKS_PER_PARTITION: Int): RDD[((Int, Int), InternalRow)] = { val rdd = origin.map { row => val rid = row.getInt(0) val cid = row.getInt(1) val matrix = row.getStruct(2, 7) ((rid, cid), matrix) } val partitioner = new BlockCyclicPartitioner(ROW_BLKS, COL_BLKS, ROW_BLKS_PER_PARTITION, COL_BLKS_PER_PARTITION) val shuffled = new ShuffledRDD[(Int, Int), InternalRow, InternalRow](rdd, partitioner) shuffled.setSerializer(new MatfastSerializer(new SparkConf(false))) shuffled } }
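A small worked example, assuming a 4 x 4 grid of blocks with 2 blocks per partition in each dimension: the partitioner forms a 2 x 2 grid of partitions and cycles blocks onto it.

val blockCyclic = new BlockCyclicPartitioner(
  ROW_BLKS = 4, COL_BLKS = 4, ROW_BLKS_PER_PARTITION = 2, COL_BLKS_PER_PARTITION = 2)
blockCyclic.numPartitions          // 4
blockCyclic.getPartition((0, 0))   // 0
blockCyclic.getPartition((1, 2))   // 2
blockCyclic.getPartition((3, 3))   // 3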
Example 49
Source File: IndexPartitioner.scala From MatRel with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.matfast.partitioner

import org.apache.spark.Partitioner

// scalastyle:off
class IndexPartitioner(partitions: Int) extends Partitioner {
  require(partitions >= 0, s"Number of partitions cannot be negative but found $partitions")

  override val numPartitions: Int = partitions

  override def getPartition(key: Any): Int = {
    key match {
      case (i: Int) => i
      case _ => throw new IllegalArgumentException(s"Unrecognized key: $key")
    }
  }

  override def equals(other: Any): Boolean = {
    other.isInstanceOf[IndexPartitioner] &&
      numPartitions == other.asInstanceOf[IndexPartitioner].numPartitions
  }

  override def hashCode(): Int = {
    com.google.common.base.Objects.hashCode(partitions: java.lang.Integer)
  }
}
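A minimal sketch of the contract: the integer key is used directly as the partition index, so callers must keep keys inside [0, numPartitions).

val indexPartitioner = new IndexPartitioner(3)
indexPartitioner.getPartition(2)   // 2
indexPartitioner.getPartition(0)   // 0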
Example 50
Source File: RowPartitioner.scala From MatRel with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.matfast.partitioner import org.apache.spark.{Partitioner, SparkConf} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.matfast.util.MatfastSerializer // scalastyle:off class RowPartitioner(partitions: Int) extends Partitioner{ require(partitions >= 0, s"Number of partitions cannot be negative but found $partitions") override val numPartitions = partitions override def getPartition(key: Any): Int = { key match { case (i: Int, j: Int) => i % partitions case (i: Int, j: Int, _: Int) => i % partitions case _ => throw new IllegalArgumentException(s"Unrecognized key: $key") } } override def equals(other: Any): Boolean = { other.isInstanceOf[RowPartitioner] && numPartitions == other.asInstanceOf[RowPartitioner].numPartitions } override def hashCode(): Int = { com.google.common.base.Objects.hashCode(partitions: java.lang.Integer) } } object RowPartitioner { def apply(origin: RDD[InternalRow], numPartitions: Int): RDD[((Int, Int), InternalRow)] = { val rdd = origin.map { row => val rid = row.getInt(0) val cid = row.getInt(1) val matrix = row.getStruct(2, 7) ((rid, cid), matrix) } val partitioner = new RowPartitioner(numPartitions) val shuffled = new ShuffledRDD[(Int, Int), InternalRow, InternalRow](rdd, partitioner) shuffled.setSerializer(new MatfastSerializer(new SparkConf(false))) shuffled } }
Example 51
Source File: OrderedRDDFunctions.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
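The user-facing effect, as a sketch: after sortByKey the RDD carries a RangePartitioner, so filterByRange can prune partitions that cannot contain keys in [lower, upper] before filtering the rest. Assumes an existing SparkContext sc.

val sorted = sc.parallelize(Seq(5 -> "e", 1 -> "a", 9 -> "i")).sortByKey()
sorted.filterByRange(2, 8).collect()   // Array((5, "e")); partitions outside the range are skipped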
Example 52
Source File: SubtractedRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.{HashMap => JHashMap} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.Dependency import org.apache.spark.OneToOneDependency import org.apache.spark.Partition import org.apache.spark.Partitioner import org.apache.spark.ShuffleDependency import org.apache.spark.SparkEnv import org.apache.spark.TaskContext private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag]( @transient var rdd1: RDD[_ <: Product2[K, V]], @transient var rdd2: RDD[_ <: Product2[K, W]], part: Partitioner) extends RDD[(K, V)](rdd1.context, Nil) { override def getDependencies: Seq[Dependency[_]] = { def rddDependency[T1: ClassTag, T2: ClassTag](rdd: RDD[_ <: Product2[T1, T2]]) : Dependency[_] = { if (rdd.partitioner == Some(part)) { logDebug("Adding one-to-one dependency with " + rdd) new OneToOneDependency(rdd) } else { logDebug("Adding shuffle dependency with " + rdd) new ShuffleDependency[T1, T2, Any](rdd, part) } } Seq(rddDependency[K, V](rdd1), rddDependency[K, W](rdd2)) } override def getPartitions: Array[Partition] = { val array = new Array[Partition](part.numPartitions) for (i <- 0 until array.length) { // Each CoGroupPartition will depend on rdd1 and rdd2 array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) => dependencies(j) match { case s: ShuffleDependency[_, _, _] => None case _ => Some(new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i))) } }.toArray) } array } override val partitioner = Some(part) override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = { val partition = p.asInstanceOf[CoGroupPartition] val map = new JHashMap[K, ArrayBuffer[V]] def getSeq(k: K): ArrayBuffer[V] = { val seq = map.get(k) if (seq != null) { seq } else { val seq = new ArrayBuffer[V]() map.put(k, seq) seq } } def integrate(depNum: Int, op: Product2[K, V] => Unit): Unit = { dependencies(depNum) match { case oneToOneDependency: OneToOneDependency[_] => val dependencyPartition = partition.narrowDeps(depNum).get.split oneToOneDependency.rdd.iterator(dependencyPartition, context) .asInstanceOf[Iterator[Product2[K, V]]].foreach(op) case shuffleDependency: ShuffleDependency[_, _, _] => val iter = SparkEnv.get.shuffleManager .getReader( shuffleDependency.shuffleHandle, partition.index, partition.index + 1, context) .read() iter.foreach(op) } } // the first dep is rdd1; add all values to the map integrate(0, t => getSeq(t._1) += t._2) // the second dep is rdd2; remove all of its keys integrate(1, t => map.remove(t._1)) map.asScala.iterator.map(t => t._2.iterator.map((t._1, _))).flatten } override def clearDependencies() { super.clearDependencies() rdd1 = null rdd2 = null } }
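SubtractedRDD is the machinery behind subtractByKey on pair RDDs; a sketch of the user-facing call (assumes an existing SparkContext sc):

val current  = sc.parallelize(Seq("a" -> 1, "b" -> 2, "c" -> 3))
val existing = sc.parallelize(Seq("b" -> 0))
current.subtractByKey(existing).collect()   // keeps keys "a" and "c" only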
Example 53
Source File: PythonPartitioner.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import org.apache.spark.Partitioner
import org.apache.spark.util.Utils

private[spark] class PythonPartitioner(
    override val numPartitions: Int,
    val pyPartitionFunctionId: Long)
  extends Partitioner {

  override def getPartition(key: Any): Int = key match {
    case null => 0
    // we don't trust the Python partition function to return valid partition ID's so
    // let's do a modulo numPartitions in any case
    case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions)
    case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions)
  }

  override def equals(other: Any): Boolean = other match {
    case h: PythonPartitioner =>
      h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId
    case _ =>
      false
  }

  override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode
}
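A sketch of the routing rules (the class is private[spark], so this assumes calling code compiled inside an org.apache.spark package): Long keys are taken modulo numPartitions, null maps to 0, and everything else falls back to hashCode.

val pythonPartitioner = new PythonPartitioner(numPartitions = 8, pyPartitionFunctionId = 42L)
pythonPartitioner.getPartition(10L)    // 2
pythonPartitioner.getPartition(null)   // 0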
Example 54
Source File: ShuffledDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
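ShuffledDStream is what combineByKey on a pair DStream constructs under the hood; a minimal sketch of that user-facing path, assuming an existing input DStream of lines:

import org.apache.spark.HashPartitioner
import org.apache.spark.streaming.dstream.DStream

def wordCounts(lines: DStream[String]): DStream[(String, Int)] =
  lines.flatMap(_.split(" "))
    .map(word => (word, 1))
    .combineByKey[Int](v => v, _ + _, _ + _, new HashPartitioner(2))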
Example 55
Source File: CustomPartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_3 import com.tomekl007.UserTransaction import org.apache.spark.sql.SparkSession import org.apache.spark.{Partitioner, SparkContext} import org.scalatest.FunSuite import org.scalatest.Matchers._ class CustomPartitioner extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use custom partitioner") { //given val numberOfExecutors = 2 val data = spark .parallelize(List( UserTransaction("a", 100), UserTransaction("b", 101), UserTransaction("a", 202), UserTransaction("b", 1), UserTransaction("c", 55) ) ).keyBy(_.userId) .partitionBy(new Partitioner { override def numPartitions: Int = numberOfExecutors override def getPartition(key: Any): Int = { key.hashCode % numberOfExecutors } }) println(data.partitions.length) //when val res = data.mapPartitions[Long](iter => iter.map(_._2).map(_.amount) ).collect().toList //then res should contain theSameElementsAs List(55, 100, 202, 101, 1) } }
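Note that key.hashCode % numberOfExecutors in the anonymous partitioner above can go negative for keys with negative hash codes; a sketch of a safer variant keeps the result in [0, numPartitions):

import org.apache.spark.Partitioner

class SafeHashPartitioner(override val numPartitions: Int) extends Partitioner {
  // floorMod never returns a negative index, unlike the plain % operator
  override def getPartition(key: Any): Int =
    if (key == null) 0 else Math.floorMod(key.hashCode, numPartitions)
}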
Example 56
Source File: AppleCustomPartitioner.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.partitioning

import java.util.Random

import org.apache.spark.Partitioner

class AppleCustomPartitioner(numOfParts: Int) extends Partitioner {

  override def numPartitions: Int = numOfParts

  def random = new Random()

  override def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[(String, Long)]
    val ticker = k._1
    if (ticker.equals("apple")) {
      val saltedTicker = ticker + random.nextInt(9)
      Math.abs(saltedTicker.hashCode) % numPartitions
    } else {
      Math.abs(ticker.hashCode) % numPartitions
    }
  }
}
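A usage sketch with made-up trade data: salting the hot "apple" key spreads it over several partitions instead of funnelling every apple record through one task. Assumes an existing SparkContext sc.

val trades = sc.parallelize(Seq(
  (("apple", 1L), 100.0), (("apple", 2L), 101.0), (("ibm", 1L), 50.0)))
val byTicker = trades.partitionBy(new AppleCustomPartitioner(4))
byTicker.getNumPartitions   // 4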
Example 57
Source File: SubtractedRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.Dependency import org.apache.spark.OneToOneDependency import org.apache.spark.Partition import org.apache.spark.Partitioner import org.apache.spark.ShuffleDependency import org.apache.spark.SparkEnv import org.apache.spark.TaskContext import org.apache.spark.serializer.Serializer def setSerializer(serializer: Serializer): SubtractedRDD[K, V, W] = { this.serializer = Option(serializer) this } override def getDependencies: Seq[Dependency[_]] = { Seq(rdd1, rdd2).map { rdd => if (rdd.partitioner == Some(part)) { logDebug("Adding one-to-one dependency with " + rdd) new OneToOneDependency(rdd) } else { logDebug("Adding shuffle dependency with " + rdd) new ShuffleDependency(rdd, part, serializer) } } } override def getPartitions: Array[Partition] = { val array = new Array[Partition](part.numPartitions) for (i <- 0 until array.size) { // Each CoGroupPartition will depend on rdd1 and rdd2 array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) => dependencies(j) match { case s: ShuffleDependency[_, _, _] => new ShuffleCoGroupSplitDep(s.shuffleHandle) case _ => new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i)) } }.toArray) } array } override val partitioner = Some(part) override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = { val partition = p.asInstanceOf[CoGroupPartition] val map = new JHashMap[K, ArrayBuffer[V]] def getSeq(k: K): ArrayBuffer[V] = { val seq = map.get(k) if (seq != null) { seq } else { val seq = new ArrayBuffer[V]() map.put(k, seq) seq } } def integrate(dep: CoGroupSplitDep, op: Product2[K, V] => Unit) = dep match { case NarrowCoGroupSplitDep(rdd, _, itsSplit) => rdd.iterator(itsSplit, context).asInstanceOf[Iterator[Product2[K, V]]].foreach(op) case ShuffleCoGroupSplitDep(handle) => val iter = SparkEnv.get.shuffleManager .getReader(handle, partition.index, partition.index + 1, context) .read() iter.foreach(op) } // the first dep is rdd1; add all values to the map integrate(partition.deps(0), t => getSeq(t._1) += t._2) // the second dep is rdd2; remove all of its keys integrate(partition.deps(1), t => map.remove(t._1)) map.iterator.map { t => t._2.iterator.map { (t._1, _) } }.flatten } override def clearDependencies() { super.clearDependencies() rdd1 = null rdd2 = null } }
Example 58
Source File: PythonPartitioner.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import org.apache.spark.Partitioner import org.apache.spark.util.Utils private[spark] class PythonPartitioner( override val numPartitions: Int, val pyPartitionFunctionId: Long) extends Partitioner { override def getPartition(key: Any): Int = key match { case null => 0 // we don't trust the Python partition function to return valid partition ID's so // let's do a modulo numPartitions in any case case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions) case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions) } override def equals(other: Any): Boolean = other match { case h: PythonPartitioner => h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId case _ => false } override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode }
Example 59
Source File: OrderedRDDFunctions.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
Example 60
Source File: PythonPartitioner.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import org.apache.spark.Partitioner import org.apache.spark.util.Utils private[spark] class PythonPartitioner( override val numPartitions: Int, val pyPartitionFunctionId: Long) extends Partitioner { override def getPartition(key: Any): Int = key match { case null => 0 // we don't trust the Python partition function to return valid partition ID's so // let's do a modulo numPartitions in any case case key: Long => Utils.nonNegativeMod(key.toInt, numPartitions) case _ => Utils.nonNegativeMod(key.hashCode(), numPartitions) } override def equals(other: Any): Boolean = other match { case h: PythonPartitioner => h.numPartitions == numPartitions && h.pyPartitionFunctionId == pyPartitionFunctionId case _ => false } override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode }
Example 61
Source File: ShuffledDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 62
Source File: CustomRangePartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_5 import com.tomekl007.UserTransaction import org.apache.spark.sql.SparkSession import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkContext} import org.scalatest.FunSuite class CustomRangePartitionerTest extends FunSuite { val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext test("should use custom range partitioner") { //given val keysWithValuesList = Array( UserTransaction("A", 100), UserTransaction("B", 4), UserTransaction("A", 100001), UserTransaction("B", 10), UserTransaction("C", 10) ) val data = spark.parallelize(keysWithValuesList) val keyed = data.keyBy(_.amount) //when, then val partitioned = keyed.partitionBy(new CustomRangePartitioner(List((0,100), (100, 10000), (10000, 1000000)))) //then partitioned.collect().toList } } class CustomRangePartitioner(ranges: List[(Int,Int)]) extends Partitioner{ override def numPartitions: Int = ranges.size override def getPartition(key: Any): Int = { if(!key.isInstanceOf[Int]){ throw new IllegalArgumentException("partitioner works only for Int type") } val keyInt = key.asInstanceOf[Int] val index = ranges.lastIndexWhere(v => keyInt >= v._1 && keyInt <= v._2) println(s"for key: $key return $index") index } }
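One caveat in the partitioner above: lastIndexWhere returns -1 for a key that falls outside every range, and Spark rejects negative partition ids at shuffle time. A hypothetical guarded variant (sketch) fails fast with a clearer message:

import org.apache.spark.Partitioner

class GuardedRangePartitioner(ranges: List[(Int, Int)]) extends Partitioner {
  override def numPartitions: Int = ranges.size
  override def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[Int]
    val index = ranges.lastIndexWhere { case (lo, hi) => k >= lo && k <= hi }
    require(index >= 0, s"key $k does not fall into any configured range")
    index
  }
}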
Example 63
Source File: SavePlainText.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License | 5 votes |
package com.tomekl007.chapter_4 import java.io.File import com.tomekl007.UserTransaction import org.apache.spark.sql.SparkSession import org.apache.spark.{Partitioner, SparkContext} import org.scalatest.{BeforeAndAfterEach, FunSuite} import org.scalatest.Matchers._ import scala.reflect.io.Path class SavePlainText extends FunSuite with BeforeAndAfterEach{ val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext private val FileName = "transactions.txt" override def afterEach() { val path = Path (FileName) path.deleteRecursively() } test("should save and load in plain text") { //given val rdd = spark.makeRDD(List(UserTransaction("a", 100), UserTransaction("b", 200))) //when rdd.coalesce(1).saveAsTextFile(FileName) val fromFile = spark.textFile(FileName) fromFile.collect().toList should contain theSameElementsAs List( "UserTransaction(a,100)", "UserTransaction(b,200)" //note - this is string! ) } }
Example 64
Source File: cogroup.scala From spark-tools with Apache License 2.0 | 5 votes |
package io.univalence.plumbus import org.apache.spark.Partitioner import org.apache.spark.rdd.{ CoGroupedRDD, RDD } import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{ ArrayType, StructField } import org.apache.spark.sql.{ types, DataFrame, Dataset, Encoder, KeyValueGroupedDataset, Row } import scala.reflect.ClassTag import scala.util.Try object cogroup { implicit class KVGD[K, A](val kvgd: KeyValueGroupedDataset[K, A]) { def cogroup[B](right: KeyValueGroupedDataset[K, B]): Dataset[(K, Seq[A], Seq[B])] = //Use SparkAddOn ? ??? } def apply[A, B, K](left: Dataset[A], right: Dataset[B])(keyLeft: A => K, keyRight: B => K)( implicit encA: Encoder[A], encB: Encoder[B], encC: Encoder[K], enc: Encoder[(K, Seq[A], Seq[B])], ca: ClassTag[A], ck: ClassTag[K], cb: ClassTag[B] ): Dataset[(K, Seq[A], Seq[B])] = left.sparkSession.implicits .rddToDatasetHolder( RDD .rddToPairRDDFunctions(left.rdd.keyBy(keyLeft)) .cogroup(right.rdd.keyBy(keyRight)) .map({ case (k, (ia, ib)) => (k, ia.toSeq, ib.toSeq) }) ) .toDS def cogroupDf(group: DataFrame, namedSubGroup: (String, DataFrame)*)( byKey: String, partitioner: Partitioner = Partitioner.defaultPartitioner(group.rdd, namedSubGroup.map(_._2.rdd): _*) ): Try[DataFrame] = Try { val subGroup: Seq[DataFrame] = namedSubGroup.map(_._2) val allFrames: Seq[DataFrame] = group +: subGroup val allFramesKeyed: Seq[RDD[(String, Row)]] = allFrames.map(df => { val idx = df.columns.indexOf(byKey) df.rdd.keyBy(_.get(idx).toString) }) val cogroupRdd: CoGroupedRDD[String] = new CoGroupedRDD[String](allFramesKeyed, partitioner) val rowRdd: RDD[Row] = cogroupRdd.map(x => { val rows: Array[Seq[Row]] = x._2.asInstanceOf[Array[Iterable[Row]]].map(_.toSeq) val seq = rows.head.head.toSeq ++ rows.tail new GenericRowWithSchema(seq.toArray, null).asInstanceOf[Row] }) val schema = types.StructType( group.schema.fields ++ namedSubGroup.map { case (name, df) => StructField(name, ArrayType(df.schema)) } ) group.sparkSession.createDataFrame(rowRdd, schema) } }
Example 65
Source File: RPCContinuousShuffleWriter.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous.shuffle import scala.concurrent.Future import scala.concurrent.duration.Duration import org.apache.spark.Partitioner import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.util.ThreadUtils class RPCContinuousShuffleWriter( writerId: Int, outputPartitioner: Partitioner, endpoints: Array[RpcEndpointRef]) extends ContinuousShuffleWriter { if (outputPartitioner.numPartitions != 1) { throw new IllegalArgumentException("multiple readers not yet supported") } if (outputPartitioner.numPartitions != endpoints.length) { throw new IllegalArgumentException(s"partitioner size ${outputPartitioner.numPartitions} did " + s"not match endpoint count ${endpoints.length}") } def write(epoch: Iterator[UnsafeRow]): Unit = { while (epoch.hasNext) { val row = epoch.next() endpoints(outputPartitioner.getPartition(row)).askSync[Unit](ReceiverRow(writerId, row)) } val futures = endpoints.map(_.ask[Unit](ReceiverEpochMarker(writerId))).toSeq implicit val ec = ThreadUtils.sameThread ThreadUtils.awaitResult(Future.sequence(futures), Duration.Inf) } }
Example 66
Source File: VOrderedRDDFunctionsSuite.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.rdd.VOrderedRDDFunctions._ import org.apache.spark.{Partitioner, SparkFunSuite} import org.apache.spark.mllib.util.MLlibTestSparkContext class VOrderedRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext { override def beforeAll(): Unit = { super.beforeAll() } test("testGroupByKeyUsingSort") { val rdd: RDD[(Int, Int)] = sc.parallelize(Seq((1, 4), (1, 5), (1, 8), (0, 3), (0, 6), (2, 3), (3, 2)), 3) val res = rdd.groupByKeyUsingSort(new Partitioner { override def numPartitions: Int = 3 override def getPartition(key: Any): Int = key.asInstanceOf[Int] % 3 }).mapValues(_.toList).collect() assert(res === Array( (0, List(3, 6)), (3, List(2)), (1, List(4, 5, 8)), (2, List(3)) )) } }
Example 67
Source File: VOrderedRDDFunctions.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.Partitioner import org.apache.spark.internal.Logging import org.apache.spark.util.collection.CompactBuffer import scala.reflect.ClassTag class VOrderedRDDFunctions[K, V](self: RDD[(K, V)]) (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K]) extends Logging with Serializable { def groupByKeyUsingSort(partitioner: Partitioner): RDD[(K, Iterable[V])] = { self.repartitionAndSortWithinPartitions(partitioner) .mapPartitions { (iter: Iterator[(K, V)]) => new Iterator[(K, CompactBuffer[V])] { private var firstElemInNextGroup: (K, V) = null override def hasNext: Boolean = firstElemInNextGroup != null || iter.hasNext override def next(): (K, CompactBuffer[V]) = { if (firstElemInNextGroup == null) { firstElemInNextGroup = iter.next() } val key = firstElemInNextGroup._1 val group = CompactBuffer[V](firstElemInNextGroup._2) firstElemInNextGroup = null var reachNewGroup = false while (iter.hasNext && !reachNewGroup) { val currElem = iter.next() if (currElem._1 == key) { group += currElem._2 } else { firstElemInNextGroup = currElem reachNewGroup = true } } (key, group) } } } } } private[spark] object VOrderedRDDFunctions { implicit def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)])(implicit ord: Ordering[K]): VOrderedRDDFunctions[K, V] = { new VOrderedRDDFunctions(rdd) } }
Example 68
Source File: GroupSorted.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted.api.java import java.util.{ Comparator, Iterator => JIterator } import scala.reflect.ClassTag import scala.collection.JavaConverters._ import org.apache.spark.{ Partitioner, HashPartitioner } import org.apache.spark.Partitioner.defaultPartitioner import org.apache.spark.api.java.JavaPairRDD import org.apache.spark.api.java.function.{ Function => JFunction, Function2 => JFunction2, FlatMapFunction => JFlatMapFunction } import com.tresata.spark.sorted.{ GroupSorted => SGroupSorted } object GroupSorted { private case class ComparatorOrdering[T](comparator: Comparator[T]) extends Ordering[T] { def compare(x: T, y: T) = comparator.compare(x, y) } private def comparatorToOrdering[T](comparator: Comparator[T]): Ordering[T] = new ComparatorOrdering(comparator) private def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] private implicit def ordering[K]: Ordering[K] = comparatorToOrdering(NaturalComparator.get[K]) private def groupSort[K, V](javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]): SGroupSorted[K, V] = { implicit def kClassTag: ClassTag[K] = javaPairRDD.kClassTag implicit def vClassTag: ClassTag[V] = javaPairRDD.vClassTag val valueOrdering = Option(valueComparator).map(comparatorToOrdering) SGroupSorted(javaPairRDD.rdd, partitioner, valueOrdering) } } class GroupSorted[K, V] private (sGroupSorted: SGroupSorted[K, V]) extends JavaPairRDD[K, V](sGroupSorted)(GroupSorted.fakeClassTag[K], GroupSorted.fakeClassTag[V]) { def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]) = this(GroupSorted.groupSort(javaPairRDD, partitioner, valueComparator)) def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner) = this(GroupSorted.groupSort(javaPairRDD, partitioner, null)) def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int, valueComparator: Comparator[V]) = this(javaPairRDD, if (numPartitions > 0) new HashPartitioner(numPartitions) else defaultPartitioner(javaPairRDD.rdd), valueComparator) def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int) = this(javaPairRDD, numPartitions, null) def this(javaPairRDD: JavaPairRDD[K, V], valueComparator: Comparator[V]) = this(javaPairRDD, -1, valueComparator) def this(javaPairRDD: JavaPairRDD[K, V]) = this(javaPairRDD, -1, null) import GroupSorted._ override def flatMapValues[W](f: JFlatMapFunction[V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.flatMapValues(v => f.call(v).asScala)) } override def mapValues[W](f: JFunction[V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.mapValues(v => f.call(v))) } def mapKeyValuesToValues[W](f: JFunction[Tuple2[K, V], W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.mapKeyValuesToValues(kv => f.call(kv))) } def mapStreamByKey[W](f: JFunction[JIterator[V], JIterator[W]]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.mapStreamByKey(it => f.call(it.asJava).asScala)) } def foldLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.foldLeftByKey(w)((w, v) => f.call(w, v))) } def reduceLeftByKey[W >: V](f: JFunction2[W, V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new 
GroupSorted[K, W](sGroupSorted.reduceLeftByKey(f.call)) } def scanLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = { implicit def wClassTag: ClassTag[W] = fakeClassTag[W] new GroupSorted[K, W](sGroupSorted.scanLeftByKey(w)((w, v) => f.call(w, v))) } }
Example 69
Source File: utils.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted

import org.apache.spark.Partitioner

case class HashOrdering[A](ord: Ordering[A]) extends Ordering[A] {
  override def compare(x: A, y: A): Int = {
    val h1 = if (x == null) 0 else x.hashCode
    val h2 = if (y == null) 0 else y.hashCode
    if (h1 < h2) -1 else if (h1 > h2) 1 else ord.compare(x, y)
  }
}

private case class KeyPartitioner(partitioner: Partitioner) extends Partitioner {
  override def numPartitions: Int = partitioner.numPartitions

  override def getPartition(key: Any): Int =
    partitioner.getPartition(key.asInstanceOf[Tuple2[Any, Any]]._1)
}
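A sketch of what KeyPartitioner is for (it is package-private, so this assumes code living in com.tresata.spark.sorted): it lets a partitioner defined on keys alone be applied to (key, value) tuples by ignoring the value component.

import org.apache.spark.HashPartitioner

val byKeyOnly = KeyPartitioner(new HashPartitioner(4))
byKeyOnly.getPartition(("user-1", 42)) == new HashPartitioner(4).getPartition("user-1")   // true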
Example 70
Source File: CustomPartitionerExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch03.partition import org.apache.log4j.{Level, Logger} import org.apache.spark.Partitioner import org.apache.spark.{SparkConf, SparkContext} // scalastyle:off println object CustomPartitionerExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) val conf = new SparkConf().setAppName("CustomPartitionerExample") val sc = new SparkContext(conf) run(sc) sc.stop() } def run(sc: SparkContext) { val fruits = sc.parallelize(Array("Apple", "Orange", "Peach", "Orange", "PineApple", "Orange")) val defaultPartitioned = fruits.map((_, 1)).reduceByKey(_ + _) val customPartitioned = fruits.map((_, 1)).reduceByKey( new FirstLetterPartitioner(sc.defaultParallelism), _ + _) println(s"""fruits:\n ${fruits.collect().mkString(", ")}""") println() println("partitioned by default partitioner") defaultPartitioned.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) println() println("partitioned by first letter partitioner") customPartitioned.glom().mapPartitionsWithIndex((p, it) => it.map(n => s""" Par$p: ${n.mkString(",")}""") ).foreach(println) } } private[partition] class FirstLetterPartitioner(numParts: Int) extends Partitioner { override def numPartitions: Int = numParts override def getPartition(key: Any): Int = { key.toString.charAt(0).hashCode % numPartitions match { case p if p < 0 => p + numPartitions case p => p } } override def equals(other: Any): Boolean = { other match { case p: FirstLetterPartitioner => p.numPartitions == numPartitions case _ => false } } } // scalastyle:on println
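A direct check of the routing rule (the partitioner is private[partition], so this assumes code in the same package): keys that share a first letter always land in the same partition.

val firstLetter = new FirstLetterPartitioner(4)
firstLetter.getPartition("Apple") == firstLetter.getPartition("Avocado")   // true, both route on 'A'
firstLetter.getPartition("Orange")                                         // 3 with 4 partitions ('O' = 79)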
Example 71
Source File: RoutingTablePartition.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx.impl import scala.reflect.ClassTag import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.rdd.ShuffledRDD import org.apache.spark.util.collection.{BitSet, PrimitiveVector} import org.apache.spark.graphx._ import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap import org.apache.spark.graphx.impl.RoutingTablePartition.RoutingTableMessage private[graphx] object RoutingTablePartition { def foreachWithinEdgePartition (pid: PartitionID, includeSrc: Boolean, includeDst: Boolean) (f: VertexId => Unit) { val (vidsCandidate, srcVids, dstVids) = routingTable(pid) val size = vidsCandidate.length if (includeSrc && includeDst) { // Avoid checks for performance vidsCandidate.iterator.foreach(f) } else if (!includeSrc && !includeDst) { // Do nothing } else { val relevantVids = if (includeSrc) srcVids else dstVids relevantVids.iterator.foreach { i => f(vidsCandidate(i)) } } } }