org.apache.spark.Partitioner.defaultPartitioner Scala Examples
The following examples show how to use org.apache.spark.Partitioner.defaultPartitioner. They are drawn from open-source projects; the source file, originating project, and license are noted above each example.
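Before the project examples, a minimal sketch of the call itself may help. The data and SparkContext setup below are made up for illustration; roughly speaking, defaultPartitioner reuses a suitable existing partitioner from its input RDDs and otherwise falls back to a HashPartitioner sized from spark.default.parallelism or the largest input partition count.

import org.apache.spark.SparkContext
import org.apache.spark.Partitioner.defaultPartitioner

val sc = new SparkContext("local[2]", "default-partitioner-demo")
val left = sc.parallelize(Seq((1, "a"), (2, "b"), (3, "c")))
val right = sc.parallelize(Seq((1, 1.0), (3, 3.0)))

// Let Spark pick the partitioner for a cogroup-style operation over both RDDs.
val partitioner = defaultPartitioner(left, right)
val cogrouped = left.cogroup(right, partitioner)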
Example 1
Source File: GroupSorted.scala From spark-sorted with Apache License 2.0
package com.tresata.spark.sorted.api.java

import java.util.{ Comparator, Iterator => JIterator }
import scala.reflect.ClassTag
import scala.collection.JavaConverters._

import org.apache.spark.{ Partitioner, HashPartitioner }
import org.apache.spark.Partitioner.defaultPartitioner
import org.apache.spark.api.java.JavaPairRDD
import org.apache.spark.api.java.function.{ Function => JFunction, Function2 => JFunction2, FlatMapFunction => JFlatMapFunction }

import com.tresata.spark.sorted.{ GroupSorted => SGroupSorted }

object GroupSorted {
  private case class ComparatorOrdering[T](comparator: Comparator[T]) extends Ordering[T] {
    def compare(x: T, y: T) = comparator.compare(x, y)
  }

  private def comparatorToOrdering[T](comparator: Comparator[T]): Ordering[T] = new ComparatorOrdering(comparator)

  private def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]]

  private implicit def ordering[K]: Ordering[K] = comparatorToOrdering(NaturalComparator.get[K])

  private def groupSort[K, V](javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]): SGroupSorted[K, V] = {
    implicit def kClassTag: ClassTag[K] = javaPairRDD.kClassTag
    implicit def vClassTag: ClassTag[V] = javaPairRDD.vClassTag
    val valueOrdering = Option(valueComparator).map(comparatorToOrdering)
    SGroupSorted(javaPairRDD.rdd, partitioner, valueOrdering)
  }
}

class GroupSorted[K, V] private (sGroupSorted: SGroupSorted[K, V])
  extends JavaPairRDD[K, V](sGroupSorted)(GroupSorted.fakeClassTag[K], GroupSorted.fakeClassTag[V]) {

  def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]) =
    this(GroupSorted.groupSort(javaPairRDD, partitioner, valueComparator))

  def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner) =
    this(GroupSorted.groupSort(javaPairRDD, partitioner, null))

  def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int, valueComparator: Comparator[V]) =
    this(javaPairRDD, if (numPartitions > 0) new HashPartitioner(numPartitions) else defaultPartitioner(javaPairRDD.rdd), valueComparator)

  def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int) =
    this(javaPairRDD, numPartitions, null)

  def this(javaPairRDD: JavaPairRDD[K, V], valueComparator: Comparator[V]) =
    this(javaPairRDD, -1, valueComparator)

  def this(javaPairRDD: JavaPairRDD[K, V]) =
    this(javaPairRDD, -1, null)

  import GroupSorted._

  override def flatMapValues[W](f: JFlatMapFunction[V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.flatMapValues(v => f.call(v).asScala))
  }

  override def mapValues[W](f: JFunction[V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapValues(v => f.call(v)))
  }

  def mapKeyValuesToValues[W](f: JFunction[Tuple2[K, V], W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapKeyValuesToValues(kv => f.call(kv)))
  }

  def mapStreamByKey[W](f: JFunction[JIterator[V], JIterator[W]]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapStreamByKey(it => f.call(it.asJava).asScala))
  }

  def foldLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.foldLeftByKey(w)((w, v) => f.call(w, v)))
  }

  def reduceLeftByKey[W >: V](f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.reduceLeftByKey(f.call))
  }

  def scanLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.scanLeftByKey(w)((w, v) => f.call(w, v)))
  }
}
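As a rough usage sketch (the data and context setup are assumptions, not taken from the project), the convenience constructors above pass -1 as the partition count, so building a GroupSorted without an explicit Partitioner ends up calling defaultPartitioner(javaPairRDD.rdd):

import scala.collection.JavaConverters._
import org.apache.spark.api.java.JavaSparkContext
import com.tresata.spark.sorted.api.java.GroupSorted

val jsc = new JavaSparkContext("local[2]", "group-sorted-demo")
val pairs = jsc.parallelizePairs(Seq(("a", 2), ("a", 1), ("b", 3)).asJava)

// No partitioner and no positive partition count supplied, so the constructor
// chain falls back to defaultPartitioner(pairs.rdd).
val grouped = new GroupSorted(pairs)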
Example 2
Source File: BlockJoinOperations.scala From spark-skewjoin with Apache License 2.0
package com.tresata.spark.skewjoin

import java.util.{ Random => JRandom }
import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.Partitioner
import org.apache.spark.Partitioner.defaultPartitioner

class BlockJoinOperations[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) extends Serializable {
  // based on blockJoinWithSmaller in scalding. See com.twitter.scalding.JoinAlgorithms
  private def blockCogroup[W](other: RDD[(K, W)], leftReplication: Int, rightReplication: Int, partitioner: Partitioner): RDD[((K, (Int, Int)), (Iterable[V], Iterable[W]))] = {
    assert(leftReplication >= 1, "must specify a positive number for left replication")
    assert(rightReplication >= 1, "must specify a positive number for right replication")

    def getReplication(random: JRandom, replication: Int, otherReplication: Int): Seq[(Int, Int)] = {
      val rand = random.nextInt(otherReplication)
      (0 until replication).map{ rep => (rand, rep) }
    }

    val rddBlocked = rdd.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv => getReplication(random, leftReplication, rightReplication).map{ rl => ((kv._1, rl.swap), kv._2) } }
    }

    val otherBlocked = other.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv => getReplication(random, rightReplication, leftReplication).map{ lr => ((kv._1, lr), kv._2) } }
    }

    rddBlocked.cogroup(otherBlocked, partitioner)
  }

  def blockRightOuterJoin[W](other: RDD[(K, W)], leftReplication: Int): RDD[(K, (Option[V], W))] =
    blockRightOuterJoin(other, leftReplication, defaultPartitioner(rdd, other))
}
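A hedged usage sketch follows (illustrative data; it assumes the Dsl object shown in the next example is imported to pick up the implicit conversion to BlockJoinOperations). The two-argument overload above supplies the partitioner itself via defaultPartitioner(rdd, other):

import org.apache.spark.SparkContext
import com.tresata.spark.skewjoin.Dsl._

val sc = new SparkContext("local[2]", "block-join-demo")
val left = sc.parallelize(Seq((1, "a"), (1, "b"), (1, "c"), (2, "d")))
val right = sc.parallelize(Seq((1, 10), (3, 30)))

// Replicate the left side 2 ways; the partitioner comes from defaultPartitioner(rdd, other).
val joined = left.blockRightOuterJoin(right, 2) // RDD[(Int, (Option[String], Int))]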
Example 3
Source File: SkewJoinOperations.scala From spark-skewjoin with Apache License 2.0
package com.tresata.spark.skewjoin

import java.util.{ Random => JRandom }
import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.Partitioner
import org.apache.spark.Partitioner.defaultPartitioner

import com.twitter.algebird.{ CMS, CMSHasher, CMSMonoid }

case class CMSParams(eps: Double = 0.005, delta: Double = 1e-8, seed: Int = 1) {
  def getCMSMonoid[K: Ordering: CMSHasher]: CMSMonoid[K] = CMS.monoid[K](eps, delta, seed)
}

class SkewJoinOperations[K: ClassTag: Ordering: CMSHasher, V: ClassTag](rdd: RDD[(K, V)]) extends Serializable {
  private def getReplicationFactors(random: JRandom, replication: Int, otherReplication: Int): Seq[(Int, Int)] = {
    require(replication > 0 && otherReplication > 0, "replication must be positive")
    val rand = random.nextInt(otherReplication)
    (0 until replication).map(rep => (rand, rep))
  }

  private def createRddCMS[K](rdd: RDD[K], cmsMonoid: CMSMonoid[K]): CMS[K] =
    rdd.map(k => cmsMonoid.create(k)).reduce(cmsMonoid.plus(_, _))

  def skewCogroup[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (Iterable[V], Iterable[W]))] = {
    val numPartitions = partitioner.numPartitions
    val broadcastedLeftCMS = rdd.sparkContext.broadcast(createRddCMS[K](rdd.keys, cmsParams.getCMSMonoid[K]))
    val broadcastedRightCMS = rdd.sparkContext.broadcast(createRddCMS[K](other.keys, cmsParams.getCMSMonoid[K]))

    val rddSkewed = rdd.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv =>
        val (leftReplication, rightReplication) = skewReplication.getReplications(
          broadcastedLeftCMS.value.frequency(kv._1).estimate,
          broadcastedRightCMS.value.frequency(kv._1).estimate,
          numPartitions)
        getReplicationFactors(random, leftReplication, rightReplication).map(rl => ((kv._1, rl.swap), kv._2))
      }
    }

    val otherSkewed = other.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv =>
        val (leftReplication, rightReplication) = skewReplication.getReplications(
          broadcastedLeftCMS.value.frequency(kv._1).estimate,
          broadcastedRightCMS.value.frequency(kv._1).estimate,
          numPartitions)
        getReplicationFactors(random, rightReplication, leftReplication).map(lr => ((kv._1, lr), kv._2))
      }
    }

    rddSkewed.cogroup(otherSkewed, partitioner).map(kv => (kv._1._1, kv._2))
  }

  def skewCogroup[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))] =
    skewCogroup(other, defaultPartitioner(rdd, other))

  def skewJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (V, W))] =
    skewCogroup(other, partitioner, skewReplication, cmsParams).flatMap{ blockPair =>
      for (v <- blockPair._2._1.iterator; w <- blockPair._2._2.iterator) yield (blockPair._1, (v, w))
    }

  def skewJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (V, W))] =
    skewJoin(other, defaultPartitioner(rdd, other))

  def skewLeftOuterJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (V, Option[W]))] =
    skewCogroup(other, partitioner, RightReplication(skewReplication), cmsParams).flatMap{
      case (k, (itv, Seq())) => itv.iterator.map(v => (k, (v, None)))
      case (k, (itv, itw)) => for (v <- itv; w <- itw) yield (k, (v, Some(w)))
    }

  def skewLeftOuterJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))] =
    skewLeftOuterJoin(other, defaultPartitioner(rdd, other))

  def skewRightOuterJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (Option[V], W))] =
    skewCogroup(other, partitioner, LeftReplication(skewReplication), cmsParams).flatMap{
      case (k, (Seq(), itw)) => itw.iterator.map(w => (k, (None, w)))
      case (k, (itv, itw)) => for (v <- itv; w <- itw) yield (k, (Some(v), w))
    }

  def skewRightOuterJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (Option[V], W))] =
    skewRightOuterJoin(other, defaultPartitioner(rdd, other))
}

trait Dsl {
  implicit def rddToSkewJoinOperations_e94qoy3tnt[K: ClassTag: Ordering: CMSHasher, V: ClassTag](rdd: RDD[(K, V)]): SkewJoinOperations[K, V] =
    new SkewJoinOperations(rdd)

  implicit def rddToBlockJoinOperations_7IaIe6dkih[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): BlockJoinOperations[K, V] =
    new BlockJoinOperations(rdd)
}

object Dsl extends Dsl
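A short end-to-end sketch under assumed data (the CMSHasher for the Int keys comes from Algebird's CMSHasherImplicits, as in the spec of the next example). All of the single-argument overloads above defer the partitioner choice to defaultPartitioner(rdd, other):

import org.apache.spark.SparkContext
import com.tresata.spark.skewjoin.Dsl._
import com.twitter.algebird.CMSHasherImplicits._

val sc = new SparkContext("local[2]", "skew-join-demo")
// Key 1 is deliberately skewed on the left side.
val left = sc.parallelize(Seq.fill(1000)((1, "x")) ++ Seq((2, "y"), (3, "z")))
val right = sc.parallelize(Seq((1, 1.0), (2, 2.0)))

// Equivalent to skewJoin(right, defaultPartitioner(left, right)).
val joined = left.skewJoin(right) // RDD[(Int, (String, Double))]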
Example 4
Source File: SkewJoinOperationsSpec.scala From spark-skewjoin with Apache License 2.0
package com.tresata.spark.skewjoin

import org.scalatest.FunSpec

import com.tresata.spark.skewjoin.Dsl._
import com.twitter.algebird.CMSHasherImplicits._
import org.apache.spark.Partitioner.defaultPartitioner

case object DummySkewReplication extends SkewReplication {
  override def getReplications(leftCount: Long, rightCount: Long, numPartitions: Int) = (2, 2)
}

class SkewJoinOperationsSpec extends FunSpec {
  lazy val sc = SparkSuite.sc

  lazy val rdd1 = sc.parallelize(Array(1, 1, 2, 3, 4)).map(s => (s, 1)).repartition(2)
  lazy val rdd2 = sc.parallelize(Array(1, 1, 6, 4, 5)).map(s => (s, 2)).repartition(2)

  describe("SkewJoin") {
    it("should inner join two datasets using skewJoin correctly") {
      assert(rdd1.skewJoin(rdd2, defaultPartitioner(rdd1, rdd2), DefaultSkewReplication(1)).sortByKey(true).collect.toList ===
        Seq((1, (1, 2)), (1, (1, 2)), (1, (1, 2)), (1, (1, 2)), (4, (1, 2))))
    }

    it("should left join two datasets using skewLeftOuterJoin correctly") {
      assert(rdd1.skewLeftOuterJoin(rdd2, defaultPartitioner(rdd1, rdd2), DefaultSkewReplication(1)).sortByKey(true).collect.toList ===
        Seq((1, (1, Some(2))), (1, (1, Some(2))), (1, (1, Some(2))), (1, (1, Some(2))), (2, (1, None)), (3, (1, None)), (4, (1, Some(2)))))
    }

    it("should right join two datasets using skewRightOuterJoin correctly") {
      assert(rdd1.skewRightOuterJoin(rdd2, defaultPartitioner(rdd1, rdd2), DefaultSkewReplication(1)).sortByKey(true).collect.toList ===
        Seq((1, (Some(1), 2)), (1, (Some(1), 2)), (1, (Some(1), 2)), (1, (Some(1), 2)), (4, (Some(1), 2)), (5, (None, 2)), (6, (None, 2))))
    }
  }
}
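The tests above pass defaultPartitioner(rdd1, rdd2) explicitly so that they can also supply DefaultSkewReplication(1); the convenience overloads from Example 3 compute exactly that partitioner themselves. A hedged equivalent of the first assertion's left-hand side, which should give the same logical join output since the replication setting only changes how heavy keys are spread across partitions, is:

rdd1.skewJoin(rdd2).sortByKey(true).collect.toList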