org.apache.spark.Partitioner.defaultPartitioner Scala Examples
The following examples show how to use org.apache.spark.Partitioner.defaultPartitioner. They are drawn from open-source projects; the source file, originating project, and license are noted above each example.
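Before the project examples, a minimal sketch of the call itself may help. The data and SparkContext setup below are made up for illustration; roughly speaking, defaultPartitioner reuses a suitable existing partitioner from its input RDDs and otherwise falls back to a HashPartitioner sized from spark.default.parallelism or the largest input partition count.

import org.apache.spark.SparkContext
import org.apache.spark.Partitioner.defaultPartitioner

val sc = new SparkContext("local[2]", "default-partitioner-demo")
val left = sc.parallelize(Seq((1, "a"), (2, "b"), (3, "c")))
val right = sc.parallelize(Seq((1, 1.0), (3, 3.0)))

// Let Spark pick the partitioner for a cogroup-style operation over both RDDs.
val partitioner = defaultPartitioner(left, right)
val cogrouped = left.cogroup(right, partitioner)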
Example 1
Source File: GroupSorted.scala From spark-sorted with Apache License 2.0
package com.tresata.spark.sorted.api.java

import java.util.{ Comparator, Iterator => JIterator }
import scala.reflect.ClassTag
import scala.collection.JavaConverters._

import org.apache.spark.{ Partitioner, HashPartitioner }
import org.apache.spark.Partitioner.defaultPartitioner
import org.apache.spark.api.java.JavaPairRDD
import org.apache.spark.api.java.function.{ Function => JFunction, Function2 => JFunction2, FlatMapFunction => JFlatMapFunction }

import com.tresata.spark.sorted.{ GroupSorted => SGroupSorted }

object GroupSorted {
  private case class ComparatorOrdering[T](comparator: Comparator[T]) extends Ordering[T] {
    def compare(x: T, y: T) = comparator.compare(x, y)
  }

  private def comparatorToOrdering[T](comparator: Comparator[T]): Ordering[T] = new ComparatorOrdering(comparator)

  private def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]]

  private implicit def ordering[K]: Ordering[K] = comparatorToOrdering(NaturalComparator.get[K])

  private def groupSort[K, V](javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]): SGroupSorted[K, V] = {
    implicit def kClassTag: ClassTag[K] = javaPairRDD.kClassTag
    implicit def vClassTag: ClassTag[V] = javaPairRDD.vClassTag
    val valueOrdering = Option(valueComparator).map(comparatorToOrdering)
    SGroupSorted(javaPairRDD.rdd, partitioner, valueOrdering)
  }
}

class GroupSorted[K, V] private (sGroupSorted: SGroupSorted[K, V])
  extends JavaPairRDD[K, V](sGroupSorted)(GroupSorted.fakeClassTag[K], GroupSorted.fakeClassTag[V]) {

  def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]) =
    this(GroupSorted.groupSort(javaPairRDD, partitioner, valueComparator))

  def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner) =
    this(GroupSorted.groupSort(javaPairRDD, partitioner, null))

  def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int, valueComparator: Comparator[V]) =
    this(javaPairRDD, if (numPartitions > 0) new HashPartitioner(numPartitions) else defaultPartitioner(javaPairRDD.rdd), valueComparator)

  def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int) =
    this(javaPairRDD, numPartitions, null)

  def this(javaPairRDD: JavaPairRDD[K, V], valueComparator: Comparator[V]) =
    this(javaPairRDD, -1, valueComparator)

  def this(javaPairRDD: JavaPairRDD[K, V]) =
    this(javaPairRDD, -1, null)

  import GroupSorted._

  override def flatMapValues[W](f: JFlatMapFunction[V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.flatMapValues(v => f.call(v).asScala))
  }

  override def mapValues[W](f: JFunction[V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapValues(v => f.call(v)))
  }

  def mapKeyValuesToValues[W](f: JFunction[Tuple2[K, V], W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapKeyValuesToValues(kv => f.call(kv)))
  }

  def mapStreamByKey[W](f: JFunction[JIterator[V], JIterator[W]]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapStreamByKey(it => f.call(it.asJava).asScala))
  }

  def foldLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.foldLeftByKey(w)((w, v) => f.call(w, v)))
  }

  def reduceLeftByKey[W >: V](f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.reduceLeftByKey(f.call))
  }

  def scanLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.scanLeftByKey(w)((w, v) => f.call(w, v)))
  }
}
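As a rough usage sketch (the data and context setup are assumptions, not taken from the project), the convenience constructors above pass -1 as the partition count, so building a GroupSorted without an explicit Partitioner ends up calling defaultPartitioner(javaPairRDD.rdd):

import scala.collection.JavaConverters._
import org.apache.spark.api.java.JavaSparkContext
import com.tresata.spark.sorted.api.java.GroupSorted

val jsc = new JavaSparkContext("local[2]", "group-sorted-demo")
val pairs = jsc.parallelizePairs(Seq(("a", 2), ("a", 1), ("b", 3)).asJava)

// No partitioner and no positive partition count supplied, so the constructor
// chain falls back to defaultPartitioner(pairs.rdd).
val grouped = new GroupSorted(pairs)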
Example 2
Source File: BlockJoinOperations.scala From spark-skewjoin with Apache License 2.0
package com.tresata.spark.skewjoin

import java.util.{ Random => JRandom }
import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.Partitioner
import org.apache.spark.Partitioner.defaultPartitioner

class BlockJoinOperations[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) extends Serializable {
  // based on blockJoinWithSmaller in scalding. See com.twitter.scalding.JoinAlgorithms
  private def blockCogroup[W](other: RDD[(K, W)], leftReplication: Int, rightReplication: Int, partitioner: Partitioner): RDD[((K, (Int, Int)), (Iterable[V], Iterable[W]))] = {
    assert(leftReplication >= 1, "must specify a positive number for left replication")
    assert(rightReplication >= 1, "must specify a positive number for right replication")

    def getReplication(random: JRandom, replication: Int, otherReplication: Int): Seq[(Int, Int)] = {
      val rand = random.nextInt(otherReplication)
      (0 until replication).map{ rep => (rand, rep) }
    }

    val rddBlocked = rdd.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv => getReplication(random, leftReplication, rightReplication).map{ rl => ((kv._1, rl.swap), kv._2) } }
    }

    val otherBlocked = other.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv => getReplication(random, rightReplication, leftReplication).map{ lr => ((kv._1, lr), kv._2) } }
    }

    rddBlocked.cogroup(otherBlocked, partitioner)
  }

  def blockRightOuterJoin[W](other: RDD[(K, W)], leftReplication: Int): RDD[(K, (Option[V], W))] =
    blockRightOuterJoin(other, leftReplication, defaultPartitioner(rdd, other))
}
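A hedged usage sketch follows (illustrative data; it assumes the Dsl object shown in the next example is imported to pick up the implicit conversion to BlockJoinOperations). The two-argument overload above supplies the partitioner itself via defaultPartitioner(rdd, other):

import org.apache.spark.SparkContext
import com.tresata.spark.skewjoin.Dsl._

val sc = new SparkContext("local[2]", "block-join-demo")
val left = sc.parallelize(Seq((1, "a"), (1, "b"), (1, "c"), (2, "d")))
val right = sc.parallelize(Seq((1, 10), (3, 30)))

// Replicate the left side 2 ways; the partitioner comes from defaultPartitioner(rdd, other).
val joined = left.blockRightOuterJoin(right, 2) // RDD[(Int, (Option[String], Int))]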
Example 3
Source File: SkewJoinOperations.scala From spark-skewjoin with Apache License 2.0
package com.tresata.spark.skewjoin

import java.util.{ Random => JRandom }
import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.Partitioner
import org.apache.spark.Partitioner.defaultPartitioner

import com.twitter.algebird.{ CMS, CMSHasher, CMSMonoid }

case class CMSParams(eps: Double = 0.005, delta: Double = 1e-8, seed: Int = 1) {
  def getCMSMonoid[K: Ordering: CMSHasher]: CMSMonoid[K] = CMS.monoid[K](eps, delta, seed)
}

class SkewJoinOperations[K: ClassTag: Ordering: CMSHasher, V: ClassTag](rdd: RDD[(K, V)]) extends Serializable {
  private def getReplicationFactors(random: JRandom, replication: Int, otherReplication: Int): Seq[(Int, Int)] = {
    require(replication > 0 && otherReplication > 0, "replication must be positive")
    val rand = random.nextInt(otherReplication)
    (0 until replication).map(rep => (rand, rep))
  }

  private def createRddCMS[K](rdd: RDD[K], cmsMonoid: CMSMonoid[K]): CMS[K] =
    rdd.map(k => cmsMonoid.create(k)).reduce(cmsMonoid.plus(_, _))

  def skewCogroup[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (Iterable[V], Iterable[W]))] = {
    val numPartitions = partitioner.numPartitions
    val broadcastedLeftCMS = rdd.sparkContext.broadcast(createRddCMS[K](rdd.keys, cmsParams.getCMSMonoid[K]))
    val broadcastedRightCMS = rdd.sparkContext.broadcast(createRddCMS[K](other.keys, cmsParams.getCMSMonoid[K]))

    val rddSkewed = rdd.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv =>
        val (leftReplication, rightReplication) = skewReplication.getReplications(
          broadcastedLeftCMS.value.frequency(kv._1).estimate,
          broadcastedRightCMS.value.frequency(kv._1).estimate,
          numPartitions)
        getReplicationFactors(random, leftReplication, rightReplication).map(rl => ((kv._1, rl.swap), kv._2))
      }
    }

    val otherSkewed = other.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv =>
        val (leftReplication, rightReplication) = skewReplication.getReplications(
          broadcastedLeftCMS.value.frequency(kv._1).estimate,
          broadcastedRightCMS.value.frequency(kv._1).estimate,
          numPartitions)
        getReplicationFactors(random, rightReplication, leftReplication).map(lr => ((kv._1, lr), kv._2))
      }
    }

    rddSkewed.cogroup(otherSkewed, partitioner).map(kv => (kv._1._1, kv._2))
  }

  def skewCogroup[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))] =
    skewCogroup(other, defaultPartitioner(rdd, other))

  def skewJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (V, W))] =
    skewCogroup(other, partitioner, skewReplication, cmsParams).flatMap{ blockPair =>
      for (v <- blockPair._2._1.iterator; w <- blockPair._2._2.iterator) yield (blockPair._1, (v, w))
    }

  def skewJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (V, W))] =
    skewJoin(other, defaultPartitioner(rdd, other))

  def skewLeftOuterJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (V, Option[W]))] =
    skewCogroup(other, partitioner, RightReplication(skewReplication), cmsParams).flatMap{
      case (k, (itv, Seq())) => itv.iterator.map(v => (k, (v, None)))
      case (k, (itv, itw)) => for (v <- itv; w <- itw) yield (k, (v, Some(w)))
    }

  def skewLeftOuterJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))] =
    skewLeftOuterJoin(other, defaultPartitioner(rdd, other))

  def skewRightOuterJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (Option[V], W))] =
    skewCogroup(other, partitioner, LeftReplication(skewReplication), cmsParams).flatMap{
      case (k, (Seq(), itw)) => itw.iterator.map(w => (k, (None, w)))
      case (k, (itv, itw)) => for (v <- itv; w <- itw) yield (k, (Some(v), w))
    }

  def skewRightOuterJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (Option[V], W))] =
    skewRightOuterJoin(other, defaultPartitioner(rdd, other))
}

trait Dsl {
  implicit def rddToSkewJoinOperations_e94qoy3tnt[K: ClassTag: Ordering: CMSHasher, V: ClassTag](rdd: RDD[(K, V)]): SkewJoinOperations[K, V] =
    new SkewJoinOperations(rdd)

  implicit def rddToBlockJoinOperations_7IaIe6dkih[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): BlockJoinOperations[K, V] =
    new BlockJoinOperations(rdd)
}

object Dsl extends Dsl
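A short end-to-end sketch under assumed data (the CMSHasher for the Int keys comes from Algebird's CMSHasherImplicits, as in the spec of the next example). All of the single-argument overloads above defer the partitioner choice to defaultPartitioner(rdd, other):

import org.apache.spark.SparkContext
import com.tresata.spark.skewjoin.Dsl._
import com.twitter.algebird.CMSHasherImplicits._

val sc = new SparkContext("local[2]", "skew-join-demo")
// Key 1 is deliberately skewed on the left side.
val left = sc.parallelize(Seq.fill(1000)((1, "x")) ++ Seq((2, "y"), (3, "z")))
val right = sc.parallelize(Seq((1, 1.0), (2, 2.0)))

// Equivalent to skewJoin(right, defaultPartitioner(left, right)).
val joined = left.skewJoin(right) // RDD[(Int, (String, Double))]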
Example 4
Source File: SkewJoinOperationsSpec.scala From spark-skewjoin with Apache License 2.0
package com.tresata.spark.skewjoin

import org.scalatest.FunSpec

import com.tresata.spark.skewjoin.Dsl._
import com.twitter.algebird.CMSHasherImplicits._
import org.apache.spark.Partitioner.defaultPartitioner

case object DummySkewReplication extends SkewReplication {
  override def getReplications(leftCount: Long, rightCount: Long, numPartitions: Int) = (2, 2)
}

class SkewJoinOperationsSpec extends FunSpec {
  lazy val sc = SparkSuite.sc

  lazy val rdd1 = sc.parallelize(Array(1, 1, 2, 3, 4)).map(s => (s, 1)).repartition(2)
  lazy val rdd2 = sc.parallelize(Array(1, 1, 6, 4, 5)).map(s => (s, 2)).repartition(2)

  describe("SkewJoin") {
    it("should inner join two datasets using skewJoin correctly") {
      assert(rdd1.skewJoin(rdd2, defaultPartitioner(rdd1, rdd2), DefaultSkewReplication(1)).sortByKey(true).collect.toList ===
        Seq((1, (1, 2)), (1, (1, 2)), (1, (1, 2)), (1, (1, 2)), (4, (1, 2))))
    }

    it("should left join two datasets using skewLeftOuterJoin correctly") {
      assert(rdd1.skewLeftOuterJoin(rdd2, defaultPartitioner(rdd1, rdd2), DefaultSkewReplication(1)).sortByKey(true).collect.toList ===
        Seq((1, (1, Some(2))), (1, (1, Some(2))), (1, (1, Some(2))), (1, (1, Some(2))), (2, (1, None)), (3, (1, None)), (4, (1, Some(2)))))
    }

    it("should right join two datasets using skewRightOuterJoin correctly") {
      assert(rdd1.skewRightOuterJoin(rdd2, defaultPartitioner(rdd1, rdd2), DefaultSkewReplication(1)).sortByKey(true).collect.toList ===
        Seq((1, (Some(1), 2)), (1, (Some(1), 2)), (1, (Some(1), 2)), (1, (Some(1), 2)), (4, (Some(1), 2)), (5, (None, 2)), (6, (None, 2))))
    }
  }
}
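The tests above pass defaultPartitioner(rdd1, rdd2) explicitly so that they can also supply DefaultSkewReplication(1); the convenience overloads from Example 3 compute exactly that partitioner themselves. A hedged equivalent of the first assertion's left-hand side, which should give the same logical join output since the replication setting only changes how heavy keys are spread across partitions, is:

rdd1.skewJoin(rdd2).sortByKey(true).collect.toList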