org.apache.spark.NarrowDependency Scala Examples
The following examples show how to use org.apache.spark.NarrowDependency.
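Each example follows the same pattern: extend NarrowDependency and override getParents(partitionId) so the scheduler knows exactly which parent partitions a given output partition reads, which lets the stage run without a shuffle. Before the project examples, here is a minimal, self-contained sketch of that pattern; PairedRDD and PairedPartition are hypothetical names invented for illustration and do not come from the projects below. Spark's own CoalescedRDD and PartitionPruningRDD rely on the same hook.

import org.apache.spark.{Dependency, NarrowDependency, Partition, TaskContext}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

// Hypothetical partition: child partition `index` reads the listed parent partitions.
case class PairedPartition(index: Int, parents: Seq[Partition]) extends Partition

// Hypothetical RDD that coalesces every two consecutive parent partitions into one
// child partition, declaring the mapping through a custom NarrowDependency.
class PairedRDD[T: ClassTag](@transient var prev: RDD[T])
  extends RDD[T](prev.sparkContext, Nil) {

  // Child partition i covers parent partitions 2*i and, when it exists, 2*i + 1.
  private def parentIndices(i: Int): Seq[Int] =
    2 * i until math.min(2 * i + 2, prev.partitions.length)

  override def getPartitions: Array[Partition] = {
    val parentPartitions = dependencies.head.rdd.asInstanceOf[RDD[T]].partitions
    Array.tabulate[Partition]((parentPartitions.length + 1) / 2) { i =>
      PairedPartition(i, parentIndices(i).map(idx => parentPartitions(idx)))
    }
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    split.asInstanceOf[PairedPartition].parents.iterator
      .flatMap(p => firstParent[T].iterator(p, context))

  override def getDependencies: Seq[Dependency[_]] =
    Seq(new NarrowDependency[T](prev) {
      // The NarrowDependency contract: map a child partition id to the parent
      // partition ids it reads, so no shuffle dependency is introduced.
      override def getParents(partitionId: Int): Seq[Int] = parentIndices(partitionId)
    })

  override def clearDependencies(): Unit = {
    super.clearDependencies()
    prev = null
  }
}

Under these assumptions, new PairedRDD(sc.parallelize(1 to 100, 8)) has four partitions, and collect() runs in a single stage because every child partition reads its two parents directly.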
Example 1
Source File: LeftJoin.scala From flint with Apache License 2.0
package com.twosigma.flint.rdd.function.join

import com.twosigma.flint.rdd.{ PartitionsIterator, PeekableIterator }
import org.apache.spark.{ NarrowDependency, OneToOneDependency }
import com.twosigma.flint.rdd.OrderedRDD

import scala.collection.immutable.TreeMap
import scala.reflect.ClassTag
import java.util.{ HashMap => JHashMap }

protected[flint] object LeftJoin {

  val skMapInitialSize = 1024

  def apply[K: ClassTag, SK, V, V2](
    leftRdd: OrderedRDD[K, V],
    rightRdd: OrderedRDD[K, V2],
    toleranceFn: K => K,
    leftSk: V => SK,
    rightSk: V2 => SK
  )(implicit ord: Ordering[K]): OrderedRDD[K, (V, Option[(K, V2)])] = {
    // A map from left partition index to left range split and right partitions.
    val leftIndexToJoinSplits = TreeMap(RangeMergeJoin.leftJoinSplits(
      toleranceFn, leftRdd.rangeSplits, rightRdd.rangeSplits
    ).map { case (split, parts) => (split.partition.index, (split, parts)) }: _*)

    val leftDep = new OneToOneDependency(leftRdd)
    val rightDep = new NarrowDependency(rightRdd) {
      override def getParents(partitionId: Int) =
        leftIndexToJoinSplits(partitionId)._2.map(_.index)
    }

    // A map from left partition index to right partitions
    val rightPartitions = leftRdd.sc.broadcast(leftIndexToJoinSplits.map {
      case (idx, joinSplit) => (idx, joinSplit._2)
    })

    val joinedSplits = leftIndexToJoinSplits.map { case (_, (split, _)) => split }.toArray

    // We don't need the left dependency as we will just load it on demand here
    new OrderedRDD[K, (V, Option[(K, V2)])](leftRdd.sc, joinedSplits, Seq(leftDep, rightDep))(
      (part, context) => {
        val parts = rightPartitions.value(part.index)
        val rightIter = PeekableIterator(PartitionsIterator(rightRdd, parts, context))
        val lastSeen = new JHashMap[SK, (K, V2)](skMapInitialSize)
        leftRdd.iterator(part, context).map {
          case (k, v) =>
            // Catch-up the iterator for the right table to match the left key. In the
            // process, we'll have the last-seen row for each SK in the right table.
            val sk = leftSk(v)
            catchUp(k, rightSk, rightIter, lastSeen)
            val lastSeenRight = lastSeen.get(sk)
            if (lastSeenRight != null && ord.gteq(lastSeenRight._1, toleranceFn(k))) {
              (k, (v, Some(lastSeenRight)))
            } else {
              (k, (v, None))
            }
        }
      }
    )
  }

  @annotation.tailrec
  private[rdd] def catchUp[K, SK, V](
    cur: K,
    skFn: V => SK,
    iter: PeekableIterator[(K, V)],
    lastSeen: JHashMap[SK, (K, V)]
  )(implicit ord: Ordering[K]) {
    val peek = iter.peek
    if (peek.nonEmpty && ord.lteq(peek.get._1, cur)) {
      val (k, v) = iter.next
      val sk = skFn(v)
      lastSeen.put(sk, (k, v))
      catchUp(cur, skFn, iter, lastSeen)
    }
  }
}
Example 2
Source File: Merge.scala From flint with Apache License 2.0
package com.twosigma.flint.rdd.function.join

import com.twosigma.flint.rdd.{ PeekableIterator, PartitionsIterator, MergeIterator, RangeSplit }
import org.apache.spark.NarrowDependency
import com.twosigma.flint.rdd._

import scala.reflect.ClassTag

protected[flint] object Merge {

  def apply[K: Ordering: ClassTag, V: ClassTag](
    left: OrderedRDD[K, V],
    right: OrderedRDD[K, V]
  ): OrderedRDD[K, V] = ++(left, right).mapValues {
    case (_, Left(v)) => v
    case (_, Right(v)) => v
  }

  def ++[K: ClassTag, V: ClassTag, V2: ClassTag](
    left: OrderedRDD[K, V],
    right: OrderedRDD[K, V2]
  )(
    implicit ord: Ordering[K]
  ): OrderedRDD[K, Either[V, V2]] = {
    // A map from new partition to a RangeMergeJoin.
    val partToMergeJoin = RangeMergeJoin.mergeSplits(left.rangeSplits, right.rangeSplits).zipWithIndex.map {
      case (mergeJoin, idx) => (OrderedRDDPartition(idx), mergeJoin)
    }.toMap

    // A map from partition index to a RangeMergeJoin.
    val partitionIndexToMergeJoin = partToMergeJoin.map { case (p, m) => (p.index, m) }

    val leftDep = new NarrowDependency(left) {
      override def getParents(partitionId: Int) =
        partitionIndexToMergeJoin(partitionId).left.map(_.partition.index)
    }
    val rightDep = new NarrowDependency(right) {
      override def getParents(partitionId: Int) =
        partitionIndexToMergeJoin(partitionId).right.map(_.partition.index)
    }

    val mergedSplits = partToMergeJoin.map {
      case (p, mergeJoin) => RangeSplit(p, mergeJoin.range)
    }.toArray

    new OrderedRDD[K, Either[V, V2]](left.sc, mergedSplits, Seq(leftDep, rightDep))(
      (part, context) => {
        val mergedJoin = partitionIndexToMergeJoin(part.index)
        // Select rows from both RDDs whose key belongs to this RangeMergeJoin's range
        val leftParts = mergedJoin.left.map(_.partition)
        val leftIter = PeekableIterator(PartitionsIterator(left, leftParts, context).filter {
          case (k, _) => mergedJoin.range.contains(k)
        })
        val rightParts = mergedJoin.right.map(_.partition)
        val rightIter = PeekableIterator(PartitionsIterator(right, rightParts, context).filter {
          case (k, _) => mergedJoin.range.contains(k)
        })
        // Perform an ordered merge of the selected rows.
        MergeIterator(leftIter, rightIter)
      }
    )
  }
}
Example 3
Source File: ReorderedPartitionsRDD.scala From hail with MIT License
package is.hail.sparkextras

import is.hail.utils.FastSeq
import org.apache.spark.rdd.RDD
import org.apache.spark.{Dependency, NarrowDependency, Partition, TaskContext}

import scala.reflect.ClassTag

case class ReorderedPartitionsRDDPartition(index: Int, oldPartition: Partition) extends Partition

class ReorderedPartitionsRDD[T](@transient var prev: RDD[T], @transient val oldIndices: Array[Int])(implicit tct: ClassTag[T])
  extends RDD[T](prev.sparkContext, Nil) {

  override def getPartitions: Array[Partition] = {
    val parentPartitions = dependencies.head.rdd.asInstanceOf[RDD[T]].partitions
    Array.tabulate(oldIndices.length) { i =>
      val oldIndex = oldIndices(i)
      val oldPartition = parentPartitions(oldIndex)
      ReorderedPartitionsRDDPartition(i, oldPartition)
    }
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    val parent = dependencies.head.rdd.asInstanceOf[RDD[T]]
    parent.compute(split.asInstanceOf[ReorderedPartitionsRDDPartition].oldPartition, context)
  }

  override def getDependencies: Seq[Dependency[_]] = FastSeq(new NarrowDependency[T](prev) {
    override def getParents(partitionId: Int): Seq[Int] = FastSeq(oldIndices(partitionId))
  })

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }

  override def getPreferredLocations(partition: Partition): Seq[String] =
    prev.preferredLocations(partition.asInstanceOf[ReorderedPartitionsRDDPartition].oldPartition)
}
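A brief usage sketch for the class above, assuming the hail artifact (is.hail.sparkextras) is on the classpath; the local-mode setup and the permutation Array(2, 0, 1) are illustrative values, not taken from the project.

import is.hail.sparkextras.ReorderedPartitionsRDD
import org.apache.spark.{SparkConf, SparkContext}

object ReorderedPartitionsExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("reorder"))
    val base = sc.parallelize(0 until 12, numSlices = 3)
    // New partition 0 reads old partition 2, new partition 1 reads old 0, new partition 2 reads old 1.
    // The NarrowDependency above maps each new index to exactly one parent index, so no shuffle occurs.
    val reordered = new ReorderedPartitionsRDD(base, Array(2, 0, 1))
    println(reordered.glom().collect().map(_.mkString(",")).mkString(" | "))
    sc.stop()
  }
}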
Example 4
Source File: BlockedRDD.scala From hail with MIT License
package is.hail.sparkextras

import is.hail.utils._
import org.apache.spark.rdd.RDD
import org.apache.spark.{Dependency, NarrowDependency, Partition, TaskContext}

import scala.language.existentials
import scala.reflect.ClassTag

case class BlockedRDDPartition(@transient rdd: RDD[_],
  index: Int,
  first: Int,
  last: Int) extends Partition {
  require(first <= last)

  val parentPartitions: Array[Partition] = range.map(rdd.partitions).toArray

  def range: Range = first to last
}

class BlockedRDD[T](@transient var prev: RDD[T],
  @transient val partFirst: Array[Int],
  @transient val partLast: Array[Int]
)(implicit tct: ClassTag[T]) extends RDD[T](prev.sparkContext, Nil) {
  assert(partFirst.length == partLast.length)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](partFirst.length)(i =>
      BlockedRDDPartition(prev, i, partFirst(i), partLast(i)))
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    val parent = dependencies.head.rdd.asInstanceOf[RDD[T]]
    split.asInstanceOf[BlockedRDDPartition].parentPartitions.iterator.flatMap(p =>
      parent.iterator(p, context))
  }

  override def getDependencies: Seq[Dependency[_]] = {
    FastSeq(new NarrowDependency(prev) {
      def getParents(id: Int): Seq[Int] =
        partitions(id).asInstanceOf[BlockedRDDPartition].range
    })
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val prevPartitions = prev.partitions
    val range = partition.asInstanceOf[BlockedRDDPartition].range

    val locationAvail = range.flatMap(i =>
      prev.preferredLocations(prevPartitions(i)))
      .groupBy(identity)
      .mapValues(_.length)

    if (locationAvail.isEmpty)
      return FastSeq.empty[String]

    val m = locationAvail.values.max
    locationAvail.filter(_._2 == m)
      .keys
      .toFastSeq
  }
}
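A comparable sketch for BlockedRDD, again with an illustrative local setup: each output partition i spans parent partitions partFirst(i) through partLast(i), which is exactly the range the NarrowDependency above reports via getParents.

import is.hail.sparkextras.BlockedRDD
import org.apache.spark.{SparkConf, SparkContext}

object BlockedRDDExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("blocked"))
    val base = sc.parallelize(0 until 60, numSlices = 6)
    // Coalesce six parent partitions into three contiguous blocks: 0..1, 2..3 and 4..5.
    val blocked = new BlockedRDD(base, partFirst = Array(0, 2, 4), partLast = Array(1, 3, 5))
    println(blocked.getNumPartitions) // 3
    println(blocked.glom().collect().map(_.length).mkString(","))
    sc.stop()
  }
}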
Example 5
Source File: InsertRDD.scala From spark-vector with Apache License 2.0
package com.actian.spark_vector.datastream.writer

import scala.annotation.tailrec
import scala.reflect.ClassTag

import org.apache.spark.{ OneToOneDependency, NarrowDependency, Partition, TaskContext }
import org.apache.spark.rdd.RDD

import com.actian.spark_vector.datastream.{ DataStreamPartition, DataStreamPartitionAssignment, VectorEndpointConf }

// NOTE: this excerpt resumes inside the body of the InsertRDD[R] class. The class
// declaration itself, which binds `rdd`, `writeConf`, `getPreferredLocationsRec` and
// `partitionsPerDataStreamToPrint`, is not included in the snippet.

  private val endPointsToParentPartitionsMap = {
    val affinities = rdd.partitions.map(getPreferredLocationsRec(rdd, _))
    val ret = DataStreamPartitionAssignment(affinities, writeConf.vectorEndpoints)
    logDebug(s"Computed endPointsToParentPartitionsMap and got: ${
      (0 until ret.length).map {
        case idx =>
          val vals = ret(idx)
          s"Datastream $idx -> RDD partitions ${vals.length}: ${vals.take(partitionsPerDataStreamToPrint).mkString(",")} ${if (vals.length > partitionsPerDataStreamToPrint) "..." else ""}"
      }
    }")
    ret.map(_.map(rdd.partitions(_).index))
  }

  override protected def getPartitions =
    (0 until writeConf.size).map(idx => DataStreamPartition(idx, rdd, endPointsToParentPartitionsMap(idx))).toArray

  override protected def getPreferredLocations(split: Partition) = {
    logDebug(s"getPreferredLocations is called for partition ${split.index} and we are returning ${writeConf.vectorEndpoints(split.index).host}")
    Seq(writeConf.vectorEndpoints(split.index).host)
  }

  override def compute(split: Partition, taskContext: TaskContext): Iterator[R] =
    split.asInstanceOf[DataStreamPartition].parents.toIterator.flatMap(firstParent[R].iterator(_, taskContext))

  // Each output partition (one per datastream) depends only on the input partitions
  // assigned to that datastream, expressed through a NarrowDependency.
  override def getDependencies: Seq[NarrowDependency[_]] = Seq(new NarrowDependency(rdd) {
    def getParents(partitionId: Int) = endPointsToParentPartitionsMap(partitionId)
  })
}