org.apache.spark.NarrowDependency Scala Examples
The following examples show how to use org.apache.spark.NarrowDependency.
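Each example follows the same pattern: extend NarrowDependency and override getParents(partitionId) so the scheduler knows exactly which parent partitions a given output partition reads, which lets the stage run without a shuffle. Before the project examples, here is a minimal, self-contained sketch of that pattern; PairedRDD and PairedPartition are hypothetical names invented for illustration and do not come from the projects below. Spark's own CoalescedRDD and PartitionPruningRDD rely on the same hook.

import org.apache.spark.{Dependency, NarrowDependency, Partition, TaskContext}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

// Hypothetical partition: child partition `index` reads the listed parent partitions.
case class PairedPartition(index: Int, parents: Seq[Partition]) extends Partition

// Hypothetical RDD that coalesces every two consecutive parent partitions into one
// child partition, declaring the mapping through a custom NarrowDependency.
class PairedRDD[T: ClassTag](@transient var prev: RDD[T])
  extends RDD[T](prev.sparkContext, Nil) {

  // Child partition i covers parent partitions 2*i and, when it exists, 2*i + 1.
  private def parentIndices(i: Int): Seq[Int] =
    2 * i until math.min(2 * i + 2, prev.partitions.length)

  override def getPartitions: Array[Partition] = {
    val parentPartitions = dependencies.head.rdd.asInstanceOf[RDD[T]].partitions
    Array.tabulate[Partition]((parentPartitions.length + 1) / 2) { i =>
      PairedPartition(i, parentIndices(i).map(idx => parentPartitions(idx)))
    }
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    split.asInstanceOf[PairedPartition].parents.iterator
      .flatMap(p => firstParent[T].iterator(p, context))

  override def getDependencies: Seq[Dependency[_]] =
    Seq(new NarrowDependency[T](prev) {
      // The NarrowDependency contract: map a child partition id to the parent
      // partition ids it reads, so no shuffle dependency is introduced.
      override def getParents(partitionId: Int): Seq[Int] = parentIndices(partitionId)
    })

  override def clearDependencies(): Unit = {
    super.clearDependencies()
    prev = null
  }
}

Under these assumptions, new PairedRDD(sc.parallelize(1 to 100, 8)) has four partitions, and collect() runs in a single stage because every child partition reads its two parents directly.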
Example 1
Source File: LeftJoin.scala From flint with Apache License 2.0
package com.twosigma.flint.rdd.function.join

import com.twosigma.flint.rdd.{ PartitionsIterator, PeekableIterator }
import org.apache.spark.{ NarrowDependency, OneToOneDependency }
import com.twosigma.flint.rdd.OrderedRDD

import scala.collection.immutable.TreeMap
import scala.reflect.ClassTag
import java.util.{ HashMap => JHashMap }

protected[flint] object LeftJoin {

  val skMapInitialSize = 1024

  def apply[K: ClassTag, SK, V, V2](
    leftRdd: OrderedRDD[K, V],
    rightRdd: OrderedRDD[K, V2],
    toleranceFn: K => K,
    leftSk: V => SK,
    rightSk: V2 => SK
  )(implicit ord: Ordering[K]): OrderedRDD[K, (V, Option[(K, V2)])] = {
    // A map from left partition index to left range split and right partitions.
    val leftIndexToJoinSplits = TreeMap(RangeMergeJoin.leftJoinSplits(
      toleranceFn, leftRdd.rangeSplits, rightRdd.rangeSplits
    ).map { case (split, parts) => (split.partition.index, (split, parts)) }: _*)

    val leftDep = new OneToOneDependency(leftRdd)
    val rightDep = new NarrowDependency(rightRdd) {
      override def getParents(partitionId: Int) =
        leftIndexToJoinSplits(partitionId)._2.map(_.index)
    }

    // A map from left partition index to right partitions
    val rightPartitions = leftRdd.sc.broadcast(leftIndexToJoinSplits.map {
      case (idx, joinSplit) => (idx, joinSplit._2)
    })

    val joinedSplits = leftIndexToJoinSplits.map { case (_, (split, _)) => split }.toArray

    // We don't need the left dependency as we will just load it on demand here
    new OrderedRDD[K, (V, Option[(K, V2)])](leftRdd.sc, joinedSplits, Seq(leftDep, rightDep))(
      (part, context) => {
        val parts = rightPartitions.value(part.index)
        val rightIter = PeekableIterator(PartitionsIterator(rightRdd, parts, context))
        val lastSeen = new JHashMap[SK, (K, V2)](skMapInitialSize)
        leftRdd.iterator(part, context).map {
          case (k, v) =>
            // Catch-up the iterator for the right table to match the left key. In the
            // process, we'll have the last-seen row for each SK in the right table.
            val sk = leftSk(v)
            catchUp(k, rightSk, rightIter, lastSeen)
            val lastSeenRight = lastSeen.get(sk)
            if (lastSeenRight != null && ord.gteq(lastSeenRight._1, toleranceFn(k))) {
              (k, (v, Some(lastSeenRight)))
            } else {
              (k, (v, None))
            }
        }
      }
    )
  }

  @annotation.tailrec
  private[rdd] def catchUp[K, SK, V](
    cur: K,
    skFn: V => SK,
    iter: PeekableIterator[(K, V)],
    lastSeen: JHashMap[SK, (K, V)]
  )(implicit ord: Ordering[K]) {
    val peek = iter.peek
    if (peek.nonEmpty && ord.lteq(peek.get._1, cur)) {
      val (k, v) = iter.next
      val sk = skFn(v)
      lastSeen.put(sk, (k, v))
      catchUp(cur, skFn, iter, lastSeen)
    }
  }
}
Example 2
Source File: Merge.scala From flint with Apache License 2.0
package com.twosigma.flint.rdd.function.join

import com.twosigma.flint.rdd.{ PeekableIterator, PartitionsIterator, MergeIterator, RangeSplit }
import org.apache.spark.NarrowDependency
import com.twosigma.flint.rdd._

import scala.reflect.ClassTag

protected[flint] object Merge {

  def apply[K: Ordering: ClassTag, V: ClassTag](
    left: OrderedRDD[K, V],
    right: OrderedRDD[K, V]
  ): OrderedRDD[K, V] = ++(left, right).mapValues {
    case (_, Left(v)) => v
    case (_, Right(v)) => v
  }

  def ++[K: ClassTag, V: ClassTag, V2: ClassTag](
    left: OrderedRDD[K, V],
    right: OrderedRDD[K, V2]
  )(
    implicit ord: Ordering[K]
  ): OrderedRDD[K, Either[V, V2]] = {
    // A map from new partition to a RangeMergeJoin.
    val partToMergeJoin = RangeMergeJoin.mergeSplits(left.rangeSplits, right.rangeSplits).zipWithIndex.map {
      case (mergeJoin, idx) => (OrderedRDDPartition(idx), mergeJoin)
    }.toMap

    // A map from partition index to a RangeMergeJoin.
    val partitionIndexToMergeJoin = partToMergeJoin.map { case (p, m) => (p.index, m) }

    val leftDep = new NarrowDependency(left) {
      override def getParents(partitionId: Int) =
        partitionIndexToMergeJoin(partitionId).left.map(_.partition.index)
    }
    val rightDep = new NarrowDependency(right) {
      override def getParents(partitionId: Int) =
        partitionIndexToMergeJoin(partitionId).right.map(_.partition.index)
    }

    val mergedSplits = partToMergeJoin.map {
      case (p, mergeJoin) => RangeSplit(p, mergeJoin.range)
    }.toArray

    new OrderedRDD[K, Either[V, V2]](left.sc, mergedSplits, Seq(leftDep, rightDep))(
      (part, context) => {
        val mergedJoin = partitionIndexToMergeJoin(part.index)
        // Select rows from both RDDs whose key belongs to this RangeMergeJoin's range
        val leftParts = mergedJoin.left.map(_.partition)
        val leftIter = PeekableIterator(PartitionsIterator(left, leftParts, context).filter {
          case (k, _) => mergedJoin.range.contains(k)
        })
        val rightParts = mergedJoin.right.map(_.partition)
        val rightIter = PeekableIterator(PartitionsIterator(right, rightParts, context).filter {
          case (k, _) => mergedJoin.range.contains(k)
        })
        // Perform an ordered merge of the selected rows.
        MergeIterator(leftIter, rightIter)
      }
    )
  }
}
Example 3
Source File: ReorderedPartitionsRDD.scala From hail with MIT License
package is.hail.sparkextras

import is.hail.utils.FastSeq
import org.apache.spark.rdd.RDD
import org.apache.spark.{Dependency, NarrowDependency, Partition, TaskContext}

import scala.reflect.ClassTag

case class ReorderedPartitionsRDDPartition(index: Int, oldPartition: Partition) extends Partition

class ReorderedPartitionsRDD[T](@transient var prev: RDD[T], @transient val oldIndices: Array[Int])(implicit tct: ClassTag[T])
  extends RDD[T](prev.sparkContext, Nil) {

  override def getPartitions: Array[Partition] = {
    val parentPartitions = dependencies.head.rdd.asInstanceOf[RDD[T]].partitions
    Array.tabulate(oldIndices.length) { i =>
      val oldIndex = oldIndices(i)
      val oldPartition = parentPartitions(oldIndex)
      ReorderedPartitionsRDDPartition(i, oldPartition)
    }
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    val parent = dependencies.head.rdd.asInstanceOf[RDD[T]]
    parent.compute(split.asInstanceOf[ReorderedPartitionsRDDPartition].oldPartition, context)
  }

  override def getDependencies: Seq[Dependency[_]] = FastSeq(new NarrowDependency[T](prev) {
    override def getParents(partitionId: Int): Seq[Int] = FastSeq(oldIndices(partitionId))
  })

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }

  override def getPreferredLocations(partition: Partition): Seq[String] =
    prev.preferredLocations(partition.asInstanceOf[ReorderedPartitionsRDDPartition].oldPartition)
}
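A brief usage sketch for the class above, assuming the hail artifact (is.hail.sparkextras) is on the classpath; the local-mode setup and the permutation Array(2, 0, 1) are illustrative values, not taken from the project.

import is.hail.sparkextras.ReorderedPartitionsRDD
import org.apache.spark.{SparkConf, SparkContext}

object ReorderedPartitionsExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("reorder"))
    val base = sc.parallelize(0 until 12, numSlices = 3)
    // New partition 0 reads old partition 2, new partition 1 reads old 0, new partition 2 reads old 1.
    // The NarrowDependency above maps each new index to exactly one parent index, so no shuffle occurs.
    val reordered = new ReorderedPartitionsRDD(base, Array(2, 0, 1))
    println(reordered.glom().collect().map(_.mkString(",")).mkString(" | "))
    sc.stop()
  }
}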
Example 4
Source File: BlockedRDD.scala From hail with MIT License
package is.hail.sparkextras

import is.hail.utils._
import org.apache.spark.rdd.RDD
import org.apache.spark.{Dependency, NarrowDependency, Partition, TaskContext}

import scala.language.existentials
import scala.reflect.ClassTag

case class BlockedRDDPartition(@transient rdd: RDD[_],
  index: Int,
  first: Int,
  last: Int) extends Partition {
  require(first <= last)

  val parentPartitions: Array[Partition] = range.map(rdd.partitions).toArray

  def range: Range = first to last
}

class BlockedRDD[T](@transient var prev: RDD[T],
  @transient val partFirst: Array[Int],
  @transient val partLast: Array[Int]
)(implicit tct: ClassTag[T]) extends RDD[T](prev.sparkContext, Nil) {
  assert(partFirst.length == partLast.length)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](partFirst.length)(i =>
      BlockedRDDPartition(prev, i, partFirst(i), partLast(i)))
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    val parent = dependencies.head.rdd.asInstanceOf[RDD[T]]
    split.asInstanceOf[BlockedRDDPartition].parentPartitions.iterator.flatMap(p =>
      parent.iterator(p, context))
  }

  override def getDependencies: Seq[Dependency[_]] = {
    FastSeq(new NarrowDependency(prev) {
      def getParents(id: Int): Seq[Int] =
        partitions(id).asInstanceOf[BlockedRDDPartition].range
    })
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val prevPartitions = prev.partitions
    val range = partition.asInstanceOf[BlockedRDDPartition].range

    val locationAvail = range.flatMap(i =>
      prev.preferredLocations(prevPartitions(i)))
      .groupBy(identity)
      .mapValues(_.length)

    if (locationAvail.isEmpty)
      return FastSeq.empty[String]

    val m = locationAvail.values.max
    locationAvail.filter(_._2 == m)
      .keys
      .toFastSeq
  }
}
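A comparable sketch for BlockedRDD, again with an illustrative local setup: each output partition i spans parent partitions partFirst(i) through partLast(i), which is exactly the range the NarrowDependency above reports via getParents.

import is.hail.sparkextras.BlockedRDD
import org.apache.spark.{SparkConf, SparkContext}

object BlockedRDDExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("blocked"))
    val base = sc.parallelize(0 until 60, numSlices = 6)
    // Coalesce six parent partitions into three contiguous blocks: 0..1, 2..3 and 4..5.
    val blocked = new BlockedRDD(base, partFirst = Array(0, 2, 4), partLast = Array(1, 3, 5))
    println(blocked.getNumPartitions) // 3
    println(blocked.glom().collect().map(_.length).mkString(","))
    sc.stop()
  }
}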
Example 5
Source File: InsertRDD.scala From spark-vector with Apache License 2.0
package com.actian.spark_vector.datastream.writer

import scala.annotation.tailrec
import scala.reflect.ClassTag

import org.apache.spark.{ OneToOneDependency, NarrowDependency, Partition, TaskContext }
import org.apache.spark.rdd.RDD

import com.actian.spark_vector.datastream.{ DataStreamPartition, DataStreamPartitionAssignment, VectorEndpointConf }

// NOTE: this excerpt resumes inside the body of the InsertRDD[R] class. The class
// declaration itself, which binds `rdd`, `writeConf`, `getPreferredLocationsRec` and
// `partitionsPerDataStreamToPrint`, is not included in the snippet.

  private val endPointsToParentPartitionsMap = {
    val affinities = rdd.partitions.map(getPreferredLocationsRec(rdd, _))
    val ret = DataStreamPartitionAssignment(affinities, writeConf.vectorEndpoints)
    logDebug(s"Computed endPointsToParentPartitionsMap and got: ${
      (0 until ret.length).map {
        case idx =>
          val vals = ret(idx)
          s"Datastream $idx -> RDD partitions ${vals.length}: ${vals.take(partitionsPerDataStreamToPrint).mkString(",")} ${if (vals.length > partitionsPerDataStreamToPrint) "..." else ""}"
      }
    }")
    ret.map(_.map(rdd.partitions(_).index))
  }

  override protected def getPartitions =
    (0 until writeConf.size).map(idx => DataStreamPartition(idx, rdd, endPointsToParentPartitionsMap(idx))).toArray

  override protected def getPreferredLocations(split: Partition) = {
    logDebug(s"getPreferredLocations is called for partition ${split.index} and we are returning ${writeConf.vectorEndpoints(split.index).host}")
    Seq(writeConf.vectorEndpoints(split.index).host)
  }

  override def compute(split: Partition, taskContext: TaskContext): Iterator[R] =
    split.asInstanceOf[DataStreamPartition].parents.toIterator.flatMap(firstParent[R].iterator(_, taskContext))

  // Each output partition (one per datastream) depends only on the input partitions
  // assigned to that datastream, expressed through a NarrowDependency.
  override def getDependencies: Seq[NarrowDependency[_]] = Seq(new NarrowDependency(rdd) {
    def getParents(partitionId: Int) = endPointsToParentPartitionsMap(partitionId)
  })
}