org.apache.spark.util.collection.CompactBuffer Scala Examples
The following examples show how to use org.apache.spark.util.collection.CompactBuffer. CompactBuffer is an append-only buffer similar to ArrayBuffer but memory-efficient for small collections: it keeps its first two elements in fields of the object itself and only allocates a backing array once a third element is added, which matters when grouping values by key produces many small per-key buffers.
Example 1
Source File: VOrderedRDDFunctions.scala, from spark-vlbfgs (Apache License 2.0)
package org.apache.spark.rdd

import org.apache.spark.Partitioner
import org.apache.spark.internal.Logging
import org.apache.spark.util.collection.CompactBuffer

import scala.reflect.ClassTag

class VOrderedRDDFunctions[K, V](self: RDD[(K, V)])
    (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K])
  extends Logging with Serializable {

  // Groups values by key using shuffle-and-sort rather than a hash map:
  // after repartitionAndSortWithinPartitions, all records with the same key
  // are adjacent, so each group can be collected into a CompactBuffer in a
  // single forward pass.
  def groupByKeyUsingSort(partitioner: Partitioner): RDD[(K, Iterable[V])] = {
    self.repartitionAndSortWithinPartitions(partitioner)
      .mapPartitions { (iter: Iterator[(K, V)]) =>
        new Iterator[(K, CompactBuffer[V])] {
          // Holds the first record of the next group, read while scanning
          // for the end of the current one.
          private var firstElemInNextGroup: (K, V) = null

          override def hasNext: Boolean = firstElemInNextGroup != null || iter.hasNext

          override def next(): (K, CompactBuffer[V]) = {
            if (firstElemInNextGroup == null) {
              firstElemInNextGroup = iter.next()
            }
            val key = firstElemInNextGroup._1
            val group = CompactBuffer[V](firstElemInNextGroup._2)
            firstElemInNextGroup = null
            var reachNewGroup = false
            while (iter.hasNext && !reachNewGroup) {
              val currElem = iter.next()
              if (currElem._1 == key) {
                group += currElem._2
              } else {
                firstElemInNextGroup = currElem
                reachNewGroup = true
              }
            }
            (key, group)
          }
        }
      }
  }
}

private[spark] object VOrderedRDDFunctions {

  implicit def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)])
      (implicit ord: Ordering[K]): VOrderedRDDFunctions[K, V] = {
    new VOrderedRDDFunctions(rdd)
  }
}
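A possible way to call this from driver code, sketched under assumptions: the SparkContext setup and sample data are invented, and because the companion object holding the implicit conversion is private[spark], the wrapper is constructed explicitly here instead of being imported.

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.VOrderedRDDFunctions

object GroupBySortDemo {
  def main(args: Array[String]): Unit = {
    // Hypothetical local setup; any SparkContext would do.
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("demo"))
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3), ("b", 4)))

    // Each group arrives as a contiguous run after the shuffle-and-sort,
    // so no per-key hash map is built on the reduce side.
    val grouped = new VOrderedRDDFunctions(pairs).groupByKeyUsingSort(new HashPartitioner(2))
    grouped.collect().foreach { case (k, vs) => println(s"$k -> ${vs.mkString(", ")}") }

    sc.stop()
  }
}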
Example 2
Source File: HashJoin.scala, from iolap (Apache License 2.0)
package org.apache.spark.sql.execution.joins

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.util.collection.CompactBuffer

trait HashJoin {
  self: SparkPlan =>

  val leftKeys: Seq[Expression]
  val rightKeys: Seq[Expression]
  val buildSide: BuildSide
  val left: SparkPlan
  val right: SparkPlan

  // The build side is hashed into a HashedRelation; the streamed side is
  // probed against it one row at a time.
  protected lazy val (buildPlan, streamedPlan) = buildSide match {
    case BuildLeft => (left, right)
    case BuildRight => (right, left)
  }

  protected lazy val (buildKeys, streamedKeys) = buildSide match {
    case BuildLeft => (leftKeys, rightKeys)
    case BuildRight => (rightKeys, leftKeys)
  }

  override def output: Seq[Attribute] = left.output ++ right.output

  @transient protected lazy val buildSideKeyGenerator: Projection =
    newProjection(buildKeys, buildPlan.output)

  @transient protected lazy val streamSideKeyGenerator: () => MutableProjection =
    newMutableProjection(streamedKeys, streamedPlan.output)

  protected def hashJoin(
      streamIter: Iterator[Row],
      hashedRelation: HashedRelation): Iterator[Row] = {
    new Iterator[Row] {
      private[this] var currentStreamedRow: Row = _
      private[this] var currentHashMatches: CompactBuffer[Row] = _
      private[this] var currentMatchPosition: Int = -1

      // Mutable per row objects.
      private[this] val joinRow = new JoinedRow2

      private[this] val joinKeys = streamSideKeyGenerator()

      // Either there is another match for the current streamed row, or another
      // streamed row with at least one match can be fetched.
      override final def hasNext: Boolean =
        (currentMatchPosition != -1 && currentMatchPosition < currentHashMatches.size) ||
          (streamIter.hasNext && fetchNext())

      override final def next(): Row = {
        val ret = buildSide match {
          case BuildRight => joinRow(currentStreamedRow, currentHashMatches(currentMatchPosition))
          case BuildLeft => joinRow(currentHashMatches(currentMatchPosition), currentStreamedRow)
        }
        currentMatchPosition += 1
        ret
      }

      // Advances the streamed side until a row with a non-null join key hits
      // the hashed relation; returns false once the stream is drained.
      private final def fetchNext(): Boolean = {
        currentHashMatches = null
        currentMatchPosition = -1

        while (currentHashMatches == null && streamIter.hasNext) {
          currentStreamedRow = streamIter.next()
          if (!joinKeys(currentStreamedRow).anyNull) {
            currentHashMatches = hashedRelation.get(joinKeys.currentValue)
          }
        }

        if (currentHashMatches == null) {
          false
        } else {
          currentMatchPosition = 0
          true
        }
      }
    }
  }
}
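To see the build/probe pattern without Spark's internals, here is a deliberately simplified, self-contained sketch: standard mutable collections stand in for HashedRelation and CompactBuffer, and all names are invented. It is an illustration of the technique, not Spark's actual implementation.

import scala.collection.mutable

object HashJoinSketch {

  // Build phase: hash every row of the (smaller) build side by its key,
  // keeping all rows that share a key, as HashedRelation does.
  def buildHashedRelation[K, R](build: Iterator[R])(key: R => K)
      : mutable.Map[K, mutable.ArrayBuffer[R]] = {
    val relation = mutable.Map.empty[K, mutable.ArrayBuffer[R]]
    build.foreach { row =>
      relation.getOrElseUpdate(key(row), mutable.ArrayBuffer.empty[R]) += row
    }
    relation
  }

  // Probe phase: stream the (larger) side and emit one output pair per match.
  def hashJoin[K, L, R](stream: Iterator[L],
      hashed: mutable.Map[K, mutable.ArrayBuffer[R]])(key: L => K): Iterator[(L, R)] = {
    stream.flatMap { l =>
      hashed.get(key(l)) match {
        case Some(matches) => matches.iterator.map(r => (l, r))
        case None          => Iterator.empty
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val hashed = buildHashedRelation(Iterator(("a", 1), ("b", 2), ("a", 3)))(_._1)
    val joined = hashJoin(Iterator(("a", "x"), ("c", "y"), ("b", "z")), hashed)(_._1)
    joined.foreach(println) // ((a,x),(a,1)), ((a,x),(a,3)), ((b,z),(b,2))
  }
}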
Example 3
Source File: HashedRelationSuite.scala, from iolap (Apache License 2.0)
package org.apache.spark.sql.execution.joins

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.{Projection, Row}
import org.apache.spark.util.collection.CompactBuffer

class HashedRelationSuite extends SparkFunSuite {

  // Key is simply the record itself
  private val keyProjection = new Projection {
    override def apply(row: Row): Row = row
  }

  test("GeneralHashedRelation") {
    val data = Array(Row(0), Row(1), Row(2), Row(2))
    val hashed = HashedRelation(data.iterator, keyProjection)
    assert(hashed.isInstanceOf[GeneralHashedRelation])

    assert(hashed.get(data(0)) == CompactBuffer[Row](data(0)))
    assert(hashed.get(data(1)) == CompactBuffer[Row](data(1)))
    assert(hashed.get(Row(10)) === null)

    val data2 = CompactBuffer[Row](data(2))
    data2 += data(2)
    assert(hashed.get(data(2)) == data2)
  }

  test("UniqueKeyHashedRelation") {
    val data = Array(Row(0), Row(1), Row(2))
    val hashed = HashedRelation(data.iterator, keyProjection)
    assert(hashed.isInstanceOf[UniqueKeyHashedRelation])

    assert(hashed.get(data(0)) == CompactBuffer[Row](data(0)))
    assert(hashed.get(data(1)) == CompactBuffer[Row](data(1)))
    assert(hashed.get(data(2)) == CompactBuffer[Row](data(2)))
    assert(hashed.get(Row(10)) === null)

    val uniqHashed = hashed.asInstanceOf[UniqueKeyHashedRelation]
    assert(uniqHashed.getValue(data(0)) == data(0))
    assert(uniqHashed.getValue(data(1)) == data(1))
    assert(uniqHashed.getValue(data(2)) == data(2))
    assert(uniqHashed.getValue(Row(10)) == null)
  }
}
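As the isInstanceOf assertions show, HashedRelation picks its concrete implementation from the build-side data: when every key occurs exactly once it produces a UniqueKeyHashedRelation, which adds a getValue shortcut returning the single matching row, and otherwise a GeneralHashedRelation, whose get returns a CompactBuffer holding every row for the key (two copies of Row(2) in the first test).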