org.apache.spark.sql.catalyst.plans.JoinType Scala Examples

The following examples show how to use org.apache.spark.sql.catalyst.plans.JoinType. They are taken from open-source projects; the source file, project, and license are noted above each listing.
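Before the project examples, here is a minimal, self-contained sketch (not taken from any project below) of how JoinType values are typically obtained and pattern matched; it assumes only the standard Catalyst case objects and the JoinType("...") string parser.

import org.apache.spark.sql.catalyst.plans.{FullOuter, Inner, JoinType, LeftOuter, RightOuter}

// The case objects (Inner, LeftOuter, RightOuter, FullOuter, ...) are matched directly,
// while JoinType("left_outer") parses a SQL-style join name into one of them.
def describe(joinType: JoinType): String = joinType match {
  case Inner      => "keep only matching rows"
  case LeftOuter  => "keep all rows from the left side"
  case RightOuter => "keep all rows from the right side"
  case FullOuter  => "keep all rows from both sides"
  case other      => s"other join type: $other"
}

describe(JoinType("full_outer"))  // "keep all rows from both sides"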
Example 1
Source File: basicOperators.scala    From carbondata    with Apache License 2.0
package org.apache.carbondata.mv.plans.modular

import org.apache.spark.sql.catalyst.expressions.{Attribute, _}
import org.apache.spark.sql.catalyst.plans.JoinType

import org.apache.carbondata.mv.plans._
import org.apache.carbondata.mv.plans.modular.Flags._

trait Matchable extends ModularPlan {
  def outputList: Seq[NamedExpression]

  def predicateList: Seq[Expression]
}

case class GroupBy(
    outputList: Seq[NamedExpression],
    inputList: Seq[Expression],
    predicateList: Seq[Expression],
    alias: Option[String],
    child: ModularPlan,
    flags: FlagSet,
    flagSpec: Seq[Seq[Any]],
    modularPlan: Option[ModularPlan] = None) extends UnaryNode with Matchable {
  override def output: Seq[Attribute] = outputList.map(_.toAttribute)

  override def makeCopy(newArgs: Array[AnyRef]): GroupBy = {
    val groupBy = super.makeCopy(newArgs).asInstanceOf[GroupBy]
    if (rewritten) groupBy.setRewritten()
    groupBy
  }
}

case class Select(
    outputList: Seq[NamedExpression],
    inputList: Seq[Expression],
    predicateList: Seq[Expression],
    aliasMap: Map[Int, String],
    joinEdges: Seq[JoinEdge],
    children: Seq[ModularPlan],
    flags: FlagSet,
    flagSpec: Seq[Seq[Any]],
    windowSpec: Seq[Seq[Any]],
    modularPlan: Option[ModularPlan] = None) extends ModularPlan with Matchable {
  override def output: Seq[Attribute] = outputList.map(_.toAttribute)

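  // Group the join edges by their left relation index: left -> Seq((right, joinType)).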
  override def adjacencyList: scala.collection.immutable.Map[Int, Seq[(Int, JoinType)]] = {
    joinEdges.groupBy { _.left }.map { case (k, v) => (k, v.map(e => (e.right, e.joinType))) }
  }

  override def extractJoinConditions(
      left: ModularPlan, right: ModularPlan): Seq[Expression] = {
    predicateList.filter(p => p.references.intersect(left.outputSet).nonEmpty &&
                              p.references.intersect(right.outputSet).nonEmpty &&
                              p.references.subsetOf(left.outputSet ++ right.outputSet))
  }

  override def extractRightEvaluableConditions(
      left: ModularPlan, right: ModularPlan): Seq[Expression] = {
    predicateList.filter(p => p.references.subsetOf(left.outputSet ++ right.outputSet) &&
                              p.references.intersect(right.outputSet).nonEmpty)
  }

  override def extractEvaluableConditions(plan: ModularPlan): Seq[Expression] = {
    predicateList.filter(p => canEvaluate(p, plan))
  }

  override def makeCopy(newArgs: Array[AnyRef]): Select = {
    val select = super.makeCopy(newArgs).asInstanceOf[Select]
    if (rewritten) select.setRewritten()
    select
  }
}

case class Union(children: Seq[ModularPlan], flags: FlagSet, flagSpec: Seq[Seq[Any]])
  extends ModularPlan {
  override def output: Seq[Attribute] = children.head.output
}

case object OneRowTable extends LeafNode {
  override def output: Seq[Attribute] = Nil
} 
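The adjacencyList method above is where JoinType appears in this example. The following standalone sketch shows the same groupBy pattern with plain (left, right, joinType) tuples instead of the project's JoinEdge class, so it can be run on its own:

import org.apache.spark.sql.catalyst.plans.{Inner, JoinType, LeftOuter}

// Hypothetical edges: (left child index, right child index, join type).
val edges: Seq[(Int, Int, JoinType)] = Seq((0, 1, Inner), (0, 2, LeftOuter), (1, 2, Inner))

// Same shape as Select.adjacencyList: left index -> neighbors with their join types.
val adjacency: Map[Int, Seq[(Int, JoinType)]] =
  edges.groupBy(_._1).map { case (left, es) => (left, es.map(e => (e._2, e._3))) }

// adjacency == Map(0 -> Seq((1, Inner), (2, LeftOuter)), 1 -> Seq((2, Inner)))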
Example 2
Source File: ShuffledHashOuterJoin.scala    From spark1.52    with Apache License 2.0
package org.apache.spark.sql.execution.joins

import scala.collection.JavaConversions._

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics


@DeveloperApi
case class ShuffledHashOuterJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    joinType: JoinType,
    condition: Option[Expression],
    left: SparkPlan,
    right: SparkPlan) extends BinaryNode with HashOuterJoin {

  override private[sql] lazy val metrics = Map(
    "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"),
    "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"),
    "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows"))

  override def requiredChildDistribution: Seq[Distribution] =
    ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil

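  // A one-sided outer join keeps every row of the preserved side, so that side's
  // partitioning carries over; a full outer join cannot guarantee either side's partitioning.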
  override def outputPartitioning: Partitioning = joinType match {
    case LeftOuter => left.outputPartitioning
    case RightOuter => right.outputPartitioning
    case FullOuter => UnknownPartitioning(left.outputPartitioning.numPartitions)
    case x =>
      throw new IllegalArgumentException(s"HashOuterJoin should not take $x as the JoinType")
  }

  protected override def doExecute(): RDD[InternalRow] = {
    val numLeftRows = longMetric("numLeftRows")
    val numRightRows = longMetric("numRightRows")
    val numOutputRows = longMetric("numOutputRows")

    val joinedRow = new JoinedRow()
    left.execute().zipPartitions(right.execute()) { (leftIter, rightIter) =>
      // TODO this probably can be replaced by external sort (sort merged join?)
      joinType match {
        case LeftOuter =>
          val hashed = HashedRelation(rightIter, numRightRows, buildKeyGenerator)
          val keyGenerator = streamedKeyGenerator
          val resultProj = resultProjection
          leftIter.flatMap( currentRow => {
            numLeftRows += 1
            val rowKey = keyGenerator(currentRow)
            joinedRow.withLeft(currentRow)
            leftOuterIterator(rowKey, joinedRow, hashed.get(rowKey), resultProj, numOutputRows)
          })

        case RightOuter =>
          val hashed = HashedRelation(leftIter, numLeftRows, buildKeyGenerator)
          val keyGenerator = streamedKeyGenerator
          val resultProj = resultProjection
          rightIter.flatMap ( currentRow => {
            numRightRows += 1
            val rowKey = keyGenerator(currentRow)
            joinedRow.withRight(currentRow)
            rightOuterIterator(rowKey, hashed.get(rowKey), joinedRow, resultProj, numOutputRows)
          })

        case FullOuter =>
          // TODO(davies): use UnsafeRow
          val leftHashTable =
            buildHashTable(leftIter, numLeftRows, newProjection(leftKeys, left.output))
          val rightHashTable =
            buildHashTable(rightIter, numRightRows, newProjection(rightKeys, right.output))
          (leftHashTable.keySet ++ rightHashTable.keySet).iterator.flatMap { key =>
            fullOuterIterator(key,
              leftHashTable.getOrElse(key, EMPTY_LIST),
              rightHashTable.getOrElse(key, EMPTY_LIST),
              joinedRow,
              numOutputRows)
          }

        case x =>
          throw new IllegalArgumentException(
            s"ShuffledHashOuterJoin should not take $x as the JoinType")
      }
    }
  }
} 
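The three branches of doExecute correspond to the left, right, and full outer joins that Spark 1.5.x could plan as a ShuffledHashOuterJoin. A minimal DataFrame-level sketch that requests those join types is shown below; the SQLContext, table names, and columns are illustrative, not part of the example above.

// Assumes a Spark 1.5.x SQLContext named sqlContext and two hypothetical tables.
val orders = sqlContext.table("orders")
val customers = sqlContext.table("customers")

orders.join(customers, orders("customerId") === customers("id"), "left_outer")
orders.join(customers, orders("customerId") === customers("id"), "right_outer")
orders.join(customers, orders("customerId") === customers("id"), "full_outer")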
Example 3
Source File: StarryHashJoinExec.scala    From starry    with Apache License 2.0
package org.apache.spark.sql.execution.joins

import com.github.passionke.starry.SparkPlanExecutor
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.JoinType
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}


case class StarryHashJoinExec(
                               leftKeys: Seq[Expression],
                               rightKeys: Seq[Expression],
                               joinType: JoinType,
                               buildSide: BuildSide,
                               condition: Option[Expression],
                               left: SparkPlan,
                               right: SparkPlan)
  extends BinaryExecNode with HashJoin {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
    "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe"))

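  // Materialize the build side's rows with SparkPlanExecutor, build one HashedRelation
  // from them, then stream every partition of the other side through that relation.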
  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    val avgHashProbe = longMetric("avgHashProbe")
    val rows = SparkPlanExecutor.doExec(buildPlan)
    val hashed = HashedRelation(rows.iterator, buildKeys, rows.length, null)
    streamedPlan.execute().mapPartitions { streamedIter =>
      join(streamedIter, hashed, numOutputRows, avgHashProbe)
    }
  }
}
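The buildSide parameter decides which child is hashed and which is streamed. A minimal sketch of that dispatch is shown below; the BuildLeft/BuildRight case objects are the ones used by Spark 2.x's hash join operators, while the helper name itself is hypothetical.

import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide}

// Hypothetical helper: returns (buildPlan, streamedPlan) for a given build side.
def splitChildren(buildSide: BuildSide, left: SparkPlan, right: SparkPlan): (SparkPlan, SparkPlan) =
  buildSide match {
    case BuildLeft  => (left, right)   // hash the left child, stream the right
    case BuildRight => (right, left)   // hash the right child, stream the left
  }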