org.apache.spark.sql.catalyst.plans.JoinType Scala Examples

The following examples show how to use org.apache.spark.sql.catalyst.plans.JoinType. They are taken from open-source projects; the source file, project, and license are noted above each listing.
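Before the project examples, here is a minimal, self-contained sketch (not taken from any project below) of how JoinType values are typically obtained and pattern matched; it assumes only the standard Catalyst case objects and the JoinType("...") string parser.

import org.apache.spark.sql.catalyst.plans.{FullOuter, Inner, JoinType, LeftOuter, RightOuter}

// The case objects (Inner, LeftOuter, RightOuter, FullOuter, ...) are matched directly,
// while JoinType("left_outer") parses a SQL-style join name into one of them.
def describe(joinType: JoinType): String = joinType match {
  case Inner      => "keep only matching rows"
  case LeftOuter  => "keep all rows from the left side"
  case RightOuter => "keep all rows from the right side"
  case FullOuter  => "keep all rows from both sides"
  case other      => s"other join type: $other"
}

describe(JoinType("full_outer"))  // "keep all rows from both sides"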
Example 1
Source File: basicOperators.scala    From carbondata    with Apache License 2.0
package org.apache.carbondata.mv.plans.modular

import org.apache.spark.sql.catalyst.expressions.{Attribute, _}
import org.apache.spark.sql.catalyst.plans.JoinType

import org.apache.carbondata.mv.plans._
import org.apache.carbondata.mv.plans.modular.Flags._

trait Matchable extends ModularPlan {
  def outputList: Seq[NamedExpression]

  def predicateList: Seq[Expression]
}

case class GroupBy(
    outputList: Seq[NamedExpression],
    inputList: Seq[Expression],
    predicateList: Seq[Expression],
    alias: Option[String],
    child: ModularPlan,
    flags: FlagSet,
    flagSpec: Seq[Seq[Any]],
    modularPlan: Option[ModularPlan] = None) extends UnaryNode with Matchable {
  override def output: Seq[Attribute] = outputList.map(_.toAttribute)

  override def makeCopy(newArgs: Array[AnyRef]): GroupBy = {
    val groupBy = super.makeCopy(newArgs).asInstanceOf[GroupBy]
    if (rewritten) groupBy.setRewritten()
    groupBy
  }
}

case class Select(
    outputList: Seq[NamedExpression],
    inputList: Seq[Expression],
    predicateList: Seq[Expression],
    aliasMap: Map[Int, String],
    joinEdges: Seq[JoinEdge],
    children: Seq[ModularPlan],
    flags: FlagSet,
    flagSpec: Seq[Seq[Any]],
    windowSpec: Seq[Seq[Any]],
    modularPlan: Option[ModularPlan] = None) extends ModularPlan with Matchable {
  override def output: Seq[Attribute] = outputList.map(_.toAttribute)

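  // Group the join edges by their left relation index: left -> Seq((right, joinType)).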
  override def adjacencyList: scala.collection.immutable.Map[Int, Seq[(Int, JoinType)]] = {
    joinEdges.groupBy { _.left }.map { case (k, v) => (k, v.map(e => (e.right, e.joinType))) }
  }

  override def extractJoinConditions(
      left: ModularPlan, right: ModularPlan): Seq[Expression] = {
    predicateList.filter(p => p.references.intersect(left.outputSet).nonEmpty &&
                              p.references.intersect(right.outputSet).nonEmpty &&
                              p.references.subsetOf(left.outputSet ++ right.outputSet))
  }

  override def extractRightEvaluableConditions(
      left: ModularPlan, right: ModularPlan): Seq[Expression] = {
    predicateList.filter(p => p.references.subsetOf(left.outputSet ++ right.outputSet) &&
                              p.references.intersect(right.outputSet).nonEmpty)
  }

  override def extractEvaluableConditions(plan: ModularPlan): Seq[Expression] = {
    predicateList.filter(p => canEvaluate(p, plan))
  }

  override def makeCopy(newArgs: Array[AnyRef]): Select = {
    val select = super.makeCopy(newArgs).asInstanceOf[Select]
    if (rewritten) select.setRewritten()
    select
  }
}

case class Union(children: Seq[ModularPlan], flags: FlagSet, flagSpec: Seq[Seq[Any]])
  extends ModularPlan {
  override def output: Seq[Attribute] = children.head.output
}

case object OneRowTable extends LeafNode {
  override def output: Seq[Attribute] = Nil
} 
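The adjacencyList method above is where JoinType appears in this example. The following standalone sketch shows the same groupBy pattern with plain (left, right, joinType) tuples instead of the project's JoinEdge class, so it can be run on its own:

import org.apache.spark.sql.catalyst.plans.{Inner, JoinType, LeftOuter}

// Hypothetical edges: (left child index, right child index, join type).
val edges: Seq[(Int, Int, JoinType)] = Seq((0, 1, Inner), (0, 2, LeftOuter), (1, 2, Inner))

// Same shape as Select.adjacencyList: left index -> neighbors with their join types.
val adjacency: Map[Int, Seq[(Int, JoinType)]] =
  edges.groupBy(_._1).map { case (left, es) => (left, es.map(e => (e._2, e._3))) }

// adjacency == Map(0 -> Seq((1, Inner), (2, LeftOuter)), 1 -> Seq((2, Inner)))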
Example 2
Source File: ShuffledHashOuterJoin.scala    From spark1.52    with Apache License 2.0
package org.apache.spark.sql.execution.joins

import scala.collection.JavaConversions._

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics


@DeveloperApi
case class ShuffledHashOuterJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    joinType: JoinType,
    condition: Option[Expression],
    left: SparkPlan,
    right: SparkPlan) extends BinaryNode with HashOuterJoin {

  override private[sql] lazy val metrics = Map(
    "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"),
    "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"),
    "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows"))

  override def requiredChildDistribution: Seq[Distribution] =
    ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil

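  // A one-sided outer join keeps every row of the preserved side, so that side's
  // partitioning carries over; a full outer join cannot guarantee either side's partitioning.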
  override def outputPartitioning: Partitioning = joinType match {
    case LeftOuter => left.outputPartitioning
    case RightOuter => right.outputPartitioning
    case FullOuter => UnknownPartitioning(left.outputPartitioning.numPartitions)
    case x =>
      throw new IllegalArgumentException(s"HashOuterJoin should not take $x as the JoinType")
  }

  protected override def doExecute(): RDD[InternalRow] = {
    val numLeftRows = longMetric("numLeftRows")
    val numRightRows = longMetric("numRightRows")
    val numOutputRows = longMetric("numOutputRows")

    val joinedRow = new JoinedRow()
    left.execute().zipPartitions(right.execute()) { (leftIter, rightIter) =>
      // TODO this probably can be replaced by external sort (sort merged join?)
      joinType match {
        case LeftOuter =>
          val hashed = HashedRelation(rightIter, numRightRows, buildKeyGenerator)
          val keyGenerator = streamedKeyGenerator
          val resultProj = resultProjection
          leftIter.flatMap( currentRow => {
            numLeftRows += 1
            val rowKey = keyGenerator(currentRow)
            joinedRow.withLeft(currentRow)
            leftOuterIterator(rowKey, joinedRow, hashed.get(rowKey), resultProj, numOutputRows)
          })

        case RightOuter =>
          val hashed = HashedRelation(leftIter, numLeftRows, buildKeyGenerator)
          val keyGenerator = streamedKeyGenerator
          val resultProj = resultProjection
          rightIter.flatMap ( currentRow => {
            numRightRows += 1
            val rowKey = keyGenerator(currentRow)
            joinedRow.withRight(currentRow)
            rightOuterIterator(rowKey, hashed.get(rowKey), joinedRow, resultProj, numOutputRows)
          })

        case FullOuter =>
          // TODO(davies): use UnsafeRow
          val leftHashTable =
            buildHashTable(leftIter, numLeftRows, newProjection(leftKeys, left.output))
          val rightHashTable =
            buildHashTable(rightIter, numRightRows, newProjection(rightKeys, right.output))
          (leftHashTable.keySet ++ rightHashTable.keySet).iterator.flatMap { key =>
            fullOuterIterator(key,
              leftHashTable.getOrElse(key, EMPTY_LIST),
              rightHashTable.getOrElse(key, EMPTY_LIST),
              joinedRow,
              numOutputRows)
          }

        case x =>
          throw new IllegalArgumentException(
            s"ShuffledHashOuterJoin should not take $x as the JoinType")
      }
    }
  }
} 
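The three branches of doExecute correspond to the left, right, and full outer joins that Spark 1.5.x could plan as a ShuffledHashOuterJoin. A minimal DataFrame-level sketch that requests those join types is shown below; the SQLContext, table names, and columns are illustrative, not part of the example above.

// Assumes a Spark 1.5.x SQLContext named sqlContext and two hypothetical tables.
val orders = sqlContext.table("orders")
val customers = sqlContext.table("customers")

orders.join(customers, orders("customerId") === customers("id"), "left_outer")
orders.join(customers, orders("customerId") === customers("id"), "right_outer")
orders.join(customers, orders("customerId") === customers("id"), "full_outer")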
Example 3
Source File: StarryHashJoinExec.scala    From starry    with Apache License 2.0
package org.apache.spark.sql.execution.joins

import com.github.passionke.starry.SparkPlanExecutor
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.JoinType
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}


case class StarryHashJoinExec(
                               leftKeys: Seq[Expression],
                               rightKeys: Seq[Expression],
                               joinType: JoinType,
                               buildSide: BuildSide,
                               condition: Option[Expression],
                               left: SparkPlan,
                               right: SparkPlan)
  extends BinaryExecNode with HashJoin {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
    "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe"))

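  // Materialize the build side's rows with SparkPlanExecutor, build one HashedRelation
  // from them, then stream every partition of the other side through that relation.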
  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    val avgHashProbe = longMetric("avgHashProbe")
    val rows = SparkPlanExecutor.doExec(buildPlan)
    val hashed = HashedRelation(rows.iterator, buildKeys, rows.length, null)
    streamedPlan.execute().mapPartitions { streamedIter =>
      join(streamedIter, hashed, numOutputRows, avgHashProbe)
    }
  }
}
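The buildSide parameter decides which child is hashed and which is streamed. A minimal sketch of that dispatch is shown below; the BuildLeft/BuildRight case objects are the ones used by Spark 2.x's hash join operators, while the helper name itself is hypothetical.

import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide}

// Hypothetical helper: returns (buildPlan, streamedPlan) for a given build side.
def splitChildren(buildSide: BuildSide, left: SparkPlan, right: SparkPlan): (SparkPlan, SparkPlan) =
  buildSide match {
    case BuildLeft  => (left, right)   // hash the left child, stream the right
    case BuildRight => (right, left)   // hash the right child, stream the left
  }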