org.apache.spark.sql.execution.joins.BuildRight Scala Examples
The following examples show how to use org.apache.spark.sql.execution.joins.BuildRight.
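BuildRight is one of the two case objects (alongside BuildLeft) implementing the sealed trait BuildSide: a physical join operator stores a BuildSide and pattern-matches on it to decide which child is hashed into an in-memory table (the "build" side) and which child is iterated against it (the "streamed" side). Note that these types live in org.apache.spark.sql.execution.joins only through Spark 2.4; in Spark 3.0 they moved to org.apache.spark.sql.catalyst.optimizer. As a warm-up, here is a minimal sketch, not taken from any of the projects below, of the pattern every example uses:

import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide}

object BuildSideDemo {
  // Split a join's children into (build, streamed) according to the chosen side.
  // BuildRight means the right child is hashed into an in-memory table and the
  // left child is streamed past it, row by row.
  def splitChildren(
      buildSide: BuildSide,
      left: SparkPlan,
      right: SparkPlan): (SparkPlan, SparkPlan) = buildSide match {
    case BuildRight => (right, left)
    case BuildLeft => (left, right)
  }
}

The same split recurs in every example below: an operator keeps a BuildSide and derives its buildPlan/streamedPlan (or buildNode/streamedNode) from it.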
Example 1
Source File: ColumnarShuffledHashJoinExec.scala From OAP with Apache License 2.0
package com.intel.sparkColumnarPlugin.execution

import java.util.concurrent.TimeUnit._

import com.intel.sparkColumnarPlugin.vectorized._
import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.execution.{BinaryExecNode, CodegenSupport, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics

import scala.collection.JavaConverters._

import org.apache.spark.internal.Logging
import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}

import scala.collection.mutable.ListBuffer

import org.apache.arrow.vector.ipc.message.ArrowFieldNode
import org.apache.arrow.vector.ipc.message.ArrowRecordBatch
import org.apache.arrow.vector.types.pojo.ArrowType
import org.apache.arrow.vector.types.pojo.Field
import org.apache.arrow.vector.types.pojo.Schema
import org.apache.arrow.gandiva.expression._
import org.apache.arrow.gandiva.evaluator._

import io.netty.buffer.ArrowBuf
import com.google.common.collect.Lists

import com.intel.sparkColumnarPlugin.expression._
import com.intel.sparkColumnarPlugin.vectorized.ExpressionEvaluator
import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec
import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide}

class ColumnarShuffledHashJoinExec(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    joinType: JoinType,
    buildSide: BuildSide,
    condition: Option[Expression],
    left: SparkPlan,
    right: SparkPlan)
  extends ShuffledHashJoinExec(
    leftKeys,
    rightKeys,
    joinType,
    buildSide,
    condition,
    left,
    right) {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
    "joinTime" -> SQLMetrics.createTimingMetric(sparkContext, "join time"),
    "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map"))

  override def supportsColumnar = true

  // TODO() Disable code generation
  // override def supportCodegen: Boolean = false

  override def doExecuteColumnar(): RDD[ColumnarBatch] = {
    val numOutputRows = longMetric("numOutputRows")
    val joinTime = longMetric("joinTime")
    val buildTime = longMetric("buildTime")
    val resultSchema = this.schema

    streamedPlan.executeColumnar().zipPartitions(buildPlan.executeColumnar()) {
      (streamIter, buildIter) =>
        // val hashed = buildHashedRelation(buildIter)
        // join(streamIter, hashed, numOutputRows)
        val vjoin = ColumnarShuffledHashJoin.create(
          leftKeys, rightKeys, resultSchema, joinType, buildSide, condition,
          left, right, buildTime, joinTime, numOutputRows)
        val vjoinResult = vjoin.columnarInnerJoin(streamIter, buildIter)
        TaskContext.get().addTaskCompletionListener[Unit](_ => {
          vjoin.close()
        })
        new CloseableColumnBatchIterator(vjoinResult)
    }
  }
}
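One detail of Example 1 is worth isolating: the operator registers a task-completion listener so its native join resources are released even if the downstream consumer never exhausts the result iterator. A minimal sketch of that cleanup pattern follows (hypothetical helper; it assumes it is called from code running inside a Spark task, where TaskContext.get() is non-null):

import org.apache.spark.TaskContext

object TaskCleanup {
  // Tie an off-heap or native resource to the lifetime of the current task:
  // the listener fires on task success, failure, or cancellation.
  def closeOnTaskCompletion(resource: AutoCloseable): Unit = {
    TaskContext.get().addTaskCompletionListener[Unit] { _ =>
      resource.close()
    }
  }
}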
Example 2
Source File: OTBLeftSemiHashJoin.scala From iolap with Apache License 2.0
package org.apache.spark.sql.hive.online.joins

import java.util.{HashSet => JHashSet}

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId}

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent._
import scala.concurrent.duration._

case class OTBLeftSemiHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    left: SparkPlan,
    right: SparkPlan)(
    @transient val controller: OnlineDataFrame,
    @transient val trace: List[Int] = -1 :: Nil,
    opId: OpId = OpId.newOpId)
  extends BinaryNode with HashJoin with OTStateful {

  override val buildSide = BuildRight

  override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning

  override def requiredChildDistribution =
    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

  override def output = left.output

  @transient private[this] lazy val keyGenerator: () => MutableProjection =
    newMutableProjection(buildKeys, buildPlan.output)

  val timeout = {
    val timeoutValue = sqlContext.conf.broadcastTimeout
    if (timeoutValue < 0) {
      Duration.Inf
    } else {
      timeoutValue.seconds
    }
  }

  @transient
  private lazy val broadcastFuture = future {
    prevBatch match {
      case None =>
        // Note that we use .execute().collect() because we don't want to convert data to Scala types
        val input: Array[Row] = buildPlan.execute()
          .mapPartitions(HashedSet(_, keyGenerator())).collect()
        val hashed = HashedSet(input.iterator)
        val broadcast = sparkContext.broadcast(hashed)
        controller.broadcasts((opId, currentBatch)) = broadcast
        broadcast
      case Some(bId) =>
        controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]]
    }
  }

  override def doExecute() = {
    val broadcastRelation: Broadcast[JHashSet[Row]] = Await.result(broadcastFuture, timeout)

    streamedPlan.execute().mapPartitions { streamIter =>
      val hashSet = broadcastRelation.value
      val joinKeys = streamSideKeyGenerator()
      streamIter.filter(current => {
        !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue)
      })
    }
  }

  override protected final def otherCopyArgs = controller :: trace :: opId :: Nil

  override def simpleString = s"${super.simpleString} $opId"

  override def newBatch(newTrace: List[Int]): SparkPlan = {
    val join = OTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId)
    join.broadcastFuture
    join
  }
}
Example 3
Source File: MTBLeftSemiHashJoin.scala From iolap with Apache License 2.0
package org.apache.spark.sql.hive.online.joins

import java.util.{HashSet => JHashSet}

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId}

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent._
import scala.concurrent.duration._

case class MTBLeftSemiHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    left: SparkPlan,
    right: SparkPlan)(
    @transient val controller: OnlineDataFrame,
    @transient val trace: List[Int] = -1 :: Nil,
    opId: OpId = OpId.newOpId)
  extends BinaryNode with HashJoin with OTStateful {

  override val buildSide = BuildRight

  override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning

  override def requiredChildDistribution =
    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

  override def output = left.output

  @transient private[this] lazy val keyGenerator: () => MutableProjection =
    newMutableProjection(buildKeys, buildPlan.output)

  val timeout = {
    val timeoutValue = sqlContext.conf.broadcastTimeout
    if (timeoutValue < 0) {
      Duration.Inf
    } else {
      timeoutValue.seconds
    }
  }

  val watcher = controller.getWatcher

  @transient
  private lazy val broadcastFuture = future {
    // Note that we use .execute().collect() because we don't want to convert data to Scala types
    val input: Array[Row] = buildPlan.execute()
      .mapPartitions(HashedSet(_, keyGenerator())).collect()
    prevBatch match {
      case None =>
        val hashed = HashedSet(input.iterator)
        val broadcast = sparkContext.broadcast(hashed)
        controller.broadcasts((opId, currentBatch)) = broadcast
        broadcast
      case Some(bId) =>
        // TODO: fix this integrity error by supporting join whose both branches may grow
        val hashed = HashedSet(input.iterator)
        val previous = controller.broadcasts((opId, bId)).value.asInstanceOf[JHashSet[Row]]
        if (!previous.containsAll(hashed)) {
          watcher += -1
          logError(s"Integrity Error in MTBLeftSemiHashJoin(Op $opId, Batch $currentBatch)")
        }
        controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]]
    }
  }

  override def doExecute() = {
    val broadcastRelation = Await.result(broadcastFuture, timeout)

    streamedPlan.execute().mapPartitions { streamIter =>
      val hashSet = broadcastRelation.value
      val joinKeys = streamSideKeyGenerator()
      streamIter.filter(current => {
        !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue)
      })
    }
  }

  override protected final def otherCopyArgs = controller :: trace :: opId :: Nil

  override def simpleString = s"${super.simpleString} $opId"

  override def newBatch(newTrace: List[Int]): SparkPlan = {
    val join = MTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId)
    join.broadcastFuture
    join
  }
}
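Examples 2 and 3 implement the same left semi join recipe: with buildSide = BuildRight, the right child is collected into a hash set of join keys and broadcast, and the streamed (left) rows are filtered to those whose non-null keys appear in the set. The following toy sketch (a hypothetical helper on plain Scala collections, no Spark types) isolates that recipe:

object LeftSemiSketch {
  // Left semi join as set membership: keep left rows whose key occurs on the
  // right at least once; columns from the right side are never emitted.
  def leftSemiJoin[A, K](left: Seq[A], right: Seq[A], key: A => K): Seq[A] = {
    val buildKeys: Set[K] = right.map(key).toSet // "build" side: hash the right child
    left.filter(row => buildKeys.contains(key(row))) // "streamed" side: filter the left child
  }

  // leftSemiJoin(Seq("apple", "berry"), Seq("avocado"), (s: String) => s.head)
  // keeps only "apple", whose first letter also occurs on the right.
}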
Example 4
Source File: StarryJoinLocalStrategy.scala From starry with Apache License 2.0
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.RowOrdering
import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, StarryHashJoinExec, StarryNestedLoopJoinExec}
import org.apache.spark.sql.internal.SQLConf

// The strategy's class declaration was lost in extraction; restored here (as an
// assumption) so the snippet compiles: the methods below need a SQLConf.
case class StarryJoinLocalStrategy(conf: SQLConf) extends Strategy {

  private def canRunInLocalMemory(plan: LogicalPlan) = {
    plan.stats.sizeInBytes >= 0 &&
      plan.stats.sizeInBytes <= conf.getConfString("spark.sql.maxLocalMemoryJoin", "10485760").toLong
  }

  private def canBuildRight(joinType: JoinType): Boolean = joinType match {
    case _: InnerLike | LeftOuter | LeftSemi | LeftAnti | _: ExistenceJoin => true
    case _ => false
  }

  private def canBuildLeft(joinType: JoinType): Boolean = joinType match {
    case _: InnerLike | RightOuter => true
    case _ => false
  }

  def decideBuildSide(joinType: JoinType, left: LogicalPlan, right: LogicalPlan) = {
    val buildLeft = canBuildLeft(joinType) && canRunInLocalMemory(left)
    val buildRight = canBuildRight(joinType) && canRunInLocalMemory(right)

    def smallerSide =
      if (right.stats.sizeInBytes <= left.stats.sizeInBytes) BuildRight else BuildLeft

    if (buildRight && buildLeft) {
      smallerSide
    } else if (buildRight) {
      BuildRight
    } else if (buildLeft) {
      BuildLeft
    } else {
      smallerSide
    }
  }

  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right) =>
      val buildSide = decideBuildSide(joinType, left, right)
      Seq(StarryHashJoinExec(
        leftKeys, rightKeys, joinType, buildSide, condition, planLater(left), planLater(right)))

    // --- SortMergeJoin ------------------------------------------------------------
    case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right)
      if RowOrdering.isOrderable(leftKeys) =>
      joins.SortMergeJoinExec(
        leftKeys, rightKeys, joinType, condition, planLater(left), planLater(right)) :: Nil

    // --- Without joining keys ------------------------------------------------------------

    // Pick BroadcastNestedLoopJoin if one side could be broadcast
    case j @ logical.Join(left, right, joinType, condition) =>
      val buildSide = decideBuildSide(joinType, left, right)
      StarryNestedLoopJoinExec(
        planLater(left), planLater(right), buildSide, joinType, condition) :: Nil

    // Pick CartesianProduct for InnerJoin
    case logical.Join(left, right, _: InnerLike, condition) =>
      joins.CartesianProductExec(planLater(left), planLater(right), condition) :: Nil

    case _ => Nil
  }
}
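The decideBuildSide helper above encodes a heuristic worth calling out: when both sides qualify, hash the smaller side so the in-memory table stays small, and fall back to the smaller side even when neither qualifies. Here is a standalone sketch of that decision (hypothetical chooseBuildSide, with sizes passed in directly rather than read from LogicalPlan statistics):

import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide}

object BuildSideHeuristic {
  def chooseBuildSide(
      leftBytes: BigInt,
      rightBytes: BigInt,
      canBuildLeft: Boolean,
      canBuildRight: Boolean): BuildSide = {
    // Ties go to the right child, matching decideBuildSide above.
    val smallerSide = if (rightBytes <= leftBytes) BuildRight else BuildLeft
    (canBuildLeft, canBuildRight) match {
      case (true, true)   => smallerSide // both fit: hash the smaller one
      case (false, true)  => BuildRight
      case (true, false)  => BuildLeft
      case (false, false) => smallerSide // last resort, as in the original
    }
  }
}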
Example 5
Source File: ShuffleHashJoin.scala From BigDatalog with Apache License 2.0
package edu.ucla.cs.wis.bigdatalog.spark.execution

import edu.ucla.cs.wis.bigdatalog.spark.BigDatalogContext

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning, PartitioningCollection}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, HashJoin, HashedRelation}
import org.apache.spark.sql.execution.metric.SQLMetrics

case class ShuffleHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    buildSide: BuildSide,
    left: SparkPlan,
    right: SparkPlan)
  extends BinaryNode with HashJoin {

  @transient
  final protected val bigDatalogContext =
    SQLContext.getActive().getOrElse(null).asInstanceOf[BigDatalogContext]

  val cacheBuildSide =
    bigDatalogContext.getConf.getBoolean("spark.datalog.shufflehashjoin.cachebuildside", true)

  override lazy val metrics = Map(
    "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"),
    "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"),
    "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows"))

  var cachedBuildPlan: RDD[HashedRelation] = null

  override def output: Seq[Attribute] = left.output ++ right.output

  override def outputPartitioning: Partitioning =
    PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning))

  override def requiredChildDistribution: Seq[ClusteredDistribution] =
    ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil

  override def outputsUnsafeRows: Boolean = true
  override def canProcessUnsafeRows: Boolean = true
  override def canProcessSafeRows: Boolean = false

  protected override def doExecute(): RDD[InternalRow] = {
    val numStreamedRows = buildSide match {
      case BuildLeft => longMetric("numRightRows")
      case BuildRight => longMetric("numLeftRows")
    }
    val numOutputRows = longMetric("numOutputRows")

    if (cacheBuildSide) {
      if (cachedBuildPlan == null) {
        cachedBuildPlan = buildPlan.execute()
          .mapPartitionsInternal(iter =>
            Iterator(HashedRelation(iter, SQLMetrics.nullLongMetric, buildSideKeyGenerator)))
          .persist()
      }
      cachedBuildPlan.zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) =>
        hashJoin(streamedIter, numStreamedRows, buildIter.next(), numOutputRows)
      }
    } else {
      buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) =>
        val hashedRelation =
          HashedRelation(buildIter, SQLMetrics.nullLongMetric, buildSideKeyGenerator)
        hashJoin(streamedIter, numStreamedRows, hashedRelation, numOutputRows)
      }
    }
  }
}
Example 6
Source File: BinaryHashJoinNode.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution.local

import org.apache.spark.sql.SQLConf
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.joins.{HashedRelation, BuildLeft, BuildRight, BuildSide}

case class BinaryHashJoinNode(
    conf: SQLConf,
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    buildSide: BuildSide,
    left: LocalNode,
    right: LocalNode)
  extends BinaryLocalNode(conf) with HashJoinNode {

  protected override val (streamedNode, streamedKeys) = buildSide match {
    case BuildLeft => (right, rightKeys)
    case BuildRight => (left, leftKeys)
  }

  private val (buildNode, buildKeys) = buildSide match {
    case BuildLeft => (left, leftKeys)
    case BuildRight => (right, rightKeys)
  }

  override def output: Seq[Attribute] = left.output ++ right.output

  private def buildSideKeyGenerator: Projection = {
    // We are expecting the data types of buildKeys and streamedKeys are the same.
    assert(buildKeys.map(_.dataType) == streamedKeys.map(_.dataType))
    UnsafeProjection.create(buildKeys, buildNode.output)
  }

  protected override def doOpen(): Unit = {
    buildNode.open()
    val hashedRelation = HashedRelation(buildNode, buildSideKeyGenerator)
    // We have built the HashedRelation. So, close buildNode.
    buildNode.close()

    streamedNode.open()
    // Set the HashedRelation used by the HashJoinNode.
    withHashedRelation(hashedRelation)
  }

  override def close(): Unit = {
    // Please note that we do not need to call the close method of our buildNode because
    // it has been called in this.open.
    streamedNode.close()
  }
}
Example 7
Source File: BroadcastHashJoinNode.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution.local

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SQLConf
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, HashedRelation}

case class BroadcastHashJoinNode(
    conf: SQLConf,
    streamedKeys: Seq[Expression],
    streamedNode: LocalNode,
    buildSide: BuildSide,
    buildOutput: Seq[Attribute],
    hashedRelation: Broadcast[HashedRelation])
  extends UnaryLocalNode(conf) with HashJoinNode {

  override val child = streamedNode

  // Because we do not pass in the buildNode, we take the output of buildNode to
  // create the inputSet properly.
  override def inputSet: AttributeSet = AttributeSet(child.output ++ buildOutput)

  override def output: Seq[Attribute] = buildSide match {
    case BuildRight => streamedNode.output ++ buildOutput
    case BuildLeft => buildOutput ++ streamedNode.output
  }

  protected override def doOpen(): Unit = {
    streamedNode.open()
    // Set the HashedRelation used by the HashJoinNode.
    withHashedRelation(hashedRelation.value)
  }

  override def close(): Unit = {
    streamedNode.close()
  }
}