org.apache.spark.sql.catalyst.expressions.JoinedRow Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.expressions.JoinedRow.
Each example is taken from an open-source project; the source file, project, and license are noted in the header above each listing.
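For orientation, here is a minimal stand-alone sketch (not taken from any of the projects below) of what JoinedRow does: it presents two InternalRows as a single row without copying their fields, and a single instance is normally reused across an iterator, so results must be copied before they are buffered.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.JoinedRow

object JoinedRowSketch {
  def main(args: Array[String]): Unit = {
    val left = InternalRow(1, 2L)   // two fields
    val right = InternalRow(3.0)    // one field

    // View the two rows as one 3-field row; no data is copied.
    val joined = new JoinedRow(left, right)
    assert(joined.numFields == 3)
    assert(joined.getInt(0) == 1 && joined.getDouble(2) == 3.0)

    // The examples below reuse one instance per partition via apply/withLeft/withRight.
    // Call copy() before buffering a result, since the backing rows change on each call.
    val reused = joined(left, InternalRow(7.0))
    val materialized = reused.copy()
    assert(materialized.getDouble(2) == 7.0)
  }
}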
Example 1
Source File: DeclarativeAggregateEvaluator.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection

case class DeclarativeAggregateEvaluator(function: DeclarativeAggregate, input: Seq[Attribute]) {

  lazy val initializer = GenerateSafeProjection.generate(function.initialValues)

  lazy val updater = GenerateSafeProjection.generate(
    function.updateExpressions,
    function.aggBufferAttributes ++ input)

  lazy val merger = GenerateSafeProjection.generate(
    function.mergeExpressions,
    function.aggBufferAttributes ++ function.inputAggBufferAttributes)

  lazy val evaluator = GenerateSafeProjection.generate(
    function.evaluateExpression :: Nil,
    function.aggBufferAttributes)

  def initialize(): InternalRow = initializer.apply(InternalRow.empty).copy()

  def update(values: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = values.foldLeft(initialize()) { (buffer, input) =>
      updater(joiner(buffer, input))
    }
    buffer.copy()
  }

  def merge(buffers: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = buffers.foldLeft(initialize()) { (left, right) =>
      merger(joiner(left, right))
    }
    buffer.copy()
  }

  def eval(buffer: InternalRow): InternalRow = evaluator(buffer).copy()
}
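As a usage note (not part of the original test helper): the evaluator is built from a DeclarativeAggregate plus its input attributes, and its update/merge paths each stitch the aggregation buffer and the incoming row together with a JoinedRow before running the generated projection. A rough sketch, assuming Spark 2.x internals and the built-in Sum aggregate; the attribute name and the expected result are illustrative only:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.expressions.aggregate.{DeclarativeAggregateEvaluator, Sum}
import org.apache.spark.sql.types.LongType

object SumEvaluatorSketch {
  def main(args: Array[String]): Unit = {
    val input = AttributeReference("input", LongType, nullable = true)()
    val evaluator = DeclarativeAggregateEvaluator(Sum(input), Seq(input))

    // update() joins the buffer and each input row with a JoinedRow
    // before applying the generated update projection.
    val partial1 = evaluator.update(InternalRow(1L), InternalRow(2L))  // partial sum 3L
    val partial2 = evaluator.update(InternalRow(3L))                   // partial sum 3L

    // merge() joins two buffers the same way; eval() projects the final value.
    val result = evaluator.eval(evaluator.merge(partial1, partial2))
    println(result)  // expected to print a single-field row holding 6L
  }
}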
Example 2
Source File: CDJSpark.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.SparkPlan

case class CDJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SparkPlan {
  override def outputPartitioning: Partitioning = left.outputPartitioning

  override def output: Seq[Attribute] = left.output ++ right.output

  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] =
    left.execute().cartesian(right.execute()).mapPartitions { iter =>
      val joinedRow = new JoinedRow
      iter.filter { row =>
        val point1 = ShapeUtils.getShape(left_key, left.output, row._1).asInstanceOf[Point]
        val point2 = ShapeUtils.getShape(right_key, right.output, row._2).asInstanceOf[Point]
        point1.minDist(point2) <= r
      }.map(row => joinedRow(row._1, row._2))
    }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 3
Source File: CKJSpark.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.SparkPlan

case class CKJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {
  override def outputPartitioning: Partitioning = left.outputPartitioning

  override def output: Seq[Attribute] = left.output ++ right.output

  final val k = l.value.asInstanceOf[Number].intValue()

  override protected def doExecute(): RDD[InternalRow] = {
    val left_rdd = left.execute()
    val right_rdd = right.execute()
    left_rdd.map(row =>
      (ShapeUtils.getShape(left_key, left.output, row).asInstanceOf[Point], row)
    ).cartesian(right_rdd).map { case (l: (Point, InternalRow), r: InternalRow) =>
      val tmp_point = ShapeUtils.getShape(right_key, right.output, r).asInstanceOf[Point]
      l._2 -> List((tmp_point.minDist(l._1), r))
    }.reduceByKey {
      case (l_list: Seq[(Double, InternalRow)], r_list: Seq[(Double, InternalRow)]) =>
        (l_list ++ r_list).sortWith(_._1 < _._1).take(k)
    }.flatMapValues(list => list).mapPartitions { iter =>
      val joinedRow = new JoinedRow
      iter.map(r => joinedRow(r._1, r._2._2))
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 4
Source File: RDJSpark.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.index.RTree
import org.apache.spark.sql.simba.partitioner.{MapDPartition, STRPartition}
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable

case class RDJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {
  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val sample_rate = simbaSessionState.simbaConf.sampleRate
  final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode
  final val transfer_threshold = simbaSessionState.simbaConf.transferThreshold
  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] = {
    val left_rdd = left.execute().map(row =>
      (ShapeUtils.getShape(left_key, left.output, row).asInstanceOf[Point], row)
    )

    val right_rdd = right.execute().map(row =>
      (ShapeUtils.getShape(right_key, right.output, row).asInstanceOf[Point], row)
    )

    val dimension = right_rdd.first()._1.coord.length

    val (left_partitioned, left_mbr_bound) = STRPartition(left_rdd, dimension, num_partitions,
      sample_rate, transfer_threshold, max_entries_per_node)

    val left_part_size = left_partitioned.mapPartitions { iter =>
      Array(iter.length).iterator
    }.collect()

    val left_rt = RTree(left_mbr_bound.zip(left_part_size).map(x => (x._1._1, x._1._2, x._2)),
      max_entries_per_node)
    val bc_rt = sparkContext.broadcast(left_rt)

    val right_dup = right_rdd.flatMap { x =>
      bc_rt.value.circleRange(x._1, r).map(now => (now._2, x))
    }

    val right_dup_partitioned = MapDPartition(right_dup, left_mbr_bound.length)

    left_partitioned.zipPartitions(right_dup_partitioned) { (leftIter, rightIter) =>
      val ans = mutable.ListBuffer[InternalRow]()
      val right_data = rightIter.map(_._2).toArray
      if (right_data.length > 0) {
        val right_index = RTree(right_data.map(_._1).zipWithIndex, max_entries_per_node)
        leftIter.foreach { now =>
          ans ++= right_index.circleRange(now._1, r)
            .map(x => new JoinedRow(now._2, right_data(x._2)._2))
        }
      }
      ans.iterator
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 5
Source File: DJSpark.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.index.RTree
import org.apache.spark.sql.simba.partitioner.{MapDPartition, STRPartition}
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable

case class DJSpark(left_key: Expression, right_key: Expression, l: Literal,
                   left: SparkPlan, right: SparkPlan) extends SimbaPlan {
  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val sample_rate = simbaSessionState.simbaConf.sampleRate
  final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode
  final val transfer_threshold = simbaSessionState.simbaConf.transferThreshold
  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] = {
    val left_rdd = left.execute().map(row =>
      (ShapeUtils.getShape(left_key, left.output, row).asInstanceOf[Point], row)
    )

    val right_rdd = right.execute().map(row =>
      (ShapeUtils.getShape(right_key, right.output, row).asInstanceOf[Point], row)
    )

    val dimension = right_rdd.first()._1.coord.length

    val (left_partitioned, left_mbr_bound) = STRPartition(left_rdd, dimension, num_partitions,
      sample_rate, transfer_threshold, max_entries_per_node)
    val (right_partitioned, right_mbr_bound) = STRPartition(right_rdd, dimension, num_partitions,
      sample_rate, transfer_threshold, max_entries_per_node)

    val right_rt = RTree(right_mbr_bound.zip(Array.fill[Int](right_mbr_bound.length)(0))
      .map(x => (x._1._1, x._1._2, x._2)), max_entries_per_node)

    val left_dup = new Array[Array[Int]](left_mbr_bound.length)
    val right_dup = new Array[Array[Int]](right_mbr_bound.length)

    var tot = 0
    left_mbr_bound.foreach { now =>
      val res = right_rt.circleRange(now._1, r)
      val tmp_arr = mutable.ArrayBuffer[Int]()
      res.foreach { x =>
        if (right_dup(x._2) == null) right_dup(x._2) = Array(tot)
        else right_dup(x._2) = right_dup(x._2) :+ tot
        tmp_arr += tot
        tot += 1
      }
      left_dup(now._2) = tmp_arr.toArray
    }

    val bc_left_dup = sparkContext.broadcast(left_dup)
    val bc_right_dup = sparkContext.broadcast(right_dup)

    val left_dup_rdd = left_partitioned.mapPartitionsWithIndex { (id, iter) =>
      iter.flatMap { now =>
        val tmp_list = bc_left_dup.value(id)
        if (tmp_list != null) tmp_list.map(x => (x, now))
        else Array[(Int, (Point, InternalRow))]()
      }
    }

    val right_dup_rdd = right_partitioned.mapPartitionsWithIndex { (id, iter) =>
      iter.flatMap { now =>
        val tmp_list = bc_right_dup.value(id)
        if (tmp_list != null) tmp_list.map(x => (x, now))
        else Array[(Int, (Point, InternalRow))]()
      }
    }

    val left_dup_partitioned = MapDPartition(left_dup_rdd, tot).map(_._2)
    val right_dup_partitioned = MapDPartition(right_dup_rdd, tot).map(_._2)

    left_dup_partitioned.zipPartitions(right_dup_partitioned) { (leftIter, rightIter) =>
      val ans = mutable.ListBuffer[InternalRow]()
      val right_data = rightIter.toArray
      if (right_data.nonEmpty) {
        val right_index = RTree(right_data.map(_._1).zipWithIndex, max_entries_per_node)
        leftIter.foreach { now =>
          ans ++= right_index.circleRange(now._1, r)
            .map(x => new JoinedRow(now._2, right_data(x._2)._2))
        }
      }
      ans.iterator
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 6
Source File: BDJSparkR.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.index.RTree
import org.apache.spark.sql.simba.partitioner.MapDPartition
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable
import scala.util.Random

case class BDJSparkR(left_key: Expression, right_key: Expression, l: Literal,
                     left: SparkPlan, right: SparkPlan) extends SimbaPlan {
  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val r = NumberUtil.literalToDouble(l)
  final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode

  override protected def doExecute(): RDD[InternalRow] = {
    val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _)))

    val tot_dup_rdd = tot_rdd.flatMap { x =>
      val rand_no = new Random().nextInt(num_partitions)
      var ans = mutable.ListBuffer[(Int, (Int, InternalRow))]()
      if (x._1 == 0) {
        val base = rand_no * num_partitions
        for (i <- 0 until num_partitions)
          ans += ((base + i, x))
      } else {
        for (i <- 0 until num_partitions)
          ans += ((i * num_partitions + rand_no, x))
      }
      ans
    }

    val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions)

    tot_dup_partitioned.mapPartitions { iter =>
      var left_data = mutable.ListBuffer[(Point, InternalRow)]()
      var right_data = mutable.ListBuffer[(Point, InternalRow)]()
      while (iter.hasNext) {
        val data = iter.next()
        if (data._2._1 == 0) {
          val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point]
          left_data += ((tmp_point, data._2._2))
        } else {
          val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point]
          right_data += ((tmp_point, data._2._2))
        }
      }

      val joined_ans = mutable.ListBuffer[InternalRow]()

      if (right_data.nonEmpty) {
        val right_rtree = RTree(right_data.map(_._1).zipWithIndex.toArray, max_entries_per_node)
        left_data.foreach(left =>
          right_rtree.circleRange(left._1, r)
            .foreach(x => joined_ans += new JoinedRow(left._2, right_data(x._2)._2)))
      }

      joined_ans.iterator
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 7
Source File: BKJSparkR.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.index.RTree
import org.apache.spark.sql.simba.partitioner.MapDPartition
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable
import scala.util.Random

case class BKJSparkR(left_key: Expression, right_key: Expression, l: Literal,
                     left: SparkPlan, right: SparkPlan) extends SimbaPlan {
  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode
  final val k = l.value.asInstanceOf[Number].intValue()

  private class DisOrdering extends Ordering[(InternalRow, Double)] {
    override def compare(x: (InternalRow, Double), y: (InternalRow, Double)): Int =
      -x._2.compare(y._2)
  }

  override protected def doExecute(): RDD[InternalRow] = {
    val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _)))

    val tot_dup_rdd = tot_rdd.flatMap { x =>
      val rand_no = new Random().nextInt(num_partitions)
      val ans = mutable.ListBuffer[(Int, (Int, InternalRow))]()
      if (x._1 == 0) {
        val base = rand_no * num_partitions
        for (i <- 0 until num_partitions)
          ans += ((base + i, x))
      } else {
        for (i <- 0 until num_partitions)
          ans += ((i * num_partitions + rand_no, x))
      }
      ans
    }

    val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions)

    tot_dup_partitioned.mapPartitions { iter =>
      var left_data = mutable.ListBuffer[(Point, InternalRow)]()
      var right_data = mutable.ListBuffer[(Point, InternalRow)]()
      while (iter.hasNext) {
        val data = iter.next()
        if (data._2._1 == 0) {
          val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point]
          left_data += ((tmp_point, data._2._2))
        } else {
          val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point]
          right_data += ((tmp_point, data._2._2))
        }
      }

      val joined_ans = mutable.ListBuffer[(InternalRow, Array[(InternalRow, Double)])]()

      if (right_data.nonEmpty) {
        val right_rtree = RTree(right_data.map(_._1).zipWithIndex.toArray, max_entries_per_node)
        left_data.foreach(left =>
          joined_ans += ((left._2, right_rtree.kNN(left._1, k, keepSame = false)
            .map(x => (right_data(x._2)._2, x._1.minDist(left._1))))))
      }

      joined_ans.iterator
    }.reduceByKey((left, right) => (left ++ right).sortWith(_._2 < _._2).take(k), num_partitions)
      .flatMap { now => now._2.map(x => new JoinedRow(now._1, x._1)) }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 8
Source File: BKJSpark.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.partitioner.MapDPartition
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.util.BoundedPriorityQueue

import scala.collection.mutable
import scala.util.Random

case class BKJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {
  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val k = l.value.asInstanceOf[Number].intValue()

  private class DisOrdering extends Ordering[(InternalRow, Double)] {
    override def compare(x: (InternalRow, Double), y: (InternalRow, Double)): Int =
      -x._2.compare(y._2)
  }

  override protected def doExecute(): RDD[InternalRow] = {
    val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _)))

    val tot_dup_rdd = tot_rdd.flatMap { x =>
      val rand_no = new Random().nextInt(num_partitions)
      val ans = mutable.ListBuffer[(Int, (Int, InternalRow))]()
      if (x._1 == 0) {
        val base = rand_no * num_partitions
        for (i <- 0 until num_partitions)
          ans += ((base + i, x))
      } else {
        for (i <- 0 until num_partitions)
          ans += ((i * num_partitions + rand_no, x))
      }
      ans
    }

    val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions)

    tot_dup_partitioned.mapPartitions { iter =>
      var left_data = mutable.ListBuffer[(Point, InternalRow)]()
      var right_data = mutable.ListBuffer[(Point, InternalRow)]()
      while (iter.hasNext) {
        val data = iter.next()
        if (data._2._1 == 0) {
          val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point]
          left_data += ((tmp_point, data._2._2))
        } else {
          val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point]
          right_data += ((tmp_point, data._2._2))
        }
      }

      val joined_ans = mutable.ListBuffer[(InternalRow, Array[(InternalRow, Double)])]()

      left_data.foreach(left => {
        var pq = new BoundedPriorityQueue[(InternalRow, Double)](k)(new DisOrdering)
        right_data.foreach(right => pq += ((right._2, right._1.minDist(left._1))))
        joined_ans += ((left._2, pq.toArray))
      })

      joined_ans.iterator
    }.reduceByKey((left, right) => (left ++ right).sortWith(_._2 < _._2).take(k), num_partitions)
      .flatMap { now => now._2.map(x => new JoinedRow(now._1, x._1)) }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 9
Source File: BDJSpark.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.partitioner.MapDPartition
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable
import scala.util.Random

case class BDJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {
  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] = {
    val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _)))

    val tot_dup_rdd = tot_rdd.flatMap { x =>
      val rand_no = new Random().nextInt(num_partitions)
      var ans = mutable.ListBuffer[(Int, (Int, InternalRow))]()
      if (x._1 == 0) {
        val base = rand_no * num_partitions
        for (i <- 0 until num_partitions)
          ans += ((base + i, x))
      } else {
        for (i <- 0 until num_partitions)
          ans += ((i * num_partitions + rand_no, x))
      }
      ans
    }

    val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions)

    tot_dup_partitioned.mapPartitions { iter =>
      var left_data = mutable.ListBuffer[(Point, InternalRow)]()
      var right_data = mutable.ListBuffer[(Point, InternalRow)]()
      while (iter.hasNext) {
        val data = iter.next()
        if (data._2._1 == 0) {
          val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point]
          left_data += ((tmp_point, data._2._2))
        } else {
          val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point]
          right_data += ((tmp_point, data._2._2))
        }
      }

      val joined_ans = mutable.ListBuffer[InternalRow]()

      left_data.foreach { left =>
        right_data.foreach { right =>
          if (left._1.minDist(right._1) <= r) {
            joined_ans += new JoinedRow(left._2, right._2)
          }
        }
      }

      joined_ans.iterator
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 10
Source File: CartesianProduct.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics

case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override private[sql] lazy val metrics = Map(
    "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"),
    "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"),
    "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numLeftRows = longMetric("numLeftRows")
    val numRightRows = longMetric("numRightRows")
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().map { row =>
      numLeftRows += 1
      row.copy()
    }
    val rightResults = right.execute().map { row =>
      numRightRows += 1
      row.copy()
    }

    leftResults.cartesian(rightResults).mapPartitionsInternal { iter =>
      val joinedRow = new JoinedRow
      iter.map { r =>
        numOutputRows += 1
        joinedRow(r._1, r._2)
      }
    }
  }
}
Example 11
Source File: CartesianProductExec.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, ExternalAppendOnlyUnsafeRowArray, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.util.CompletionIterator

class UnsafeCartesianRDD(
    left: RDD[UnsafeRow],
    right: RDD[UnsafeRow],
    numFieldsOfRight: Int,
    inMemoryBufferThreshold: Int,
    spillThreshold: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    val rowArray = new ExternalAppendOnlyUnsafeRowArray(inMemoryBufferThreshold, spillThreshold)

    val partition = split.asInstanceOf[CartesianPartition]
    rdd2.iterator(partition.s2, context).foreach(rowArray.add)

    // Create an iterator from rowArray
    def createIter(): Iterator[UnsafeRow] = rowArray.generateIterator()

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, rowArray.clear())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(
      leftResults,
      rightResults,
      right.output.size,
      sqlContext.conf.cartesianProductExecBufferInMemoryThreshold,
      sqlContext.conf.cartesianProductExecBufferSpillThreshold)
    pair.mapPartitionsWithIndexInternal { (index, iter) =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition = newPredicate(condition.get, left.output ++ right.output)
        boundCondition.initialize(index)
        val joined = new JoinedRow
        iter.filter { r =>
          boundCondition.eval(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}
Example 12
Source File: DeclarativeAggregateEvaluator.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection

case class DeclarativeAggregateEvaluator(function: DeclarativeAggregate, input: Seq[Attribute]) {

  lazy val initializer = GenerateSafeProjection.generate(function.initialValues)

  lazy val updater = GenerateSafeProjection.generate(
    function.updateExpressions,
    function.aggBufferAttributes ++ input)

  lazy val merger = GenerateSafeProjection.generate(
    function.mergeExpressions,
    function.aggBufferAttributes ++ function.inputAggBufferAttributes)

  lazy val evaluator = GenerateSafeProjection.generate(
    function.evaluateExpression :: Nil,
    function.aggBufferAttributes)

  def initialize(): InternalRow = initializer.apply(InternalRow.empty).copy()

  def update(values: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = values.foldLeft(initialize()) { (buffer, input) =>
      updater(joiner(buffer, input))
    }
    buffer.copy()
  }

  def merge(buffers: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = buffers.foldLeft(initialize()) { (left, right) =>
      merger(joiner(left, right))
    }
    buffer.copy()
  }

  def eval(buffer: InternalRow): InternalRow = evaluator(buffer).copy()
}
Example 13
Source File: CartesianProduct.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics

@DeveloperApi
case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override private[sql] lazy val metrics = Map(
    "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"),
    "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"),
    "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numLeftRows = longMetric("numLeftRows")
    val numRightRows = longMetric("numRightRows")
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().map { row =>
      numLeftRows += 1
      row.copy()
    }
    val rightResults = right.execute().map { row =>
      numRightRows += 1
      row.copy()
    }

    leftResults.cartesian(rightResults).mapPartitions { iter =>
      val joinedRow = new JoinedRow
      iter.map { r =>
        numOutputRows += 1
        joinedRow(r._1, r._2)
      }
    }
  }
}
Example 14
Source File: CartesianProduct.scala From iolap with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}

@DeveloperApi
case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  protected override def doExecute(): RDD[Row] = {
    val leftResults = left.execute().map(_.copy())
    val rightResults = right.execute().map(_.copy())

    leftResults.cartesian(rightResults).mapPartitions { iter =>
      val joinedRow = new JoinedRow
      iter.map(r => joinedRow(r._1, r._2))
    }
  }
}
Example 15
Source File: CartesianProductExec.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.hadoop.security.UserGroupInformation
import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter

class UnsafeCartesianRDD(left: RDD[UnsafeRow], right: RDD[UnsafeRow], numFieldsOfRight: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  private[this] val user = UserGroupInformation.getCurrentUser.getShortUserName

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    // We will not sort the rows, so prefixComparator and recordComparator are null.
    val sorter = UnsafeExternalSorter.create(
      context.taskMemoryManager(),
      SparkEnv.get(user).blockManager,
      SparkEnv.get(user).serializerManager,
      context,
      null,
      null,
      1024,
      SparkEnv.get(user).memoryManager.pageSizeBytes,
      SparkEnv.get(user).conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold",
        UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD),
      false)

    val partition = split.asInstanceOf[CartesianPartition]
    for (y <- rdd2.iterator(partition.s2, context)) {
      sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false)
    }

    // Create an iterator from the sorter and wrap it as Iterator[UnsafeRow]
    def createIter(): Iterator[UnsafeRow] = {
      val iter = sorter.getIterator
      val unsafeRow = new UnsafeRow(numFieldsOfRight)
      new Iterator[UnsafeRow] {
        override def hasNext: Boolean = {
          iter.hasNext
        }
        override def next(): UnsafeRow = {
          iter.loadNext()
          unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength)
          unsafeRow
        }
      }
    }

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, sorter.cleanupResources())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size)
    pair.mapPartitionsWithIndexInternal { (index, iter) =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition = newPredicate(condition.get, left.output ++ right.output)
        boundCondition.initialize(index)
        val joined = new JoinedRow
        iter.filter { r =>
          boundCondition.eval(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}
Example 16
Source File: DeclarativeAggregateEvaluator.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection

case class DeclarativeAggregateEvaluator(function: DeclarativeAggregate, input: Seq[Attribute]) {

  lazy val initializer = GenerateSafeProjection.generate(function.initialValues)

  lazy val updater = GenerateSafeProjection.generate(
    function.updateExpressions,
    function.aggBufferAttributes ++ input)

  lazy val merger = GenerateSafeProjection.generate(
    function.mergeExpressions,
    function.aggBufferAttributes ++ function.inputAggBufferAttributes)

  lazy val evaluator = GenerateSafeProjection.generate(
    function.evaluateExpression :: Nil,
    function.aggBufferAttributes)

  def initialize(): InternalRow = initializer.apply(InternalRow.empty).copy()

  def update(values: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = values.foldLeft(initialize()) { (buffer, input) =>
      updater(joiner(buffer, input))
    }
    buffer.copy()
  }

  def merge(buffers: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = buffers.foldLeft(initialize()) { (left, right) =>
      merger(joiner(left, right))
    }
    buffer.copy()
  }

  def eval(buffer: InternalRow): InternalRow = evaluator(buffer).copy()
}
Example 17
Source File: CartesianProductExec.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter

class UnsafeCartesianRDD(left: RDD[UnsafeRow], right: RDD[UnsafeRow], numFieldsOfRight: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    // We will not sort the rows, so prefixComparator and recordComparator are null.
    val sorter = UnsafeExternalSorter.create(
      context.taskMemoryManager(),
      SparkEnv.get.blockManager,
      SparkEnv.get.serializerManager,
      context,
      null,
      null,
      1024,
      SparkEnv.get.memoryManager.pageSizeBytes,
      SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold",
        UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD),
      false)

    val partition = split.asInstanceOf[CartesianPartition]
    for (y <- rdd2.iterator(partition.s2, context)) {
      sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false)
    }

    // Create an iterator from the sorter and wrap it as Iterator[UnsafeRow]
    def createIter(): Iterator[UnsafeRow] = {
      val iter = sorter.getIterator
      val unsafeRow = new UnsafeRow(numFieldsOfRight)
      new Iterator[UnsafeRow] {
        override def hasNext: Boolean = {
          iter.hasNext
        }
        override def next(): UnsafeRow = {
          iter.loadNext()
          unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength)
          unsafeRow
        }
      }
    }

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, sorter.cleanupResources())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size)
    pair.mapPartitionsWithIndexInternal { (index, iter) =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition = newPredicate(condition.get, left.output ++ right.output)
        boundCondition.initialize(index)
        val joined = new JoinedRow
        iter.filter { r =>
          boundCondition.eval(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}
Example 18
Source File: DeclarativeAggregateEvaluator.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection

case class DeclarativeAggregateEvaluator(function: DeclarativeAggregate, input: Seq[Attribute]) {

  lazy val initializer = GenerateSafeProjection.generate(function.initialValues)

  lazy val updater = GenerateSafeProjection.generate(
    function.updateExpressions,
    function.aggBufferAttributes ++ input)

  lazy val merger = GenerateSafeProjection.generate(
    function.mergeExpressions,
    function.aggBufferAttributes ++ function.inputAggBufferAttributes)

  lazy val evaluator = GenerateSafeProjection.generate(
    function.evaluateExpression :: Nil,
    function.aggBufferAttributes)

  def initialize(): InternalRow = initializer.apply(InternalRow.empty).copy()

  def update(values: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = values.foldLeft(initialize()) { (buffer, input) =>
      updater(joiner(buffer, input))
    }
    buffer.copy()
  }

  def merge(buffers: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = buffers.foldLeft(initialize()) { (left, right) =>
      merger(joiner(left, right))
    }
    buffer.copy()
  }

  def eval(buffer: InternalRow): InternalRow = evaluator(buffer).copy()
}
Example 19
Source File: CartesianProductExec.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, ExternalAppendOnlyUnsafeRowArray, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.util.CompletionIterator

class UnsafeCartesianRDD(
    left: RDD[UnsafeRow],
    right: RDD[UnsafeRow],
    numFieldsOfRight: Int,
    inMemoryBufferThreshold: Int,
    spillThreshold: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    val rowArray = new ExternalAppendOnlyUnsafeRowArray(inMemoryBufferThreshold, spillThreshold)

    val partition = split.asInstanceOf[CartesianPartition]
    rdd2.iterator(partition.s2, context).foreach(rowArray.add)

    // Create an iterator from rowArray
    def createIter(): Iterator[UnsafeRow] = rowArray.generateIterator()

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, rowArray.clear())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(
      leftResults,
      rightResults,
      right.output.size,
      sqlContext.conf.cartesianProductExecBufferInMemoryThreshold,
      sqlContext.conf.cartesianProductExecBufferSpillThreshold)
    pair.mapPartitionsWithIndexInternal { (index, iter) =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition = newPredicate(condition.get, left.output ++ right.output)
        boundCondition.initialize(index)
        val joined = new JoinedRow
        iter.filter { r =>
          boundCondition.eval(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}
Example 20
Source File: DeclarativeAggregateEvaluator.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection

case class DeclarativeAggregateEvaluator(function: DeclarativeAggregate, input: Seq[Attribute]) {

  lazy val initializer = GenerateSafeProjection.generate(function.initialValues)

  lazy val updater = GenerateSafeProjection.generate(
    function.updateExpressions,
    function.aggBufferAttributes ++ input)

  lazy val merger = GenerateSafeProjection.generate(
    function.mergeExpressions,
    function.aggBufferAttributes ++ function.inputAggBufferAttributes)

  lazy val evaluator = GenerateSafeProjection.generate(
    function.evaluateExpression :: Nil,
    function.aggBufferAttributes)

  def initialize(): InternalRow = initializer.apply(InternalRow.empty).copy()

  def update(values: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = values.foldLeft(initialize()) { (buffer, input) =>
      updater(joiner(buffer, input))
    }
    buffer.copy()
  }

  def merge(buffers: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = buffers.foldLeft(initialize()) { (left, right) =>
      merger(joiner(left, right))
    }
    buffer.copy()
  }

  def eval(buffer: InternalRow): InternalRow = evaluator(buffer).copy()
}
Example 21
Source File: CartesianProductExec.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter

class UnsafeCartesianRDD(left: RDD[UnsafeRow], right: RDD[UnsafeRow], numFieldsOfRight: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    // We will not sort the rows, so prefixComparator and recordComparator are null.
    val sorter = UnsafeExternalSorter.create(
      context.taskMemoryManager(),
      SparkEnv.get.blockManager,
      SparkEnv.get.serializerManager,
      context,
      null,
      null,
      1024,
      SparkEnv.get.memoryManager.pageSizeBytes,
      SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold",
        UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD),
      false)

    val partition = split.asInstanceOf[CartesianPartition]
    for (y <- rdd2.iterator(partition.s2, context)) {
      sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false)
    }

    // Create an iterator from the sorter and wrap it as Iterator[UnsafeRow]
    def createIter(): Iterator[UnsafeRow] = {
      val iter = sorter.getIterator
      val unsafeRow = new UnsafeRow(numFieldsOfRight)
      new Iterator[UnsafeRow] {
        override def hasNext: Boolean = {
          iter.hasNext
        }
        override def next(): UnsafeRow = {
          iter.loadNext()
          unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength)
          unsafeRow
        }
      }
    }

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, sorter.cleanupResources())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size)
    pair.mapPartitionsInternal { iter =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition: (InternalRow) => Boolean =
          newPredicate(condition.get, left.output ++ right.output)
        val joined = new JoinedRow
        iter.filter { r =>
          boundCondition(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}