org.apache.spark.sql.execution.SparkPlan Scala Examples
The following examples show how to use org.apache.spark.sql.execution.SparkPlan.
Each example lists the source file, the project it comes from, and its license.
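Before stepping through the examples, it helps to see the minimal contract a physical operator has to satisfy: declare its output attributes, its children, and a doExecute() that returns an RDD[InternalRow]. The sketch below is illustrative only; the operator and field names are made up and it is not taken from any of the projects listed here.

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.LeafExecNode

// Hypothetical leaf operator that replays a pre-computed RDD of rows.
case class PrecomputedScanExec(output: Seq[Attribute], rows: RDD[InternalRow]) extends LeafExecNode {
  protected override def doExecute(): RDD[InternalRow] = {
    rows.mapPartitions { iter =>
      // Downstream operators generally expect UnsafeRows, so project each row defensively.
      val toUnsafe = UnsafeProjection.create(schema)
      iter.map(toUnsafe)
    }
  }
}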
Example 1
Source File: FilterExec.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.execution

import org.apache.spark.sql.simba.expression._
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Literal, PredicateHelper}
import org.apache.spark.sql.catalyst.expressions.{SortOrder, And => SQLAnd, Not => SQLNot, Or => SQLOr}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.SparkPlan

case class FilterExec(condition: Expression, child: SparkPlan) extends SimbaPlan with PredicateHelper {
  override def output: Seq[Attribute] = child.output

  private class DistanceOrdering(point: Expression, target: Point) extends Ordering[InternalRow] {
    override def compare(x: InternalRow, y: InternalRow): Int = {
      val shape_x = ShapeUtils.getShape(point, child.output, x)
      val shape_y = ShapeUtils.getShape(point, child.output, y)
      val dis_x = target.minDist(shape_x)
      val dis_y = target.minDist(shape_y)
      dis_x.compare(dis_y)
    }
  }

  // TODO change target partition from 1 to some good value
  // Note that target here must be an point literal in WHERE clause,
  // hence we can consider it as Point safely
  def knn(rdd: RDD[InternalRow], point: Expression, target: Point, k: Int): RDD[InternalRow] =
    sparkContext.parallelize(rdd.map(_.copy()).takeOrdered(k)(new DistanceOrdering(point, target)), 1)

  def applyCondition(rdd: RDD[InternalRow], condition: Expression): RDD[InternalRow] = {
    condition match {
      case InKNN(point, target, k) =>
        val _target = target.asInstanceOf[Literal].value.asInstanceOf[Point]
        knn(rdd, point, _target, k.value.asInstanceOf[Number].intValue())
      case now@And(left, right) =>
        if (!now.hasKNN) rdd.mapPartitions { iter => iter.filter(newPredicate(condition, child.output).eval(_)) }
        else applyCondition(rdd, left).map(_.copy()).intersection(applyCondition(rdd, right).map(_.copy()))
      case now@Or(left, right) =>
        if (!now.hasKNN) rdd.mapPartitions { iter => iter.filter(newPredicate(condition, child.output).eval(_)) }
        else applyCondition(rdd, left).map(_.copy()).union(applyCondition(rdd, right).map(_.copy())).distinct()
      case now@Not(c) =>
        if (!now.hasKNN) rdd.mapPartitions { iter => iter.filter(newPredicate(condition, child.output).eval(_)) }
        else rdd.map(_.copy()).subtract(applyCondition(rdd, c).map(_.copy()))
      case _ =>
        rdd.mapPartitions(iter => iter.filter(newPredicate(condition, child.output).eval(_)))
    }
  }

  protected def doExecute(): RDD[InternalRow] = {
    val root_rdd = child.execute()
    condition transformUp {
      case SQLAnd(left, right) => And(left, right)
      case SQLOr(left, right) => Or(left, right)
      case SQLNot(c) => Not(c)
    }
    applyCondition(root_rdd, condition)
  }

  override def outputOrdering: Seq[SortOrder] = child.outputOrdering

  override def children: Seq[SparkPlan] = child :: Nil

  override def outputPartitioning: Partitioning = child.outputPartitioning
}
Example 2
Source File: HashMapIndexedRelation.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.index

import org.apache.spark.sql.simba.partitioner.HashPartition
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.NumericType
import org.apache.spark.storage.StorageLevel

private[simba] case class HashMapIndexedRelation(output: Seq[Attribute], child: SparkPlan,
    table_name: Option[String], column_keys: List[Attribute], index_name: String)(var _indexedRDD: IndexedRDD = null)
  extends IndexedRelation with MultiInstanceRelation {

  require(column_keys.length == 1)
  require(column_keys.head.dataType.isInstanceOf[NumericType])

  if (_indexedRDD == null) {
    buildIndex()
  }

  private[simba] def buildIndex(): Unit = {
    val numShufflePartitions = simbaSession.sessionState.simbaConf.indexPartitions

    val dataRDD = child.execute().map(row => {
      val eval_key = BindReferences.bindReference(column_keys.head, child.output).eval(row)
      (eval_key, row)
    })

    val partitionedRDD = HashPartition(dataRDD, numShufflePartitions)
    val indexed = partitionedRDD.mapPartitions(iter => {
      val data = iter.toArray
      val index = HashMapIndex(data)
      Array(IPartition(data.map(_._2), index)).iterator
    }).persist(StorageLevel.MEMORY_AND_DISK_SER)

    indexed.setName(table_name.map(n => s"$n $index_name").getOrElse(child.toString))
    _indexedRDD = indexed
  }

  override def newInstance(): IndexedRelation = {
    HashMapIndexedRelation(output.map(_.newInstance()), child, table_name,
      column_keys, index_name)(_indexedRDD).asInstanceOf[this.type]
  }

  override def withOutput(new_output: Seq[Attribute]): IndexedRelation = {
    HashMapIndexedRelation(new_output, child, table_name, column_keys, index_name)(_indexedRDD)
  }
}
Example 3
Source File: TreapIndexedRelation.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.index

import org.apache.spark.sql.simba.partitioner.RangePartition
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.NumericType
import org.apache.spark.storage.StorageLevel

private[simba] case class TreapIndexedRelation(output: Seq[Attribute], child: SparkPlan,
    table_name: Option[String], column_keys: List[Attribute], index_name: String)
    (var _indexedRDD: IndexedRDD = null, var range_bounds: Array[Double] = null)
  extends IndexedRelation with MultiInstanceRelation {

  require(column_keys.length == 1)
  require(column_keys.head.dataType.isInstanceOf[NumericType])

  val numShufflePartitions = simbaSession.sessionState.simbaConf.indexPartitions

  if (_indexedRDD == null) {
    buildIndex()
  }

  private[simba] def buildIndex(): Unit = {
    val dataRDD = child.execute().map(row => {
      val eval_key = BindReferences.bindReference(column_keys.head, child.output).eval(row)
        .asInstanceOf[Double]
      (eval_key, row)
    })

    val (partitionedRDD, tmp_bounds) = RangePartition.rowPartition(dataRDD, numShufflePartitions)
    range_bounds = tmp_bounds
    val indexed = partitionedRDD.mapPartitions(iter => {
      val data = iter.toArray
      val index = Treap(data)
      Array(IPartition(data.map(_._2), index)).iterator
    }).persist(StorageLevel.MEMORY_AND_DISK_SER)

    indexed.setName(table_name.map(n => s"$n $index_name").getOrElse(child.toString))
    _indexedRDD = indexed
  }

  override def newInstance(): IndexedRelation = {
    TreapIndexedRelation(output.map(_.newInstance()), child, table_name, column_keys, index_name)(_indexedRDD)
      .asInstanceOf[this.type]
  }

  override def withOutput(new_output: Seq[Attribute]): IndexedRelation = {
    TreapIndexedRelation(new_output, child, table_name, column_keys, index_name)(_indexedRDD, range_bounds)
  }
}
Example 4
Source File: TreeMapIndexedRelation.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.index

import org.apache.spark.sql.simba.partitioner.RangePartition
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.NumericType
import org.apache.spark.storage.StorageLevel

private[simba] case class TreeMapIndexedRelation(output: Seq[Attribute], child: SparkPlan,
    table_name: Option[String], column_keys: List[Attribute], index_name: String)
    (var _indexedRDD: IndexedRDD = null, var range_bounds: Array[Double] = null)
  extends IndexedRelation with MultiInstanceRelation {

  require(column_keys.length == 1)
  require(column_keys.head.dataType.isInstanceOf[NumericType])

  if (_indexedRDD == null) {
    buildIndex()
  }

  private[simba] def buildIndex(): Unit = {
    val numShufflePartitions = simbaSession.sessionState.simbaConf.indexPartitions

    val dataRDD = child.execute().map(row => {
      val eval_key = BindReferences.bindReference(column_keys.head, child.output).eval(row)
        .asInstanceOf[Double]
      (eval_key, row)
    })

    val (partitionedRDD, tmp_bounds) = RangePartition.rowPartition(dataRDD, numShufflePartitions)
    range_bounds = tmp_bounds
    val indexed = partitionedRDD.mapPartitions(iter => {
      val data = iter.toArray
      val index = TreeMapIndex(data)
      Array(IPartition(data.map(_._2), index)).iterator
    }).persist(StorageLevel.MEMORY_AND_DISK_SER)

    indexed.setName(table_name.map(n => s"$n $index_name").getOrElse(child.toString))
    _indexedRDD = indexed
  }

  override def newInstance(): IndexedRelation = {
    TreeMapIndexedRelation(output.map(_.newInstance()), child, table_name, column_keys, index_name)(_indexedRDD)
      .asInstanceOf[this.type]
  }

  override def withOutput(new_output: Seq[Attribute]): IndexedRelation = {
    TreeMapIndexedRelation(new_output, child, table_name, column_keys, index_name)(_indexedRDD, range_bounds)
  }
}
Example 5
Source File: QuadTreeIndexedRelation.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.index

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{DoubleType, IntegerType}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.simba.partitioner.QuadTreePartitioner
import org.apache.spark.sql.simba.spatial.Point

private[simba] case class QuadTreeIndexedRelation(output: Seq[Attribute], child: SparkPlan,
    table_name: Option[String], column_keys: List[Attribute], index_name: String)
    (var _indexedRDD: IndexedRDD = null, var global_index: QuadTree = null)
  extends IndexedRelation with MultiInstanceRelation {

  private def checkKeys: Boolean = {
    for (i <- column_keys.indices)
      if (!(column_keys(i).dataType.isInstanceOf[DoubleType] ||
        column_keys(i).dataType.isInstanceOf[IntegerType])) {
        return false
      }
    true
  }
  require(checkKeys)

  if (_indexedRDD == null) {
    buildIndex()
  }

  private[simba] def buildIndex(): Unit = {
    val numShufflePartitions = simbaSession.sessionState.simbaConf.indexPartitions
    val sampleRate = simbaSession.sessionState.simbaConf.sampleRate
    val tranferThreshold = simbaSession.sessionState.simbaConf.transferThreshold

    val dataRDD = child.execute().map(row => {
      val now = column_keys.map(x =>
        BindReferences.bindReference(x, child.output).eval(row).asInstanceOf[Number].doubleValue()
      ).toArray
      (new Point(now), row)
    })

    val dimension = column_keys.length
    val (partitionedRDD, _, global_qtree) = QuadTreePartitioner(dataRDD, dimension,
      numShufflePartitions, sampleRate, tranferThreshold)

    val indexed = partitionedRDD.mapPartitions { iter =>
      val data = iter.toArray
      val index: QuadTree =
        if (data.length > 0) QuadTree(data.map(_._1).zipWithIndex)
        else null
      Array(IPartition(data.map(_._2), index)).iterator
    }.persist(StorageLevel.MEMORY_AND_DISK_SER)

    indexed.setName(table_name.map(name => s"$name $index_name").getOrElse(child.toString))
    _indexedRDD = indexed
    global_index = global_qtree
  }

  override def newInstance(): IndexedRelation = {
    new QuadTreeIndexedRelation(output.map(_.newInstance()), child, table_name,
      column_keys, index_name)(_indexedRDD)
      .asInstanceOf[this.type]
  }

  override def withOutput(new_output: Seq[Attribute]): IndexedRelation = {
    new QuadTreeIndexedRelation(new_output, child, table_name, column_keys,
      index_name)(_indexedRDD, global_index)
  }
}
Example 6
Source File: IndexedRelation.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.index

import org.apache.spark.sql.simba.SimbaSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan

private[simba] case class IPartition(data: Array[InternalRow], index: Index)

private[simba] object IndexedRelation {
  def apply(child: SparkPlan, table_name: Option[String], index_type: IndexType,
            column_keys: List[Attribute], index_name: String): IndexedRelation = {
    index_type match {
      case TreeMapType =>
        TreeMapIndexedRelation(child.output, child, table_name, column_keys, index_name)()
      case TreapType =>
        TreapIndexedRelation(child.output, child, table_name, column_keys, index_name)()
      case RTreeType =>
        RTreeIndexedRelation(child.output, child, table_name, column_keys, index_name)()
      case HashMapType =>
        HashMapIndexedRelation(child.output, child, table_name, column_keys, index_name)()
      case _ => null
    }
  }
}

private[simba] abstract class IndexedRelation extends LogicalPlan {
  self: Product =>

  var _indexedRDD: IndexedRDD
  def indexedRDD: IndexedRDD = _indexedRDD

  def simbaSession = SimbaSession.getActiveSession.orNull

  override def children: Seq[LogicalPlan] = Nil

  def output: Seq[Attribute]

  def withOutput(newOutput: Seq[Attribute]): IndexedRelation
}
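The companion object above is a small factory keyed by IndexType. A hedged usage sketch follows; it assumes the index-type objects (for example RTreeType) live in the same org.apache.spark.sql.simba.index package, and the table and index names are made up.

import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.simba.index.{IndexedRelation, RTreeType}

// Build an R-tree index over the physical plan of a table, then reuse the cached partitions.
def buildSpatialIndex(child: SparkPlan, keys: List[Attribute]): IndexedRelation = {
  val relation = IndexedRelation(child, Some("sensor_readings"), RTreeType, keys, "rt_idx")
  relation.indexedRDD.count()  // materialize the persisted IPartition RDD
  relation
}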
Example 7
Source File: RTreeIndexedRelation.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.index

import org.apache.spark.sql.simba.ShapeType
import org.apache.spark.sql.simba.partitioner.STRPartition
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.NumericType
import org.apache.spark.storage.StorageLevel

private[simba] case class RTreeIndexedRelation(output: Seq[Attribute], child: SparkPlan,
    table_name: Option[String], column_keys: List[Attribute], index_name: String)
    (var _indexedRDD: IndexedRDD = null, var global_rtree: RTree = null)
  extends IndexedRelation with MultiInstanceRelation {

  var isPoint = false

  private def checkKeys: Boolean = {
    if (column_keys.length > 1) {
      for (i <- column_keys.indices)
        if (!column_keys(i).dataType.isInstanceOf[NumericType]) {
          return false
        }
      true
    } else { // length = 1; we do not support one dimension R-tree
      column_keys.head.dataType match {
        case t: ShapeType =>
          isPoint = true
          true
        case _ => false
      }
    }
  }
  require(checkKeys)

  val dimension = ShapeUtils.getPointFromRow(child.execute().first(), column_keys, child, isPoint).coord.length

  if (_indexedRDD == null) {
    buildIndex()
  }

  private[simba] def buildIndex(): Unit = {
    val numShufflePartitions = simbaSession.sessionState.simbaConf.indexPartitions
    val maxEntriesPerNode = simbaSession.sessionState.simbaConf.maxEntriesPerNode
    val sampleRate = simbaSession.sessionState.simbaConf.sampleRate
    val transferThreshold = simbaSession.sessionState.simbaConf.transferThreshold
    val dataRDD = child.execute().map(row => {
      (ShapeUtils.getPointFromRow(row, column_keys, child, isPoint), row)
    })

    val max_entries_per_node = maxEntriesPerNode
    val (partitionedRDD, mbr_bounds) = STRPartition(dataRDD, dimension, numShufflePartitions,
      sampleRate, transferThreshold, max_entries_per_node)

    val indexed = partitionedRDD.mapPartitions { iter =>
      val data = iter.toArray
      var index: RTree = null
      if (data.length > 0) index = RTree(data.map(_._1).zipWithIndex, max_entries_per_node)
      Array(IPartition(data.map(_._2), index)).iterator
    }.persist(StorageLevel.MEMORY_AND_DISK_SER)

    val partitionSize = indexed.mapPartitions(iter => iter.map(_.data.length)).collect()

    global_rtree = RTree(mbr_bounds.zip(partitionSize)
      .map(x => (x._1._1, x._1._2, x._2)), max_entries_per_node)
    indexed.setName(table_name.map(n => s"$n $index_name").getOrElse(child.toString))
    _indexedRDD = indexed
  }

  override def newInstance(): IndexedRelation = {
    RTreeIndexedRelation(output.map(_.newInstance()), child, table_name,
      column_keys, index_name)(_indexedRDD).asInstanceOf[this.type]
  }

  override def withOutput(new_output: Seq[Attribute]): IndexedRelation = {
    RTreeIndexedRelation(new_output, child, table_name, column_keys,
      index_name)(_indexedRDD, global_rtree)
  }
}
Example 8
Source File: ShapeUtils.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.util

import org.apache.spark.sql.simba.{ShapeSerializer, ShapeType}
import org.apache.spark.sql.simba.expression.PointWrapper
import org.apache.spark.sql.simba.spatial.{Point, Shape}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences, Expression, UnsafeArrayData}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan

object ShapeUtils {
  def getPointFromRow(row: InternalRow, columns: List[Attribute], plan: SparkPlan,
                      isPoint: Boolean): Point = {
    if (isPoint) {
      ShapeSerializer.deserialize(BindReferences.bindReference(columns.head, plan.output)
        .eval(row).asInstanceOf[UnsafeArrayData].toByteArray).asInstanceOf[Point]
    } else {
      Point(columns.toArray.map(BindReferences.bindReference(_, plan.output).eval(row)
        .asInstanceOf[Number].doubleValue()))
    }
  }

  def getPointFromRow(row: InternalRow, columns: List[Attribute], plan: LogicalPlan,
                      isPoint: Boolean): Point = {
    if (isPoint) {
      ShapeSerializer.deserialize(BindReferences.bindReference(columns.head, plan.output)
        .eval(row).asInstanceOf[UnsafeArrayData].toByteArray).asInstanceOf[Point]
    } else {
      Point(columns.toArray.map(BindReferences.bindReference(_, plan.output).eval(row)
        .asInstanceOf[Number].doubleValue()))
    }
  }

  def getShape(expression: Expression, input: InternalRow): Shape = {
    if (!expression.isInstanceOf[PointWrapper] && expression.dataType.isInstanceOf[ShapeType]) {
      ShapeSerializer.deserialize(expression.eval(input).asInstanceOf[UnsafeArrayData].toByteArray)
    } else if (expression.isInstanceOf[PointWrapper]) {
      expression.eval(input).asInstanceOf[Shape]
    } else throw new UnsupportedOperationException("Query shape should be of ShapeType")
  }

  def getShape(expression: Expression, schema: Seq[Attribute], input: InternalRow): Shape = {
    if (!expression.isInstanceOf[PointWrapper] && expression.dataType.isInstanceOf[ShapeType]) {
      ShapeSerializer.deserialize(BindReferences.bindReference(expression, schema)
        .eval(input).asInstanceOf[UnsafeArrayData].toByteArray)
    } else if (expression.isInstanceOf[PointWrapper]) {
      BindReferences.bindReference(expression, schema).eval(input).asInstanceOf[Shape]
    } else throw new UnsupportedOperationException("Query shape should be of ShapeType")
  }
}
Example 9
Source File: SnowflakePlan.scala From spark-snowflake with Apache License 2.0
package net.snowflake.spark.snowflake.pushdowns

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{StructField, StructType}

case class SnowflakePlan(output: Seq[Attribute], rdd: RDD[InternalRow]) extends SparkPlan {

  override def children: Seq[SparkPlan] = Nil

  protected override def doExecute(): RDD[InternalRow] = {
    val schema = StructType(
      output.map(attr => StructField(attr.name, attr.dataType, attr.nullable))
    )

    rdd.mapPartitions { iter =>
      val project = UnsafeProjection.create(schema)
      iter.map(project)
    }
  }
}
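SnowflakePlan simply re-projects an already materialized RDD[InternalRow] into unsafe rows, so the natural place to use it is a planner strategy. The sketch below is hypothetical: the PushedSnowflakeQuery logical node and the SnowflakeStrategy name are invented for illustration and are not part of spark-snowflake.

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan}
import org.apache.spark.sql.execution.SparkPlan
import net.snowflake.spark.snowflake.pushdowns.SnowflakePlan

// Hypothetical logical node carrying rows produced by an earlier pushdown step.
case class PushedSnowflakeQuery(output: Seq[Attribute], rdd: RDD[InternalRow]) extends LeafNode

object SnowflakeStrategy extends Strategy {
  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case PushedSnowflakeQuery(output, rdd) => SnowflakePlan(output, rdd) :: Nil
    case _ => Nil
  }
}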
Example 10
Source File: ShuffledHashJoinExec.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics

case class ShuffledHashJoinExec(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    joinType: JoinType,
    buildSide: BuildSide,
    condition: Option[Expression],
    left: SparkPlan,
    right: SparkPlan)
  extends BinaryExecNode with HashJoin {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
    "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"),
    "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map"))

  override def requiredChildDistribution: Seq[Distribution] =
    ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil

  private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = {
    val buildDataSize = longMetric("buildDataSize")
    val buildTime = longMetric("buildTime")
    val start = System.nanoTime()
    val context = TaskContext.get()
    val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager())
    buildTime += (System.nanoTime() - start) / 1000000
    buildDataSize += relation.estimatedSize
    // This relation is usually used until the end of task.
    context.addTaskCompletionListener(_ => relation.close())
    relation
  }

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) =>
      val hashed = buildHashedRelation(buildIter)
      join(streamIter, hashed, numOutputRows)
    }
  }
}
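ShuffledHashJoinExec is rarely constructed by hand; the planner picks it for an equi-join when broadcast is ruled out and sort-merge join is not preferred. A hedged sketch of nudging the planner that way (the configuration keys are standard Spark SQL settings, but the exact selection logic depends on the Spark version, and `spark`, `orders`, and `customers` are assumed to exist):

spark.conf.set("spark.sql.join.preferSortMergeJoin", "false")
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")  // rule out broadcast joins
val joined = orders.join(customers, Seq("customer_id"))
joined.explain()  // may show ShuffledHashJoinExec when the build side is small enough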
Example 11
Source File: CartesianProductExec.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter

class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    // We will not sort the rows, so prefixComparator and recordComparator are null.
    val sorter = UnsafeExternalSorter.create(
      context.taskMemoryManager(),
      SparkEnv.get.blockManager,
      SparkEnv.get.serializerManager,
      context,
      null,
      null,
      1024,
      SparkEnv.get.memoryManager.pageSizeBytes,
      SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold",
        UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD),
      false)

    val partition = split.asInstanceOf[CartesianPartition]
    for (y <- rdd2.iterator(partition.s2, context)) {
      sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false)
    }

    // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow]
    def createIter(): Iterator[UnsafeRow] = {
      val iter = sorter.getIterator
      val unsafeRow = new UnsafeRow(numFieldsOfRight)
      new Iterator[UnsafeRow] {
        override def hasNext: Boolean = {
          iter.hasNext
        }
        override def next(): UnsafeRow = {
          iter.loadNext()
          unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength)
          unsafeRow
        }
      }
    }

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, sorter.cleanupResources())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size)
    pair.mapPartitionsInternal { iter =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition: (InternalRow) => Boolean =
          newPredicate(condition.get, left.output ++ right.output)
        val joined = new JoinedRow
        iter.filter { r =>
          boundCondition(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}
Example 12
Source File: Exchange.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.exchange

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      return plan
    }
    // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls.
    val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
    plan.transformUp {
      case exchange: Exchange =>
        // the exchanges that have same results usually also have same schemas (same column names).
        val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
        val samePlan = sameSchema.find { e =>
          exchange.sameResult(e)
        }
        if (samePlan.isDefined) {
          // Keep the output of this exchange, the following plans require that to resolve
          // attributes.
          ReusedExchangeExec(exchange.output, samePlan.get)
        } else {
          sameSchema += exchange
          exchange
        }
    }
  }
}
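In a stock Spark build this rule runs automatically during physical-plan preparation, so applying it by hand is only useful for inspection. A hedged sketch of doing that (SQLConf is an internal API, and `df` is an assumed existing DataFrame):

import org.apache.spark.sql.execution.exchange.ReuseExchange
import org.apache.spark.sql.internal.SQLConf

val beforePrep = df.queryExecution.sparkPlan          // physical plan before preparation rules
val withReuse = ReuseExchange(SQLConf.get).apply(beforePrep)
// Duplicate exchanges in withReuse are now wrapped as ReusedExchangeExec nodes.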
Example 13
Source File: commands.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.command

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.errors.TreeNodeException
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.debug._
import org.apache.spark.sql.execution.streaming.IncrementalExecution
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types._

case class ExplainCommand(
    logicalPlan: LogicalPlan,
    override val output: Seq[Attribute] =
      Seq(AttributeReference("plan", StringType, nullable = true)()),
    extended: Boolean = false,
    codegen: Boolean = false)
  extends RunnableCommand {

  // Run through the optimizer to generate the physical plan.
  override def run(sparkSession: SparkSession): Seq[Row] = try {
    val queryExecution =
      if (logicalPlan.isStreaming) {
        // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the
        // output mode does not matter since there is no `Sink`.
        new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0)
      } else {
        sparkSession.sessionState.executePlan(logicalPlan)
      }
    val outputString =
      if (codegen) {
        codegenString(queryExecution.executedPlan)
      } else if (extended) {
        queryExecution.toString
      } else {
        queryExecution.simpleString
      }
    Seq(Row(outputString))
  } catch { case cause: TreeNodeException[_] =>
    ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_))
  }
}
Example 14
Source File: ExtraStrategiesSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.test.SharedSQLContext

case class FastOperator(output: Seq[Attribute]) extends SparkPlan {

  override protected def doExecute(): RDD[InternalRow] = {
    val str = Literal("so fast").value
    val row = new GenericInternalRow(Array[Any](str))
    val unsafeProj = UnsafeProjection.create(schema)
    val unsafeRow = unsafeProj(row).copy()
    sparkContext.parallelize(Seq(unsafeRow))
  }

  override def producedAttributes: AttributeSet = outputSet
  override def children: Seq[SparkPlan] = Nil
}

object TestStrategy extends Strategy {
  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case Project(Seq(attr), _) if attr.name == "a" =>
      FastOperator(attr.toAttribute :: Nil) :: Nil
    case _ => Nil
  }
}

class ExtraStrategiesSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("insert an extraStrategy") {
    try {
      spark.experimental.extraStrategies = TestStrategy :: Nil

      val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b")
      checkAnswer(
        df.select("a"),
        Row("so fast"))

      checkAnswer(
        df.select("a", "b"),
        Row("so slow", 1))
    } finally {
      spark.experimental.extraStrategies = Nil
    }
  }
}
Example 15
Source File: HBaseSQLContext.scala From Backup-Repo with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.OverrideCatalog
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan}
import org.apache.spark.sql.hbase.execution.{AddCoprocessor, HBaseStrategies}

class HBaseSQLContext(sc: SparkContext) extends SQLContext(sc) {
  self =>

  def this(sparkContext: JavaSparkContext) = this(sparkContext.sc)

  protected[sql] override lazy val conf: SQLConf = new HBaseSQLConf

  HBaseConfiguration.merge(
    sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration))

  @transient
  override protected[sql] lazy val catalog: HBaseCatalog =
    new HBaseCatalog(this, sc.hadoopConfiguration) with OverrideCatalog

  experimental.extraStrategies = Seq((new SparkPlanner with HBaseStrategies).HBaseDataSource)

  @transient
  override protected[sql] val prepareForExecution = new RuleExecutor[SparkPlan] {
    val batches =
      Batch("Add exchange", Once, EnsureRequirements(self)) ::
      Batch("Add coprocessor", Once, AddCoprocessor(self)) ::
      Nil
  }
}
Example 16
Source File: CreateHiveTableAsSelectCommand.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand

case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumnNames: Seq[String],
    mode: SaveMode)
  extends DataWritingCommand {

  private val tableIdentifier = tableDesc.identifier

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog
    if (catalog.tableExists(tableIdentifier)) {
      assert(mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      // For CTAS, there is no static partition values to insert.
      val partition = tableDesc.partitionColumnNames.map(_ -> None).toMap
      InsertIntoHiveTable(
        tableDesc,
        partition,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumnNames = outputColumnNames).run(sparkSession, child)
    } else {
      // TODO ideally, we should get the output data ready first and then
      // add the relation into catalog, just in case of failure occurs while data
      // processing.
      assert(tableDesc.schema.isEmpty)
      catalog.createTable(
        tableDesc.copy(schema = outputColumns.toStructType), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        InsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumnNames = outputColumnNames).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}, " +
    s"TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
}
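This command is what a Hive-format CREATE TABLE ... AS SELECT ultimately executes. A minimal hedged way to trigger it through SQL (database, table, and column names are made up):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("ctas-example")
  .enableHiveSupport()
  .getOrCreate()

// Typically planned as CreateHiveTableAsSelectCommand when the target is a Hive serde table.
spark.sql(
  "CREATE TABLE reports.daily_totals AS " +
  "SELECT day, SUM(amount) AS total FROM sales GROUP BY day")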
Example 17
Source File: XSQLCreateHiveTableAsSelectCommand.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.xsql.execution.command

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand
import org.apache.spark.sql.xsql.XSQLSessionCatalog

case class XSQLCreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumnNames: Seq[String],
    mode: SaveMode)
  extends DataWritingCommand {

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    val tableIdentifier = catalog.getUsedTableIdentifier(tableDesc.identifier)
    val newTableDesc = tableDesc.copy(identifier = tableIdentifier)
    if (catalog.tableExists(tableIdentifier)) {
      assert(
        mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      XSQLInsertIntoHiveTable(
        newTableDesc,
        Map.empty,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumnNames = outputColumnNames).run(sparkSession, child)
    } else {
      // TODO ideally, we should get the output data ready first and then
      // add the relation into catalog, just in case of failure occurs while data
      // processing.
      assert(newTableDesc.schema.isEmpty)
      catalog.createTable(newTableDesc.copy(schema = query.schema), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(newTableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        XSQLInsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumnNames = outputColumnNames).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
}
Example 18
Source File: ShuffledHashJoinExec.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics

case class ShuffledHashJoinExec(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    joinType: JoinType,
    buildSide: BuildSide,
    condition: Option[Expression],
    left: SparkPlan,
    right: SparkPlan)
  extends BinaryExecNode with HashJoin {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
    "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"),
    "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map"))

  override def requiredChildDistribution: Seq[Distribution] =
    HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil

  private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = {
    val buildDataSize = longMetric("buildDataSize")
    val buildTime = longMetric("buildTime")
    val start = System.nanoTime()
    val context = TaskContext.get()
    val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager())
    buildTime += (System.nanoTime() - start) / 1000000
    buildDataSize += relation.estimatedSize
    // This relation is usually used until the end of task.
    context.addTaskCompletionListener[Unit](_ => relation.close())
    relation
  }

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) =>
      val hashed = buildHashedRelation(buildIter)
      join(streamIter, hashed, numOutputRows)
    }
  }
}
Example 19
Source File: CartesianProductExec.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, ExternalAppendOnlyUnsafeRowArray, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.util.CompletionIterator

class UnsafeCartesianRDD(
    left : RDD[UnsafeRow],
    right : RDD[UnsafeRow],
    numFieldsOfRight: Int,
    inMemoryBufferThreshold: Int,
    spillThreshold: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    val rowArray = new ExternalAppendOnlyUnsafeRowArray(inMemoryBufferThreshold, spillThreshold)

    val partition = split.asInstanceOf[CartesianPartition]
    rdd2.iterator(partition.s2, context).foreach(rowArray.add)

    // Create an iterator from rowArray
    def createIter(): Iterator[UnsafeRow] = rowArray.generateIterator()

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, rowArray.clear())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(
      leftResults,
      rightResults,
      right.output.size,
      sqlContext.conf.cartesianProductExecBufferInMemoryThreshold,
      sqlContext.conf.cartesianProductExecBufferSpillThreshold)
    pair.mapPartitionsWithIndexInternal { (index, iter) =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition = newPredicate(condition.get, left.output ++ right.output)
        boundCondition.initialize(index)
        val joined = new JoinedRow
        iter.filter { r =>
          boundCondition.eval(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}
Example 20
Source File: DataSourceV2Strategy.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources.v2

import scala.collection.mutable

import org.apache.spark.sql.{sources, Strategy}
import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet, Expression}
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, Repartition}
import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan}
import org.apache.spark.sql.execution.datasources.DataSourceStrategy
import org.apache.spark.sql.execution.streaming.continuous.{ContinuousCoalesceExec, WriteToContinuousDataSource, WriteToContinuousDataSourceExec}
import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, SupportsPushDownFilters, SupportsPushDownRequiredColumns}
import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader

object DataSourceV2Strategy extends Strategy {

  // TODO: nested column pruning.
  private def pruneColumns(
      reader: DataSourceReader,
      relation: DataSourceV2Relation,
      exprs: Seq[Expression]): Seq[AttributeReference] = {
    reader match {
      case r: SupportsPushDownRequiredColumns =>
        val requiredColumns = AttributeSet(exprs.flatMap(_.references))
        val neededOutput = relation.output.filter(requiredColumns.contains)
        if (neededOutput != relation.output) {
          r.pruneColumns(neededOutput.toStructType)
          val nameToAttr = relation.output.map(_.name).zip(relation.output).toMap
          r.readSchema().toAttributes.map {
            // We have to keep the attribute id during transformation.
            a => a.withExprId(nameToAttr(a.name).exprId)
          }
        } else {
          relation.output
        }

      case _ => relation.output
    }
  }

  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case PhysicalOperation(project, filters, relation: DataSourceV2Relation) =>
      val reader = relation.newReader()
      // `pushedFilters` will be pushed down and evaluated in the underlying data sources.
      // `postScanFilters` need to be evaluated after the scan.
      // `postScanFilters` and `pushedFilters` can overlap, e.g. the parquet row group filter.
      val (pushedFilters, postScanFilters) = pushFilters(reader, filters)
      val output = pruneColumns(reader, relation, project ++ postScanFilters)
      logInfo(
        s"""
           |Pushing operators to ${relation.source.getClass}
           |Pushed Filters: ${pushedFilters.mkString(", ")}
           |Post-Scan Filters: ${postScanFilters.mkString(",")}
           |Output: ${output.mkString(", ")}
         """.stripMargin)

      val scan = DataSourceV2ScanExec(
        output, relation.source, relation.options, pushedFilters, reader)

      val filterCondition = postScanFilters.reduceLeftOption(And)
      val withFilter = filterCondition.map(FilterExec(_, scan)).getOrElse(scan)

      // always add the projection, which will produce unsafe rows required by some operators
      ProjectExec(project, withFilter) :: Nil

    case r: StreamingDataSourceV2Relation =>
      // ensure there is a projection, which will produce unsafe rows required by some operators
      ProjectExec(r.output,
        DataSourceV2ScanExec(r.output, r.source, r.options, r.pushedFilters, r.reader)) :: Nil

    case WriteToDataSourceV2(writer, query) =>
      WriteToDataSourceV2Exec(writer, planLater(query)) :: Nil

    case AppendData(r: DataSourceV2Relation, query, _) =>
      WriteToDataSourceV2Exec(r.newWriter(), planLater(query)) :: Nil

    case WriteToContinuousDataSource(writer, query) =>
      WriteToContinuousDataSourceExec(writer, planLater(query)) :: Nil

    case Repartition(1, false, child) =>
      val isContinuous = child.collectFirst {
        case StreamingDataSourceV2Relation(_, _, _, r: ContinuousReader) => r
      }.isDefined

      if (isContinuous) {
        ContinuousCoalesceExec(1, planLater(child)) :: Nil
      } else {
        Nil
      }

    case _ => Nil
  }
}
Example 21
Source File: Exchange.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.exchange

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      return plan
    }
    // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls.
    val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
    plan.transformUp {
      case exchange: Exchange =>
        // the exchanges that have same results usually also have same schemas (same column names).
        val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
        val samePlan = sameSchema.find { e =>
          exchange.sameResult(e)
        }
        if (samePlan.isDefined) {
          // Keep the output of this exchange, the following plans require that to resolve
          // attributes.
          ReusedExchangeExec(exchange.output, samePlan.get)
        } else {
          sameSchema += exchange
          exchange
        }
    }
  }
}
Example 22
Source File: EvalPythonExec.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.python

import java.io.File

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.{SparkEnv, TaskContext}
import org.apache.spark.api.python.ChainedPythonFunctions
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{DataType, StructField, StructType}
import org.apache.spark.util.Utils

abstract class EvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan)
  extends SparkPlan {

  def children: Seq[SparkPlan] = child :: Nil

  override def producedAttributes: AttributeSet = AttributeSet(output.drop(child.output.length))

  private def collectFunctions(udf: PythonUDF): (ChainedPythonFunctions, Seq[Expression]) = {
    udf.children match {
      case Seq(u: PythonUDF) =>
        val (chained, children) = collectFunctions(u)
        (ChainedPythonFunctions(chained.funcs ++ Seq(udf.func)), children)
      case children =>
        // There should not be any other UDFs, or the children can't be evaluated directly.
        assert(children.forall(_.find(_.isInstanceOf[PythonUDF]).isEmpty))
        (ChainedPythonFunctions(Seq(udf.func)), udf.children)
    }
  }

  protected def evaluate(
      funcs: Seq[ChainedPythonFunctions],
      argOffsets: Array[Array[Int]],
      iter: Iterator[InternalRow],
      schema: StructType,
      context: TaskContext): Iterator[InternalRow]

  protected override def doExecute(): RDD[InternalRow] = {
    val inputRDD = child.execute().map(_.copy())

    inputRDD.mapPartitions { iter =>
      val context = TaskContext.get()

      // The queue used to buffer input rows so we can drain it to
      // combine input with output from Python.
      val queue = HybridRowQueue(context.taskMemoryManager(),
        new File(Utils.getLocalDir(SparkEnv.get.conf)), child.output.length)
      context.addTaskCompletionListener[Unit] { ctx =>
        queue.close()
      }

      val (pyFuncs, inputs) = udfs.map(collectFunctions).unzip

      // flatten all the arguments
      val allInputs = new ArrayBuffer[Expression]
      val dataTypes = new ArrayBuffer[DataType]
      val argOffsets = inputs.map { input =>
        input.map { e =>
          if (allInputs.exists(_.semanticEquals(e))) {
            allInputs.indexWhere(_.semanticEquals(e))
          } else {
            allInputs += e
            dataTypes += e.dataType
            allInputs.length - 1
          }
        }.toArray
      }.toArray
      val projection = newMutableProjection(allInputs, child.output)
      val schema = StructType(dataTypes.zipWithIndex.map { case (dt, i) =>
        StructField(s"_$i", dt)
      })

      // Add rows to queue to join later with the result.
      val projectedRowIter = iter.map { inputRow =>
        queue.add(inputRow.asInstanceOf[UnsafeRow])
        projection(inputRow)
      }

      val outputRowIterator = evaluate(
        pyFuncs, argOffsets, projectedRowIter, schema, context)

      val joined = new JoinedRow
      val resultProj = UnsafeProjection.create(output, output)

      outputRowIterator.map { outputRow =>
        resultProj(joined(queue.remove(), outputRow))
      }
    }
  }
}
Example 23
Source File: ArrowEvalPythonExec.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.python

import scala.collection.JavaConverters._

import org.apache.spark.TaskContext
import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.arrow.ArrowUtils
import org.apache.spark.sql.types.StructType

case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan)
  extends EvalPythonExec(udfs, output, child) {

  private val batchSize = conf.arrowMaxRecordsPerBatch
  private val sessionLocalTimeZone = conf.sessionLocalTimeZone
  private val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf)

  protected override def evaluate(
      funcs: Seq[ChainedPythonFunctions],
      argOffsets: Array[Array[Int]],
      iter: Iterator[InternalRow],
      schema: StructType,
      context: TaskContext): Iterator[InternalRow] = {

    val outputTypes = output.drop(child.output.length).map(_.dataType)

    // DO NOT use iter.grouped(). See BatchIterator.
    val batchIter = if (batchSize > 0) new BatchIterator(iter, batchSize) else Iterator(iter)

    val columnarBatchIter = new ArrowPythonRunner(
      funcs,
      PythonEvalType.SQL_SCALAR_PANDAS_UDF,
      argOffsets,
      schema,
      sessionLocalTimeZone,
      pythonRunnerConf).compute(batchIter, context.partitionId(), context)

    new Iterator[InternalRow] {

      private var currentIter = if (columnarBatchIter.hasNext) {
        val batch = columnarBatchIter.next()
        val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType())
        assert(outputTypes == actualDataTypes, "Invalid schema from pandas_udf: " +
          s"expected ${outputTypes.mkString(", ")}, got ${actualDataTypes.mkString(", ")}")
        batch.rowIterator.asScala
      } else {
        Iterator.empty
      }

      override def hasNext: Boolean = currentIter.hasNext || {
        if (columnarBatchIter.hasNext) {
          currentIter = columnarBatchIter.next().rowIterator.asScala
          hasNext
        } else {
          false
        }
      }

      override def next(): InternalRow = currentIter.next()
    }
  }
}
Example 24
Source File: BatchEvalPythonExec.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.python

import scala.collection.JavaConverters._

import net.razorvine.pickle.{Pickler, Unpickler}

import org.apache.spark.TaskContext
import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{StructField, StructType}

case class BatchEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan)
  extends EvalPythonExec(udfs, output, child) {

  protected override def evaluate(
      funcs: Seq[ChainedPythonFunctions],
      argOffsets: Array[Array[Int]],
      iter: Iterator[InternalRow],
      schema: StructType,
      context: TaskContext): Iterator[InternalRow] = {
    EvaluatePython.registerPicklers()  // register pickler for Row

    val dataTypes = schema.map(_.dataType)
    val needConversion = dataTypes.exists(EvaluatePython.needConversionInPython)

    // enable memo iff we serialize the row with schema (schema and class should be memorized)
    val pickle = new Pickler(needConversion)
    // Input iterator to Python: input rows are grouped so we send them in batches to Python.
    // For each row, add it to the queue.
    val inputIterator = iter.map { row =>
      if (needConversion) {
        EvaluatePython.toJava(row, schema)
      } else {
        // fast path for these types that does not need conversion in Python
        val fields = new Array[Any](row.numFields)
        var i = 0
        while (i < row.numFields) {
          val dt = dataTypes(i)
          fields(i) = EvaluatePython.toJava(row.get(i, dt), dt)
          i += 1
        }
        fields
      }
    }.grouped(100).map(x => pickle.dumps(x.toArray))

    // Output iterator for results from Python.
    val outputIterator = new PythonUDFRunner(funcs, PythonEvalType.SQL_BATCHED_UDF, argOffsets)
      .compute(inputIterator, context.partitionId(), context)

    val unpickle = new Unpickler
    val mutableRow = new GenericInternalRow(1)
    val resultType = if (udfs.length == 1) {
      udfs.head.dataType
    } else {
      StructType(udfs.map(u => StructField("", u.dataType, u.nullable)))
    }

    val fromJava = EvaluatePython.makeFromJava(resultType)

    outputIterator.flatMap { pickedResult =>
      val unpickledBatch = unpickle.loads(pickedResult)
      unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala
    }.map { result =>
      if (udfs.length == 1) {
        // fast path for single UDF
        mutableRow(0) = fromJava(result)
        mutableRow
      } else {
        fromJava(result).asInstanceOf[InternalRow]
      }
    }
  }
}
Example 25
Source File: DataWritingCommand.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.command

import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker
import org.apache.spark.sql.execution.datasources.FileFormatWriter
import org.apache.spark.sql.execution.metric.SQLMetric
import org.apache.spark.util.SerializableConfiguration

// Companion-object wrapper assumed here; the excerpt shows only this helper method.
object DataWritingCommand {
  def logicalPlanOutputWithNames(
      query: LogicalPlan,
      names: Seq[String]): Seq[Attribute] = {
    // Save the output attributes to a variable to avoid duplicated function calls.
    val outputAttributes = query.output
    assert(outputAttributes.length == names.length,
      "The length of provided names doesn't match the length of output attributes.")
    outputAttributes.zip(names).map { case (attr, outputName) =>
      attr.withName(outputName)
    }
  }
}
Example 26
Source File: StreamingGlobalLimitExec.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.util.concurrent.TimeUnit.NANOSECONDS

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.execution.streaming.state.StateStoreOps
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType}
import org.apache.spark.util.CompletionIterator

case class StreamingGlobalLimitExec(
    streamLimit: Long,
    child: SparkPlan,
    stateInfo: Option[StatefulOperatorStateInfo] = None,
    outputMode: Option[OutputMode] = None)
  extends UnaryExecNode with StateStoreWriter {

  private val keySchema = StructType(Array(StructField("key", NullType)))
  private val valueSchema = StructType(Array(StructField("value", LongType)))

  override protected def doExecute(): RDD[InternalRow] = {
    metrics // force lazy init at driver

    assert(outputMode.isDefined && outputMode.get == InternalOutputModes.Append,
      "StreamingGlobalLimitExec is only valid for streams in Append output mode")

    child.execute().mapPartitionsWithStateStore(
      getStateInfo,
      keySchema,
      valueSchema,
      indexOrdinal = None,
      sqlContext.sessionState,
      Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) =>
      val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null)))
      val numOutputRows = longMetric("numOutputRows")
      val numUpdatedStateRows = longMetric("numUpdatedStateRows")
      val allUpdatesTimeMs = longMetric("allUpdatesTimeMs")
      val commitTimeMs = longMetric("commitTimeMs")
      val updatesStartTimeNs = System.nanoTime

      val preBatchRowCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L)
      var cumulativeRowCount = preBatchRowCount

      val result = iter.filter { r =>
        val x = cumulativeRowCount < streamLimit
        if (x) {
          cumulativeRowCount += 1
        }
        x
      }

      CompletionIterator[InternalRow, Iterator[InternalRow]](result, {
        if (cumulativeRowCount > preBatchRowCount) {
          numUpdatedStateRows += 1
          numOutputRows += cumulativeRowCount - preBatchRowCount
          store.put(key, getValueRow(cumulativeRowCount))
        }
        allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs)
        commitTimeMs += timeTakenMs { store.commit() }
        setStoreMetrics(store)
      })
    }
  }

  override def output: Seq[Attribute] = child.output

  override def outputPartitioning: Partitioning = child.outputPartitioning

  override def requiredChildDistribution: Seq[Distribution] = AllTuples :: Nil

  private def getValueRow(value: Long): UnsafeRow = {
    UnsafeProjection.create(valueSchema)(new GenericInternalRow(Array[Any](value)))
  }
}
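StreamingGlobalLimitExec is the stateful operator behind a global limit on an append-mode streaming query. A hedged usage sketch (the rate source and console sink are just convenient built-ins for trying it out, and `spark` is an assumed active SparkSession):

import org.apache.spark.sql.streaming.OutputMode

val limited = spark.readStream
  .format("rate")
  .load()
  .limit(100)   // a global limit on a streaming Dataset

val query = limited.writeStream
  .format("console")
  .outputMode(OutputMode.Append())
  .start()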
Example 27
Source File: ContinuousCoalesceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import java.util.UUID import org.apache.spark.{HashPartitioner, SparkEnv} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeRow} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.continuous.shuffle.{ContinuousShuffleReadPartition, ContinuousShuffleReadRDD} case class ContinuousCoalesceExec(numPartitions: Int, child: SparkPlan) extends SparkPlan { override def output: Seq[Attribute] = child.output override def children: Seq[SparkPlan] = child :: Nil override def outputPartitioning: Partitioning = SinglePartition override def doExecute(): RDD[InternalRow] = { assert(numPartitions == 1) new ContinuousCoalesceRDD( sparkContext, numPartitions, conf.continuousStreamingExecutorQueueSize, sparkContext.getLocalProperty(ContinuousExecution.EPOCH_INTERVAL_KEY).toLong, child.execute()) } }
Example 28
Source File: WriteToContinuousDataSourceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import scala.util.control.NonFatal import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter case class WriteToContinuousDataSourceExec(writer: StreamWriter, query: SparkPlan) extends SparkPlan with Logging { override def children: Seq[SparkPlan] = Seq(query) override def output: Seq[Attribute] = Nil override protected def doExecute(): RDD[InternalRow] = { val writerFactory = writer.createWriterFactory() val rdd = new ContinuousWriteRDD(query.execute(), writerFactory) logInfo(s"Start processing data source writer: $writer. " + s"The input RDD has ${rdd.partitions.length} partitions.") EpochCoordinatorRef.get( sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), sparkContext.env) .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions)) try { // Force the RDD to run so continuous processing starts; no data is actually being collected // to the driver, as ContinuousWriteRDD outputs nothing. rdd.collect() } catch { case _: InterruptedException => // Interruption is how continuous queries are ended, so accept and ignore the exception. case cause: Throwable => cause match { // Do not wrap interruption exceptions that will be handled by streaming specially. case _ if StreamExecution.isInterruptionException(cause) => throw cause // Only wrap non fatal exceptions. case NonFatal(e) => throw new SparkException("Writing job aborted.", e) case _ => throw cause } } sparkContext.emptyRDD } }
Example 29
Source File: EventTimeWatermarkExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, child: SparkPlan) extends UnaryExecNode { val eventTimeStats = new EventTimeStatsAccum() val delayMs = EventTimeWatermark.getDelayMs(delay) sparkContext.register(eventTimeStats) override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => eventTimeStats.add(getEventTime(row).getLong(0) / 1000) row } } } // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delayMs) .build() a.withMetadata(updatedMetadata) } else if (a.metadata.contains(EventTimeWatermark.delayKey)) { // Remove existing watermark val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .remove(EventTimeWatermark.delayKey) .build() a.withMetadata(updatedMetadata) } else { a } } }
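A small sketch of the DataFrame-level call that plans this operator: `withWatermark` on an event-time column (a local SparkSession named `spark` and the rate source are assumed; window and delay values are illustrative):

import org.apache.spark.sql.functions._

val events = spark.readStream
  .format("rate")
  .option("rowsPerSecond", "5")
  .load()                                        // columns: timestamp, value

val counts = events
  .withWatermark("timestamp", "10 minutes")      // planned as an EventTimeWatermarkExec node
  .groupBy(window(col("timestamp"), "5 minutes"))
  .count()

counts.explain()                                  // the watermark operator shows up in the plan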
Example 30
Source File: BenchmarkQueryTest.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodeGenerator} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.execution.{SparkPlan, WholeStageCodegenExec} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.Utils abstract class BenchmarkQueryTest extends QueryTest with SharedSQLContext with BeforeAndAfterAll { // When Utils.isTesting is true, the RuleExecutor will issue an exception when hitting // the max iteration of analyzer/optimizer batches. assert(Utils.isTesting, "spark.testing is not set to true") protected override def afterAll(): Unit = { try { // For debugging dump some statistics about how much time was spent in various optimizer rules logWarning(RuleExecutor.dumpTimeSpent()) spark.sessionState.catalog.reset() } finally { super.afterAll() } } override def beforeAll() { super.beforeAll() RuleExecutor.resetMetrics() } protected def checkGeneratedCode(plan: SparkPlan): Unit = { val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() plan foreach { case s: WholeStageCodegenExec => codegenSubtrees += s case _ => } codegenSubtrees.toSeq.foreach { subtree => val code = subtree.doCodeGen()._2 try { // Just check the generated code can be properly compiled CodeGenerator.compile(code) } catch { case e: Exception => val msg = s""" |failed to compile: |Subtree: |$subtree |Generated code: |${CodeFormatter.format(code)} """.stripMargin throw new Exception(msg, e) } } } }
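Outside of a test suite, the same kind of inspection can be sketched directly against a query (a local SparkSession named `spark` is assumed; `debugCodegen()` comes from the `org.apache.spark.sql.execution.debug` implicits):

import org.apache.spark.sql.execution.WholeStageCodegenExec
import org.apache.spark.sql.execution.debug._

val df = spark.range(100).selectExpr("id % 10 AS k").groupBy("k").count()

val codegenSubtrees = df.queryExecution.executedPlan.collect {
  case w: WholeStageCodegenExec => w
}
println(s"whole-stage codegen subtrees: ${codegenSubtrees.size}")

df.debugCodegen()   // prints the generated Java source for each subtree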
Example 31
Source File: ExtractPythonUDFsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import org.apache.spark.sql.execution.{SparkPlan, SparkPlanTest} import org.apache.spark.sql.functions.col import org.apache.spark.sql.test.SharedSQLContext class ExtractPythonUDFsSuite extends SparkPlanTest with SharedSQLContext { import testImplicits.newProductEncoder import testImplicits.localSeqToDatasetHolder val batchedPythonUDF = new MyDummyPythonUDF val scalarPandasUDF = new MyDummyScalarPandasUDF private def collectBatchExec(plan: SparkPlan): Seq[BatchEvalPythonExec] = plan.collect { case b: BatchEvalPythonExec => b } private def collectArrowExec(plan: SparkPlan): Seq[ArrowEvalPythonExec] = plan.collect { case b: ArrowEvalPythonExec => b } test("Chained Batched Python UDFs should be combined to a single physical node") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c", batchedPythonUDF(col("a"))) .withColumn("d", batchedPythonUDF(col("c"))) val pythonEvalNodes = collectBatchExec(df2.queryExecution.executedPlan) assert(pythonEvalNodes.size == 1) } test("Chained Scalar Pandas UDFs should be combined to a single physical node") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c", scalarPandasUDF(col("a"))) .withColumn("d", scalarPandasUDF(col("c"))) val arrowEvalNodes = collectArrowExec(df2.queryExecution.executedPlan) assert(arrowEvalNodes.size == 1) } test("Mixed Batched Python UDFs and Pandas UDF should be separate physical node") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c", batchedPythonUDF(col("a"))) .withColumn("d", scalarPandasUDF(col("b"))) val pythonEvalNodes = collectBatchExec(df2.queryExecution.executedPlan) val arrowEvalNodes = collectArrowExec(df2.queryExecution.executedPlan) assert(pythonEvalNodes.size == 1) assert(arrowEvalNodes.size == 1) } test("Independent Batched Python UDFs and Scalar Pandas UDFs should be combined separately") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c1", batchedPythonUDF(col("a"))) .withColumn("c2", batchedPythonUDF(col("c1"))) .withColumn("d1", scalarPandasUDF(col("a"))) .withColumn("d2", scalarPandasUDF(col("d1"))) val pythonEvalNodes = collectBatchExec(df2.queryExecution.executedPlan) val arrowEvalNodes = collectArrowExec(df2.queryExecution.executedPlan) assert(pythonEvalNodes.size == 1) assert(arrowEvalNodes.size == 1) } test("Dependent Batched Python UDFs and Scalar Pandas UDFs should not be combined") { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = df.withColumn("c1", batchedPythonUDF(col("a"))) .withColumn("d1", scalarPandasUDF(col("c1"))) .withColumn("c2", batchedPythonUDF(col("d1"))) .withColumn("d2", scalarPandasUDF(col("c2"))) val pythonEvalNodes = collectBatchExec(df2.queryExecution.executedPlan) val arrowEvalNodes = collectArrowExec(df2.queryExecution.executedPlan) assert(pythonEvalNodes.size == 2) assert(arrowEvalNodes.size == 2) } }
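The `plan.collect { case ... }` pattern this suite relies on works for any SparkPlan node type. A sketch of the same idea for a different node (a local SparkSession named `spark` is assumed, and Spark 2.3+ where the class is named ShuffleExchangeExec):

import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec

val left  = spark.range(1000).selectExpr("id AS k", "id * 2 AS v1")
val right = spark.range(1000).selectExpr("id AS k", "id * 3 AS v2")
val joined = left.join(right, "k")

val exchanges = joined.queryExecution.executedPlan.collect {
  case e: ShuffleExchangeExec => e
}
println(s"shuffle exchanges in the plan: ${exchanges.size}")  // may be 0 if the join is broadcast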
Example 32
Source File: ExtraStrategiesSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.test.SharedSQLContext case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) val unsafeProj = UnsafeProjection.create(schema) val unsafeRow = unsafeProj(row).copy() sparkContext.parallelize(Seq(unsafeRow)) } override def producedAttributes: AttributeSet = outputSet override def children: Seq[SparkPlan] = Nil } object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => FastOperator(attr.toAttribute :: Nil) :: Nil case _ => Nil } } class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { try { spark.experimental.extraStrategies = TestStrategy :: Nil val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { spark.experimental.extraStrategies = Nil } } }
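Besides `spark.experimental.extraStrategies`, a strategy like the `TestStrategy` defined above can also be wired in when the session is built, via SparkSessionExtensions (available since Spark 2.2; the builder settings below are illustrative):

import org.apache.spark.sql.SparkSession

val sparkWithStrategy = SparkSession.builder()
  .master("local[1]")
  .appName("extra-strategy-sketch")
  .withExtensions { extensions =>
    extensions.injectPlannerStrategy(_ => TestStrategy)   // reuses TestStrategy from the suite above
  }
  .getOrCreate()

// Queries planned by this session now also consult TestStrategy.
sparkWithStrategy.range(1).toDF("a").select("a").explain()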
Example 33
Source File: ExtendedPlanner.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.extension import org.apache.spark.Logging import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.{SparkPlan, SparkPlanner} def planLaterExt(p: LogicalPlan): SparkPlan = planLater(p) def optimizedPlan(p: LogicalPlan): LogicalPlan = sqlContext.executePlan(p).optimizedPlan def optimizedRelationLookup(u: UnresolvedRelation): Option[LogicalPlan] = { if (sqlContext.catalog.tableExists(u.tableIdentifier)) { Some(optimizedPlan(u)) } else { None } } // TODO (AC) Remove this once table-valued function are rebased on top. def analyze(p: LogicalPlan): LogicalPlan = sqlContext.analyzer.execute(p) override def plan(p: LogicalPlan): Iterator[SparkPlan] = { val iter = strategies.view.flatMap({ strategy => val plans = strategy(p) if (plans.isEmpty) { logTrace(s"Strategy $strategy did not produce plans for $p") } else { logDebug(s"Strategy $strategy produced a plan for $p: ${plans.head}") } plans }).toIterator assert(iter.hasNext, s"No plan for $p") iter } }
Example 34
Source File: CreateTableStrategy.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.{DatasourceResolver, SQLContext, Strategy} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.{ExecutedCommand, SparkPlan} import org.apache.spark.sql.sources.TemporaryAndPersistentNature private[sql] case class CreateTableStrategy(sqlContext: SQLContext) extends Strategy { override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { // Currently we only handle cases where the user wants to instantiate a // persistent relation; any other case has to be handled by the datasource itself case CreateTableUsing(tableName, userSpecifiedSchema, provider, temporary, options, allowExisting, _) => DatasourceResolver.resolverFor(sqlContext).newInstanceOf(provider) match { case _: TemporaryAndPersistentNature => ExecutedCommand(CreateTableUsingTemporaryAwareCommand(tableName, userSpecifiedSchema, Array.empty[String], None, None, provider, options, temporary, allowExisting)) :: Nil case _ => Nil } case CreateTablePartitionedByUsing(tableId, userSpecifiedSchema, provider, partitioningFunction, partitioningColumns, temporary, options, allowExisting, _) => ResolvedDataSource.lookupDataSource(provider).newInstance() match { case _: TemporaryAndPersistentNature => ExecutedCommand(CreateTableUsingTemporaryAwareCommand( tableId, userSpecifiedSchema, Array.empty[String], Some(partitioningFunction), Some(partitioningColumns), provider, options, isTemporary = false, allowExisting)) :: Nil case _ => Nil } case _ => Nil } }
Example 35
Source File: RawSqlSourceProvider.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import java.util.concurrent.atomic.AtomicReference import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.execution.{PhysicalRDD, RDDConversions, SparkPlan} import org.apache.spark.sql.sources.RawDDLObjectType.RawDDLObjectType import org.apache.spark.sql.sources.RawDDLStatementType.RawDDLStatementType import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, SQLContext} case object RawDDLObjectType { sealed trait RawDDLObjectType { val name: String override def toString: String = name } sealed abstract class BaseRawDDLObjectType(val name: String) extends RawDDLObjectType sealed trait RawData case object PartitionFunction extends BaseRawDDLObjectType("partition function") case object PartitionScheme extends BaseRawDDLObjectType("partition scheme") case object Collection extends BaseRawDDLObjectType("collection") with RawData case object Series extends BaseRawDDLObjectType("table") with RawData case object Graph extends BaseRawDDLObjectType("graph") with RawData } case object RawDDLStatementType { sealed trait RawDDLStatementType case object Create extends RawDDLStatementType case object Drop extends RawDDLStatementType case object Append extends RawDDLStatementType case object Load extends RawDDLStatementType } protected def calculateSchema(): StructType }
Example 36
Source File: ColumnarShuffledHashJoinExec.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.sparkColumnarPlugin.execution import java.util.concurrent.TimeUnit._ import com.intel.sparkColumnarPlugin.vectorized._ import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, CodegenSupport, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import scala.collection.JavaConverters._ import org.apache.spark.internal.Logging import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} import scala.collection.mutable.ListBuffer import org.apache.arrow.vector.ipc.message.ArrowFieldNode import org.apache.arrow.vector.ipc.message.ArrowRecordBatch import org.apache.arrow.vector.types.pojo.ArrowType import org.apache.arrow.vector.types.pojo.Field import org.apache.arrow.vector.types.pojo.Schema import org.apache.arrow.gandiva.expression._ import org.apache.arrow.gandiva.evaluator._ import io.netty.buffer.ArrowBuf import com.google.common.collect.Lists; import com.intel.sparkColumnarPlugin.expression._ import com.intel.sparkColumnarPlugin.vectorized.ExpressionEvaluator import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} class ColumnarShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends ShuffledHashJoinExec( leftKeys, rightKeys, joinType, buildSide, condition, left, right) { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "joinTime" -> SQLMetrics.createTimingMetric(sparkContext, "join time"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def supportsColumnar = true //TODO() Disable code generation //override def supportCodegen: Boolean = false override def doExecuteColumnar(): RDD[ColumnarBatch] = { val numOutputRows = longMetric("numOutputRows") val joinTime = longMetric("joinTime") val buildTime = longMetric("buildTime") val resultSchema = this.schema streamedPlan.executeColumnar().zipPartitions(buildPlan.executeColumnar()) { (streamIter, buildIter) => //val hashed = buildHashedRelation(buildIter) //join(streamIter, hashed, numOutputRows) val vjoin = ColumnarShuffledHashJoin.create(leftKeys, rightKeys, resultSchema, joinType, buildSide, condition, left, right, buildTime, joinTime, numOutputRows) val vjoinResult = vjoin.columnarInnerJoin(streamIter, buildIter) TaskContext.get().addTaskCompletionListener[Unit](_ => { vjoin.close() }) new CloseableColumnBatchIterator(vjoinResult) } } }
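This plugin builds on the columnar execution hooks added in Spark 3.0 (supportsColumnar / doExecuteColumnar). A small sketch, assuming Spark 3.0+ and a local SparkSession named `spark`, that lists which operators in a plan claim columnar support:

val plan = spark.range(10).selectExpr("id + 1 AS x").queryExecution.executedPlan

plan.foreach { node =>
  println(s"${node.nodeName}: supportsColumnar = ${node.supportsColumnar}")
}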
Example 37
Source File: OapAggUtils.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Final, Partial} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources.oap.OapAggregationFileScanExec object OapAggUtils { private def createAggregate( requiredChildDistributionExpressions: Option[Seq[Expression]] = None, groupingExpressions: Seq[NamedExpression] = Nil, aggregateExpressions: Seq[AggregateExpression] = Nil, aggregateAttributes: Seq[Attribute] = Nil, initialInputBufferOffset: Int = 0, resultExpressions: Seq[NamedExpression] = Nil, child: SparkPlan): SparkPlan = { if (requiredChildDistributionExpressions.isDefined) { // final aggregate, fall back to Spark HashAggregateExec. HashAggregateExec( requiredChildDistributionExpressions = requiredChildDistributionExpressions, groupingExpressions = groupingExpressions, aggregateExpressions = aggregateExpressions, aggregateAttributes = aggregateAttributes, initialInputBufferOffset = initialInputBufferOffset, resultExpressions = resultExpressions, child = child) } else { // Apply partial aggregate optimizations. OapAggregateExec( requiredChildDistributionExpressions = None, groupingExpressions = groupingExpressions, aggregateExpressions = aggregateExpressions, aggregateAttributes = aggregateAttributes, initialInputBufferOffset = initialInputBufferOffset, resultExpressions = resultExpressions, child = child) } } def planAggregateWithoutDistinct( groupingExpressions: Seq[NamedExpression], aggregateExpressions: Seq[AggregateExpression], resultExpressions: Seq[NamedExpression], child: SparkPlan): Seq[SparkPlan] = { val useHash = HashAggregateExec.supportsAggregate( aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)) if (!child.isInstanceOf[OapAggregationFileScanExec] || !useHash) { // Child can not leverage oap optimization reading. Nil } else { // 1. Create an Aggregate Operator for partial aggregations. val groupingAttributes = groupingExpressions.map(_.toAttribute) val partialAggregateExpressions = aggregateExpressions.map(_.copy(mode = Partial)) val partialAggregateAttributes = partialAggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) val partialResultExpressions = groupingAttributes ++ partialAggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes) val partialAggregate = createAggregate( requiredChildDistributionExpressions = None, groupingExpressions = groupingExpressions, aggregateExpressions = partialAggregateExpressions, aggregateAttributes = partialAggregateAttributes, initialInputBufferOffset = 0, resultExpressions = partialResultExpressions, child = child) // 2. Create an Aggregate Operator for final aggregations. val finalAggregateExpressions = aggregateExpressions.map(_.copy(mode = Final)) // The attributes of the final aggregation buffer, which is presented as input to the result // projection: val finalAggregateAttributes = finalAggregateExpressions.map(_.resultAttribute) val finalAggregate = createAggregate( requiredChildDistributionExpressions = Some(groupingAttributes), groupingExpressions = groupingAttributes, aggregateExpressions = finalAggregateExpressions, aggregateAttributes = finalAggregateAttributes, initialInputBufferOffset = groupingExpressions.length, resultExpressions = resultExpressions, child = partialAggregate) finalAggregate :: Nil } } }
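The partial/final split this helper rebuilds is visible in any ordinary aggregation plan. A quick sketch (a local SparkSession named `spark` is assumed):

import org.apache.spark.sql.execution.aggregate.HashAggregateExec

val agg = spark.range(100).selectExpr("id % 5 AS k").groupBy("k").count()

val hashAggs = agg.queryExecution.executedPlan.collect { case h: HashAggregateExec => h }
// Typically two nodes: a partial aggregate below the shuffle and a final aggregate above it.
println(s"HashAggregateExec nodes: ${hashAggs.size}")
agg.explain()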
Example 38
Source File: SharedOapContext.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test.oap import scala.collection.mutable import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.{OapExtensions, SparkSession} import org.apache.spark.sql.execution.{FileSourceScanExec, FilterExec, SparkPlan} import org.apache.spark.sql.execution.datasources.oap.{IndexType, OapFileFormat} import org.apache.spark.sql.internal.oap.OapConf import org.apache.spark.sql.oap.{OapDriverRuntime, OapRuntime} import org.apache.spark.sql.test.OapSharedSQLContext trait SharedOapContext extends SharedOapContextBase { protected override def createSparkSession: SparkSession = { SparkSession.cleanupAnyExistingSession() val session = SparkSession.builder() .master("local[2]") .appName("test-oap-context") .config(oapSparkConf).getOrCreate() OapRuntime.getOrCreate.asInstanceOf[OapDriverRuntime].setTestSession(session) session } } protected def withFileSystem(f: FileSystem => Unit): Unit = { var fs: FileSystem = null try { fs = FileSystem.get(configuration) f(fs) } finally { if (fs != null) { fs.close() } } } } case class TestPartition(key: String, value: String) case class TestIndex( tableName: String, indexName: String, partitions: TestPartition*)
Example 39
Source File: ShuffledHashJoinExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
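Which join operator the planner actually picks depends on table sizes and join-related settings. A sketch that simply reports the choice for an equi-join (a local SparkSession named `spark` is assumed):

import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, ShuffledHashJoinExec, SortMergeJoinExec}

val a = spark.range(1000).selectExpr("id AS k", "id AS v1")
val b = spark.range(1000).selectExpr("id AS k", "id AS v2")
val plan = a.join(b, "k").queryExecution.executedPlan

plan.foreach {
  case j: BroadcastHashJoinExec => println(s"broadcast hash join, build side: ${j.buildSide}")
  case j: ShuffledHashJoinExec  => println(s"shuffled hash join, build side: ${j.buildSide}")
  case _: SortMergeJoinExec     => println("sort-merge join")
  case _                        => // not a join node, ignore
}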
Example 40
Source File: CartesianProductExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { // We will not sort the rows, so prefixComparator and recordComparator are null. val sorter = UnsafeExternalSorter.create( context.taskMemoryManager(), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, context, null, null, 1024, SparkEnv.get.memoryManager.pageSizeBytes, SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD), false) val partition = split.asInstanceOf[CartesianPartition] for (y <- rdd2.iterator(partition.s2, context)) { sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false) } // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow] def createIter(): Iterator[UnsafeRow] = { val iter = sorter.getIterator val unsafeRow = new UnsafeRow(numFieldsOfRight) new Iterator[UnsafeRow] { override def hasNext: Boolean = { iter.hasNext } override def next(): UnsafeRow = { iter.loadNext() unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength) unsafeRow } } } val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, sorter.cleanupResources()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
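A sketch of a query that is planned as a cartesian product: an explicit crossJoin with broadcasting disabled, so the planner cannot fall back to a broadcast nested loop join (a local SparkSession named `spark` is assumed; the threshold change is session-wide):

import org.apache.spark.sql.execution.joins.CartesianProductExec

spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

val a = spark.range(1000).toDF("x")
val b = spark.range(1000).toDF("y")
val crossed = a.crossJoin(b).filter("x < y")

val planned = crossed.queryExecution.executedPlan.collect { case c: CartesianProductExec => c }
println(s"cartesian product nodes: ${planned.size}")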
Example 41
Source File: Exchange.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]() plan.transformUp { case exchange: Exchange => // the exchanges that have same results usually also have same schemas (same column names). val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]()) val samePlan = sameSchema.find { e => exchange.sameResult(e) } if (samePlan.isDefined) { // Keep the output of this exchange, the following plans require that to resolve // attributes. ReusedExchangeExec(exchange.output, samePlan.get) } else { sameSchema += exchange exchange } } } }
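Exchange reuse is easiest to observe on a plan that contains two identical exchanges, for example a self-join of the same aggregate. A sketch (a local SparkSession named `spark` is assumed, with spark.sql.exchange.reuse left at its default of true):

import org.apache.spark.sql.execution.exchange.ReusedExchangeExec

val agg = spark.range(1000).selectExpr("id % 10 AS k").groupBy("k").count()
val selfJoined = agg.as("l").join(agg.as("r"), "k")

val reused = selfJoined.queryExecution.executedPlan.collect { case r: ReusedExchangeExec => r }
println(s"reused exchanges: ${reused.size}")
selfJoined.explain()   // the reused side is printed as ReusedExchange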
Example 42
Source File: commands.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.debug._ import org.apache.spark.sql.execution.streaming.IncrementalExecution import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types._ case class ExplainCommand( logicalPlan: LogicalPlan, override val output: Seq[Attribute] = Seq(AttributeReference("plan", StringType, nullable = true)()), extended: Boolean = false, codegen: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sparkSession: SparkSession): Seq[Row] = try { val queryExecution = if (logicalPlan.isStreaming) { // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the // output mode does not matter since there is no `Sink`. new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0, 0) } else { sparkSession.sessionState.executePlan(logicalPlan) } val outputString = if (codegen) { codegenString(queryExecution.executedPlan) } else if (extended) { queryExecution.toString } else { queryExecution.simpleString } Seq(Row(outputString)) } catch { case cause: TreeNodeException[_] => ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) } }
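The user-facing entry points that end up in this command are `Dataset.explain` and the EXPLAIN SQL statement. A short sketch (a local SparkSession named `spark` is assumed):

val df = spark.range(10).selectExpr("id * 2 AS doubled")

df.explain()                  // physical plan only
df.explain(extended = true)   // parsed, analyzed, optimized and physical plans

spark.sql("EXPLAIN CODEGEN SELECT 1").show(truncate = false)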
Example 43
Source File: EventTimeWatermarkExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, child: SparkPlan) extends SparkPlan { val eventTimeStats = new EventTimeStatsAccum() sparkContext.register(eventTimeStats) override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => eventTimeStats.add(getEventTime(row).getLong(0) / 1000) row } } } // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delay.milliseconds) .build() a.withMetadata(updatedMetadata) } else { a } } override def children: Seq[SparkPlan] = child :: Nil }
Example 44
Source File: ExtraStrategiesSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.test.SharedSQLContext case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) val unsafeProj = UnsafeProjection.create(schema) val unsafeRow = unsafeProj(row).copy() sparkContext.parallelize(Seq(unsafeRow)) } override def producedAttributes: AttributeSet = outputSet override def children: Seq[SparkPlan] = Nil } object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => FastOperator(attr.toAttribute :: Nil) :: Nil case _ => Nil } } class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { try { spark.experimental.extraStrategies = TestStrategy :: Nil val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { spark.experimental.extraStrategies = Nil } } }
Example 45
Source File: StreamingTableStrategy.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.strategy import org.apache.spark.sql.{CarbonEnv, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.{SparkPlan, SparkStrategy} import org.apache.spark.sql.execution.command.{AlterTableAddColumnsCommand, AlterTableChangeColumnCommand, AlterTableRenameCommand} import org.apache.spark.sql.execution.command.mutation.{CarbonProjectForDeleteCommand, CarbonProjectForUpdateCommand} import org.apache.spark.sql.execution.command.schema.{CarbonAlterTableAddColumnCommand, CarbonAlterTableColRenameDataTypeChangeCommand, CarbonAlterTableDropColumnCommand} import org.apache.carbondata.common.exceptions.sql.MalformedCarbonCommandException private def rejectIfStreamingTable(tableIdentifier: TableIdentifier, operation: String): Unit = { var streaming = false try { streaming = CarbonEnv.getCarbonTable( tableIdentifier.database, tableIdentifier.table)(sparkSession) .isStreamingSink } catch { case e: Exception => streaming = false } if (streaming) { throw new MalformedCarbonCommandException( s"$operation is not allowed for streaming table") } } def isCarbonTable(tableIdent: TableIdentifier): Boolean = { CarbonPlanHelper.isCarbonTable(tableIdent, sparkSession) } }
Example 46
Source File: TestSecondaryIndexUtils.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.secondaryindex import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.secondaryindex.joins.BroadCastSIFilterPushJoin object TestSecondaryIndexUtils { def isFilterPushedDownToSI(sparkPlan: SparkPlan): Boolean = { var isValidPlan = false sparkPlan.transform { case broadCastSIFilterPushDown: BroadCastSIFilterPushJoin => isValidPlan = true broadCastSIFilterPushDown } isValidPlan } }
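The same check can be written without the side effect inside `transform`, using `TreeNode.find`. A small alternative sketch (the object name below is hypothetical; BroadCastSIFilterPushJoin is the class used in the example above):

import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.secondaryindex.joins.BroadCastSIFilterPushJoin

object TestSecondaryIndexUtilsAlt {
  // Hypothetical variant of isFilterPushedDownToSI: true if any node in the plan
  // is a BroadCastSIFilterPushJoin.
  def isFilterPushedDownToSI(sparkPlan: SparkPlan): Boolean =
    sparkPlan.find(_.isInstanceOf[BroadCastSIFilterPushJoin]).isDefined
}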
Example 47
Source File: ExistingDStream.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming import org.apache.spark.rdd.{EmptyRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.streaming.Time import org.apache.spark.streaming.dstream.DStream private[streaming] case class PhysicalDStream(output: Seq[Attribute], @transient stream: DStream[InternalRow]) extends SparkPlan with StreamPlan { def children = Nil override def doExecute() = { assert(validTime != null) Utils.invoke(classOf[DStream[InternalRow]], stream, "getOrCompute", (classOf[Time], validTime)) .asInstanceOf[Option[RDD[InternalRow]]] .getOrElse(new EmptyRDD[InternalRow](sparkContext)) } }
Example 48
Source File: DeltaInvariantCheckerExec.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.schema import org.apache.spark.sql.delta.DeltaErrors import org.apache.spark.sql.delta.schema.Invariants.NotNull import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BindReferences, Expression, GetStructField, Literal, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.{NullType, StructType} private def buildExtractors(invariant: Invariant): Option[Expression] = { assert(invariant.column.nonEmpty) val topLevelColumn = invariant.column.head val topLevelRefOpt = output.collectFirst { case a: AttributeReference if SchemaUtils.DELTA_COL_RESOLVER(a.name, topLevelColumn) => a } val rejectColumnNotFound = isNullNotOkay(invariant) if (topLevelRefOpt.isEmpty) { if (rejectColumnNotFound) { throw DeltaErrors.notNullInvariantException(invariant) } } if (invariant.column.length == 1) { topLevelRefOpt.map(BindReferences.bindReference[Expression](_, output)) } else { topLevelRefOpt.flatMap { topLevelRef => val boundTopLevel = BindReferences.bindReference[Expression](topLevelRef, output) try { val nested = invariant.column.tail.foldLeft(boundTopLevel) { case (e, fieldName) => e.dataType match { case StructType(fields) => val ordinal = fields.indexWhere(f => SchemaUtils.DELTA_COL_RESOLVER(f.name, fieldName)) if (ordinal == -1) { throw new IndexOutOfBoundsException(s"Not nullable column not found in struct: " + s"${fields.map(_.name).mkString("[", ",", "]")}") } GetStructField(e, ordinal, Some(fieldName)) case _ => throw new UnsupportedOperationException( "Invariants on nested fields other than StructTypes are not supported.") } } Some(nested) } catch { case i: IndexOutOfBoundsException if rejectColumnNotFound => throw InvariantViolationException(invariant, i.getMessage) case _: IndexOutOfBoundsException if !rejectColumnNotFound => None } } } } override protected def doExecute(): RDD[InternalRow] = { if (invariants.isEmpty) return child.execute() val boundRefs = invariants.map { invariant => CheckDeltaInvariant(buildExtractors(invariant).getOrElse(Literal(null, NullType)), invariant) } child.execute().mapPartitionsInternal { rows => val assertions = GenerateUnsafeProjection.generate(boundRefs) rows.map { row => assertions(row) row } } } override def outputOrdering: Seq[SortOrder] = child.outputOrdering override def outputPartitioning: Partitioning = child.outputPartitioning }
Example 49
Source File: PostAggregate.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.druid import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, NamedExpression} import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.execution.SparkPlan import org.sparklinedata.druid._ class PostAggregate(val druidOpSchema : DruidOperatorSchema) { val dqb = druidOpSchema.dqb private def attrRef(dOpAttr : DruidOperatorAttribute) : AttributeReference = AttributeReference(dOpAttr.name, dOpAttr.dataType)(dOpAttr.exprId) lazy val groupExpressions = dqb.dimensions.map { d => attrRef(druidOpSchema.druidAttrMap(d.outputName)) } def namedGroupingExpressions = groupExpressions private def toSparkAgg(dAggSpec : AggregationSpec) : Option[AggregateFunction] = { val dOpAttr = druidOpSchema.druidAttrMap(dAggSpec.name) dAggSpec match { case FunctionAggregationSpec("count", nm, _) => Some(Sum(attrRef(dOpAttr))) case FunctionAggregationSpec("longSum", nm, _) => Some(Sum(attrRef(dOpAttr))) case FunctionAggregationSpec("doubleSum", nm, _) => Some(Sum(attrRef(dOpAttr))) case FunctionAggregationSpec("longMin", nm, _) => Some(Min(attrRef(dOpAttr))) case FunctionAggregationSpec("doubleMin", nm, _) => Some(Min(attrRef(dOpAttr))) case FunctionAggregationSpec("longMax", nm, _) => Some(Max(attrRef(dOpAttr))) case FunctionAggregationSpec("doubleMax", nm, _) => Some(Max(attrRef(dOpAttr))) case JavascriptAggregationSpec(_, aggnm, _, _, _, _) if aggnm.startsWith("MIN") => Some(Min(attrRef(dOpAttr))) case JavascriptAggregationSpec(_, aggnm, _, _, _, _) if aggnm.startsWith("MAX") => Some(Max(attrRef(dOpAttr))) case JavascriptAggregationSpec(_, aggnm, _, _, _, _) if aggnm.startsWith("SUM") => Some(Sum(attrRef(dOpAttr))) case JavascriptAggregationSpec(_, aggnm, _, _, _, _) if aggnm.startsWith("COUNT") => Some(Sum(attrRef(dOpAttr))) case _ => None } } lazy val aggregatesO : Option[List[NamedExpression]] = Utils.sequence( dqb.aggregations.map { da => val dOpAttr = druidOpSchema.druidAttrMap(da.name) toSparkAgg(da).map { aggFunc => Alias(AggregateExpression(aggFunc, Complete, false), dOpAttr.name)(dOpAttr.exprId) } }) def canBeExecutedInHistorical : Boolean = dqb.canPushToHistorical && aggregatesO.isDefined lazy val resultExpressions = groupExpressions ++ aggregatesO.get lazy val aggregateExpressions = resultExpressions.flatMap { expr => expr.collect { case agg: AggregateExpression => agg } }.distinct lazy val aggregateFunctionToAttribute = aggregateExpressions.map { agg => val aggregateFunction = agg.aggregateFunction val attribute = Alias(aggregateFunction, aggregateFunction.toString)().toAttribute (aggregateFunction, agg.isDistinct) -> attribute }.toMap lazy val rewrittenResultExpressions = resultExpressions.map { expr => expr.transformDown { case aE@AggregateExpression(aggregateFunction, _, isDistinct, _) => // The final aggregation buffer's attributes will be `finalAggregationAttributes`, // so replace each aggregate expression by its corresponding attribute in the set: // aggregateFunctionToAttribute(aggregateFunction, isDistinct) aE.resultAttribute case expression => expression }.asInstanceOf[NamedExpression] } def aggOp(child : SparkPlan) : Seq[SparkPlan] = { org.apache.spark.sql.execution.aggregate.AggUtils.planAggregateWithoutPartial( namedGroupingExpressions, aggregateExpressions, rewrittenResultExpressions, child) } }
Example 50
Source File: ShuffledHashJoinExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
Example 51
Source File: CartesianProductExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.hadoop.security.UserGroupInformation import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { private[this] val user = UserGroupInformation.getCurrentUser.getShortUserName override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { // We will not sort the rows, so prefixComparator and recordComparator are null. val sorter = UnsafeExternalSorter.create( context.taskMemoryManager(), SparkEnv.get(user).blockManager, SparkEnv.get(user).serializerManager, context, null, null, 1024, SparkEnv.get(user).memoryManager.pageSizeBytes, SparkEnv.get(user).conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD), false) val partition = split.asInstanceOf[CartesianPartition] for (y <- rdd2.iterator(partition.s2, context)) { sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false) } // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow] def createIter(): Iterator[UnsafeRow] = { val iter = sorter.getIterator val unsafeRow = new UnsafeRow(numFieldsOfRight) new Iterator[UnsafeRow] { override def hasNext: Boolean = { iter.hasNext } override def next(): UnsafeRow = { iter.loadNext() unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength) unsafeRow } } } val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, sorter.cleanupResources()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 52
Source File: Exchange.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]() plan.transformUp { case exchange: Exchange => // the exchanges that have same results usually also have same schemas (same column names). val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]()) val samePlan = sameSchema.find { e => exchange.sameResult(e) } if (samePlan.isDefined) { // Keep the output of this exchange, the following plans require that to resolve // attributes. ReusedExchangeExec(exchange.output, samePlan.get, plan.user) } else { sameSchema += exchange exchange } } } }
Example 53
Source File: commands.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.debug._ import org.apache.spark.sql.execution.streaming.IncrementalExecution import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types._ case class ExplainCommand( logicalPlan: LogicalPlan, override val output: Seq[Attribute] = Seq(AttributeReference("plan", StringType, nullable = true)()), extended: Boolean = false, codegen: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sparkSession: SparkSession): Seq[Row] = try { val queryExecution = if (logicalPlan.isStreaming) { // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the // output mode does not matter since there is no `Sink`. new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0, 0) } else { sparkSession.sessionState.executePlan(logicalPlan) } val outputString = if (codegen) { codegenString(queryExecution.executedPlan) } else if (extended) { queryExecution.toString } else { queryExecution.simpleString } Seq(Row(outputString)) } catch { case cause: TreeNodeException[_] => ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) } }
Example 54
Source File: EventTimeWatermarkExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, child: SparkPlan) extends SparkPlan { override def user: String = child.user val eventTimeStats = new EventTimeStatsAccum() sparkContext.register(eventTimeStats) override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => eventTimeStats.add(getEventTime(row).getLong(0) / 1000) row } } } // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delay.milliseconds) .build() a.withMetadata(updatedMetadata) } else { a } } override def children: Seq[SparkPlan] = child :: Nil }
Example 55
Source File: ExtraStrategiesSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.test.SharedSQLContext case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) val unsafeProj = UnsafeProjection.create(schema) val unsafeRow = unsafeProj(row).copy() sparkContext.parallelize(Seq(unsafeRow)) } override def producedAttributes: AttributeSet = outputSet override def children: Seq[SparkPlan] = Nil } object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => FastOperator(attr.toAttribute :: Nil) :: Nil case _ => Nil } } class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { try { spark.experimental.extraStrategies = TestStrategy :: Nil val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { spark.experimental.extraStrategies = Nil } } }
Example 56
Source File: OTBLeftSemiHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import java.util.{HashSet => JHashSet} import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId} import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent._ import scala.concurrent.duration._ case class OTBLeftSemiHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override val buildSide = BuildRight override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil override def output = left.output @transient private[this] lazy val keyGenerator: () => MutableProjection = newMutableProjection(buildKeys, buildPlan.output) val timeout = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { Duration.Inf } else { timeoutValue.seconds } } @transient private lazy val broadcastFuture = future { prevBatch match { case None => // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[Row] = buildPlan.execute() .mapPartitions(HashedSet(_, keyGenerator())).collect() val hashed = HashedSet(input.iterator) val broadcast = sparkContext.broadcast(hashed) controller.broadcasts((opId, currentBatch)) = broadcast broadcast case Some(bId) => controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]] } } override def doExecute() = { val broadcastRelation: Broadcast[JHashSet[Row]] = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamIter => val hashSet = broadcastRelation.value val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue) }) } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = { val join = OTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId) join.broadcastFuture join } }
Example 57
Source File: MTBLeftSemiHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import java.util.{HashSet => JHashSet} import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId} import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent._ import scala.concurrent.duration._ case class MTBLeftSemiHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override val buildSide = BuildRight override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil override def output = left.output @transient private[this] lazy val keyGenerator: () => MutableProjection = newMutableProjection(buildKeys, buildPlan.output) val timeout = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { Duration.Inf } else { timeoutValue.seconds } } val watcher = controller.getWatcher @transient private lazy val broadcastFuture = future { // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[Row] = buildPlan.execute() .mapPartitions(HashedSet(_, keyGenerator())).collect() prevBatch match { case None => val hashed = HashedSet(input.iterator) val broadcast = sparkContext.broadcast(hashed) controller.broadcasts((opId, currentBatch)) = broadcast broadcast case Some(bId) => // TODO: fix this integrity error by supporting join whose both branches may grow val hashed = HashedSet(input.iterator) val previous = controller.broadcasts((opId, bId)).value.asInstanceOf[JHashSet[Row]] if (!previous.containsAll(hashed)) { watcher += -1 logError(s"Integrity Error in MTBLeftSemiHashJoin(Op $opId, Batch $currentBatch)") } controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]] } } override def doExecute() = { val broadcastRelation = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamIter => val hashSet = broadcastRelation.value val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue) }) } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = { val join = MTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId) join.broadcastFuture join } }
Example 58
Source File: OTShuffledHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import org.apache.spark.SparkEnv import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning} import org.apache.spark.sql.execution.joins.{BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.ComposeRDDFunctions._ import org.apache.spark.sql.hive.online._ import org.apache.spark.storage.{OLABlockId, StorageLevel} case class OTShuffledHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil def retrieveState(): RDD[HashedRelation] = prevBatch match { case Some(bId) => val numParts = controller.olaBlocks(opId, bId) OLABlockRDD.create[HashedRelation](sparkContext, opId.id, Array((numParts, bId)), numParts) case None => sys.error(s"Unexpected prevBatch = $prevBatch") } override def doExecute() = { prevBatch match { case None => val buildRdd = buildPlan.execute() controller.olaBlocks((opId, currentBatch)) = buildRdd.partitions.length buildRdd.zipPartitionsWithIndex(streamedPlan.execute()) { (index, buildIter, streamIter) => val hashed = HashedRelation(buildIter, buildSideKeyGenerator) SparkEnv.get.blockManager.putSingle( OLABlockId(opId.id, currentBatch, index), hashed, StorageLevel.MEMORY_AND_DISK) hashJoin(streamIter, hashed) } case Some(_) => retrieveState().zipPartitionsWithIndex(streamedPlan.execute()) { (index, buildIter, streamIter) => val hashed = buildIter.next() hashJoin(streamIter, hashed) } } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = OTShuffledHashJoin(leftKeys, rightKeys, buildSide, left, right)(controller, newTrace, opId) }
Example 59
Source File: OTBroadcastHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.expressions.{Expression, Row} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.joins.{BroadcastHashJoin, BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId} import scala.concurrent._ import scala.concurrent.duration._ case class OTBroadcastHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil val timeout = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { Duration.Inf } else { timeoutValue.seconds } } @transient private lazy val broadcastFuture = future { prevBatch match { case None => // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[Row] = buildPlan.execute().map(_.copy()).collect() val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length) val broadcast = sparkContext.broadcast(hashed) controller.broadcasts((opId, currentBatch)) = broadcast broadcast case Some(bId) => controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[HashedRelation]] } }(BroadcastHashJoin.broadcastHashJoinExecutionContext) override def doExecute() = { val broadcastRelation = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamedIter => hashJoin(streamedIter, broadcastRelation.value) } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = { val join = OTBroadcastHashJoin(leftKeys, rightKeys, buildSide, left, right)( controller, newTrace, opId) join.broadcastFuture join } }
Example 60
Source File: DescribeHiveTableCommand.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.collection.JavaConversions._ import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{Attribute, Row} import org.apache.spark.sql.execution.{SparkPlan, RunnableCommand} import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation} import org.apache.spark.sql.hive.HiveShim import org.apache.spark.sql.SQLContext private[hive] case class DescribeHiveTableCommand( table: MetastoreRelation, override val output: Seq[Attribute], isExtended: Boolean) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { // Trying to mimic the format of Hive's output. But not exactly the same. var results: Seq[(String, String, String)] = Nil val columns: Seq[FieldSchema] = table.hiveQlTable.getCols val partitionColumns: Seq[FieldSchema] = table.hiveQlTable.getPartCols results ++= columns.map(field => (field.getName, field.getType, field.getComment)) if (partitionColumns.nonEmpty) { val partColumnInfo = partitionColumns.map(field => (field.getName, field.getType, field.getComment)) results ++= partColumnInfo ++ Seq(("# Partition Information", "", "")) ++ Seq((s"# ${output.get(0).name}", output.get(1).name, output.get(2).name)) ++ partColumnInfo } if (isExtended) { results ++= Seq(("Detailed Table Information", table.hiveQlTable.getTTable.toString, "")) } results.map { case (name, dataType, comment) => Row(name, dataType, comment) } } }
Example 61
Source File: LeftSemiJoinHash.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row} import org.apache.spark.sql.catalyst.plans.physical.ClusteredDistribution import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class LeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override val buildSide: BuildSide = BuildRight override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil override def output: Seq[Attribute] = left.output protected override def doExecute(): RDD[Row] = { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) => val hashSet = new java.util.HashSet[Row]() var currentRow: Row = null // Create a Hash set of buildKeys while (buildIter.hasNext) { currentRow = buildIter.next() val rowKey = buildSideKeyGenerator(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) if (!keyExists) { hashSet.add(rowKey) } } } val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue) }) } } }
Example 62
Source File: BroadcastHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.util.ThreadUtils import scala.concurrent._ import scala.concurrent.duration._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{Row, Expression} import org.apache.spark.sql.catalyst.plans.physical.{Distribution, Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class BroadcastHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { val timeout: Duration = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { Duration.Inf } else { timeoutValue.seconds } } override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil @transient lazy val broadcastFuture = future { // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[Row] = buildPlan.execute().map(_.copy()).collect() val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length) sparkContext.broadcast(hashed) }(BroadcastHashJoin.broadcastHashJoinExecutionContext) protected override def doExecute(): RDD[Row] = { val broadcastRelation = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamedIter => hashJoin(streamedIter, broadcastRelation.value) } } } object BroadcastHashJoin { private[sql] val broadcastHashJoinExecutionContext = ExecutionContext.fromExecutorService( ThreadUtils.newDaemonCachedThreadPool("broadcast-hash-join", 128)) }
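At the API level, a broadcast hash join like the one above is normally the planner's choice when one side of an equi-join is small enough to ship to every executor. A minimal sketch, assuming a Spark 2.x style SparkSession named spark (the broadcast hint lives in org.apache.spark.sql.functions in recent releases; older releases rely on spark.sql.autoBroadcastJoinThreshold instead):

import org.apache.spark.sql.functions.broadcast

val facts = spark.range(1000000L).toDF("id")   // large side, streamed
val dims  = spark.range(100L).toDF("id")       // small side, built into a hash table

// Hint that the small side should be broadcast; explain() should show a broadcast hash join node.
val joined = facts.join(broadcast(dims), "id")
joined.explain()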
Example 63
Source File: BroadcastLeftSemiJoinHash.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class BroadcastLeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override val buildSide: BuildSide = BuildRight override def output: Seq[Attribute] = left.output protected override def doExecute(): RDD[Row] = { val buildIter = buildPlan.execute().map(_.copy()).collect().toIterator val hashSet = new java.util.HashSet[Row]() var currentRow: Row = null // Create a Hash set of buildKeys while (buildIter.hasNext) { currentRow = buildIter.next() val rowKey = buildSideKeyGenerator(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) if (!keyExists) { hashSet.add(rowKey) } } } val broadcastedRelation = sparkContext.broadcast(hashSet) streamedPlan.execute().mapPartitions { streamIter => val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && broadcastedRelation.value.contains(joinKeys.currentValue) }) } } }
Example 64
Source File: LeftSemiJoinBNL.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class LeftSemiJoinBNL(streamed: SparkPlan, broadcast: SparkPlan, condition: Option[Expression]) extends BinaryNode { override def outputPartitioning: Partitioning = streamed.outputPartitioning override def output: Seq[Attribute] = left.output override def left: SparkPlan = streamed override def right: SparkPlan = broadcast @transient private lazy val boundCondition = newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output) protected override def doExecute(): RDD[Row] = { val broadcastedRelation = sparkContext.broadcast(broadcast.execute().map(_.copy()).collect().toIndexedSeq) streamed.execute().mapPartitions { streamedIter => val joinedRow = new JoinedRow streamedIter.filter(streamedRow => { var i = 0 var matched = false while (i < broadcastedRelation.value.size && !matched) { val broadcastedRow = broadcastedRelation.value(i) if (boundCondition(joinedRow(streamedRow, broadcastedRow))) { matched = true } i += 1 } matched }) } } }
Example 65
Source File: HashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.util.collection.CompactBuffer trait HashJoin { self: SparkPlan => val leftKeys: Seq[Expression] val rightKeys: Seq[Expression] val buildSide: BuildSide val left: SparkPlan val right: SparkPlan protected lazy val (buildPlan, streamedPlan) = buildSide match { case BuildLeft => (left, right) case BuildRight => (right, left) } protected lazy val (buildKeys, streamedKeys) = buildSide match { case BuildLeft => (leftKeys, rightKeys) case BuildRight => (rightKeys, leftKeys) } override def output: Seq[Attribute] = left.output ++ right.output @transient protected lazy val buildSideKeyGenerator: Projection = newProjection(buildKeys, buildPlan.output) @transient protected lazy val streamSideKeyGenerator: () => MutableProjection = newMutableProjection(streamedKeys, streamedPlan.output) protected def hashJoin(streamIter: Iterator[Row], hashedRelation: HashedRelation): Iterator[Row] = { new Iterator[Row] { private[this] var currentStreamedRow: Row = _ private[this] var currentHashMatches: CompactBuffer[Row] = _ private[this] var currentMatchPosition: Int = -1 // Mutable per row objects. private[this] val joinRow = new JoinedRow2 private[this] val joinKeys = streamSideKeyGenerator() override final def hasNext: Boolean = (currentMatchPosition != -1 && currentMatchPosition < currentHashMatches.size) || (streamIter.hasNext && fetchNext()) override final def next(): Row = { val ret = buildSide match { case BuildRight => joinRow(currentStreamedRow, currentHashMatches(currentMatchPosition)) case BuildLeft => joinRow(currentHashMatches(currentMatchPosition), currentStreamedRow) } currentMatchPosition += 1 ret } private final def fetchNext(): Boolean = { currentHashMatches = null currentMatchPosition = -1 while (currentHashMatches == null && streamIter.hasNext) { currentStreamedRow = streamIter.next() if (!joinKeys(currentStreamedRow).anyNull) { currentHashMatches = hashedRelation.get(joinKeys.currentValue) } } if (currentHashMatches == null) { false } else { currentMatchPosition = 0 true } } } } }
Example 66
Source File: ShuffledHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class ShuffledHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil protected override def doExecute(): RDD[Row] = { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) => val hashed = HashedRelation(buildIter, buildSideKeyGenerator) hashJoin(streamIter, hashed) } } }
Example 67
Source File: CartesianProduct.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output protected override def doExecute(): RDD[Row] = { val leftResults = left.execute().map(_.copy()) val rightResults = right.execute().map(_.copy()) leftResults.cartesian(rightResults).mapPartitions { iter => val joinedRow = new JoinedRow iter.map(r => joinedRow(r._1, r._2)) } } }
Example 68
Source File: LeftSemiJoinHash.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, Distribution, ClusteredDistribution} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics @DeveloperApi case class LeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryNode with HashSemiJoin { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") right.execute().zipPartitions(left.execute()) { (buildIter, streamIter) => if (condition.isEmpty) { val hashSet = buildKeyHashSet(buildIter, numRightRows) hashSemiJoin(streamIter, numLeftRows, hashSet, numOutputRows) } else { val hashRelation = HashedRelation(buildIter, numRightRows, rightKeyGenerator) hashSemiJoin(streamIter, numLeftRows, hashRelation, numOutputRows) } } } }
Example 69
Source File: BroadcastLeftSemiJoinHash.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.{InternalAccumulator, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics @DeveloperApi case class BroadcastLeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryNode with HashSemiJoin { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val input = right.execute().map { row => numRightRows += 1 row.copy() }.collect() if (condition.isEmpty) { val hashSet = buildKeyHashSet(input.toIterator, SQLMetrics.nullLongMetric) val broadcastedRelation = sparkContext.broadcast(hashSet) left.execute().mapPartitions { streamIter => hashSemiJoin(streamIter, numLeftRows, broadcastedRelation.value, numOutputRows) } } else { val hashRelation = HashedRelation(input.toIterator, SQLMetrics.nullLongMetric, rightKeyGenerator, input.size) val broadcastedRelation = sparkContext.broadcast(hashRelation) left.execute().mapPartitions { streamIter => val hashedRelation = broadcastedRelation.value hashedRelation match { case unsafe: UnsafeHashedRelation => TaskContext.get().internalMetricsToAccumulators( InternalAccumulator.PEAK_EXECUTION_MEMORY).add(unsafe.getUnsafeSize) case _ => } hashSemiJoin(streamIter, numLeftRows, hashedRelation, numOutputRows) } } } }
Example 70
Source File: LeftSemiJoinBNL.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics @DeveloperApi case class LeftSemiJoinBNL(streamed: SparkPlan, broadcast: SparkPlan, condition: Option[Expression]) extends BinaryNode { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) override def outputPartitioning: Partitioning = streamed.outputPartitioning override def output: Seq[Attribute] = left.output override def left: SparkPlan = streamed override def right: SparkPlan = broadcast @transient private lazy val boundCondition = newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val broadcastedRelation = sparkContext.broadcast(broadcast.execute().map { row => numRightRows += 1 row.copy() }.collect().toIndexedSeq) streamed.execute().mapPartitions { streamedIter => val joinedRow = new JoinedRow streamedIter.filter(streamedRow => { numLeftRows += 1 var i = 0 var matched = false while (i < broadcastedRelation.value.size && !matched) { val broadcastedRow = broadcastedRelation.value(i) if (boundCondition(joinedRow(streamedRow, broadcastedRow))) { matched = true } i += 1 } if (matched) { numOutputRows += 1 } matched }) } } }
Example 71
Source File: ShuffledHashJoin.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics @DeveloperApi case class ShuffledHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) override def outputPartitioning: Partitioning = PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning)) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil protected override def doExecute(): RDD[InternalRow] = { val (numBuildRows, numStreamedRows) = buildSide match { case BuildLeft => (longMetric("numLeftRows"), longMetric("numRightRows")) case BuildRight => (longMetric("numRightRows"), longMetric("numLeftRows")) } val numOutputRows = longMetric("numOutputRows") buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) => val hashed = HashedRelation(buildIter, numBuildRows, buildSideKeyGenerator) hashJoin(streamIter, numStreamedRows, hashed, numOutputRows) } } }
Example 72
Source File: ShuffledHashOuterJoin.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import scala.collection.JavaConversions._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics @DeveloperApi case class ShuffledHashOuterJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryNode with HashOuterJoin { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil override def outputPartitioning: Partitioning = joinType match { case LeftOuter => left.outputPartitioning case RightOuter => right.outputPartitioning case FullOuter => UnknownPartitioning(left.outputPartitioning.numPartitions) case x => throw new IllegalArgumentException(s"HashOuterJoin should not take $x as the JoinType") } protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val joinedRow = new JoinedRow() left.execute().zipPartitions(right.execute()) { (leftIter, rightIter) => // TODO this probably can be replaced by external sort (sort merged join?) joinType match { case LeftOuter => val hashed = HashedRelation(rightIter, numRightRows, buildKeyGenerator) val keyGenerator = streamedKeyGenerator val resultProj = resultProjection leftIter.flatMap( currentRow => { numLeftRows += 1 val rowKey = keyGenerator(currentRow) joinedRow.withLeft(currentRow) leftOuterIterator(rowKey, joinedRow, hashed.get(rowKey), resultProj, numOutputRows) }) case RightOuter => val hashed = HashedRelation(leftIter, numLeftRows, buildKeyGenerator) val keyGenerator = streamedKeyGenerator val resultProj = resultProjection rightIter.flatMap ( currentRow => { numRightRows += 1 val rowKey = keyGenerator(currentRow) joinedRow.withRight(currentRow) rightOuterIterator(rowKey, hashed.get(rowKey), joinedRow, resultProj, numOutputRows) }) case FullOuter => // TODO(davies): use UnsafeRow val leftHashTable = buildHashTable(leftIter, numLeftRows, newProjection(leftKeys, left.output)) val rightHashTable = buildHashTable(rightIter, numRightRows, newProjection(rightKeys, right.output)) (leftHashTable.keySet ++ rightHashTable.keySet).iterator.flatMap { key => fullOuterIterator(key, leftHashTable.getOrElse(key, EMPTY_LIST), rightHashTable.getOrElse(key, EMPTY_LIST), joinedRow, numOutputRows) } case x => throw new IllegalArgumentException( s"ShuffledHashOuterJoin should not take $x as the JoinType") } } } }
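For reference, these outer-join operators are what DataFrame outer joins fall back to when the planner cannot broadcast either side. A minimal API-level sketch, assuming two hypothetical DataFrames dfA and dfB that share an id column:

// left_outer keeps every row of dfA and fills dfB's columns with null when there is no match;
// "outer" requests a full outer join that keeps unmatched rows from both sides.
val leftOuter = dfA.join(dfB, dfA("id") === dfB("id"), "left_outer")
val fullOuter = dfA.join(dfB, dfA("id") === dfB("id"), "outer")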
Example 73
Source File: CartesianProduct.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics @DeveloperApi case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().map { row => numLeftRows += 1 row.copy() } val rightResults = right.execute().map { row => numRightRows += 1 row.copy() } leftResults.cartesian(rightResults).mapPartitions { iter => val joinedRow = new JoinedRow iter.map { r => numOutputRows += 1 joinedRow(r._1, r._2) } } } }
Example 74
Source File: SemiJoinSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.sql.{SQLConf, DataFrame, Row} import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys import org.apache.spark.sql.catalyst.plans.Inner import org.apache.spark.sql.catalyst.plans.logical.Join import org.apache.spark.sql.catalyst.expressions.{And, LessThan, Expression} import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan, SparkPlanTest} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType} // Semi join test suite class SemiJoinSuite extends SparkPlanTest with SharedSQLContext { private lazy val left = ctx.createDataFrame( ctx.sparkContext.parallelize(Seq( Row(1, 2.0), Row(1, 2.0), Row(2, 1.0), Row(2, 1.0), Row(3, 3.0), Row(null, null), Row(null, 5.0), Row(6, null) )), new StructType().add("a", IntegerType).add("b", DoubleType)) private lazy val right = ctx.createDataFrame( ctx.sparkContext.parallelize(Seq( Row(2, 3.0), Row(2, 3.0), Row(3, 2.0), Row(4, 1.0), Row(null, null), Row(null, 5.0), Row(6, null) )), new StructType().add("c", IntegerType).add("d", DoubleType)) private lazy val condition = { And((left.col("a") === right.col("c")).expr, LessThan(left.col("b").expr, right.col("d").expr)) } // Note: the input dataframes and expression must be evaluated lazily because // the SQLContext should be used only within a test to keep SQL tests stable private def testLeftSemiJoin( testName: String, leftRows: => DataFrame, rightRows: => DataFrame, condition: => Expression, expectedAnswer: Seq[Product]): Unit = { def extractJoinParts(): Option[ExtractEquiJoinKeys.ReturnType] = { val join = Join(leftRows.logicalPlan, rightRows.logicalPlan, Inner, Some(condition)) ExtractEquiJoinKeys.unapply(join) } test(s"$testName using LeftSemiJoinHash") { extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) => withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => EnsureRequirements(left.sqlContext).apply( LeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition)), expectedAnswer.map(Row.fromTuple), sortAnswers = true) } } } test(s"$testName using BroadcastLeftSemiJoinHash") { extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) => withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => BroadcastLeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition), expectedAnswer.map(Row.fromTuple), sortAnswers = true) } } } test(s"$testName using LeftSemiJoinBNL") { withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => LeftSemiJoinBNL(left, right, Some(condition)), expectedAnswer.map(Row.fromTuple), sortAnswers = true) } } } // Test a basic left semi join testLeftSemiJoin( "basic test", left, right, condition, Seq( (2, 1.0), (2, 1.0) ) ) }
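The same left semi join can also be requested directly from the DataFrame API instead of constructing the physical operators by hand as the suite above does. A minimal sketch, assuming a session with implicits imported:

import spark.implicits._   // or sqlContext.implicits._ on 1.x

val left  = Seq((1, 2.0), (2, 1.0), (3, 3.0)).toDF("a", "b")
val right = Seq((2, 3.0), (3, 2.0), (4, 1.0)).toDF("c", "d")

// Keep only left rows that have at least one matching right row under the condition.
val semi = left.join(right, left("a") === right("c") && left("b") < right("d"), "leftsemi")
semi.show()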
Example 75
Source File: ExtraStrategiesSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package test.org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute} import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.{Row, Strategy, QueryTest} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.unsafe.types.UTF8String // Fast operator case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) sparkContext.parallelize(Seq(row)) } // Nil is an empty List override def children: Seq[SparkPlan] = Nil } // Test strategy object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => // Nil is an empty List; :: prepends an element to the head, creating a new list FastOperator(attr.toAttribute :: Nil) :: Nil // Nil is an empty List; :: prepends an element to the head, creating a new list case _ => Nil } } // Extra strategies suite class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") {// Insert an extra strategy try { // Nil is an empty List; :: prepends an element to the head, creating a new list sqlContext.experimental.extraStrategies = TestStrategy :: Nil val df = sqlContext.sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { // Nil is an empty List; :: prepends an element to the head, creating a new list sqlContext.experimental.extraStrategies = Nil } } }
Example 76
Source File: HBaseSQLTableScan.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.execution import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.RangePartitioning import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.hbase._ import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @DeveloperApi case class HBaseSQLTableScan( relation: HBaseRelation, output: Seq[Attribute], result: RDD[InternalRow]) extends SparkPlan { override def children: Seq[SparkPlan] = Nil override def outputPartitioning = { var ordering = List[SortOrder]() for (key <- relation.partitionKeys) { ordering = ordering :+ SortOrder(key, Ascending) } RangePartitioning(ordering, relation.partitions.size) } override protected def doExecute(): RDD[InternalRow] = { val schema = StructType.fromAttributes(output) result.mapPartitionsInternal { iter => val proj = UnsafeProjection.create(schema) iter.map(proj) } } override def nodeName: String = getClass.getSimpleName override def argString: String = (Utils.truncatedString(output, "[", ", ", "]") :: Nil).mkString(", ") }
Example 77
Source File: HBaseSQLContext.scala From Spark-SQL-on-HBase with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.OverrideCatalog import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan} import org.apache.spark.sql.hbase.execution.{AddCoprocessor, HBaseStrategies} class HBaseSQLContext(sc: SparkContext) extends SQLContext(sc) { self => def this(sparkContext: JavaSparkContext) = this(sparkContext.sc) protected[sql] override lazy val conf: SQLConf = new HBaseSQLConf HBaseConfiguration.merge( sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration)) @transient override protected[sql] lazy val catalog: HBaseCatalog = new HBaseCatalog(this, sc.hadoopConfiguration) with OverrideCatalog experimental.extraStrategies = Seq((new SparkPlanner with HBaseStrategies).HBaseDataSource) @transient override protected[sql] val prepareForExecution = new RuleExecutor[SparkPlan] { val batches = Batch("Add exchange", Once, EnsureRequirements(self)) :: Batch("Add coprocessor", Once, AddCoprocessor(self)) :: Nil } }
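A minimal usage sketch for the context above, assuming an existing SparkContext and an HBase-backed table already registered in the catalog (the table name below is hypothetical):

import org.apache.spark.SparkContext
import org.apache.spark.sql.hbase.HBaseSQLContext

val sc: SparkContext = ???                 // an existing SparkContext
val hbaseContext = new HBaseSQLContext(sc)

// Queries issued through this context are planned with the HBase strategies registered above.
hbaseContext.sql("SELECT * FROM some_hbase_table").show()   // hypothetical table name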
Example 78
Source File: InsertIntoHiveDirCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.language.existentials import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.common.FileUtils import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.mapred._ import org.apache.spark.SparkException import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.hive.client.HiveClientImpl case class InsertIntoHiveDirCommand( isLocal: Boolean, storage: CatalogStorageFormat, query: LogicalPlan, overwrite: Boolean, outputColumns: Seq[Attribute]) extends SaveAsHiveFile { override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { assert(storage.locationUri.nonEmpty) val hiveTable = HiveClientImpl.toHiveTable(CatalogTable( identifier = TableIdentifier(storage.locationUri.get.toString, Some("default")), tableType = org.apache.spark.sql.catalyst.catalog.CatalogTableType.VIEW, storage = storage, schema = query.schema )) hiveTable.getMetadata.put(serdeConstants.SERIALIZATION_LIB, storage.serde.getOrElse(classOf[LazySimpleSerDe].getName)) val tableDesc = new TableDesc( hiveTable.getInputFormatClass, hiveTable.getOutputFormatClass, hiveTable.getMetadata ) val hadoopConf = sparkSession.sessionState.newHadoopConf() val jobConf = new JobConf(hadoopConf) val targetPath = new Path(storage.locationUri.get) val writeToPath = if (isLocal) { val localFileSystem = FileSystem.getLocal(jobConf) localFileSystem.makeQualified(targetPath) } else { val qualifiedPath = FileUtils.makeQualified(targetPath, hadoopConf) val dfs = qualifiedPath.getFileSystem(jobConf) if (!dfs.exists(qualifiedPath)) { dfs.mkdirs(qualifiedPath.getParent) } qualifiedPath } val tmpPath = getExternalTmpPath(sparkSession, hadoopConf, writeToPath) val fileSinkConf = new org.apache.spark.sql.hive.HiveShim.ShimFileSinkDesc( tmpPath.toString, tableDesc, false) try { saveAsHiveFile( sparkSession = sparkSession, plan = child, hadoopConf = hadoopConf, fileSinkConf = fileSinkConf, outputLocation = tmpPath.toString, allColumns = outputColumns) val fs = writeToPath.getFileSystem(hadoopConf) if (overwrite && fs.exists(writeToPath)) { fs.listStatus(writeToPath).foreach { existFile => if (Option(existFile.getPath) != createdTempDir) fs.delete(existFile.getPath, true) } } fs.listStatus(tmpPath).foreach { tmpFile => fs.rename(tmpFile.getPath, writeToPath) } } catch { case e: Throwable => throw new SparkException( "Failed inserting overwrite directory " + storage.locationUri.get, e) } finally { deleteExternalTmpPath(hadoopConf) } Seq.empty[Row] } }
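This command backs Hive-style INSERT OVERWRITE DIRECTORY statements. A minimal sketch, assuming a Hive-enabled SparkSession named spark and a hypothetical output path and source table:

// LOCAL writes to the driver-local filesystem; dropping LOCAL targets the default (e.g. HDFS) filesystem.
spark.sql("""
  INSERT OVERWRITE LOCAL DIRECTORY '/tmp/query_output'
  ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
  SELECT id, name FROM some_table
""")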
Example 79
Source File: CreateHiveTableAsSelectCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.util.control.NonFatal import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command.DataWritingCommand case class CreateHiveTableAsSelectCommand( tableDesc: CatalogTable, query: LogicalPlan, outputColumns: Seq[Attribute], mode: SaveMode) extends DataWritingCommand { private val tableIdentifier = tableDesc.identifier override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { val catalog = sparkSession.sessionState.catalog if (catalog.tableExists(tableIdentifier)) { assert(mode != SaveMode.Overwrite, s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite") if (mode == SaveMode.ErrorIfExists) { throw new AnalysisException(s"$tableIdentifier already exists.") } if (mode == SaveMode.Ignore) { // Since the table already exists and the save mode is Ignore, we will just return. return Seq.empty } InsertIntoHiveTable( tableDesc, Map.empty, query, overwrite = false, ifPartitionNotExists = false, outputColumns = outputColumns).run(sparkSession, child) } else { // TODO ideally, we should get the output data ready first and then // add the relation into catalog, just in case of failure occurs while data // processing. assert(tableDesc.schema.isEmpty) catalog.createTable(tableDesc.copy(schema = query.schema), ignoreIfExists = false) try { // Read back the metadata of the table which was created just now. val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier) // For CTAS, there is no static partition values to insert. val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap InsertIntoHiveTable( createdTableMeta, partition, query, overwrite = true, ifPartitionNotExists = false, outputColumns = outputColumns).run(sparkSession, child) } catch { case NonFatal(e) => // drop the created table. catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false) throw e } } Seq.empty[Row] } override def argString: String = { s"[Database:${tableDesc.database}}, " + s"TableName: ${tableDesc.identifier.table}, " + s"InsertIntoHiveTable]" } }
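A CREATE TABLE ... AS SELECT statement against a Hive-enabled session is what plans into this command. A minimal sketch, assuming a Hive-enabled SparkSession named spark and a hypothetical source table:

// The new table's schema comes from the query; the data is then written via InsertIntoHiveTable.
spark.sql("CREATE TABLE key_counts AS SELECT key, count(*) AS cnt FROM source_table GROUP BY key")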
Example 80
Source File: ShuffledHashJoinExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map"), "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) override def requiredChildDistribution: Seq[Distribution] = HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val avgHashProbe = longMetric("avgHashProbe") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows, avgHashProbe) } } }
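The planner only selects this operator under narrow conditions (broadcasting disabled or impractical, sort merge join not preferred, and a per-partition build side that fits in memory). A minimal sketch of settings that make it more likely, assuming the Spark 2.3 configuration keys and two hypothetical DataFrames:

// Disable broadcast joins and the sort-merge preference so a shuffled hash join can be chosen.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")
spark.conf.set("spark.sql.join.preferSortMergeJoin", "false")

val joined = dfA.join(dfB, Seq("id"))   // dfA, dfB: hypothetical DataFrames sharing an "id" column
joined.explain()                        // look for ShuffledHashJoin in the physical plan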
Example 81
Source File: CartesianProductExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, ExternalAppendOnlyUnsafeRowArray, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.util.CompletionIterator class UnsafeCartesianRDD( left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int, inMemoryBufferThreshold: Int, spillThreshold: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { val rowArray = new ExternalAppendOnlyUnsafeRowArray(inMemoryBufferThreshold, spillThreshold) val partition = split.asInstanceOf[CartesianPartition] rdd2.iterator(partition.s2, context).foreach(rowArray.add) // Create an iterator from rowArray def createIter(): Iterator[UnsafeRow] = rowArray.generateIterator() val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, rowArray.clear()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD( leftResults, rightResults, right.output.size, sqlContext.conf.cartesianProductExecBufferInMemoryThreshold, sqlContext.conf.cartesianProductExecBufferSpillThreshold) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 82
Source File: Exchange.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]() plan.transformUp { case exchange: Exchange => // the exchanges that have same results usually also have same schemas (same column names). val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]()) val samePlan = sameSchema.find { e => exchange.sameResult(e) } if (samePlan.isDefined) { // Keep the output of this exchange, the following plans require that to resolve // attributes. ReusedExchangeExec(exchange.output, samePlan.get) } else { sameSchema += exchange exchange } } } }
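ReuseExchange de-duplicates identical shuffle or broadcast exchanges within a single physical plan. A minimal sketch of a plan that exposes it, assuming a SparkSession named spark and the Spark 2.3 flag name:

import spark.implicits._

spark.conf.set("spark.sql.exchange.reuse", "true")   // enabled by default

val aggregated = spark.range(1000000L).groupBy(($"id" % 10).as("bucket")).count()
// Both branches of the union need the same shuffle, so explain() should show a ReusedExchange node.
aggregated.union(aggregated).explain()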
Example 83
Source File: FlatMapGroupsInPandasExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import scala.collection.JavaConverters._ import org.apache.spark.TaskContext import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, Partitioning} import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.StructType case class FlatMapGroupsInPandasExec( groupingAttributes: Seq[Attribute], func: Expression, output: Seq[Attribute], child: SparkPlan) extends UnaryExecNode { private val pandasFunction = func.asInstanceOf[PythonUDF].func override def outputPartitioning: Partitioning = child.outputPartitioning override def producedAttributes: AttributeSet = AttributeSet(output) override def requiredChildDistribution: Seq[Distribution] = { if (groupingAttributes.isEmpty) { AllTuples :: Nil } else { ClusteredDistribution(groupingAttributes) :: Nil } } override def requiredChildOrdering: Seq[Seq[SortOrder]] = Seq(groupingAttributes.map(SortOrder(_, Ascending))) override protected def doExecute(): RDD[InternalRow] = { val inputRDD = child.execute() val bufferSize = inputRDD.conf.getInt("spark.buffer.size", 65536) val reuseWorker = inputRDD.conf.getBoolean("spark.python.worker.reuse", defaultValue = true) val chainedFunc = Seq(ChainedPythonFunctions(Seq(pandasFunction))) val argOffsets = Array((0 until (child.output.length - groupingAttributes.length)).toArray) val schema = StructType(child.schema.drop(groupingAttributes.length)) val sessionLocalTimeZone = conf.sessionLocalTimeZone val pandasRespectSessionTimeZone = conf.pandasRespectSessionTimeZone inputRDD.mapPartitionsInternal { iter => val grouped = if (groupingAttributes.isEmpty) { Iterator(iter) } else { val groupedIter = GroupedIterator(iter, groupingAttributes, child.output) val dropGrouping = UnsafeProjection.create(child.output.drop(groupingAttributes.length), child.output) groupedIter.map { case (_, groupedRowIter) => groupedRowIter.map(dropGrouping) } } val context = TaskContext.get() val columnarBatchIter = new ArrowPythonRunner( chainedFunc, bufferSize, reuseWorker, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, argOffsets, schema, sessionLocalTimeZone, pandasRespectSessionTimeZone) .compute(grouped, context.partitionId(), context) columnarBatchIter.flatMap(_.rowIterator.asScala).map(UnsafeProjection.create(output, output)) } } }
Example 84
Source File: ArrowEvalPythonExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import scala.collection.JavaConverters._ import org.apache.spark.TaskContext import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.StructType case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan) extends EvalPythonExec(udfs, output, child) { private val batchSize = conf.arrowMaxRecordsPerBatch private val sessionLocalTimeZone = conf.sessionLocalTimeZone private val pandasRespectSessionTimeZone = conf.pandasRespectSessionTimeZone protected override def evaluate( funcs: Seq[ChainedPythonFunctions], bufferSize: Int, reuseWorker: Boolean, argOffsets: Array[Array[Int]], iter: Iterator[InternalRow], schema: StructType, context: TaskContext): Iterator[InternalRow] = { val outputTypes = output.drop(child.output.length).map(_.dataType) // DO NOT use iter.grouped(). See BatchIterator. val batchIter = if (batchSize > 0) new BatchIterator(iter, batchSize) else Iterator(iter) val columnarBatchIter = new ArrowPythonRunner( funcs, bufferSize, reuseWorker, PythonEvalType.SQL_SCALAR_PANDAS_UDF, argOffsets, schema, sessionLocalTimeZone, pandasRespectSessionTimeZone) .compute(batchIter, context.partitionId(), context) new Iterator[InternalRow] { private var currentIter = if (columnarBatchIter.hasNext) { val batch = columnarBatchIter.next() val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType()) assert(outputTypes == actualDataTypes, "Invalid schema from pandas_udf: " + s"expected ${outputTypes.mkString(", ")}, got ${actualDataTypes.mkString(", ")}") batch.rowIterator.asScala } else { Iterator.empty } override def hasNext: Boolean = currentIter.hasNext || { if (columnarBatchIter.hasNext) { currentIter = columnarBatchIter.next().rowIterator.asScala hasNext } else { false } } override def next(): InternalRow = currentIter.next() } } }
Example 85
Source File: BatchEvalPythonExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import scala.collection.JavaConverters._ import net.razorvine.pickle.{Pickler, Unpickler} import org.apache.spark.TaskContext import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.{StructField, StructType} case class BatchEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan) extends EvalPythonExec(udfs, output, child) { protected override def evaluate( funcs: Seq[ChainedPythonFunctions], bufferSize: Int, reuseWorker: Boolean, argOffsets: Array[Array[Int]], iter: Iterator[InternalRow], schema: StructType, context: TaskContext): Iterator[InternalRow] = { EvaluatePython.registerPicklers() // register pickler for Row val dataTypes = schema.map(_.dataType) val needConversion = dataTypes.exists(EvaluatePython.needConversionInPython) // enable memo iff we serialize the row with schema (schema and class should be memorized) val pickle = new Pickler(needConversion) // Input iterator to Python: input rows are grouped so we send them in batches to Python. // For each row, add it to the queue. val inputIterator = iter.map { row => if (needConversion) { EvaluatePython.toJava(row, schema) } else { // fast path for these types that does not need conversion in Python val fields = new Array[Any](row.numFields) var i = 0 while (i < row.numFields) { val dt = dataTypes(i) fields(i) = EvaluatePython.toJava(row.get(i, dt), dt) i += 1 } fields } }.grouped(100).map(x => pickle.dumps(x.toArray)) // Output iterator for results from Python. val outputIterator = new PythonUDFRunner( funcs, bufferSize, reuseWorker, PythonEvalType.SQL_BATCHED_UDF, argOffsets) .compute(inputIterator, context.partitionId(), context) val unpickle = new Unpickler val mutableRow = new GenericInternalRow(1) val resultType = if (udfs.length == 1) { udfs.head.dataType } else { StructType(udfs.map(u => StructField("", u.dataType, u.nullable))) } val fromJava = EvaluatePython.makeFromJava(resultType) outputIterator.flatMap { pickedResult => val unpickledBatch = unpickle.loads(pickedResult) unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala }.map { result => if (udfs.length == 1) { // fast path for single UDF mutableRow(0) = fromJava(result) mutableRow } else { fromJava(result).asInstanceOf[InternalRow] } } } }
Example 86
Source File: EventTimeWatermarkExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, child: SparkPlan) extends UnaryExecNode { val eventTimeStats = new EventTimeStatsAccum() val delayMs = EventTimeWatermark.getDelayMs(delay) sparkContext.register(eventTimeStats) override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => eventTimeStats.add(getEventTime(row).getLong(0) / 1000) row } } } // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delayMs) .build() a.withMetadata(updatedMetadata) } else if (a.metadata.contains(EventTimeWatermark.delayKey)) { // Remove existing watermark val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .remove(EventTimeWatermark.delayKey) .build() a.withMetadata(updatedMetadata) } else { a } } }
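The EventTimeWatermarkExec node above is produced by the public Dataset.withWatermark API. The snippet below is a minimal usage sketch, assuming a local SparkSession and the built-in rate source; the column names, window width, and console sink are illustrative, not taken from the example above.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.window

object WatermarkUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("watermark-sketch").getOrCreate()
    import spark.implicits._

    // The rate source emits (timestamp, value) rows; "timestamp" acts as the event-time column.
    val events = spark.readStream.format("rate").option("rowsPerSecond", "10").load()

    // withWatermark attaches the delay metadata that EventTimeWatermarkExec propagates on the column.
    val counts = events
      .withWatermark("timestamp", "10 minutes")
      .groupBy(window($"timestamp", "5 minutes"))
      .count()

    counts.writeStream.format("console").outputMode("update").start().awaitTermination()
  }
}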
Example 87
Source File: BenchmarkQueryTest.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodeGenerator} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.execution.{SparkPlan, WholeStageCodegenExec} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.Utils abstract class BenchmarkQueryTest extends QueryTest with SharedSQLContext with BeforeAndAfterAll { // When Utils.isTesting is true, the RuleExecutor will issue an exception when hitting // the max iteration of analyzer/optimizer batches. assert(Utils.isTesting, "spark.testing is not set to true") protected override def afterAll(): Unit = { try { // For debugging dump some statistics about how much time was spent in various optimizer rules logWarning(RuleExecutor.dumpTimeSpent()) spark.sessionState.catalog.reset() } finally { super.afterAll() } } override def beforeAll() { super.beforeAll() RuleExecutor.resetMetrics() } protected def checkGeneratedCode(plan: SparkPlan): Unit = { val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() plan foreach { case s: WholeStageCodegenExec => codegenSubtrees += s case s => s } codegenSubtrees.toSeq.foreach { subtree => val code = subtree.doCodeGen()._2 try { // Just check the generated code can be properly compiled CodeGenerator.compile(code) } catch { case e: Exception => val msg = s""" |failed to compile: |Subtree: |$subtree |Generated code: |${CodeFormatter.format(code)} """.stripMargin throw new Exception(msg, e) } } } }
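A concrete suite would extend the abstract BenchmarkQueryTest above and call checkGeneratedCode on a query's executed physical plan. The following is a minimal sketch, assuming it lives in the same test module; the table name and query are made up for illustration.

package org.apache.spark.sql

class SimpleBenchmarkQuerySuite extends BenchmarkQueryTest {
  import testImplicits._

  test("generated code of a simple aggregate compiles") {
    Seq((1, "a"), (2, "b"), (3, "a")).toDF("id", "name").createOrReplaceTempView("t")
    val df = spark.sql("SELECT name, count(*) FROM t GROUP BY name")
    checkAnswer(df, Row("a", 2L) :: Row("b", 1L) :: Nil)
    // Compile every WholeStageCodegenExec subtree of the chosen physical plan.
    checkGeneratedCode(df.queryExecution.executedPlan)
  }
}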
Example 88
Source File: ExtraStrategiesSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.test.SharedSQLContext case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) val unsafeProj = UnsafeProjection.create(schema) val unsafeRow = unsafeProj(row).copy() sparkContext.parallelize(Seq(unsafeRow)) } override def producedAttributes: AttributeSet = outputSet override def children: Seq[SparkPlan] = Nil } object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => FastOperator(attr.toAttribute :: Nil) :: Nil case _ => Nil } } class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { try { spark.experimental.extraStrategies = TestStrategy :: Nil val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { spark.experimental.extraStrategies = Nil } } }
Example 89
Source File: StarryHashJoinExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import com.github.passionke.starry.SparkPlanExecutor import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.JoinType import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} case class StarryHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val avgHashProbe = longMetric("avgHashProbe") val rows = SparkPlanExecutor.doExec(buildPlan) val hashed = HashedRelation(rows.iterator, buildKeys, rows.length, null) streamedPlan.execute().mapPartitions { streamedIter => join(streamedIter, hashed, numOutputRows, avgHashProbe) } } }
Example 90
Source File: StarryTakeOrderedAndProjectExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression, SortOrder, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition} import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.util.Utils case class StarryTakeOrderedAndProjectExec( limit: Int, sortOrder: Seq[SortOrder], projectList: Seq[NamedExpression], child: SparkPlan) extends UnaryExecNode { override def output: Seq[Attribute] = { projectList.map(_.toAttribute) } override def executeCollect(): Array[InternalRow] = { val ord = new LazilyGeneratedOrdering(sortOrder, child.output) val data = child.execute().map(_.copy()).takeOrdered(limit)(ord) if (projectList != child.output) { val proj = UnsafeProjection.create(projectList, child.output) data.map(r => proj(r).copy()) } else { data } } protected override def doExecute(): RDD[InternalRow] = { val ord = new LazilyGeneratedOrdering(sortOrder, child.output) val localTopK: RDD[InternalRow] = { child.execute().map(_.copy()).mapPartitions { iter => org.apache.spark.util.collection.Utils.takeOrdered(iter, limit)(ord) } } localTopK.mapPartitions { iter => val topK = org.apache.spark.util.collection.Utils.takeOrdered(iter.map(_.copy()), limit)(ord) if (projectList != child.output) { val proj = UnsafeProjection.create(projectList, child.output) topK.map(r => proj(r)) } else { topK } } } override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = SinglePartition override def simpleString: String = { val orderByString = Utils.truncatedString(sortOrder, "[", ",", "]") val outputString = Utils.truncatedString(output, "[", ",", "]") s"TakeOrderedAndProject(limit=$limit, orderBy=$orderByString, output=$outputString)" } }
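Operators of this take-ordered-and-project shape serve ORDER BY + LIMIT queries with a projection; Spark's built-in planner uses TakeOrderedAndProject for the same pattern. Below is a small sketch with toy data and illustrative names, using only the standard DataFrame API.

import org.apache.spark.sql.SparkSession

object TopKSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("top-k-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq((3, "c"), (1, "a"), (2, "b")).toDF("id", "name")

    // Sort, keep the top 2 rows, and project a subset of columns.
    val topK = df.orderBy($"id".desc).limit(2).select($"name")
    topK.explain() // the physical plan shows a TakeOrderedAndProject-style operator
    topK.show()
    spark.stop()
  }
}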
Example 91
Source File: SparkPlanExecutor.scala From starry with Apache License 2.0 | 5 votes |
package com.github.passionke.starry import org.apache.spark.{Partition, StarryTaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.{ReuseSubquery, SparkPlan} object SparkPlanExecutor { def exec(plan: SparkPlan, sparkSession: SparkSession) = { val newPlan = Seq( ReuseSubquery(sparkSession.sessionState.conf)) .foldLeft(plan) { case (sp, rule) => rule.apply(sp) } doExec(newPlan) } def firstPartition(rdd: RDD[InternalRow]): Partition = { rdd.partitions.head } def doExec(sparkPlan: SparkPlan): List[InternalRow] = { val rdd = sparkPlan.execute().map(ite => ite.copy()) val partition = firstPartition(rdd) rdd.compute(partition, new StarryTaskContext).toList } def rddCompute(rdd: RDD[InternalRow]): List[InternalRow] = { val partition = firstPartition(rdd) rdd.compute(partition, new StarryTaskContext).toList } }
Example 92
Source File: ShuffleHashJoin.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution import edu.ucla.cs.wis.bigdatalog.spark.BigDatalogContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning, PartitioningCollection} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffleHashJoin(leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { @transient final protected val bigDatalogContext = SQLContext.getActive().getOrElse(null).asInstanceOf[BigDatalogContext] val cacheBuildSide = bigDatalogContext.getConf.getBoolean("spark.datalog.shufflehashjoin.cachebuildside", true) override lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) var cachedBuildPlan: RDD[HashedRelation] = null override def output: Seq[Attribute] = left.output ++ right.output override def outputPartitioning: Partitioning = PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning)) override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false protected override def doExecute(): RDD[InternalRow] = { val numStreamedRows = buildSide match { case BuildLeft => longMetric("numRightRows") case BuildRight => longMetric("numLeftRows") } val numOutputRows = longMetric("numOutputRows") if (cacheBuildSide) { if (cachedBuildPlan == null) { cachedBuildPlan = buildPlan.execute() .mapPartitionsInternal(iter => Iterator(HashedRelation(iter, SQLMetrics.nullLongMetric, buildSideKeyGenerator))) .persist() } cachedBuildPlan.zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) => hashJoin(streamedIter, numStreamedRows, buildIter.next(), numOutputRows)} } else { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) => val hashedRelation = HashedRelation(buildIter, SQLMetrics.nullLongMetric, buildSideKeyGenerator) hashJoin(streamedIter, numStreamedRows, hashedRelation, numOutputRows) } } } }
Example 93
Source File: LeftSemiJoinHash.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, Distribution, ClusteredDistribution} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class LeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryNode with HashSemiJoin { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") right.execute().zipPartitions(left.execute()) { (buildIter, streamIter) => if (condition.isEmpty) { val hashSet = buildKeyHashSet(buildIter, numRightRows) hashSemiJoin(streamIter, numLeftRows, hashSet, numOutputRows) } else { val hashRelation = HashedRelation(buildIter, numRightRows, rightKeyGenerator) hashSemiJoin(streamIter, numLeftRows, hashRelation, numOutputRows) } } } }
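At the SQL level, operators like LeftSemiJoinHash and its broadcast variant below serve LEFT SEMI JOIN queries, which keep left-side rows that have at least one match on the right without emitting right-side columns. A usage sketch with toy data and illustrative view names:

import org.apache.spark.sql.SparkSession

object LeftSemiJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("semi-join-sketch").getOrCreate()
    import spark.implicits._

    Seq((1, 2.0), (2, 1.0), (3, 3.0)).toDF("a", "b").createOrReplaceTempView("l")
    Seq((2, 3.0), (3, 2.0), (4, 1.0)).toDF("c", "d").createOrReplaceTempView("r")

    val semi = spark.sql("SELECT * FROM l LEFT SEMI JOIN r ON l.a = r.c")
    semi.explain() // shows which semi-join physical operator the planner picked
    semi.show()    // only columns a and b of the matching left rows
    spark.stop()
  }
}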
Example 94
Source File: HashSemiJoin.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.metric.LongSQLMetric trait HashSemiJoin { self: SparkPlan => val leftKeys: Seq[Expression] val rightKeys: Seq[Expression] val left: SparkPlan val right: SparkPlan val condition: Option[Expression] override def output: Seq[Attribute] = left.output override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false protected def leftKeyGenerator: Projection = UnsafeProjection.create(leftKeys, left.output) protected def rightKeyGenerator: Projection = UnsafeProjection.create(rightKeys, right.output) @transient private lazy val boundCondition = newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output) protected def buildKeyHashSet( buildIter: Iterator[InternalRow], numBuildRows: LongSQLMetric): java.util.Set[InternalRow] = { val hashSet = new java.util.HashSet[InternalRow]() // Create a Hash set of buildKeys val rightKey = rightKeyGenerator while (buildIter.hasNext) { val currentRow = buildIter.next() numBuildRows += 1 val rowKey = rightKey(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) if (!keyExists) { hashSet.add(rowKey.copy()) } } } hashSet } protected def hashSemiJoin( streamIter: Iterator[InternalRow], numStreamRows: LongSQLMetric, hashSet: java.util.Set[InternalRow], numOutputRows: LongSQLMetric): Iterator[InternalRow] = { val joinKeys = leftKeyGenerator streamIter.filter(current => { numStreamRows += 1 val key = joinKeys(current) val r = !key.anyNull && hashSet.contains(key) if (r) numOutputRows += 1 r }) } protected def hashSemiJoin( streamIter: Iterator[InternalRow], numStreamRows: LongSQLMetric, hashedRelation: HashedRelation, numOutputRows: LongSQLMetric): Iterator[InternalRow] = { val joinKeys = leftKeyGenerator val joinedRow = new JoinedRow streamIter.filter { current => numStreamRows += 1 val key = joinKeys(current) lazy val rowBuffer = hashedRelation.get(key) val r = !key.anyNull && rowBuffer != null && rowBuffer.exists { (row: InternalRow) => boundCondition(joinedRow(current, row)) } if (r) numOutputRows += 1 r } } }
Example 95
Source File: BroadcastLeftSemiJoinHash.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.{InternalAccumulator, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class BroadcastLeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryNode with HashSemiJoin { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val input = right.execute().map { row => numRightRows += 1 row.copy() }.collect() if (condition.isEmpty) { val hashSet = buildKeyHashSet(input.toIterator, SQLMetrics.nullLongMetric) val broadcastedRelation = sparkContext.broadcast(hashSet) left.execute().mapPartitionsInternal { streamIter => hashSemiJoin(streamIter, numLeftRows, broadcastedRelation.value, numOutputRows) } } else { val hashRelation = HashedRelation(input.toIterator, SQLMetrics.nullLongMetric, rightKeyGenerator, input.size) val broadcastedRelation = sparkContext.broadcast(hashRelation) left.execute().mapPartitionsInternal { streamIter => val hashedRelation = broadcastedRelation.value hashedRelation match { case unsafe: UnsafeHashedRelation => TaskContext.get().internalMetricsToAccumulators( InternalAccumulator.PEAK_EXECUTION_MEMORY).add(unsafe.getUnsafeSize) case _ => } hashSemiJoin(streamIter, numLeftRows, hashedRelation, numOutputRows) } } } }
Example 96
Source File: LeftSemiJoinBNL.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class LeftSemiJoinBNL(streamed: SparkPlan, broadcast: SparkPlan, condition: Option[Expression]) extends BinaryNode { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) override def outputPartitioning: Partitioning = streamed.outputPartitioning override def output: Seq[Attribute] = left.output override def left: SparkPlan = streamed override def right: SparkPlan = broadcast @transient private lazy val boundCondition = newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val broadcastedRelation = sparkContext.broadcast(broadcast.execute().map { row => numRightRows += 1 row.copy() }.collect().toIndexedSeq) streamed.execute().mapPartitions { streamedIter => val joinedRow = new JoinedRow streamedIter.filter(streamedRow => { numLeftRows += 1 var i = 0 var matched = false while (i < broadcastedRelation.value.size && !matched) { val broadcastedRow = broadcastedRelation.value(i) if (boundCondition(joinedRow(streamedRow, broadcastedRow))) { matched = true } i += 1 } if (matched) { numOutputRows += 1 } matched }) } } }
Example 97
Source File: HashJoin.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.metric.LongSQLMetric trait HashJoin { self: SparkPlan => val leftKeys: Seq[Expression] val rightKeys: Seq[Expression] val buildSide: BuildSide val left: SparkPlan val right: SparkPlan protected lazy val (buildPlan, streamedPlan) = buildSide match { case BuildLeft => (left, right) case BuildRight => (right, left) } protected lazy val (buildKeys, streamedKeys) = buildSide match { case BuildLeft => (leftKeys, rightKeys) case BuildRight => (rightKeys, leftKeys) } override def output: Seq[Attribute] = left.output ++ right.output override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false protected def buildSideKeyGenerator: Projection = UnsafeProjection.create(buildKeys, buildPlan.output) protected def streamSideKeyGenerator: Projection = UnsafeProjection.create(streamedKeys, streamedPlan.output) protected def hashJoin( streamIter: Iterator[InternalRow], numStreamRows: LongSQLMetric, hashedRelation: HashedRelation, numOutputRows: LongSQLMetric): Iterator[InternalRow] = { new Iterator[InternalRow] { private[this] var currentStreamedRow: InternalRow = _ private[this] var currentHashMatches: Seq[InternalRow] = _ private[this] var currentMatchPosition: Int = -1 // Mutable per row objects. private[this] val joinRow = new JoinedRow private[this] val resultProjection: (InternalRow) => InternalRow = UnsafeProjection.create(self.schema) private[this] val joinKeys = streamSideKeyGenerator override final def hasNext: Boolean = (currentMatchPosition != -1 && currentMatchPosition < currentHashMatches.size) || (streamIter.hasNext && fetchNext()) override final def next(): InternalRow = { val ret = buildSide match { case BuildRight => joinRow(currentStreamedRow, currentHashMatches(currentMatchPosition)) case BuildLeft => joinRow(currentHashMatches(currentMatchPosition), currentStreamedRow) } currentMatchPosition += 1 numOutputRows += 1 resultProjection(ret) } private final def fetchNext(): Boolean = { currentHashMatches = null currentMatchPosition = -1 while (currentHashMatches == null && streamIter.hasNext) { currentStreamedRow = streamIter.next() numStreamRows += 1 val key = joinKeys(currentStreamedRow) if (!key.anyNull) { currentHashMatches = hashedRelation.get(key) } } if (currentHashMatches == null) { false } else { currentMatchPosition = 0 true } } } } }
Example 98
Source File: CartesianProduct.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().map { row => numLeftRows += 1 row.copy() } val rightResults = right.execute().map { row => numRightRows += 1 row.copy() } leftResults.cartesian(rightResults).mapPartitionsInternal { iter => val joinedRow = new JoinedRow iter.map { r => numOutputRows += 1 joinedRow(r._1, r._2) } } } }
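The CartesianProduct operator handles joins without a join condition. The sketch below shows that query shape with toy data; explicitly enabling spark.sql.crossJoin.enabled is only required on newer Spark versions and is included here as a defensive assumption.

import org.apache.spark.sql.SparkSession

object CartesianSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("cartesian-sketch")
      .config("spark.sql.crossJoin.enabled", "true")
      .getOrCreate()
    import spark.implicits._

    val left = Seq(1, 2, 3).toDF("a")
    val right = Seq("x", "y").toDF("b")

    val product = left.join(right) // no join condition: every left row pairs with every right row
    product.explain()              // the planner falls back to a Cartesian product operator
    product.show()                 // 3 x 2 = 6 rows
    spark.stop()
  }
}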
Example 99
Source File: SemiJoinSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.sql.{SQLConf, DataFrame, Row} import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys import org.apache.spark.sql.catalyst.plans.Inner import org.apache.spark.sql.catalyst.plans.logical.Join import org.apache.spark.sql.catalyst.expressions.{And, LessThan, Expression} import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan, SparkPlanTest} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType} class SemiJoinSuite extends SparkPlanTest with SharedSQLContext { private lazy val left = sqlContext.createDataFrame( sparkContext.parallelize(Seq( Row(1, 2.0), Row(1, 2.0), Row(2, 1.0), Row(2, 1.0), Row(3, 3.0), Row(null, null), Row(null, 5.0), Row(6, null) )), new StructType().add("a", IntegerType).add("b", DoubleType)) private lazy val right = sqlContext.createDataFrame( sparkContext.parallelize(Seq( Row(2, 3.0), Row(2, 3.0), Row(3, 2.0), Row(4, 1.0), Row(null, null), Row(null, 5.0), Row(6, null) )), new StructType().add("c", IntegerType).add("d", DoubleType)) private lazy val condition = { And((left.col("a") === right.col("c")).expr, LessThan(left.col("b").expr, right.col("d").expr)) } // Note: the input dataframes and expression must be evaluated lazily because // the SQLContext should be used only within a test to keep SQL tests stable private def testLeftSemiJoin( testName: String, leftRows: => DataFrame, rightRows: => DataFrame, condition: => Expression, expectedAnswer: Seq[Product]): Unit = { def extractJoinParts(): Option[ExtractEquiJoinKeys.ReturnType] = { val join = Join(leftRows.logicalPlan, rightRows.logicalPlan, Inner, Some(condition)) ExtractEquiJoinKeys.unapply(join) } test(s"$testName using LeftSemiJoinHash") { extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) => withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => EnsureRequirements(left.sqlContext).apply( LeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition)), expectedAnswer.map(Row.fromTuple), sortAnswers = true) } } } test(s"$testName using BroadcastLeftSemiJoinHash") { extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) => withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => BroadcastLeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition), expectedAnswer.map(Row.fromTuple), sortAnswers = true) } } } test(s"$testName using LeftSemiJoinBNL") { withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") { checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) => LeftSemiJoinBNL(left, right, Some(condition)), expectedAnswer.map(Row.fromTuple), sortAnswers = true) } } } testLeftSemiJoin( "basic test", left, right, condition, Seq( (2, 1.0), (2, 1.0) ) ) }
Example 100
Source File: ExtraStrategiesSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package test.org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute} import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.{Row, Strategy, QueryTest} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.unsafe.types.UTF8String case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) sparkContext.parallelize(Seq(row)) } override def children: Seq[SparkPlan] = Nil } object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => FastOperator(attr.toAttribute :: Nil) :: Nil case _ => Nil } } class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { try { sqlContext.experimental.extraStrategies = TestStrategy :: Nil val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { sqlContext.experimental.extraStrategies = Nil } } }
Example 101
Source File: GenomicIntervalStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.utvf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.{DataFrame, GenomicInterval, SparkSession, Strategy} import org.apache.spark.unsafe.types.UTF8String case class GIntervalRow(contigName: String, start: Int, end: Int) class GenomicIntervalStrategy( spark: SparkSession) extends Strategy with Serializable { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case GenomicInterval(contigName, start, end,output) => GenomicIntervalPlan(plan,spark,GIntervalRow(contigName,start,end),output) :: Nil case _ => Nil } } case class GenomicIntervalPlan(plan: LogicalPlan, spark: SparkSession,interval:GIntervalRow, output: Seq[Attribute]) extends SparkPlan with Serializable { def doExecute(): org.apache.spark.rdd.RDD[InternalRow] = { import spark.implicits._ lazy val genomicInterval = spark.createDataset(Seq(interval)) genomicInterval .rdd .map(r=>{ val proj = UnsafeProjection.create(schema) proj.apply(InternalRow.fromSeq(Seq(UTF8String.fromString(r.contigName),r.start,r.end))) } ) } def children: Seq[SparkPlan] = Nil }
Example 102
Source File: PileupStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.pileup import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.{PileupTemplate, SparkSession, Strategy} import org.biodatageeks.sequila.datasources.BAM.BDGAlignFileReaderWriter import org.biodatageeks.sequila.datasources.InputDataType import org.biodatageeks.sequila.inputformats.BDGAlignInputFormat import org.biodatageeks.sequila.utils.TableFuncs import org.seqdoop.hadoop_bam.{BAMBDGInputFormat, CRAMBDGInputFormat} import scala.reflect.ClassTag class PileupStrategy (spark:SparkSession) extends Strategy with Serializable { override def apply(plan: LogicalPlan): Seq[SparkPlan] = { plan match { case PileupTemplate(tableName, sampleId, refPath, output) => val inputFormat = TableFuncs.getTableMetadata(spark, tableName).provider inputFormat match { case Some(f) => if (f == InputDataType.BAMInputDataType) PileupPlan[BAMBDGInputFormat](plan, spark, tableName, sampleId, refPath, output) :: Nil else if (f == InputDataType.CRAMInputDataType) PileupPlan[CRAMBDGInputFormat](plan, spark, tableName, sampleId, refPath, output) :: Nil else Nil case None => throw new RuntimeException("Only BAM and CRAM file formats are supported in pileup function.") } case _ => Nil } } } case class PileupPlan [T<:BDGAlignInputFormat](plan:LogicalPlan, spark:SparkSession, tableName:String, sampleId:String, refPath: String, output:Seq[Attribute])(implicit c: ClassTag[T]) extends SparkPlan with Serializable with BDGAlignFileReaderWriter [T]{ override def children: Seq[SparkPlan] = Nil override protected def doExecute(): RDD[InternalRow] = { new Pileup(spark).handlePileup(tableName, sampleId, refPath, output) } }
Example 103
Source File: NCListsJoin.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.NCList import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{SparkPlan, _} @DeveloperApi case class NCListsJoin(left: SparkPlan, right: SparkPlan, condition: Seq[Expression], context: SparkSession) extends BinaryExecNode { def output = left.output ++ right.output lazy val (buildPlan, streamedPlan) = (left, right) lazy val (buildKeys, streamedKeys) = (List(condition(0), condition(1)), List(condition(2), condition(3))) @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output) @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output) protected override def doExecute(): RDD[InternalRow] = { val v1 = left.execute() val v2 = right.execute() val v1kv = v1.map(x => { val v1Key = buildKeyGenerator(x) (new Interval[Int](v1Key.getInt(0), v1Key.getInt(1)), x.copy()) } ) val v2kv = v2.map(x => { val v2Key = streamKeyGenerator(x) (new Interval[Int](v2Key.getInt(0), v2Key.getInt(1)), x.copy()) } ) if (v1.count <= v2.count) { val v3 = NCListsJoinImpl.overlapJoin(context.sparkContext, v1kv, v2kv).flatMap(l => l._2.map(r => (l._1, r))) v3.map { case (l: InternalRow, r: InternalRow) => { val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema); joiner.join(l.asInstanceOf[UnsafeRow], r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow] //resultProj(joinedRow(l, r)) joiner.joiner } } } else { val v3 = NCListsJoinImpl.overlapJoin(context.sparkContext, v2kv, v1kv).flatMap(l => l._2.map(r => (l._1, r))) v3.map { case (r: InternalRow, l: InternalRow) => { val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema); joiner.join(l.asInstanceOf[UnsafeRow], r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow] //resultProj(joinedRow(l, r)) joiner.joiner } } } } }
Example 104
Source File: NCListsJoinStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.NCList import org.biodatageeks.sequila.rangejoins.common.{ExtractRangeJoinKeys, ExtractRangeJoinKeysWithEquality} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.{SparkSession, Strategy} import org.biodatageeks.sequila.rangejoins.methods.NCList.NCListsJoinChromosome class NCListsJoinStrategy(spark: SparkSession) extends Strategy with Serializable with PredicateHelper { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case ExtractRangeJoinKeys(joinType, rangeJoinKeys, left, right) => NCListsJoin(planLater(left), planLater(right), rangeJoinKeys, spark) :: Nil case ExtractRangeJoinKeysWithEquality(joinType, rangeJoinKeys, left, right) => NCListsJoinChromosome(planLater(left), planLater(right), rangeJoinKeys, spark) :: Nil case _ => Nil } }
Example 105
Source File: NCListsJoinChromosome.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.methods.NCList import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{SparkPlan, _} import org.biodatageeks.sequila.rangejoins.NCList.{Interval, NCListsJoinImpl} @DeveloperApi case class NCListsJoinChromosome(left: SparkPlan, right: SparkPlan, condition: Seq[Expression], context: SparkSession) extends BinaryExecNode { def output = left.output ++ right.output lazy val (buildPlan, streamedPlan) = (left, right) lazy val (buildKeys, streamedKeys) = (List(condition(0), condition(1),condition(4)), List(condition(2), condition(3),condition(5))) @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output) @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output) protected override def doExecute(): RDD[InternalRow] = { val v1 = left.execute() val v2 = right.execute() val v1kv = v1.map(x => { val v1Key = buildKeyGenerator(x) ((v1Key.getString(2),new Interval[Int](v1Key.getInt(0), v1Key.getInt(1))), x.copy()) } ) val v2kv = v2.map(x => { val v2Key = streamKeyGenerator(x) ((v2Key.getString(2),new Interval[Int](v2Key.getInt(0), v2Key.getInt(1))), x.copy()) } ) if (v1.count <= v2.count) { val v3 = NCListsJoinChromosomeImpl.overlapJoin(context.sparkContext, v1kv, v2kv) v3.mapPartitions( p => { val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) p.map(r => joiner.join(r._1.asInstanceOf[UnsafeRow], r._2.asInstanceOf[UnsafeRow])) } ) } else { val v3 = NCListsJoinChromosomeImpl.overlapJoin(context.sparkContext, v2kv, v1kv) v3.mapPartitions( p => { val joiner = GenerateUnsafeRowJoiner.create(right.schema, left.schema) p.map(r=>joiner.join(r._2.asInstanceOf[UnsafeRow],r._1.asInstanceOf[UnsafeRow])) } ) } } }
Example 106
Source File: IntervalTreeJoinOptim.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.IntervalTree import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.{SparkPlan, _} import org.apache.spark.sql.internal.SQLConf @DeveloperApi case class IntervalTreeJoinOptim(left: SparkPlan, right: SparkPlan, condition: Seq[Expression], context: SparkSession,leftLogicalPlan: LogicalPlan, righLogicalPlan: LogicalPlan) extends BinaryExecNode { def output = left.output ++ right.output lazy val (buildPlan, streamedPlan) = (left, right) lazy val (buildKeys, streamedKeys) = (List(condition(0), condition(1)), List(condition(2), condition(3))) @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output) @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output) protected override def doExecute(): RDD[InternalRow] = { val v1 = left.execute() val v1kv = v1.map(x => { val v1Key = buildKeyGenerator(x) (new IntervalWithRow[Int](v1Key.getInt(0), v1Key.getInt(1), x) ) }) val v2 = right.execute() val v2kv = v2.map(x => { val v2Key = streamKeyGenerator(x) (new IntervalWithRow[Int](v2Key.getInt(0), v2Key.getInt(1), x) ) }) val conf = new SQLConf() val v1Size = if(leftLogicalPlan .stats .sizeInBytes >0) leftLogicalPlan.stats.sizeInBytes.toLong else v1.count val v2Size = if(righLogicalPlan .stats .sizeInBytes >0) righLogicalPlan.stats.sizeInBytes.toLong else v2.count if ( v1Size <= v2Size ) { val v3 = IntervalTreeJoinOptimImpl.overlapJoin(context.sparkContext, v1kv, v2kv,v1.count()) v3.mapPartitions( p => { val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) p.map(r=>joiner.join(r._1.asInstanceOf[UnsafeRow],r._2.asInstanceOf[UnsafeRow])) } ) } else { val v3 = IntervalTreeJoinOptimImpl.overlapJoin(context.sparkContext, v2kv, v1kv, v2.count()) v3.mapPartitions( p => { val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) p.map(r=>joiner.join(r._2.asInstanceOf[UnsafeRow],r._1.asInstanceOf[UnsafeRow])) } ) } } }
Example 107
Source File: IntervalTreeJoinChromosome.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.methods.genApp import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.catalyst.expressions.{Expression, InterpretedProjection, UnsafeRow} import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.biodatageeks.sequila.rangejoins.genApp.Interval @DeveloperApi case class IntervalTreeJoinChromosome(left: SparkPlan, right: SparkPlan, condition: Seq[Expression], context: SparkSession) extends BinaryExecNode { def output = left.output ++ right.output lazy val (buildPlan, streamedPlan) = (left, right) lazy val (buildKeys, streamedKeys) = (List(condition(0), condition(1),condition(4)), List(condition(2), condition(3),condition(5))) @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output) @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output) protected override def doExecute(): RDD[InternalRow] = { val v1 = left.execute() val v1kv = v1.map(x => { val v1Key = buildKeyGenerator(x) ((v1Key.getString(2),new Interval[Int](v1Key.getInt(0), v1Key.getInt(1))), x.copy()) }) val v2 = right.execute() val v2kv = v2.map(x => { val v2Key = streamKeyGenerator(x) ((v2Key.getString(2),new Interval[Int](v2Key.getInt(0), v2Key.getInt(1))), x.copy()) }) if (v1.count <= v2.count) { val v3 = IntervalTreeJoinChromosomeImpl.overlapJoin(context.sparkContext, v1kv, v2kv) .flatMap(l => l._2 .map(r => (l._1, r))) v3.mapPartitions( p => { val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) p.map(r => joiner.join(r._1.asInstanceOf[UnsafeRow], r._2.asInstanceOf[UnsafeRow])) } ) } else { val v3 = IntervalTreeJoinChromosomeImpl.overlapJoin(context.sparkContext, v2kv, v1kv).flatMap(l => l._2.map(r => (l._1, r))) v3.mapPartitions( p => { val joiner = GenerateUnsafeRowJoiner.create(right.schema, left.schema) p.map(r=>joiner.join(r._2.asInstanceOf[UnsafeRow],r._1.asInstanceOf[UnsafeRow])) } ) } } }
Example 108
Source File: IntervalTreeJoin.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.genApp import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.catalyst.expressions.{Expression, InterpretedProjection, UnsafeRow} import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} @DeveloperApi case class IntervalTreeJoin(left: SparkPlan, right: SparkPlan, condition: Seq[Expression], context: SparkSession) extends BinaryExecNode { def output = left.output ++ right.output lazy val (buildPlan, streamedPlan) = (left, right) lazy val (buildKeys, streamedKeys) = (List(condition(0), condition(1)), List(condition(2), condition(3))) @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output) @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output) protected override def doExecute(): RDD[InternalRow] = { val v1 = left.execute() val v1kv = v1.map(x => { val v1Key = buildKeyGenerator(x) (new Interval[Int](v1Key.getInt(0), v1Key.getInt(1)), x.copy()) }) val v2 = right.execute() val v2kv = v2.map(x => { val v2Key = streamKeyGenerator(x) (new Interval[Int](v2Key.getInt(0), v2Key.getInt(1)), x.copy()) }) if (v1.count <= v2.count) { val v3 = IntervalTreeJoinImpl.overlapJoin(context.sparkContext, v1kv, v2kv) .flatMap(l => l._2 .map(r => (l._1, r))) v3.map { case (l: InternalRow, r: InternalRow) => { val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema); joiner.join(l.asInstanceOf[UnsafeRow], r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow] //resultProj(joinedRow(l, r)) joiner.joiner } } } else { val v3 = IntervalTreeJoinImpl.overlapJoin(context.sparkContext, v2kv, v1kv).flatMap(l => l._2.map(r => (l._1, r))) v3.map { case (r: InternalRow, l: InternalRow) => { val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema); joiner.join(l.asInstanceOf[UnsafeRow], r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow] //resultProj(joinedRow(l, r)) joiner.joiner } } } } }
Example 109
Source File: IntervalTreeJoinStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.genApp import org.biodatageeks.sequila.rangejoins.common.{ExtractRangeJoinKeys, ExtractRangeJoinKeysWithEquality} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.{SparkSession, Strategy} import org.biodatageeks.sequila.rangejoins.methods.genApp.IntervalTreeJoinChromosome class IntervalTreeJoinStrategy(spark: SparkSession) extends Strategy with Serializable with PredicateHelper { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case ExtractRangeJoinKeys(joinType, rangeJoinKeys, left, right) => IntervalTreeJoin(planLater(left), planLater(right), rangeJoinKeys, spark) :: Nil case ExtractRangeJoinKeysWithEquality(joinType, rangeJoinKeys, left, right) => IntervalTreeJoinChromosome(planLater(left), planLater(right), rangeJoinKeys, spark) :: Nil case _ => Nil } }
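Custom strategies such as NCListsJoinStrategy and IntervalTreeJoinStrategy are typically made visible to the planner through experimental.extraStrategies, the same hook exercised in the ExtraStrategiesSuite examples above. A minimal registration sketch, assuming the strategy classes are on the classpath; whether SequiLa actually wires them in through a session extension instead is not shown here.

import org.apache.spark.sql.SparkSession
import org.biodatageeks.sequila.rangejoins.genApp.IntervalTreeJoinStrategy

object RangeJoinRegistrationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("range-join-sketch").getOrCreate()

    // Ask the planner to try the interval-tree range join before the built-in join strategies.
    spark.experimental.extraStrategies = new IntervalTreeJoinStrategy(spark) :: Nil

    // From here on, joins whose condition matches ExtractRangeJoinKeys are planned
    // with IntervalTreeJoin / IntervalTreeJoinChromosome instead of a generic join.
  }
}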
Example 110
Source File: BDJSpark.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.execution.join import org.apache.spark.sql.simba.execution.SimbaPlan import org.apache.spark.sql.simba.partitioner.MapDPartition import org.apache.spark.sql.simba.spatial.Point import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal} import org.apache.spark.sql.execution.SparkPlan import scala.collection.mutable import scala.util.Random case class BDJSpark(left_key: Expression, right_key: Expression, l: Literal, left: SparkPlan, right: SparkPlan) extends SimbaPlan { override def output: Seq[Attribute] = left.output ++ right.output final val num_partitions = simbaSessionState.simbaConf.joinPartitions final val r = NumberUtil.literalToDouble(l) override protected def doExecute(): RDD[InternalRow] = { val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _))) val tot_dup_rdd = tot_rdd.flatMap {x => val rand_no = new Random().nextInt(num_partitions) var ans = mutable.ListBuffer[(Int, (Int, InternalRow))]() if (x._1 == 0) { val base = rand_no * num_partitions for (i <- 0 until num_partitions) ans += ((base + i, x)) } else { for (i <- 0 until num_partitions) ans += ((i * num_partitions + rand_no, x)) } ans } val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions) tot_dup_partitioned.mapPartitions {iter => var left_data = mutable.ListBuffer[(Point, InternalRow)]() var right_data = mutable.ListBuffer[(Point, InternalRow)]() while (iter.hasNext) { val data = iter.next() if (data._2._1 == 0) { val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point] left_data += ((tmp_point, data._2._2)) } else { val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point] right_data += ((tmp_point, data._2._2)) } } val joined_ans = mutable.ListBuffer[InternalRow]() left_data.foreach {left => right_data.foreach {right => if (left._1.minDist(right._1) <= r) { joined_ans += new JoinedRow(left._2, right._2) } } } joined_ans.iterator } } override def children: Seq[SparkPlan] = Seq(left, right) }
Example 111
Source File: BKJSpark.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.execution.join import org.apache.spark.sql.simba.execution.SimbaPlan import org.apache.spark.sql.simba.partitioner.MapDPartition import org.apache.spark.sql.simba.spatial.Point import org.apache.spark.sql.simba.util.ShapeUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.util.BoundedPriorityQueue import scala.collection.mutable import scala.util.Random case class BKJSpark(left_key: Expression, right_key: Expression, l: Literal, left: SparkPlan, right: SparkPlan) extends SimbaPlan { override def output: Seq[Attribute] = left.output ++ right.output final val num_partitions = simbaSessionState.simbaConf.joinPartitions final val k = l.value.asInstanceOf[Number].intValue() private class DisOrdering extends Ordering[(InternalRow, Double)] { override def compare(x : (InternalRow, Double), y: (InternalRow, Double)): Int = -x._2.compare(y._2) } override protected def doExecute(): RDD[InternalRow] = { val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _))) val tot_dup_rdd = tot_rdd.flatMap {x => val rand_no = new Random().nextInt(num_partitions) val ans = mutable.ListBuffer[(Int, (Int, InternalRow))]() if (x._1 == 0) { val base = rand_no * num_partitions for (i <- 0 until num_partitions) ans += ((base + i, x)) } else { for (i <- 0 until num_partitions) ans += ((i * num_partitions + rand_no, x)) } ans } val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions) tot_dup_partitioned.mapPartitions {iter => var left_data = mutable.ListBuffer[(Point, InternalRow)]() var right_data = mutable.ListBuffer[(Point, InternalRow)]() while (iter.hasNext) { val data = iter.next() if (data._2._1 == 0) { val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point] left_data += ((tmp_point, data._2._2)) } else { val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point] right_data += ((tmp_point, data._2._2)) } } val joined_ans = mutable.ListBuffer[(InternalRow, Array[(InternalRow, Double)])]() left_data.foreach(left => { var pq = new BoundedPriorityQueue[(InternalRow, Double)](k)(new DisOrdering) right_data.foreach(right => pq += ((right._2, right._1.minDist(left._1)))) joined_ans += ((left._2, pq.toArray)) }) joined_ans.iterator }.reduceByKey((left, right) => (left ++ right).sortWith(_._2 < _._2).take(k), num_partitions) .flatMap { now => now._2.map(x => new JoinedRow(now._1, x._1)) } } override def children: Seq[SparkPlan] = Seq(left, right) }
Example 112
Source File: BKJSparkR.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.execution.join import org.apache.spark.sql.simba.execution.SimbaPlan import org.apache.spark.sql.simba.index.RTree import org.apache.spark.sql.simba.partitioner.MapDPartition import org.apache.spark.sql.simba.spatial.Point import org.apache.spark.sql.simba.util.ShapeUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal} import org.apache.spark.sql.execution.SparkPlan import scala.collection.mutable import scala.util.Random case class BKJSparkR(left_key: Expression, right_key: Expression, l: Literal, left: SparkPlan, right: SparkPlan) extends SimbaPlan { override def output: Seq[Attribute] = left.output ++ right.output final val num_partitions = simbaSessionState.simbaConf.joinPartitions final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode final val k = l.value.asInstanceOf[Number].intValue() private class DisOrdering extends Ordering[(InternalRow, Double)] { override def compare(x : (InternalRow, Double), y: (InternalRow, Double)): Int = -x._2.compare(y._2) } override protected def doExecute(): RDD[InternalRow] = { val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _))) val tot_dup_rdd = tot_rdd.flatMap {x => val rand_no = new Random().nextInt(num_partitions) val ans = mutable.ListBuffer[(Int, (Int, InternalRow))]() if (x._1 == 0) { val base = rand_no * num_partitions for (i <- 0 until num_partitions) ans += ((base + i, x)) } else { for (i <- 0 until num_partitions) ans += ((i * num_partitions + rand_no, x)) } ans } val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions) tot_dup_partitioned.mapPartitions {iter => var left_data = mutable.ListBuffer[(Point, InternalRow)]() var right_data = mutable.ListBuffer[(Point, InternalRow)]() while (iter.hasNext) { val data = iter.next() if (data._2._1 == 0) { val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point] left_data += ((tmp_point, data._2._2)) } else { val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point] right_data += ((tmp_point, data._2._2)) } } val joined_ans = mutable.ListBuffer[(InternalRow, Array[(InternalRow, Double)])]() if (right_data.nonEmpty) { val right_rtree = RTree(right_data.map(_._1).zipWithIndex.toArray, max_entries_per_node) left_data.foreach(left => joined_ans += ((left._2, right_rtree.kNN(left._1, k, keepSame = false) .map(x => (right_data(x._2)._2, x._1.minDist(left._1))))) ) } joined_ans.iterator }.reduceByKey((left, right) => (left ++ right).sortWith(_._2 < _._2).take(k), num_partitions) .flatMap { now => now._2.map(x => new JoinedRow(now._1, x._1)) } } override def children: Seq[SparkPlan] = Seq(left, right) }
Example 113
Source File: BDJSparkR.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.execution.join import org.apache.spark.sql.simba.execution.SimbaPlan import org.apache.spark.sql.simba.index.RTree import org.apache.spark.sql.simba.partitioner.MapDPartition import org.apache.spark.sql.simba.spatial.Point import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal} import org.apache.spark.sql.execution.SparkPlan import scala.collection.mutable import scala.util.Random case class BDJSparkR(left_key: Expression, right_key: Expression, l: Literal, left: SparkPlan, right: SparkPlan) extends SimbaPlan { override def output: Seq[Attribute] = left.output ++ right.output final val num_partitions = simbaSessionState.simbaConf.joinPartitions final val r = NumberUtil.literalToDouble(l) final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode override protected def doExecute(): RDD[InternalRow] = { val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _))) val tot_dup_rdd = tot_rdd.flatMap {x => val rand_no = new Random().nextInt(num_partitions) var ans = mutable.ListBuffer[(Int, (Int, InternalRow))]() if (x._1 == 0) { val base = rand_no * num_partitions for (i <- 0 until num_partitions) ans += ((base + i, x)) } else { for (i <- 0 until num_partitions) ans += ((i * num_partitions + rand_no, x)) } ans } val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions) tot_dup_partitioned.mapPartitions {iter => var left_data = mutable.ListBuffer[(Point, InternalRow)]() var right_data = mutable.ListBuffer[(Point, InternalRow)]() while (iter.hasNext) { val data = iter.next() if (data._2._1 == 0) { val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point] left_data += ((tmp_point, data._2._2)) } else { val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point] right_data += ((tmp_point, data._2._2)) } } val joined_ans = mutable.ListBuffer[InternalRow]() if (right_data.nonEmpty) { val right_rtree = RTree(right_data.map(_._1).zipWithIndex.toArray, max_entries_per_node) left_data.foreach(left => right_rtree.circleRange(left._1, r) .foreach(x => joined_ans += new JoinedRow(left._2, right_data(x._2)._2))) } joined_ans.iterator } } override def children: Seq[SparkPlan] = Seq(left, right) }
Example 114
Source File: DJSpark.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.execution.join import org.apache.spark.sql.simba.execution.SimbaPlan import org.apache.spark.sql.simba.index.RTree import org.apache.spark.sql.simba.partitioner.{MapDPartition, STRPartition} import org.apache.spark.sql.simba.spatial.Point import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal} import org.apache.spark.sql.execution.SparkPlan import scala.collection.mutable case class DJSpark(left_key: Expression, right_key: Expression, l: Literal, left: SparkPlan, right: SparkPlan) extends SimbaPlan { override def output: Seq[Attribute] = left.output ++ right.output final val num_partitions = simbaSessionState.simbaConf.joinPartitions final val sample_rate = simbaSessionState.simbaConf.sampleRate final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode final val transfer_threshold = simbaSessionState.simbaConf.transferThreshold final val r = NumberUtil.literalToDouble(l) override protected def doExecute(): RDD[InternalRow] = { val left_rdd = left.execute().map(row => (ShapeUtils.getShape(left_key, left.output, row).asInstanceOf[Point], row) ) val right_rdd = right.execute().map(row => (ShapeUtils.getShape(right_key, right.output, row).asInstanceOf[Point], row) ) val dimension = right_rdd.first()._1.coord.length val (left_partitioned, left_mbr_bound) = STRPartition(left_rdd, dimension, num_partitions, sample_rate, transfer_threshold, max_entries_per_node) val (right_partitioned, right_mbr_bound) = STRPartition(right_rdd, dimension, num_partitions, sample_rate, transfer_threshold, max_entries_per_node) val right_rt = RTree(right_mbr_bound.zip(Array.fill[Int](right_mbr_bound.length)(0)) .map(x => (x._1._1, x._1._2, x._2)), max_entries_per_node) val left_dup = new Array[Array[Int]](left_mbr_bound.length) val right_dup = new Array[Array[Int]](right_mbr_bound.length) var tot = 0 left_mbr_bound.foreach { now => val res = right_rt.circleRange(now._1, r) val tmp_arr = mutable.ArrayBuffer[Int]() res.foreach {x => if (right_dup(x._2) == null) right_dup(x._2) = Array(tot) else right_dup(x._2) = right_dup(x._2) :+ tot tmp_arr += tot tot += 1 } left_dup(now._2) = tmp_arr.toArray } val bc_left_dup = sparkContext.broadcast(left_dup) val bc_right_dup = sparkContext.broadcast(right_dup) val left_dup_rdd = left_partitioned.mapPartitionsWithIndex { (id, iter) => iter.flatMap {now => val tmp_list = bc_left_dup.value(id) if (tmp_list != null) tmp_list.map(x => (x, now)) else Array[(Int, (Point, InternalRow))]() } } val right_dup_rdd = right_partitioned.mapPartitionsWithIndex { (id, iter) => iter.flatMap {now => val tmp_list = bc_right_dup.value(id) if (tmp_list != null) tmp_list.map(x => (x, now)) else Array[(Int, (Point, InternalRow))]() } } val left_dup_partitioned = MapDPartition(left_dup_rdd, tot).map(_._2) val right_dup_partitioned = MapDPartition(right_dup_rdd, tot).map(_._2) left_dup_partitioned.zipPartitions(right_dup_partitioned) {(leftIter, rightIter) => val ans = mutable.ListBuffer[InternalRow]() val right_data = rightIter.toArray if (right_data.nonEmpty) { val right_index = RTree(right_data.map(_._1).zipWithIndex, max_entries_per_node) leftIter.foreach {now => ans ++= right_index.circleRange(now._1, r) .map(x => new JoinedRow(now._2, right_data(x._2)._2)) } } ans.iterator } } override def children: Seq[SparkPlan] = Seq(left, right) }
Example 115
Source File: RDJSpark.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.index.RTree
import org.apache.spark.sql.simba.partitioner.{MapDPartition, STRPartition}
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable

case class RDJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {
  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val sample_rate = simbaSessionState.simbaConf.sampleRate
  final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode
  final val transfer_threshold = simbaSessionState.simbaConf.transferThreshold
  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] = {
    val left_rdd = left.execute().map(row =>
      (ShapeUtils.getShape(left_key, left.output, row).asInstanceOf[Point], row)
    )

    val right_rdd = right.execute().map(row =>
      (ShapeUtils.getShape(right_key, right.output, row).asInstanceOf[Point], row)
    )

    val dimension = right_rdd.first()._1.coord.length

    val (left_partitioned, left_mbr_bound) = STRPartition(left_rdd, dimension, num_partitions,
      sample_rate, transfer_threshold, max_entries_per_node)

    val left_part_size = left_partitioned.mapPartitions { iter =>
      Array(iter.length).iterator
    }.collect()

    val left_rt = RTree(left_mbr_bound.zip(left_part_size).map(x => (x._1._1, x._1._2, x._2)),
      max_entries_per_node)
    val bc_rt = sparkContext.broadcast(left_rt)

    val right_dup = right_rdd.flatMap { x =>
      bc_rt.value.circleRange(x._1, r).map(now => (now._2, x))
    }

    val right_dup_partitioned = MapDPartition(right_dup, left_mbr_bound.length)

    left_partitioned.zipPartitions(right_dup_partitioned) { (leftIter, rightIter) =>
      val ans = mutable.ListBuffer[InternalRow]()
      val right_data = rightIter.map(_._2).toArray
      if (right_data.length > 0) {
        val right_index = RTree(right_data.map(_._1).zipWithIndex, max_entries_per_node)
        leftIter.foreach { now =>
          ans ++= right_index.circleRange(now._1, r)
            .map(x => new JoinedRow(now._2, right_data(x._2)._2))
        }
      }
      ans.iterator
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
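RDJSpark takes a different route from DJSpark: only the left input is STR-partitioned, an R-tree over the left partition MBRs (annotated with partition sizes) is broadcast, and each right row is routed to every left partition whose MBR lies within r before the same local R-tree join runs. The sketch below is purely illustrative of how a planner rule might switch between the two operators defined in these examples; the selection predicate is an assumption, not a Simba configuration.

// Illustrative only: `preferBroadcastIndex` is an assumed decision flag, not part
// of Simba. Both constructors match the case classes shown above.
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal}
import org.apache.spark.sql.execution.SparkPlan

def chooseDistanceJoin(leftKey: Expression, rightKey: Expression, radius: Literal,
                       leftPlan: SparkPlan, rightPlan: SparkPlan,
                       preferBroadcastIndex: Boolean): SparkPlan =
  if (preferBroadcastIndex) RDJSpark(leftKey, rightKey, radius, leftPlan, rightPlan)
  else DJSpark(leftKey, rightKey, radius, leftPlan, rightPlan)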
Example 116
Source File: CKJSpark.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.SparkPlan

case class CKJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {
  override def outputPartitioning: Partitioning = left.outputPartitioning

  override def output: Seq[Attribute] = left.output ++ right.output

  final val k = l.value.asInstanceOf[Number].intValue()

  override protected def doExecute(): RDD[InternalRow] = {
    val left_rdd = left.execute()
    val right_rdd = right.execute()
    left_rdd.map(row =>
      (ShapeUtils.getShape(left_key, left.output, row).asInstanceOf[Point], row)
    ).cartesian(right_rdd).map { case (l: (Point, InternalRow), r: InternalRow) =>
      val tmp_point = ShapeUtils.getShape(right_key, right.output, r).asInstanceOf[Point]
      l._2 -> List((tmp_point.minDist(l._1), r))
    }.reduceByKey {
      case (l_list: Seq[(Double, InternalRow)], r_list: Seq[(Double, InternalRow)]) =>
        (l_list ++ r_list).sortWith(_._1 < _._1).take(k)
    }.flatMapValues(list => list).mapPartitions { iter =>
      val joinedRow = new JoinedRow
      iter.map(r => joinedRow(r._1, r._2._2))
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
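CKJSpark computes a kNN join by taking the Cartesian product of the two sides, attaching the distance to each candidate right row, and then reducing per left row so that only the k smallest distances survive. The sketch below pulls that per-key merge out as a standalone helper to make the reduce step explicit; the helper name is an assumption for illustration, the operator inlines the same logic inside reduceByKey above.

// A sketch of the per-key merge CKJSpark performs inside reduceByKey: combine two
// candidate lists and keep only the k nearest. Keeping at most k elements per
// partial merge bounds the state carried through the shuffle.
import org.apache.spark.sql.catalyst.InternalRow

def mergeTopK(k: Int)(a: List[(Double, InternalRow)],
                      b: List[(Double, InternalRow)]): List[(Double, InternalRow)] =
  (a ++ b).sortWith(_._1 < _._1).take(k)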
Example 117
Source File: CDJSpark.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.SparkPlan

case class CDJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SparkPlan {
  override def outputPartitioning: Partitioning = left.outputPartitioning

  override def output: Seq[Attribute] = left.output ++ right.output

  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] =
    left.execute().cartesian(right.execute()).mapPartitions { iter =>
      val joinedRow = new JoinedRow
      iter.filter { row =>
        val point1 = ShapeUtils.getShape(left_key, left.output, row._1).asInstanceOf[Point]
        val point2 = ShapeUtils.getShape(right_key, right.output, row._2).asInstanceOf[Point]
        point1.minDist(point2) <= r
      }.map(row => joinedRow(row._1, row._2))
    }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
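CDJSpark is the brute-force baseline among these join operators: it materializes the full Cartesian product and keeps only the pairs whose points lie within distance r, with no partitioning or indexing. The sketch below isolates the pair-level predicate it applies; the function name and parameter layout are assumptions for illustration, while the body mirrors the filter in doExecute() above.

// A sketch of CDJSpark's per-pair test: evaluate both point keys against the
// respective child outputs and keep the pair if the points are within `radius`.
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}

def withinRadius(leftKey: Expression, rightKey: Expression, radius: Double,
                 leftOut: Seq[Attribute], rightOut: Seq[Attribute])
                (lRow: InternalRow, rRow: InternalRow): Boolean = {
  val p1 = ShapeUtils.getShape(leftKey, leftOut, lRow).asInstanceOf[Point]
  val p2 = ShapeUtils.getShape(rightKey, rightOut, rRow).asInstanceOf[Point]
  p1.minDist(p2) <= radius
}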