org.apache.spark.sql.catalyst.expressions.SortOrder Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.expressions.SortOrder.
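Before the project-specific examples, here is a minimal standalone sketch of constructing a SortOrder and turning it into a row ordering, much as the operators below do. The column name and schema are invented for illustration; the two-argument SortOrder(child, direction) form shown here is the one used throughout the examples.

import org.apache.spark.sql.catalyst.expressions.{Ascending, AttributeReference, SortOrder}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering
import org.apache.spark.sql.types.IntegerType

// A hypothetical integer column to sort on.
val idColumn = AttributeReference("id", IntegerType)()

// An ascending sort on that column.
val order = SortOrder(idColumn, Ascending)

// Physical operators typically turn a Seq[SortOrder] into an ordering over
// InternalRow for a given output schema, e.g. via code generation.
val rowOrdering = GenerateOrdering.generate(Seq(order), Seq(idColumn))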
Example 1
Source File: CarbonDataSourceScan.scala (from carbondata, Apache License 2.0)

package org.apache.spark.sql.execution.strategy

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
import org.apache.spark.sql.execution.FileSourceScanExec
import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}

class CarbonDataSourceScan(
    override val output: Seq[Attribute],
    val rdd: RDD[InternalRow],
    @transient override val relation: HadoopFsRelation,
    val partitioning: Partitioning,
    val md: Map[String, String],
    identifier: Option[TableIdentifier],
    @transient private val logicalRelation: LogicalRelation)
  extends FileSourceScanExec(
    relation,
    output,
    relation.dataSchema,
    Seq.empty,
    Seq.empty,
    identifier) {

  // added lazy since spark 2.3.2 version (SPARK-PR#21815)
  override lazy val supportsBatch: Boolean = true

  // added lazy since spark 2.3.2 version (SPARK-PR#21815)
  override lazy val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) =
    (partitioning, Nil)

  // added lazy since spark 2.3.2 version (SPARK-PR#21815)
  override lazy val metadata: Map[String, String] = md

  override def inputRDDs(): Seq[RDD[InternalRow]] = rdd :: Nil
}
Example 2
Source File: ReferenceSort.scala (from BigDatalog, Apache License 2.0)

package org.apache.spark.sql.execution

import org.apache.spark.{InternalAccumulator, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.errors._
import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.ExternalSorter

case class ReferenceSort(
    sortOrder: Seq[SortOrder],
    global: Boolean,
    child: SparkPlan)
  extends UnaryNode {

  override def requiredChildDistribution: Seq[Distribution] =
    if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil

  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
    child.execute().mapPartitions( { iterator =>
      val ordering = newOrdering(sortOrder, child.output)
      val sorter = new ExternalSorter[InternalRow, Null, InternalRow](
        TaskContext.get(), ordering = Some(ordering))
      sorter.insertAll(iterator.map(r => (r.copy(), null)))
      val baseIterator = sorter.iterator.map(_._1)
      val context = TaskContext.get()
      context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled)
      context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled)
      context.internalMetricsToAccumulators(
        InternalAccumulator.PEAK_EXECUTION_MEMORY).add(sorter.peakMemoryUsedBytes)
      CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop())
    }, preservesPartitioning = true)
  }

  override def output: Seq[Attribute] = child.output

  override def outputOrdering: Seq[SortOrder] = sortOrder
}
Example 3
Source File: TakeOrderedAndProjectNodeSuite.scala (from BigDatalog, Apache License 2.0)

package org.apache.spark.sql.execution.local

import scala.util.Random

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.SortOrder

class TakeOrderedAndProjectNodeSuite extends LocalNodeTest {

  private def testTakeOrderedAndProject(desc: Boolean): Unit = {
    val limit = 10
    val ascOrDesc = if (desc) "desc" else "asc"
    test(ascOrDesc) {
      val inputData = Random.shuffle((1 to 100).toList).map { i => (i, i) }.toArray
      val inputNode = new DummyNode(kvIntAttributes, inputData)
      val firstColumn = inputNode.output(0)
      val sortDirection = if (desc) Descending else Ascending
      val sortOrder = SortOrder(firstColumn, sortDirection)
      val takeOrderAndProjectNode = new TakeOrderedAndProjectNode(
        conf, limit, Seq(sortOrder), Some(Seq(firstColumn)), inputNode)
      val expectedOutput = inputData
        .map { case (k, _) => k }
        .sortBy { k => k * (if (desc) -1 else 1) }
        .take(limit)
      val actualOutput = takeOrderAndProjectNode.collect().map { row => row.getInt(0) }
      assert(actualOutput === expectedOutput)
    }
  }

  testTakeOrderedAndProject(desc = false)
  testTakeOrderedAndProject(desc = true)
}
Example 4
Source File: CoGroupedIterator.scala (from BigDatalog, Apache License 2.0)

package org.apache.spark.sql.execution

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Ascending, SortOrder, Attribute}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering

class CoGroupedIterator(
    left: Iterator[(InternalRow, Iterator[InternalRow])],
    right: Iterator[(InternalRow, Iterator[InternalRow])],
    groupingSchema: Seq[Attribute])
  extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] {

  private val keyOrdering =
    GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema)

  private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _
  private var currentRightData: (InternalRow, Iterator[InternalRow]) = _

  override def hasNext: Boolean = {
    if (currentLeftData == null && left.hasNext) {
      currentLeftData = left.next()
    }
    if (currentRightData == null && right.hasNext) {
      currentRightData = right.next()
    }
    currentLeftData != null || currentRightData != null
  }

  override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = {
    assert(hasNext)

    if (currentLeftData.eq(null)) {
      // left is null, right is not null, consume the right data.
      rightOnly()
    } else if (currentRightData.eq(null)) {
      // left is not null, right is null, consume the left data.
      leftOnly()
    } else if (currentLeftData._1 == currentRightData._1) {
      // left and right have the same grouping key, consume both of them.
      val result = (currentLeftData._1, currentLeftData._2, currentRightData._2)
      currentLeftData = null
      currentRightData = null
      result
    } else {
      val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1)
      assert(compare != 0)
      if (compare < 0) {
        // the grouping key of left is smaller, consume the left data.
        leftOnly()
      } else {
        // the grouping key of right is smaller, consume the right data.
        rightOnly()
      }
    }
  }

  private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = {
    val result = (currentLeftData._1, currentLeftData._2, Iterator.empty)
    currentLeftData = null
    result
  }

  private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = {
    val result = (currentRightData._1, Iterator.empty, currentRightData._2)
    currentRightData = null
    result
  }
}
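To make the contract of this iterator concrete, here is a rough usage sketch (not from the project): both inputs must already be grouped and sorted ascending by the grouping key, and the side that lacks a key is padded with an empty iterator. The column name and the literal rows are invented for illustration.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.types.IntegerType

// Hypothetical single-column grouping key.
val key = AttributeReference("k", IntegerType)()

// Pre-grouped, key-sorted inputs of (key row, iterator of rows in that group).
val left = Iterator(
  (InternalRow(1), Iterator(InternalRow(1, 10))),
  (InternalRow(3), Iterator(InternalRow(3, 30))))
val right = Iterator(
  (InternalRow(1), Iterator(InternalRow(1, 100))),
  (InternalRow(2), Iterator(InternalRow(2, 200))))

// Emits (key, left group, right group) for keys 1, 2 and 3 in ascending key
// order, using an empty iterator for the side that does not contain the key.
val cogrouped = new CoGroupedIterator(left, right, Seq(key))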
Example 5
Source File: StarryTakeOrderedAndProjectExec.scala (from starry, Apache License 2.0)

package org.apache.spark.sql.execution.exchange

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering
import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression, SortOrder, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition}
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.util.Utils

case class StarryTakeOrderedAndProjectExec(
    limit: Int,
    sortOrder: Seq[SortOrder],
    projectList: Seq[NamedExpression],
    child: SparkPlan) extends UnaryExecNode {

  override def output: Seq[Attribute] = {
    projectList.map(_.toAttribute)
  }

  override def executeCollect(): Array[InternalRow] = {
    val ord = new LazilyGeneratedOrdering(sortOrder, child.output)
    val data = child.execute().map(_.copy()).takeOrdered(limit)(ord)
    if (projectList != child.output) {
      val proj = UnsafeProjection.create(projectList, child.output)
      data.map(r => proj(r).copy())
    } else {
      data
    }
  }

  protected override def doExecute(): RDD[InternalRow] = {
    val ord = new LazilyGeneratedOrdering(sortOrder, child.output)
    val localTopK: RDD[InternalRow] = {
      child.execute().map(_.copy()).mapPartitions { iter =>
        org.apache.spark.util.collection.Utils.takeOrdered(iter, limit)(ord)
      }
    }
    localTopK.mapPartitions { iter =>
      val topK =
        org.apache.spark.util.collection.Utils.takeOrdered(iter.map(_.copy()), limit)(ord)
      if (projectList != child.output) {
        val proj = UnsafeProjection.create(projectList, child.output)
        topK.map(r => proj(r))
      } else {
        topK
      }
    }
  }

  override def outputOrdering: Seq[SortOrder] = sortOrder

  override def outputPartitioning: Partitioning = SinglePartition

  override def simpleString: String = {
    val orderByString = Utils.truncatedString(sortOrder, "[", ",", "]")
    val outputString = Utils.truncatedString(output, "[", ",", "]")
    s"TakeOrderedAndProject(limit=$limit, orderBy=$orderByString, output=$outputString)"
  }
}
Example 6
Source File: ReferenceSort.scala (from Spark-2.3.1, Apache License 2.0)

package org.apache.spark.sql.execution

import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.errors._
import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.ExternalSorter

case class ReferenceSort(
    sortOrder: Seq[SortOrder],
    global: Boolean,
    child: SparkPlan)
  extends UnaryExecNode {

  override def requiredChildDistribution: Seq[Distribution] =
    if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil

  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
    child.execute().mapPartitions( { iterator =>
      val ordering = newOrdering(sortOrder, child.output)
      val sorter = new ExternalSorter[InternalRow, Null, InternalRow](
        TaskContext.get(), ordering = Some(ordering))
      sorter.insertAll(iterator.map(r => (r.copy(), null)))
      val baseIterator = sorter.iterator.map(_._1)
      val context = TaskContext.get()
      context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled)
      context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled)
      context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes)
      CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop())
    }, preservesPartitioning = true)
  }

  override def output: Seq[Attribute] = child.output

  override def outputOrdering: Seq[SortOrder] = sortOrder

  override def outputPartitioning: Partitioning = child.outputPartitioning
}
Example 7
Source File: CoGroupedIterator.scala (from Spark-2.3.1, Apache License 2.0)

package org.apache.spark.sql.execution

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, SortOrder}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering

class CoGroupedIterator(
    left: Iterator[(InternalRow, Iterator[InternalRow])],
    right: Iterator[(InternalRow, Iterator[InternalRow])],
    groupingSchema: Seq[Attribute])
  extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] {

  private val keyOrdering =
    GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema)

  private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _
  private var currentRightData: (InternalRow, Iterator[InternalRow]) = _

  override def hasNext: Boolean = {
    if (currentLeftData == null && left.hasNext) {
      currentLeftData = left.next()
    }
    if (currentRightData == null && right.hasNext) {
      currentRightData = right.next()
    }
    currentLeftData != null || currentRightData != null
  }

  override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = {
    assert(hasNext)

    if (currentLeftData.eq(null)) {
      // left is null, right is not null, consume the right data.
      rightOnly()
    } else if (currentRightData.eq(null)) {
      // left is not null, right is null, consume the left data.
      leftOnly()
    } else if (currentLeftData._1 == currentRightData._1) {
      // left and right have the same grouping key, consume both of them.
      val result = (currentLeftData._1, currentLeftData._2, currentRightData._2)
      currentLeftData = null
      currentRightData = null
      result
    } else {
      val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1)
      assert(compare != 0)
      if (compare < 0) {
        // the grouping key of left is smaller, consume the left data.
        leftOnly()
      } else {
        // the grouping key of right is smaller, consume the right data.
        rightOnly()
      }
    }
  }

  private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = {
    val result = (currentLeftData._1, currentLeftData._2, Iterator.empty)
    currentLeftData = null
    result
  }

  private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = {
    val result = (currentRightData._1, Iterator.empty, currentRightData._2)
    currentRightData = null
    result
  }
}
Example 8
Source File: Exchange.scala (from Spark-2.3.1, Apache License 2.0)

package org.apache.spark.sql.execution.exchange

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      return plan
    }
    // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls.
    val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
    plan.transformUp { case exchange: Exchange =>
      // the exchanges that have same results usually also have same schemas (same column names).
      val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
      val samePlan = sameSchema.find { e =>
        exchange.sameResult(e)
      }
      if (samePlan.isDefined) {
        // Keep the output of this exchange, the following plans require that to resolve
        // attributes.
        ReusedExchangeExec(exchange.output, samePlan.get)
      } else {
        sameSchema += exchange
        exchange
      }
    }
  }
}
Example 9
Source File: SubstituteUnresolvedOrdinals.scala (from Spark-2.3.1, Apache License 2.0)

package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder}
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Sort}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.IntegerType

class SubstituteUnresolvedOrdinals(conf: SQLConf) extends Rule[LogicalPlan] {

  private def isIntLiteral(e: Expression) = e match {
    case Literal(_, IntegerType) => true
    case _ => false
  }

  def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
    case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) =>
      val newOrders = s.order.map {
        case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _, _) =>
          val newOrdinal = withOrigin(ordinal.origin)(UnresolvedOrdinal(index))
          withOrigin(order.origin)(order.copy(child = newOrdinal))
        case other => other
      }
      withOrigin(s.origin)(s.copy(order = newOrders))

    case a: Aggregate if conf.groupByOrdinal && a.groupingExpressions.exists(isIntLiteral) =>
      val newGroups = a.groupingExpressions.map {
        case ordinal @ Literal(index: Int, IntegerType) =>
          withOrigin(ordinal.origin)(UnresolvedOrdinal(index))
        case other => other
      }
      withOrigin(a.origin)(a.copy(groupingExpressions = newGroups))
  }
}
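As a rough illustration of what this rule does (a sketch with hypothetical column names, not part of the project): an ORDER BY that arrives from the parser as a sort on the integer literal 2 is rewritten so the literal becomes UnresolvedOrdinal(2), which the analyzer later resolves to the second output column.

import org.apache.spark.sql.catalyst.analysis.{SubstituteUnresolvedOrdinals, UnresolvedOrdinal}
import org.apache.spark.sql.catalyst.expressions.{Ascending, AttributeReference, Literal, SortOrder}
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Sort}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.IntegerType

// Hypothetical two-column relation.
val a = AttributeReference("a", IntegerType)()
val b = AttributeReference("b", IntegerType)()
val relation = LocalRelation(a, b)

// "ORDER BY 2" as produced by the parser: a sort on the literal 2.
val parsed = Sort(Seq(SortOrder(Literal(2), Ascending)), global = true, relation)

// After the rule runs, the sort child is UnresolvedOrdinal(2), which the
// analyzer later resolves to the second output column (b).
val rewritten = new SubstituteUnresolvedOrdinals(SQLConf.get).apply(parsed)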
Example 10
Source File: ReferenceSort.scala (from multi-tenancy-spark, Apache License 2.0). The code is identical to Example 6 above.
Example 11
Source File: CoGroupedIterator.scala (from multi-tenancy-spark, Apache License 2.0). The code is identical to Example 7 above.
Example 12
Source File: SubstituteUnresolvedOrdinals.scala (from multi-tenancy-spark, Apache License 2.0)

package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.catalyst.CatalystConf
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder}
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Sort}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin
import org.apache.spark.sql.types.IntegerType

class SubstituteUnresolvedOrdinals(conf: CatalystConf) extends Rule[LogicalPlan] {

  private def isIntLiteral(e: Expression) = e match {
    case Literal(_, IntegerType) => true
    case _ => false
  }

  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) =>
      val newOrders = s.order.map {
        case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _) =>
          val newOrdinal = withOrigin(ordinal.origin)(UnresolvedOrdinal(index))
          withOrigin(order.origin)(order.copy(child = newOrdinal))
        case other => other
      }
      withOrigin(s.origin)(s.copy(order = newOrders))

    case a: Aggregate if conf.groupByOrdinal && a.groupingExpressions.exists(isIntLiteral) =>
      val newGroups = a.groupingExpressions.map {
        case ordinal @ Literal(index: Int, IntegerType) =>
          withOrigin(ordinal.origin)(UnresolvedOrdinal(index))
        case other => other
      }
      withOrigin(a.origin)(a.copy(groupingExpressions = newGroups))
  }
}
Example 13
Source File: DeltaInvariantCheckerExec.scala (from delta, Apache License 2.0)

package org.apache.spark.sql.delta.schema

import org.apache.spark.sql.delta.DeltaErrors
import org.apache.spark.sql.delta.schema.Invariants.NotNull
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BindReferences, Expression, GetStructField, Literal, SortOrder}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.types.{NullType, StructType}

// Excerpt: the enclosing physical operator, which provides `output`, `invariants`
// and `child`, is not included in this listing.

  private def buildExtractors(invariant: Invariant): Option[Expression] = {
    assert(invariant.column.nonEmpty)
    val topLevelColumn = invariant.column.head
    val topLevelRefOpt = output.collectFirst {
      case a: AttributeReference if SchemaUtils.DELTA_COL_RESOLVER(a.name, topLevelColumn) => a
    }
    val rejectColumnNotFound = isNullNotOkay(invariant)
    if (topLevelRefOpt.isEmpty) {
      if (rejectColumnNotFound) {
        throw DeltaErrors.notNullInvariantException(invariant)
      }
    }

    if (invariant.column.length == 1) {
      topLevelRefOpt.map(BindReferences.bindReference[Expression](_, output))
    } else {
      topLevelRefOpt.flatMap { topLevelRef =>
        val boundTopLevel = BindReferences.bindReference[Expression](topLevelRef, output)
        try {
          val nested = invariant.column.tail.foldLeft(boundTopLevel) { case (e, fieldName) =>
            e.dataType match {
              case StructType(fields) =>
                val ordinal = fields.indexWhere(f =>
                  SchemaUtils.DELTA_COL_RESOLVER(f.name, fieldName))
                if (ordinal == -1) {
                  throw new IndexOutOfBoundsException(s"Not nullable column not found in struct: " +
                    s"${fields.map(_.name).mkString("[", ",", "]")}")
                }
                GetStructField(e, ordinal, Some(fieldName))
              case _ =>
                throw new UnsupportedOperationException(
                  "Invariants on nested fields other than StructTypes are not supported.")
            }
          }
          Some(nested)
        } catch {
          case i: IndexOutOfBoundsException if rejectColumnNotFound =>
            throw InvariantViolationException(invariant, i.getMessage)
          case _: IndexOutOfBoundsException if !rejectColumnNotFound => None
        }
      }
    }
  }

  override protected def doExecute(): RDD[InternalRow] = {
    if (invariants.isEmpty) return child.execute()
    val boundRefs = invariants.map { invariant =>
      CheckDeltaInvariant(buildExtractors(invariant).getOrElse(Literal(null, NullType)), invariant)
    }

    child.execute().mapPartitionsInternal { rows =>
      val assertions = GenerateUnsafeProjection.generate(boundRefs)
      rows.map { row =>
        assertions(row)
        row
      }
    }
  }

  override def outputOrdering: Seq[SortOrder] = child.outputOrdering

  override def outputPartitioning: Partitioning = child.outputPartitioning
}
Example 14
Source File: CarbonDataSourceScan.scala (from carbondata, Apache License 2.0)

package org.apache.spark.sql.execution.strategy

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
import org.apache.spark.sql.execution.FileSourceScanExec
import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}

class CarbonDataSourceScan(
    override val output: Seq[Attribute],
    val rdd: RDD[InternalRow],
    @transient override val relation: HadoopFsRelation,
    val partitioning: Partitioning,
    val md: Map[String, String],
    identifier: Option[TableIdentifier],
    @transient private val logicalRelation: LogicalRelation)
  extends FileSourceScanExec(
    relation,
    output,
    relation.dataSchema,
    Seq.empty,
    None,
    Seq.empty,
    identifier) {

  // added lazy since spark 2.3.2 version (SPARK-PR#21815)
  override lazy val supportsBatch: Boolean = true

  // added lazy since spark 2.3.2 version (SPARK-PR#21815)
  override lazy val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) =
    (partitioning, Nil)

  // added lazy since spark 2.3.2 version (SPARK-PR#21815)
  override lazy val metadata: Map[String, String] = md

  override def inputRDDs(): Seq[RDD[InternalRow]] = rdd :: Nil
}
Example 15
Source File: SubstituteUnresolvedOrdinals.scala (from drizzle-spark, Apache License 2.0). The code is identical to Example 12 above.
Example 16
Source File: ReferenceSort.scala (from sparkoscope, Apache License 2.0). The code is identical to Example 6 above.
Example 17
Source File: CoGroupedIterator.scala (from sparkoscope, Apache License 2.0). The code is identical to Example 7 above.
Example 18
Source File: SubstituteUnresolvedOrdinals.scala (from sparkoscope, Apache License 2.0). The code is identical to Example 12 above.
Example 19
Source File: ColumnarSortExec.scala (from OAP, Apache License 2.0)

package com.intel.sparkColumnarPlugin.execution

import com.intel.sparkColumnarPlugin.expression._
import com.intel.sparkColumnarPlugin.vectorized._

import java.util.concurrent.TimeUnit._

import org.apache.spark.{SparkEnv, TaskContext, SparkContext}
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.sql.execution._
import org.apache.spark.sql.catalyst.expressions.SortOrder
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}

class ColumnarSortExec(
    sortOrder: Seq[SortOrder],
    global: Boolean,
    child: SparkPlan,
    testSpillFrequency: Int = 0)
  extends SortExec(sortOrder, global, child, testSpillFrequency) {

  override def supportsColumnar = true

  // Disable code generation
  override def supportCodegen: Boolean = false

  override lazy val metrics = Map(
    "totalSortTime" -> SQLMetrics
      .createTimingMetric(sparkContext, "time in sort + shuffle process"),
    "sortTime" -> SQLMetrics.createTimingMetric(sparkContext, "time in sort process"),
    "shuffleTime" -> SQLMetrics.createTimingMetric(sparkContext, "time in shuffle process"),
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
    "numOutputBatches" -> SQLMetrics.createMetric(sparkContext, "number of output batches"))

  override def doExecuteColumnar(): RDD[ColumnarBatch] = {
    val elapse = longMetric("totalSortTime")
    val sortTime = longMetric("sortTime")
    val shuffleTime = longMetric("shuffleTime")
    val numOutputRows = longMetric("numOutputRows")
    val numOutputBatches = longMetric("numOutputBatches")
    child.executeColumnar().mapPartitions { iter =>
      val hasInput = iter.hasNext
      val res = if (!hasInput) {
        Iterator.empty
      } else {
        val sorter = ColumnarSorter.create(
          sortOrder,
          true,
          child.output,
          sortTime,
          numOutputBatches,
          numOutputRows,
          shuffleTime,
          elapse)
        TaskContext
          .get()
          .addTaskCompletionListener[Unit](_ => {
            sorter.close()
          })
        new CloseableColumnBatchIterator(sorter.createColumnarIterator(iter))
      }
      res
    }
  }
}
Example 20
Source File: HierarchyPlan.scala (from HANAVora-Extensions, Apache License 2.0)

package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst._
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, SortOrder}
import org.apache.spark.sql.catalyst.plans.logical.LevelMatcher
import org.apache.spark.sql.hierarchy._
import org.apache.spark.sql.types._
import org.apache.spark.sql.util.RddUtils

// Excerpt: the abstract HierarchyPlan class that contains the following fragment
// is not included in this listing.

    val schemaWithNode =
      StructType(child.schema.fields ++ Seq(StructField("", NodeType, nullable = false)))
    val resultInternalRdd =
      RDDConversions.rowToRowRdd(cachedResultRdd, schemaWithNode.fields.map(_.dataType))
    resultInternalRdd
  }
}

private[sql] case class AdjacencyListHierarchyPlan(child: SparkPlan,
                                                   parenthoodExp: Expression,
                                                   startWhere: Option[Expression],
                                                   orderBy: Seq[SortOrder],
                                                   node: Attribute,
                                                   dataType: DataType)
  extends HierarchyPlan(child, node) {

  override protected val builder: HierarchyBuilder[Row, Row] =
    HierarchyRowBroadcastBuilder(child.output, parenthoodExp, startWhere, orderBy)

  override protected val pathDataType = dataType
}

private[sql] case class LevelHierarchyPlan(child: SparkPlan,
                                           levels: Seq[Expression],
                                           startWhere: Option[Expression],
                                           orderBy: Seq[SortOrder],
                                           matcher: LevelMatcher,
                                           node: Attribute,
                                           dataType: DataType)
  extends HierarchyPlan(child, node) {

  override protected val builder: HierarchyBuilder[Row, Row] =
    HierarchyRowLevelBasedBuilder(
      child.output,
      levels,
      startWhere,
      orderBy,
      matcher)

  override protected val pathDataType = dataType
}
Example 21
Source File: ReferenceSort.scala (from XSQL, Apache License 2.0). The code is identical to Example 6 above.
Example 22
Source File: CoGroupedIterator.scala (from XSQL, Apache License 2.0). The code is identical to Example 7 above.
Example 23
Source File: GroupedIterator.scala (from XSQL, Apache License 2.0)

package org.apache.spark.sql.execution

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Expression, SortOrder}
import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateOrdering, GenerateUnsafeProjection}

object GroupedIterator {
  def apply(
      input: Iterator[InternalRow],
      keyExpressions: Seq[Expression],
      inputSchema: Seq[Attribute]): Iterator[(InternalRow, Iterator[InternalRow])] = {
    if (input.hasNext) {
      new GroupedIterator(input.buffered, keyExpressions, inputSchema)
    } else {
      Iterator.empty
    }
  }
}

// Excerpt: the GroupedIterator class declaration, which defines `input`, `keyOrdering`,
// `keyProjection`, `currentGroup`, `currentRow` and `currentIterator`, is not included
// in this listing; the methods below belong to that class.

  def hasNext: Boolean = currentIterator != null || fetchNextGroupIterator

  def next(): (InternalRow, Iterator[InternalRow]) = {
    assert(hasNext) // Ensure we have fetched the next iterator.
    val ret = (keyProjection(currentGroup), currentIterator)
    currentIterator = null
    ret
  }

  private def fetchNextGroupIterator(): Boolean = {
    assert(currentIterator == null)

    if (currentRow == null && input.hasNext) {
      currentRow = input.next()
    }

    if (currentRow == null) {
      // There is no data left, return false.
      false
    } else {
      // Skip to next group.
      // currentRow may be overwritten by `hasNext`, so we should compare them first.
      while (keyOrdering.compare(currentGroup, currentRow) == 0 && input.hasNext) {
        currentRow = input.next()
      }

      if (keyOrdering.compare(currentGroup, currentRow) == 0) {
        // We are in the last group, there is no more groups, return false.
        false
      } else {
        // Now the `currentRow` is the first row of next group.
        currentGroup = currentRow.copy()
        currentIterator = createGroupValuesIterator()
        true
      }
    }
  }

  private def createGroupValuesIterator(): Iterator[InternalRow] = {
    new Iterator[InternalRow] {
      def hasNext: Boolean = currentRow != null || fetchNextRowInGroup()

      def next(): InternalRow = {
        assert(hasNext)
        val res = currentRow
        currentRow = null
        res
      }

      private def fetchNextRowInGroup(): Boolean = {
        assert(currentRow == null)

        if (input.hasNext) {
          // The inner iterator should NOT consume the input into next group, here we use `head` to
          // peek the next input, to see if we should continue to process it.
          if (keyOrdering.compare(currentGroup, input.head) == 0) {
            // Next input is in the current group. Continue the inner iterator.
            currentRow = input.next()
            true
          } else {
            // Next input is not in the right group. End this inner iterator.
            false
          }
        } else {
          // There is no more data, return false.
          false
        }
      }
    }
  }
}
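A rough usage sketch for the factory above (not from the project; the column names and rows are invented): the input iterator must already be sorted by the key expressions, and each emitted group should be consumed before advancing to the next.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.types.IntegerType

// Hypothetical schema: key column "k" and value column "v".
val k = AttributeReference("k", IntegerType)()
val v = AttributeReference("v", IntegerType)()

// Rows already sorted by "k".
val rows = Iterator(InternalRow(1, 10), InternalRow(1, 11), InternalRow(2, 20))

// Groups rows by the key expression; each group's iterator is consumed
// before calling next() on the outer iterator again.
val groups = GroupedIterator(rows, Seq(k), Seq(k, v))
groups.foreach { case (keyRow, values) =>
  println(s"key=${keyRow.getInt(0)} values=${values.map(_.getInt(1)).mkString(",")}")
}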
Example 24
Source File: Exchange.scala (from XSQL, Apache License 2.0)

package org.apache.spark.sql.execution.exchange

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      return plan
    }
    // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls.
    val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
    plan.transformUp { case exchange: Exchange =>
      // the exchanges that have same results usually also have same schemas (same column names).
      val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
      val samePlan = sameSchema.find { e =>
        exchange.sameResult(e)
      }
      if (samePlan.isDefined) {
        // Keep the output of this exchange, the following plans require that to resolve
        // attributes.
        ReusedExchangeExec(exchange.output, samePlan.get)
      } else {
        sameSchema += exchange
        exchange
      }
    }
  }
}
Example 25
Source File: SubstituteUnresolvedOrdinals.scala (from XSQL, Apache License 2.0)

package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder}
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Sort}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.IntegerType

class SubstituteUnresolvedOrdinals(conf: SQLConf) extends Rule[LogicalPlan] {

  private def isIntLiteral(e: Expression) = e match {
    case Literal(_, IntegerType) => true
    case _ => false
  }

  def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
    case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) =>
      val newOrders = s.order.map {
        case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _, _) =>
          val newOrdinal = withOrigin(ordinal.origin)(UnresolvedOrdinal(index))
          withOrigin(order.origin)(order.copy(child = newOrdinal))
        case other => other
      }
      withOrigin(s.origin)(s.copy(order = newOrders))

    case a: Aggregate if conf.groupByOrdinal && a.groupingExpressions.exists(isIntLiteral) =>
      val newGroups = a.groupingExpressions.map {
        case ordinal @ Literal(index: Int, IntegerType) =>
          withOrigin(ordinal.origin)(UnresolvedOrdinal(index))
        case other => other
      }
      withOrigin(a.origin)(a.copy(groupingExpressions = newGroups))
  }
}
Example 26
Source File: ReferenceSort.scala (from drizzle-spark, Apache License 2.0). The code is identical to Example 6 above.
Example 27
Source File: CoGroupedIterator.scala (from drizzle-spark, Apache License 2.0). The code is identical to Example 7 above.
Example 28
Source File: GroupedIterator.scala (from drizzle-spark, Apache License 2.0). The code is identical to Example 23 above.