org.apache.spark.sql.catalyst.expressions.Attribute Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.expressions.Attribute.
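Before the project examples, here is a minimal, self-contained sketch of the two most common ways an Attribute is obtained in the snippets below: constructing an AttributeReference directly, or deriving attributes from a StructType via toAttributes. The object name and column names are illustrative only, and it assumes a Spark 2.x-era spark-catalyst dependency on the classpath.

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object AttributeBasics {
  def main(args: Array[String]): Unit = {
    // Build a single attribute by hand; the empty second parameter list accepts the
    // default expression id and qualifier.
    val id: Attribute = AttributeReference("id", IntegerType, nullable = false)()

    // Derive a full output from a schema, as several examples below do via toAttributes.
    val schema = StructType(Seq(
      StructField("name", StringType),
      StructField("age", IntegerType)))
    val output: Seq[Attribute] = schema.toAttributes

    (id +: output).foreach(a => println(s"${a.name}: ${a.dataType} (nullable = ${a.nullable})"))
  }
}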
Example 1
Source File: LocalTableScanExec.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.metric.SQLMetrics

case class LocalTableScanExec(
    output: Seq[Attribute],
    rows: Seq[InternalRow]) extends LeafExecNode {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  private val unsafeRows: Array[InternalRow] = {
    if (rows.isEmpty) {
      Array.empty
    } else {
      val proj = UnsafeProjection.create(output, output)
      rows.map(r => proj(r).copy()).toArray
    }
  }

  private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1),
    sqlContext.sparkContext.defaultParallelism)

  private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism)

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    rdd.map { r =>
      numOutputRows += 1
      r
    }
  }

  override protected def stringArgs: Iterator[Any] = {
    if (rows.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def executeCollect(): Array[InternalRow] = {
    longMetric("numOutputRows").add(unsafeRows.size)
    unsafeRows
  }

  override def executeTake(limit: Int): Array[InternalRow] = {
    val taken = unsafeRows.take(limit)
    longMetric("numOutputRows").add(taken.size)
    taken
  }
}
Example 2
Source File: SnowflakePlan.scala From spark-snowflake with Apache License 2.0

package net.snowflake.spark.snowflake.pushdowns

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{StructField, StructType}

case class SnowflakePlan(output: Seq[Attribute], rdd: RDD[InternalRow])
  extends SparkPlan {

  override def children: Seq[SparkPlan] = Nil

  protected override def doExecute(): RDD[InternalRow] = {
    val schema = StructType(
      output.map(attr => StructField(attr.name, attr.dataType, attr.nullable))
    )

    rdd.mapPartitions { iter =>
      val project = UnsafeProjection.create(schema)
      iter.map(project)
    }
  }
}
Example 3
Source File: joinTypes.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.catalyst.plans

import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.Attribute

object JoinType {
  def apply(typ: String): JoinType = typ.toLowerCase.replace("_", "") match {
    case "inner" => Inner
    case "outer" | "full" | "fullouter" => FullOuter
    case "leftouter" | "left" => LeftOuter
    case "rightouter" | "right" => RightOuter
    case "leftsemi" => LeftSemi
    case "leftanti" => LeftAnti
    case "cross" => Cross
    case _ =>
      val supported = Seq(
        "inner",
        "outer", "full", "fullouter",
        "leftouter", "left",
        "rightouter", "right",
        "leftsemi",
        "leftanti",
        "cross")

      throw new IllegalArgumentException(s"Unsupported join type '$typ'. " +
        "Supported join types include: " + supported.mkString("'", "', '", "'") + ".")
  }
}

sealed abstract class JoinType {
  def sql: String
}

sealed abstract class InnerLike extends JoinType {
  def explicitCartesian: Boolean
}

case object Inner extends InnerLike {
  override def explicitCartesian: Boolean = false
  override def sql: String = "INNER"
}

case object Cross extends InnerLike {
  override def explicitCartesian: Boolean = true
  override def sql: String = "CROSS"
}

case object LeftOuter extends JoinType {
  override def sql: String = "LEFT OUTER"
}

case object RightOuter extends JoinType {
  override def sql: String = "RIGHT OUTER"
}

case object FullOuter extends JoinType {
  override def sql: String = "FULL OUTER"
}

case object LeftSemi extends JoinType {
  override def sql: String = "LEFT SEMI"
}

case object LeftAnti extends JoinType {
  override def sql: String = "LEFT ANTI"
}

case class ExistenceJoin(exists: Attribute) extends JoinType {
  override def sql: String = {
    // This join type is only used in the end of optimizer and physical plans, we will not
    // generate SQL for this join type
    throw new UnsupportedOperationException
  }
}

case class NaturalJoin(tpe: JoinType) extends JoinType {
  require(Seq(Inner, LeftOuter, RightOuter, FullOuter).contains(tpe),
    "Unsupported natural join type " + tpe)
  override def sql: String = "NATURAL " + tpe.sql
}

case class UsingJoin(tpe: JoinType, usingColumns: Seq[UnresolvedAttribute]) extends JoinType {
  require(Seq(Inner, LeftOuter, LeftSemi, RightOuter, FullOuter, LeftAnti).contains(tpe),
    "Unsupported using join type " + tpe)
  override def sql: String = "USING " + tpe.sql
}

object LeftExistence {
  def unapply(joinType: JoinType): Option[JoinType] = joinType match {
    case LeftSemi | LeftAnti => Some(joinType)
    case j: ExistenceJoin => Some(joinType)
    case _ => None
  }
}
Example 4
Source File: ScriptTransformation.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression}

// NOTE: this snippet is an excerpt; the enclosing class from the original source file
// (which defines `schemaLess`) is not shown in the example.

  private def getRowFormatSQL(
      rowFormat: Seq[(String, String)],
      serdeClass: Option[String],
      serdeProps: Seq[(String, String)]): Option[String] = {
    if (schemaLess) return Some("")

    val rowFormatDelimited =
      rowFormat.map {
        case ("TOK_TABLEROWFORMATFIELD", value) =>
          "FIELDS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATCOLLITEMS", value) =>
          "COLLECTION ITEMS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATMAPKEYS", value) =>
          "MAP KEYS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATLINES", value) =>
          "LINES TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATNULL", value) =>
          "NULL DEFINED AS " + value
        case o => return None
      }

    val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("")
    val serdePropsSQL =
      if (serdeClass.nonEmpty) {
        val props = serdeProps.map { p => s"'${p._1}' = '${p._2}'" }.mkString(", ")
        if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else ""
      } else {
        ""
      }
    if (rowFormat.nonEmpty) {
      Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" "))
    } else {
      Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL)
    }
  }
}
Example 5
Source File: LocalRelation.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.types.{StructField, StructType}

object LocalRelation {
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  def apply(output1: StructField, output: StructField*): LocalRelation = {
    new LocalRelation(StructType(output1 +: output).toAttributes)
  }

  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }

  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }
}

case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil)
  extends LeafNode with analysis.MultiInstanceRelation {

  // A local relation must have resolved output.
  require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.")

  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type]
  }

  override protected def stringArgs: Iterator[Any] = {
    if (data.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def sameResult(plan: LogicalPlan): Boolean = {
    plan.canonicalized match {
      case LocalRelation(otherOutput, otherData) =>
        otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data
      case _ => false
    }
  }

  override lazy val statistics =
    Statistics(sizeInBytes = output.map(_.dataType.defaultSize).sum * data.length)

  def toSQL(inlineTableName: String): String = {
    require(data.nonEmpty)
    val types = output.map(_.dataType)
    val rows = data.map { row =>
      val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql }
      cells.mkString("(", ", ", ")")
    }
    "VALUES " + rows.mkString(", ") +
      " AS " + inlineTableName +
      output.map(_.name).mkString("(", ", ", ")")
  }
}
Example 6
Source File: LogicalPlanSuite.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.catalyst.plans

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.types.IntegerType

class LogicalPlanSuite extends SparkFunSuite {
  private var invocationCount = 0
  private val function: PartialFunction[LogicalPlan, LogicalPlan] = {
    case p: Project =>
      invocationCount += 1
      p
  }

  private val testRelation = LocalRelation()

  test("resolveOperator runs on operators") {
    invocationCount = 0
    val plan = Project(Nil, testRelation)
    plan resolveOperators function

    assert(invocationCount === 1)
  }

  test("resolveOperator runs on operators recursively") {
    invocationCount = 0
    val plan = Project(Nil, Project(Nil, testRelation))
    plan resolveOperators function

    assert(invocationCount === 2)
  }

  test("resolveOperator skips all ready resolved plans") {
    invocationCount = 0
    val plan = Project(Nil, Project(Nil, testRelation))
    plan.foreach(_.setAnalyzed())
    plan resolveOperators function

    assert(invocationCount === 0)
  }

  test("resolveOperator skips partially resolved plans") {
    invocationCount = 0
    val plan1 = Project(Nil, testRelation)
    val plan2 = Project(Nil, plan1)
    plan1.foreach(_.setAnalyzed())
    plan2 resolveOperators function

    assert(invocationCount === 1)
  }

  test("isStreaming") {
    val relation = LocalRelation(AttributeReference("a", IntegerType, nullable = true)())
    val incrementalRelation = new LocalRelation(
      Seq(AttributeReference("a", IntegerType, nullable = true)())) {
      override def isStreaming(): Boolean = true
    }

    case class TestBinaryRelation(left: LogicalPlan, right: LogicalPlan) extends BinaryNode {
      override def output: Seq[Attribute] = left.output ++ right.output
    }

    require(relation.isStreaming === false)
    require(incrementalRelation.isStreaming === true)
    assert(TestBinaryRelation(relation, relation).isStreaming === false)
    assert(TestBinaryRelation(incrementalRelation, relation).isStreaming === true)
    assert(TestBinaryRelation(relation, incrementalRelation).isStreaming === true)
    assert(TestBinaryRelation(incrementalRelation, incrementalRelation).isStreaming)
  }
}
Example 7
Source File: DeclarativeAggregateEvaluator.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection

case class DeclarativeAggregateEvaluator(function: DeclarativeAggregate, input: Seq[Attribute]) {

  lazy val initializer = GenerateSafeProjection.generate(function.initialValues)

  lazy val updater = GenerateSafeProjection.generate(
    function.updateExpressions,
    function.aggBufferAttributes ++ input)

  lazy val merger = GenerateSafeProjection.generate(
    function.mergeExpressions,
    function.aggBufferAttributes ++ function.inputAggBufferAttributes)

  lazy val evaluator = GenerateSafeProjection.generate(
    function.evaluateExpression :: Nil,
    function.aggBufferAttributes)

  def initialize(): InternalRow = initializer.apply(InternalRow.empty).copy()

  def update(values: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = values.foldLeft(initialize()) { (buffer, input) =>
      updater(joiner(buffer, input))
    }
    buffer.copy()
  }

  def merge(buffers: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = buffers.foldLeft(initialize()) { (left, right) =>
      merger(joiner(left, right))
    }
    buffer.copy()
  }

  def eval(buffer: InternalRow): InternalRow = evaluator(buffer).copy()
}
Example 8
Source File: package.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution

import java.util.Collections

import scala.collection.JavaConverters._

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.catalyst.trees.TreeNodeRef
import org.apache.spark.util.{AccumulatorV2, LongAccumulator}

// NOTE: this snippet is an excerpt; the enclosing debug package object and the operator
// (case class) that owns the members below are not shown in the original example.

  case class ColumnMetrics() {
    val elementTypes = new SetAccumulator[String]
    sparkContext.register(elementTypes)
  }

  val tupleCount: LongAccumulator = sparkContext.longAccumulator

  val numColumns: Int = child.output.size
  val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics())

  def dumpStats(): Unit = {
    debugPrint(s"== ${child.simpleString} ==")
    debugPrint(s"Tuples output: ${tupleCount.value}")
    child.output.zip(columnStats).foreach { case (attr, metric) =>
      // This is called on driver. All accumulator updates have a fixed value. So it's safe to use
      // `asScala` which accesses the internal values using `java.util.Iterator`.
      val actualDataTypes = metric.elementTypes.value.asScala.mkString("{", ",", "}")
      debugPrint(s" ${attr.name} ${attr.dataType}: $actualDataTypes")
    }
  }

  protected override def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      new Iterator[InternalRow] {
        def hasNext: Boolean = iter.hasNext

        def next(): InternalRow = {
          val currentRow = iter.next()
          tupleCount.add(1)
          var i = 0
          while (i < numColumns) {
            val value = currentRow.get(i, output(i).dataType)
            if (value != null) {
              columnStats(i).elementTypes.add(value.getClass.getName)
            }
            i += 1
          }
          currentRow
        }
      }
    }
  }

  override def outputPartitioning: Partitioning = child.outputPartitioning

  override def inputRDDs(): Seq[RDD[InternalRow]] = {
    child.asInstanceOf[CodegenSupport].inputRDDs()
  }

  override def doProduce(ctx: CodegenContext): String = {
    child.asInstanceOf[CodegenSupport].produce(ctx, this)
  }

  override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = {
    consume(ctx, input)
  }
}
}
Example 9
Source File: ShuffledHashJoinExec.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution.joins

import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics

case class ShuffledHashJoinExec(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    joinType: JoinType,
    buildSide: BuildSide,
    condition: Option[Expression],
    left: SparkPlan,
    right: SparkPlan)
  extends BinaryExecNode with HashJoin {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
    "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"),
    "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map"))

  override def requiredChildDistribution: Seq[Distribution] =
    ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil

  private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = {
    val buildDataSize = longMetric("buildDataSize")
    val buildTime = longMetric("buildTime")
    val start = System.nanoTime()
    val context = TaskContext.get()
    val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager())
    buildTime += (System.nanoTime() - start) / 1000000
    buildDataSize += relation.estimatedSize
    // This relation is usually used until the end of task.
    context.addTaskCompletionListener(_ => relation.close())
    relation
  }

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) =>
      val hashed = buildHashedRelation(buildIter)
      join(streamIter, hashed, numOutputRows)
    }
  }
}
Example 10
Source File: CartesianProductExec.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter

class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    // We will not sort the rows, so prefixComparator and recordComparator are null.
    val sorter = UnsafeExternalSorter.create(
      context.taskMemoryManager(),
      SparkEnv.get.blockManager,
      SparkEnv.get.serializerManager,
      context,
      null,
      null,
      1024,
      SparkEnv.get.memoryManager.pageSizeBytes,
      SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold",
        UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD),
      false)

    val partition = split.asInstanceOf[CartesianPartition]
    for (y <- rdd2.iterator(partition.s2, context)) {
      sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false)
    }

    // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow]
    def createIter(): Iterator[UnsafeRow] = {
      val iter = sorter.getIterator
      val unsafeRow = new UnsafeRow(numFieldsOfRight)
      new Iterator[UnsafeRow] {
        override def hasNext: Boolean = {
          iter.hasNext
        }
        override def next(): UnsafeRow = {
          iter.loadNext()
          unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength)
          unsafeRow
        }
      }
    }

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, sorter.cleanupResources())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size)
    pair.mapPartitionsInternal { iter =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition: (InternalRow) => Boolean =
          newPredicate(condition.get, left.output ++ right.output)
        val joined = new JoinedRow

        iter.filter { r =>
          boundCondition(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}
Example 11
Source File: LogicalRelation.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// NOTE: this snippet is an excerpt; the LogicalRelation case class declaration that owns
// the members below is not shown in the original example.

  override def newInstance(): this.type = {
    LogicalRelation(
      relation,
      expectedOutputAttributes.map(_.map(_.newInstance())),
      catalogTable).asInstanceOf[this.type]
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ =>  // Do nothing.
  }

  override def simpleString: String = s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}
Example 12
Source File: InsertIntoHadoopFsRelationCommand.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution.datasources

import java.io.IOException

import org.apache.hadoop.fs.Path

import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand

// NOTE: this snippet is an excerpt; the beginning of the InsertIntoHadoopFsRelationCommand
// class and of the save-mode pattern match below is not shown in the original example.

        )) {
          throw new IOException(s"Unable to clear output " +
            s"directory $qualifiedOutputPath prior to writing to it")
        }
        true
      case (SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) =>
        true
      case (SaveMode.Ignore, exists) =>
        !exists
      case (s, exists) =>
        throw new IllegalStateException(s"unsupported save mode $s ($exists)")
    }
    // If we are appending data to an existing dir.
    val isAppend = pathExists && (mode == SaveMode.Append)

    if (doInsertion) {
      WriteOutput.write(
        sparkSession,
        query,
        fileFormat,
        qualifiedOutputPath,
        hadoopConf,
        partitionColumns,
        bucketSpec,
        refreshFunction,
        options,
        isAppend)
    } else {
      logInfo("Skipping insertion into a relation that already exists.")
    }

    Seq.empty[Row]
  }
}
Example 13
Source File: Exchange.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution.exchange

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      return plan
    }
    // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls.
    val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
    plan.transformUp {
      case exchange: Exchange =>
        // the exchanges that have same results usually also have same schemas (same column names).
        val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
        val samePlan = sameSchema.find { e =>
          exchange.sameResult(e)
        }
        if (samePlan.isDefined) {
          // Keep the output of this exchange, the following plans require that to resolve
          // attributes.
          ReusedExchangeExec(exchange.output, samePlan.get)
        } else {
          sameSchema += exchange
          exchange
        }
    }
  }
}
Example 14
Source File: GroupedIterator.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Expression, SortOrder}
import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateOrdering, GenerateUnsafeProjection}

object GroupedIterator {
  def apply(
      input: Iterator[InternalRow],
      keyExpressions: Seq[Expression],
      inputSchema: Seq[Attribute]): Iterator[(InternalRow, Iterator[InternalRow])] = {
    if (input.hasNext) {
      new GroupedIterator(input.buffered, keyExpressions, inputSchema)
    } else {
      Iterator.empty
    }
  }
}

// NOTE: the members below are an excerpt from the GroupedIterator class; the class
// declaration and its fields are not shown in the original example.

  def hasNext: Boolean = currentIterator != null || fetchNextGroupIterator

  def next(): (InternalRow, Iterator[InternalRow]) = {
    assert(hasNext) // Ensure we have fetched the next iterator.
    val ret = (keyProjection(currentGroup), currentIterator)
    currentIterator = null
    ret
  }

  private def fetchNextGroupIterator(): Boolean = {
    assert(currentIterator == null)

    if (currentRow == null && input.hasNext) {
      currentRow = input.next()
    }

    if (currentRow == null) {
      // These is no data left, return false.
      false
    } else {
      // Skip to next group.
      // currentRow may be overwritten by `hasNext`, so we should compare them first.
      while (keyOrdering.compare(currentGroup, currentRow) == 0 && input.hasNext) {
        currentRow = input.next()
      }

      if (keyOrdering.compare(currentGroup, currentRow) == 0) {
        // We are in the last group, there is no more groups, return false.
        false
      } else {
        // Now the `currentRow` is the first row of next group.
        currentGroup = currentRow.copy()
        currentIterator = createGroupValuesIterator()
        true
      }
    }
  }

  private def createGroupValuesIterator(): Iterator[InternalRow] = {
    new Iterator[InternalRow] {
      def hasNext: Boolean = currentRow != null || fetchNextRowInGroup()

      def next(): InternalRow = {
        assert(hasNext)
        val res = currentRow
        currentRow = null
        res
      }

      private def fetchNextRowInGroup(): Boolean = {
        assert(currentRow == null)

        if (input.hasNext) {
          // The inner iterator should NOT consume the input into next group, here we use `head` to
          // peek the next input, to see if we should continue to process it.
          if (keyOrdering.compare(currentGroup, input.head) == 0) {
            // Next input is in the current group. Continue the inner iterator.
            currentRow = input.next()
            true
          } else {
            // Next input is not in the right group. End this inner iterator.
            false
          }
        } else {
          // There is no more data, return false.
          false
        }
      }
    }
  }
}
Example 15
Source File: resources.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("Results", StringType, nullable = false)() :: Nil
  }
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val jarList = sparkSession.sparkContext.listJars()
    if (jars.nonEmpty) {
      for {
        jarName <- jars.map(f => new Path(f).getName)
        jarPath <- jarList if jarPath.contains(jarName)
      } yield Row(jarPath)
    } else {
      jarList.map(Row(_))
    }
  }
}
Example 16
Source File: commands.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution.command

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.errors.TreeNodeException
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.debug._
import org.apache.spark.sql.execution.streaming.IncrementalExecution
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types._

case class ExplainCommand(
    logicalPlan: LogicalPlan,
    override val output: Seq[Attribute] =
      Seq(AttributeReference("plan", StringType, nullable = true)()),
    extended: Boolean = false,
    codegen: Boolean = false)
  extends RunnableCommand {

  // Run through the optimizer to generate the physical plan.
  override def run(sparkSession: SparkSession): Seq[Row] = try {
    val queryExecution =
      if (logicalPlan.isStreaming) {
        // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the
        // output mode does not matter since there is no `Sink`.
        new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0)
      } else {
        sparkSession.sessionState.executePlan(logicalPlan)
      }
    val outputString =
      if (codegen) {
        codegenString(queryExecution.executedPlan)
      } else if (extended) {
        queryExecution.toString
      } else {
        queryExecution.simpleString
      }
    Seq(Row(outputString))
  } catch {
    case cause: TreeNodeException[_] =>
      ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_))
  }
}
Example 17
Source File: StreamingRelation.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LeafNode
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.execution.datasources.DataSource

object StreamingRelation {
  def apply(dataSource: DataSource): StreamingRelation = {
    StreamingRelation(
      dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes)
  }
}

case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode {
  override def toString: String = sourceName
  override protected def doExecute(): RDD[InternalRow] = {
    throw new UnsupportedOperationException("StreamingRelationExec cannot be executed")
  }
}

object StreamingExecutionRelation {
  def apply(source: Source): StreamingExecutionRelation = {
    StreamingExecutionRelation(source, source.schema.toAttributes)
  }
}
Example 18
Source File: CoGroupedIterator.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, SortOrder}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering

class CoGroupedIterator(
    left: Iterator[(InternalRow, Iterator[InternalRow])],
    right: Iterator[(InternalRow, Iterator[InternalRow])],
    groupingSchema: Seq[Attribute])
  extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] {

  private val keyOrdering =
    GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema)

  private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _
  private var currentRightData: (InternalRow, Iterator[InternalRow]) = _

  override def hasNext: Boolean = {
    if (currentLeftData == null && left.hasNext) {
      currentLeftData = left.next()
    }
    if (currentRightData == null && right.hasNext) {
      currentRightData = right.next()
    }

    currentLeftData != null || currentRightData != null
  }

  override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = {
    assert(hasNext)

    if (currentLeftData.eq(null)) {
      // left is null, right is not null, consume the right data.
      rightOnly()
    } else if (currentRightData.eq(null)) {
      // left is not null, right is null, consume the left data.
      leftOnly()
    } else if (currentLeftData._1 == currentRightData._1) {
      // left and right have the same grouping key, consume both of them.
      val result = (currentLeftData._1, currentLeftData._2, currentRightData._2)
      currentLeftData = null
      currentRightData = null
      result
    } else {
      val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1)
      assert(compare != 0)
      if (compare < 0) {
        // the grouping key of left is smaller, consume the left data.
        leftOnly()
      } else {
        // the grouping key of right is smaller, consume the right data.
        rightOnly()
      }
    }
  }

  private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = {
    val result = (currentLeftData._1, currentLeftData._2, Iterator.empty)
    currentLeftData = null
    result
  }

  private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = {
    val result = (currentRightData._1, Iterator.empty, currentRightData._2)
    currentRightData = null
    result
  }
}
Example 19
Source File: ReferenceSort.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution

import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.errors._
import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.ExternalSorter

case class ReferenceSort(
    sortOrder: Seq[SortOrder],
    global: Boolean,
    child: SparkPlan)
  extends UnaryExecNode {

  override def requiredChildDistribution: Seq[Distribution] =
    if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil

  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
    child.execute().mapPartitions( { iterator =>
      val ordering = newOrdering(sortOrder, child.output)
      val sorter = new ExternalSorter[InternalRow, Null, InternalRow](
        TaskContext.get(), ordering = Some(ordering))
      sorter.insertAll(iterator.map(r => (r.copy(), null)))
      val baseIterator = sorter.iterator.map(_._1)
      val context = TaskContext.get()
      context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled)
      context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled)
      context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes)
      CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop())
    }, preservesPartitioning = true)
  }

  override def output: Seq[Attribute] = child.output

  override def outputOrdering: Seq[SortOrder] = sortOrder

  override def outputPartitioning: Partitioning = child.outputPartitioning
}
Example 20
Source File: SparkPlannerSuite.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union}
import org.apache.spark.sql.test.SharedSQLContext

class SparkPlannerSuite extends SharedSQLContext {
  import testImplicits._

  test("Ensure to go down only the first branch, not any other possible branches") {

    case object NeverPlanned extends LeafNode {
      override def output: Seq[Attribute] = Nil
    }

    var planned = 0
    object TestStrategy extends Strategy {
      def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
        case ReturnAnswer(child) =>
          planned += 1
          planLater(child) :: planLater(NeverPlanned) :: Nil
        case Union(children) =>
          planned += 1
          UnionExec(children.map(planLater)) :: planLater(NeverPlanned) :: Nil
        case LocalRelation(output, data) =>
          planned += 1
          LocalTableScanExec(output, data) :: planLater(NeverPlanned) :: Nil
        case NeverPlanned =>
          fail("QueryPlanner should not go down to this branch.")
        case _ => Nil
      }
    }

    try {
      spark.experimental.extraStrategies = TestStrategy :: Nil

      val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS())

      assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f"))
      assert(planned === 4)
    } finally {
      spark.experimental.extraStrategies = Nil
    }
  }
}
Example 21
Source File: TiHandleRDD.scala From tispark with Apache License 2.0

package org.apache.spark.sql.tispark

import com.pingcap.tikv.meta.TiDAGRequest
import com.pingcap.tikv.util.RangeSplitter
import com.pingcap.tikv.{TiConfiguration, TiSession}
import com.pingcap.tispark.utils.TiUtil
import com.pingcap.tispark.{TiPartition, TiTableReference}
import gnu.trove.list.array.TLongArrayList
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.{Partition, TaskContext, TaskKilledException}

import scala.collection.JavaConversions._
import scala.collection.JavaConverters._

class TiHandleRDD(
    override val dagRequest: TiDAGRequest,
    override val physicalId: Long,
    val output: Seq[Attribute],
    override val tiConf: TiConfiguration,
    override val tableRef: TiTableReference,
    @transient private val session: TiSession,
    @transient private val sparkSession: SparkSession)
    extends TiRDD(dagRequest, physicalId, tiConf, tableRef, session, sparkSession) {

  private val outputTypes = output.map(_.dataType)
  private val converters =
    outputTypes.map(CatalystTypeConverters.createToCatalystConverter)

  override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] =
    new Iterator[InternalRow] {
      checkTimezone()

      private val tiPartition = split.asInstanceOf[TiPartition]
      private val session = TiSession.getInstance(tiConf)
      private val snapshot = session.createSnapshot(dagRequest.getStartTs)
      private[this] val tasks = tiPartition.tasks

      private val handleIterator = snapshot.indexHandleRead(dagRequest, tasks)
      private val regionManager = session.getRegionManager
      private lazy val handleList = {
        val lst = new TLongArrayList()
        handleIterator.asScala.foreach {
          // Kill the task in case it has been marked as killed. This logic is from
          // InterruptedIterator, but we inline it here instead of wrapping the iterator in order
          // to avoid performance overhead.
          if (context.isInterrupted()) {
            throw new TaskKilledException
          }
          lst.add(_)
        }
        lst
      }
      // Fetch all handles and group by region id
      private val regionHandleMap = RangeSplitter
        .newSplitter(regionManager)
        .groupByAndSortHandlesByRegionId(physicalId, handleList)
        .map(x => (x._1.first.getId, x._2))

      private val iterator = regionHandleMap.iterator

      override def hasNext: Boolean = {
        // Kill the task in case it has been marked as killed.
        if (context.isInterrupted()) {
          throw new TaskKilledException
        }
        iterator.hasNext
      }

      override def next(): InternalRow = {
        val next = iterator.next
        val regionId = next._1
        val handleList = next._2

        // Returns RegionId:[handle1, handle2, handle3...] K-V pair
        val sparkRow = Row.apply(regionId, handleList.toArray())
        TiUtil.rowToInternalRow(sparkRow, outputTypes, converters)
      }
    }
}
Example 22
Source File: TiRowRDD.scala From tispark with Apache License 2.0

package org.apache.spark.sql.tispark

import com.pingcap.tikv._
import com.pingcap.tikv.columnar.TiColumnarBatchHelper
import com.pingcap.tikv.meta.TiDAGRequest
import com.pingcap.tispark.listener.CacheInvalidateListener
import com.pingcap.tispark.{TiPartition, TiTableReference}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.{Partition, TaskContext, TaskKilledException}
import org.slf4j.Logger

import scala.collection.JavaConversions._

class TiRowRDD(
    override val dagRequest: TiDAGRequest,
    override val physicalId: Long,
    val chunkBatchSize: Int,
    override val tiConf: TiConfiguration,
    val output: Seq[Attribute],
    override val tableRef: TiTableReference,
    @transient private val session: TiSession,
    @transient private val sparkSession: SparkSession)
    extends TiRDD(dagRequest, physicalId, tiConf, tableRef, session, sparkSession) {

  protected val logger: Logger = log

  // cache invalidation call back function
  // used for driver to update PD cache
  private val callBackFunc = CacheInvalidateListener.getInstance()

  override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] =
    new Iterator[ColumnarBatch] {
      checkTimezone()

      private val tiPartition = split.asInstanceOf[TiPartition]
      private val session = TiSession.getInstance(tiConf)
      session.injectCallBackFunc(callBackFunc)
      private val snapshot = session.createSnapshot(dagRequest.getStartTs)
      private[this] val tasks = tiPartition.tasks

      private val iterator =
        snapshot.tableReadChunk(dagRequest, tasks, chunkBatchSize)

      override def hasNext: Boolean = {
        // Kill the task in case it has been marked as killed. This logic is from
        // Interrupted Iterator, but we inline it here instead of wrapping the iterator in order
        // to avoid performance overhead.
        if (context.isInterrupted()) {
          throw new TaskKilledException
        }
        iterator.hasNext
      }

      override def next(): ColumnarBatch = {
        TiColumnarBatchHelper.createColumnarBatch(iterator.next)
      }
    }.asInstanceOf[Iterator[InternalRow]]
}
Example 23
Source File: TiAggregation.scala From tispark with Apache License 2.0

package org.apache.spark.sql

import com.pingcap.tispark.TiDBRelation
import com.pingcap.tispark.utils.ReflectionUtil
import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression}
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.datasources.LogicalRelation

object TiAggregation {
  type ReturnType =
    (Seq[NamedExpression], Seq[AggregateExpression], Seq[NamedExpression], LogicalPlan)

  def unapply(plan: LogicalPlan): Option[ReturnType] =
    ReflectionUtil.callTiAggregationImplUnapply(plan)
}

object TiAggregationProjection {
  type ReturnType = (Seq[Expression], LogicalPlan, TiDBRelation, Seq[NamedExpression])

  def unapply(plan: LogicalPlan): Option[ReturnType] = plan match {
    // Only push down aggregates projection when all filters can be applied and
    // all projection expressions are column references
    case PhysicalOperation(
          projects,
          filters,
          rel @ LogicalRelation(source: TiDBRelation, _, _, _))
        if projects.forall(_.isInstanceOf[Attribute]) =>
      Some((filters, rel, source, projects))
    case _ => Option.empty[ReturnType]
  }
}
Example 24
Source File: CreateHiveTableAsSelectCommand.scala From XSQL with Apache License 2.0

package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand

case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumnNames: Seq[String],
    mode: SaveMode)
  extends DataWritingCommand {

  private val tableIdentifier = tableDesc.identifier

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog
    if (catalog.tableExists(tableIdentifier)) {
      assert(mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      // For CTAS, there is no static partition values to insert.
      val partition = tableDesc.partitionColumnNames.map(_ -> None).toMap
      InsertIntoHiveTable(
        tableDesc,
        partition,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumnNames = outputColumnNames).run(sparkSession, child)
    } else {
      // TODO ideally, we should get the output data ready first and then
      // add the relation into catalog, just in case of failure occurs while data
      // processing.
      assert(tableDesc.schema.isEmpty)
      catalog.createTable(
        tableDesc.copy(schema = outputColumns.toStructType), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        InsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumnNames = outputColumnNames).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}, " +
    s"TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
}
Example 25
Source File: datasources.scala From XSQL with Apache License 2.0

package org.apache.spark.sql.xsql.execution.command

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.xsql.XSQLSessionCatalog

case class XSQLShowDatasourcesCommand(datasourcePattern: Option[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("dataSourceName", StringType, nullable = false)() :: Nil
  }

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    val datasources =
      datasourcePattern
        .map { pattern =>
          catalog.listDatasources(pattern)
        }
        .getOrElse(catalog.listDatasources())
    datasources.map { d =>
      Row(d)
    }
  }
}

case class XSQLAddDatasourceCommand(dataSourceName: String, properties: Map[String, String])
  extends RunnableCommand {
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    catalog.addDataSource(dataSourceName, properties)
    Seq.empty[Row]
  }
}

case class XSQLRemoveDatasourceCommand(dataSourceName: String, ifExists: Boolean)
  extends RunnableCommand {
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    catalog.removeDataSource(dataSourceName, ifExists)
    Seq.empty[Row]
  }
}

case class XSQLRefreshDatasourceCommand(dataSourceName: String) extends RunnableCommand {
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    catalog.refreshDataSource(dataSourceName)
    Seq.empty[Row]
  }
}
Example 26
Source File: databases.scala From XSQL with Apache License 2.0

package org.apache.spark.sql.xsql.execution.command

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.xsql.XSQLSessionCatalog

case class XSQLSetDatabaseCommand(dataSourceName: Option[String], databaseName: String)
  extends RunnableCommand {
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    if (dataSourceName.isEmpty) {
      catalog.setCurrentDatabase(databaseName)
    } else {
      catalog.setCurrentDatabase(dataSourceName.get, databaseName)
    }
    Seq.empty[Row]
  }
}
Example 27
Source File: StreamingIncrementCommand.scala From XSQL with Apache License 2.0

package org.apache.spark.sql.xsql.execution.command

import java.util.Locale

import org.apache.spark.SparkException
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.execution.streaming.StreamingRelationV2
import org.apache.spark.sql.sources.v2.StreamWriteSupport
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.xsql.DataSourceManager._
import org.apache.spark.sql.xsql.StreamingSinkType

case class StreamingIncrementCommand(plan: LogicalPlan) extends RunnableCommand {

  private var outputMode: OutputMode = OutputMode.Append
  // dummy
  override def output: Seq[AttributeReference] = Seq.empty
  // dummy
  override def producedAttributes: AttributeSet = plan.producedAttributes

  override def run(sparkSession: SparkSession): Seq[Row] = {
    import StreamingSinkType._
    val qe = new QueryExecution(sparkSession, new ConstructedStreaming(plan))
    val df = new Dataset(sparkSession, qe, RowEncoder(qe.analyzed.schema))
    plan.collectLeaves.head match {
      case StreamingRelationV2(_, _, extraOptions, _, _) =>
        val source = extraOptions.getOrElse(STREAMING_SINK_TYPE, DEFAULT_STREAMING_SINK)
        val sinkOptions = extraOptions.filter(_._1.startsWith(STREAMING_SINK_PREFIX)).map { kv =>
          val key = kv._1.substring(STREAMING_SINK_PREFIX.length)
          (key, kv._2)
        }
        StreamingSinkType.withName(source.toUpperCase(Locale.ROOT)) match {
          case CONSOLE =>
          case TEXT | PARQUET | ORC | JSON | CSV =>
            if (sinkOptions.get(STREAMING_SINK_PATH) == None) {
              throw new SparkException("Sink type is file, must config path")
            }
          case KAFKA =>
            if (sinkOptions.get(STREAMING_SINK_BOOTSTRAP_SERVERS) == None) {
              throw new SparkException("Sink type is kafka, must config bootstrap servers")
            }
            if (sinkOptions.get(STREAMING_SINK_TOPIC) == None) {
              throw new SparkException("Sink type is kafka, must config kafka topic")
            }
          case _ =>
            throw new SparkException(
              "Sink type is invalid, " +
                s"select from ${StreamingSinkType.values}")
        }
        val ds = DataSource.lookupDataSource(source, sparkSession.sessionState.conf)
        val disabledSources = sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",")
        val sink = ds.newInstance() match {
          case w: StreamWriteSupport if !disabledSources.contains(w.getClass.getCanonicalName) =>
            w
          case _ =>
            val ds = DataSource(
              sparkSession,
              className = source,
              options = sinkOptions.toMap,
              partitionColumns = Nil)
            ds.createSink(InternalOutputModes.Append)
        }
        val outputMode = InternalOutputModes(
          extraOptions.getOrElse(STREAMING_OUTPUT_MODE, DEFAULT_STREAMING_OUTPUT_MODE))
        val duration =
          extraOptions.getOrElse(STREAMING_TRIGGER_DURATION, DEFAULT_STREAMING_TRIGGER_DURATION)
        val trigger =
          extraOptions.getOrElse(STREAMING_TRIGGER_TYPE, DEFAULT_STREAMING_TRIGGER_TYPE) match {
            case STREAMING_MICRO_BATCH_TRIGGER => Trigger.ProcessingTime(duration)
            case STREAMING_ONCE_TRIGGER => Trigger.Once()
            case STREAMING_CONTINUOUS_TRIGGER => Trigger.Continuous(duration)
          }
        val query = sparkSession.sessionState.streamingQueryManager.startQuery(
          extraOptions.get("queryName"),
          extraOptions.get(STREAMING_CHECKPOINT_LOCATION),
          df,
          sinkOptions.toMap,
          sink,
          outputMode,
          useTempCheckpointLocation = source == DEFAULT_STREAMING_SINK,
          recoverFromCheckpointLocation = true,
          trigger = trigger)
        query.awaitTermination()
    }
    // dummy
    Seq.empty
  }
}

case class ConstructedStreaming(child: LogicalPlan) extends UnaryNode {
  override def output: Seq[Attribute] = child.output
}
Example 28
Source File: XSQLCreateHiveTableAsSelectCommand.scala From XSQL with Apache License 2.0

package org.apache.spark.sql.xsql.execution.command

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand
import org.apache.spark.sql.xsql.XSQLSessionCatalog

case class XSQLCreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumnNames: Seq[String],
    mode: SaveMode)
  extends DataWritingCommand {

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    val tableIdentifier = catalog.getUsedTableIdentifier(tableDesc.identifier)
    val newTableDesc = tableDesc.copy(identifier = tableIdentifier)
    if (catalog.tableExists(tableIdentifier)) {
      assert(
        mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      XSQLInsertIntoHiveTable(
        newTableDesc,
        Map.empty,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumnNames = outputColumnNames).run(sparkSession, child)
    } else {
      // TODO ideally, we should get the output data ready first and then
      // add the relation into catalog, just in case of failure occurs while data
      // processing.
      assert(newTableDesc.schema.isEmpty)
      catalog.createTable(newTableDesc.copy(schema = query.schema), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(newTableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        XSQLInsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumnNames = outputColumnNames).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
}
Example 29
Source File: joinTypes.scala From XSQL with Apache License 2.0

package org.apache.spark.sql.catalyst.plans

import java.util.Locale

import org.apache.spark.sql.catalyst.expressions.Attribute

object JoinType {
  def apply(typ: String): JoinType = typ.toLowerCase(Locale.ROOT).replace("_", "") match {
    case "inner" => Inner
    case "outer" | "full" | "fullouter" => FullOuter
    case "leftouter" | "left" => LeftOuter
    case "rightouter" | "right" => RightOuter
    case "leftsemi" => LeftSemi
    case "leftanti" => LeftAnti
    case "cross" => Cross
    case _ =>
      val supported = Seq(
        "inner",
        "outer", "full", "fullouter", "full_outer",
        "leftouter", "left", "left_outer",
        "rightouter", "right", "right_outer",
        "leftsemi", "left_semi",
        "leftanti", "left_anti",
        "cross")

      throw new IllegalArgumentException(s"Unsupported join type '$typ'. " +
        "Supported join types include: " + supported.mkString("'", "', '", "'") + ".")
  }
}

sealed abstract class JoinType {
  def sql: String
}

sealed abstract class InnerLike extends JoinType {
  def explicitCartesian: Boolean
}

case object Inner extends InnerLike {
  override def explicitCartesian: Boolean = false
  override def sql: String = "INNER"
}

case object Cross extends InnerLike {
  override def explicitCartesian: Boolean = true
  override def sql: String = "CROSS"
}

case object LeftOuter extends JoinType {
  override def sql: String = "LEFT OUTER"
}

case object RightOuter extends JoinType {
  override def sql: String = "RIGHT OUTER"
}

case object FullOuter extends JoinType {
  override def sql: String = "FULL OUTER"
}

case object LeftSemi extends JoinType {
  override def sql: String = "LEFT SEMI"
}

case object LeftAnti extends JoinType {
  override def sql: String = "LEFT ANTI"
}

case class ExistenceJoin(exists: Attribute) extends JoinType {
  override def sql: String = {
    // This join type is only used in the end of optimizer and physical plans, we will not
    // generate SQL for this join type
    throw new UnsupportedOperationException
  }
}

case class NaturalJoin(tpe: JoinType) extends JoinType {
  require(Seq(Inner, LeftOuter, RightOuter, FullOuter).contains(tpe),
    "Unsupported natural join type " + tpe)
  override def sql: String = "NATURAL " + tpe.sql
}

case class UsingJoin(tpe: JoinType, usingColumns: Seq[String]) extends JoinType {
  require(Seq(Inner, LeftOuter, LeftSemi, RightOuter, FullOuter, LeftAnti).contains(tpe),
    "Unsupported using join type " + tpe)
  override def sql: String = "USING " + tpe.sql
}

object LeftExistence {
  def unapply(joinType: JoinType): Option[JoinType] = joinType match {
    case LeftSemi | LeftAnti => Some(joinType)
    case j: ExistenceJoin => Some(joinType)
    case _ => None
  }
}
Example 30
Source File: ProjectEstimation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical.statsEstimation

import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeMap}
import org.apache.spark.sql.catalyst.plans.logical.{Project, Statistics}

object ProjectEstimation {
  import EstimationUtils._

  def estimate(project: Project): Option[Statistics] = {
    if (rowCountsExist(project.child)) {
      val childStats = project.child.stats
      val inputAttrStats = childStats.attributeStats
      // Match alias with its child's column stat
      val aliasStats = project.expressions.collect {
        case alias @ Alias(attr: Attribute, _) if inputAttrStats.contains(attr) =>
          alias.toAttribute -> inputAttrStats(attr)
      }
      val outputAttrStats =
        getOutputMap(AttributeMap(inputAttrStats.toSeq ++ aliasStats), project.output)
      Some(childStats.copy(
        sizeInBytes = getOutputSize(project.output, childStats.rowCount.get, outputAttrStats),
        attributeStats = outputAttrStats))
    } else {
      None
    }
  }
}
Example 31
Source File: AggregateEstimation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical.statsEstimation

import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Statistics}

object AggregateEstimation {
  import EstimationUtils._

  def estimate(agg: Aggregate): Option[Statistics] = {
    val childStats = agg.child.stats
    // Check if we have column stats for all group-by columns.
    val colStatsExist = agg.groupingExpressions.forall { e =>
      e.isInstanceOf[Attribute] &&
        childStats.attributeStats.get(e.asInstanceOf[Attribute]).exists(_.hasCountStats)
    }
    if (rowCountsExist(agg.child) && colStatsExist) {
      // Multiply distinct counts of group-by columns. This is an upper bound, which assumes
      // the data contains all combinations of distinct values of group-by columns.
      var outputRows: BigInt = agg.groupingExpressions.foldLeft(BigInt(1))(
        (res, expr) => {
          val columnStat = childStats.attributeStats(expr.asInstanceOf[Attribute])
          val distinctCount = columnStat.distinctCount.get
          val distinctValue: BigInt = if (columnStat.nullCount.get > 0) {
            distinctCount + 1
          } else {
            distinctCount
          }
          res * distinctValue
        })

      outputRows = if (agg.groupingExpressions.isEmpty) {
        // If there's no group-by columns, the output is a single row containing values of
        // aggregate functions: aggregated results for non-empty input or initial values for
        // empty input.
        1
      } else {
        // Here we set another upper bound for the number of output rows: it must not be larger
        // than child's number of rows.
        outputRows.min(childStats.rowCount.get)
      }

      val outputAttrStats = getOutputMap(childStats.attributeStats, agg.output)
      Some(Statistics(
        sizeInBytes = getOutputSize(agg.output, outputRows, outputAttrStats),
        rowCount = Some(outputRows),
        attributeStats = outputAttrStats,
        hints = childStats.hints))
    } else {
      None
    }
  }
}
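A minimal, self-contained sketch of the cardinality bound computed above: multiply the (null-adjusted) distinct counts of the group-by columns, then cap the product at the child's row count. The column counts and row count are made-up numbers.

object AggregateBoundSketch {
  def main(args: Array[String]): Unit = {
    val childRowCount = BigInt(1000)
    // (distinctCount, hasNulls) per group-by column
    val groupColumns = Seq((BigInt(12), false), (BigInt(31), true))
    val product = groupColumns.foldLeft(BigInt(1)) { case (acc, (distinct, hasNulls)) =>
      acc * (if (hasNulls) distinct + 1 else distinct)   // a null counts as one extra group
    }
    val outputRows = if (groupColumns.isEmpty) BigInt(1) else product.min(childRowCount)
    println(outputRows)   // 12 * 32 = 384, already below the 1000-row cap
  }
}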
Example 32
Source File: ScriptTransformation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 33
Source File: EventTimeWatermark.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval object EventTimeWatermark { case class EventTimeWatermark( eventTime: Attribute, delay: CalendarInterval, child: LogicalPlan) extends UnaryNode { // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val delayMs = EventTimeWatermark.getDelayMs(delay) val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delayMs) .build() a.withMetadata(updatedMetadata) } else if (a.metadata.contains(EventTimeWatermark.delayKey)) { // Remove existing watermark val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .remove(EventTimeWatermark.delayKey) .build() a.withMetadata(updatedMetadata) } else { a } } }
Example 34
Source File: LocalRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils import org.apache.spark.sql.types.{StructField, StructType} object LocalRelation { def apply(output: Attribute*): LocalRelation = new LocalRelation(output) def apply(output1: StructField, output: StructField*): LocalRelation = { new LocalRelation(StructType(output1 +: output).toAttributes) } def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } } override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data, isStreaming).asInstanceOf[this.type] } override protected def stringArgs: Iterator[Any] = { if (data.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def computeStats(): Statistics = Statistics(sizeInBytes = EstimationUtils.getSizePerRow(output) * data.length) def toSQL(inlineTableName: String): String = { require(data.nonEmpty) val types = output.map(_.dataType) val rows = data.map { row => val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql } cells.mkString("(", ", ", ")") } "VALUES " + rows.mkString(", ") + " AS " + inlineTableName + output.map(_.name).mkString("(", ", ", ")") } }
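A small usage sketch of the LocalRelation factories defined above; the attribute names and row values are arbitrary, and the object name is invented.

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{IntegerType, StringType, StructField}

object LocalRelationSketch {
  def main(args: Array[String]): Unit = {
    // From attributes directly.
    val byAttributes = LocalRelation(
      AttributeReference("id", IntegerType)(), AttributeReference("name", StringType)())
    // From struct fields, which are converted to attributes internally.
    val byFields = LocalRelation(StructField("id", IntegerType), StructField("name", StringType))
    // From external rows, converted to InternalRow via a Catalyst converter.
    val withData = LocalRelation.fromExternalRows(
      byAttributes.output, Seq(Row(1, "a"), Row(2, "b")))
    println(withData.computeStats().sizeInBytes)
  }
}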
Example 35
Source File: view.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Cast}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, View}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf

object EliminateView extends Rule[LogicalPlan] {
  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    // The child should have the same output attributes with the View operator, so we simply
    // remove the View operator.
    case View(_, output, child) =>
      assert(output == child.output,
        s"The output of the child ${child.output.mkString("[", ",", "]")} is different from the " +
          s"view output ${output.mkString("[", ",", "]")}")
      child
  }
}
Example 36
Source File: StatsEstimationTestBase.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.statsEstimation import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{IntegerType, StringType} trait StatsEstimationTestBase extends SparkFunSuite { var originalValue: Boolean = false override def beforeAll(): Unit = { super.beforeAll() // Enable stats estimation based on CBO. originalValue = SQLConf.get.getConf(SQLConf.CBO_ENABLED) SQLConf.get.setConf(SQLConf.CBO_ENABLED, true) } override def afterAll(): Unit = { SQLConf.get.setConf(SQLConf.CBO_ENABLED, originalValue) super.afterAll() } def getColSize(attribute: Attribute, colStat: ColumnStat): Long = attribute.dataType match { // For UTF8String: base + offset + numBytes case StringType => colStat.avgLen.getOrElse(attribute.dataType.defaultSize.toLong) + 8 + 4 case _ => colStat.avgLen.getOrElse(attribute.dataType.defaultSize) } def attr(colName: String): AttributeReference = AttributeReference(colName, IntegerType)() case class StatsTestPlan( outputList: Seq[Attribute], rowCount: BigInt, attributeStats: AttributeMap[ColumnStat], size: Option[BigInt] = None) extends LeafNode { override def output: Seq[Attribute] = outputList override def computeStats(): Statistics = Statistics( // If sizeInBytes is useless in testing, we just use a fake value sizeInBytes = size.getOrElse(Int.MaxValue), rowCount = Some(rowCount), attributeStats = attributeStats) }
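A brief sketch of how the helpers above are typically used from a suite that mixes in StatsEstimationTestBase; the suite name and row count are invented.

import org.apache.spark.sql.catalyst.expressions.AttributeMap

class StatsTestPlanSketchSuite extends StatsEstimationTestBase {
  test("StatsTestPlan reports the supplied row count") {
    val plan = StatsTestPlan(
      outputList = Seq(attr("a")),        // attr(...) comes from the base trait
      rowCount = 100,
      attributeStats = AttributeMap(Nil)) // no per-column stats needed for this check
    assert(plan.computeStats().rowCount.contains(BigInt(100)))
  }
}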
Example 37
Source File: LogicalPlanSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Literal, NamedExpression} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types.IntegerType class LogicalPlanSuite extends SparkFunSuite { private var invocationCount = 0 private val function: PartialFunction[LogicalPlan, LogicalPlan] = { case p: Project => invocationCount += 1 p } private val testRelation = LocalRelation() test("transformUp runs on operators") { invocationCount = 0 val plan = Project(Nil, testRelation) plan transformUp function assert(invocationCount === 1) invocationCount = 0 plan transformDown function assert(invocationCount === 1) } test("transformUp runs on operators recursively") { invocationCount = 0 val plan = Project(Nil, Project(Nil, testRelation)) plan transformUp function assert(invocationCount === 2) invocationCount = 0 plan transformDown function assert(invocationCount === 2) } test("isStreaming") { val relation = LocalRelation(AttributeReference("a", IntegerType, nullable = true)()) val incrementalRelation = LocalRelation( Seq(AttributeReference("a", IntegerType, nullable = true)()), isStreaming = true) case class TestBinaryRelation(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output } require(relation.isStreaming === false) require(incrementalRelation.isStreaming === true) assert(TestBinaryRelation(relation, relation).isStreaming === false) assert(TestBinaryRelation(incrementalRelation, relation).isStreaming === true) assert(TestBinaryRelation(relation, incrementalRelation).isStreaming === true) assert(TestBinaryRelation(incrementalRelation, incrementalRelation).isStreaming) } test("transformExpressions works with a Stream") { val id1 = NamedExpression.newExprId val id2 = NamedExpression.newExprId val plan = Project(Stream( Alias(Literal(1), "a")(exprId = id1), Alias(Literal(2), "b")(exprId = id2)), OneRowRelation()) val result = plan.transformExpressions { case Literal(v: Int, IntegerType) if v != 1 => Literal(v + 1, IntegerType) } val expected = Project(Stream( Alias(Literal(1), "a")(exprId = id1), Alias(Literal(3), "b")(exprId = id2)), OneRowRelation()) assert(result.sameResult(expected)) } }
Example 38
Source File: DeclarativeAggregateEvaluator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection

case class DeclarativeAggregateEvaluator(function: DeclarativeAggregate, input: Seq[Attribute]) {

  lazy val initializer = GenerateSafeProjection.generate(function.initialValues)

  lazy val updater = GenerateSafeProjection.generate(
    function.updateExpressions,
    function.aggBufferAttributes ++ input)

  lazy val merger = GenerateSafeProjection.generate(
    function.mergeExpressions,
    function.aggBufferAttributes ++ function.inputAggBufferAttributes)

  lazy val evaluator = GenerateSafeProjection.generate(
    function.evaluateExpression :: Nil,
    function.aggBufferAttributes)

  def initialize(): InternalRow = initializer.apply(InternalRow.empty).copy()

  def update(values: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = values.foldLeft(initialize()) { (buffer, input) =>
      updater(joiner(buffer, input))
    }
    buffer.copy()
  }

  def merge(buffers: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = buffers.foldLeft(initialize()) { (left, right) =>
      merger(joiner(left, right))
    }
    buffer.copy()
  }

  def eval(buffer: InternalRow): InternalRow = evaluator(buffer).copy()
}
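A usage sketch of the evaluator above, driving a built-in declarative aggregate by hand. Sum and the Catalyst row/attribute helpers are standard Catalyst classes; the input values are arbitrary, and the assumption here is that Sum over an integer column produces a long result.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.expressions.aggregate.Sum
import org.apache.spark.sql.types.IntegerType

object DeclarativeAggregateEvaluatorSketch {
  def main(args: Array[String]): Unit = {
    val input = AttributeReference("a", IntegerType)()
    val evaluator = DeclarativeAggregateEvaluator(Sum(input), Seq(input))
    val partial1 = evaluator.update(InternalRow(1), InternalRow(2))   // buffer after two inputs
    val partial2 = evaluator.update(InternalRow(4))
    val merged = evaluator.merge(partial1, partial2)
    println(evaluator.eval(merged).getLong(0))                        // expected: 7
  }
}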
Example 39
Source File: LocalTableScanExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class LocalTableScanExec( output: Seq[Attribute], @transient rows: Seq[InternalRow]) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) @transient private lazy val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), sqlContext.sparkContext.defaultParallelism) private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.size) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.size) taken } }
Example 40
Source File: ObjectAggregationMap.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate import java.{util => ju} import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.internal.config import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, TypedImperativeAggregate} import org.apache.spark.sql.execution.UnsafeKVExternalSorter import org.apache.spark.sql.types.StructType import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter def dumpToExternalSorter( groupingAttributes: Seq[Attribute], aggregateFunctions: Seq[AggregateFunction]): UnsafeKVExternalSorter = { val aggBufferAttributes = aggregateFunctions.flatMap(_.aggBufferAttributes) val sorter = new UnsafeKVExternalSorter( StructType.fromAttributes(groupingAttributes), StructType.fromAttributes(aggBufferAttributes), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, TaskContext.get().taskMemoryManager().pageSizeBytes, SparkEnv.get.conf.get(config.SHUFFLE_SPILL_NUM_ELEMENTS_FORCE_SPILL_THRESHOLD), null ) val mapIterator = iterator val unsafeAggBufferProjection = UnsafeProjection.create(aggBufferAttributes.map(_.dataType).toArray) while (mapIterator.hasNext) { val entry = mapIterator.next() aggregateFunctions.foreach { case agg: TypedImperativeAggregate[_] => agg.serializeAggregateBufferInPlace(entry.aggregationBuffer) case _ => } sorter.insertKV( entry.groupingKey, unsafeAggBufferProjection(entry.aggregationBuffer) ) } hashMap.clear() sorter } def clear(): Unit = { hashMap.clear() } } // Stores the grouping key and aggregation buffer class AggregationBufferEntry(var groupingKey: UnsafeRow, var aggregationBuffer: InternalRow)
Example 41
Source File: CartesianProductExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, ExternalAppendOnlyUnsafeRowArray, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.util.CompletionIterator class UnsafeCartesianRDD( left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int, inMemoryBufferThreshold: Int, spillThreshold: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { val rowArray = new ExternalAppendOnlyUnsafeRowArray(inMemoryBufferThreshold, spillThreshold) val partition = split.asInstanceOf[CartesianPartition] rdd2.iterator(partition.s2, context).foreach(rowArray.add) // Create an iterator from rowArray def createIter(): Iterator[UnsafeRow] = rowArray.generateIterator() val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, rowArray.clear()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD( leftResults, rightResults, right.output.size, sqlContext.conf.cartesianProductExecBufferInMemoryThreshold, sqlContext.conf.cartesianProductExecBufferSpillThreshold) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 42
Source File: DataSourceV2StringFormat.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.sources.DataSourceRegister import org.apache.spark.sql.sources.v2.DataSourceV2 import org.apache.spark.util.Utils def pushedFilters: Seq[Expression] private def sourceName: String = source match { case registered: DataSourceRegister => registered.shortName() // source.getClass.getSimpleName can cause Malformed class name error, // call safer `Utils.getSimpleName` instead case _ => Utils.getSimpleName(source.getClass) } def metadataString: String = { val entries = scala.collection.mutable.ArrayBuffer.empty[(String, String)] if (pushedFilters.nonEmpty) { entries += "Filters" -> pushedFilters.mkString("[", ", ", "]") } // TODO: we should only display some standard options like path, table, etc. if (options.nonEmpty) { entries += "Options" -> Utils.redact(options).map { case (k, v) => s"$k=$v" }.mkString("[", ",", "]") } val outputStr = Utils.truncatedString(output, "[", ", ", "]") val entriesStr = if (entries.nonEmpty) { Utils.truncatedString(entries.map { case (key, value) => key + ": " + StringUtils.abbreviate(value, 100) }, " (", ", ", ")") } else { "" } s"$sourceName$outputStr$entriesStr" } }
Example 43
Source File: DataSourcePartitioning.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression} import org.apache.spark.sql.catalyst.plans.physical import org.apache.spark.sql.sources.v2.reader.partitioning.{ClusteredDistribution, Partitioning} class DataSourcePartitioning( partitioning: Partitioning, colNames: AttributeMap[String]) extends physical.Partitioning { override val numPartitions: Int = partitioning.numPartitions() override def satisfies0(required: physical.Distribution): Boolean = { super.satisfies0(required) || { required match { case d: physical.ClusteredDistribution if isCandidate(d.clustering) => val attrs = d.clustering.map(_.asInstanceOf[Attribute]) partitioning.satisfy( new ClusteredDistribution(attrs.map { a => val name = colNames.get(a) assert(name.isDefined, s"Attribute ${a.name} is not found in the data source output") name.get }.toArray)) case _ => false } } } private def isCandidate(clustering: Seq[Expression]): Boolean = { clustering.forall { case a: Attribute => colNames.contains(a) case _ => false } } }
Example 44
package org.apache.spark.sql.execution.datasources import java.util.Locale import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogUtils} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.command.{DDLUtils, RunnableCommand} import org.apache.spark.sql.types._ case class CreateTempViewUsing( tableIdent: TableIdentifier, userSpecifiedSchema: Option[StructType], replace: Boolean, global: Boolean, provider: String, options: Map[String, String]) extends RunnableCommand { if (tableIdent.database.isDefined) { throw new AnalysisException( s"Temporary view '$tableIdent' should not have specified a database") } override def argString: String = { s"[tableIdent:$tableIdent " + userSpecifiedSchema.map(_ + " ").getOrElse("") + s"replace:$replace " + s"provider:$provider " + CatalogUtils.maskCredentials(options) } override def run(sparkSession: SparkSession): Seq[Row] = { if (provider.toLowerCase(Locale.ROOT) == DDLUtils.HIVE_PROVIDER) { throw new AnalysisException("Hive data source can only be used with tables, " + "you can't use it with CREATE TEMP VIEW USING") } val dataSource = DataSource( sparkSession, userSpecifiedSchema = userSpecifiedSchema, className = provider, options = options) val catalog = sparkSession.sessionState.catalog val viewDefinition = Dataset.ofRows( sparkSession, LogicalRelation(dataSource.resolveRelation())).logicalPlan if (global) { catalog.createGlobalTempView(tableIdent.table, viewDefinition, replace) } else { catalog.createTempView(tableIdent.table, viewDefinition, replace) } Seq.empty[Row] } } case class RefreshTable(tableIdent: TableIdentifier) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { // Refresh the given table's metadata. If this table is cached as an InMemoryRelation, // drop the original cached version and make the new version cached lazily. sparkSession.catalog.refreshTable(tableIdent.quotedString) Seq.empty[Row] } } case class RefreshResource(path: String) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { sparkSession.catalog.refreshByPath(path) Seq.empty[Row] } }
Example 45
Source File: Exchange.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]() plan.transformUp { case exchange: Exchange => // the exchanges that have same results usually also have same schemas (same column names). val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]()) val samePlan = sameSchema.find { e => exchange.sameResult(e) } if (samePlan.isDefined) { // Keep the output of this exchange, the following plans require that to resolve // attributes. ReusedExchangeExec(exchange.output, samePlan.get) } else { sameSchema += exchange exchange } } } }
Example 46
Source File: GroupedIterator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Expression, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateOrdering, GenerateUnsafeProjection} object GroupedIterator { def apply( input: Iterator[InternalRow], keyExpressions: Seq[Expression], inputSchema: Seq[Attribute]): Iterator[(InternalRow, Iterator[InternalRow])] = { if (input.hasNext) { new GroupedIterator(input.buffered, keyExpressions, inputSchema) } else { Iterator.empty } } } def hasNext: Boolean = currentIterator != null || fetchNextGroupIterator def next(): (InternalRow, Iterator[InternalRow]) = { assert(hasNext) // Ensure we have fetched the next iterator. val ret = (keyProjection(currentGroup), currentIterator) currentIterator = null ret } private def fetchNextGroupIterator(): Boolean = { assert(currentIterator == null) if (currentRow == null && input.hasNext) { currentRow = input.next() } if (currentRow == null) { // These is no data left, return false. false } else { // Skip to next group. // currentRow may be overwritten by `hasNext`, so we should compare them first. while (keyOrdering.compare(currentGroup, currentRow) == 0 && input.hasNext) { currentRow = input.next() } if (keyOrdering.compare(currentGroup, currentRow) == 0) { // We are in the last group, there is no more groups, return false. false } else { // Now the `currentRow` is the first row of next group. currentGroup = currentRow.copy() currentIterator = createGroupValuesIterator() true } } } private def createGroupValuesIterator(): Iterator[InternalRow] = { new Iterator[InternalRow] { def hasNext: Boolean = currentRow != null || fetchNextRowInGroup() def next(): InternalRow = { assert(hasNext) val res = currentRow currentRow = null res } private def fetchNextRowInGroup(): Boolean = { assert(currentRow == null) if (input.hasNext) { // The inner iterator should NOT consume the input into next group, here we use `head` to // peek the next input, to see if we should continue to process it. if (keyOrdering.compare(currentGroup, input.head) == 0) { // Next input is in the current group. Continue the inner iterator. currentRow = input.next() true } else { // Next input is not in the right group. End this inner iterator. false } } else { // There is no more data, return false. false } } } } }
Example 47
Source File: resources.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("Results", StringType, nullable = false)() :: Nil
  }

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val jarList = sparkSession.sparkContext.listJars()
    if (jars.nonEmpty) {
      for {
        jarName <- jars.map(f => new Path(f).getName)
        jarPath <- jarList if jarPath.contains(jarName)
      } yield Row(jarPath)
    } else {
      jarList.map(Row(_))
    }
  }
}
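ListJarsCommand is the command behind Spark SQL's LIST JARS statement; a minimal sketch of exercising it through a SparkSession. The jar path is hypothetical and must exist locally for addJar to succeed.

import org.apache.spark.sql.SparkSession

object ListJarsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("list-jars-demo").getOrCreate()
    spark.sparkContext.addJar("/tmp/example-udfs.jar")  // hypothetical jar path
    spark.sql("LIST JARS").show(truncate = false)       // one row per registered jar
    spark.stop()
  }
}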
Example 48
Source File: DataWritingCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker import org.apache.spark.sql.execution.datasources.FileFormatWriter import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.util.SerializableConfiguration def logicalPlanOutputWithNames( query: LogicalPlan, names: Seq[String]): Seq[Attribute] = { // Save the output attributes to a variable to avoid duplicated function calls. val outputAttributes = query.output assert(outputAttributes.length == names.length, "The length of provided names doesn't match the length of output attributes.") outputAttributes.zip(names).map { case (attr, outputName) => attr.withName(outputName) } } }
Example 49
Source File: StreamingGlobalLimitExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.util.concurrent.TimeUnit.NANOSECONDS import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.streaming.state.StateStoreOps import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType} import org.apache.spark.util.CompletionIterator case class StreamingGlobalLimitExec( streamLimit: Long, child: SparkPlan, stateInfo: Option[StatefulOperatorStateInfo] = None, outputMode: Option[OutputMode] = None) extends UnaryExecNode with StateStoreWriter { private val keySchema = StructType(Array(StructField("key", NullType))) private val valueSchema = StructType(Array(StructField("value", LongType))) override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver assert(outputMode.isDefined && outputMode.get == InternalOutputModes.Append, "StreamingGlobalLimitExec is only valid for streams in Append output mode") child.execute().mapPartitionsWithStateStore( getStateInfo, keySchema, valueSchema, indexOrdinal = None, sqlContext.sessionState, Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) => val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null))) val numOutputRows = longMetric("numOutputRows") val numUpdatedStateRows = longMetric("numUpdatedStateRows") val allUpdatesTimeMs = longMetric("allUpdatesTimeMs") val commitTimeMs = longMetric("commitTimeMs") val updatesStartTimeNs = System.nanoTime val preBatchRowCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L) var cumulativeRowCount = preBatchRowCount val result = iter.filter { r => val x = cumulativeRowCount < streamLimit if (x) { cumulativeRowCount += 1 } x } CompletionIterator[InternalRow, Iterator[InternalRow]](result, { if (cumulativeRowCount > preBatchRowCount) { numUpdatedStateRows += 1 numOutputRows += cumulativeRowCount - preBatchRowCount store.put(key, getValueRow(cumulativeRowCount)) } allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs) commitTimeMs += timeTakenMs { store.commit() } setStoreMetrics(store) }) } } override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = child.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = AllTuples :: Nil private def getValueRow(value: Long): UnsafeRow = { UnsafeProjection.create(valueSchema)(new GenericInternalRow(Array[Any](value))) } }
Example 50
Source File: ContinuousCoalesceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import java.util.UUID import org.apache.spark.{HashPartitioner, SparkEnv} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeRow} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.continuous.shuffle.{ContinuousShuffleReadPartition, ContinuousShuffleReadRDD} case class ContinuousCoalesceExec(numPartitions: Int, child: SparkPlan) extends SparkPlan { override def output: Seq[Attribute] = child.output override def children: Seq[SparkPlan] = child :: Nil override def outputPartitioning: Partitioning = SinglePartition override def doExecute(): RDD[InternalRow] = { assert(numPartitions == 1) new ContinuousCoalesceRDD( sparkContext, numPartitions, conf.continuousStreamingExecutorQueueSize, sparkContext.getLocalProperty(ContinuousExecution.EPOCH_INTERVAL_KEY).toLong, child.execute()) } }
Example 51
Source File: WriteToContinuousDataSourceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import scala.util.control.NonFatal import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter case class WriteToContinuousDataSourceExec(writer: StreamWriter, query: SparkPlan) extends SparkPlan with Logging { override def children: Seq[SparkPlan] = Seq(query) override def output: Seq[Attribute] = Nil override protected def doExecute(): RDD[InternalRow] = { val writerFactory = writer.createWriterFactory() val rdd = new ContinuousWriteRDD(query.execute(), writerFactory) logInfo(s"Start processing data source writer: $writer. " + s"The input RDD has ${rdd.partitions.length} partitions.") EpochCoordinatorRef.get( sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), sparkContext.env) .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions)) try { // Force the RDD to run so continuous processing starts; no data is actually being collected // to the driver, as ContinuousWriteRDD outputs nothing. rdd.collect() } catch { case _: InterruptedException => // Interruption is how continuous queries are ended, so accept and ignore the exception. case cause: Throwable => cause match { // Do not wrap interruption exceptions that will be handled by streaming specially. case _ if StreamExecution.isInterruptionException(cause) => throw cause // Only wrap non fatal exceptions. case NonFatal(e) => throw new SparkException("Writing job aborted.", e) case _ => throw cause } } sparkContext.emptyRDD } }
Example 52
Source File: StreamingRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2} object StreamingRelation { def apply(dataSource: DataSource): StreamingRelation = { StreamingRelation( dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes) } } case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode { override def toString: String = sourceName override protected def doExecute(): RDD[InternalRow] = { throw new UnsupportedOperationException("StreamingRelationExec cannot be executed") } } object StreamingExecutionRelation { def apply(source: Source, session: SparkSession): StreamingExecutionRelation = { StreamingExecutionRelation(source, source.schema.toAttributes)(session) } }
Example 53
Source File: EventTimeWatermarkExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, child: SparkPlan) extends UnaryExecNode { val eventTimeStats = new EventTimeStatsAccum() val delayMs = EventTimeWatermark.getDelayMs(delay) sparkContext.register(eventTimeStats) override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => eventTimeStats.add(getEventTime(row).getLong(0) / 1000) row } } } // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delayMs) .build() a.withMetadata(updatedMetadata) } else if (a.metadata.contains(EventTimeWatermark.delayKey)) { // Remove existing watermark val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .remove(EventTimeWatermark.delayKey) .build() a.withMetadata(updatedMetadata) } else { a } } }
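EventTimeWatermarkExec is planned when a watermark is declared on a streaming Dataset; a minimal sketch using the public API. The rate source and the 10-minute delay are arbitrary choices.

import org.apache.spark.sql.SparkSession

object WatermarkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("watermark-demo").getOrCreate()
    val withDelay = spark.readStream
      .format("rate")                           // built-in source that emits a `timestamp` column
      .load()
      .withWatermark("timestamp", "10 minutes") // attaches the delay metadata consumed above
    withDelay.printSchema()
    spark.stop()
  }
}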
Example 54
Source File: CoGroupedIterator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering class CoGroupedIterator( left: Iterator[(InternalRow, Iterator[InternalRow])], right: Iterator[(InternalRow, Iterator[InternalRow])], groupingSchema: Seq[Attribute]) extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] { private val keyOrdering = GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema) private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _ private var currentRightData: (InternalRow, Iterator[InternalRow]) = _ override def hasNext: Boolean = { if (currentLeftData == null && left.hasNext) { currentLeftData = left.next() } if (currentRightData == null && right.hasNext) { currentRightData = right.next() } currentLeftData != null || currentRightData != null } override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { assert(hasNext) if (currentLeftData.eq(null)) { // left is null, right is not null, consume the right data. rightOnly() } else if (currentRightData.eq(null)) { // left is not null, right is null, consume the left data. leftOnly() } else if (currentLeftData._1 == currentRightData._1) { // left and right have the same grouping key, consume both of them. val result = (currentLeftData._1, currentLeftData._2, currentRightData._2) currentLeftData = null currentRightData = null result } else { val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1) assert(compare != 0) if (compare < 0) { // the grouping key of left is smaller, consume the left data. leftOnly() } else { // the grouping key of right is smaller, consume the right data. rightOnly() } } } private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentLeftData._1, currentLeftData._2, Iterator.empty) currentLeftData = null result } private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentRightData._1, Iterator.empty, currentRightData._2) currentRightData = null result } }
Example 55
Source File: ReferenceSort.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter case class ReferenceSort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) extends UnaryExecNode { override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => val ordering = newOrdering(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) val context = TaskContext.get() context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes) CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) } override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = child.outputPartitioning }
Example 56
Source File: SparkPlannerSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.Strategy import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union} import org.apache.spark.sql.test.SharedSQLContext class SparkPlannerSuite extends SharedSQLContext { import testImplicits._ test("Ensure to go down only the first branch, not any other possible branches") { case object NeverPlanned extends LeafNode { override def output: Seq[Attribute] = Nil } var planned = 0 object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case ReturnAnswer(child) => planned += 1 planLater(child) :: planLater(NeverPlanned) :: Nil case Union(children) => planned += 1 UnionExec(children.map(planLater)) :: planLater(NeverPlanned) :: Nil case LocalRelation(output, data, _) => planned += 1 LocalTableScanExec(output, data) :: planLater(NeverPlanned) :: Nil case NeverPlanned => fail("QueryPlanner should not go down to this branch.") case _ => Nil } } try { spark.experimental.extraStrategies = TestStrategy :: Nil val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS()) assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f")) assert(planned === 4) } finally { spark.experimental.extraStrategies = Nil } } }
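The test above plugs a Strategy in through spark.experimental.extraStrategies; a short sketch of the same extension point in an application. The strategy below deliberately plans nothing, and the object names are invented.

import org.apache.spark.sql.{SparkSession, Strategy}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan

object ExtraStrategySketch {
  // Declines to plan anything, so Spark falls back to its built-in strategies.
  object NoopStrategy extends Strategy {
    override def apply(plan: LogicalPlan): Seq[SparkPlan] = Nil
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("strategy-demo").getOrCreate()
    spark.experimental.extraStrategies = NoopStrategy :: Nil
    spark.range(3).collect()   // planned with the extra strategy consulted first
    spark.stop()
  }
}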
Example 57
Source File: HierarchyPlan.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst._ import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.LevelMatcher import org.apache.spark.sql.hierarchy._ import org.apache.spark.sql.types._ import org.apache.spark.sql.util.RddUtils val schemaWithNode = StructType(child.schema.fields ++ Seq(StructField("", NodeType, nullable = false))) val resultInternalRdd = RDDConversions.rowToRowRdd(cachedResultRdd, schemaWithNode.fields.map(_.dataType)) resultInternalRdd } } private[sql] case class AdjacencyListHierarchyPlan(child: SparkPlan, parenthoodExp: Expression, startWhere: Option[Expression], orderBy: Seq[SortOrder], node: Attribute, dataType: DataType) extends HierarchyPlan(child, node) { override protected val builder: HierarchyBuilder[Row, Row] = HierarchyRowBroadcastBuilder(child.output, parenthoodExp, startWhere, orderBy) override protected val pathDataType = dataType } private[sql] case class LevelHierarchyPlan(child: SparkPlan, levels: Seq[Expression], startWhere: Option[Expression], orderBy: Seq[SortOrder], matcher: LevelMatcher, node: Attribute, dataType: DataType) extends HierarchyPlan(child, node) { override protected val builder: HierarchyBuilder[Row, Row] = HierarchyRowLevelBasedBuilder( child.output, levels, startWhere, orderBy, matcher) override protected val pathDataType = dataType }
Example 58
Source File: ShowTablesUsingCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.sources.DatasourceCatalog import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.types.{StringType, StructField, StructType} private[sql] case class ShowTablesUsingCommand(provider: String, options: Map[String, String]) extends LogicalPlan with RunnableCommand { override def output: Seq[Attribute] = StructType( StructField("TABLE_NAME", StringType, nullable = false) :: StructField("IS_TEMPORARY", StringType, nullable = false) :: StructField("KIND", StringType, nullable = false) :: Nil ).toAttributes override def run(sqlContext: SQLContext): Seq[Row] = { val dataSource: Any = DatasourceResolver.resolverFor(sqlContext).newInstanceOf(provider) dataSource match { case describableRelation: DatasourceCatalog => describableRelation .getRelations(sqlContext, new CaseInsensitiveMap(options)) .map(relationInfo => Row( relationInfo.name, relationInfo.isTemporary.toString.toUpperCase, relationInfo.kind.toUpperCase)) case _ => throw new RuntimeException(s"The provided data source $provider does not support " + "showing its relations.") } } }
Example 59
Source File: DeepDescribeCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.sources.describable.Describable import org.apache.spark.sql.sources.describable.FieldLike.StructFieldLike import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, SQLContext} private[sql] case class DeepDescribeCommand( relation: Describable) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { val description = relation.describe() Seq(description match { case r: Row => r case default => Row(default) }) } override def output: Seq[Attribute] = { relation.describeOutput match { case StructType(fields) => fields.map(StructFieldLike.toAttribute) case other => AttributeReference("value", other)() :: Nil } } }
Example 60
Source File: DescribeTableUsingCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.TableIdentifierUtils._ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.sources.{DatasourceCatalog, RelationInfo} import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext} private[sql] case class DescribeTableUsingCommand( name: TableIdentifier, provider: String, options: Map[String, String]) extends LogicalPlan with RunnableCommand { override def output: Seq[Attribute] = StructType( StructField("TABLE_NAME", StringType, nullable = false) :: StructField("DDL_STMT", StringType, nullable = false) :: Nil ).toAttributes override def run(sqlContext: SQLContext): Seq[Row] = { // Convert the table name according to the case-sensitivity settings val tableId = name.toSeq val resolver = DatasourceResolver.resolverFor(sqlContext) val catalog = resolver.newInstanceOfTyped[DatasourceCatalog](provider) Seq(catalog .getRelation(sqlContext, tableId, new CaseInsensitiveMap(options)) match { case None => Row("", "") case Some(RelationInfo(relName, _, _, ddl, _)) => Row( relName, ddl.getOrElse("")) }) } }
Example 61
Source File: CreateTablePartitionedByUsing.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan}
import org.apache.spark.sql.types.StructType

case class CreateTablePartitionedByUsing(
    tableIdent: TableIdentifier,
    userSpecifiedSchema: Option[StructType],
    provider: String,
    partitioningFunc: String,
    partitioningColumns: Seq[String],
    temporary: Boolean,
    options: Map[String, String],
    allowExisting: Boolean,
    managedIfNoPath: Boolean) extends LogicalPlan with Command {

  override def output: Seq[Attribute] = Seq.empty

  override def children: Seq[LogicalPlan] = Seq.empty
}
Example 62
Source File: DescCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.commands.hive

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{Row, SQLContext}

case class DescCommand(ident: TableIdentifier) extends HiveRunnableCommand {

  override protected val commandName: String = s"DESC $ident"

  override def execute(sqlContext: SQLContext): Seq[Row] = {
    val plan = sqlContext.catalog.lookupRelation(ident)
    if (plan.resolved) {
      plan.schema.map { field =>
        Row(field.name, field.dataType.simpleString, None)
      }
    } else {
      Seq.empty
    }
  }

  override lazy val output: Seq[Attribute] =
    AttributeReference("col_name", StringType)() ::
      AttributeReference("data_type", StringType)() ::
      AttributeReference("comment", StringType)() :: Nil
}
Example 63
Source File: inferSchemaCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.commands import org.apache.spark.sql.catalyst.analysis.systables.SchemaEnumeration import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LeafNode import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.execution.tablefunctions.DataTypeExtractor import org.apache.spark.sql.hive.orc.OrcRelation import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, SQLContext} case class InferSchemaCommand(path: String, fileType: FileType) extends RunnableCommand { override lazy val output: Seq[Attribute] = InferSchemaCommand.schema.toAttributes override def run(sqlContext: SQLContext): Seq[Row] = { val fileSchema = fileType.readSchema(sqlContext, path) fileSchema.zipWithIndex.map { case (StructField(name, dataType, nullable, _), idx) => val dataTypeExtractor = DataTypeExtractor(dataType) Row( name, idx + 1, // idx + 1 since the ordinal position has to start at 1 nullable, dataTypeExtractor.inferredSqlType, dataTypeExtractor.numericPrecision.orNull, dataTypeExtractor.numericPrecisionRadix.orNull, dataTypeExtractor.numericScale.orNull) } } } object InferSchemaCommand extends SchemaEnumeration { val name = Field("COLUMN_NAME", StringType, nullable = false) val ordinalPosition = Field("ORDINAL_POSITION", IntegerType, nullable = false) val isNullable = Field("IS_NULLABLE", BooleanType, nullable = false) val dataType = Field("DATA_TYPE", StringType, nullable = false) val numericPrecision = Field("NUMERIC_PRECISION", IntegerType, nullable = true) val numericPrecisionRadix = Field("NUMERIC_PRECISION_RADIX", IntegerType, nullable = true) val numericScale = Field("NUMERIC_SCALE", IntegerType, nullable = true) }
Example 64
Source File: ShowPartitionFunctionsUsingCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.commands import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DatasourceResolver, DefaultDatasourceResolver, Row, SQLContext} case class ShowPartitionFunctionsUsingCommand( provider: String, options: Map[String, String]) extends RunnableCommand { def run(sqlContext: SQLContext): Seq[Row] = { val resolver = DatasourceResolver.resolverFor(sqlContext) val pFunProvider = resolver.newInstanceOfTyped[PartitioningFunctionProvider](provider) val pFuns = pFunProvider.getAllPartitioningFunctions(sqlContext, options) pFuns.map { fun => val (splittersOpt, rightClosedOpt) = fun match { case RangeSplitPartitioningFunction(_, _, splitters, rightClosed) => (Some(splitters), Some(rightClosed)) case _ => (None, None) } val (startOpt, endOpt, intervalTypeOpt, intervalValueOpt) = fun match { case RangeIntervalPartitioningFunction(_, _, start, end, strideParts) => (Some(start), Some(end), Some(strideParts.productPrefix), Some(strideParts.n)) case _ => (None, None, None, None) } val partitionsNoOpt = fun match { case HashPartitioningFunction(_, _, partitionsNo) => partitionsNo case s: SimpleDataType => None } Row(fun.name, fun.productPrefix, fun.dataTypes.map(_.toString).mkString(","), splittersOpt.map(_.mkString(",")).orNull, rightClosedOpt.orNull, startOpt.orNull, endOpt.orNull, intervalTypeOpt.orNull, intervalValueOpt.orNull, partitionsNoOpt.orNull) } } override lazy val output: Seq[Attribute] = StructType( StructField("name", StringType, nullable = false) :: StructField("kind", StringType, nullable = false) :: StructField("dataTypes", StringType, nullable = false) :: StructField("splitters", StringType, nullable = true) :: StructField("rightClosed", BooleanType, nullable = true) :: StructField("start", IntegerType, nullable = true) :: StructField("end", IntegerType, nullable = true) :: StructField("intervalType", StringType, nullable = true) :: StructField("intervalValue", IntegerType, nullable = true) :: StructField("partitionsNo", IntegerType, nullable = true) :: Nil ).toAttributes }
Example 65
Source File: RawSqlSourceProvider.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import java.util.concurrent.atomic.AtomicReference import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.execution.{PhysicalRDD, RDDConversions, SparkPlan} import org.apache.spark.sql.sources.RawDDLObjectType.RawDDLObjectType import org.apache.spark.sql.sources.RawDDLStatementType.RawDDLStatementType import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, SQLContext} case object RawDDLObjectType { sealed trait RawDDLObjectType { val name: String override def toString: String = name } sealed abstract class BaseRawDDLObjectType(val name: String) extends RawDDLObjectType sealed trait RawData case object PartitionFunction extends BaseRawDDLObjectType("partition function") case object PartitionScheme extends BaseRawDDLObjectType("partition scheme") case object Collection extends BaseRawDDLObjectType("collection") with RawData case object Series extends BaseRawDDLObjectType("table") with RawData case object Graph extends BaseRawDDLObjectType("graph") with RawData } case object RawDDLStatementType { sealed trait RawDDLStatementType case object Create extends RawDDLStatementType case object Drop extends RawDDLStatementType case object Append extends RawDDLStatementType case object Load extends RawDDLStatementType } protected def calculateSchema(): StructType }
Example 66
Source File: UseAliasesForFunctionsInGroupings.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Subquery} import org.apache.spark.sql.catalyst.rules.Rule object UseAliasesForFunctionsInGroupings extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case agg@Aggregate(groupingExpressions, aggregateExpressions, child) => val fixedGroupingExpressions = groupingExpressions.map({ case e: AttributeReference => e case e => val aliasOpt = aggregateExpressions.find({ case Alias(aliasChild, aliasName) => aliasChild == e case _ => false }) aliasOpt match { case Some(alias) => alias.toAttribute case None => sys.error(s"Cannot resolve Alias for $e") } }) agg.copy(groupingExpressions = fixedGroupingExpressions) } }
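A minimal sketch of the rewrite this rule performs, built with the catalyst DSL and assuming the HANAVora-Extensions classes are on the classpath: grouping on a + 1 while the aggregate list aliases the same expression as bucket lets the rule swap the grouping expression for the alias's attribute.

import org.apache.spark.sql.catalyst.analysis.UseAliasesForFunctionsInGroupings
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation

object UseAliasesSketch {
  def main(args: Array[String]): Unit = {
    val relation = LocalRelation('a.int, 'b.int)
    // GROUP BY (a + 1), SELECT (a + 1) AS bucket: the rule can reuse the alias.
    val plan = relation.groupBy('a + 1)(('a + 1) as "bucket")
    println(UseAliasesForFunctionsInGroupings(plan).treeString)
  }
}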
Example 67
Source File: LogicalPlanExtractorSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.tablefunctions import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, ExprId} import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.types._ import org.scalatest.FunSuite class LogicalPlanExtractorSuite extends FunSuite { def attr(name: String, dataType: DataType, id: Int, nullable: Boolean = false): Attribute = { AttributeReference(name, dataType, nullable)(ExprId(id)) } val attributes = Seq(attr("foo", IntegerType, 0), attr("bar", StringType, 1)) test("tablePart") { val project = Project(attributes, null) val tablePart = new LogicalPlanExtractor(project).tablePart assert(tablePart == "" :: Nil) } }
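The attr helper above relies on AttributeReference's curried constructor: the second parameter list carries the expression id, so two attributes can share a name while keeping distinct identities in a plan. A small sketch of that distinction:

import org.apache.spark.sql.catalyst.expressions.{AttributeReference, ExprId}
import org.apache.spark.sql.types.IntegerType

object AttrIdentitySketch {
  def main(args: Array[String]): Unit = {
    val a1 = AttributeReference("foo", IntegerType, nullable = false)(ExprId(0))
    val a2 = AttributeReference("foo", IntegerType, nullable = false)(ExprId(1))
    println(a1.name == a2.name)     // true: same column name
    println(a1.exprId == a2.exprId) // false: distinct identities
    println(a1 == a2)               // false: equality includes the expression id
  }
}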
Example 68
Source File: CollapseExpandSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.analysis.CollapseExpandSuite.SqlLikeCatalystSourceRelation import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.sources.sql.SqlLikeRelation import org.apache.spark.sql.sources.{BaseRelation, CatalystSource, Table} import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.util.PlanComparisonUtils._ import org.apache.spark.sql.{GlobalSapSQLContext, Row} import org.mockito.Matchers._ import org.mockito.Mockito._ import org.scalatest.FunSuite import org.scalatest.mock.MockitoSugar class CollapseExpandSuite extends FunSuite with MockitoSugar with GlobalSapSQLContext { case object Leaf extends LeafNode { override def output: Seq[Attribute] = Seq.empty } test("Expansion with a single sequence of projections is correctly collapsed") { val expand = Expand( Seq(Seq('a.string, Literal(1))), Seq('a.string, 'gid.int), Leaf) val collapsed = CollapseExpand(expand) assertResult(normalizeExprIds(Project(Seq('a.string, Literal(1) as "gid"), Leaf)))( normalizeExprIds(collapsed)) } test("Expansion with multiple projections is correctly collapsed") { val expand = Expand( Seq( Seq('a.string, Literal(1)), Seq('b.string, Literal(1))), Seq('a.string, 'gid1.int, 'b.string, 'gid2.int), Leaf) val collapsed = CollapseExpand(expand) assertResult( normalizeExprIds( Project(Seq( 'a.string, Literal(1) as "gid1", 'b.string, Literal(1) as "gid2"), Leaf)))(normalizeExprIds(collapsed)) } test("Expand pushdown integration") { val relation = mock[SqlLikeCatalystSourceRelation] when(relation.supportsLogicalPlan(any[Expand])) .thenReturn(true) when(relation.isMultiplePartitionExecution(any[Seq[CatalystSource]])) .thenReturn(true) when(relation.schema) .thenReturn(StructType(StructField("foo", StringType) :: Nil)) when(relation.relationName) .thenReturn("t") when(relation.logicalPlanToRDD(any[LogicalPlan])) .thenReturn(sc.parallelize(Seq(Row("a", 1), Row("b", 1), Row("a", 1)))) sqlc.baseRelationToDataFrame(relation).registerTempTable("t") val dataFrame = sqlc.sql("SELECT COUNT(DISTINCT foo) FROM t") val Seq(Row(ct)) = dataFrame.collect().toSeq assertResult(2)(ct) } } object CollapseExpandSuite { abstract class SqlLikeCatalystSourceRelation extends BaseRelation with Table with SqlLikeRelation with CatalystSource }
Example 69
Source File: ResolveHierarchySuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions.{Attribute, EqualTo} import org.apache.spark.sql.catalyst.plans.logical.{AdjacencyListHierarchySpec, Hierarchy} import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ import org.scalatest.FunSuite import org.scalatest.mock.MockitoSugar class ResolveHierarchySuite extends FunSuite with MockitoSugar { val br1 = new BaseRelation { override def sqlContext: SQLContext = mock[SQLContext] override def schema: StructType = StructType(Seq( StructField("id", IntegerType), StructField("parent", IntegerType) )) } val lr1 = LogicalRelation(br1) val idAtt = lr1.output.find(_.name == "id").get val parentAtt = lr1.output.find(_.name == "parent").get test("Check parenthood expression has no conflicting expression IDs and qualifiers") { val source = SimpleAnalyzer.execute(lr1.select('id, 'parent).subquery('u)) assert(source.resolved) val hierarchy = Hierarchy( AdjacencyListHierarchySpec(source, "v", UnresolvedAttribute("u" :: "id" :: Nil) === UnresolvedAttribute("v" :: "id" :: Nil), Some('id.isNull), Nil), 'node ) val resolveHierarchy = ResolveHierarchy(SimpleAnalyzer) val resolveReferences = ResolveReferencesWithHierarchies(SimpleAnalyzer) val resolvedHierarchy = (0 to 10).foldLeft(hierarchy: Hierarchy) { (h, _) => SimpleAnalyzer.ResolveReferences( resolveReferences(resolveHierarchy(h)) ).asInstanceOf[Hierarchy] } assert(resolvedHierarchy.node.resolved) val resolvedSpec = resolvedHierarchy.spec.asInstanceOf[AdjacencyListHierarchySpec] assert(resolvedSpec.parenthoodExp.resolved) assert(resolvedSpec.startWhere.forall(_.resolved)) assert(resolvedHierarchy.childrenResolved) assert(resolvedHierarchy.resolved) val parenthoodExpression = resolvedSpec.parenthoodExp.asInstanceOf[EqualTo] assertResult("u" :: Nil)(parenthoodExpression.left.asInstanceOf[Attribute].qualifiers) assertResult("v" :: Nil)(parenthoodExpression.right.asInstanceOf[Attribute].qualifiers) assert(parenthoodExpression.right.asInstanceOf[Attribute].exprId != source.output.find(_.name == "id").get.exprId) } }
Example 70
Source File: PlanUtilsSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, LeafNode} import org.scalatest.FunSuite import org.apache.spark.sql.util.PlanUtils._ class PlanUtilsSuite extends FunSuite { trait NoAttributes { self: LogicalPlan => override def output: Seq[Attribute] = Seq.empty } case object Leaf extends LeafNode with NoAttributes case class Node(children: LogicalPlan*) extends LogicalPlan with NoAttributes val k = Leaf // _____a_____ val j = Leaf // / | \ val i = Leaf // b e k val h = Node(i) // / \ / \ val g = Node(h, j) // c d f g val f = Leaf // / \ val e = Node(f, g) // h j val d = Leaf // | val c = Leaf // i val b = Node(c, d) // val a = Node(b, e, k) // test("isLeaf") { assertResult(expected = false)(Node(Leaf).isLeaf) assertResult(expected = true)(Leaf.isLeaf) } test("find") { assertResult(None)(a.find(_ == Node(Leaf, Leaf, Leaf))) assertResult(Some(h))(a.find(_ == Node(Leaf))) } test("filter") { assertResult(Seq.empty)(a.filter(_ == Node(Leaf, Leaf, Leaf))) assertResult(Seq(c, d, f, i, j, k))(a.filter(_.isLeaf)) } test("contains") { assertResult(expected = false)(a.contains(Node(Leaf, Leaf, Leaf))) assertResult(expected = true)(a.contains(Node(Leaf))) } test("exists") { assertResult(expected = true)(a.exists(node => node == Node(Leaf))) assertResult(expected = false)(a.exists(node => node == Node(Leaf, Leaf, Leaf))) } test("toPreOrderSeq") { assertResult(a.toPreOrderSeq.toList)(List(a, b, c, d, e, f, g, h, i, j, k)) } test("toPostOrderSeq") { assertResult(a.toPostOrderSeq.toList)(List(c, d, b, f, i, h, j, g, e, k, a)) } test("toLevelOrderSeq") { assertResult(a.toLevelOrderSeq.toList)(List(a, b, e, k, c, d, f, g, h, j, i)) } test("toSeq") { assertResult(a.toSeq(PreOrder))(a.toPreOrderSeq) assertResult(a.toSeq(PostOrder))(a.toPostOrderSeq) assertResult(a.toSeq(LevelOrder))(a.toLevelOrderSeq) } }
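The three traversal orders exercised above can be written generically against any plan node with a children sequence. A minimal sketch (hypothetical helpers, not the project's PlanUtils) that reproduces the pre-, post- and level-order results asserted in the test:

import scala.collection.immutable.Queue
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

object TraversalSketch {
  // Node first, then each subtree.
  def preOrder(p: LogicalPlan): Seq[LogicalPlan] =
    p +: p.children.flatMap(preOrder)

  // Subtrees first, node last.
  def postOrder(p: LogicalPlan): Seq[LogicalPlan] =
    p.children.flatMap(postOrder) :+ p

  // Breadth-first using an immutable queue.
  def levelOrder(p: LogicalPlan): Seq[LogicalPlan] = {
    @annotation.tailrec
    def loop(queue: Queue[LogicalPlan], acc: Vector[LogicalPlan]): Seq[LogicalPlan] =
      queue.dequeueOption match {
        case None               => acc
        case Some((node, rest)) => loop(rest.enqueue(node.children), acc :+ node)
      }
    loop(Queue(p), Vector.empty)
  }
}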
Example 71
Source File: EventHubsWriter.scala From azure-event-hubs-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.eventhubs import org.apache.spark.internal.Logging import org.apache.spark.sql.{ AnalysisException, SparkSession } import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.types.{ BinaryType, StringType } import org.apache.spark.util.Utils private[eventhubs] object EventHubsWriter extends Logging { val BodyAttributeName = "body" val PartitionKeyAttributeName = "partitionKey" val PartitionIdAttributeName = "partition" val PropertiesAttributeName = "properties" override def toString: String = "EventHubsWriter" private def validateQuery(schema: Seq[Attribute], parameters: Map[String, String]): Unit = { schema .find(_.name == BodyAttributeName) .getOrElse( throw new AnalysisException(s"Required attribute '$BodyAttributeName' not found.") ) .dataType match { case StringType | BinaryType => // good case _ => throw new AnalysisException( s"$BodyAttributeName attribute type " + s"must be a String or BinaryType.") } } def write( sparkSession: SparkSession, queryExecution: QueryExecution, parameters: Map[String, String] ): Unit = { val schema = queryExecution.analyzed.output validateQuery(schema, parameters) queryExecution.toRdd.foreachPartition { iter => val writeTask = new EventHubsWriteTask(parameters, schema) Utils.tryWithSafeFinally(block = writeTask.execute(iter))( finallyBlock = writeTask.close() ) } } }
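A minimal usage sketch of the write path above, assuming the azure-event-hubs-spark connector is on the classpath and a valid connection string is supplied externally: the DataFrame must expose a body column of String or Binary type, which is exactly what validateQuery enforces.

import org.apache.spark.eventhubs.EventHubsConf
import org.apache.spark.sql.SparkSession

object EventHubsWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("eh-write-sketch").getOrCreate()
    import spark.implicits._

    // Assumed to be provided by the environment; never hard-code credentials.
    val connectionString = sys.env("EVENTHUBS_CONNECTION_STRING")
    val ehConf = EventHubsConf(connectionString)

    // 'body' must be StringType or BinaryType, per validateQuery above.
    val df = Seq("event-1", "event-2", "event-3").toDF("body")

    df.write
      .format("eventhubs")
      .options(ehConf.toMap)
      .save()
  }
}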
Example 72
Source File: FileSourceScanExecAdapter.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.oap.adapter import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.HadoopFsRelation import org.apache.spark.sql.types.StructType import org.apache.spark.util.collection.BitSet object FileSourceScanExecAdapter { def createFileSourceScanExec( relation: HadoopFsRelation, output: Seq[Attribute], requiredSchema: StructType, partitionFilters: Seq[Expression], optionalBucketSets: Option[BitSet], dataFilters: Seq[Expression], metastoreTableIdentifier: Option[TableIdentifier]): FileSourceScanExec = { FileSourceScanExec( relation, output, requiredSchema, partitionFilters, optionalBucketSets, dataFilters, metastoreTableIdentifier) } }
Example 73
Source File: OapAggUtils.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Final, Partial} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources.oap.OapAggregationFileScanExec object OapAggUtils { private def createAggregate( requiredChildDistributionExpressions: Option[Seq[Expression]] = None, groupingExpressions: Seq[NamedExpression] = Nil, aggregateExpressions: Seq[AggregateExpression] = Nil, aggregateAttributes: Seq[Attribute] = Nil, initialInputBufferOffset: Int = 0, resultExpressions: Seq[NamedExpression] = Nil, child: SparkPlan): SparkPlan = { if (requiredChildDistributionExpressions.isDefined) { // final aggregate, fall back to Spark HashAggregateExec. HashAggregateExec( requiredChildDistributionExpressions = requiredChildDistributionExpressions, groupingExpressions = groupingExpressions, aggregateExpressions = aggregateExpressions, aggregateAttributes = aggregateAttributes, initialInputBufferOffset = initialInputBufferOffset, resultExpressions = resultExpressions, child = child) } else { // Apply partial aggregate optimizations. OapAggregateExec( requiredChildDistributionExpressions = None, groupingExpressions = groupingExpressions, aggregateExpressions = aggregateExpressions, aggregateAttributes = aggregateAttributes, initialInputBufferOffset = initialInputBufferOffset, resultExpressions = resultExpressions, child = child) } } def planAggregateWithoutDistinct( groupingExpressions: Seq[NamedExpression], aggregateExpressions: Seq[AggregateExpression], resultExpressions: Seq[NamedExpression], child: SparkPlan): Seq[SparkPlan] = { val useHash = HashAggregateExec.supportsAggregate( aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)) if (!child.isInstanceOf[OapAggregationFileScanExec] || !useHash) { // Child can not leverage oap optimization reading. Nil } else { // 1. Create an Aggregate Operator for partial aggregations. val groupingAttributes = groupingExpressions.map(_.toAttribute) val partialAggregateExpressions = aggregateExpressions.map(_.copy(mode = Partial)) val partialAggregateAttributes = partialAggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) val partialResultExpressions = groupingAttributes ++ partialAggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes) val partialAggregate = createAggregate( requiredChildDistributionExpressions = None, groupingExpressions = groupingExpressions, aggregateExpressions = partialAggregateExpressions, aggregateAttributes = partialAggregateAttributes, initialInputBufferOffset = 0, resultExpressions = partialResultExpressions, child = child) // 2. Create an Aggregate Operator for final aggregations. val finalAggregateExpressions = aggregateExpressions.map(_.copy(mode = Final)) // The attributes of the final aggregation buffer, which is presented as input to the result // projection: val finalAggregateAttributes = finalAggregateExpressions.map(_.resultAttribute) val finalAggregate = createAggregate( requiredChildDistributionExpressions = Some(groupingAttributes), groupingExpressions = groupingAttributes, aggregateExpressions = finalAggregateExpressions, aggregateAttributes = finalAggregateAttributes, initialInputBufferOffset = groupingExpressions.length, resultExpressions = resultExpressions, child = partialAggregate) finalAggregate :: Nil } } }
Example 74
Source File: joinTypes.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.Attribute object JoinType { def apply(typ: String): JoinType = typ.toLowerCase.replace("_", "") match { case "inner" => Inner case "outer" | "full" | "fullouter" => FullOuter case "leftouter" | "left" => LeftOuter case "rightouter" | "right" => RightOuter case "leftsemi" => LeftSemi case "leftanti" => LeftAnti case "cross" => Cross case _ => val supported = Seq( "inner", "outer", "full", "fullouter", "leftouter", "left", "rightouter", "right", "leftsemi", "leftanti", "cross") throw new IllegalArgumentException(s"Unsupported join type '$typ'. " + "Supported join types include: " + supported.mkString("'", "', '", "'") + ".") } } sealed abstract class JoinType { def sql: String } sealed abstract class InnerLike extends JoinType { def explicitCartesian: Boolean } case object Inner extends InnerLike { override def explicitCartesian: Boolean = false override def sql: String = "INNER" } case object Cross extends InnerLike { override def explicitCartesian: Boolean = true override def sql: String = "CROSS" } case object LeftOuter extends JoinType { override def sql: String = "LEFT OUTER" } case object RightOuter extends JoinType { override def sql: String = "RIGHT OUTER" } case object FullOuter extends JoinType { override def sql: String = "FULL OUTER" } case object LeftSemi extends JoinType { override def sql: String = "LEFT SEMI" } case object LeftAnti extends JoinType { override def sql: String = "LEFT ANTI" } case class ExistenceJoin(exists: Attribute) extends JoinType { override def sql: String = { // This join type is only used in the end of optimizer and physical plans, we will not // generate SQL for this join type throw new UnsupportedOperationException } } case class NaturalJoin(tpe: JoinType) extends JoinType { require(Seq(Inner, LeftOuter, RightOuter, FullOuter).contains(tpe), "Unsupported natural join type " + tpe) override def sql: String = "NATURAL " + tpe.sql } case class UsingJoin(tpe: JoinType, usingColumns: Seq[String]) extends JoinType { require(Seq(Inner, LeftOuter, LeftSemi, RightOuter, FullOuter, LeftAnti).contains(tpe), "Unsupported using join type " + tpe) override def sql: String = "USING " + tpe.sql } object LeftExistence { def unapply(joinType: JoinType): Option[JoinType] = joinType match { case LeftSemi | LeftAnti => Some(joinType) case j: ExistenceJoin => Some(joinType) case _ => None } }
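A small sketch exercising JoinType.apply above: the parser lower-cases the name and strips underscores, so the usual SQL spellings all resolve, while unknown names raise an IllegalArgumentException listing the supported types.

import org.apache.spark.sql.catalyst.plans._

object JoinTypeSketch {
  def main(args: Array[String]): Unit = {
    assert(JoinType("LEFT_OUTER") == LeftOuter) // case and underscores are ignored
    assert(JoinType("full") == FullOuter)
    assert(JoinType("leftsemi") == LeftSemi)
    try JoinType("sideways") catch {
      case e: IllegalArgumentException => println(e.getMessage)
    }
  }
}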
Example 75
Source File: ScriptTransformation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 76
Source File: EventTimeWatermark.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval object EventTimeWatermark { val delayKey = "spark.watermarkDelayMs" } case class EventTimeWatermark( eventTime: Attribute, delay: CalendarInterval, child: LogicalPlan) extends LogicalPlan { // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delay.milliseconds) .build() a.withMetadata(updatedMetadata) } else { a } } override val children: Seq[LogicalPlan] = child :: Nil }
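Dataset.withWatermark is the user-facing API that introduces the EventTimeWatermark node above into a streaming plan. A minimal usage sketch, assuming Spark's built-in rate test source is available (Spark 2.2+): the timestamp column is tagged with a 10-minute delay before a windowed aggregation.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object WatermarkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("watermark-sketch").master("local[*]").getOrCreate()

    val events = spark.readStream
      .format("rate")                 // test source with 'timestamp' and 'value' columns
      .option("rowsPerSecond", "10")
      .load()

    val counts = events
      .withWatermark("timestamp", "10 minutes") // adds the EventTimeWatermark node
      .groupBy(window(col("timestamp"), "5 minutes"))
      .count()

    counts.writeStream
      .format("console")
      .outputMode("update")
      .start()
      .awaitTermination()
  }
}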
Example 77
Source File: LocalRelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} import org.apache.spark.sql.types.{StructField, StructType} object LocalRelation { def apply(output: Attribute*): LocalRelation = new LocalRelation(output) def apply(output1: StructField, output: StructField*): LocalRelation = { new LocalRelation(StructType(output1 +: output).toAttributes) } def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } } case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil) extends LeafNode with analysis.MultiInstanceRelation { // A local relation must have resolved output. require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.") override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type] } override protected def stringArgs: Iterator[Any] = { if (data.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def sameResult(plan: LogicalPlan): Boolean = { plan.canonicalized match { case LocalRelation(otherOutput, otherData) => otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data case _ => false } } override lazy val statistics = Statistics(sizeInBytes = (output.map(n => BigInt(n.dataType.defaultSize))).sum * data.length) def toSQL(inlineTableName: String): String = { require(data.nonEmpty) val types = output.map(_.dataType) val rows = data.map { row => val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql } cells.mkString("(", ", ", ")") } "VALUES " + rows.mkString(", ") + " AS " + inlineTableName + output.map(_.name).mkString("(", ", ", ")") } }
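A minimal sketch of building a LocalRelation from external Rows with the helper shown above; fromExternalRows runs the Catalyst converters, so the stored data ends up as InternalRows.

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._

object LocalRelationSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("id", IntegerType, nullable = false),
      StructField("name", StringType)))

    val relation = LocalRelation.fromExternalRows(
      schema.toAttributes,
      Seq(Row(1, "a"), Row(2, "b")))

    println(relation.output.map(_.name).mkString(", ")) // id, name
    println(relation.data.length)                       // 2
  }
}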
Example 78
Source File: LogicalPlanSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types.IntegerType class LogicalPlanSuite extends SparkFunSuite { private var invocationCount = 0 private val function: PartialFunction[LogicalPlan, LogicalPlan] = { case p: Project => invocationCount += 1 p } private val testRelation = LocalRelation() test("resolveOperator runs on operators") { invocationCount = 0 val plan = Project(Nil, testRelation) plan resolveOperators function assert(invocationCount === 1) } test("resolveOperator runs on operators recursively") { invocationCount = 0 val plan = Project(Nil, Project(Nil, testRelation)) plan resolveOperators function assert(invocationCount === 2) } test("resolveOperator skips all ready resolved plans") { invocationCount = 0 val plan = Project(Nil, Project(Nil, testRelation)) plan.foreach(_.setAnalyzed()) plan resolveOperators function assert(invocationCount === 0) } test("resolveOperator skips partially resolved plans") { invocationCount = 0 val plan1 = Project(Nil, testRelation) val plan2 = Project(Nil, plan1) plan1.foreach(_.setAnalyzed()) plan2 resolveOperators function assert(invocationCount === 1) } test("isStreaming") { val relation = LocalRelation(AttributeReference("a", IntegerType, nullable = true)()) val incrementalRelation = new LocalRelation( Seq(AttributeReference("a", IntegerType, nullable = true)())) { override def isStreaming(): Boolean = true } case class TestBinaryRelation(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output } require(relation.isStreaming === false) require(incrementalRelation.isStreaming === true) assert(TestBinaryRelation(relation, relation).isStreaming === false) assert(TestBinaryRelation(incrementalRelation, relation).isStreaming === true) assert(TestBinaryRelation(relation, incrementalRelation).isStreaming === true) assert(TestBinaryRelation(incrementalRelation, incrementalRelation).isStreaming) } }
Example 79
Source File: DeclarativeAggregateEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.aggregate import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection case class DeclarativeAggregateEvaluator(function: DeclarativeAggregate, input: Seq[Attribute]) { lazy val initializer = GenerateSafeProjection.generate(function.initialValues) lazy val updater = GenerateSafeProjection.generate( function.updateExpressions, function.aggBufferAttributes ++ input) lazy val merger = GenerateSafeProjection.generate( function.mergeExpressions, function.aggBufferAttributes ++ function.inputAggBufferAttributes) lazy val evaluator = GenerateSafeProjection.generate( function.evaluateExpression :: Nil, function.aggBufferAttributes) def initialize(): InternalRow = initializer.apply(InternalRow.empty).copy() def update(values: InternalRow*): InternalRow = { val joiner = new JoinedRow val buffer = values.foldLeft(initialize()) { (buffer, input) => updater(joiner(buffer, input)) } buffer.copy() } def merge(buffers: InternalRow*): InternalRow = { val joiner = new JoinedRow val buffer = buffers.foldLeft(initialize()) { (left, right) => merger(joiner(left, right)) } buffer.copy() } def eval(buffer: InternalRow): InternalRow = evaluator(buffer).copy() }
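A minimal sketch of driving the evaluator above with a built-in declarative aggregate (Sum over a single long column), assuming the evaluator class is on the classpath (in Spark it lives in the catalyst test sources): update with a few input rows, merge two buffers, then evaluate.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.expressions.aggregate.{DeclarativeAggregateEvaluator, Sum}
import org.apache.spark.sql.types.LongType

object AggregateEvaluatorSketch {
  def main(args: Array[String]): Unit = {
    val input = AttributeReference("a", LongType)()
    val evaluator = DeclarativeAggregateEvaluator(Sum(input), Seq(input))

    val buffer1 = evaluator.update(InternalRow(1L), InternalRow(2L))
    val buffer2 = evaluator.update(InternalRow(3L))
    val merged  = evaluator.merge(buffer1, buffer2)

    println(evaluator.eval(merged)) // single-field row holding the sum 6
  }
}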
Example 80
Source File: LocalTableScanExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class LocalTableScanExec( output: Seq[Attribute], rows: Seq[InternalRow]) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) private val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), sqlContext.sparkContext.defaultParallelism) private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.size) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.size) taken } }
Example 81
Source File: ShuffledHashJoinExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
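A minimal sketch of the knobs that influence whether the planner picks this operator, assuming Spark 2.x join selection: sort-merge join must not be preferred, the build side must be too large to broadcast yet small enough for a per-partition hash map, and much smaller than the streamed side. explain() shows which join was actually chosen.

import org.apache.spark.sql.SparkSession

object ShuffledHashJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("shj-sketch")
      .master("local[*]")
      .config("spark.sql.join.preferSortMergeJoin", "false")
      .config("spark.sql.autoBroadcastJoinThreshold", "100") // bytes; the build side exceeds this
      .getOrCreate()

    val big   = spark.range(0, 1000000).toDF("id")
    val small = spark.range(0, 100).toDF("id")

    val joined = big.join(small, "id")
    joined.explain() // with these sizes the planner can choose ShuffledHashJoin
  }
}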
Example 82
Source File: CartesianProductExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { // We will not sort the rows, so prefixComparator and recordComparator are null. val sorter = UnsafeExternalSorter.create( context.taskMemoryManager(), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, context, null, null, 1024, SparkEnv.get.memoryManager.pageSizeBytes, SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD), false) val partition = split.asInstanceOf[CartesianPartition] for (y <- rdd2.iterator(partition.s2, context)) { sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false) } // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow] def createIter(): Iterator[UnsafeRow] = { val iter = sorter.getIterator val unsafeRow = new UnsafeRow(numFieldsOfRight) new Iterator[UnsafeRow] { override def hasNext: Boolean = { iter.hasNext } override def next(): UnsafeRow = { iter.loadNext() unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength) unsafeRow } } } val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, sorter.cleanupResources()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
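A minimal usage sketch: an explicit Dataset.crossJoin (available since Spark 2.1) is the typical way to end up with CartesianProductExec above; a keyless inner join also qualifies once spark.sql.crossJoin.enabled is set to true.

import org.apache.spark.sql.SparkSession

object CrossJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("cross-join-sketch").master("local[*]").getOrCreate()

    val left  = spark.range(3).toDF("a")
    val right = spark.range(2).toDF("b")

    val pairs = left.crossJoin(right)
    pairs.explain() // planned as CartesianProduct or a broadcast nested loop join, depending on sizes
    pairs.show()    // 3 x 2 = 6 rows
  }
}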
Example 83
Source File: LogicalRelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.util.Utils override def newInstance(): this.type = { LogicalRelation( relation, expectedOutputAttributes.map(_.map(_.newInstance())), catalogTable).asInstanceOf[this.type] } override def refresh(): Unit = relation match { case fs: HadoopFsRelation => fs.location.refresh() case _ => // Do nothing. } override def simpleString: String = s"Relation[${Utils.truncatedString(output, ",")}] $relation" }
Example 84
Source File: Exchange.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]() plan.transformUp { case exchange: Exchange => // the exchanges that have same results usually also have same schemas (same column names). val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]()) val samePlan = sameSchema.find { e => exchange.sameResult(e) } if (samePlan.isDefined) { // Keep the output of this exchange, the following plans require that to resolve // attributes. ReusedExchangeExec(exchange.output, samePlan.get) } else { sameSchema += exchange exchange } } } }
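A minimal sketch of a plan where this rule applies: reusing the same aggregated (hence shuffled) DataFrame on both sides of a self-join yields two identical exchanges, and with spark.sql.exchange.reuse enabled (the default) the second one can show up as ReusedExchange in the physical plan.

import org.apache.spark.sql.SparkSession

object ReuseExchangeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("reuse-exchange-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val counts = spark.range(1000)
      .groupBy(($"id" % 10).as("k"))
      .count()

    // Both join inputs share the same shuffled sub-plan, so one exchange can be reused.
    counts.join(counts.withColumnRenamed("count", "count2"), "k").explain()
  }
}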
Example 85
Source File: resources.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import java.io.File import java.net.URI import org.apache.hadoop.fs.Path import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand { override val output: Seq[Attribute] = { AttributeReference("Results", StringType, nullable = false)() :: Nil } override def run(sparkSession: SparkSession): Seq[Row] = { val jarList = sparkSession.sparkContext.listJars() if (jars.nonEmpty) { for { jarName <- jars.map(f => new Path(f).getName) jarPath <- jarList if jarPath.contains(jarName) } yield Row(jarPath) } else { jarList.map(Row(_)) } } }
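A minimal usage sketch of the SQL surface for the command above: ADD JAR registers a jar with the session and LIST JARS returns one row per registered jar. The jar path below is a hypothetical placeholder; point it at a real file.

import org.apache.spark.sql.SparkSession

object ListJarsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("list-jars-sketch").master("local[*]").getOrCreate()

    spark.sql("ADD JAR /tmp/example-udfs.jar") // hypothetical path
    spark.sql("LIST JARS").collect().foreach(row => println(row.getString(0)))
  }
}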
Example 86
Source File: commands.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.debug._ import org.apache.spark.sql.execution.streaming.IncrementalExecution import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types._ case class ExplainCommand( logicalPlan: LogicalPlan, override val output: Seq[Attribute] = Seq(AttributeReference("plan", StringType, nullable = true)()), extended: Boolean = false, codegen: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sparkSession: SparkSession): Seq[Row] = try { val queryExecution = if (logicalPlan.isStreaming) { // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the // output mode does not matter since there is no `Sink`. new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0, 0) } else { sparkSession.sessionState.executePlan(logicalPlan) } val outputString = if (codegen) { codegenString(queryExecution.executedPlan) } else if (extended) { queryExecution.toString } else { queryExecution.simpleString } Seq(Row(outputString)) } catch { case cause: TreeNodeException[_] => ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) } }
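A minimal usage sketch: EXPLAIN (optionally EXTENDED or CODEGEN) in SQL is parsed into the ExplainCommand above and returns the rendered plan text as rows of a single string column; Dataset.explain(true) is the programmatic equivalent.

import org.apache.spark.sql.SparkSession

object ExplainSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("explain-sketch").master("local[*]").getOrCreate()

    spark.sql("EXPLAIN EXTENDED SELECT 1 + 1 AS two")
      .collect()
      .foreach(row => println(row.getString(0)))

    spark.range(10).filter("id > 5").explain(true) // parsed/analyzed/optimized/physical plans
  }
}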
Example 87
Source File: StreamingRelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LeafNode import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.datasources.DataSource object StreamingRelation { def apply(dataSource: DataSource): StreamingRelation = { StreamingRelation( dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes) } } case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode { override def toString: String = sourceName override protected def doExecute(): RDD[InternalRow] = { throw new UnsupportedOperationException("StreamingRelationExec cannot be executed") } } object StreamingExecutionRelation { def apply(source: Source): StreamingExecutionRelation = { StreamingExecutionRelation(source, source.schema.toAttributes) } }
Example 88
Source File: EventTimeWatermarkExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, child: SparkPlan) extends SparkPlan { val eventTimeStats = new EventTimeStatsAccum() sparkContext.register(eventTimeStats) override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => eventTimeStats.add(getEventTime(row).getLong(0) / 1000) row } } } // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delay.milliseconds) .build() a.withMetadata(updatedMetadata) } else { a } } override def children: Seq[SparkPlan] = child :: Nil }
Example 89
Source File: CoGroupedIterator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering class CoGroupedIterator( left: Iterator[(InternalRow, Iterator[InternalRow])], right: Iterator[(InternalRow, Iterator[InternalRow])], groupingSchema: Seq[Attribute]) extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] { private val keyOrdering = GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema) private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _ private var currentRightData: (InternalRow, Iterator[InternalRow]) = _ override def hasNext: Boolean = { if (currentLeftData == null && left.hasNext) { currentLeftData = left.next() } if (currentRightData == null && right.hasNext) { currentRightData = right.next() } currentLeftData != null || currentRightData != null } override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { assert(hasNext) if (currentLeftData.eq(null)) { // left is null, right is not null, consume the right data. rightOnly() } else if (currentRightData.eq(null)) { // left is not null, right is null, consume the left data. leftOnly() } else if (currentLeftData._1 == currentRightData._1) { // left and right have the same grouping key, consume both of them. val result = (currentLeftData._1, currentLeftData._2, currentRightData._2) currentLeftData = null currentRightData = null result } else { val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1) assert(compare != 0) if (compare < 0) { // the grouping key of left is smaller, consume the left data. leftOnly() } else { // the grouping key of right is smaller, consume the right data. rightOnly() } } } private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentLeftData._1, currentLeftData._2, Iterator.empty) currentLeftData = null result } private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentRightData._1, Iterator.empty, currentRightData._2) currentRightData = null result } }
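A minimal sketch of driving CoGroupedIterator by hand, assuming both inputs are already grouped and sorted by the same single int key (which is what the surrounding exec operators guarantee): matching keys are emitted together, unmatched keys come out with an empty side.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.execution.CoGroupedIterator
import org.apache.spark.sql.types.IntegerType

object CoGroupSketch {
  def main(args: Array[String]): Unit = {
    val key = AttributeReference("k", IntegerType)()

    val left = Iterator(
      (InternalRow(1), Iterator(InternalRow(1, 10))),
      (InternalRow(3), Iterator(InternalRow(3, 30))))
    val right = Iterator(
      (InternalRow(1), Iterator(InternalRow(1, 100))),
      (InternalRow(2), Iterator(InternalRow(2, 200))))

    new CoGroupedIterator(left, right, Seq(key)).foreach {
      case (k, l, r) => println(s"key=${k.getInt(0)} left=${l.size} right=${r.size}")
    }
  }
}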
Example 90
Source File: ReferenceSort.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter case class ReferenceSort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) extends UnaryExecNode { override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => val ordering = newOrdering(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) val context = TaskContext.get() context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes) CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) } override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = child.outputPartitioning }
Example 91
Source File: SparkPlannerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.Strategy import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union} import org.apache.spark.sql.test.SharedSQLContext class SparkPlannerSuite extends SharedSQLContext { import testImplicits._ test("Ensure to go down only the first branch, not any other possible branches") { case object NeverPlanned extends LeafNode { override def output: Seq[Attribute] = Nil } var planned = 0 object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case ReturnAnswer(child) => planned += 1 planLater(child) :: planLater(NeverPlanned) :: Nil case Union(children) => planned += 1 UnionExec(children.map(planLater)) :: planLater(NeverPlanned) :: Nil case LocalRelation(output, data) => planned += 1 LocalTableScanExec(output, data) :: planLater(NeverPlanned) :: Nil case NeverPlanned => fail("QueryPlanner should not go down to this branch.") case _ => Nil } } try { spark.experimental.extraStrategies = TestStrategy :: Nil val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS()) assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f")) assert(planned === 4) } finally { spark.experimental.extraStrategies = Nil } } }
Example 92
Source File: basicOperators.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.mv.plans.modular import org.apache.spark.sql.catalyst.expressions.{Attribute, _} import org.apache.spark.sql.catalyst.plans.JoinType import org.apache.carbondata.mv.plans._ import org.apache.carbondata.mv.plans.modular.Flags._ trait Matchable extends ModularPlan { def outputList: Seq[NamedExpression] def predicateList: Seq[Expression] } case class GroupBy( outputList: Seq[NamedExpression], inputList: Seq[Expression], predicateList: Seq[Expression], alias: Option[String], child: ModularPlan, flags: FlagSet, flagSpec: Seq[Seq[Any]], modularPlan: Option[ModularPlan] = None) extends UnaryNode with Matchable { override def output: Seq[Attribute] = outputList.map(_.toAttribute) override def makeCopy(newArgs: Array[AnyRef]): GroupBy = { val groupBy = super.makeCopy(newArgs).asInstanceOf[GroupBy] if (rewritten) groupBy.setRewritten() groupBy } } case class Select( outputList: Seq[NamedExpression], inputList: Seq[Expression], predicateList: Seq[Expression], aliasMap: Map[Int, String], joinEdges: Seq[JoinEdge], children: Seq[ModularPlan], flags: FlagSet, flagSpec: Seq[Seq[Any]], windowSpec: Seq[Seq[Any]], modularPlan: Option[ModularPlan] = None) extends ModularPlan with Matchable { override def output: Seq[Attribute] = outputList.map(_.toAttribute) override def adjacencyList: scala.collection.immutable.Map[Int, Seq[(Int, JoinType)]] = { joinEdges.groupBy { _.left }.map { case (k, v) => (k, v.map(e => (e.right, e.joinType))) } } override def extractJoinConditions( left: ModularPlan, right: ModularPlan): Seq[Expression] = { predicateList.filter(p => p.references.intersect(left.outputSet).nonEmpty && p.references.intersect(right.outputSet).nonEmpty && p.references.subsetOf(left.outputSet ++ right.outputSet)) } override def extractRightEvaluableConditions( left: ModularPlan, right: ModularPlan): Seq[Expression] = { predicateList.filter(p => p.references.subsetOf(left.outputSet ++ right.outputSet) && p.references.intersect(right.outputSet).nonEmpty) } override def extractEvaluableConditions(plan: ModularPlan): Seq[Expression] = { predicateList.filter(p => canEvaluate(p, plan)) } override def makeCopy(newArgs: Array[AnyRef]): Select = { val select = super.makeCopy(newArgs).asInstanceOf[Select] if (rewritten) select.setRewritten() select } } case class Union(children: Seq[ModularPlan], flags: FlagSet, flagSpec: Seq[Seq[Any]]) extends ModularPlan { override def output: Seq[Attribute] = children.head.output } case object OneRowTable extends LeafNode { override def output: Seq[Attribute] = Nil }
Example 93
Source File: CarbonShowStreamsCommand.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.stream import java.util.Date import java.util.concurrent.TimeUnit import org.apache.spark.sql.{CarbonEnv, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.execution.command.MetadataCommand import org.apache.spark.sql.types.StringType import org.apache.carbondata.stream.StreamJobManager case class CarbonShowStreamsCommand( tableOp: Option[TableIdentifier] ) extends MetadataCommand { override def output: Seq[Attribute] = { Seq(AttributeReference("Stream Name", StringType, nullable = false)(), AttributeReference("JobId", StringType, nullable = false)(), AttributeReference("Status", StringType, nullable = false)(), AttributeReference("Source", StringType, nullable = false)(), AttributeReference("Sink", StringType, nullable = false)(), AttributeReference("Start Time", StringType, nullable = false)(), AttributeReference("Time Elapse", StringType, nullable = false)()) } override def processMetadata(sparkSession: SparkSession): Seq[Row] = { val jobs = tableOp match { case None => StreamJobManager.getAllJobs.toSeq case Some(table) => val carbonTable = CarbonEnv.getCarbonTable(table.database, table.table)(sparkSession) setAuditTable(carbonTable) StreamJobManager.getAllJobs.filter { job => job.sinkTable.equalsIgnoreCase(carbonTable.getTableName) && job.sinkDb.equalsIgnoreCase(carbonTable.getDatabaseName) }.toSeq } jobs.map { job => val elapsedTime = System.currentTimeMillis() - job.startTime Row( job.streamName, job.streamingQuery.id.toString, if (job.streamingQuery.isActive) "RUNNING" else "FAILED", s"${ job.sourceDb }.${ job.sourceTable }", s"${ job.sinkDb }.${ job.sinkTable }", new Date(job.startTime).toString, String.format( "%s days, %s hours, %s min, %s sec", TimeUnit.MILLISECONDS.toDays(elapsedTime).toString, TimeUnit.MILLISECONDS.toHours(elapsedTime).toString, TimeUnit.MILLISECONDS.toMinutes(elapsedTime).toString, TimeUnit.MILLISECONDS.toSeconds(elapsedTime).toString) ) } } override protected def opName: String = "SHOW STREAMS" }
Example 94
Source File: CarbonShowMVCommand.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.view import java.util import scala.collection.JavaConverters._ import org.apache.spark.sql.{CarbonEnv, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.execution.command.{Checker, DataCommand} import org.apache.spark.sql.types.{BooleanType, StringType} import org.apache.carbondata.core.view.{MVProperty, MVSchema} import org.apache.carbondata.view.MVManagerInSpark case class CarbonShowMVCommand( databaseNameOption: Option[String], relatedTableIdentifier: Option[TableIdentifier]) extends DataCommand { override def output: Seq[Attribute] = { Seq( AttributeReference("Database", StringType, nullable = false)(), AttributeReference("Name", StringType, nullable = false)(), AttributeReference("Status", StringType, nullable = false)(), AttributeReference("Refresh Mode", StringType, nullable = false)(), AttributeReference("Refresh Trigger Mode", StringType, nullable = false)(), AttributeReference("Properties", StringType, nullable = false)()) } override def processData(session: SparkSession): Seq[Row] = { // Get mv schemas. val schemaList = new util.ArrayList[MVSchema]() val viewManager = MVManagerInSpark.get(session) relatedTableIdentifier match { case Some(table) => val relatedTable = CarbonEnv.getCarbonTable(table)(session) setAuditTable(relatedTable) Checker.validateTableExists(table.database, table.table, session) if (databaseNameOption.isDefined) { schemaList.addAll(viewManager.getSchemasOnTable( databaseNameOption.get, relatedTable)) } else { schemaList.addAll(viewManager.getSchemasOnTable(relatedTable)) } case _ => if (databaseNameOption.isDefined) { schemaList.addAll(viewManager.getSchemas(databaseNameOption.get)) } else { schemaList.addAll(viewManager.getSchemas()) } } // Convert mv schema to row. schemaList.asScala.map { schema => Row( schema.getIdentifier.getDatabaseName, schema.getIdentifier.getTableName, schema.getStatus.name(), schema.getProperties.get(MVProperty.REFRESH_MODE), schema.getProperties.get(MVProperty.REFRESH_TRIGGER_MODE), schema.getPropertiesAsString ) } } override protected def opName: String = "SHOW MATERIALIZED VIEW" }
Example 95
Source File: CarbonCliCommand.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.management import java.util import scala.collection.JavaConverters._ import org.apache.spark.sql.{CarbonEnv, Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.execution.command.{Checker, DataCommand} import org.apache.spark.sql.types.StringType import org.apache.carbondata.tool.CarbonCli case class CarbonCliCommand( databaseNameOp: Option[String], tableName: String, commandOptions: String) extends DataCommand { override def output: Seq[Attribute] = { Seq(AttributeReference("CarbonCli", StringType, nullable = false)()) } override def processData(sparkSession: SparkSession): Seq[Row] = { Checker.validateTableExists(databaseNameOp, tableName, sparkSession) val carbonTable = CarbonEnv.getCarbonTable(databaseNameOp, tableName)(sparkSession) setAuditTable(carbonTable) setAuditInfo(Map("options" -> commandOptions)) val commandArgs: Seq[String] = commandOptions.split("\\s+").map(_.trim) val finalCommands = commandArgs.exists(_.equalsIgnoreCase("-p")) match { case true => commandArgs case false => val needPath = commandArgs.exists { command => command.equalsIgnoreCase("summary") || command.equalsIgnoreCase("benchmark") } needPath match { case true => commandArgs ++ Seq("-p", carbonTable.getTablePath) case false => commandArgs } } val summaryOutput = new util.ArrayList[String]() CarbonCli.run(finalCommands.toArray, summaryOutput, false) summaryOutput.asScala.map(x => Row(x) ) } override protected def opName: String = "CLI" }
Example 96
Source File: CarbonShowTablesCommand.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.table import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.execution.command.MetadataCommand import org.apache.spark.sql.types.{BooleanType, StringType} private[sql] case class CarbonShowTablesCommand ( databaseName: Option[String], tableIdentifierPattern: Option[String]) extends MetadataCommand{ // The result of SHOW TABLES has three columns: database, tableName and isTemporary. override val output: Seq[Attribute] = { AttributeReference("database", StringType, nullable = false)() :: AttributeReference("tableName", StringType, nullable = false)() :: AttributeReference("isTemporary", BooleanType, nullable = false)() :: Nil } override def processMetadata(sparkSession: SparkSession): Seq[Row] = { // Since we need to return a Seq of rows, we will call getTables directly // instead of calling tables in sparkSession. val catalog = sparkSession.sessionState.catalog val db = databaseName.getOrElse(catalog.getCurrentDatabase) val tables = tableIdentifierPattern.map(catalog.listTables(db, _)).getOrElse(catalog.listTables(db)) val externalCatalog = sparkSession.sharedState.externalCatalog // this method checks whether the table is mainTable or MV based on property "isVisible" def isMainTable(tableIdent: TableIdentifier) = { var isMainTable = true try { isMainTable = externalCatalog.getTable(db, tableIdent.table).storage.properties .getOrElse("isVisible", true).toString.toBoolean } catch { case ex: Throwable => // ignore the exception for show tables } isMainTable } // tables will be filtered for all the MVs to show only main tables tables.collect { case tableIdent if isMainTable(tableIdent) => val isTemp = catalog.isTemporaryTable(tableIdent) Row(tableIdent.database.getOrElse("default"), tableIdent.table, isTemp) } } override protected def opName: String = "SHOW TABLES" }
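A minimal usage sketch: the plain SHOW TABLES statement is what gets routed to this command, assuming a CarbonData-enabled session (however your deployment builds one); the statement itself is standard Spark SQL and runs unchanged on a vanilla session as well.

import org.apache.spark.sql.SparkSession

object ShowTablesSketch {
  def main(args: Array[String]): Unit = {
    // In a Carbon-enabled session this goes through CarbonShowTablesCommand and filters out
    // MV tables via the isVisible table property.
    val spark = SparkSession.builder().appName("show-tables-sketch").master("local[*]").getOrCreate()

    spark.sql("SHOW TABLES IN default").show(truncate = false)
  }
}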
Example 97
Source File: CarbonExplainCommand.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.table

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan, Union}
import org.apache.spark.sql.execution.command.{ExplainCommand, MetadataCommand}
import org.apache.spark.sql.types.StringType

import org.apache.carbondata.core.profiler.ExplainCollector

case class CarbonExplainCommand(
    child: LogicalPlan,
    override val output: Seq[Attribute] =
      Seq(AttributeReference("plan", StringType, nullable = true)()))
  extends MetadataCommand {

  override def processMetadata(sparkSession: SparkSession): Seq[Row] = {
    val explainCommand = child.asInstanceOf[ExplainCommand]
    setAuditInfo(Map("query" -> explainCommand.logicalPlan.simpleString))
    val isCommand = explainCommand.logicalPlan match {
      case _: Command => true
      case Union(children) if children.forall(_.isInstanceOf[Command]) => true
      case _ => false
    }
    if (explainCommand.logicalPlan.isStreaming || isCommand) {
      explainCommand.run(sparkSession)
    } else {
      CarbonExplainCommand.collectProfiler(explainCommand, sparkSession) ++
        explainCommand.run(sparkSession)
    }
  }

  override protected def opName: String = "EXPLAIN"
}

case class CarbonInternalExplainCommand(
    explainCommand: ExplainCommand,
    override val output: Seq[Attribute] =
      Seq(AttributeReference("plan", StringType, nullable = true)()))
  extends MetadataCommand {

  override def processMetadata(sparkSession: SparkSession): Seq[Row] = {
    CarbonExplainCommand
      .collectProfiler(explainCommand, sparkSession) ++ explainCommand.run(sparkSession)
  }

  override protected def opName: String = "Carbon EXPLAIN"
}

object CarbonExplainCommand {
  def collectProfiler(
      explain: ExplainCommand,
      sparkSession: SparkSession): Seq[Row] = {
    try {
      ExplainCollector.setup()
      if (ExplainCollector.enabled()) {
        val queryExecution = sparkSession.sessionState.executePlan(explain.logicalPlan)
        queryExecution.toRdd.partitions
        // For count(*) queries the explain collector is disabled, so profiler
        // information is not available in such scenarios.
        if (null == ExplainCollector.getFormatedOutput) {
          Seq.empty
        } else {
          Seq(Row("== CarbonData Profiler ==\n" + ExplainCollector.getFormatedOutput))
        }
      } else {
        Seq.empty
      }
    } finally {
      ExplainCollector.remove()
    }
  }
}
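The command-vs-query branch above hinges on a small pattern match over the logical plan. A standalone sketch of that check, assuming the Spark 2.x single-argument Union shape used in this example (object name is made up):

import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan, Union}

object PlanKindSketch {
  // True for DDL/utility plans, including a Union whose branches are all commands,
  // mirroring the isCommand check in CarbonExplainCommand.
  def isCommandLike(plan: LogicalPlan): Boolean = plan match {
    case _: Command => true
    case Union(children) if children.forall(_.isInstanceOf[Command]) => true
    case _ => false
  }
}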
Example 98
Source File: MergeProjection.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.mutation.merge import java.sql.{Date, Timestamp} import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, GenericRowWithSchema, InterpretedMutableProjection, Projection} import org.apache.spark.sql.catalyst.util.DateTimeUtils case class MergeProjection( @transient tableCols: Seq[String], @transient statusCol : String, @transient ds: Dataset[Row], @transient rltn: CarbonDatasourceHadoopRelation, @transient sparkSession: SparkSession, @transient mergeAction: MergeAction) { private val cutOffDate = Integer.MAX_VALUE >> 1 val isUpdate = mergeAction.isInstanceOf[UpdateAction] val isDelete = mergeAction.isInstanceOf[DeleteAction] def apply(row: GenericRowWithSchema): InternalRow = { // TODO we can avoid these multiple conversions if this is added as a SparkPlan node. val values = row.values.map { case s: String => org.apache.spark.unsafe.types.UTF8String.fromString(s) case d: java.math.BigDecimal => org.apache.spark.sql.types.Decimal.apply(d) case b: Array[Byte] => org.apache.spark.unsafe.types.UTF8String.fromBytes(b) case d: Date => DateTimeUtils.fromJavaDate(d) case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t) case value => value } projection(new GenericInternalRow(values)).asInstanceOf[GenericInternalRow] } val (projection, output) = generateProjection private def generateProjection: (Projection, Array[Expression]) = { val existingDsOutput = rltn.carbonRelation.schema.toAttributes val colsMap = mergeAction match { case UpdateAction(updateMap) => updateMap case InsertAction(insertMap) => insertMap case _ => null } if (colsMap != null) { val output = new Array[Expression](tableCols.length) val expecOutput = new Array[Expression](tableCols.length) colsMap.foreach { case (k, v) => val tableIndex = tableCols.indexOf(k.toString().toLowerCase) if (tableIndex < 0) { throw new CarbonMergeDataSetException(s"Mapping is wrong $colsMap") } output(tableIndex) = v.expr.transform { case a: Attribute if !a.resolved => ds.queryExecution.analyzed.resolveQuoted(a.name, sparkSession.sessionState.analyzer.resolver).get } expecOutput(tableIndex) = existingDsOutput.find(_.name.equalsIgnoreCase(tableCols(tableIndex))).get } if (output.contains(null)) { throw new CarbonMergeDataSetException(s"Not all columns are mapped") } (new InterpretedMutableProjection(output++Seq( ds.queryExecution.analyzed.resolveQuoted(statusCol, sparkSession.sessionState.analyzer.resolver).get), ds.queryExecution.analyzed.output), expecOutput) } else { (null, null) } } }
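MergeProjection leans on the InterpretedMutableProjection(expressions, inputSchema) constructor to rearrange internal rows without codegen. A self-contained sketch of that building block (attribute names are arbitrary):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, InterpretedMutableProjection}
import org.apache.spark.sql.types.IntegerType

object InterpretedProjectionSketch {
  def main(args: Array[String]): Unit = {
    val a = AttributeReference("a", IntegerType, nullable = false)()
    val b = AttributeReference("b", IntegerType, nullable = false)()
    val input: Seq[Attribute] = Seq(a, b)
    // Project (b, a) out of an input row shaped as (a, b); the auxiliary constructor
    // binds the attributes to ordinals against the given input schema.
    val proj = new InterpretedMutableProjection(Seq(b, a), input)
    println(proj(InternalRow(1, 2))) // prints something like [2,1]
  }
}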
Example 99
Source File: CarbonExpressions.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Expression, ScalaUDF}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.execution.command.DescribeTableCommand
import org.apache.spark.sql.types.DataType

object CarbonScalaUDF {
  def unapply(expression: Expression): Option[(ScalaUDF)] = {
    expression match {
      case a: ScalaUDF => Some(a)
      case _ => None
    }
  }
}
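The CarbonScalaUDF extractor is just a typed unapply over Expression. The same pattern works for any expression subtype; here is a small illustrative extractor (names are made up) together with a usage:

import org.apache.spark.sql.catalyst.expressions.{Expression, Literal}
import org.apache.spark.sql.types.IntegerType

// Hypothetical extractor in the same style: match integer literals and hand them back typed.
object IntLiteral {
  def unapply(expression: Expression): Option[Literal] = expression match {
    case l: Literal if l.dataType == IntegerType => Some(l)
    case _ => None
  }
}

object ExtractorSketch {
  def main(args: Array[String]): Unit = {
    Literal(42) match {
      case IntLiteral(l) => println(s"matched literal ${l.value}") // matched literal 42
      case _ => println("no match")
    }
  }
}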
Example 100
Source File: CarbonDataSourceScan.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.strategy import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} class CarbonDataSourceScan( override val output: Seq[Attribute], val rdd: RDD[InternalRow], @transient override val relation: HadoopFsRelation, val partitioning: Partitioning, val md: Map[String, String], identifier: Option[TableIdentifier], @transient private val logicalRelation: LogicalRelation) extends FileSourceScanExec( relation, output, relation.dataSchema, Seq.empty, Seq.empty, identifier) { // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val supportsBatch: Boolean = true // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) = (partitioning, Nil) // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val metadata: Map[String, String] = md override def inputRDDs(): Seq[RDD[InternalRow]] = rdd :: Nil }
Example 101
Source File: SparkSqlAdapter.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.HadoopFsRelation import org.apache.spark.sql.types.StructType import org.apache.carbondata.core.util.ThreadLocalSessionInfo object SparkSqlAdapter { def initSparkSQL(): Unit = { } def getScanForSegments( @transient relation: HadoopFsRelation, output: Seq[Attribute], outputSchema: StructType, partitionFilters: Seq[Expression], dataFilters: Seq[Expression], tableIdentifier: Option[TableIdentifier] ): FileSourceScanExec = { FileSourceScanExec( relation, output, outputSchema, partitionFilters, dataFilters, tableIdentifier) } def addSparkSessionListener(sparkSession: SparkSession): Unit = { sparkSession.sparkContext.addSparkListener(new SparkListener { override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { CarbonEnv.carbonEnvMap.remove(sparkSession) ThreadLocalSessionInfo.unsetAll() } }) } }
Example 102
Source File: CarbonDataSourceScan.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.strategy import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} class CarbonDataSourceScan( override val output: Seq[Attribute], val rdd: RDD[InternalRow], @transient override val relation: HadoopFsRelation, val partitioning: Partitioning, val md: Map[String, String], identifier: Option[TableIdentifier], @transient private val logicalRelation: LogicalRelation) extends FileSourceScanExec( relation, output, relation.dataSchema, Seq.empty, None, Seq.empty, identifier) { // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val supportsBatch: Boolean = true // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) = (partitioning, Nil) // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val metadata: Map[String, String] = md override def inputRDDs(): Seq[RDD[InternalRow]] = rdd :: Nil }
Example 103
Source File: SparkSqlAdapter.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.HadoopFsRelation import org.apache.spark.sql.types.StructType object SparkSqlAdapter { def initSparkSQL(): Unit = { } def getScanForSegments( @transient relation: HadoopFsRelation, output: Seq[Attribute], outputSchema: StructType, partitionFilters: Seq[Expression], dataFilters: Seq[Expression], tableIdentifier: Option[TableIdentifier] ): FileSourceScanExec = { FileSourceScanExec( relation, output, outputSchema, partitionFilters, None, dataFilters, tableIdentifier) } }
Example 104
Source File: MemsqlRDD.scala From memsql-spark-connector with Apache License 2.0 | 5 votes |
package com.memsql.spark import java.sql.{Connection, PreparedStatement, ResultSet} import com.memsql.spark.SQLGen.VariableList import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils} import org.apache.spark.sql.types._ import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} case class MemsqlRDD(query: String, variables: VariableList, options: MemsqlOptions, schema: StructType, expectedOutput: Seq[Attribute], @transient val sc: SparkContext) extends RDD[Row](sc, Nil) { override protected def getPartitions: Array[Partition] = MemsqlQueryHelpers.GetPartitions(options, query, variables) override def compute(rawPartition: Partition, context: TaskContext): Iterator[Row] = { var closed = false var rs: ResultSet = null var stmt: PreparedStatement = null var conn: Connection = null var partition: MemsqlPartition = rawPartition.asInstanceOf[MemsqlPartition] def tryClose(name: String, what: AutoCloseable): Unit = { try { if (what != null) { what.close() } } catch { case e: Exception => logWarning(s"Exception closing $name", e) } } def close(): Unit = { if (closed) { return } tryClose("resultset", rs) tryClose("statement", stmt) tryClose("connection", conn) closed = true } context.addTaskCompletionListener { context => close() } conn = JdbcUtils.createConnectionFactory(partition.connectionInfo)() stmt = conn.prepareStatement(partition.query) JdbcHelpers.fillStatement(stmt, partition.variables) rs = stmt.executeQuery() var rowsIter = JdbcUtils.resultSetToRows(rs, schema) if (expectedOutput.nonEmpty) { val schemaDatatypes = schema.map(_.dataType) val expectedDatatypes = expectedOutput.map(_.dataType) if (schemaDatatypes != expectedDatatypes) { val columnEncoders = schemaDatatypes.zip(expectedDatatypes).zipWithIndex.map { case ((_: StringType, _: NullType), _) => ((_: Row) => null) case ((_: ShortType, _: BooleanType), i) => ((r: Row) => r.getShort(i) != 0) case ((_: IntegerType, _: BooleanType), i) => ((r: Row) => r.getInt(i) != 0) case ((_: LongType, _: BooleanType), i) => ((r: Row) => r.getLong(i) != 0) case ((l, r), i) => { options.assert(l == r, s"MemsqlRDD: unable to encode ${l} into ${r}") ((r: Row) => r.get(i)) } } rowsIter = rowsIter .map(row => Row.fromSeq(columnEncoders.map(_(row)))) } } CompletionIterator[Row, Iterator[Row]](new InterruptibleIterator[Row](context, rowsIter), close) } }
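The re-encoding branch above only runs when the JDBC result schema disagrees with the Catalyst attributes Spark expects. A minimal sketch of that mismatch check, using a boolean column that the database surfaces as a SHORT (object name is illustrative):

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{BooleanType, ShortType, StructField, StructType}

object SchemaMismatchSketch {
  def main(args: Array[String]): Unit = {
    val jdbcSchema = StructType(Seq(StructField("flag", ShortType)))
    val expectedOutput: Seq[Attribute] =
      Seq(AttributeReference("flag", BooleanType, nullable = true)())
    // Same comparison MemsqlRDD performs before installing its column encoders.
    val needsEncoding = jdbcSchema.map(_.dataType) != expectedOutput.map(_.dataType)
    println(needsEncoding) // true: the SHORT must be re-encoded as a Boolean
  }
}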
Example 105
Source File: ExistingDStream.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming import org.apache.spark.rdd.{EmptyRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.streaming.Time import org.apache.spark.streaming.dstream.DStream private[streaming] case class PhysicalDStream(output: Seq[Attribute], @transient stream: DStream[InternalRow]) extends SparkPlan with StreamPlan { def children = Nil override def doExecute() = { assert(validTime != null) Utils.invoke(classOf[DStream[InternalRow]], stream, "getOrCompute", (classOf[Time], validTime)) .asInstanceOf[Option[RDD[InternalRow]]] .getOrElse(new EmptyRDD[InternalRow](sparkContext)) } }
Example 106
Source File: RangerShowTablesCommand.scala From spark-ranger with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.ranger.authorization.spark.authorizer.{RangerSparkAuthorizer, SparkPrivilegeObject, SparkPrivilegeObjectType} import org.apache.spark.sql.execution.command.{RunnableCommand, ShowTablesCommand} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.Attribute case class RangerShowTablesCommand(child: ShowTablesCommand) extends RunnableCommand { override val output: Seq[Attribute] = child.output override def run(sparkSession: SparkSession): Seq[Row] = { val rows = child.run(sparkSession) rows.filter(r => RangerSparkAuthorizer.isAllowed(toSparkPrivilegeObject(r))) } private def toSparkPrivilegeObject(row: Row): SparkPrivilegeObject = { val database = row.getString(0) val table = row.getString(1) new SparkPrivilegeObject(SparkPrivilegeObjectType.TABLE_OR_VIEW, database, table) } }
Example 107
Source File: DescribeDeltaHistoryCommand.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.commands // scalastyle:off import.ordering.noEmptyLine import org.apache.spark.sql.delta.{DeltaErrors, DeltaLog, DeltaTableIdentifier} import org.apache.spark.sql.delta.actions.CommitInfo import org.apache.spark.sql.delta.metering.DeltaLogging import org.apache.hadoop.fs.Path import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.catalog.CatalogTableType import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.execution.command.RunnableCommand case class DescribeDeltaHistoryCommand( path: Option[String], tableIdentifier: Option[TableIdentifier], limit: Option[Int], override val output: Seq[Attribute] = ExpressionEncoder[CommitInfo]().schema.toAttributes) extends RunnableCommand with DeltaLogging { override def run(sparkSession: SparkSession): Seq[Row] = { val basePath = if (path.nonEmpty) { new Path(path.get) } else if (tableIdentifier.nonEmpty) { val sessionCatalog = sparkSession.sessionState.catalog lazy val metadata = sessionCatalog.getTableMetadata(tableIdentifier.get) DeltaTableIdentifier(sparkSession, tableIdentifier.get) match { case Some(id) if id.path.nonEmpty => new Path(id.path.get) case Some(id) if id.table.nonEmpty => new Path(metadata.location) case _ => if (metadata.tableType == CatalogTableType.VIEW) { throw DeltaErrors.describeViewHistory } throw DeltaErrors.notADeltaTableException("DESCRIBE HISTORY") } } else { throw DeltaErrors.missingTableIdentifierException("DESCRIBE HISTORY") } // Max array size if (limit.exists(_ > Int.MaxValue - 8)) { throw new IllegalArgumentException("Please use a limit less than Int.MaxValue - 8.") } val deltaLog = DeltaLog.forTable(sparkSession, basePath) recordDeltaOperation(deltaLog, "delta.ddl.describeHistory") { if (deltaLog.snapshot.version == -1) { throw DeltaErrors.notADeltaTableException("DESCRIBE HISTORY") } import sparkSession.implicits._ deltaLog.history.getHistory(limit).toDF().collect().toSeq } } }
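Instead of hand-writing AttributeReferences, DescribeDeltaHistoryCommand derives its output from a case class via an encoder. A hedged sketch of the same trick with a made-up HistoryEntry class, assuming a Spark version where StructType.toAttributes is available, as in the example above:

import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.Attribute

case class HistoryEntry(version: Long, operation: String)

object EncoderOutputSketch {
  def main(args: Array[String]): Unit = {
    // The encoder's schema carries one field per case-class member.
    val output: Seq[Attribute] = ExpressionEncoder[HistoryEntry]().schema.toAttributes
    output.foreach(a => println(s"${a.name}: ${a.dataType.simpleString}"))
    // version: bigint
    // operation: string
  }
}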
Example 108
Source File: AnalysisHelper.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.util import org.apache.spark.sql.delta.DeltaErrors import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan trait AnalysisHelper { import AnalysisHelper._ protected def tryResolveReferences( sparkSession: SparkSession)( expr: Expression, planContainingExpr: LogicalPlan): Expression = { val newPlan = FakeLogicalPlan(expr, planContainingExpr.children) sparkSession.sessionState.analyzer.execute(newPlan) match { case FakeLogicalPlan(resolvedExpr, _) => // Return even if it did not successfully resolve return resolvedExpr case _ => // This is unexpected throw DeltaErrors.analysisException( s"Could not resolve expression $expr", plan = Option(planContainingExpr)) } } protected def toDataset(sparkSession: SparkSession, logicalPlan: LogicalPlan): Dataset[Row] = { Dataset.ofRows(sparkSession, logicalPlan) } protected def improveUnsupportedOpError(f: => Unit): Unit = { val possibleErrorMsgs = Seq( "is only supported with v2 table", // full error: DELETE is only supported with v2 tables "is not supported temporarily", // full error: UPDATE TABLE is not supported temporarily "Table does not support read", "Table implementation does not support writes" ).map(_.toLowerCase()) def isExtensionOrCatalogError(error: Exception): Boolean = { possibleErrorMsgs.exists(m => error.getMessage().toLowerCase().contains(m)) } try { f } catch { case e: Exception if isExtensionOrCatalogError(e) => throw DeltaErrors.configureSparkSessionWithExtensionAndCatalog(e) } } } object AnalysisHelper { case class FakeLogicalPlan(expr: Expression, children: Seq[LogicalPlan]) extends LogicalPlan { override def output: Seq[Attribute] = Nil } }
Example 109
Source File: DeltaInvariantCheckerExec.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.schema import org.apache.spark.sql.delta.DeltaErrors import org.apache.spark.sql.delta.schema.Invariants.NotNull import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BindReferences, Expression, GetStructField, Literal, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.{NullType, StructType} private def buildExtractors(invariant: Invariant): Option[Expression] = { assert(invariant.column.nonEmpty) val topLevelColumn = invariant.column.head val topLevelRefOpt = output.collectFirst { case a: AttributeReference if SchemaUtils.DELTA_COL_RESOLVER(a.name, topLevelColumn) => a } val rejectColumnNotFound = isNullNotOkay(invariant) if (topLevelRefOpt.isEmpty) { if (rejectColumnNotFound) { throw DeltaErrors.notNullInvariantException(invariant) } } if (invariant.column.length == 1) { topLevelRefOpt.map(BindReferences.bindReference[Expression](_, output)) } else { topLevelRefOpt.flatMap { topLevelRef => val boundTopLevel = BindReferences.bindReference[Expression](topLevelRef, output) try { val nested = invariant.column.tail.foldLeft(boundTopLevel) { case (e, fieldName) => e.dataType match { case StructType(fields) => val ordinal = fields.indexWhere(f => SchemaUtils.DELTA_COL_RESOLVER(f.name, fieldName)) if (ordinal == -1) { throw new IndexOutOfBoundsException(s"Not nullable column not found in struct: " + s"${fields.map(_.name).mkString("[", ",", "]")}") } GetStructField(e, ordinal, Some(fieldName)) case _ => throw new UnsupportedOperationException( "Invariants on nested fields other than StructTypes are not supported.") } } Some(nested) } catch { case i: IndexOutOfBoundsException if rejectColumnNotFound => throw InvariantViolationException(invariant, i.getMessage) case _: IndexOutOfBoundsException if !rejectColumnNotFound => None } } } } override protected def doExecute(): RDD[InternalRow] = { if (invariants.isEmpty) return child.execute() val boundRefs = invariants.map { invariant => CheckDeltaInvariant(buildExtractors(invariant).getOrElse(Literal(null, NullType)), invariant) } child.execute().mapPartitionsInternal { rows => val assertions = GenerateUnsafeProjection.generate(boundRefs) rows.map { row => assertions(row) row } } } override def outputOrdering: Seq[SortOrder] = child.outputOrdering override def outputPartitioning: Partitioning = child.outputPartitioning }
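buildExtractors resolves an Attribute to a position in the operator's output with BindReferences.bindReference before any evaluation happens. A standalone sketch of that step (names are arbitrary):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BindReferences, Expression}
import org.apache.spark.sql.types.IntegerType

object BindReferenceSketch {
  def main(args: Array[String]): Unit = {
    val a = AttributeReference("a", IntegerType, nullable = false)()
    val b = AttributeReference("b", IntegerType, nullable = false)()
    val output: Seq[Attribute] = Seq(a, b)
    // Turn the attribute into an ordinal-based BoundReference against the known output.
    val bound: Expression = BindReferences.bindReference[Expression](b, output)
    println(bound)                         // prints something like input[1, int, false]
    println(bound.eval(InternalRow(7, 9))) // 9
  }
}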
Example 110
Source File: VacuumTableCommand.scala From delta with Apache License 2.0 | 5 votes |
package io.delta.tables.execution import org.apache.hadoop.fs.Path import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.delta.{DeltaErrors, DeltaLog, DeltaTableIdentifier, DeltaTableUtils} import org.apache.spark.sql.delta.commands.VacuumCommand import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.types.StringType case class VacuumTableCommand( path: Option[String], table: Option[TableIdentifier], horizonHours: Option[Double], dryRun: Boolean) extends RunnableCommand { override val output: Seq[Attribute] = Seq(AttributeReference("path", StringType, nullable = true)()) override def run(sparkSession: SparkSession): Seq[Row] = { val pathToVacuum = if (path.nonEmpty) { new Path(path.get) } else if (table.nonEmpty) { DeltaTableIdentifier(sparkSession, table.get) match { case Some(id) if id.path.nonEmpty => new Path(id.path.get) case _ => new Path(sparkSession.sessionState.catalog.getTableMetadata(table.get).location) } } else { throw DeltaErrors.missingTableIdentifierException("VACUUM") } val baseDeltaPath = DeltaTableUtils.findDeltaTableRoot(sparkSession, pathToVacuum) if (baseDeltaPath.isDefined) { if (baseDeltaPath.get != pathToVacuum) { throw DeltaErrors.vacuumBasePathMissingException(baseDeltaPath.get) } } val deltaLog = DeltaLog.forTable(sparkSession, pathToVacuum) if (deltaLog.snapshot.version == -1) { throw DeltaErrors.notADeltaTableException( "VACUUM", DeltaTableIdentifier(path = Some(pathToVacuum.toString))) } VacuumCommand.gc(sparkSession, deltaLog, dryRun, horizonHours).collect() } }
Example 111
Source File: DruidOperatorSchema.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.druid import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, NamedExpression} import org.apache.spark.sql.types.DataType import org.sparklinedata.druid.{DruidOperatorAttribute, DruidQueryBuilder} lazy val pushedDownExprToDruidAttr : Map[Expression, DruidOperatorAttribute] = buildPushDownDruidAttrsMap private def pushDownExpressionMap : Map[String, (Expression, DataType, DataType, String)] = dqb.outputAttributeMap.filter(t => t._2._1 != null) private def buildPushDownDruidAttrsMap : Map[Expression, DruidOperatorAttribute] = (pushDownExpressionMap map { case (nm, (e, oDT, dDT, tf)) => { (e -> druidAttrMap(nm)) } }) private def buildDruidOpAttr : Map[String, DruidOperatorAttribute] = (dqb.outputAttributeMap map { case (nm, (e, oDT, dDT, tf)) => { val druidEid = e match { case null => NamedExpression.newExprId case n: NamedExpression => n.exprId case _ => NamedExpression.newExprId } (nm -> DruidOperatorAttribute(druidEid, nm, dDT, tf)) } } ) }
Example 112
Source File: DruidMetadataCommands.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sparklinedata.commands import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.hive.sparklinedata.SPLSessionState import org.apache.spark.sql.sources.druid.{DruidPlanner, DruidQueryCostModel} import org.apache.spark.sql.types._ import org.apache.spark.sql.util.PlanUtil import org.apache.spark.sql.{Row, SQLContext, SparkSession} import org.joda.time.Interval import org.sparklinedata.druid.metadata.{DruidMetadataCache, DruidRelationName, DruidRelationOptions} case class ClearMetadata(druidHost: Option[String]) extends RunnableCommand { override val output: Seq[Attribute] = { val schema = StructType( StructField("", StringType, nullable = true) :: Nil) schema.toAttributes } override def run(sparkSession: SparkSession): Seq[Row] = { if (druidHost.isDefined) { DruidMetadataCache.clearCache(druidHost.get) } else { DruidMetadataCache.clearCache } Seq(Row("")) } } case class ExplainDruidRewrite(sql: String) extends RunnableCommand { override val output: Seq[Attribute] = { val schema = StructType( StructField("", StringType, nullable = true) :: Nil) schema.toAttributes } override def run(sparkSession: SparkSession): Seq[Row] = { val qe = sparkSession.sessionState.executeSql(sql) qe.sparkPlan.toString().split("\n").map(Row(_)).toSeq ++ Seq(Row("")) ++ DruidPlanner.getDruidRDDs(qe.sparkPlan).flatMap { dR => val druidDSIntervals = dR.drDSIntervals val druidDSFullName= dR.drFullName val druidDSOptions = dR.drOptions val inputEstimate = dR.inputEstimate val outputEstimate = dR.outputEstimate s"""DruidQuery(${System.identityHashCode(dR.dQuery)}) details :: |${DruidQueryCostModel.computeMethod( sparkSession.sqlContext, druidDSIntervals, druidDSFullName, druidDSOptions, inputEstimate, outputEstimate, dR.dQuery.q) } """.stripMargin.split("\n").map(Row(_)) } } }
Example 113
Source File: DruidRelation.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.sparklinedata.druid import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, ExprId} import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types.{DataType, StructField, StructType} import org.apache.spark.sql.{Row, SQLContext} import org.joda.time.Interval import org.sparklinedata.druid.metadata.DruidRelationInfo case class DruidOperatorAttribute(exprId : ExprId, name : String, dataType : DataType, tf: String = null) override val needConversion: Boolean = false override def schema: StructType = dQuery.map(_.schema(info)).getOrElse(info.sourceDF(sqlContext).schema) def buildInternalScan : RDD[InternalRow] = dQuery.map(new DruidRDD(sqlContext, info, _)).getOrElse( info.sourceDF(sqlContext).queryExecution.toRdd ) override def buildScan(): RDD[Row] = buildInternalScan.asInstanceOf[RDD[Row]] override def toString : String = { if (dQuery.isDefined) { s"DruidQuery(${System.identityHashCode(dQuery)}): ${Utils.queryToString(dQuery.get)}" } else { info.toString } } }
Example 114
Source File: joinTypes.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.Attribute object JoinType { def apply(typ: String): JoinType = typ.toLowerCase.replace("_", "") match { case "inner" => Inner case "outer" | "full" | "fullouter" => FullOuter case "leftouter" | "left" => LeftOuter case "rightouter" | "right" => RightOuter case "leftsemi" => LeftSemi case "leftanti" => LeftAnti case "cross" => Cross case _ => val supported = Seq( "inner", "outer", "full", "fullouter", "leftouter", "left", "rightouter", "right", "leftsemi", "leftanti", "cross") throw new IllegalArgumentException(s"Unsupported join type '$typ'. " + "Supported join types include: " + supported.mkString("'", "', '", "'") + ".") } } sealed abstract class JoinType { def sql: String } sealed abstract class InnerLike extends JoinType { def explicitCartesian: Boolean } case object Inner extends InnerLike { override def explicitCartesian: Boolean = false override def sql: String = "INNER" } case object Cross extends InnerLike { override def explicitCartesian: Boolean = true override def sql: String = "CROSS" } case object LeftOuter extends JoinType { override def sql: String = "LEFT OUTER" } case object RightOuter extends JoinType { override def sql: String = "RIGHT OUTER" } case object FullOuter extends JoinType { override def sql: String = "FULL OUTER" } case object LeftSemi extends JoinType { override def sql: String = "LEFT SEMI" } case object LeftAnti extends JoinType { override def sql: String = "LEFT ANTI" } case class ExistenceJoin(exists: Attribute) extends JoinType { override def sql: String = { // This join type is only used in the end of optimizer and physical plans, we will not // generate SQL for this join type throw new UnsupportedOperationException } } case class NaturalJoin(tpe: JoinType) extends JoinType { require(Seq(Inner, LeftOuter, RightOuter, FullOuter).contains(tpe), "Unsupported natural join type " + tpe) override def sql: String = "NATURAL " + tpe.sql } case class UsingJoin(tpe: JoinType, usingColumns: Seq[String]) extends JoinType { require(Seq(Inner, LeftOuter, LeftSemi, RightOuter, FullOuter, LeftAnti).contains(tpe), "Unsupported using join type " + tpe) override def sql: String = "USING " + tpe.sql } object LeftExistence { def unapply(joinType: JoinType): Option[JoinType] = joinType match { case LeftSemi | LeftAnti => Some(joinType) case j: ExistenceJoin => Some(joinType) case _ => None } }
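A quick usage sketch for the string parsing in JoinType.apply above: aliases are accepted and underscores are stripped before matching (object name is made up):

import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftSemi}

object JoinTypeSketch {
  def main(args: Array[String]): Unit = {
    assert(JoinType("left_semi") == LeftSemi)
    assert(JoinType("FULL_OUTER") == FullOuter) // lower-cased, "_" removed
    println(JoinType("outer").sql)              // FULL OUTER
  }
}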
Example 115
Source File: ScriptTransformation.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 116
Source File: EventTimeWatermark.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval object EventTimeWatermark { case class EventTimeWatermark( eventTime: Attribute, delay: CalendarInterval, child: LogicalPlan) extends LogicalPlan { // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delay.milliseconds) .build() a.withMetadata(updatedMetadata) } else { a } } override val children: Seq[LogicalPlan] = child :: Nil }
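The interesting part above is how the watermark delay is folded into the event-time column's metadata. A hedged sketch of that metadata update on its own; the key string here is illustrative, not the constant the excerpt references via EventTimeWatermark.delayKey:

import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.types.{MetadataBuilder, TimestampType}

object WatermarkMetadataSketch {
  def main(args: Array[String]): Unit = {
    val eventTime = AttributeReference("ts", TimestampType, nullable = true)()
    // Copy the existing metadata and add the delay, as the plan's output mapping does.
    val updated = new MetadataBuilder()
      .withMetadata(eventTime.metadata)
      .putLong("delayMs", 10000L) // illustrative key, not Spark's real delayKey
      .build()
    val stamped = eventTime.withMetadata(updated)
    println(stamped.metadata.getLong("delayMs")) // 10000
  }
}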
Example 117
Source File: LocalRelation.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} import org.apache.spark.sql.types.{StructField, StructType} object LocalRelation { def apply(output: Attribute*): LocalRelation = new LocalRelation(output) def apply(output1: StructField, output: StructField*): LocalRelation = { new LocalRelation(StructType(output1 +: output).toAttributes) } def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } } case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil) extends LeafNode with analysis.MultiInstanceRelation { // A local relation must have resolved output. require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.") override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type] } override protected def stringArgs: Iterator[Any] = { if (data.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def sameResult(plan: LogicalPlan): Boolean = { plan.canonicalized match { case LocalRelation(otherOutput, otherData) => otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data case _ => false } } override lazy val statistics = Statistics(sizeInBytes = (output.map(n => BigInt(n.dataType.defaultSize))).sum * data.length) def toSQL(inlineTableName: String): String = { require(data.nonEmpty) val types = output.map(_.dataType) val rows = data.map { row => val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql } cells.mkString("(", ", ", ")") } "VALUES " + rows.mkString(", ") + " AS " + inlineTableName + output.map(_.name).mkString("(", ", ", ")") } }
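A short usage sketch for fromExternalRows and the toSQL helper shown above; note that toSQL is present in this fork's LocalRelation and may not exist in other Spark versions:

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{IntegerType, StringType}

object LocalRelationSketch {
  def main(args: Array[String]): Unit = {
    val output = Seq(
      AttributeReference("id", IntegerType, nullable = false)(),
      AttributeReference("name", StringType, nullable = true)())
    // External Rows are converted to InternalRows by the factory method.
    val relation = LocalRelation.fromExternalRows(output, Seq(Row(1, "a"), Row(2, "b")))
    println(relation.toSQL("t")) // VALUES (1, 'a'), (2, 'b') AS t(id, name)
  }
}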
Example 118
Source File: LogicalPlanSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types.IntegerType class LogicalPlanSuite extends SparkFunSuite { private var invocationCount = 0 private val function: PartialFunction[LogicalPlan, LogicalPlan] = { case p: Project => invocationCount += 1 p } private val testRelation = LocalRelation() test("resolveOperator runs on operators") { invocationCount = 0 val plan = Project(Nil, testRelation) plan resolveOperators function assert(invocationCount === 1) } test("resolveOperator runs on operators recursively") { invocationCount = 0 val plan = Project(Nil, Project(Nil, testRelation)) plan resolveOperators function assert(invocationCount === 2) } test("resolveOperator skips all ready resolved plans") { invocationCount = 0 val plan = Project(Nil, Project(Nil, testRelation)) plan.foreach(_.setAnalyzed()) plan resolveOperators function assert(invocationCount === 0) } test("resolveOperator skips partially resolved plans") { invocationCount = 0 val plan1 = Project(Nil, testRelation) val plan2 = Project(Nil, plan1) plan1.foreach(_.setAnalyzed()) plan2 resolveOperators function assert(invocationCount === 1) } test("isStreaming") { val relation = LocalRelation(AttributeReference("a", IntegerType, nullable = true)()) val incrementalRelation = new LocalRelation( Seq(AttributeReference("a", IntegerType, nullable = true)())) { override def isStreaming(): Boolean = true } case class TestBinaryRelation(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output } require(relation.isStreaming === false) require(incrementalRelation.isStreaming === true) assert(TestBinaryRelation(relation, relation).isStreaming === false) assert(TestBinaryRelation(incrementalRelation, relation).isStreaming === true) assert(TestBinaryRelation(relation, incrementalRelation).isStreaming === true) assert(TestBinaryRelation(incrementalRelation, incrementalRelation).isStreaming) } }
Example 119
Source File: DeclarativeAggregateEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection

case class DeclarativeAggregateEvaluator(function: DeclarativeAggregate, input: Seq[Attribute]) {

  lazy val initializer = GenerateSafeProjection.generate(function.initialValues)

  lazy val updater = GenerateSafeProjection.generate(
    function.updateExpressions,
    function.aggBufferAttributes ++ input)

  lazy val merger = GenerateSafeProjection.generate(
    function.mergeExpressions,
    function.aggBufferAttributes ++ function.inputAggBufferAttributes)

  lazy val evaluator = GenerateSafeProjection.generate(
    function.evaluateExpression :: Nil,
    function.aggBufferAttributes)

  def initialize(): InternalRow = initializer.apply(InternalRow.empty).copy()

  def update(values: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = values.foldLeft(initialize()) { (buffer, input) =>
      updater(joiner(buffer, input))
    }
    buffer.copy()
  }

  def merge(buffers: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = buffers.foldLeft(initialize()) { (left, right) =>
      merger(joiner(left, right))
    }
    buffer.copy()
  }

  def eval(buffer: InternalRow): InternalRow = evaluator(buffer).copy()
}
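Assuming the test helper above is on the classpath, it can drive a built-in DeclarativeAggregate such as Max by hand; a hedged sketch:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.expressions.aggregate.{DeclarativeAggregateEvaluator, Max}
import org.apache.spark.sql.types.IntegerType

object MaxEvaluatorSketch {
  def main(args: Array[String]): Unit = {
    val input = AttributeReference("a", IntegerType, nullable = true)()
    val evaluator = DeclarativeAggregateEvaluator(Max(input), Seq(input))
    // Each update row carries one value of column "a"; eval reads the final buffer.
    val buffer = evaluator.update(InternalRow(3), InternalRow(9), InternalRow(5))
    println(evaluator.eval(buffer)) // prints something like [9]
  }
}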
Example 120
Source File: LocalTableScanExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class LocalTableScanExec( output: Seq[Attribute], rows: Seq[InternalRow], override val user: String) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) private val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), sqlContext.sparkContext.defaultParallelism) private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.size) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.size) taken } }
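The unsafeRows conversion above is an identity UnsafeProjection over the node's own output, copied row by row. In isolation it looks like this (a minimal sketch; the object name is made up):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, UnsafeProjection}
import org.apache.spark.sql.types.IntegerType

object UnsafeRowsSketch {
  def main(args: Array[String]): Unit = {
    val output = Seq(AttributeReference("a", IntegerType, nullable = false)())
    val proj = UnsafeProjection.create(output, output)
    // copy() is required because the projection reuses its output buffer across rows.
    val unsafeRows = Seq(InternalRow(1), InternalRow(2)).map(r => proj(r).copy())
    unsafeRows.foreach(r => println(r.getInt(0))) // 1 then 2
  }
}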
Example 121
Source File: ShuffledHashJoinExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
Example 122
Source File: CartesianProductExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.hadoop.security.UserGroupInformation import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { private[this] val user = UserGroupInformation.getCurrentUser.getShortUserName override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { // We will not sort the rows, so prefixComparator and recordComparator are null. val sorter = UnsafeExternalSorter.create( context.taskMemoryManager(), SparkEnv.get(user).blockManager, SparkEnv.get(user).serializerManager, context, null, null, 1024, SparkEnv.get(user).memoryManager.pageSizeBytes, SparkEnv.get(user).conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD), false) val partition = split.asInstanceOf[CartesianPartition] for (y <- rdd2.iterator(partition.s2, context)) { sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false) } // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow] def createIter(): Iterator[UnsafeRow] = { val iter = sorter.getIterator val unsafeRow = new UnsafeRow(numFieldsOfRight) new Iterator[UnsafeRow] { override def hasNext: Boolean = { iter.hasNext } override def next(): UnsafeRow = { iter.loadNext() unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength) unsafeRow } } } val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, sorter.cleanupResources()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 123
Source File: LogicalRelation.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.util.Utils override def newInstance(): this.type = { LogicalRelation( relation, expectedOutputAttributes.map(_.map(_.newInstance())), catalogTable).asInstanceOf[this.type] } override def refresh(): Unit = relation match { case fs: HadoopFsRelation => fs.location.refresh() case _ => // Do nothing. } override def simpleString: String = s"Relation[${Utils.truncatedString(output, ",")}] $relation" }
Example 124
Source File: Exchange.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]() plan.transformUp { case exchange: Exchange => // the exchanges that have same results usually also have same schemas (same column names). val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]()) val samePlan = sameSchema.find { e => exchange.sameResult(e) } if (samePlan.isDefined) { // Keep the output of this exchange, the following plans require that to resolve // attributes. ReusedExchangeExec(exchange.output, samePlan.get, plan.user) } else { sameSchema += exchange exchange } } } }
Example 125
Source File: resources.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import java.io.File import java.net.URI import org.apache.hadoop.fs.Path import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand { override val output: Seq[Attribute] = { AttributeReference("Results", StringType, nullable = false)() :: Nil } override def run(sparkSession: SparkSession): Seq[Row] = { val jarList = sparkSession.sparkContext.listJars() if (jars.nonEmpty) { for { jarName <- jars.map(f => new Path(f).getName) jarPath <- jarList if jarPath.contains(jarName) } yield Row(jarPath) } else { jarList.map(Row(_)) } } }
Example 126
Source File: commands.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.debug._ import org.apache.spark.sql.execution.streaming.IncrementalExecution import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types._ case class ExplainCommand( logicalPlan: LogicalPlan, override val output: Seq[Attribute] = Seq(AttributeReference("plan", StringType, nullable = true)()), extended: Boolean = false, codegen: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sparkSession: SparkSession): Seq[Row] = try { val queryExecution = if (logicalPlan.isStreaming) { // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the // output mode does not matter since there is no `Sink`. new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0, 0) } else { sparkSession.sessionState.executePlan(logicalPlan) } val outputString = if (codegen) { codegenString(queryExecution.executedPlan) } else if (extended) { queryExecution.toString } else { queryExecution.simpleString } Seq(Row(outputString)) } catch { case cause: TreeNodeException[_] => ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) } }
Example 127
Source File: StreamingRelation.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LeafNode import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.datasources.DataSource object StreamingRelation { def apply(dataSource: DataSource): StreamingRelation = { StreamingRelation( dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes) } } case class StreamingRelationExec( sourceName: String, output: Seq[Attribute], override val user: String) extends LeafExecNode { override def toString: String = sourceName override protected def doExecute(): RDD[InternalRow] = { throw new UnsupportedOperationException("StreamingRelationExec cannot be executed") } } object StreamingExecutionRelation { def apply(source: Source): StreamingExecutionRelation = { StreamingExecutionRelation(source, source.schema.toAttributes) } }
Example 128
Source File: EventTimeWatermarkExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, child: SparkPlan) extends SparkPlan { override def user: String = child.user val eventTimeStats = new EventTimeStatsAccum() sparkContext.register(eventTimeStats) override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => eventTimeStats.add(getEventTime(row).getLong(0) / 1000) row } } } // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delay.milliseconds) .build() a.withMetadata(updatedMetadata) } else { a } } override def children: Seq[SparkPlan] = child :: Nil }
Example 129
Source File: CoGroupedIterator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering class CoGroupedIterator( left: Iterator[(InternalRow, Iterator[InternalRow])], right: Iterator[(InternalRow, Iterator[InternalRow])], groupingSchema: Seq[Attribute]) extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] { private val keyOrdering = GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema) private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _ private var currentRightData: (InternalRow, Iterator[InternalRow]) = _ override def hasNext: Boolean = { if (currentLeftData == null && left.hasNext) { currentLeftData = left.next() } if (currentRightData == null && right.hasNext) { currentRightData = right.next() } currentLeftData != null || currentRightData != null } override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { assert(hasNext) if (currentLeftData.eq(null)) { // left is null, right is not null, consume the right data. rightOnly() } else if (currentRightData.eq(null)) { // left is not null, right is null, consume the left data. leftOnly() } else if (currentLeftData._1 == currentRightData._1) { // left and right have the same grouping key, consume both of them. val result = (currentLeftData._1, currentLeftData._2, currentRightData._2) currentLeftData = null currentRightData = null result } else { val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1) assert(compare != 0) if (compare < 0) { // the grouping key of left is smaller, consume the left data. leftOnly() } else { // the grouping key of right is smaller, consume the right data. rightOnly() } } } private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentLeftData._1, currentLeftData._2, Iterator.empty) currentLeftData = null result } private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentRightData._1, Iterator.empty, currentRightData._2) currentRightData = null result } }
Example 130
Source File: ReferenceSort.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter case class ReferenceSort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) extends UnaryExecNode { override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => val ordering = newOrdering(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) val context = TaskContext.get() context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes) CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) } override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = child.outputPartitioning }
Example 131
Source File: SparkPlannerSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.Strategy import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union} import org.apache.spark.sql.test.SharedSQLContext class SparkPlannerSuite extends SharedSQLContext { import testImplicits._ test("Ensure to go down only the first branch, not any other possible branches") { case object NeverPlanned extends LeafNode { override def output: Seq[Attribute] = Nil } var planned = 0 object TestStrategy extends Strategy { def user: String = sparkContext.sparkUser def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case ReturnAnswer(child) => planned += 1 planLater(child, user) :: planLater(NeverPlanned, user) :: Nil case Union(children) => planned += 1 UnionExec(children.map(p => planLater(p, user))) :: planLater(NeverPlanned, user) :: Nil case LocalRelation(output, data) => planned += 1 LocalTableScanExec(output, data, user) :: planLater(NeverPlanned, user) :: Nil case NeverPlanned => fail("QueryPlanner should not go down to this branch.") case _ => Nil } } try { spark.experimental.extraStrategies = TestStrategy :: Nil val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS()) assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f")) assert(planned === 4) } finally { spark.experimental.extraStrategies = Nil } } }
Example 132
Source File: DescribeHiveTableCommand.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.collection.JavaConversions._ import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{Attribute, Row} import org.apache.spark.sql.execution.{SparkPlan, RunnableCommand} import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation} import org.apache.spark.sql.hive.HiveShim import org.apache.spark.sql.SQLContext private[hive] case class DescribeHiveTableCommand( table: MetastoreRelation, override val output: Seq[Attribute], isExtended: Boolean) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { // Trying to mimic the format of Hive's output. But not exactly the same. var results: Seq[(String, String, String)] = Nil val columns: Seq[FieldSchema] = table.hiveQlTable.getCols val partitionColumns: Seq[FieldSchema] = table.hiveQlTable.getPartCols results ++= columns.map(field => (field.getName, field.getType, field.getComment)) if (partitionColumns.nonEmpty) { val partColumnInfo = partitionColumns.map(field => (field.getName, field.getType, field.getComment)) results ++= partColumnInfo ++ Seq(("# Partition Information", "", "")) ++ Seq((s"# ${output.get(0).name}", output.get(1).name, output.get(2).name)) ++ partColumnInfo } if (isExtended) { results ++= Seq(("Detailed Table Information", table.hiveQlTable.getTTable.toString, "")) } results.map { case (name, dataType, comment) => Row(name, dataType, comment) } } }
Example 133
Source File: LocalRelation.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, analysis} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.types.{StructType, StructField} object LocalRelation { def apply(output: Attribute*): LocalRelation = new LocalRelation(output) def apply(output1: StructField, output: StructField*): LocalRelation = { new LocalRelation(StructType(output1 +: output).toAttributes) } def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[Row])) } } case class LocalRelation(output: Seq[Attribute], data: Seq[Row] = Nil) extends LeafNode with analysis.MultiInstanceRelation { override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type] } override protected def stringArgs = Iterator(output) override def sameResult(plan: LogicalPlan): Boolean = plan match { case LocalRelation(otherOutput, otherData) => otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data case _ => false } override lazy val statistics = Statistics(sizeInBytes = output.map(_.dataType.defaultSize).sum * data.length) }
Example 134
Source File: SqlParserSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.logical.Command private[sql] case class TestCommand(cmd: String) extends LogicalPlan with Command { override def output: Seq[Attribute] = Seq.empty override def children: Seq[LogicalPlan] = Seq.empty } private[sql] class SuperLongKeywordTestParser extends AbstractSparkSQLParser { protected val EXECUTE = Keyword("THISISASUPERLONGKEYWORDTEST") override protected lazy val start: Parser[LogicalPlan] = set private lazy val set: Parser[LogicalPlan] = EXECUTE ~> ident ^^ { case fileName => TestCommand(fileName) } } private[sql] class CaseInsensitiveTestParser extends AbstractSparkSQLParser { protected val EXECUTE = Keyword("EXECUTE") override protected lazy val start: Parser[LogicalPlan] = set private lazy val set: Parser[LogicalPlan] = EXECUTE ~> ident ^^ { case fileName => TestCommand(fileName) } } class SqlParserSuite extends SparkFunSuite { test("test long keyword") { val parser = new SuperLongKeywordTestParser assert(TestCommand("NotRealCommand") === parser.parse("ThisIsASuperLongKeyWordTest NotRealCommand")) } test("test case insensitive") { val parser = new CaseInsensitiveTestParser assert(TestCommand("NotRealCommand") === parser.parse("EXECUTE NotRealCommand")) assert(TestCommand("NotRealCommand") === parser.parse("execute NotRealCommand")) assert(TestCommand("NotRealCommand") === parser.parse("exEcute NotRealCommand")) } }
Example 135
Source File: SparkSQLParser.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import scala.util.parsing.combinator.RegexParsers import org.apache.spark.sql.catalyst.AbstractSparkSQLParser import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution._ import org.apache.spark.sql.types.StringType private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends AbstractSparkSQLParser { // A parser for the key-value part of the "SET [key = [value ]]" syntax private object SetCommandParser extends RegexParsers { private val key: Parser[String] = "(?m)[^=]+".r private val value: Parser[String] = "(?m).*$".r private val output: Seq[Attribute] = Seq(AttributeReference("", StringType, nullable = false)()) private val pair: Parser[LogicalPlan] = (key ~ ("=".r ~> value).?).? ^^ { case None => SetCommand(None, output) case Some(k ~ v) => SetCommand(Some(k.trim -> v.map(_.trim)), output) } def apply(input: String): LogicalPlan = parseAll(pair, input) match { case Success(plan, _) => plan case x => sys.error(x.toString) } } protected val AS = Keyword("AS") protected val CACHE = Keyword("CACHE") protected val CLEAR = Keyword("CLEAR") protected val IN = Keyword("IN") protected val LAZY = Keyword("LAZY") protected val SET = Keyword("SET") protected val SHOW = Keyword("SHOW") protected val TABLE = Keyword("TABLE") protected val TABLES = Keyword("TABLES") protected val UNCACHE = Keyword("UNCACHE") override protected lazy val start: Parser[LogicalPlan] = cache | uncache | set | show | others private lazy val cache: Parser[LogicalPlan] = CACHE ~> LAZY.? ~ (TABLE ~> ident) ~ (AS ~> restInput).? ^^ { case isLazy ~ tableName ~ plan => CacheTableCommand(tableName, plan.map(fallback), isLazy.isDefined) } private lazy val uncache: Parser[LogicalPlan] = ( UNCACHE ~ TABLE ~> ident ^^ { case tableName => UncacheTableCommand(tableName) } | CLEAR ~ CACHE ^^^ ClearCacheCommand ) private lazy val set: Parser[LogicalPlan] = SET ~> restInput ^^ { case input => SetCommandParser(input) } private lazy val show: Parser[LogicalPlan] = SHOW ~> TABLES ~ (IN ~> ident).? ^^ { case _ ~ dbName => ShowTablesCommand(dbName) } private lazy val others: Parser[LogicalPlan] = wholeInput ^^ { case input => fallback(input) } }
Example 136
Source File: LeftSemiJoinHash.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row} import org.apache.spark.sql.catalyst.plans.physical.ClusteredDistribution import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class LeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override val buildSide: BuildSide = BuildRight override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil override def output: Seq[Attribute] = left.output protected override def doExecute(): RDD[Row] = { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) => val hashSet = new java.util.HashSet[Row]() var currentRow: Row = null // Create a Hash set of buildKeys while (buildIter.hasNext) { currentRow = buildIter.next() val rowKey = buildSideKeyGenerator(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) if (!keyExists) { hashSet.add(rowKey) } } } val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue) }) } } }
Example 137
Source File: BroadcastLeftSemiJoinHash.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class BroadcastLeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override val buildSide: BuildSide = BuildRight override def output: Seq[Attribute] = left.output protected override def doExecute(): RDD[Row] = { val buildIter = buildPlan.execute().map(_.copy()).collect().toIterator val hashSet = new java.util.HashSet[Row]() var currentRow: Row = null // Create a Hash set of buildKeys while (buildIter.hasNext) { currentRow = buildIter.next() val rowKey = buildSideKeyGenerator(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) if (!keyExists) { hashSet.add(rowKey) } } } val broadcastedRelation = sparkContext.broadcast(hashSet) streamedPlan.execute().mapPartitions { streamIter => val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && broadcastedRelation.value.contains(joinKeys.currentValue) }) } } }
Example 138
Source File: CartesianProduct.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output protected override def doExecute(): RDD[Row] = { val leftResults = left.execute().map(_.copy()) val rightResults = right.execute().map(_.copy()) leftResults.cartesian(rightResults).mapPartitions { iter => val joinedRow = new JoinedRow iter.map(r => joinedRow(r._1, r._2)) } } }
Example 139
Source File: ExistingRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Row, SQLContext} private[sql] case class LogicalLocalTable(output: Seq[Attribute], rows: Seq[Row])(sqlContext: SQLContext) extends LogicalPlan with MultiInstanceRelation { override def children: Seq[LogicalPlan] = Nil override def newInstance(): this.type = LogicalLocalTable(output.map(_.newInstance()), rows)(sqlContext).asInstanceOf[this.type] override def sameResult(plan: LogicalPlan): Boolean = plan match { case LogicalRDD(_, otherRDD) => rows == rows case _ => false } @transient override lazy val statistics: Statistics = Statistics( // TODO: Improve the statistics estimation. // This is made small enough so it can be broadcasted. sizeInBytes = sqlContext.conf.autoBroadcastJoinThreshold - 1 ) }
Example 140
Source File: MetadataIteratorSpec.scala From jgit-spark-connector with Apache License 2.0 | 5 votes |
package tech.sourced.engine.iterator import java.nio.file.Paths import java.util.{Properties, UUID} import org.apache.commons.io.FileUtils import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.types.{Metadata, StringType, StructType} import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} import tech.sourced.engine.{BaseSparkSpec, Schema} class JDBCQueryIteratorSpec extends FlatSpec with Matchers with BeforeAndAfterAll with BaseSparkSpec { private val tmpPath = Paths.get( System.getProperty("java.io.tmpdir"), UUID.randomUUID.toString ) private val dbPath = tmpPath.resolve("test.db") override def beforeAll(): Unit = { super.beforeAll() tmpPath.toFile.mkdir() val rdd = ss.sparkContext.parallelize(Seq( Row("id1"), Row("id2"), Row("id3") )) val properties = new Properties() properties.put("driver", "org.sqlite.JDBC") val df = ss.createDataFrame(rdd, StructType(Seq(Schema.repositories.head))) df.write.jdbc(s"jdbc:sqlite:${dbPath.toString}", "repositories", properties) } override def afterAll(): Unit = { super.afterAll() FileUtils.deleteQuietly(tmpPath.toFile) } "JDBCQueryIterator" should "return all rows for the query" in { val iter = new JDBCQueryIterator( Seq(attr("id")), dbPath.toString, "SELECT id FROM repositories ORDER BY id" ) // calling hasNext more than one time does not cause rows to be lost iter.hasNext iter.hasNext val rows = (for (row <- iter) yield row).toArray rows.length should be(3) rows(0).length should be(1) rows(0)(0).toString should be("id1") rows(1)(0).toString should be("id2") rows(2)(0).toString should be("id3") } private def attr(name: String): Attribute = AttributeReference( name, StringType, nullable = false, Metadata.empty )() }
Example 141
Source File: DescribeHiveTableCommand.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.collection.JavaConversions._ import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.hive.MetastoreRelation import org.apache.spark.sql.{Row, SQLContext} private[hive] case class DescribeHiveTableCommand( table: MetastoreRelation, override val output: Seq[Attribute], isExtended: Boolean) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { // Trying to mimic the format of Hive's output. But not exactly the same. var results: Seq[(String, String, String)] = Nil val columns: Seq[FieldSchema] = table.hiveQlTable.getCols val partitionColumns: Seq[FieldSchema] = table.hiveQlTable.getPartCols results ++= columns.map(field => (field.getName, field.getType, field.getComment)) if (partitionColumns.nonEmpty) { val partColumnInfo = partitionColumns.map(field => (field.getName, field.getType, field.getComment)) results ++= partColumnInfo ++ Seq(("# Partition Information", "", "")) ++ Seq((s"# ${output.get(0).name}", output.get(1).name, output.get(2).name)) ++ partColumnInfo } if (isExtended) { results ++= Seq(("Detailed Table Information", table.hiveQlTable.getTTable.toString, "")) } results.map { case (name, dataType, comment) => Row(name, dataType, comment) } } }
Example 142
Source File: LocalRelation.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, analysis} import org.apache.spark.sql.types.{StructField, StructType} object LocalRelation { def apply(output: Attribute*): LocalRelation = new LocalRelation(output) def apply(output1: StructField, output: StructField*): LocalRelation = { new LocalRelation(StructType(output1 +: output).toAttributes) } def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } } case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil) extends LeafNode with analysis.MultiInstanceRelation { override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type] } override protected def stringArgs = Iterator(output) override def sameResult(plan: LogicalPlan): Boolean = plan match { case LocalRelation(otherOutput, otherData) => otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data case _ => false } override lazy val statistics = Statistics(sizeInBytes = output.map(_.dataType.defaultSize).sum * data.length) }
Example 143
Source File: SqlParserSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.logical.Command private[sql] case class TestCommand(cmd: String) extends LogicalPlan with Command { override def output: Seq[Attribute] = Seq.empty override def children: Seq[LogicalPlan] = Seq.empty } private[sql] class SuperLongKeywordTestParser extends AbstractSparkSQLParser { protected val EXECUTE = Keyword("THISISASUPERLONGKEYWORDTEST") override protected lazy val start: Parser[LogicalPlan] = set private lazy val set: Parser[LogicalPlan] = EXECUTE ~> ident ^^ { case fileName => TestCommand(fileName) } } private[sql] class CaseInsensitiveTestParser extends AbstractSparkSQLParser { protected val EXECUTE = Keyword("EXECUTE") override protected lazy val start: Parser[LogicalPlan] = set private lazy val set: Parser[LogicalPlan] = EXECUTE ~> ident ^^ { case fileName => TestCommand(fileName) } } class SqlParserSuite extends SparkFunSuite { test("test long keyword") { val parser = new SuperLongKeywordTestParser assert(TestCommand("NotRealCommand") === parser.parse("ThisIsASuperLongKeyWordTest NotRealCommand")) } test("test case insensitive") { val parser = new CaseInsensitiveTestParser assert(TestCommand("NotRealCommand") === parser.parse("EXECUTE NotRealCommand")) assert(TestCommand("NotRealCommand") === parser.parse("execute NotRealCommand")) assert(TestCommand("NotRealCommand") === parser.parse("exEcute NotRealCommand")) } }
Example 144
Source File: SparkSQLParser.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import scala.util.parsing.combinator.RegexParsers import org.apache.spark.sql.catalyst.AbstractSparkSQLParser import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{DescribeFunction, LogicalPlan, ShowFunctions} import org.apache.spark.sql.execution._ import org.apache.spark.sql.types.StringType private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends AbstractSparkSQLParser { // A parser for the key-value part of the "SET [key = [value ]]" syntax private object SetCommandParser extends RegexParsers { private val key: Parser[String] = "(?m)[^=]+".r private val value: Parser[String] = "(?m).*$".r private val output: Seq[Attribute] = Seq(AttributeReference("", StringType, nullable = false)()) private val pair: Parser[LogicalPlan] = (key ~ ("=".r ~> value).?).? ^^ { case None => SetCommand(None) case Some(k ~ v) => SetCommand(Some(k.trim -> v.map(_.trim))) } def apply(input: String): LogicalPlan = parseAll(pair, input) match { case Success(plan, _) => plan case x => sys.error(x.toString) } } protected val AS = Keyword("AS") protected val CACHE = Keyword("CACHE") protected val CLEAR = Keyword("CLEAR") protected val DESCRIBE = Keyword("DESCRIBE") protected val EXTENDED = Keyword("EXTENDED") protected val FUNCTION = Keyword("FUNCTION") protected val FUNCTIONS = Keyword("FUNCTIONS") protected val IN = Keyword("IN") protected val LAZY = Keyword("LAZY") protected val SET = Keyword("SET") protected val SHOW = Keyword("SHOW") protected val TABLE = Keyword("TABLE") protected val TABLES = Keyword("TABLES") protected val UNCACHE = Keyword("UNCACHE") override protected lazy val start: Parser[LogicalPlan] = cache | uncache | set | show | desc | others private lazy val cache: Parser[LogicalPlan] = CACHE ~> LAZY.? ~ (TABLE ~> ident) ~ (AS ~> restInput).? ^^ { case isLazy ~ tableName ~ plan => CacheTableCommand(tableName, plan.map(fallback), isLazy.isDefined) } private lazy val uncache: Parser[LogicalPlan] = ( UNCACHE ~ TABLE ~> ident ^^ { case tableName => UncacheTableCommand(tableName) } | CLEAR ~ CACHE ^^^ ClearCacheCommand ) private lazy val set: Parser[LogicalPlan] = SET ~> restInput ^^ { case input => SetCommandParser(input) } // It can be the following patterns: // SHOW FUNCTIONS; // SHOW FUNCTIONS mydb.func1; // SHOW FUNCTIONS func1; // SHOW FUNCTIONS `mydb.a`.`func1.aa`; private lazy val show: Parser[LogicalPlan] = ( SHOW ~> TABLES ~ (IN ~> ident).? ^^ { case _ ~ dbName => ShowTablesCommand(dbName) } | SHOW ~ FUNCTIONS ~> ((ident <~ ".").? ~ (ident | stringLit)).? ^^ { case Some(f) => ShowFunctions(f._1, Some(f._2)) case None => ShowFunctions(None, None) } ) private lazy val desc: Parser[LogicalPlan] = DESCRIBE ~ FUNCTION ~> EXTENDED.? ~ (ident | stringLit) ^^ { case isExtended ~ functionName => DescribeFunction(functionName, isExtended.isDefined) } private lazy val others: Parser[LogicalPlan] = wholeInput ^^ { case input => fallback(input) } }
Example 145
Source File: package.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.collection.mutable.HashSet import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.{Accumulator, AccumulatorParam, Logging} case class ColumnMetrics( elementTypes: Accumulator[HashSet[String]] = sparkContext.accumulator(HashSet.empty)) val tupleCount: Accumulator[Int] = sparkContext.accumulator[Int](0) val numColumns: Int = child.output.size val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics()) def dumpStats(): Unit = { logDebug(s"== ${child.simpleString} ==") logDebug(s"Tuples output: ${tupleCount.value}") child.output.zip(columnStats).foreach { case(attr, metric) => val actualDataTypes = metric.elementTypes.value.mkString("{", ",", "}") logDebug(s" ${attr.name} ${attr.dataType}: $actualDataTypes") } } protected override def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => new Iterator[InternalRow] { def hasNext: Boolean = iter.hasNext def next(): InternalRow = { val currentRow = iter.next() tupleCount += 1 var i = 0 while (i < numColumns) { val value = currentRow.get(i, output(i).dataType) if (value != null) { columnStats(i).elementTypes += HashSet(value.getClass.getName) } i += 1 } currentRow } } } } } }
Example 146
Source File: CartesianProduct.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics @DeveloperApi case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().map { row => numLeftRows += 1 row.copy() } val rightResults = right.execute().map { row => numRightRows += 1 row.copy() } leftResults.cartesian(rightResults).mapPartitions { iter => val joinedRow = new JoinedRow iter.map { r => numOutputRows += 1 joinedRow(r._1, r._2) } } } }
Example 147
Source File: ExtraStrategiesSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package test.org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute} import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.{Row, Strategy, QueryTest} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.unsafe.types.UTF8String // A fast operator case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) sparkContext.parallelize(Seq(row)) } // Nil is the empty List override def children: Seq[SparkPlan] = Nil } // A test strategy object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => // Nil is the empty List; :: prepends an element, building a new list FastOperator(attr.toAttribute :: Nil) :: Nil // Nil is the empty List; :: prepends an element, building a new list case _ => Nil } } // Suite for extra strategies class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { // insert an extra strategy try { // Nil is the empty List; :: prepends an element, building a new list sqlContext.experimental.extraStrategies = TestStrategy :: Nil val df = sqlContext.sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { // Nil is the empty List; :: prepends an element, building a new list sqlContext.experimental.extraStrategies = Nil } } }
Example 148
Source File: hbaseCommands.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.execution import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.hbase._ import org.apache.spark.sql.hbase.util.DataTypeUtils import org.apache.spark.sql.types._ import scala.collection.mutable.ArrayBuffer @DeveloperApi case class AlterDropColCommand(namespace: String, tableName: String, columnName: String) extends RunnableCommand { def run(sparkSession: SparkSession): Seq[Row] = { sparkSession.sharedState.externalCatalog.asInstanceOf[HBaseCatalog] .alterTableDropNonKey(namespace, tableName, columnName) sparkSession.sharedState.externalCatalog.asInstanceOf[HBaseCatalog].stopAdmin() Seq.empty[Row] } } @DeveloperApi case class AlterAddColCommand(namespace: String, tableName: String, colName: String, colType: String, colFamily: String, colQualifier: String) extends RunnableCommand { def run(sparkSession: SparkSession): Seq[Row] = { val hbaseCatalog = sparkSession.sharedState.externalCatalog.asInstanceOf[HBaseCatalog] hbaseCatalog.alterTableAddNonKey(namespace, tableName, NonKeyColumn(colName, DataTypeUtils.getDataType(colType), colFamily, colQualifier)) hbaseCatalog.stopAdmin() Seq.empty[Row] } } @DeveloperApi case class InsertValueIntoTableCommand(tid: TableIdentifier, valueSeq: Seq[String]) extends RunnableCommand { override def run(sparkSession: SparkSession) = { val relation: HBaseRelation = sparkSession.sessionState.catalog.externalCatalog .asInstanceOf[HBaseCatalog] .getHBaseRelation(tid.database.getOrElse(null), tid.table).getOrElse(null) val bytes = valueSeq.zipWithIndex.map(v => DataTypeUtils.string2TypeData(v._1, relation.schema(v._2).dataType)) val rows = sparkSession.sparkContext.makeRDD(Seq(Row.fromSeq(bytes))) val inputValuesDF = sparkSession.createDataFrame(rows, relation.schema) relation.insert(inputValuesDF, overwrite = false) Seq.empty[Row] } override def output: Seq[Attribute] = Seq.empty }
Example 149
Source File: KafkaStreamWriter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010 import scala.collection.JavaConverters._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.kafka010.KafkaWriter.validateQuery import org.apache.spark.sql.sources.v2.writer._ import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.types.StructType class KafkaStreamDataWriter( targetTopic: Option[String], producerParams: Map[String, String], inputSchema: Seq[Attribute]) extends KafkaRowWriter(inputSchema, targetTopic) with DataWriter[InternalRow] { import scala.collection.JavaConverters._ private lazy val producer = CachedKafkaProducer.getOrCreate( new java.util.HashMap[String, Object](producerParams.asJava)) def write(row: InternalRow): Unit = { checkForErrors() sendRow(row, producer) } def commit(): WriterCommitMessage = { // Send is asynchronous, but we can't commit until all rows are actually in Kafka. // This requires flushing and then checking that no callbacks produced errors. // We also check for errors before to fail as soon as possible - the check is cheap. checkForErrors() producer.flush() checkForErrors() KafkaWriterCommitMessage } def abort(): Unit = {} def close(): Unit = { checkForErrors() if (producer != null) { producer.flush() checkForErrors() CachedKafkaProducer.close(new java.util.HashMap[String, Object](producerParams.asJava)) } } }
Example 150
Source File: InsertIntoHiveDirCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.language.existentials import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.common.FileUtils import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.mapred._ import org.apache.spark.SparkException import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.hive.client.HiveClientImpl case class InsertIntoHiveDirCommand( isLocal: Boolean, storage: CatalogStorageFormat, query: LogicalPlan, overwrite: Boolean, outputColumns: Seq[Attribute]) extends SaveAsHiveFile { override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { assert(storage.locationUri.nonEmpty) val hiveTable = HiveClientImpl.toHiveTable(CatalogTable( identifier = TableIdentifier(storage.locationUri.get.toString, Some("default")), tableType = org.apache.spark.sql.catalyst.catalog.CatalogTableType.VIEW, storage = storage, schema = query.schema )) hiveTable.getMetadata.put(serdeConstants.SERIALIZATION_LIB, storage.serde.getOrElse(classOf[LazySimpleSerDe].getName)) val tableDesc = new TableDesc( hiveTable.getInputFormatClass, hiveTable.getOutputFormatClass, hiveTable.getMetadata ) val hadoopConf = sparkSession.sessionState.newHadoopConf() val jobConf = new JobConf(hadoopConf) val targetPath = new Path(storage.locationUri.get) val writeToPath = if (isLocal) { val localFileSystem = FileSystem.getLocal(jobConf) localFileSystem.makeQualified(targetPath) } else { val qualifiedPath = FileUtils.makeQualified(targetPath, hadoopConf) val dfs = qualifiedPath.getFileSystem(jobConf) if (!dfs.exists(qualifiedPath)) { dfs.mkdirs(qualifiedPath.getParent) } qualifiedPath } val tmpPath = getExternalTmpPath(sparkSession, hadoopConf, writeToPath) val fileSinkConf = new org.apache.spark.sql.hive.HiveShim.ShimFileSinkDesc( tmpPath.toString, tableDesc, false) try { saveAsHiveFile( sparkSession = sparkSession, plan = child, hadoopConf = hadoopConf, fileSinkConf = fileSinkConf, outputLocation = tmpPath.toString, allColumns = outputColumns) val fs = writeToPath.getFileSystem(hadoopConf) if (overwrite && fs.exists(writeToPath)) { fs.listStatus(writeToPath).foreach { existFile => if (Option(existFile.getPath) != createdTempDir) fs.delete(existFile.getPath, true) } } fs.listStatus(tmpPath).foreach { tmpFile => fs.rename(tmpFile.getPath, writeToPath) } } catch { case e: Throwable => throw new SparkException( "Failed inserting overwrite directory " + storage.locationUri.get, e) } finally { deleteExternalTmpPath(hadoopConf) } Seq.empty[Row] } }
Example 151
Source File: CreateHiveTableAsSelectCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.util.control.NonFatal import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command.DataWritingCommand case class CreateHiveTableAsSelectCommand( tableDesc: CatalogTable, query: LogicalPlan, outputColumns: Seq[Attribute], mode: SaveMode) extends DataWritingCommand { private val tableIdentifier = tableDesc.identifier override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { val catalog = sparkSession.sessionState.catalog if (catalog.tableExists(tableIdentifier)) { assert(mode != SaveMode.Overwrite, s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite") if (mode == SaveMode.ErrorIfExists) { throw new AnalysisException(s"$tableIdentifier already exists.") } if (mode == SaveMode.Ignore) { // Since the table already exists and the save mode is Ignore, we will just return. return Seq.empty } InsertIntoHiveTable( tableDesc, Map.empty, query, overwrite = false, ifPartitionNotExists = false, outputColumns = outputColumns).run(sparkSession, child) } else { // TODO ideally, we should get the output data ready first and then // add the relation into catalog, just in case of failure occurs while data // processing. assert(tableDesc.schema.isEmpty) catalog.createTable(tableDesc.copy(schema = query.schema), ignoreIfExists = false) try { // Read back the metadata of the table which was created just now. val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier) // For CTAS, there is no static partition values to insert. val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap InsertIntoHiveTable( createdTableMeta, partition, query, overwrite = true, ifPartitionNotExists = false, outputColumns = outputColumns).run(sparkSession, child) } catch { case NonFatal(e) => // drop the created table. catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false) throw e } } Seq.empty[Row] } override def argString: String = { s"[Database:${tableDesc.database}}, " + s"TableName: ${tableDesc.identifier.table}, " + s"InsertIntoHiveTable]" } }
Example 152
Source File: joinTypes.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans import java.util.Locale import org.apache.spark.sql.catalyst.expressions.Attribute object JoinType { def apply(typ: String): JoinType = typ.toLowerCase(Locale.ROOT).replace("_", "") match { case "inner" => Inner case "outer" | "full" | "fullouter" => FullOuter case "leftouter" | "left" => LeftOuter case "rightouter" | "right" => RightOuter case "leftsemi" => LeftSemi case "leftanti" => LeftAnti case "cross" => Cross case _ => val supported = Seq( "inner", "outer", "full", "fullouter", "full_outer", "leftouter", "left", "left_outer", "rightouter", "right", "right_outer", "leftsemi", "left_semi", "leftanti", "left_anti", "cross") throw new IllegalArgumentException(s"Unsupported join type '$typ'. " + "Supported join types include: " + supported.mkString("'", "', '", "'") + ".") } } sealed abstract class JoinType { def sql: String } sealed abstract class InnerLike extends JoinType { def explicitCartesian: Boolean } case object Inner extends InnerLike { override def explicitCartesian: Boolean = false override def sql: String = "INNER" } case object Cross extends InnerLike { override def explicitCartesian: Boolean = true override def sql: String = "CROSS" } case object LeftOuter extends JoinType { override def sql: String = "LEFT OUTER" } case object RightOuter extends JoinType { override def sql: String = "RIGHT OUTER" } case object FullOuter extends JoinType { override def sql: String = "FULL OUTER" } case object LeftSemi extends JoinType { override def sql: String = "LEFT SEMI" } case object LeftAnti extends JoinType { override def sql: String = "LEFT ANTI" } case class ExistenceJoin(exists: Attribute) extends JoinType { override def sql: String = { // This join type is only used in the end of optimizer and physical plans, we will not // generate SQL for this join type throw new UnsupportedOperationException } } case class NaturalJoin(tpe: JoinType) extends JoinType { require(Seq(Inner, LeftOuter, RightOuter, FullOuter).contains(tpe), "Unsupported natural join type " + tpe) override def sql: String = "NATURAL " + tpe.sql } case class UsingJoin(tpe: JoinType, usingColumns: Seq[String]) extends JoinType { require(Seq(Inner, LeftOuter, LeftSemi, RightOuter, FullOuter, LeftAnti).contains(tpe), "Unsupported using join type " + tpe) override def sql: String = "USING " + tpe.sql } object LeftExistence { def unapply(joinType: JoinType): Option[JoinType] = joinType match { case LeftSemi | LeftAnti => Some(joinType) case j: ExistenceJoin => Some(joinType) case _ => None } }
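JoinType.apply is the single place where user-facing join-type strings are normalized, so a quick check (a minimal sketch using only the classes shown above) illustrates how the aliases and the generated SQL keywords line up:

import org.apache.spark.sql.catalyst.plans._

// Underscores and case are ignored: "left_outer", "LEFT" and "leftouter" all resolve the same way.
assert(JoinType("left_outer") == LeftOuter)
assert(JoinType("LEFT") == LeftOuter)
assert(JoinType("cross") == Cross)

println(LeftOuter.sql)                     // LEFT OUTER
println(UsingJoin(Inner, Seq("id")).sql)   // USING INNER

// An unknown name raises IllegalArgumentException listing the supported types.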
Example 153
Source File: ProjectEstimation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical.statsEstimation import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeMap} import org.apache.spark.sql.catalyst.plans.logical.{Project, Statistics} object ProjectEstimation { import EstimationUtils._ def estimate(project: Project): Option[Statistics] = { if (rowCountsExist(project.child)) { val childStats = project.child.stats val inputAttrStats = childStats.attributeStats // Match alias with its child's column stat val aliasStats = project.expressions.collect { case alias @ Alias(attr: Attribute, _) if inputAttrStats.contains(attr) => alias.toAttribute -> inputAttrStats(attr) } val outputAttrStats = getOutputMap(AttributeMap(inputAttrStats.toSeq ++ aliasStats), project.output) Some(childStats.copy( sizeInBytes = getOutputSize(project.output, childStats.rowCount.get, outputAttrStats), attributeStats = outputAttrStats)) } else { None } } }
Example 154
Source File: AggregateEstimation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical.statsEstimation import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Statistics} object AggregateEstimation { import EstimationUtils._ def estimate(agg: Aggregate): Option[Statistics] = { val childStats = agg.child.stats // Check if we have column stats for all group-by columns. val colStatsExist = agg.groupingExpressions.forall { e => e.isInstanceOf[Attribute] && childStats.attributeStats.contains(e.asInstanceOf[Attribute]) } if (rowCountsExist(agg.child) && colStatsExist) { // Multiply distinct counts of group-by columns. This is an upper bound, which assumes // the data contains all combinations of distinct values of group-by columns. var outputRows: BigInt = agg.groupingExpressions.foldLeft(BigInt(1))( (res, expr) => res * childStats.attributeStats(expr.asInstanceOf[Attribute]).distinctCount) outputRows = if (agg.groupingExpressions.isEmpty) { // If there's no group-by columns, the output is a single row containing values of aggregate // functions: aggregated results for non-empty input or initial values for empty input. 1 } else { // Here we set another upper bound for the number of output rows: it must not be larger than // child's number of rows. outputRows.min(childStats.rowCount.get) } val outputAttrStats = getOutputMap(childStats.attributeStats, agg.output) Some(Statistics( sizeInBytes = getOutputSize(agg.output, outputRows, outputAttrStats), rowCount = Some(outputRows), attributeStats = outputAttrStats, hints = childStats.hints)) } else { None } } }
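The row-count logic above boils down to "product of the group-by columns' distinct counts, capped by the child's row count". A tiny plain-Scala sketch with hypothetical numbers (not taken from the source) makes the bound concrete:

// Hypothetical stats: two group-by columns with 10 and 50 distinct values,
// over a child plan that produces 300 rows.
val distinctCounts = Seq(BigInt(10), BigInt(50))
val childRowCount = BigInt(300)

val combinations = distinctCounts.product            // at most 500 distinct groups
val estimatedRows = combinations.min(childRowCount)  // but never more than 300 input rows
println(estimatedRows)                               // 300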
Example 155
Source File: ScriptTransformation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 156
Source File: EventTimeWatermark.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval object EventTimeWatermark { // The metadata key used to hold the eventTime watermark delay. val delayKey = "spark.watermarkDelayMs" def getDelayMs(delay: CalendarInterval): Long = { // A month is counted as 31 days to simplify the calculation. val millisPerMonth = CalendarInterval.MICROS_PER_DAY / 1000 * 31 delay.milliseconds + delay.months * millisPerMonth } } case class EventTimeWatermark( eventTime: Attribute, delay: CalendarInterval, child: LogicalPlan) extends UnaryNode { // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val delayMs = EventTimeWatermark.getDelayMs(delay) val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delayMs) .build() a.withMetadata(updatedMetadata) } else if (a.metadata.contains(EventTimeWatermark.delayKey)) { // Remove existing watermark val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .remove(EventTimeWatermark.delayKey) .build() a.withMetadata(updatedMetadata) } else { a } } }
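Because the node only rewrites column metadata, its effect can be observed directly on a LocalRelation child. A minimal sketch, assuming a Spark 2.3 classpath; the column name "ts" and the 10-second delay are arbitrary:

import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, LocalRelation}
import org.apache.spark.sql.types.TimestampType
import org.apache.spark.unsafe.types.CalendarInterval

val eventTime = AttributeReference("ts", TimestampType)()
val plan = EventTimeWatermark(
  eventTime,
  CalendarInterval.fromString("interval 10 seconds"),
  LocalRelation(eventTime))

// The output attribute now carries the delay (in milliseconds) in its metadata.
println(plan.output.head.metadata.getLong(EventTimeWatermark.delayKey))  // 10000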
Example 157
Source File: LocalRelation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} import org.apache.spark.sql.types.{StructField, StructType} object LocalRelation { def apply(output: Attribute*): LocalRelation = new LocalRelation(output) def apply(output1: StructField, output: StructField*): LocalRelation = { new LocalRelation(StructType(output1 +: output).toAttributes) } def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } } case class LocalRelation( output: Seq[Attribute], data: Seq[InternalRow] = Nil, // Indicates whether this relation has data from a streaming source. override val isStreaming: Boolean = false) extends LeafNode with analysis.MultiInstanceRelation { // A local relation must have resolved output. require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.") override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data, isStreaming).asInstanceOf[this.type] } override protected def stringArgs: Iterator[Any] = { if (data.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def computeStats(): Statistics = Statistics(sizeInBytes = output.map(n => BigInt(n.dataType.defaultSize)).sum * data.length) def toSQL(inlineTableName: String): String = { require(data.nonEmpty) val types = output.map(_.dataType) val rows = data.map { row => val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql } cells.mkString("(", ", ", ")") } "VALUES " + rows.mkString(", ") + " AS " + inlineTableName + output.map(_.name).mkString("(", ", ", ")") } }
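Beyond tests, LocalRelation is handy for building small, fully resolved plans by hand: fromExternalRows converts external Rows to InternalRows, and toSQL renders the data as an inline VALUES table. A minimal sketch assuming a Spark 2.3 classpath; the column names and values are made up:

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{IntegerType, StringType}

val output = Seq(
  AttributeReference("id", IntegerType, nullable = false)(),
  AttributeReference("name", StringType)())

val relation = LocalRelation.fromExternalRows(output, Seq(Row(1, "a"), Row(2, "b")))

println(relation.computeStats().sizeInBytes)  // 2 rows * (4 + 20) default bytes = 48
println(relation.toSQL("t"))                  // VALUES (1, 'a'), (2, 'b') AS t(id, name)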
Example 158
Source File: StatsEstimationTestBase.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.statsEstimation import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{IntegerType, StringType} trait StatsEstimationTestBase extends SparkFunSuite { var originalValue: Boolean = false override def beforeAll(): Unit = { super.beforeAll() // Enable stats estimation based on CBO. originalValue = SQLConf.get.getConf(SQLConf.CBO_ENABLED) SQLConf.get.setConf(SQLConf.CBO_ENABLED, true) } override def afterAll(): Unit = { SQLConf.get.setConf(SQLConf.CBO_ENABLED, originalValue) super.afterAll() } def getColSize(attribute: Attribute, colStat: ColumnStat): Long = attribute.dataType match { // For UTF8String: base + offset + numBytes case StringType => colStat.avgLen + 8 + 4 case _ => colStat.avgLen } def attr(colName: String): AttributeReference = AttributeReference(colName, IntegerType)() case class StatsTestPlan( outputList: Seq[Attribute], rowCount: BigInt, attributeStats: AttributeMap[ColumnStat], size: Option[BigInt] = None) extends LeafNode { override def output: Seq[Attribute] = outputList override def computeStats(): Statistics = Statistics( // If sizeInBytes is useless in testing, we just use a fake value sizeInBytes = size.getOrElse(Int.MaxValue), rowCount = Some(rowCount), attributeStats = attributeStats) }
Example 159
Source File: LogicalPlanSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types.IntegerType class LogicalPlanSuite extends SparkFunSuite { private var invocationCount = 0 private val function: PartialFunction[LogicalPlan, LogicalPlan] = { case p: Project => invocationCount += 1 p } private val testRelation = LocalRelation() test("transformUp runs on operators") { invocationCount = 0 val plan = Project(Nil, testRelation) plan transformUp function assert(invocationCount === 1) invocationCount = 0 plan transformDown function assert(invocationCount === 1) } test("transformUp runs on operators recursively") { invocationCount = 0 val plan = Project(Nil, Project(Nil, testRelation)) plan transformUp function assert(invocationCount === 2) invocationCount = 0 plan transformDown function assert(invocationCount === 2) } test("transformUp skips all ready resolved plans wrapped in analysis barrier") { invocationCount = 0 val plan = AnalysisBarrier(Project(Nil, Project(Nil, testRelation))) plan transformUp function assert(invocationCount === 0) invocationCount = 0 plan transformDown function assert(invocationCount === 0) } test("transformUp skips partially resolved plans wrapped in analysis barrier") { invocationCount = 0 val plan1 = AnalysisBarrier(Project(Nil, testRelation)) val plan2 = Project(Nil, plan1) plan2 transformUp function assert(invocationCount === 1) invocationCount = 0 plan2 transformDown function assert(invocationCount === 1) } test("isStreaming") { val relation = LocalRelation(AttributeReference("a", IntegerType, nullable = true)()) val incrementalRelation = LocalRelation( Seq(AttributeReference("a", IntegerType, nullable = true)()), isStreaming = true) case class TestBinaryRelation(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output } require(relation.isStreaming === false) require(incrementalRelation.isStreaming === true) assert(TestBinaryRelation(relation, relation).isStreaming === false) assert(TestBinaryRelation(incrementalRelation, relation).isStreaming === true) assert(TestBinaryRelation(relation, incrementalRelation).isStreaming === true) assert(TestBinaryRelation(incrementalRelation, incrementalRelation).isStreaming) } }
Example 160
Source File: DeclarativeAggregateEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.aggregate import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection case class DeclarativeAggregateEvaluator(function: DeclarativeAggregate, input: Seq[Attribute]) { lazy val initializer = GenerateSafeProjection.generate(function.initialValues) lazy val updater = GenerateSafeProjection.generate( function.updateExpressions, function.aggBufferAttributes ++ input) lazy val merger = GenerateSafeProjection.generate( function.mergeExpressions, function.aggBufferAttributes ++ function.inputAggBufferAttributes) lazy val evaluator = GenerateSafeProjection.generate( function.evaluateExpression :: Nil, function.aggBufferAttributes) def initialize(): InternalRow = initializer.apply(InternalRow.empty).copy() def update(values: InternalRow*): InternalRow = { val joiner = new JoinedRow val buffer = values.foldLeft(initialize()) { (buffer, input) => updater(joiner(buffer, input)) } buffer.copy() } def merge(buffers: InternalRow*): InternalRow = { val joiner = new JoinedRow val buffer = buffers.foldLeft(initialize()) { (left, right) => merger(joiner(left, right)) } buffer.copy() } def eval(buffer: InternalRow): InternalRow = evaluator(buffer).copy() }
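The evaluator is a test helper, but it walks the whole declarative-aggregate lifecycle (initialize, update, merge, eval). A minimal sketch, assuming it is compiled next to the class above; Count is a stock DeclarativeAggregate and the input values are arbitrary:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.types.IntegerType

val input = AttributeReference("i", IntegerType)()
val evaluator = DeclarativeAggregateEvaluator(Count(Seq(input)), Seq(input))

// Nulls are ignored by COUNT, so only two of the three inputs are counted.
val buffer = evaluator.update(InternalRow(1), InternalRow(null), InternalRow(3))
assert(evaluator.eval(buffer) == InternalRow(2L))

// Partial aggregation: merge two independently built buffers.
val merged = evaluator.merge(buffer, evaluator.update(InternalRow(5)))
assert(evaluator.eval(merged) == InternalRow(3L))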
Example 161
Source File: LocalTableScanExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class LocalTableScanExec( output: Seq[Attribute], @transient rows: Seq[InternalRow]) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) @transient private lazy val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), sqlContext.sparkContext.defaultParallelism) private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.size) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.size) taken } }
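The interesting step in this node is UnsafeProjection.create(output, output), which compiles a projection that copies each incoming row into the UnsafeRow layout described by those same attributes. A minimal standalone sketch (no SparkSession needed; the schema and values are illustrative only):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, UnsafeProjection}
import org.apache.spark.sql.types.{IntegerType, StringType}
import org.apache.spark.unsafe.types.UTF8String

val output = Seq(
  AttributeReference("id", IntegerType)(),
  AttributeReference("name", StringType)())

// Bind the output attributes against themselves, exactly as LocalTableScanExec does.
val proj = UnsafeProjection.create(output, output)

// The projection reuses one UnsafeRow buffer internally, hence the copy() before keeping rows.
val unsafe = proj(InternalRow(1, UTF8String.fromString("a"))).copy()
println(unsafe.getInt(0) + " " + unsafe.getUTF8String(1))  // 1 a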
Example 162
Source File: ObjectAggregationMap.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate import java.{util => ju} import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.internal.config import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, TypedImperativeAggregate} import org.apache.spark.sql.execution.UnsafeKVExternalSorter import org.apache.spark.sql.types.StructType import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter def dumpToExternalSorter( groupingAttributes: Seq[Attribute], aggregateFunctions: Seq[AggregateFunction]): UnsafeKVExternalSorter = { val aggBufferAttributes = aggregateFunctions.flatMap(_.aggBufferAttributes) val sorter = new UnsafeKVExternalSorter( StructType.fromAttributes(groupingAttributes), StructType.fromAttributes(aggBufferAttributes), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, TaskContext.get().taskMemoryManager().pageSizeBytes, SparkEnv.get.conf.get(config.SHUFFLE_SPILL_NUM_ELEMENTS_FORCE_SPILL_THRESHOLD), null ) val mapIterator = iterator val unsafeAggBufferProjection = UnsafeProjection.create(aggBufferAttributes.map(_.dataType).toArray) while (mapIterator.hasNext) { val entry = mapIterator.next() aggregateFunctions.foreach { case agg: TypedImperativeAggregate[_] => agg.serializeAggregateBufferInPlace(entry.aggregationBuffer) case _ => } sorter.insertKV( entry.groupingKey, unsafeAggBufferProjection(entry.aggregationBuffer) ) } hashMap.clear() sorter } def clear(): Unit = { hashMap.clear() } } // Stores the grouping key and aggregation buffer class AggregationBufferEntry(var groupingKey: UnsafeRow, var aggregationBuffer: InternalRow)
Example 163
Source File: CartesianProductExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, ExternalAppendOnlyUnsafeRowArray, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.util.CompletionIterator class UnsafeCartesianRDD( left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int, inMemoryBufferThreshold: Int, spillThreshold: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { val rowArray = new ExternalAppendOnlyUnsafeRowArray(inMemoryBufferThreshold, spillThreshold) val partition = split.asInstanceOf[CartesianPartition] rdd2.iterator(partition.s2, context).foreach(rowArray.add) // Create an iterator from rowArray def createIter(): Iterator[UnsafeRow] = rowArray.generateIterator() val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, rowArray.clear()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD( leftResults, rightResults, right.output.size, sqlContext.conf.cartesianProductExecBufferInMemoryThreshold, sqlContext.conf.cartesianProductExecBufferSpillThreshold) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 164
Source File: DataSourcePartitioning.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression}
import org.apache.spark.sql.catalyst.plans.physical
import org.apache.spark.sql.sources.v2.reader.partitioning.{ClusteredDistribution, Partitioning}

class DataSourcePartitioning(
    partitioning: Partitioning,
    colNames: AttributeMap[String]) extends physical.Partitioning {

  override val numPartitions: Int = partitioning.numPartitions()

  override def satisfies(required: physical.Distribution): Boolean = {
    super.satisfies(required) || {
      required match {
        case d: physical.ClusteredDistribution if isCandidate(d.clustering) =>
          val attrs = d.clustering.map(_.asInstanceOf[Attribute])
          partitioning.satisfy(
            new ClusteredDistribution(attrs.map { a =>
              val name = colNames.get(a)
              assert(name.isDefined, s"Attribute ${a.name} is not found in the data source output")
              name.get
            }.toArray))

        case _ => false
      }
    }
  }

  private def isCandidate(clustering: Seq[Expression]): Boolean = {
    clustering.forall {
      case a: Attribute => colNames.contains(a)
      case _ => false
    }
  }
}
Example 165
Source File: DataSourceReaderHolder.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2

import java.util.Objects

import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.sources.v2.reader._

  private def metadata: Seq[Any] = {
    val filters: Any = reader match {
      case s: SupportsPushDownCatalystFilters => s.pushedCatalystFilters().toSet
      case s: SupportsPushDownFilters => s.pushedFilters().toSet
      case _ => Nil
    }
    Seq(output, reader.getClass, filters)
  }

  def canEqual(other: Any): Boolean

  override def equals(other: Any): Boolean = other match {
    case other: DataSourceReaderHolder =>
      canEqual(other) && metadata.length == other.metadata.length &&
        metadata.zip(other.metadata).forall { case (l, r) => l == r }
    case _ => false
  }

  override def hashCode(): Int = {
    metadata.map(Objects.hashCode).foldLeft(0)((a, b) => 31 * a + b)
  }
}
Example 166
Source File: ddl.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.util.Locale import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogUtils} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.command.{DDLUtils, RunnableCommand} import org.apache.spark.sql.types._ case class CreateTempViewUsing( tableIdent: TableIdentifier, userSpecifiedSchema: Option[StructType], replace: Boolean, global: Boolean, provider: String, options: Map[String, String]) extends RunnableCommand { if (tableIdent.database.isDefined) { throw new AnalysisException( s"Temporary view '$tableIdent' should not have specified a database") } override def argString: String = { s"[tableIdent:$tableIdent " + userSpecifiedSchema.map(_ + " ").getOrElse("") + s"replace:$replace " + s"provider:$provider " + CatalogUtils.maskCredentials(options) } override def run(sparkSession: SparkSession): Seq[Row] = { if (provider.toLowerCase(Locale.ROOT) == DDLUtils.HIVE_PROVIDER) { throw new AnalysisException("Hive data source can only be used with tables, " + "you can't use it with CREATE TEMP VIEW USING") } val dataSource = DataSource( sparkSession, userSpecifiedSchema = userSpecifiedSchema, className = provider, options = options) val catalog = sparkSession.sessionState.catalog val viewDefinition = Dataset.ofRows( sparkSession, LogicalRelation(dataSource.resolveRelation())).logicalPlan if (global) { catalog.createGlobalTempView(tableIdent.table, viewDefinition, replace) } else { catalog.createTempView(tableIdent.table, viewDefinition, replace) } Seq.empty[Row] } } case class RefreshTable(tableIdent: TableIdentifier) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { // Refresh the given table's metadata. If this table is cached as an InMemoryRelation, // drop the original cached version and make the new version cached lazily. sparkSession.catalog.refreshTable(tableIdent.quotedString) Seq.empty[Row] } } case class RefreshResource(path: String) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { sparkSession.catalog.refreshByPath(path) Seq.empty[Row] } }
Example 167
Source File: Exchange.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      return plan
    }
    // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls.
    val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
    plan.transformUp { case exchange: Exchange =>
      // the exchanges that have same results usually also have same schemas (same column names).
      val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
      val samePlan = sameSchema.find { e =>
        exchange.sameResult(e)
      }
      if (samePlan.isDefined) {
        // Keep the output of this exchange, the following plans require that to resolve
        // attributes.
        ReusedExchangeExec(exchange.output, samePlan.get)
      } else {
        sameSchema += exchange
        exchange
      }
    }
  }
}
Example 168
Source File: resources.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("Results", StringType, nullable = false)() :: Nil
  }

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val jarList = sparkSession.sparkContext.listJars()
    if (jars.nonEmpty) {
      for {
        jarName <- jars.map(f => new Path(f).getName)
        jarPath <- jarList
        if jarPath.contains(jarName)
      } yield Row(jarPath)
    } else {
      jarList.map(Row(_))
    }
  }
}
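This command backs the LIST JARS SQL statement; a brief illustrative session (the jar path is a placeholder, not from the original example):

spark.sql("ADD JAR /tmp/my-udfs.jar")          // register a jar; path is a placeholder
spark.sql("LIST JARS").show(truncate = false)  // one "Results" row per registered jar, produced by ListJarsCommand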
Example 169
Source File: EventTimeWatermarkExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends UnaryExecNode {

  val eventTimeStats = new EventTimeStatsAccum()
  val delayMs = EventTimeWatermark.getDelayMs(delay)

  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delayMs)
        .build()
      a.withMetadata(updatedMetadata)
    } else if (a.metadata.contains(EventTimeWatermark.delayKey)) {
      // Remove existing watermark
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .remove(EventTimeWatermark.delayKey)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }
}
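This physical node is what Dataset.withWatermark plans for a streaming query; a small illustrative sketch using the built-in rate source (the column names are placeholders):

import org.apache.spark.sql.functions.col

val events = spark.readStream.format("rate").load()
  .withColumn("eventTime", col("timestamp"))

// Tolerate events up to 10 minutes late on "eventTime"; this adds an
// EventTimeWatermark logical node, executed as EventTimeWatermarkExec.
val withWm = events.withWatermark("eventTime", "10 minutes")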
Example 170
Source File: CoGroupedIterator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, SortOrder}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering

class CoGroupedIterator(
    left: Iterator[(InternalRow, Iterator[InternalRow])],
    right: Iterator[(InternalRow, Iterator[InternalRow])],
    groupingSchema: Seq[Attribute])
  extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] {

  private val keyOrdering =
    GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema)

  private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _
  private var currentRightData: (InternalRow, Iterator[InternalRow]) = _

  override def hasNext: Boolean = {
    if (currentLeftData == null && left.hasNext) {
      currentLeftData = left.next()
    }
    if (currentRightData == null && right.hasNext) {
      currentRightData = right.next()
    }

    currentLeftData != null || currentRightData != null
  }

  override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = {
    assert(hasNext)

    if (currentLeftData.eq(null)) {
      // left is null, right is not null, consume the right data.
      rightOnly()
    } else if (currentRightData.eq(null)) {
      // left is not null, right is null, consume the left data.
      leftOnly()
    } else if (currentLeftData._1 == currentRightData._1) {
      // left and right have the same grouping key, consume both of them.
      val result = (currentLeftData._1, currentLeftData._2, currentRightData._2)
      currentLeftData = null
      currentRightData = null
      result
    } else {
      val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1)
      assert(compare != 0)
      if (compare < 0) {
        // the grouping key of left is smaller, consume the left data.
        leftOnly()
      } else {
        // the grouping key of right is smaller, consume the right data.
        rightOnly()
      }
    }
  }

  private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = {
    val result = (currentLeftData._1, currentLeftData._2, Iterator.empty)
    currentLeftData = null
    result
  }

  private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = {
    val result = (currentRightData._1, Iterator.empty, currentRightData._2)
    currentRightData = null
    result
  }
}
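A rough sketch of driving the iterator directly, in the spirit of Spark's own test suite; it assumes access to the internal GroupedIterator helper from the same package, and the key, schema and rows are illustrative. Both inputs must already be sorted by the grouping key.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.types.IntegerType

val key   = AttributeReference("key", IntegerType)()
val value = AttributeReference("value", IntegerType)()

val left  = Seq(InternalRow(1, 10), InternalRow(3, 30)).iterator
val right = Seq(InternalRow(1, 100), InternalRow(2, 200)).iterator

// GroupedIterator turns a key-sorted row iterator into (key, rows-in-group) pairs.
val leftGrouped  = GroupedIterator(left, Seq(key), Seq(key, value))
val rightGrouped = GroupedIterator(right, Seq(key), Seq(key, value))

new CoGroupedIterator(leftGrouped, rightGrouped, Seq(key)).foreach {
  case (k, leftRows, rightRows) =>
    // key 1 appears on both sides, key 2 only on the right, key 3 only on the left.
    println(s"$k -> left: ${leftRows.size}, right: ${rightRows.size}")
}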
Example 171
Source File: ReferenceSort.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter case class ReferenceSort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) extends UnaryExecNode { override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => val ordering = newOrdering(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) val context = TaskContext.get() context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes) CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) } override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = child.outputPartitioning }
Example 172
Source File: SparkPlannerSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.Strategy import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union} import org.apache.spark.sql.test.SharedSQLContext class SparkPlannerSuite extends SharedSQLContext { import testImplicits._ test("Ensure to go down only the first branch, not any other possible branches") { case object NeverPlanned extends LeafNode { override def output: Seq[Attribute] = Nil } var planned = 0 object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case ReturnAnswer(child) => planned += 1 planLater(child) :: planLater(NeverPlanned) :: Nil case Union(children) => planned += 1 UnionExec(children.map(planLater)) :: planLater(NeverPlanned) :: Nil case LocalRelation(output, data, _) => planned += 1 LocalTableScanExec(output, data) :: planLater(NeverPlanned) :: Nil case NeverPlanned => fail("QueryPlanner should not go down to this branch.") case _ => Nil } } try { spark.experimental.extraStrategies = TestStrategy :: Nil val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS()) assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f")) assert(planned === 4) } finally { spark.experimental.extraStrategies = Nil } } }
Example 173
Source File: StarryLocalTableScanExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.{RDD, StarryRDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class StarryLocalTableScanExec( tableName: String, output: Seq[Attribute], @transient rows: Seq[InternalRow]) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) @transient private lazy val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val rdd = new StarryRDD(sparkContext, tableName, unsafeRows) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.length) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.length) taken } }
Example 174
Source File: StarryTakeOrderedAndProjectExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression, SortOrder, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition} import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.util.Utils case class StarryTakeOrderedAndProjectExec( limit: Int, sortOrder: Seq[SortOrder], projectList: Seq[NamedExpression], child: SparkPlan) extends UnaryExecNode { override def output: Seq[Attribute] = { projectList.map(_.toAttribute) } override def executeCollect(): Array[InternalRow] = { val ord = new LazilyGeneratedOrdering(sortOrder, child.output) val data = child.execute().map(_.copy()).takeOrdered(limit)(ord) if (projectList != child.output) { val proj = UnsafeProjection.create(projectList, child.output) data.map(r => proj(r).copy()) } else { data } } protected override def doExecute(): RDD[InternalRow] = { val ord = new LazilyGeneratedOrdering(sortOrder, child.output) val localTopK: RDD[InternalRow] = { child.execute().map(_.copy()).mapPartitions { iter => org.apache.spark.util.collection.Utils.takeOrdered(iter, limit)(ord) } } localTopK.mapPartitions { iter => val topK = org.apache.spark.util.collection.Utils.takeOrdered(iter.map(_.copy()), limit)(ord) if (projectList != child.output) { val proj = UnsafeProjection.create(projectList, child.output) topK.map(r => proj(r)) } else { topK } } } override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = SinglePartition override def simpleString: String = { val orderByString = Utils.truncatedString(sortOrder, "[", ",", "]") val outputString = Utils.truncatedString(output, "[", ",", "]") s"TakeOrderedAndProject(limit=$limit, orderBy=$orderByString, output=$outputString)" } }
Example 175
Source File: StarryUnionExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import com.github.passionke.starry.SparkPlanExecutor
import org.apache.spark.rdd.{RDD, StarryRDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute

case class StarryUnionExec(children: Seq[SparkPlan]) extends SparkPlan {

  override def output: Seq[Attribute] =
    children.map(_.output).transpose.map(attrs =>
      attrs.head.withNullability(attrs.exists(_.nullable)))

  protected override def doExecute(): RDD[InternalRow] = {
    val b = children.flatMap(child => {
      SparkPlanExecutor.doExec(child)
    })
    new StarryRDD(sparkContext, b)
  }
}
Example 176
Source File: StarryLocalRelation.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.logical import org.apache.spark.sql.catalyst.{InternalRow, analysis} import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, Statistics} override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data, isStreaming).asInstanceOf[this.type] } override protected def stringArgs: Iterator[Any] = { if (data.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def computeStats(): Statistics = Statistics(sizeInBytes = output.map(n => BigInt(n.dataType.defaultSize)).sum * data.length) def toSQL(inlineTableName: String): String = { require(data.nonEmpty) val types = output.map(_.dataType) val rows = data.map { row => val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql } cells.mkString("(", ", ", ")") } "VALUES " + rows.mkString(", ") + " AS " + inlineTableName + output.map(_.name).mkString("(", ", ", ")") } }
Example 177
Source File: KinesisWriteTask.scala From kinesis-sql with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kinesis import java.nio.ByteBuffer import com.amazonaws.services.kinesis.producer.{KinesisProducer, UserRecordResult} import com.google.common.util.concurrent.{FutureCallback, Futures} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, UnsafeProjection} import org.apache.spark.sql.types.{BinaryType, StringType} private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, String], inputSchema: Seq[Attribute]) extends Logging { private var producer: KinesisProducer = _ private val projection = createProjection private val streamName = producerConfiguration.getOrElse( KinesisSourceProvider.SINK_STREAM_NAME_KEY, "") def execute(iterator: Iterator[InternalRow]): Unit = { producer = CachedKinesisProducer.getOrCreate(producerConfiguration) while (iterator.hasNext) { val currentRow = iterator.next() val projectedRow = projection(currentRow) val partitionKey = projectedRow.getString(0) val data = projectedRow.getBinary(1) sendData(partitionKey, data) } } def sendData(partitionKey: String, data: Array[Byte]): String = { var sentSeqNumbers = new String val future = producer.addUserRecord(streamName, partitionKey, ByteBuffer.wrap(data)) val kinesisCallBack = new FutureCallback[UserRecordResult]() { override def onFailure(t: Throwable): Unit = { logError(s"Writing to $streamName failed due to ${t.getCause}") } override def onSuccess(result: UserRecordResult): Unit = { val shardId = result.getShardId sentSeqNumbers = result.getSequenceNumber } } Futures.addCallback(future, kinesisCallBack) producer.flushSync() sentSeqNumbers } def close(): Unit = { if (producer != null) { producer.flush() producer = null } } private def createProjection: UnsafeProjection = { val partitionKeyExpression = inputSchema .find(_.name == KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME).getOrElse( throw new IllegalStateException("Required attribute " + s"'${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME}' not found")) partitionKeyExpression.dataType match { case StringType | BinaryType => // ok case t => throw new IllegalStateException(s"${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME} " + "attribute type must be a String or BinaryType") } val dataExpression = inputSchema.find(_.name == KinesisWriter.DATA_ATTRIBUTE_NAME).getOrElse( throw new IllegalStateException("Required attribute " + s"'${KinesisWriter.DATA_ATTRIBUTE_NAME}' not found") ) dataExpression.dataType match { case StringType | BinaryType => // ok case t => throw new IllegalStateException(s"${KinesisWriter.DATA_ATTRIBUTE_NAME} " + "attribute type must be a String or BinaryType") } UnsafeProjection.create( Seq(Cast(partitionKeyExpression, StringType), Cast(dataExpression, StringType)), inputSchema) } }
Example 178
Source File: ShuffleHashJoin.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution import edu.ucla.cs.wis.bigdatalog.spark.BigDatalogContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning, PartitioningCollection} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffleHashJoin(leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { @transient final protected val bigDatalogContext = SQLContext.getActive().getOrElse(null).asInstanceOf[BigDatalogContext] val cacheBuildSide = bigDatalogContext.getConf.getBoolean("spark.datalog.shufflehashjoin.cachebuildside", true) override lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) var cachedBuildPlan: RDD[HashedRelation] = null override def output: Seq[Attribute] = left.output ++ right.output override def outputPartitioning: Partitioning = PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning)) override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false protected override def doExecute(): RDD[InternalRow] = { val numStreamedRows = buildSide match { case BuildLeft => longMetric("numRightRows") case BuildRight => longMetric("numLeftRows") } val numOutputRows = longMetric("numOutputRows") if (cacheBuildSide) { if (cachedBuildPlan == null) { cachedBuildPlan = buildPlan.execute() .mapPartitionsInternal(iter => Iterator(HashedRelation(iter, SQLMetrics.nullLongMetric, buildSideKeyGenerator))) .persist() } cachedBuildPlan.zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) => hashJoin(streamedIter, numStreamedRows, buildIter.next(), numOutputRows)} } else { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) => val hashedRelation = HashedRelation(buildIter, SQLMetrics.nullLongMetric, buildSideKeyGenerator) hashJoin(streamedIter, numStreamedRows, hashedRelation, numOutputRows) } } } }
Example 179
Source File: operators.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression} import org.apache.spark.sql.catalyst.plans.logical.{BinaryNode, LeafNode, LogicalPlan, Statistics, UnaryNode} case class Recursion(name: String, isLinear: Boolean, left: LogicalPlan, right: LogicalPlan, partitioning: Seq[Int]) extends BinaryNode { // left is exitRules plan // right is recursive rules plan override def output: Seq[Attribute] = right.output } case class MutualRecursion(name: String, isLinear: Boolean, left: LogicalPlan, right: LogicalPlan, partitioning: Seq[Int]) extends BinaryNode { override def output: Seq[Attribute] = right.output override def children: Seq[LogicalPlan] = { if (left == null) Seq(right) else Seq(left, right) } override def generateTreeString(depth: Int, lastChildren: Seq[Boolean], builder: StringBuilder): StringBuilder = { if (depth > 0) { lastChildren.init.foreach { isLast => val prefixFragment = if (isLast) " " else ": " builder.append(prefixFragment) } val branch = if (lastChildren.last) "+- " else ":- " builder.append(branch) } builder.append(simpleString) builder.append("\n") if (children.nonEmpty) { val exitRule = children.init if (exitRule != null) exitRule.foreach(_.generateTreeString(depth + 1, lastChildren :+ false, builder)) children.last.generateTreeString(depth + 1, lastChildren :+ true, builder) } builder } } case class LinearRecursiveRelation(_name: String, output: Seq[Attribute], partitioning: Seq[Int]) extends LeafNode { override def statistics: Statistics = Statistics(Long.MaxValue) var name = _name } case class NonLinearRecursiveRelation(_name: String, output: Seq[Attribute], partitioning: Seq[Int]) extends LeafNode { override def statistics: Statistics = Statistics(Long.MaxValue) def name = "all_" + _name } case class MonotonicAggregate(groupingExpressions: Seq[Expression], aggregateExpressions: Seq[NamedExpression], child: LogicalPlan, partitioning: Seq[Int]) extends UnaryNode { override lazy val resolved: Boolean = !expressions.exists(!_.resolved) && childrenResolved override def output: Seq[Attribute] = aggregateExpressions.map(_.toAttribute) } case class AggregateRecursion(name: String, isLinear: Boolean, left: LogicalPlan, right: LogicalPlan, partitioning: Seq[Int]) extends BinaryNode { // left is exitRules plan // right is recursive rules plan override def output: Seq[Attribute] = right.output } case class AggregateRelation(_name: String, output: Seq[Attribute], partitioning: Seq[Int]) extends LeafNode { override def statistics: Statistics = Statistics(Long.MaxValue) var name = _name } case class CacheHint(child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output }
Example 180
Source File: DescribeHiveTableCommand.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.collection.JavaConverters._ import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.hive.MetastoreRelation import org.apache.spark.sql.{Row, SQLContext} private[hive] case class DescribeHiveTableCommand( table: MetastoreRelation, override val output: Seq[Attribute], isExtended: Boolean) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { // Trying to mimic the format of Hive's output. But not exactly the same. var results: Seq[(String, String, String)] = Nil val columns: Seq[FieldSchema] = table.hiveQlTable.getCols.asScala val partitionColumns: Seq[FieldSchema] = table.hiveQlTable.getPartCols.asScala results ++= columns.map(field => (field.getName, field.getType, field.getComment)) if (partitionColumns.nonEmpty) { val partColumnInfo = partitionColumns.map(field => (field.getName, field.getType, field.getComment)) results ++= partColumnInfo ++ Seq(("# Partition Information", "", "")) ++ Seq((s"# ${output(0).name}", output(1).name, output(2).name)) ++ partColumnInfo } if (isExtended) { results ++= Seq(("Detailed Table Information", table.hiveQlTable.getTTable.toString, "")) } results.map { case (name, dataType, comment) => Row(name, dataType, comment) } } }
Example 181
Source File: CreateViewAsSelect.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.hive.{HiveMetastoreTypes, HiveContext} import org.apache.spark.sql.{AnalysisException, Row, SQLContext} import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.hive.client.{HiveColumn, HiveTable} // TODO: Note that this class can NOT canonicalize the view SQL string entirely, which is different // from Hive and may not work for some cases like create view on self join. private[hive] case class CreateViewAsSelect( tableDesc: HiveTable, childSchema: Seq[Attribute], allowExisting: Boolean, orReplace: Boolean) extends RunnableCommand { assert(tableDesc.schema == Nil || tableDesc.schema.length == childSchema.length) assert(tableDesc.viewText.isDefined) val tableIdentifier = TableIdentifier(tableDesc.name, Some(tableDesc.database)) override def run(sqlContext: SQLContext): Seq[Row] = { val hiveContext = sqlContext.asInstanceOf[HiveContext] if (hiveContext.catalog.tableExists(tableIdentifier)) { if (allowExisting) { // view already exists, will do nothing, to keep consistent with Hive } else if (orReplace) { hiveContext.catalog.client.alertView(prepareTable()) } else { throw new AnalysisException(s"View $tableIdentifier already exists. " + "If you want to update the view definition, please use ALTER VIEW AS or " + "CREATE OR REPLACE VIEW AS") } } else { hiveContext.catalog.client.createView(prepareTable()) } Seq.empty[Row] } private def prepareTable(): HiveTable = { // setup column types according to the schema of child. val schema = if (tableDesc.schema == Nil) { childSchema.map { attr => HiveColumn(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), null) } } else { childSchema.zip(tableDesc.schema).map { case (attr, col) => HiveColumn(col.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), col.comment) } } val columnNames = childSchema.map(f => verbose(f.name)) // When user specified column names for view, we should create a project to do the renaming. // When no column name specified, we still need to create a project to declare the columns // we need, to make us more robust to top level `*`s. val projectList = if (tableDesc.schema == Nil) { columnNames.mkString(", ") } else { columnNames.zip(tableDesc.schema.map(f => verbose(f.name))).map { case (name, alias) => s"$name AS $alias" }.mkString(", ") } val viewName = verbose(tableDesc.name) val expandedText = s"SELECT $projectList FROM (${tableDesc.viewText.get}) $viewName" tableDesc.copy(schema = schema, viewText = Some(expandedText)) } // escape backtick with double-backtick in column name and wrap it with backtick. private def verbose(name: String) = s"`${name.replaceAll("`", "``")}`" }
Example 182
Source File: LocalRelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, analysis}
import org.apache.spark.sql.types.{StructField, StructType}

object LocalRelation {
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  def apply(output1: StructField, output: StructField*): LocalRelation = {
    new LocalRelation(StructType(output1 +: output).toAttributes)
  }

  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }

  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }
}

case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil)
  extends LeafNode with analysis.MultiInstanceRelation {

  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type]
  }

  override protected def stringArgs = Iterator(output)

  override def sameResult(plan: LogicalPlan): Boolean = plan match {
    case LocalRelation(otherOutput, otherData) =>
      otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data
    case _ => false
  }

  override lazy val statistics =
    Statistics(sizeInBytes = output.map(_.dataType.defaultSize).sum * data.length)
}
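A short illustrative sketch (not from the original project) of building a LocalRelation straight from Scala values; the column names and rows are placeholders.

import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{IntegerType, StringType}

val id   = AttributeReference("id", IntegerType, nullable = false)()
val name = AttributeReference("name", StringType)()

// fromProduct converts each tuple to an InternalRow using the attribute schema.
val relation = LocalRelation.fromProduct(Seq(id, name), Seq((1, "a"), (2, "b")))
println(relation.statistics.sizeInBytes)  // rows * sum of the columns' default sizes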
Example 183
Source File: SeqScanNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local

import org.apache.spark.sql.SQLConf
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute

case class SeqScanNode(conf: SQLConf, output: Seq[Attribute], data: Seq[InternalRow])
  extends LeafLocalNode(conf) {

  private[this] var iterator: Iterator[InternalRow] = _
  private[this] var currentRow: InternalRow = _

  override def open(): Unit = {
    iterator = data.iterator
  }

  override def next(): Boolean = {
    if (iterator.hasNext) {
      currentRow = iterator.next()
      true
    } else {
      false
    }
  }

  override def fetch(): InternalRow = currentRow

  override def close(): Unit = {
    // Do nothing
  }
}
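SeqScanNode and the LocalNode examples that follow (FilterNode, ExpandNode, IntersectNode, SampleNode, UnionNode) all share the same pull-based open/next/fetch/close protocol; a minimal illustrative driver, assuming any already-constructed LocalNode, looks like this:

import scala.collection.mutable.ArrayBuffer
import org.apache.spark.sql.catalyst.InternalRow

// `node` can be any LocalNode, e.g. a SeqScanNode over in-memory rows.
def drain(node: LocalNode): Seq[InternalRow] = {
  val rows = new ArrayBuffer[InternalRow]
  node.open()
  try {
    while (node.next()) {
      rows += node.fetch().copy()  // copy(): fetch() may reuse a mutable row
    }
  } finally {
    node.close()
  }
  rows
}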
Example 184
Source File: FilterNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local

import org.apache.spark.sql.SQLConf
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate

case class FilterNode(conf: SQLConf, condition: Expression, child: LocalNode)
  extends UnaryLocalNode(conf) {

  private[this] var predicate: (InternalRow) => Boolean = _

  override def output: Seq[Attribute] = child.output

  override def open(): Unit = {
    child.open()
    predicate = GeneratePredicate.generate(condition, child.output)
  }

  override def next(): Boolean = {
    var found = false
    while (!found && child.next()) {
      found = predicate.apply(child.fetch())
    }
    found
  }

  override def fetch(): InternalRow = child.fetch()

  override def close(): Unit = child.close()
}
Example 185
Source File: ExpandNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local

import org.apache.spark.sql.SQLConf
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Projection}

case class ExpandNode(
    conf: SQLConf,
    projections: Seq[Seq[Expression]],
    output: Seq[Attribute],
    child: LocalNode) extends UnaryLocalNode(conf) {

  assert(projections.size > 0)

  private[this] var result: InternalRow = _
  private[this] var idx: Int = _
  private[this] var input: InternalRow = _
  private[this] var groups: Array[Projection] = _

  override def open(): Unit = {
    child.open()
    groups = projections.map(ee => newProjection(ee, child.output)).toArray
    idx = groups.length
  }

  override def next(): Boolean = {
    if (idx >= groups.length) {
      if (child.next()) {
        input = child.fetch()
        idx = 0
      } else {
        return false
      }
    }
    result = groups(idx)(input)
    idx += 1
    true
  }

  override def fetch(): InternalRow = result

  override def close(): Unit = child.close()
}
Example 186
Source File: IntersectNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local

import scala.collection.mutable

import org.apache.spark.sql.SQLConf
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute

case class IntersectNode(conf: SQLConf, left: LocalNode, right: LocalNode)
  extends BinaryLocalNode(conf) {

  override def output: Seq[Attribute] = left.output

  private[this] var leftRows: mutable.HashSet[InternalRow] = _

  private[this] var currentRow: InternalRow = _

  override def open(): Unit = {
    left.open()
    leftRows = mutable.HashSet[InternalRow]()
    while (left.next()) {
      leftRows += left.fetch().copy()
    }
    left.close()
    right.open()
  }

  override def next(): Boolean = {
    currentRow = null
    while (currentRow == null && right.next()) {
      currentRow = right.fetch()
      if (!leftRows.contains(currentRow)) {
        currentRow = null
      }
    }
    currentRow != null
  }

  override def fetch(): InternalRow = currentRow

  override def close(): Unit = {
    left.close()
    right.close()
  }
}
Example 187
Source File: SampleNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.util.random.{BernoulliCellSampler, PoissonSampler} case class SampleNode( conf: SQLConf, lowerBound: Double, upperBound: Double, withReplacement: Boolean, seed: Long, child: LocalNode) extends UnaryLocalNode(conf) { override def output: Seq[Attribute] = child.output private[this] var iterator: Iterator[InternalRow] = _ private[this] var currentRow: InternalRow = _ override def open(): Unit = { child.open() val sampler = if (withReplacement) { // Disable gap sampling since the gap sampling method buffers two rows internally, // requiring us to copy the row, which is more expensive than the random number generator. new PoissonSampler[InternalRow](upperBound - lowerBound, useGapSamplingIfPossible = false) } else { new BernoulliCellSampler[InternalRow](lowerBound, upperBound) } sampler.setSeed(seed) iterator = sampler.sample(child.asIterator) } override def next(): Boolean = { if (iterator.hasNext) { currentRow = iterator.next() true } else { false } } override def fetch(): InternalRow = currentRow override def close(): Unit = child.close() }
Example 188
Source File: UnionNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute case class UnionNode(conf: SQLConf, children: Seq[LocalNode]) extends LocalNode(conf) { override def output: Seq[Attribute] = children.head.output private[this] var currentChild: LocalNode = _ private[this] var nextChildIndex: Int = _ override def open(): Unit = { currentChild = children.head currentChild.open() nextChildIndex = 1 } private def advanceToNextChild(): Boolean = { var found = false var exit = false while (!exit && !found) { if (currentChild != null) { currentChild.close() } if (nextChildIndex >= children.size) { found = false exit = true } else { currentChild = children(nextChildIndex) nextChildIndex += 1 currentChild.open() found = currentChild.next() } } found } override def close(): Unit = { if (currentChild != null) { currentChild.close() } } override def fetch(): InternalRow = currentChild.fetch() override def next(): Boolean = { if (currentChild.next()) { true } else { advanceToNextChild() } } }
Example 189
Source File: package.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.collection.mutable.HashSet import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.{Accumulator, AccumulatorParam, Logging} case class ColumnMetrics( elementTypes: Accumulator[HashSet[String]] = sparkContext.accumulator(HashSet.empty)) val tupleCount: Accumulator[Int] = sparkContext.accumulator[Int](0) val numColumns: Int = child.output.size val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics()) def dumpStats(): Unit = { logDebug(s"== ${child.simpleString} ==") logDebug(s"Tuples output: ${tupleCount.value}") child.output.zip(columnStats).foreach { case(attr, metric) => val actualDataTypes = metric.elementTypes.value.mkString("{", ",", "}") logDebug(s" ${attr.name} ${attr.dataType}: $actualDataTypes") } } protected override def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => new Iterator[InternalRow] { def hasNext: Boolean = iter.hasNext def next(): InternalRow = { val currentRow = iter.next() tupleCount += 1 var i = 0 while (i < numColumns) { val value = currentRow.get(i, output(i).dataType) if (value != null) { columnStats(i).elementTypes += HashSet(value.getClass.getName) } i += 1 } currentRow } } } } } }
Example 190
Source File: CartesianProduct.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().map { row => numLeftRows += 1 row.copy() } val rightResults = right.execute().map { row => numRightRows += 1 row.copy() } leftResults.cartesian(rightResults).mapPartitionsInternal { iter => val joinedRow = new JoinedRow iter.map { r => numOutputRows += 1 joinedRow(r._1, r._2) } } } }
Example 191
Source File: SparkSQLParser.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.util.parsing.combinator.RegexParsers import org.apache.spark.sql.catalyst.AbstractSparkSQLParser import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.types.StringType class SparkSQLParser(fallback: String => LogicalPlan) extends AbstractSparkSQLParser { // A parser for the key-value part of the "SET [key = [value ]]" syntax private object SetCommandParser extends RegexParsers { private val key: Parser[String] = "(?m)[^=]+".r private val value: Parser[String] = "(?m).*$".r private val output: Seq[Attribute] = Seq(AttributeReference("", StringType, nullable = false)()) private val pair: Parser[LogicalPlan] = (key ~ ("=".r ~> value).?).? ^^ { case None => SetCommand(None) case Some(k ~ v) => SetCommand(Some(k.trim -> v.map(_.trim))) } def apply(input: String): LogicalPlan = parseAll(pair, input) match { case Success(plan, _) => plan case x => sys.error(x.toString) } } protected val AS = Keyword("AS") protected val CACHE = Keyword("CACHE") protected val CLEAR = Keyword("CLEAR") protected val DESCRIBE = Keyword("DESCRIBE") protected val EXTENDED = Keyword("EXTENDED") protected val FUNCTION = Keyword("FUNCTION") protected val FUNCTIONS = Keyword("FUNCTIONS") protected val IN = Keyword("IN") protected val LAZY = Keyword("LAZY") protected val SET = Keyword("SET") protected val SHOW = Keyword("SHOW") protected val TABLE = Keyword("TABLE") protected val TABLES = Keyword("TABLES") protected val UNCACHE = Keyword("UNCACHE") override protected lazy val start: Parser[LogicalPlan] = cache | uncache | set | show | desc | others private lazy val cache: Parser[LogicalPlan] = CACHE ~> LAZY.? ~ (TABLE ~> ident) ~ (AS ~> restInput).? ^^ { case isLazy ~ tableName ~ plan => CacheTableCommand(tableName, plan.map(fallback), isLazy.isDefined) } private lazy val uncache: Parser[LogicalPlan] = ( UNCACHE ~ TABLE ~> ident ^^ { case tableName => UncacheTableCommand(tableName) } | CLEAR ~ CACHE ^^^ ClearCacheCommand ) private lazy val set: Parser[LogicalPlan] = SET ~> restInput ^^ { case input => SetCommandParser(input) } // It can be the following patterns: // SHOW FUNCTIONS; // SHOW FUNCTIONS mydb.func1; // SHOW FUNCTIONS func1; // SHOW FUNCTIONS `mydb.a`.`func1.aa`; private lazy val show: Parser[LogicalPlan] = ( SHOW ~> TABLES ~ (IN ~> ident).? ^^ { case _ ~ dbName => ShowTablesCommand(dbName) } | SHOW ~ FUNCTIONS ~> ((ident <~ ".").? ~ (ident | stringLit)).? ^^ { case Some(f) => logical.ShowFunctions(f._1, Some(f._2)) case None => logical.ShowFunctions(None, None) } ) private lazy val desc: Parser[LogicalPlan] = DESCRIBE ~ FUNCTION ~> EXTENDED.? ~ (ident | stringLit) ^^ { case isExtended ~ functionName => logical.DescribeFunction(functionName, isExtended.isDefined) } private lazy val others: Parser[LogicalPlan] = wholeInput ^^ { case input => fallback(input) } }
Example 192
Source File: ExistingRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters}
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.sources.{HadoopFsRelation, BaseRelation}
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.{Row, SQLContext}

object RDDConversions {
  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = {
    data.mapPartitions { iterator =>
      val numColumns = outputTypes.length
      val mutableRow = new GenericMutableRow(numColumns)
      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
      iterator.map { r =>
        var i = 0
        while (i < numColumns) {
          mutableRow(i) = converters(i)(r.productElement(i))
          i += 1
        }

        mutableRow
      }
    }
  }
}

private[sql] case class PhysicalRDD(
    output: Seq[Attribute],
    rdd: RDD[InternalRow],
    override val nodeName: String,
    override val metadata: Map[String, String] = Map.empty,
    override val outputsUnsafeRows: Boolean = false)
  extends LeafNode {

  protected override def doExecute(): RDD[InternalRow] = rdd

  override def simpleString: String = {
    val metadataEntries = for ((key, value) <- metadata.toSeq.sorted) yield s"$key: $value"
    s"Scan $nodeName${output.mkString("[", ",", "]")}${metadataEntries.mkString(" ", ", ", "")}"
  }
}

private[sql] object PhysicalRDD {
  // Metadata keys
  val INPUT_PATHS = "InputPaths"
  val PUSHED_FILTERS = "PushedFilters"

  def createFromDataSource(
      output: Seq[Attribute],
      rdd: RDD[InternalRow],
      relation: BaseRelation,
      metadata: Map[String, String] = Map.empty): PhysicalRDD = {
    // All HadoopFsRelations output UnsafeRows
    val outputUnsafeRows = relation.isInstanceOf[HadoopFsRelation]
    PhysicalRDD(output, rdd, relation.toString, metadata, outputUnsafeRows)
  }
}
Example 193
Source File: CoGroupedIterator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, SortOrder, Attribute} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering class CoGroupedIterator( left: Iterator[(InternalRow, Iterator[InternalRow])], right: Iterator[(InternalRow, Iterator[InternalRow])], groupingSchema: Seq[Attribute]) extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] { private val keyOrdering = GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema) private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _ private var currentRightData: (InternalRow, Iterator[InternalRow]) = _ override def hasNext: Boolean = { if (currentLeftData == null && left.hasNext) { currentLeftData = left.next() } if (currentRightData == null && right.hasNext) { currentRightData = right.next() } currentLeftData != null || currentRightData != null } override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { assert(hasNext) if (currentLeftData.eq(null)) { // left is null, right is not null, consume the right data. rightOnly() } else if (currentRightData.eq(null)) { // left is not null, right is null, consume the left data. leftOnly() } else if (currentLeftData._1 == currentRightData._1) { // left and right have the same grouping key, consume both of them. val result = (currentLeftData._1, currentLeftData._2, currentRightData._2) currentLeftData = null currentRightData = null result } else { val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1) assert(compare != 0) if (compare < 0) { // the grouping key of left is smaller, consume the left data. leftOnly() } else { // the grouping key of right is smaller, consume the right data. rightOnly() } } } private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentLeftData._1, currentLeftData._2, Iterator.empty) currentLeftData = null result } private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentRightData._1, Iterator.empty, currentRightData._2) currentRightData = null result } }
Example 194
Source File: DummyNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LocalRelation private[local] case class DummyNode( output: Seq[Attribute], relation: LocalRelation, conf: SQLConf) extends LocalNode(conf) { import DummyNode._ private var index: Int = CLOSED private val input: Seq[InternalRow] = relation.data def this(output: Seq[Attribute], data: Seq[Product], conf: SQLConf = new SQLConf) { this(output, LocalRelation.fromProduct(output, data), conf) } def isOpen: Boolean = index != CLOSED override def children: Seq[LocalNode] = Seq.empty override def open(): Unit = { index = -1 } override def next(): Boolean = { index += 1 index < input.size } override def fetch(): InternalRow = { assert(index >= 0 && index < input.size) input(index) } override def close(): Unit = { index = CLOSED } } private object DummyNode { val CLOSED: Int = Int.MinValue }
Example 195
Source File: ReferenceSort.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.{InternalAccumulator, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter case class ReferenceSort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) extends UnaryNode { override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => val ordering = newOrdering(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) val context = TaskContext.get() context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.internalMetricsToAccumulators( InternalAccumulator.PEAK_EXECUTION_MEMORY).add(sorter.peakMemoryUsedBytes) CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) } override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder }
Example 196
Source File: ExtraStrategiesSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package test.org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute} import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.{Row, Strategy, QueryTest} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.unsafe.types.UTF8String case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) sparkContext.parallelize(Seq(row)) } override def children: Seq[SparkPlan] = Nil } object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => FastOperator(attr.toAttribute :: Nil) :: Nil case _ => Nil } } class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { try { sqlContext.experimental.extraStrategies = TestStrategy :: Nil val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { sqlContext.experimental.extraStrategies = Nil } } }
Example 197
Source File: GenomicIntervalStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.utvf

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{DataFrame, GenomicInterval, SparkSession, Strategy}
import org.apache.spark.unsafe.types.UTF8String

case class GIntervalRow(contigName: String, start: Int, end: Int)

class GenomicIntervalStrategy(spark: SparkSession) extends Strategy with Serializable {

  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case GenomicInterval(contigName, start, end, output) =>
      GenomicIntervalPlan(plan, spark, GIntervalRow(contigName, start, end), output) :: Nil
    case _ => Nil
  }
}

case class GenomicIntervalPlan(
    plan: LogicalPlan,
    spark: SparkSession,
    interval: GIntervalRow,
    output: Seq[Attribute]) extends SparkPlan with Serializable {

  def doExecute(): org.apache.spark.rdd.RDD[InternalRow] = {
    import spark.implicits._

    lazy val genomicInterval = spark.createDataset(Seq(interval))
    genomicInterval
      .rdd
      .map(r => {
        val proj = UnsafeProjection.create(schema)
        proj.apply(InternalRow.fromSeq(Seq(UTF8String.fromString(r.contigName), r.start, r.end)))
      })
  }

  def children: Seq[SparkPlan] = Nil
}
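A custom Strategy like the one above only takes effect once it is registered with the session's experimental methods, so the planner can turn GenomicInterval logical nodes into GenomicIntervalPlan physical nodes. A minimal sketch, assuming an existing SparkSession named spark:

// Prepend the custom strategy to the session's extra planning strategies.
spark.experimental.extraStrategies =
  new GenomicIntervalStrategy(spark) +: spark.experimental.extraStrategies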
Example 198
Source File: GenomicInterval.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Range, Statistics}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.biodatageeks.sequila.utils.Columns

case class GenomicInterval(
    contig: String,
    start: Int,
    end: Int,
    output: Seq[Attribute])
  extends LeafNode with MultiInstanceRelation with Serializable {

  override def newInstance(): GenomicInterval = copy(output = output.map(_.newInstance()))

  def computeStats(conf: SQLConf): Statistics = {
    val sizeInBytes = IntegerType.defaultSize * 2 // FIXME: Add contigName size
    Statistics(sizeInBytes = sizeInBytes)
  }

  override def simpleString: String = {
    s"GenomicInterval ($contig, $start, $end)"
  }
}

object GenomicInterval {
  def apply(contig: String, start: Int, end: Int): GenomicInterval = {
    val output = StructType(Seq(
      StructField(s"${Columns.CONTIG}", StringType, nullable = false),
      StructField(s"${Columns.START}", IntegerType, nullable = false),
      StructField(s"${Columns.END}", IntegerType, nullable = false))
    ).toAttributes
    new GenomicInterval(contig, start, end, output)
  }
}
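The companion apply builds the output attributes from the Columns constants, so callers only supply the coordinates. A small sketch with hypothetical coordinates:

// Logical node carrying a single interval; output attributes are derived in the companion apply.
val interval = GenomicInterval("chr1", 1000, 2000)
// interval.output now holds the contig/start/end attributes defined above.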
Example 199
Source File: Pileup.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.pileup

import htsjdk.samtools.SAMRecord
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.biodatageeks.sequila.datasources.BAM.BDGAlignFileReaderWriter
import org.biodatageeks.sequila.datasources.InputDataType
import org.biodatageeks.sequila.inputformats.BDGAlignInputFormat
import org.biodatageeks.sequila.utils.{InternalParams, TableFuncs}
import org.seqdoop.hadoop_bam.CRAMBDGInputFormat
import org.slf4j.LoggerFactory

import scala.reflect.ClassTag

class Pileup[T <: BDGAlignInputFormat](spark: SparkSession)(implicit c: ClassTag[T])
  extends BDGAlignFileReaderWriter[T] {

  val logger = LoggerFactory.getLogger(this.getClass.getCanonicalName)

  def handlePileup(tableName: String, sampleId: String, refPath: String, output: Seq[Attribute]): RDD[InternalRow] = {
    logger.info("Calculating pileup on table: {}", tableName)

    lazy val allAlignments = readTableFile(name = tableName, sampleId)

    if (logger.isDebugEnabled()) logger.debug("Processing {} reads in total", allAlignments.count())

    val alignments = filterAlignments(allAlignments)

    PileupMethods.calculatePileup(alignments, spark, refPath)
  }

  private def filterAlignments(alignments: RDD[SAMRecord]): RDD[SAMRecord] = {
    // any other filtering conditions should go here
    val filterFlag = spark.conf.get(InternalParams.filterReadsByFlag, "1796").toInt
    val cleaned = alignments.filter(read => read.getContig != null && (read.getFlags & filterFlag) == 0)
    if (logger.isDebugEnabled()) logger.debug("Processing {} cleaned reads in total", cleaned.count())
    cleaned
  }

  private def readTableFile(name: String, sampleId: String): RDD[SAMRecord] = {
    val metadata = TableFuncs.getTableMetadata(spark, name)
    val path = metadata.location.toString

    val samplePathTemplate = (
      path
        .split('/')
        .dropRight(1) ++ Array(s"$sampleId*.{{fileExtension}}"))
      .mkString("/")

    metadata.provider match {
      case Some(f) =>
        if (f == InputDataType.BAMInputDataType)
          readBAMFile(spark.sqlContext, samplePathTemplate.replace("{{fileExtension}}", "bam"), refPath = None)
        else if (f == InputDataType.CRAMInputDataType) {
          val refPath = spark.sqlContext
            .sparkContext
            .hadoopConfiguration
            .get(CRAMBDGInputFormat.REFERENCE_SOURCE_PATH_PROPERTY)
          readBAMFile(spark.sqlContext, samplePathTemplate.replace("{{fileExtension}}", "cram"), Some(refPath))
        }
        else throw new Exception("Only BAM and CRAM file formats are supported in bdg_coverage.")
      case None =>
        throw new Exception("Wrong file extension - only BAM and CRAM file formats are supported in bdg_coverage.")
    }
  }
}
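Pileup is normally driven from a planner strategy (see the next example), but it can also be invoked directly. A minimal sketch with hypothetical table name, sample id, reference path, and an assumed Seq[Attribute] named output:

import org.seqdoop.hadoop_bam.BAMBDGInputFormat

// Assumed inputs: a SparkSession `spark`, a registered BAM-backed table "reads",
// sample "NA12878", a reference FASTA path, and pre-built output attributes.
val pileupRows = new Pileup[BAMBDGInputFormat](spark)
  .handlePileup("reads", "NA12878", "/data/ref.fasta", output)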
Example 200
Source File: PileupStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.pileup

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{PileupTemplate, SparkSession, Strategy}
import org.biodatageeks.sequila.datasources.BAM.BDGAlignFileReaderWriter
import org.biodatageeks.sequila.datasources.InputDataType
import org.biodatageeks.sequila.inputformats.BDGAlignInputFormat
import org.biodatageeks.sequila.utils.TableFuncs
import org.seqdoop.hadoop_bam.{BAMBDGInputFormat, CRAMBDGInputFormat}

import scala.reflect.ClassTag

class PileupStrategy(spark: SparkSession) extends Strategy with Serializable {

  override def apply(plan: LogicalPlan): Seq[SparkPlan] = {
    plan match {
      case PileupTemplate(tableName, sampleId, refPath, output) =>
        val inputFormat = TableFuncs.getTableMetadata(spark, tableName).provider
        inputFormat match {
          case Some(f) =>
            if (f == InputDataType.BAMInputDataType)
              PileupPlan[BAMBDGInputFormat](plan, spark, tableName, sampleId, refPath, output) :: Nil
            else if (f == InputDataType.CRAMInputDataType)
              PileupPlan[CRAMBDGInputFormat](plan, spark, tableName, sampleId, refPath, output) :: Nil
            else Nil
          case None =>
            throw new RuntimeException("Only BAM and CRAM file formats are supported in pileup function.")
        }
      case _ => Nil
    }
  }
}

case class PileupPlan[T <: BDGAlignInputFormat](
    plan: LogicalPlan,
    spark: SparkSession,
    tableName: String,
    sampleId: String,
    refPath: String,
    output: Seq[Attribute])(implicit c: ClassTag[T])
  extends SparkPlan with Serializable with BDGAlignFileReaderWriter[T] {

  override def children: Seq[SparkPlan] = Nil

  override protected def doExecute(): RDD[InternalRow] = {
    new Pileup(spark).handlePileup(tableName, sampleId, refPath, output)
  }
}