org.apache.spark.sql.catalyst.plans.logical.LeafNode Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.plans.logical.LeafNode.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
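As a quick orientation before the examples: LeafNode is simply a LogicalPlan with no children, so the only member a subclass is required to implement is output. Below is a minimal sketch of a custom leaf operator; the class name and column are invented for illustration, and version-specific members such as computeStats are left out since their signatures differ across Spark releases.

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.LeafNode
import org.apache.spark.sql.types.IntegerType

// Hypothetical leaf operator: a relation exposing a single integer column and no children.
case class SingleColumnRelation(columnName: String) extends LeafNode {
  override def output: Seq[Attribute] = Seq(AttributeReference(columnName, IntegerType)())
}

Such a node can then be matched in a planner Strategy or targeted by analyzer and optimizer rules, which is exactly what the test suites below do with leaves like NeverPlanned and StatsTestPlan.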
Example 1
Source File: SparkPlannerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union}
import org.apache.spark.sql.test.SharedSQLContext

class SparkPlannerSuite extends SharedSQLContext {
  import testImplicits._

  test("Ensure to go down only the first branch, not any other possible branches") {
    case object NeverPlanned extends LeafNode {
      override def output: Seq[Attribute] = Nil
    }

    var planned = 0
    object TestStrategy extends Strategy {
      def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
        case ReturnAnswer(child) =>
          planned += 1
          planLater(child) :: planLater(NeverPlanned) :: Nil
        case Union(children) =>
          planned += 1
          UnionExec(children.map(planLater)) :: planLater(NeverPlanned) :: Nil
        case LocalRelation(output, data) =>
          planned += 1
          LocalTableScanExec(output, data) :: planLater(NeverPlanned) :: Nil
        case NeverPlanned =>
          fail("QueryPlanner should not go down to this branch.")
        case _ => Nil
      }
    }

    try {
      spark.experimental.extraStrategies = TestStrategy :: Nil

      val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS())

      assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f"))
      assert(planned === 4)
    } finally {
      spark.experimental.extraStrategies = Nil
    }
  }
}
Example 2
Source File: GenomicInterval.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Range, Statistics}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.biodatageeks.sequila.utils.Columns

case class GenomicInterval(
    contig: String,
    start: Int,
    end: Int,
    output: Seq[Attribute])
  extends LeafNode with MultiInstanceRelation with Serializable {

  override def newInstance(): GenomicInterval = copy(output = output.map(_.newInstance()))

  def computeStats(conf: SQLConf): Statistics = {
    val sizeInBytes = IntegerType.defaultSize * 2 // FIXME: Add contigName size
    Statistics(sizeInBytes = sizeInBytes)
  }

  override def simpleString: String = {
    s"GenomicInterval ($contig, $start, $end)"
  }
}

object GenomicInterval {
  def apply(contig: String, start: Int, end: Int): GenomicInterval = {
    val output = StructType(Seq(
      StructField(s"${Columns.CONTIG}", StringType, nullable = false),
      StructField(s"${Columns.START}", IntegerType, nullable = false),
      StructField(s"${Columns.END}", IntegerType, nullable = false))
    ).toAttributes
    new GenomicInterval(contig, start, end, output)
  }
}
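For context, the companion object above lets the node be built from coordinates alone, deriving the output attributes from the SeQuiLa column names. A minimal sketch of constructing it follows; the contig and positions are made up, and the SeQuiLa artifacts are assumed to be on the classpath.

import org.apache.spark.sql.GenomicInterval

object GenomicIntervalExample {
  def main(args: Array[String]): Unit = {
    // Hypothetical coordinates; apply() derives the contig/start/end attributes.
    val interval = GenomicInterval("chr1", 100, 200)
    println(interval.simpleString)        // GenomicInterval (chr1, 100, 200)
    println(interval.output.map(_.name))  // attribute names taken from Columns
  }
}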
Example 3
Source File: operators.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression}
import org.apache.spark.sql.catalyst.plans.logical.{BinaryNode, LeafNode, LogicalPlan, Statistics, UnaryNode}

case class Recursion(name: String,
    isLinear: Boolean,
    left: LogicalPlan,
    right: LogicalPlan,
    partitioning: Seq[Int]) extends BinaryNode {
  // left is exitRules plan
  // right is recursive rules plan
  override def output: Seq[Attribute] = right.output
}

case class MutualRecursion(name: String,
    isLinear: Boolean,
    left: LogicalPlan,
    right: LogicalPlan,
    partitioning: Seq[Int]) extends BinaryNode {

  override def output: Seq[Attribute] = right.output

  override def children: Seq[LogicalPlan] = {
    if (left == null) Seq(right) else Seq(left, right)
  }

  override def generateTreeString(depth: Int,
      lastChildren: Seq[Boolean],
      builder: StringBuilder): StringBuilder = {
    if (depth > 0) {
      lastChildren.init.foreach { isLast =>
        val prefixFragment = if (isLast) " " else ": "
        builder.append(prefixFragment)
      }

      val branch = if (lastChildren.last) "+- " else ":- "
      builder.append(branch)
    }

    builder.append(simpleString)
    builder.append("\n")

    if (children.nonEmpty) {
      val exitRule = children.init
      if (exitRule != null)
        exitRule.foreach(_.generateTreeString(depth + 1, lastChildren :+ false, builder))
      children.last.generateTreeString(depth + 1, lastChildren :+ true, builder)
    }

    builder
  }
}

case class LinearRecursiveRelation(_name: String, output: Seq[Attribute], partitioning: Seq[Int])
  extends LeafNode {
  override def statistics: Statistics = Statistics(Long.MaxValue)
  var name = _name
}

case class NonLinearRecursiveRelation(_name: String, output: Seq[Attribute], partitioning: Seq[Int])
  extends LeafNode {
  override def statistics: Statistics = Statistics(Long.MaxValue)
  def name = "all_" + _name
}

case class MonotonicAggregate(groupingExpressions: Seq[Expression],
    aggregateExpressions: Seq[NamedExpression],
    child: LogicalPlan,
    partitioning: Seq[Int]) extends UnaryNode {
  override lazy val resolved: Boolean = !expressions.exists(!_.resolved) && childrenResolved
  override def output: Seq[Attribute] = aggregateExpressions.map(_.toAttribute)
}

case class AggregateRecursion(name: String,
    isLinear: Boolean,
    left: LogicalPlan,
    right: LogicalPlan,
    partitioning: Seq[Int]) extends BinaryNode {
  // left is exitRules plan
  // right is recursive rules plan
  override def output: Seq[Attribute] = right.output
}

case class AggregateRelation(_name: String, output: Seq[Attribute], partitioning: Seq[Int])
  extends LeafNode {
  override def statistics: Statistics = Statistics(Long.MaxValue)
  var name = _name
}

case class CacheHint(child: LogicalPlan) extends UnaryNode {
  override def output: Seq[Attribute] = child.output
}
Example 4
Source File: StarryLocalRelation.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.logical

import org.apache.spark.sql.catalyst.{InternalRow, analysis}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, Statistics}

// Note: the StarryLocalRelation case class declaration (with its output, data and
// isStreaming parameters) is not included in this excerpt; only its members are shown.

  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data, isStreaming).asInstanceOf[this.type]
  }

  override protected def stringArgs: Iterator[Any] = {
    if (data.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def computeStats(): Statistics =
    Statistics(sizeInBytes = output.map(n => BigInt(n.dataType.defaultSize)).sum * data.length)

  def toSQL(inlineTableName: String): String = {
    require(data.nonEmpty)
    val types = output.map(_.dataType)
    val rows = data.map { row =>
      val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql }
      cells.mkString("(", ", ", ")")
    }
    "VALUES " + rows.mkString(", ") +
      " AS " + inlineTableName +
      output.map(_.name).mkString("(", ", ", ")")
  }
}
Example 5
Source File: SparkPlannerSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union}
import org.apache.spark.sql.test.SharedSQLContext

class SparkPlannerSuite extends SharedSQLContext {
  import testImplicits._

  test("Ensure to go down only the first branch, not any other possible branches") {
    case object NeverPlanned extends LeafNode {
      override def output: Seq[Attribute] = Nil
    }

    var planned = 0
    object TestStrategy extends Strategy {
      def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
        case ReturnAnswer(child) =>
          planned += 1
          planLater(child) :: planLater(NeverPlanned) :: Nil
        case Union(children) =>
          planned += 1
          UnionExec(children.map(planLater)) :: planLater(NeverPlanned) :: Nil
        case LocalRelation(output, data, _) =>
          planned += 1
          LocalTableScanExec(output, data) :: planLater(NeverPlanned) :: Nil
        case NeverPlanned =>
          fail("QueryPlanner should not go down to this branch.")
        case _ => Nil
      }
    }

    try {
      spark.experimental.extraStrategies = TestStrategy :: Nil

      val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS())

      assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f"))
      assert(planned === 4)
    } finally {
      spark.experimental.extraStrategies = Nil
    }
  }
}
Example 6
Source File: DataSourceV2Relation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics}
import org.apache.spark.sql.sources.v2.reader._

case class DataSourceV2Relation(
    output: Seq[AttributeReference],
    reader: DataSourceReader)
  extends LeafNode with MultiInstanceRelation with DataSourceReaderHolder {

  override def canEqual(other: Any): Boolean = other.isInstanceOf[DataSourceV2Relation]

  override def computeStats(): Statistics = reader match {
    case r: SupportsReportStatistics =>
      Statistics(sizeInBytes = r.getStatistics.sizeInBytes().orElse(conf.defaultSizeInBytes))
    case _ =>
      Statistics(sizeInBytes = conf.defaultSizeInBytes)
  }

  override def newInstance(): DataSourceV2Relation = {
    copy(output = output.map(_.newInstance()))
  }
}

class StreamingDataSourceV2Relation(
    output: Seq[AttributeReference],
    reader: DataSourceReader)
  extends DataSourceV2Relation(output, reader) {
  override def isStreaming: Boolean = true
}

object DataSourceV2Relation {
  def apply(reader: DataSourceReader): DataSourceV2Relation = {
    new DataSourceV2Relation(reader.readSchema().toAttributes, reader)
  }
}
Example 7
Source File: LogicalRelation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// Note: the LogicalRelation case class declaration is not included in this excerpt;
// only selected members and the companion object are shown.

  override def newInstance(): LogicalRelation = {
    this.copy(output = output.map(_.newInstance()))
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ => // Do nothing.
  }

  override def simpleString: String = s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}

object LogicalRelation {
  def apply(relation: BaseRelation, isStreaming: Boolean = false): LogicalRelation =
    LogicalRelation(relation, relation.schema.toAttributes, None, isStreaming)

  def apply(relation: BaseRelation, table: CatalogTable): LogicalRelation =
    LogicalRelation(relation, relation.schema.toAttributes, Some(table), false)
}
Example 8
Source File: StatsEstimationTestBase.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.statsEstimation

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{IntegerType, StringType}

trait StatsEstimationTestBase extends SparkFunSuite {

  var originalValue: Boolean = false

  override def beforeAll(): Unit = {
    super.beforeAll()
    // Enable stats estimation based on CBO.
    originalValue = SQLConf.get.getConf(SQLConf.CBO_ENABLED)
    SQLConf.get.setConf(SQLConf.CBO_ENABLED, true)
  }

  override def afterAll(): Unit = {
    SQLConf.get.setConf(SQLConf.CBO_ENABLED, originalValue)
    super.afterAll()
  }

  def getColSize(attribute: Attribute, colStat: ColumnStat): Long = attribute.dataType match {
    // For UTF8String: base + offset + numBytes
    case StringType => colStat.avgLen + 8 + 4
    case _ => colStat.avgLen
  }

  def attr(colName: String): AttributeReference = AttributeReference(colName, IntegerType)()

  case class StatsTestPlan(
      outputList: Seq[Attribute],
      rowCount: BigInt,
      attributeStats: AttributeMap[ColumnStat],
      size: Option[BigInt] = None) extends LeafNode {
    override def output: Seq[Attribute] = outputList
    override def computeStats(): Statistics = Statistics(
      // If sizeInBytes is useless in testing, we just use a fake value
      sizeInBytes = size.getOrElse(Int.MaxValue),
      rowCount = Some(rowCount),
      attributeStats = attributeStats)
  }
  // Note: the listing was truncated here; a closing brace for the trait is restored.
}
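A short sketch of how a suite mixing in StatsEstimationTestBase might exercise StatsTestPlan; the attribute name and row count below are made up. Because the leaf's computeStats() reports the supplied row count, plan.stats should echo it back.

// Hypothetical test body inside a suite that extends StatsEstimationTestBase.
val plan = StatsTestPlan(
  outputList = Seq(attr("key")),
  rowCount = 10,
  attributeStats = AttributeMap(Seq.empty[(Attribute, ColumnStat)]))

assert(plan.stats.rowCount === Some(BigInt(10)))
assert(plan.stats.sizeInBytes === BigInt(Int.MaxValue)) // the fake default size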
Example 9
Source File: SparkPlannerSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union}
import org.apache.spark.sql.test.SharedSQLContext

class SparkPlannerSuite extends SharedSQLContext {
  import testImplicits._

  test("Ensure to go down only the first branch, not any other possible branches") {
    case object NeverPlanned extends LeafNode {
      override def output: Seq[Attribute] = Nil
    }

    var planned = 0
    object TestStrategy extends Strategy {
      def user: String = sparkContext.sparkUser

      def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
        case ReturnAnswer(child) =>
          planned += 1
          planLater(child, user) :: planLater(NeverPlanned, user) :: Nil
        case Union(children) =>
          planned += 1
          UnionExec(children.map(p => planLater(p, user))) :: planLater(NeverPlanned, user) :: Nil
        case LocalRelation(output, data) =>
          planned += 1
          LocalTableScanExec(output, data, user) :: planLater(NeverPlanned, user) :: Nil
        case NeverPlanned =>
          fail("QueryPlanner should not go down to this branch.")
        case _ => Nil
      }
    }

    try {
      spark.experimental.extraStrategies = TestStrategy :: Nil

      val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS())

      assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f"))
      assert(planned === 4)
    } finally {
      spark.experimental.extraStrategies = Nil
    }
  }
}
Example 10
Source File: StreamingRelation.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LeafNode
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.execution.datasources.DataSource

// Note: the StreamingRelation and StreamingExecutionRelation case classes (the LeafNode
// subclasses defined in this file) are not included in this excerpt.

object StreamingRelation {
  def apply(dataSource: DataSource): StreamingRelation = {
    StreamingRelation(
      dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes)
  }
}

case class StreamingRelationExec(
    sourceName: String,
    output: Seq[Attribute],
    override val user: String) extends LeafExecNode {
  override def toString: String = sourceName
  override protected def doExecute(): RDD[InternalRow] = {
    throw new UnsupportedOperationException("StreamingRelationExec cannot be executed")
  }
}

object StreamingExecutionRelation {
  def apply(source: Source): StreamingExecutionRelation = {
    StreamingExecutionRelation(source, source.schema.toAttributes)
  }
}
Example 11
Source File: LogicalRelation.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// Note: the LogicalRelation case class declaration is not included in this excerpt;
// only the overridden members are shown.

  override def newInstance(): this.type = {
    LogicalRelation(
      relation,
      expectedOutputAttributes.map(_.map(_.newInstance())),
      catalogTable).asInstanceOf[this.type]
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ => // Do nothing.
  }

  override def simpleString: String = s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}
Example 12
Source File: DescribeDeltaHistoryCommand.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.commands

// scalastyle:off import.ordering.noEmptyLine
import org.apache.spark.sql.delta.{DeltaErrors, DeltaLog, DeltaTableIdentifier}
import org.apache.spark.sql.delta.actions.CommitInfo
import org.apache.spark.sql.delta.metering.DeltaLogging
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTableType
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.execution.command.RunnableCommand

case class DescribeDeltaHistoryCommand(
    path: Option[String],
    tableIdentifier: Option[TableIdentifier],
    limit: Option[Int],
    override val output: Seq[Attribute] = ExpressionEncoder[CommitInfo]().schema.toAttributes)
  extends RunnableCommand with DeltaLogging {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val basePath =
      if (path.nonEmpty) {
        new Path(path.get)
      } else if (tableIdentifier.nonEmpty) {
        val sessionCatalog = sparkSession.sessionState.catalog
        lazy val metadata = sessionCatalog.getTableMetadata(tableIdentifier.get)

        DeltaTableIdentifier(sparkSession, tableIdentifier.get) match {
          case Some(id) if id.path.nonEmpty =>
            new Path(id.path.get)
          case Some(id) if id.table.nonEmpty =>
            new Path(metadata.location)
          case _ =>
            if (metadata.tableType == CatalogTableType.VIEW) {
              throw DeltaErrors.describeViewHistory
            }
            throw DeltaErrors.notADeltaTableException("DESCRIBE HISTORY")
        }
      } else {
        throw DeltaErrors.missingTableIdentifierException("DESCRIBE HISTORY")
      }

    // Max array size
    if (limit.exists(_ > Int.MaxValue - 8)) {
      throw new IllegalArgumentException("Please use a limit less than Int.MaxValue - 8.")
    }

    val deltaLog = DeltaLog.forTable(sparkSession, basePath)
    recordDeltaOperation(deltaLog, "delta.ddl.describeHistory") {
      if (deltaLog.snapshot.version == -1) {
        throw DeltaErrors.notADeltaTableException("DESCRIBE HISTORY")
      }

      import sparkSession.implicits._
      deltaLog.history.getHistory(limit).toDF().collect().toSeq
    }
  }
}
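For context, this command backs Delta Lake's DESCRIBE HISTORY statement. A minimal sketch of invoking it through SQL, assuming an active SparkSession with the Delta extensions enabled and a Delta table at a made-up path:

// Hypothetical invocation; /tmp/events must contain a Delta table.
spark.sql("DESCRIBE HISTORY delta.`/tmp/events`").show(truncate = false)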
Example 13
Source File: LogicalRelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// Note: the LogicalRelation case class declaration is not included in this excerpt;
// only the overridden members are shown.

  override def newInstance(): this.type = {
    LogicalRelation(
      relation,
      expectedOutputAttributes.map(_.map(_.newInstance())),
      catalogTable).asInstanceOf[this.type]
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ => // Do nothing.
  }

  override def simpleString: String = s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}
Example 14
Source File: StreamingRelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LeafNode
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.execution.datasources.DataSource

// Note: the StreamingRelation and StreamingExecutionRelation case classes (the LeafNode
// subclasses defined in this file) are not included in this excerpt.

object StreamingRelation {
  def apply(dataSource: DataSource): StreamingRelation = {
    StreamingRelation(
      dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes)
  }
}

case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode {
  override def toString: String = sourceName
  override protected def doExecute(): RDD[InternalRow] = {
    throw new UnsupportedOperationException("StreamingRelationExec cannot be executed")
  }
}

object StreamingExecutionRelation {
  def apply(source: Source): StreamingExecutionRelation = {
    StreamingExecutionRelation(source, source.schema.toAttributes)
  }
}
Example 15
Source File: LogicalRelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// Note: the LogicalRelation case class declaration is not included in this excerpt;
// only the overridden members are shown.

  override def newInstance(): this.type = {
    LogicalRelation(
      relation,
      expectedOutputAttributes.map(_.map(_.newInstance())),
      catalogTable).asInstanceOf[this.type]
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ => // Do nothing.
  }

  override def simpleString: String = s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}
Example 16
Source File: PlanUtilsSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.util

import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, LeafNode}
import org.scalatest.FunSuite
import org.apache.spark.sql.util.PlanUtils._

class PlanUtilsSuite extends FunSuite {
  trait NoAttributes {
    self: LogicalPlan =>
    override def output: Seq[Attribute] = Seq.empty
  }

  case object Leaf extends LeafNode with NoAttributes
  case class Node(children: LogicalPlan*) extends LogicalPlan with NoAttributes

  val k = Leaf           //        _____a_____
  val j = Leaf           //       /     |     \
  val i = Leaf           //      b      e      k
  val h = Node(i)        //     / \    / \
  val g = Node(h, j)     //    c   d  f   g
  val f = Leaf           //              / \
  val e = Node(f, g)     //             h   j
  val d = Leaf           //             |
  val c = Leaf           //             i
  val b = Node(c, d)     //
  val a = Node(b, e, k)  //

  test("isLeaf") {
    assertResult(expected = false)(Node(Leaf).isLeaf)
    assertResult(expected = true)(Leaf.isLeaf)
  }

  test("find") {
    assertResult(None)(a.find(_ == Node(Leaf, Leaf, Leaf)))
    assertResult(Some(h))(a.find(_ == Node(Leaf)))
  }

  test("filter") {
    assertResult(Seq.empty)(a.filter(_ == Node(Leaf, Leaf, Leaf)))
    assertResult(Seq(c, d, f, i, j, k))(a.filter(_.isLeaf))
  }

  test("contains") {
    assertResult(expected = false)(a.contains(Node(Leaf, Leaf, Leaf)))
    assertResult(expected = true)(a.contains(Node(Leaf)))
  }

  test("exists") {
    assertResult(expected = true)(a.exists(node => node == Node(Leaf)))
    assertResult(expected = false)(a.exists(node => node == Node(Leaf, Leaf, Leaf)))
  }

  test("toPreOrderSeq") {
    assertResult(a.toPreOrderSeq.toList)(List(a, b, c, d, e, f, g, h, i, j, k))
  }

  test("toPostOrderSeq") {
    assertResult(a.toPostOrderSeq.toList)(List(c, d, b, f, i, h, j, g, e, k, a))
  }

  test("toLevelOrderSeq") {
    assertResult(a.toLevelOrderSeq.toList)(List(a, b, e, k, c, d, f, g, h, j, i))
  }

  test("toSeq") {
    assertResult(a.toSeq(PreOrder))(a.toPreOrderSeq)
    assertResult(a.toSeq(PostOrder))(a.toPostOrderSeq)
    assertResult(a.toSeq(LevelOrder))(a.toLevelOrderSeq)
  }
}
Example 17
Source File: inferSchemaCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.commands

import org.apache.spark.sql.catalyst.analysis.systables.SchemaEnumeration
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LeafNode
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation
import org.apache.spark.sql.execution.tablefunctions.DataTypeExtractor
import org.apache.spark.sql.hive.orc.OrcRelation
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SQLContext}

case class InferSchemaCommand(path: String, fileType: FileType) extends RunnableCommand {
  override lazy val output: Seq[Attribute] = InferSchemaCommand.schema.toAttributes

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val fileSchema = fileType.readSchema(sqlContext, path)
    fileSchema.zipWithIndex.map {
      case (StructField(name, dataType, nullable, _), idx) =>
        val dataTypeExtractor = DataTypeExtractor(dataType)
        Row(
          name,
          idx + 1, // idx + 1 since the ordinal position has to start at 1
          nullable,
          dataTypeExtractor.inferredSqlType,
          dataTypeExtractor.numericPrecision.orNull,
          dataTypeExtractor.numericPrecisionRadix.orNull,
          dataTypeExtractor.numericScale.orNull)
    }
  }
}

object InferSchemaCommand extends SchemaEnumeration {
  val name = Field("COLUMN_NAME", StringType, nullable = false)
  val ordinalPosition = Field("ORDINAL_POSITION", IntegerType, nullable = false)
  val isNullable = Field("IS_NULLABLE", BooleanType, nullable = false)
  val dataType = Field("DATA_TYPE", StringType, nullable = false)
  val numericPrecision = Field("NUMERIC_PRECISION", IntegerType, nullable = true)
  val numericPrecisionRadix = Field("NUMERIC_PRECISION_RADIX", IntegerType, nullable = true)
  val numericScale = Field("NUMERIC_SCALE", IntegerType, nullable = true)
}
Example 18
Source File: SparkPlannerSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union}
import org.apache.spark.sql.test.SharedSQLContext

class SparkPlannerSuite extends SharedSQLContext {
  import testImplicits._

  test("Ensure to go down only the first branch, not any other possible branches") {
    case object NeverPlanned extends LeafNode {
      override def output: Seq[Attribute] = Nil
    }

    var planned = 0
    object TestStrategy extends Strategy {
      def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
        case ReturnAnswer(child) =>
          planned += 1
          planLater(child) :: planLater(NeverPlanned) :: Nil
        case Union(children) =>
          planned += 1
          UnionExec(children.map(planLater)) :: planLater(NeverPlanned) :: Nil
        case LocalRelation(output, data, _) =>
          planned += 1
          LocalTableScanExec(output, data) :: planLater(NeverPlanned) :: Nil
        case NeverPlanned =>
          fail("QueryPlanner should not go down to this branch.")
        case _ => Nil
      }
    }

    try {
      spark.experimental.extraStrategies = TestStrategy :: Nil

      val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS())

      assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f"))
      assert(planned === 4)
    } finally {
      spark.experimental.extraStrategies = Nil
    }
  }
}
Example 19
Source File: StreamingRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2}

// Note: the StreamingRelation, StreamingExecutionRelation and related LeafNode
// case classes defined in this file are not included in this excerpt.

object StreamingRelation {
  def apply(dataSource: DataSource): StreamingRelation = {
    StreamingRelation(
      dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes)
  }
}

case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode {
  override def toString: String = sourceName
  override protected def doExecute(): RDD[InternalRow] = {
    throw new UnsupportedOperationException("StreamingRelationExec cannot be executed")
  }
}

object StreamingExecutionRelation {
  def apply(source: Source, session: SparkSession): StreamingExecutionRelation = {
    StreamingExecutionRelation(source, source.schema.toAttributes)(session)
  }
}
Example 20
Source File: LogicalRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// Note: the LogicalRelation case class declaration is not included in this excerpt;
// only selected members and the companion object are shown.

  override def newInstance(): LogicalRelation = {
    this.copy(output = output.map(_.newInstance()))
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ => // Do nothing.
  }

  override def simpleString: String = s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}

object LogicalRelation {
  def apply(relation: BaseRelation, isStreaming: Boolean = false): LogicalRelation =
    LogicalRelation(relation, relation.schema.toAttributes, None, isStreaming)

  def apply(relation: BaseRelation, table: CatalogTable): LogicalRelation =
    LogicalRelation(relation, relation.schema.toAttributes, Some(table), false)
}
Example 21
Source File: StatsEstimationTestBase.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.statsEstimation

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{IntegerType, StringType}

trait StatsEstimationTestBase extends SparkFunSuite {

  var originalValue: Boolean = false

  override def beforeAll(): Unit = {
    super.beforeAll()
    // Enable stats estimation based on CBO.
    originalValue = SQLConf.get.getConf(SQLConf.CBO_ENABLED)
    SQLConf.get.setConf(SQLConf.CBO_ENABLED, true)
  }

  override def afterAll(): Unit = {
    SQLConf.get.setConf(SQLConf.CBO_ENABLED, originalValue)
    super.afterAll()
  }

  def getColSize(attribute: Attribute, colStat: ColumnStat): Long = attribute.dataType match {
    // For UTF8String: base + offset + numBytes
    case StringType => colStat.avgLen.getOrElse(attribute.dataType.defaultSize.toLong) + 8 + 4
    case _ => colStat.avgLen.getOrElse(attribute.dataType.defaultSize)
  }

  def attr(colName: String): AttributeReference = AttributeReference(colName, IntegerType)()

  case class StatsTestPlan(
      outputList: Seq[Attribute],
      rowCount: BigInt,
      attributeStats: AttributeMap[ColumnStat],
      size: Option[BigInt] = None) extends LeafNode {
    override def output: Seq[Attribute] = outputList
    override def computeStats(): Statistics = Statistics(
      // If sizeInBytes is useless in testing, we just use a fake value
      sizeInBytes = size.getOrElse(Int.MaxValue),
      rowCount = Some(rowCount),
      attributeStats = attributeStats)
  }
  // Note: the listing was truncated here; a closing brace for the trait is restored.
}
Example 22
Source File: SparkPlannerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union}
import org.apache.spark.sql.test.SharedSQLContext

class SparkPlannerSuite extends SharedSQLContext {
  import testImplicits._

  test("Ensure to go down only the first branch, not any other possible branches") {
    case object NeverPlanned extends LeafNode {
      override def output: Seq[Attribute] = Nil
    }

    var planned = 0
    object TestStrategy extends Strategy {
      def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
        case ReturnAnswer(child) =>
          planned += 1
          planLater(child) :: planLater(NeverPlanned) :: Nil
        case Union(children) =>
          planned += 1
          UnionExec(children.map(planLater)) :: planLater(NeverPlanned) :: Nil
        case LocalRelation(output, data) =>
          planned += 1
          LocalTableScanExec(output, data) :: planLater(NeverPlanned) :: Nil
        case NeverPlanned =>
          fail("QueryPlanner should not go down to this branch.")
        case _ => Nil
      }
    }

    try {
      spark.experimental.extraStrategies = TestStrategy :: Nil

      val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS())

      assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f"))
      assert(planned === 4)
    } finally {
      spark.experimental.extraStrategies = Nil
    }
  }
}
Example 23
Source File: StreamingRelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LeafNode
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.execution.datasources.DataSource

// Note: the StreamingRelation and StreamingExecutionRelation case classes (the LeafNode
// subclasses defined in this file) are not included in this excerpt.

object StreamingRelation {
  def apply(dataSource: DataSource): StreamingRelation = {
    StreamingRelation(
      dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes)
  }
}

case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode {
  override def toString: String = sourceName
  override protected def doExecute(): RDD[InternalRow] = {
    throw new UnsupportedOperationException("StreamingRelationExec cannot be executed")
  }
}

object StreamingExecutionRelation {
  def apply(source: Source): StreamingExecutionRelation = {
    StreamingExecutionRelation(source, source.schema.toAttributes)
  }
}