org.apache.spark.sql.Strategy Scala Examples
The following examples show how to use org.apache.spark.sql.Strategy.
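All of the examples below follow the same pattern: a planner strategy implements Strategy.apply to translate a logical operator into one or more physical SparkPlan nodes (returning Nil for anything it does not handle) and is then registered through spark.experimental.extraStrategies. As a minimal orientation before the project-specific examples, here is a hedged sketch of that pattern against the Spark 2.x API. MyLimitStrategy and its handling of LocalLimit are illustrative only and do not come from any of the projects listed below; the registration lines at the end are written as they would appear in a spark-shell session or a test body.

import org.apache.spark.sql.{SparkSession, Strategy}
import org.apache.spark.sql.catalyst.expressions.IntegerLiteral
import org.apache.spark.sql.catalyst.plans.logical.{LocalLimit, LogicalPlan}
import org.apache.spark.sql.execution.{LocalLimitExec, SparkPlan}

// Illustrative strategy (hypothetical name): plans LocalLimit nodes itself and leaves
// every other operator to Spark's built-in strategies by returning Nil.
object MyLimitStrategy extends Strategy {
  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case LocalLimit(IntegerLiteral(limit), child) =>
      // planLater defers planning of the child subtree back to the query planner.
      LocalLimitExec(limit, planLater(child)) :: Nil
    case _ => Nil // not handled here; the planner falls through to the next strategy
  }
}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
// extraStrategies are consulted before the built-in strategies.
spark.experimental.extraStrategies = MyLimitStrategy :: Nil

Returning Nil signals that a strategy does not apply to the node at hand, which is why every example below ends its pattern match with case _ => Nil.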
Example 1
Source File: SparkPlannerSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union}
import org.apache.spark.sql.test.SharedSQLContext

class SparkPlannerSuite extends SharedSQLContext {
  import testImplicits._

  test("Ensure to go down only the first branch, not any other possible branches") {
    case object NeverPlanned extends LeafNode {
      override def output: Seq[Attribute] = Nil
    }

    var planned = 0
    object TestStrategy extends Strategy {
      def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
        case ReturnAnswer(child) =>
          planned += 1
          planLater(child) :: planLater(NeverPlanned) :: Nil
        case Union(children) =>
          planned += 1
          UnionExec(children.map(planLater)) :: planLater(NeverPlanned) :: Nil
        case LocalRelation(output, data, _) =>
          planned += 1
          LocalTableScanExec(output, data) :: planLater(NeverPlanned) :: Nil
        case NeverPlanned =>
          fail("QueryPlanner should not go down to this branch.")
        case _ => Nil
      }
    }

    try {
      spark.experimental.extraStrategies = TestStrategy :: Nil

      val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS())

      assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f"))
      assert(planned === 4)
    } finally {
      spark.experimental.extraStrategies = Nil
    }
  }
}
Example 2
Source File: IntervalTreeJoinStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.genApp

import org.biodatageeks.sequila.rangejoins.common.{ExtractRangeJoinKeys, ExtractRangeJoinKeysWithEquality}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{SparkSession, Strategy}
import org.biodatageeks.sequila.rangejoins.methods.genApp.IntervalTreeJoinChromosome

class IntervalTreeJoinStrategy(spark: SparkSession) extends Strategy with Serializable with PredicateHelper {
  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case ExtractRangeJoinKeys(joinType, rangeJoinKeys, left, right) =>
      IntervalTreeJoin(planLater(left), planLater(right), rangeJoinKeys, spark) :: Nil
    case ExtractRangeJoinKeysWithEquality(joinType, rangeJoinKeys, left, right) =>
      IntervalTreeJoinChromosome(planLater(left), planLater(right), rangeJoinKeys, spark) :: Nil
    case _ => Nil
  }
}
Example 3
Source File: NCListsJoinStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.NCList

import org.biodatageeks.sequila.rangejoins.common.{ExtractRangeJoinKeys, ExtractRangeJoinKeysWithEquality}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{SparkSession, Strategy}
import org.biodatageeks.sequila.rangejoins.methods.NCList.NCListsJoinChromosome

class NCListsJoinStrategy(spark: SparkSession) extends Strategy with Serializable with PredicateHelper {
  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case ExtractRangeJoinKeys(joinType, rangeJoinKeys, left, right) =>
      NCListsJoin(planLater(left), planLater(right), rangeJoinKeys, spark) :: Nil
    case ExtractRangeJoinKeysWithEquality(joinType, rangeJoinKeys, left, right) =>
      NCListsJoinChromosome(planLater(left), planLater(right), rangeJoinKeys, spark) :: Nil
    case _ => Nil
  }
}
Example 4
Source File: PileupStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.pileup

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{PileupTemplate, SparkSession, Strategy}
import org.biodatageeks.sequila.datasources.BAM.BDGAlignFileReaderWriter
import org.biodatageeks.sequila.datasources.InputDataType
import org.biodatageeks.sequila.inputformats.BDGAlignInputFormat
import org.biodatageeks.sequila.utils.TableFuncs
import org.seqdoop.hadoop_bam.{BAMBDGInputFormat, CRAMBDGInputFormat}

import scala.reflect.ClassTag

class PileupStrategy(spark: SparkSession) extends Strategy with Serializable {
  override def apply(plan: LogicalPlan): Seq[SparkPlan] = {
    plan match {
      case PileupTemplate(tableName, sampleId, refPath, output) =>
        val inputFormat = TableFuncs.getTableMetadata(spark, tableName).provider
        inputFormat match {
          case Some(f) =>
            if (f == InputDataType.BAMInputDataType)
              PileupPlan[BAMBDGInputFormat](plan, spark, tableName, sampleId, refPath, output) :: Nil
            else if (f == InputDataType.CRAMInputDataType)
              PileupPlan[CRAMBDGInputFormat](plan, spark, tableName, sampleId, refPath, output) :: Nil
            else Nil
          case None =>
            throw new RuntimeException("Only BAM and CRAM file formats are supported in pileup function.")
        }
      case _ => Nil
    }
  }
}

case class PileupPlan[T <: BDGAlignInputFormat](plan: LogicalPlan,
                                                spark: SparkSession,
                                                tableName: String,
                                                sampleId: String,
                                                refPath: String,
                                                output: Seq[Attribute])(implicit c: ClassTag[T])
  extends SparkPlan
  with Serializable
  with BDGAlignFileReaderWriter[T] {

  override def children: Seq[SparkPlan] = Nil

  override protected def doExecute(): RDD[InternalRow] = {
    new Pileup(spark).handlePileup(tableName, sampleId, refPath, output)
  }
}
Example 5
Source File: GenomicIntervalStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.utvf

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{DataFrame, GenomicInterval, SparkSession, Strategy}
import org.apache.spark.unsafe.types.UTF8String

case class GIntervalRow(contigName: String, start: Int, end: Int)

class GenomicIntervalStrategy(spark: SparkSession) extends Strategy with Serializable {
  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case GenomicInterval(contigName, start, end, output) =>
      GenomicIntervalPlan(plan, spark, GIntervalRow(contigName, start, end), output) :: Nil
    case _ => Nil
  }
}

case class GenomicIntervalPlan(plan: LogicalPlan, spark: SparkSession, interval: GIntervalRow, output: Seq[Attribute])
  extends SparkPlan with Serializable {

  def doExecute(): org.apache.spark.rdd.RDD[InternalRow] = {
    import spark.implicits._

    lazy val genomicInterval = spark.createDataset(Seq(interval))
    genomicInterval
      .rdd
      .map(r => {
        val proj = UnsafeProjection.create(schema)
        proj.apply(InternalRow.fromSeq(Seq(UTF8String.fromString(r.contigName), r.start, r.end)))
      })
  }

  def children: Seq[SparkPlan] = Nil
}
Example 6
Source File: ExtraStrategiesSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package test.org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute}
import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{Row, Strategy, QueryTest}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.unsafe.types.UTF8String

case class FastOperator(output: Seq[Attribute]) extends SparkPlan {
  override protected def doExecute(): RDD[InternalRow] = {
    val str = Literal("so fast").value
    val row = new GenericInternalRow(Array[Any](str))
    sparkContext.parallelize(Seq(row))
  }

  override def children: Seq[SparkPlan] = Nil
}

object TestStrategy extends Strategy {
  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case Project(Seq(attr), _) if attr.name == "a" =>
      FastOperator(attr.toAttribute :: Nil) :: Nil
    case _ => Nil
  }
}

class ExtraStrategiesSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("insert an extraStrategy") {
    try {
      sqlContext.experimental.extraStrategies = TestStrategy :: Nil

      val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b")
      checkAnswer(
        df.select("a"),
        Row("so fast"))

      checkAnswer(
        df.select("a", "b"),
        Row("so slow", 1))
    } finally {
      sqlContext.experimental.extraStrategies = Nil
    }
  }
}
Example 7
Source File: StarryJoinLocalStrategy.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.RowOrdering
import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, StarryHashJoinExec, StarryNestedLoopJoinExec}
import org.apache.spark.sql.internal.SQLConf

// NOTE: the class header is not part of the original listing; a Strategy carrying a SQLConf
// as `conf` is assumed here so that the snippet is self-contained.
case class StarryJoinLocalStrategy(conf: SQLConf) extends Strategy {

  private def canRunInLocalMemory(plan: LogicalPlan) = {
    plan.stats.sizeInBytes >= 0 &&
      plan.stats.sizeInBytes <= conf.getConfString("spark.sql.maxLocalMemoryJoin", "10485760").toLong
  }

  private def canBuildRight(joinType: JoinType): Boolean = joinType match {
    case _: InnerLike | LeftOuter | LeftSemi | LeftAnti | _: ExistenceJoin => true
    case _ => false
  }

  private def canBuildLeft(joinType: JoinType): Boolean = joinType match {
    case _: InnerLike | RightOuter => true
    case _ => false
  }

  def decideBuildSide(joinType: JoinType, left: LogicalPlan, right: LogicalPlan) = {
    val buildLeft = canBuildLeft(joinType) && canRunInLocalMemory(left)
    val buildRight = canBuildRight(joinType) && canRunInLocalMemory(right)

    def smallerSide =
      if (right.stats.sizeInBytes <= left.stats.sizeInBytes) BuildRight else BuildLeft

    if (buildRight && buildLeft) {
      smallerSide
    } else if (buildRight) {
      BuildRight
    } else if (buildLeft) {
      BuildLeft
    } else {
      smallerSide
    }
  }

  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right) =>
      val buildSide = decideBuildSide(joinType, left, right)
      Seq(StarryHashJoinExec(
        leftKeys, rightKeys, joinType, buildSide, condition, planLater(left), planLater(right)))

    // --- SortMergeJoin ------------------------------------------------------------
    case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right)
      if RowOrdering.isOrderable(leftKeys) =>
      joins.SortMergeJoinExec(
        leftKeys, rightKeys, joinType, condition, planLater(left), planLater(right)) :: Nil

    // --- Without joining keys ------------------------------------------------------------
    // Pick BroadcastNestedLoopJoin if one side could be broadcast
    case j @ logical.Join(left, right, joinType, condition) =>
      val buildSide = decideBuildSide(joinType, left, right)
      StarryNestedLoopJoinExec(
        planLater(left), planLater(right), buildSide, joinType, condition) :: Nil

    // Pick CartesianProduct for InnerJoin
    case logical.Join(left, right, _: InnerLike, condition) =>
      joins.CartesianProductExec(planLater(left), planLater(right), condition) :: Nil

    case _ => Nil
  }
}
Example 8
Source File: StarryAggStrategy.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.planning.PhysicalAggregation
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.stanlee.execution.aggregate.StarryAggUtils

case class StarryAggStrategy() extends Strategy {

  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case PhysicalAggregation(
        groupingExpressions, aggregateExpressions, resultExpressions, child) =>

      val (functionsWithDistinct, functionsWithoutDistinct) =
        aggregateExpressions.partition(_.isDistinct)
      if (functionsWithDistinct.map(_.aggregateFunction.children).distinct.length > 1) {
        // This is a sanity check. We should not reach here when we have multiple distinct
        // column sets. Our MultipleDistinctRewriter should take care this case.
        sys.error("You hit a query analyzer bug. Please report your query to " +
          "Spark user mailing list.")
      }

      val aggregateOperator =
        if (functionsWithDistinct.isEmpty) {
          StarryAggUtils.planAggregateWithoutDistinct(
            groupingExpressions,
            aggregateExpressions,
            resultExpressions,
            planLater(child))
        } else {
          StarryAggUtils.planAggregateWithOneDistinct(
            groupingExpressions,
            functionsWithDistinct,
            functionsWithoutDistinct,
            resultExpressions,
            planLater(child))
        }

      aggregateOperator

    case _ => Nil
  }
}
Example 9
Source File: StarryLimitLocalStrategy.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.IntegerLiteral
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.exchange.StarryTakeOrderedAndProjectExec

case class StarryLimitLocalStrategy() extends Strategy {
  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case ReturnAnswer(rootPlan) => rootPlan match {
      case Limit(IntegerLiteral(limit), Sort(order, true, child)) =>
        StarryTakeOrderedAndProjectExec(limit, order, child.output, planLater(child)) :: Nil
      case Limit(IntegerLiteral(limit), Project(projectList, Sort(order, true, child))) =>
        StarryTakeOrderedAndProjectExec(limit, order, projectList, planLater(child)) :: Nil
      case Limit(IntegerLiteral(limit), child) =>
        // With whole stage codegen, Spark releases resources only when all the output data of the
        // query plan are consumed. It's possible that `CollectLimitExec` only consumes a little
        // data from child plan and finishes the query without releasing resources. Here we wrap
        // the child plan with `LocalLimitExec`, to stop the processing of whole stage codegen and
        // trigger the resource releasing work, after we consume `limit` rows.
        StarryCollectLimitExec(limit, LocalLimitExec(limit, planLater(child))) :: Nil
      case other => planLater(other) :: Nil
    }
    case Limit(IntegerLiteral(limit), Sort(order, true, child)) =>
      StarryTakeOrderedAndProjectExec(limit, order, child.output, planLater(child)) :: Nil
    case Limit(IntegerLiteral(limit), Project(projectList, Sort(order, true, child))) =>
      StarryTakeOrderedAndProjectExec(limit, order, projectList, planLater(child)) :: Nil
    case _ => Nil
  }
}
Example 10
Source File: SparkPlannerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union}
import org.apache.spark.sql.test.SharedSQLContext

class SparkPlannerSuite extends SharedSQLContext {
  import testImplicits._

  test("Ensure to go down only the first branch, not any other possible branches") {
    case object NeverPlanned extends LeafNode {
      override def output: Seq[Attribute] = Nil
    }

    var planned = 0
    object TestStrategy extends Strategy {
      def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
        case ReturnAnswer(child) =>
          planned += 1
          planLater(child) :: planLater(NeverPlanned) :: Nil
        case Union(children) =>
          planned += 1
          UnionExec(children.map(planLater)) :: planLater(NeverPlanned) :: Nil
        case LocalRelation(output, data) =>
          planned += 1
          LocalTableScanExec(output, data) :: planLater(NeverPlanned) :: Nil
        case NeverPlanned =>
          fail("QueryPlanner should not go down to this branch.")
        case _ => Nil
      }
    }

    try {
      spark.experimental.extraStrategies = TestStrategy :: Nil

      val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS())

      assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f"))
      assert(planned === 4)
    } finally {
      spark.experimental.extraStrategies = Nil
    }
  }
}
Example 11
Source File: ExtraStrategiesSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package test.org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute}
import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{Row, Strategy, QueryTest}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.unsafe.types.UTF8String

// Fast operator
case class FastOperator(output: Seq[Attribute]) extends SparkPlan {
  override protected def doExecute(): RDD[InternalRow] = {
    val str = Literal("so fast").value
    val row = new GenericInternalRow(Array[Any](str))
    sparkContext.parallelize(Seq(row))
  }

  // Nil is an empty List
  override def children: Seq[SparkPlan] = Nil
}

// Test strategy
object TestStrategy extends Strategy {
  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case Project(Seq(attr), _) if attr.name == "a" =>
      // Nil is an empty List; :: prepends an element to the head of a list, creating a new list
      FastOperator(attr.toAttribute :: Nil) :: Nil
    // Nil is an empty List; :: prepends an element to the head of a list, creating a new list
    case _ => Nil
  }
}

// Extra strategies
class ExtraStrategiesSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("insert an extraStrategy") { // insert an extra strategy
    try {
      // Nil is an empty List; :: prepends an element to the head of a list, creating a new list
      sqlContext.experimental.extraStrategies = TestStrategy :: Nil

      val df = sqlContext.sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b")
      checkAnswer(
        df.select("a"),
        Row("so fast"))

      checkAnswer(
        df.select("a", "b"),
        Row("so slow", 1))
    } finally {
      // Nil is an empty List; :: prepends an element to the head of a list, creating a new list
      sqlContext.experimental.extraStrategies = Nil
    }
  }
}
Example 12
Source File: SparkPlannerSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union}
import org.apache.spark.sql.test.SharedSQLContext

class SparkPlannerSuite extends SharedSQLContext {
  import testImplicits._

  test("Ensure to go down only the first branch, not any other possible branches") {
    case object NeverPlanned extends LeafNode {
      override def output: Seq[Attribute] = Nil
    }

    var planned = 0
    object TestStrategy extends Strategy {
      def user: String = sparkContext.sparkUser

      def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
        case ReturnAnswer(child) =>
          planned += 1
          planLater(child, user) :: planLater(NeverPlanned, user) :: Nil
        case Union(children) =>
          planned += 1
          UnionExec(children.map(p => planLater(p, user))) :: planLater(NeverPlanned, user) :: Nil
        case LocalRelation(output, data) =>
          planned += 1
          LocalTableScanExec(output, data, user) :: planLater(NeverPlanned, user) :: Nil
        case NeverPlanned =>
          fail("QueryPlanner should not go down to this branch.")
        case _ => Nil
      }
    }

    try {
      spark.experimental.extraStrategies = TestStrategy :: Nil

      val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS())

      assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f"))
      assert(planned === 4)
    } finally {
      spark.experimental.extraStrategies = Nil
    }
  }
}
Example 13
Source File: SparkPlannerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union}
import org.apache.spark.sql.test.SharedSQLContext

class SparkPlannerSuite extends SharedSQLContext {
  import testImplicits._

  test("Ensure to go down only the first branch, not any other possible branches") {
    case object NeverPlanned extends LeafNode {
      override def output: Seq[Attribute] = Nil
    }

    var planned = 0
    object TestStrategy extends Strategy {
      def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
        case ReturnAnswer(child) =>
          planned += 1
          planLater(child) :: planLater(NeverPlanned) :: Nil
        case Union(children) =>
          planned += 1
          UnionExec(children.map(planLater)) :: planLater(NeverPlanned) :: Nil
        case LocalRelation(output, data) =>
          planned += 1
          LocalTableScanExec(output, data) :: planLater(NeverPlanned) :: Nil
        case NeverPlanned =>
          fail("QueryPlanner should not go down to this branch.")
        case _ => Nil
      }
    }

    try {
      spark.experimental.extraStrategies = TestStrategy :: Nil

      val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS())

      assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f"))
      assert(planned === 4)
    } finally {
      spark.experimental.extraStrategies = Nil
    }
  }
}
Example 14
Source File: CreateTableStrategy.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{DatasourceResolver, SQLContext, Strategy}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.{ExecutedCommand, SparkPlan}
import org.apache.spark.sql.sources.TemporaryAndPersistentNature

private[sql] case class CreateTableStrategy(sqlContext: SQLContext) extends Strategy {

  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    // Currently we only handle cases where the user wants to instantiate a persistent
    // relation; any other case has to be handled by the datasource itself.
    case CreateTableUsing(tableName, userSpecifiedSchema, provider, temporary,
                          options, allowExisting, _) =>
      DatasourceResolver.resolverFor(sqlContext).newInstanceOf(provider) match {
        case _: TemporaryAndPersistentNature =>
          ExecutedCommand(CreateTableUsingTemporaryAwareCommand(
            tableName,
            userSpecifiedSchema,
            Array.empty[String],
            None,
            None,
            provider,
            options,
            temporary,
            allowExisting)) :: Nil
        case _ => Nil
      }

    case CreateTablePartitionedByUsing(tableId, userSpecifiedSchema, provider,
                                       partitioningFunction, partitioningColumns, temporary,
                                       options, allowExisting, _) =>
      ResolvedDataSource.lookupDataSource(provider).newInstance() match {
        case _: TemporaryAndPersistentNature =>
          ExecutedCommand(CreateTableUsingTemporaryAwareCommand(
            tableId,
            userSpecifiedSchema,
            Array.empty[String],
            Some(partitioningFunction),
            Some(partitioningColumns),
            provider,
            options,
            isTemporary = false,
            allowExisting)) :: Nil
        case _ => Nil
      }

    case _ => Nil
  }
}
Example 15
Source File: HierarchyStrategy.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hierarchy

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution._
import org.apache.spark.sql.extension.ExtendedPlanner

private[sql] case class HierarchyStrategy(planner: ExtendedPlanner) extends Strategy {
  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case Hierarchy(
      a @ AdjacencyListHierarchySpec(source, _, parenthoodExp, startWhere, orderBy), node) =>
      AdjacencyListHierarchyPlan(
        planner.planLaterExt(source),
        parenthoodExp,
        startWhere,
        orderBy,
        node,
        a.pathDataType) :: Nil
    case Hierarchy(
      a @ LevelBasedHierarchySpec(source, levels, startWhere, orderBy, matcher), node) =>
      LevelHierarchyPlan(
        planner.planLaterExt(source),
        levels,
        startWhere,
        orderBy,
        matcher,
        node,
        a.pathDataType) :: Nil
    case _ => Nil
  }
}
Example 16
Source File: SparkPlannerSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, ReturnAnswer, Union}
import org.apache.spark.sql.test.SharedSQLContext

class SparkPlannerSuite extends SharedSQLContext {
  import testImplicits._

  test("Ensure to go down only the first branch, not any other possible branches") {
    case object NeverPlanned extends LeafNode {
      override def output: Seq[Attribute] = Nil
    }

    var planned = 0
    object TestStrategy extends Strategy {
      def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
        case ReturnAnswer(child) =>
          planned += 1
          planLater(child) :: planLater(NeverPlanned) :: Nil
        case Union(children) =>
          planned += 1
          UnionExec(children.map(planLater)) :: planLater(NeverPlanned) :: Nil
        case LocalRelation(output, data, _) =>
          planned += 1
          LocalTableScanExec(output, data) :: planLater(NeverPlanned) :: Nil
        case NeverPlanned =>
          fail("QueryPlanner should not go down to this branch.")
        case _ => Nil
      }
    }

    try {
      spark.experimental.extraStrategies = TestStrategy :: Nil

      val ds = Seq("a", "b", "c").toDS().union(Seq("d", "e", "f").toDS())

      assert(ds.collect().toSeq === Seq("a", "b", "c", "d", "e", "f"))
      assert(planned === 4)
    } finally {
      spark.experimental.extraStrategies = Nil
    }
  }
}
Example 17
Source File: DataSourceV2Strategy.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2

import scala.collection.mutable

import org.apache.spark.sql.{sources, Strategy}
import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet, Expression}
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, Repartition}
import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan}
import org.apache.spark.sql.execution.datasources.DataSourceStrategy
import org.apache.spark.sql.execution.streaming.continuous.{ContinuousCoalesceExec, WriteToContinuousDataSource, WriteToContinuousDataSourceExec}
import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, SupportsPushDownFilters, SupportsPushDownRequiredColumns}
import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader

object DataSourceV2Strategy extends Strategy {

  // TODO: nested column pruning.
  private def pruneColumns(
      reader: DataSourceReader,
      relation: DataSourceV2Relation,
      exprs: Seq[Expression]): Seq[AttributeReference] = {
    reader match {
      case r: SupportsPushDownRequiredColumns =>
        val requiredColumns = AttributeSet(exprs.flatMap(_.references))
        val neededOutput = relation.output.filter(requiredColumns.contains)
        if (neededOutput != relation.output) {
          r.pruneColumns(neededOutput.toStructType)
          val nameToAttr = relation.output.map(_.name).zip(relation.output).toMap
          r.readSchema().toAttributes.map {
            // We have to keep the attribute id during transformation.
            a => a.withExprId(nameToAttr(a.name).exprId)
          }
        } else {
          relation.output
        }

      case _ => relation.output
    }
  }

  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case PhysicalOperation(project, filters, relation: DataSourceV2Relation) =>
      val reader = relation.newReader()
      // `pushedFilters` will be pushed down and evaluated in the underlying data sources.
      // `postScanFilters` need to be evaluated after the scan.
      // `postScanFilters` and `pushedFilters` can overlap, e.g. the parquet row group filter.
      // (The private pushFilters helper used here is defined elsewhere in the same file and
      // is not included in this excerpt.)
      val (pushedFilters, postScanFilters) = pushFilters(reader, filters)
      val output = pruneColumns(reader, relation, project ++ postScanFilters)
      logInfo(
        s"""
           |Pushing operators to ${relation.source.getClass}
           |Pushed Filters: ${pushedFilters.mkString(", ")}
           |Post-Scan Filters: ${postScanFilters.mkString(",")}
           |Output: ${output.mkString(", ")}
         """.stripMargin)

      val scan = DataSourceV2ScanExec(
        output, relation.source, relation.options, pushedFilters, reader)

      val filterCondition = postScanFilters.reduceLeftOption(And)
      val withFilter = filterCondition.map(FilterExec(_, scan)).getOrElse(scan)

      // always add the projection, which will produce unsafe rows required by some operators
      ProjectExec(project, withFilter) :: Nil

    case r: StreamingDataSourceV2Relation =>
      // ensure there is a projection, which will produce unsafe rows required by some operators
      ProjectExec(r.output,
        DataSourceV2ScanExec(r.output, r.source, r.options, r.pushedFilters, r.reader)) :: Nil

    case WriteToDataSourceV2(writer, query) =>
      WriteToDataSourceV2Exec(writer, planLater(query)) :: Nil

    case AppendData(r: DataSourceV2Relation, query, _) =>
      WriteToDataSourceV2Exec(r.newWriter(), planLater(query)) :: Nil

    case WriteToContinuousDataSource(writer, query) =>
      WriteToContinuousDataSourceExec(writer, planLater(query)) :: Nil

    case Repartition(1, false, child) =>
      val isContinuous = child.collectFirst {
        case StreamingDataSourceV2Relation(_, _, _, r: ContinuousReader) => r
      }.isDefined

      if (isContinuous) {
        ContinuousCoalesceExec(1, planLater(child)) :: Nil
      } else {
        Nil
      }

    case _ => Nil
  }
}