org.apache.spark.sql.catalyst.plans.logical.Filter Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.plans.logical.Filter, the Catalyst logical-plan node that represents a row-level predicate. Each example is a complete source file from an open-source project, followed by its project of origin and license.
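Before the project examples, here is a minimal standalone sketch (our own addition, assuming spark-catalyst 2.x on the classpath) of what a Filter node is: a unary logical operator holding a boolean condition over its child's output. The Catalyst DSL used by several suites below builds one directly:

// A minimal sketch: constructing a Filter node by hand with the Catalyst DSL.
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation}

object FilterNodeSketch {
  def main(args: Array[String]): Unit = {
    val a = 'a.int                       // AttributeReference named "a" of type Int
    val relation = LocalRelation(a)      // leaf node with a single column
    val plan = Filter(a > 1, relation)   // Filter(condition, child)
    println(plan.numberedTreeString)     // Filter sits on top of LocalRelation
  }
}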
Example 1
Source File: RangerSparkRowFilterExtensionTest.scala From spark-ranger with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.hive.test.TestHive
import org.scalatest.FunSuite
import org.apache.spark.sql.RangerSparkTestUtils._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, RangerSparkRowFilter}

class RangerSparkRowFilterExtensionTest extends FunSuite {

  private val spark = TestHive.sparkSession

  test("ranger spark row filter extension") {
    val extension = RangerSparkRowFilterExtension(spark)
    val plan = spark.sql("select * from src").queryExecution.optimizedPlan
    println(plan)
    withUser("bob") {
      val newPlan = extension.apply(plan)
      assert(newPlan.isInstanceOf[RangerSparkRowFilter])
      val filters = newPlan.collect { case f: Filter => f }
      assert(filters.nonEmpty, "ranger row level filters should be applied automatically")
      println(newPlan)
    }
    withUser("alice") {
      val newPlan = extension.apply(plan)
      assert(newPlan.isInstanceOf[RangerSparkRowFilter])
      val filters = newPlan.collect { case f: Filter => f }
      assert(filters.isEmpty, "alice does not have implicit filters")
      println(newPlan)
    }
  }
}
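The test applies the extension by hand; in a real session such a Rule[LogicalPlan] is wired in through SparkSessionExtensions. A hedged sketch of that wiring (MyRowFilterRule is a hypothetical no-op stand-in, not the actual Ranger rule):

import org.apache.spark.sql.{SparkSession, SparkSessionExtensions}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule

// Hypothetical placeholder; a real row-filter rule would rewrite the plan.
case class MyRowFilterRule(spark: SparkSession) extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan
}

object ExtensionWiringSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .withExtensions(_.injectOptimizerRule(MyRowFilterRule))
      .getOrCreate()
    spark.range(5).where("id > 1").show()  // the rule now sees every optimized plan
    spark.stop()
  }
}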
Example 2
Source File: SimbaOptimizer.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba

import org.apache.spark.sql.ExperimentalMethods
import org.apache.spark.sql.catalyst.catalog.SessionCatalog
import org.apache.spark.sql.catalyst.expressions.{And, Expression, PredicateHelper}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.SparkOptimizer
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.simba.plans.SpatialJoin

class SimbaOptimizer(catalog: SessionCatalog,
                     conf: SQLConf,
                     experimentalMethods: ExperimentalMethods)
  extends SparkOptimizer(catalog, conf, experimentalMethods) {
  override def batches: Seq[Batch] = super.batches :+
    Batch("SpatialJoinPushDown", FixedPoint(100), PushPredicateThroughSpatialJoin)
}

object PushPredicateThroughSpatialJoin extends Rule[LogicalPlan] with PredicateHelper {
  private def split(condition: Seq[Expression], left: LogicalPlan, right: LogicalPlan) = {
    val (leftEvaluateCondition, rest) =
      condition.partition(_.references subsetOf left.outputSet)
    val (rightEvaluateCondition, commonCondition) =
      rest.partition(_.references subsetOf right.outputSet)
    (leftEvaluateCondition, rightEvaluateCondition, commonCondition)
  }

  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    // push the where condition down into join filter
    case f @ Filter(filterCondition, SpatialJoin(left, right, joinType, joinCondition)) =>
      val (leftFilterConditions, rightFilterConditions, commonFilterCondition) =
        split(splitConjunctivePredicates(filterCondition), left, right)
      val newLeft = leftFilterConditions.reduceLeftOption(And).map(Filter(_, left)).getOrElse(left)
      val newRight = rightFilterConditions.reduceLeftOption(And).map(Filter(_, right)).getOrElse(right)
      val newJoinCond = (commonFilterCondition ++ joinCondition).reduceLeftOption(And)
      SpatialJoin(newLeft, newRight, joinType, newJoinCond)

    // push down the join filter into sub query scanning if applicable
    case f @ SpatialJoin(left, right, joinType, joinCondition) =>
      val (leftJoinConditions, rightJoinConditions, commonJoinCondition) =
        split(joinCondition.map(splitConjunctivePredicates).getOrElse(Nil), left, right)
      val newLeft = leftJoinConditions.reduceLeftOption(And).map(Filter(_, left)).getOrElse(left)
      val newRight = rightJoinConditions.reduceLeftOption(And).map(Filter(_, right)).getOrElse(right)
      val newJoinCond = commonJoinCondition.reduceLeftOption(And)
      SpatialJoin(newLeft, newRight, joinType, newJoinCond)
  }
}
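A standalone sketch of the split logic above (our own illustration, assuming spark-catalyst on the classpath): a conjunctive condition is broken into conjuncts, which are then partitioned by which side of the join can evaluate them:

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.PredicateHelper
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation

object SplitSketch extends PredicateHelper {
  def main(args: Array[String]): Unit = {
    val a = 'a.int
    val b = 'b.int
    val left = LocalRelation(a)
    val right = LocalRelation(b)
    // (a > 1) AND (b > 2) AND (a = b), built as one conjunctive condition
    val condition = (a > 1) && (b > 2) && (a === b)
    val conjuncts = splitConjunctivePredicates(condition)
    val (leftOnly, rest) = conjuncts.partition(_.references.subsetOf(left.outputSet))
    val (rightOnly, common) = rest.partition(_.references.subsetOf(right.outputSet))
    println(s"left: $leftOnly, right: $rightOnly, join condition: $common")
  }
}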
Example 3
Source File: HiveAcidAutoConvert.scala From spark-acid with Apache License 2.0
package com.qubole.spark.hiveacid

import java.util.Locale

import com.qubole.spark.datasources.hiveacid.sql.execution.SparkAcidSqlParser
import org.apache.spark.sql.{SparkSession, SparkSessionExtensions}
import org.apache.spark.sql.catalyst.catalog.HiveTableRelation
import org.apache.spark.sql.catalyst.plans.logical.{Filter, InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.command.DDLUtils
import org.apache.spark.sql.execution.datasources.LogicalRelation
import com.qubole.spark.hiveacid.datasource.HiveAcidDataSource

case class HiveAcidAutoConvert(spark: SparkSession) extends Rule[LogicalPlan] {

  private def isConvertible(relation: HiveTableRelation): Boolean = {
    val serde = relation.tableMeta.storage.serde.getOrElse("").toLowerCase(Locale.ROOT)
    relation.tableMeta.properties.getOrElse("transactional", "false").toBoolean
  }

  private def convert(relation: HiveTableRelation): LogicalRelation = {
    val options = relation.tableMeta.properties ++
      relation.tableMeta.storage.properties ++ Map("table" -> relation.tableMeta.qualifiedName)
    val newRelation = new HiveAcidDataSource().createRelation(spark.sqlContext, options)
    LogicalRelation(newRelation, isStreaming = false)
  }

  override def apply(plan: LogicalPlan): LogicalPlan = {
    plan resolveOperators {
      // Write path
      case InsertIntoTable(r: HiveTableRelation, partition, query, overwrite, ifPartitionNotExists)
        if query.resolved && DDLUtils.isHiveTable(r.tableMeta) && isConvertible(r) =>
        InsertIntoTable(convert(r), partition, query, overwrite, ifPartitionNotExists)

      // Read path
      case relation: HiveTableRelation
        if DDLUtils.isHiveTable(relation.tableMeta) && isConvertible(relation) =>
        convert(relation)
    }
  }
}

class HiveAcidAutoConvertExtension extends (SparkSessionExtensions => Unit) {
  def apply(extension: SparkSessionExtensions): Unit = {
    extension.injectResolutionRule(HiveAcidAutoConvert.apply)
    extension.injectParser { (session, parser) =>
      SparkAcidSqlParser(parser)
    }
  }
}
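A hedged usage sketch: extension classes like HiveAcidAutoConvertExtension are normally loaded through Spark's spark.sql.extensions setting (this assumes the spark-acid jar is on the classpath; the exact builder options vary by deployment):

import org.apache.spark.sql.SparkSession

object AcidExtensionUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .config("spark.sql.extensions",
        "com.qubole.spark.hiveacid.HiveAcidAutoConvertExtension")
      .enableHiveSupport()
      .getOrCreate()
    // Reads and writes on transactional Hive tables now go through HiveAcidDataSource.
    spark.stop()
  }
}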
Example 4
Source File: SampleStratifiedOn.scala From mimir with Apache License 2.0
package mimir.algebra.sampling

import org.apache.spark.sql.functions.{rand, udf, col}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Filter}
import play.api.libs.json._
import mimir.algebra._
import mimir.exec.spark.RAToSpark
import mimir.serialization.{Json => MimirJson}

case class SampleStratifiedOn(column: ID, t: Type, strata: Map[PrimitiveValue, Double])
  extends SamplingMode {

  val sparkStrata =
    strata.map { case (v, p) => RAToSpark.getNative(v, t) -> p }
          .toMap

  override def toString =
    s"ON $column WITH STRATA ${strata.map { case (v, p) => s"$v -> $p" }.mkString(" | ")}"

  def apply(plan: LogicalPlan, seed: Long): LogicalPlan = {
    // Adapted from Spark's df.stat.sampleBy method
    val c = col(column.id)
    val r = rand(seed)
    val f = udf { (stratum: Any, x: Double) => x < sparkStrata.getOrElse(stratum, 0.0) }
    Filter(f(c, r).expr, plan)
  }

  def expressions: Seq[Expression] = Seq(Var(column))

  def rebuildExpressions(x: Seq[Expression]): SamplingMode = {
    x(0) match {
      case Var(newColumn) => SampleStratifiedOn(newColumn, t, strata)
      case _ => throw new RAException(
        "Internal Error: Rewriting stratification variable with arbitrary expression")
    }
  }

  def toJson: JsValue = JsObject(Map[String, JsValue](
    "mode" -> JsString(SampleStratifiedOn.MODE),
    "column" -> JsString(column.id),
    "type" -> MimirJson.ofType(t),
    "strata" -> JsArray(
      strata
        .toSeq
        .map { case (v, p) =>
          JsObject(Map[String, JsValue](
            "value" -> MimirJson.ofPrimitive(v),
            "probability" -> JsNumber(p)
          ))
        }
    )
  ))
}

object SampleStratifiedOn {
  val MODE = "stratified_on"

  def parseJson(json: Map[String, JsValue]): Option[SampleStratifiedOn] = {
    if (json("mode").as[String].equals(MODE)) {
      val t = MimirJson.toType(json("type"))
      Some(SampleStratifiedOn(
        ID(json("column").as[String]),
        t,
        json("strata")
          .as[Seq[Map[String, JsValue]]]
          .map { stratum =>
            MimirJson.toPrimitive(t, stratum("value")) -> stratum("probability").as[Double]
          }
          .toMap
      ))
    } else {
      None
    }
  }
}
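For comparison, Spark's public df.stat.sampleBy API performs the same per-stratum filtering that SampleStratifiedOn builds by hand; a short sketch with made-up strata fractions:

import org.apache.spark.sql.SparkSession

object SampleBySketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").getOrCreate()
    val df = spark.range(100).selectExpr("id % 3 as stratum", "id")
    // Keep probability per stratum value, mirroring the strata map above.
    val fractions = Map(0L -> 0.1, 1L -> 0.5, 2L -> 0.9)
    df.stat.sampleBy("stratum", fractions, seed = 36L)
      .groupBy("stratum").count().show()
    spark.stop()
  }
}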
Example 5
Source File: SampleRowsUniformly.scala From mimir with Apache License 2.0
package mimir.algebra.sampling

import org.apache.spark.sql.functions.{rand, udf}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Filter}
import play.api.libs.json._
import mimir.algebra._

case class SampleRowsUniformly(probability: Double) extends SamplingMode {
  override def toString = s"WITH PROBABILITY $probability"

  def apply(plan: LogicalPlan, seed: Long): LogicalPlan = {
    // Adapted from Spark's df.stat.sampleBy method
    val r = rand(seed)
    val f = udf { (x: Double) => x < probability }
    Filter(f(r).expr, plan)
  }

  def expressions: Seq[Expression] = Seq()
  def rebuildExpressions(x: Seq[Expression]): SamplingMode = this

  def toJson: JsValue = JsObject(Map[String, JsValue](
    "mode" -> JsString(SampleRowsUniformly.MODE),
    "probability" -> JsNumber(probability)
  ))
}

object SampleRowsUniformly {
  val MODE = "uniform_probability"

  def parseJson(json: Map[String, JsValue]): Option[SampleRowsUniformly] = {
    if (json("mode").as[String].equals(MODE)) {
      Some(SampleRowsUniformly(json("probability").as[Double]))
    } else {
      None
    }
  }
}
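A standalone sketch of the same idea (our own illustration): a Filter over rand(seed) keeps each row with the given probability, and the public where(rand(seed) < p) API builds an equivalent plan:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.plans.logical.Filter
import org.apache.spark.sql.functions.{rand, udf}

object UniformSampleSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").getOrCreate()
    val probability = 0.3
    val keep = udf { (x: Double) => x < probability }

    // The node SampleRowsUniformly constructs: a Filter over rand(seed).
    val child = spark.range(1000).queryExecution.logical
    println(Filter(keep(rand(42L)).expr, child))

    // The equivalent public DataFrame API; roughly 300 of 1000 rows survive.
    println(spark.range(1000).where(rand(42L) < probability).count())
    spark.stop()
  }
}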
Example 6
Source File: PruneFileSourcePartitions.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.catalog.CatalogStatistics
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule

private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
    case op @ PhysicalOperation(projects, filters,
        logicalRelation @
          LogicalRelation(fsRelation @
            HadoopFsRelation(
              catalogFileIndex: CatalogFileIndex,
              partitionSchema, _, _, _, _),
            _, _, _))
        if filters.nonEmpty && fsRelation.partitionSchemaOption.isDefined =>
      // An attribute in a predicate may differ from the schema attribute only by
      // case; rename it to match the schema so we no longer need to worry about
      // case sensitivity.
      val normalizedFilters = filters.map { e =>
        e transform {
          case a: AttributeReference =>
            a.withName(logicalRelation.output.find(_.semanticEquals(a)).get.name)
        }
      }

      val sparkSession = fsRelation.sparkSession
      val partitionColumns = logicalRelation.resolve(
        partitionSchema, sparkSession.sessionState.analyzer.resolver)
      val partitionSet = AttributeSet(partitionColumns)
      val partitionKeyFilters = ExpressionSet(normalizedFilters
        .filterNot(SubqueryExpression.hasSubquery(_))
        .filter(_.references.subsetOf(partitionSet)))

      if (partitionKeyFilters.nonEmpty) {
        val prunedFileIndex = catalogFileIndex.filterPartitions(partitionKeyFilters.toSeq)
        val prunedFsRelation = fsRelation.copy(location = prunedFileIndex)(sparkSession)
        // Change table stats based on the sizeInBytes of pruned files
        val withStats = logicalRelation.catalogTable.map(_.copy(
          stats = Some(CatalogStatistics(sizeInBytes = BigInt(prunedFileIndex.sizeInBytes)))))
        val prunedLogicalRelation = logicalRelation.copy(
          relation = prunedFsRelation, catalogTable = withStats)
        // Keep partition-pruning predicates so that they are visible in physical planning
        val filterExpression = filters.reduceLeft(And)
        val filter = Filter(filterExpression, prunedLogicalRelation)
        Project(projects, filter)
      } else {
        op
      }
  }
}
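A minimal sketch of the PhysicalOperation extractor this rule matches on (our own illustration): it collapses any chain of Project and Filter nodes above a leaf into (projects, filters, leaf):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.planning.PhysicalOperation

object PhysicalOperationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").getOrCreate()
    val plan = spark.range(10).where("id > 3").selectExpr("id")
      .queryExecution.optimizedPlan
    plan match {
      case PhysicalOperation(projects, filters, leaf) =>
        println(s"projects=$projects filters=$filters leaf=${leaf.nodeName}")
      case _ =>
        println("no project/filter chain matched")
    }
    spark.stop()
  }
}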
Example 7
Source File: ResolveSubquerySuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{In, ListQuery, OuterReference}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, Project}

class ResolveSubquerySuite extends AnalysisTest {

  val a = 'a.int
  val b = 'b.int
  val t1 = LocalRelation(a)
  val t2 = LocalRelation(b)

  test("SPARK-17251 Improve `OuterReference` to be `NamedExpression`") {
    val expr = Filter(In(a, Seq(ListQuery(Project(Seq(UnresolvedAttribute("a")), t2)))), t1)
    val m = intercept[AnalysisException] {
      SimpleAnalyzer.checkAnalysis(SimpleAnalyzer.ResolveSubquery(expr))
    }.getMessage
    assert(m.contains(
      "Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses"))
  }
}
Example 8
Source File: PruneFileSourcePartitionsSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.StructType

class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("PruneFileSourcePartitions", Once, PruneFileSourcePartitions) :: Nil
  }

  test("PruneFileSourcePartitions should not change the output of LogicalRelation") {
    withTable("test") {
      withTempDir { dir =>
        sql(
          s"""
            |CREATE EXTERNAL TABLE test(i int)
            |PARTITIONED BY (p int)
            |STORED AS parquet
            |LOCATION '${dir.toURI}'""".stripMargin)

        val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test")
        val catalogFileIndex = new CatalogFileIndex(spark, tableMeta, 0)

        val dataSchema = StructType(tableMeta.schema.filterNot { f =>
          tableMeta.partitionColumnNames.contains(f.name)
        })
        val relation = HadoopFsRelation(
          location = catalogFileIndex,
          partitionSchema = tableMeta.partitionSchema,
          dataSchema = dataSchema,
          bucketSpec = None,
          fileFormat = new ParquetFileFormat(),
          options = Map.empty)(sparkSession = spark)

        val logicalRelation = LogicalRelation(relation, tableMeta)
        val query = Project(Seq('i, 'p), Filter('p === 1, logicalRelation)).analyze

        val optimized = Optimize.execute(query)
        assert(optimized.missingInput.isEmpty)
      }
    }
  }

  test("SPARK-20986 Reset table's statistics after PruneFileSourcePartitions rule") {
    withTable("tbl") {
      spark.range(10).selectExpr("id", "id % 3 as p").write.partitionBy("p").saveAsTable("tbl")
      sql(s"ANALYZE TABLE tbl COMPUTE STATISTICS")
      val tableStats = spark.sessionState.catalog.getTableMetadata(TableIdentifier("tbl")).stats
      assert(tableStats.isDefined && tableStats.get.sizeInBytes > 0, "tableStats is lost")

      val df = sql("SELECT * FROM tbl WHERE p = 1")
      val sizes1 = df.queryExecution.analyzed.collect {
        case relation: LogicalRelation => relation.catalogTable.get.stats.get.sizeInBytes
      }
      assert(sizes1.size === 1, s"Size wrong for:\n ${df.queryExecution}")
      assert(sizes1(0) == tableStats.get.sizeInBytes)
      val relations = df.queryExecution.optimizedPlan.collect {
        case relation: LogicalRelation => relation
      }
      assert(relations.size === 1, s"Size wrong for:\n ${df.queryExecution}")
      val size2 = relations(0).stats.sizeInBytes
      assert(size2 == relations(0).catalogTable.get.stats.get.sizeInBytes)
      assert(size2 < tableStats.get.sizeInBytes)
    }
  }
}
Example 9
Source File: PruneFileSourcePartitions.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule

private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
    case op @ PhysicalOperation(projects, filters,
        logicalRelation @
          LogicalRelation(fsRelation @
            HadoopFsRelation(
              catalogFileIndex: CatalogFileIndex,
              partitionSchema, _, _, _, _),
            _, _))
        if filters.nonEmpty && fsRelation.partitionSchemaOption.isDefined =>
      // An attribute in a predicate may differ from the schema attribute only by
      // case; rename it to match the schema so we no longer need to worry about
      // case sensitivity.
      val normalizedFilters = filters.map { e =>
        e transform {
          case a: AttributeReference =>
            a.withName(logicalRelation.output.find(_.semanticEquals(a)).get.name)
        }
      }

      val sparkSession = fsRelation.sparkSession
      val partitionColumns = logicalRelation.resolve(
        partitionSchema, sparkSession.sessionState.analyzer.resolver)
      val partitionSet = AttributeSet(partitionColumns)
      val partitionKeyFilters =
        ExpressionSet(normalizedFilters.filter(_.references.subsetOf(partitionSet)))

      if (partitionKeyFilters.nonEmpty) {
        val prunedFileIndex = catalogFileIndex.filterPartitions(partitionKeyFilters.toSeq)
        val prunedFsRelation = fsRelation.copy(location = prunedFileIndex)(sparkSession)
        val prunedLogicalRelation = logicalRelation.copy(
          relation = prunedFsRelation,
          expectedOutputAttributes = Some(logicalRelation.output))

        // Keep partition-pruning predicates so that they are visible in physical planning
        val filterExpression = filters.reduceLeft(And)
        val filter = Filter(filterExpression, prunedLogicalRelation)
        Project(projects, filter)
      } else {
        op
      }
  }
}
Example 10
Source File: ResolveSubquerySuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{In, ListQuery, OuterReference}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, Project}

class ResolveSubquerySuite extends AnalysisTest {

  val a = 'a.int
  val b = 'b.int
  val t1 = LocalRelation(a)
  val t2 = LocalRelation(b)

  test("SPARK-17251 Improve `OuterReference` to be `NamedExpression`") {
    val expr = Filter(In(a, Seq(ListQuery(Project(Seq(OuterReference(a)), t2)))), t1)
    val m = intercept[AnalysisException] {
      SimpleAnalyzer.ResolveSubquery(expr)
    }.getMessage
    assert(m.contains(
      "Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses"))
  }
}
Example 11
Source File: PruneFileSourcePartitionsSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.StructType

class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("PruneFileSourcePartitions", Once, PruneFileSourcePartitions) :: Nil
  }

  test("PruneFileSourcePartitions should not change the output of LogicalRelation") {
    withTable("test") {
      withTempDir { dir =>
        sql(
          s"""
            |CREATE EXTERNAL TABLE test(i int)
            |PARTITIONED BY (p int)
            |STORED AS parquet
            |LOCATION '${dir.getAbsolutePath}'""".stripMargin)

        val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test")
        val catalogFileIndex = new CatalogFileIndex(spark, tableMeta, 0)

        val dataSchema = StructType(tableMeta.schema.filterNot { f =>
          tableMeta.partitionColumnNames.contains(f.name)
        })
        val relation = HadoopFsRelation(
          location = catalogFileIndex,
          partitionSchema = tableMeta.partitionSchema,
          dataSchema = dataSchema,
          bucketSpec = None,
          fileFormat = new ParquetFileFormat(),
          options = Map.empty)(sparkSession = spark)

        val logicalRelation = LogicalRelation(relation, catalogTable = Some(tableMeta))
        val query = Project(Seq('i, 'p), Filter('p === 1, logicalRelation)).analyze

        val optimized = Optimize.execute(query)
        assert(optimized.missingInput.isEmpty)
      }
    }
  }
}
Example 12
Source File: PlanningTest.scala From spark-druid-olap with Apache License 2.0
package org.apache.spark.sql.sources.druid.test

import java.util.TimeZone

import com.github.nscala_time.time.Imports._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.expressions.{Expression, PredicateHelper}
import org.apache.spark.sql.catalyst.plans.logical.Filter
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.hive.test.sparklinedata.TestHive
import org.apache.spark.sql.sources.druid.DruidPlanner
import org.scalatest.BeforeAndAfterEach
import org.sparklinedata.druid._
import org.sparklinedata.druid.client.test.BaseTest
import org.sparklinedata.druid.metadata.DruidRelationInfo

trait PlanningTestHelper extends PredicateHelper {
  System.setProperty("user.timezone", "UTC")
  TimeZone.setDefault(TimeZone.getTimeZone("UTC"))

  override def splitConjunctivePredicates(condition: Expression): Seq[Expression] = {
    super.splitConjunctivePredicates(condition)
  }
}

abstract class PlanningTest extends BaseTest with BeforeAndAfterEach with PlanningTestHelper {

  val dPlanner = new DruidPlanner(TestHive)
  var tab: DataFrame = _
  var drInfo: DruidRelationInfo = _
  var dqb: DruidQueryBuilder = _
  var iCE: IntervalConditionExtractor = _
  var iCE2: SparkIntervalConditionExtractor = _

  override def beforeAll() = {
    super.beforeAll()
    tab = TestHive.table("orderLineItemPartSupplier")
    drInfo = tab.queryExecution.optimizedPlan.
      asInstanceOf[LogicalRelation].relation.asInstanceOf[DruidRelation].info
  }

  override protected def beforeEach(): Unit = {
    dqb = DruidQueryBuilder(drInfo)
    iCE = new IntervalConditionExtractor(dqb)
    iCE2 = new SparkIntervalConditionExtractor(dqb)
  }

  def validateFilter(filterStr: String,
                     pushedToDruid: Boolean = true,
                     filSpec: Option[FilterSpec] = None,
                     intervals: List[Interval] = List()): Unit = {
    val q = tab.where(filterStr)
    val filter = q.queryExecution.optimizedPlan.asInstanceOf[Filter]
    val dqbs = dPlanner.translateProjectFilter(
      Some(dqb),
      Seq(),
      splitConjunctivePredicates(filter.condition),
      true
    )
    if (pushedToDruid) {
      assert(dqbs.size == 1)
      val odqb = dqbs(0)
      assert(odqb.filterSpec == filSpec)
      assert(odqb.queryIntervals.intervals == intervals)
    }
  }
}
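validateFilter above casts the optimized plan to Filter directly; a small sketch of a safer extraction with collectFirst (our own illustration):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.plans.logical.Filter

object TopFilterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").getOrCreate()
    val q = spark.range(10).where("id > 3")
    // collectFirst yields None instead of throwing when no Filter survives optimization.
    val condition = q.queryExecution.optimizedPlan.collectFirst {
      case Filter(cond, _) => cond
    }
    println(condition)
    spark.stop()
  }
}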
Example 13
Source File: PruneFileSourcePartitionsSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions, TableFileCatalog}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.StructType

class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("PruneFileSourcePartitions", Once, PruneFileSourcePartitions) :: Nil
  }

  test("PruneFileSourcePartitions should not change the output of LogicalRelation") {
    withTable("test") {
      withTempDir { dir =>
        sql(
          s"""
            |CREATE EXTERNAL TABLE test(i int)
            |PARTITIONED BY (p int)
            |STORED AS parquet
            |LOCATION '${dir.getAbsolutePath}'""".stripMargin)

        val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test")
        val tableFileCatalog = new TableFileCatalog(spark, tableMeta, 0)

        val dataSchema = StructType(tableMeta.schema.filterNot { f =>
          tableMeta.partitionColumnNames.contains(f.name)
        })
        val relation = HadoopFsRelation(
          location = tableFileCatalog,
          partitionSchema = tableMeta.partitionSchema,
          dataSchema = dataSchema,
          bucketSpec = None,
          fileFormat = new ParquetFileFormat(),
          options = Map.empty)(sparkSession = spark)

        val logicalRelation = LogicalRelation(relation, catalogTable = Some(tableMeta))
        val query = Project(Seq('i, 'p), Filter('p === 1, logicalRelation)).analyze

        val optimized = Optimize.execute(query)
        assert(optimized.missingInput.isEmpty)
      }
    }
  }
}
Example 14
Source File: CarbonUDFTransformRule.scala From carbondata with Apache License 2.0
package org.apache.spark.sql.optimizer

import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, PredicateHelper, ScalaUDF}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, Join, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.types.StringType

import org.apache.carbondata.core.constants.CarbonCommonConstants

class CarbonUDFTransformRule extends Rule[LogicalPlan] with PredicateHelper {
  override def apply(plan: LogicalPlan): LogicalPlan = {
    pushDownUDFToJoinLeftRelation(plan)
  }

  private def pushDownUDFToJoinLeftRelation(plan: LogicalPlan): LogicalPlan = {
    val output = plan.transform {
      case proj @ Project(cols, Join(
          left, right, jointype: org.apache.spark.sql.catalyst.plans.JoinType, condition)) =>
        var projectionToBeAdded: Seq[org.apache.spark.sql.catalyst.expressions.Alias] = Seq.empty
        var udfExists = false
        val newCols = cols.map {
          case a @ Alias(s: ScalaUDF, name)
            if name.equalsIgnoreCase(CarbonCommonConstants.POSITION_ID) ||
               name.equalsIgnoreCase(CarbonCommonConstants.CARBON_IMPLICIT_COLUMN_TUPLEID) =>
            udfExists = true
            projectionToBeAdded :+= a
            AttributeReference(name, StringType, nullable = true)().withExprId(a.exprId)
          case other => other
        }
        if (udfExists) {
          val newLeft = left match {
            case Project(columns, logicalPlan) =>
              Project(columns ++ projectionToBeAdded, logicalPlan)
            case filter: Filter =>
              Project(filter.output ++ projectionToBeAdded, filter)
            case relation: LogicalRelation =>
              Project(relation.output ++ projectionToBeAdded, relation)
            case other => other
          }
          Project(newCols, Join(newLeft, right, jointype, condition))
        } else {
          proj
        }
      case other => other
    }
    output
  }
}
Example 15
Source File: IUDCommonUtil.scala From carbondata with Apache License 2.0
package org.apache.spark.sql.execution.command.mutation

import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.hive.HiveSessionCatalog

import org.apache.carbondata.common.exceptions.sql.MalformedCarbonCommandException
import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.util.CarbonProperties

// The snippet began mid-file; the enclosing object is restored here.
object IUDCommonUtil {

  def checkIfSegmentListIsSet(sparkSession: SparkSession, logicalPlan: LogicalPlan): Unit = {
    val carbonProperties = CarbonProperties.getInstance()
    logicalPlan.foreach {
      case unresolvedRelation: UnresolvedRelation =>
        val dbAndTb =
          sparkSession.sessionState.catalog.asInstanceOf[HiveSessionCatalog].getCurrentDatabase +
          "." + unresolvedRelation.tableIdentifier.table
        val segmentProperties = carbonProperties
          .getProperty(CarbonCommonConstants.CARBON_INPUT_SEGMENTS + dbAndTb, "")
        if (!(segmentProperties.equals("") || segmentProperties.trim.equals("*"))) {
          throw new MalformedCarbonCommandException("carbon.input.segments." + dbAndTb +
            "should not be set for table used in DELETE " +
            "query. Please reset the property to carbon" +
            ".input.segments." + dbAndTb + "=*")
        }
      case logicalRelation: LogicalRelation
        if (logicalRelation.relation.isInstanceOf[CarbonDatasourceHadoopRelation]) =>
        val dbAndTb =
          logicalRelation.relation.asInstanceOf[CarbonDatasourceHadoopRelation].carbonTable
            .getDatabaseName + "." +
          logicalRelation.relation.asInstanceOf[CarbonDatasourceHadoopRelation].carbonTable
            .getTableName
        val sementProperty = carbonProperties
          .getProperty(CarbonCommonConstants.CARBON_INPUT_SEGMENTS + dbAndTb, "")
        if (!(sementProperty.equals("") || sementProperty.trim.equals("*"))) {
          throw new MalformedCarbonCommandException("carbon.input.segments." + dbAndTb +
            "should not be set for table used in UPDATE " +
            "query. Please reset the property to carbon" +
            ".input.segments." + dbAndTb + "=*")
        }
      case filter: Filter =>
        filter.subqueries.toList
          .foreach(subquery => checkIfSegmentListIsSet(sparkSession, subquery))
      case _ =>
    }
  }
}
Example 16
Source File: PruneFileSourcePartitions.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule

private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
    case op @ PhysicalOperation(projects, filters,
        logicalRelation @
          LogicalRelation(fsRelation @
            HadoopFsRelation(
              catalogFileIndex: CatalogFileIndex,
              partitionSchema, _, _, _, _),
            _, _))
        if filters.nonEmpty && fsRelation.partitionSchemaOption.isDefined =>
      // An attribute in a predicate may differ from the schema attribute only by
      // case; rename it to match the schema so we no longer need to worry about
      // case sensitivity.
      val normalizedFilters = filters.map { e =>
        e transform {
          case a: AttributeReference =>
            a.withName(logicalRelation.output.find(_.semanticEquals(a)).get.name)
        }
      }

      val sparkSession = fsRelation.sparkSession
      val partitionColumns = logicalRelation.resolve(
        partitionSchema, sparkSession.sessionState.analyzer.resolver)
      val partitionSet = AttributeSet(partitionColumns)
      val partitionKeyFilters =
        ExpressionSet(normalizedFilters.filter(_.references.subsetOf(partitionSet)))

      if (partitionKeyFilters.nonEmpty) {
        val prunedFileIndex = catalogFileIndex.filterPartitions(partitionKeyFilters.toSeq)
        val prunedFsRelation = fsRelation.copy(location = prunedFileIndex)(sparkSession)
        val prunedLogicalRelation = logicalRelation.copy(
          relation = prunedFsRelation,
          expectedOutputAttributes = Some(logicalRelation.output))

        // Keep partition-pruning predicates so that they are visible in physical planning
        val filterExpression = filters.reduceLeft(And)
        val filter = Filter(filterExpression, prunedLogicalRelation)
        Project(projects, filter)
      } else {
        op
      }
  }
}
Example 17
Source File: ResolveSubquerySuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{In, ListQuery, OuterReference}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, Project}

class ResolveSubquerySuite extends AnalysisTest {

  val a = 'a.int
  val b = 'b.int
  val t1 = LocalRelation(a)
  val t2 = LocalRelation(b)

  test("SPARK-17251 Improve `OuterReference` to be `NamedExpression`") {
    val expr = Filter(In(a, Seq(ListQuery(Project(Seq(OuterReference(a)), t2)))), t1)
    val m = intercept[AnalysisException] {
      SimpleAnalyzer.ResolveSubquery(expr)
    }.getMessage
    assert(m.contains(
      "Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses"))
  }
}
Example 18
Source File: PruneFileSourcePartitionsSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.StructType

class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("PruneFileSourcePartitions", Once, PruneFileSourcePartitions) :: Nil
  }

  test("PruneFileSourcePartitions should not change the output of LogicalRelation") {
    withTable("test") {
      withTempDir { dir =>
        sql(
          s"""
            |CREATE EXTERNAL TABLE test(i int)
            |PARTITIONED BY (p int)
            |STORED AS parquet
            |LOCATION '${dir.getAbsolutePath}'""".stripMargin)

        val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test")
        val catalogFileIndex = new CatalogFileIndex(spark, tableMeta, 0)

        val dataSchema = StructType(tableMeta.schema.filterNot { f =>
          tableMeta.partitionColumnNames.contains(f.name)
        })
        val relation = HadoopFsRelation(
          location = catalogFileIndex,
          partitionSchema = tableMeta.partitionSchema,
          dataSchema = dataSchema,
          bucketSpec = None,
          fileFormat = new ParquetFileFormat(),
          options = Map.empty)(sparkSession = spark)

        val logicalRelation = LogicalRelation(relation, catalogTable = Some(tableMeta))
        val query = Project(Seq('i, 'p), Filter('p === 1, logicalRelation)).analyze

        val optimized = Optimize.execute(query)
        assert(optimized.missingInput.isEmpty)
      }
    }
  }
}
Example 19
Source File: RangeQueryRewrite.scala From magellan with Apache License 2.0
package magellan.catalyst

import magellan.{BoundingBox, Point}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule

private[magellan] case class RangeQueryRewrite(session: SparkSession)
  extends Rule[LogicalPlan] {

  override def apply(plan: LogicalPlan): LogicalPlan = {
    plan transformUp {
      case p @ Filter(condition, child) =>
        val transformedCondition = condition transformUp {
          case WithinRange((PointConverter(xexpr, yexpr)), boundingBox) =>
            prune(boundingBox, xexpr, yexpr)
          case q @ WithinCircleRange((PointConverter(xexpr, yexpr)), point, radius) =>
            val (x, y) = (point.getX(), point.getY())
            val boundingBox = BoundingBox(x - radius, y - radius, x + radius, y + radius)
            And(prune(boundingBox, xexpr, yexpr),
              new WithinCircleRangePostOpt((PointConverter(xexpr, yexpr)), point, radius))
          case q: WithinCircleRangePostOpt => q
        }
        Filter(transformedCondition, child)
    }
  }

  private def prune(boundingBox: BoundingBox, xexpr: Expression, yexpr: Expression) = {
    val xpredicate = And(LessThanOrEqual(xexpr, Literal(boundingBox.xmax)),
      GreaterThanOrEqual(xexpr, Literal(boundingBox.xmin)))
    val ypredicate = And(LessThanOrEqual(yexpr, Literal(boundingBox.ymax)),
      GreaterThanOrEqual(yexpr, Literal(boundingBox.ymin)))
    And(xpredicate, ypredicate)
  }
}

private[magellan] class WithinCircleRangePostOpt(child: Expression, point: Point, radius: Double)
  extends WithinCircleRange(child, point, radius)
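A standalone sketch of the prune helper's output shape (our own illustration): an axis-aligned bounding box becomes a conjunction of four Catalyst range comparisons:

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions._

object BoundingBoxPredicateSketch {
  def main(args: Array[String]): Unit = {
    val x = 'x.double
    val y = 'y.double
    val (xmin, ymin, xmax, ymax) = (0.0, 0.0, 10.0, 10.0)
    // Same shape prune() builds: (x <= xmax AND x >= xmin) AND (y <= ymax AND y >= ymin)
    val pred = And(
      And(LessThanOrEqual(x, Literal(xmax)), GreaterThanOrEqual(x, Literal(xmin))),
      And(LessThanOrEqual(y, Literal(ymax)), GreaterThanOrEqual(y, Literal(ymin))))
    println(pred.sql)
  }
}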
Example 20
Source File: PruneFileSourcePartitions.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.catalog.CatalogStatistics
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule

private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
    case op @ PhysicalOperation(projects, filters,
        logicalRelation @
          LogicalRelation(fsRelation @
            HadoopFsRelation(
              catalogFileIndex: CatalogFileIndex,
              partitionSchema, _, _, _, _),
            _, _, _))
        if filters.nonEmpty && fsRelation.partitionSchemaOption.isDefined =>
      // An attribute in a predicate may differ from the schema attribute only by
      // case; rename it to match the schema so we no longer need to worry about
      // case sensitivity.
      val normalizedFilters = filters.map { e =>
        e transform {
          case a: AttributeReference =>
            a.withName(logicalRelation.output.find(_.semanticEquals(a)).get.name)
        }
      }

      val sparkSession = fsRelation.sparkSession
      val partitionColumns = logicalRelation.resolve(
        partitionSchema, sparkSession.sessionState.analyzer.resolver)
      val partitionSet = AttributeSet(partitionColumns)
      val partitionKeyFilters = ExpressionSet(normalizedFilters
        .filterNot(SubqueryExpression.hasSubquery(_))
        .filter(_.references.subsetOf(partitionSet)))

      if (partitionKeyFilters.nonEmpty) {
        val prunedFileIndex = catalogFileIndex.filterPartitions(partitionKeyFilters.toSeq)
        val prunedFsRelation = fsRelation.copy(location = prunedFileIndex)(sparkSession)
        // Change table stats based on the sizeInBytes of pruned files
        val withStats = logicalRelation.catalogTable.map(_.copy(
          stats = Some(CatalogStatistics(sizeInBytes = BigInt(prunedFileIndex.sizeInBytes)))))
        val prunedLogicalRelation = logicalRelation.copy(
          relation = prunedFsRelation, catalogTable = withStats)
        // Keep partition-pruning predicates so that they are visible in physical planning
        val filterExpression = filters.reduceLeft(And)
        val filter = Filter(filterExpression, prunedLogicalRelation)
        Project(projects, filter)
      } else {
        op
      }
  }
}
Example 21
Source File: ResolveSubquerySuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{InSubquery, ListQuery}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, Project}

class ResolveSubquerySuite extends AnalysisTest {

  val a = 'a.int
  val b = 'b.int
  val t1 = LocalRelation(a)
  val t2 = LocalRelation(b)

  test("SPARK-17251 Improve `OuterReference` to be `NamedExpression`") {
    val expr = Filter(
      InSubquery(Seq(a), ListQuery(Project(Seq(UnresolvedAttribute("a")), t2))), t1)
    val m = intercept[AnalysisException] {
      SimpleAnalyzer.checkAnalysis(SimpleAnalyzer.ResolveSubquery(expr))
    }.getMessage
    assert(m.contains(
      "Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses"))
  }
}
Example 22
Source File: PruneFileSourcePartitionsSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.scalatest.Matchers._

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project, ResolvedHint}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec
import org.apache.spark.sql.functions.broadcast
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.StructType

class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("PruneFileSourcePartitions", Once, PruneFileSourcePartitions) :: Nil
  }

  test("PruneFileSourcePartitions should not change the output of LogicalRelation") {
    withTable("test") {
      withTempDir { dir =>
        sql(
          s"""
            |CREATE EXTERNAL TABLE test(i int)
            |PARTITIONED BY (p int)
            |STORED AS parquet
            |LOCATION '${dir.toURI}'""".stripMargin)

        val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test")
        val catalogFileIndex = new CatalogFileIndex(spark, tableMeta, 0)

        val dataSchema = StructType(tableMeta.schema.filterNot { f =>
          tableMeta.partitionColumnNames.contains(f.name)
        })
        val relation = HadoopFsRelation(
          location = catalogFileIndex,
          partitionSchema = tableMeta.partitionSchema,
          dataSchema = dataSchema,
          bucketSpec = None,
          fileFormat = new ParquetFileFormat(),
          options = Map.empty)(sparkSession = spark)

        val logicalRelation = LogicalRelation(relation, tableMeta)
        val query = Project(Seq('i, 'p), Filter('p === 1, logicalRelation)).analyze

        val optimized = Optimize.execute(query)
        assert(optimized.missingInput.isEmpty)
      }
    }
  }

  test("SPARK-20986 Reset table's statistics after PruneFileSourcePartitions rule") {
    withTable("tbl") {
      spark.range(10).selectExpr("id", "id % 3 as p").write.partitionBy("p").saveAsTable("tbl")
      sql(s"ANALYZE TABLE tbl COMPUTE STATISTICS")
      val tableStats = spark.sessionState.catalog.getTableMetadata(TableIdentifier("tbl")).stats
      assert(tableStats.isDefined && tableStats.get.sizeInBytes > 0, "tableStats is lost")

      val df = sql("SELECT * FROM tbl WHERE p = 1")
      val sizes1 = df.queryExecution.analyzed.collect {
        case relation: LogicalRelation => relation.catalogTable.get.stats.get.sizeInBytes
      }
      assert(sizes1.size === 1, s"Size wrong for:\n ${df.queryExecution}")
      assert(sizes1(0) == tableStats.get.sizeInBytes)
      val relations = df.queryExecution.optimizedPlan.collect {
        case relation: LogicalRelation => relation
      }
      assert(relations.size === 1, s"Size wrong for:\n ${df.queryExecution}")
      val size2 = relations(0).stats.sizeInBytes
      assert(size2 == relations(0).catalogTable.get.stats.get.sizeInBytes)
      assert(size2 < tableStats.get.sizeInBytes)
    }
  }

  test("SPARK-26576 Broadcast hint not applied to partitioned table") {
    withTable("tbl") {
      withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") {
        spark.range(10).selectExpr("id", "id % 3 as p").write.partitionBy("p").saveAsTable("tbl")
        val df = spark.table("tbl")
        val qe = df.join(broadcast(df), "p").queryExecution
        qe.optimizedPlan.collect { case _: ResolvedHint => } should have size 1
        qe.sparkPlan.collect { case j: BroadcastHashJoinExec => j } should have size 1
      }
    }
  }
}
Example 23
Source File: OptimizeHiveMetadataOnlyQuerySuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfter

import org.apache.spark.metrics.source.HiveCatalogMetrics
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.expressions.NamedExpression
import org.apache.spark.sql.catalyst.plans.logical.{Distinct, Filter, Project, SubqueryAlias}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.internal.SQLConf.OPTIMIZER_METADATA_ONLY
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class OptimizeHiveMetadataOnlyQuerySuite extends QueryTest with TestHiveSingleton
    with BeforeAndAfter with SQLTestUtils {

  import spark.implicits._

  override def beforeAll(): Unit = {
    super.beforeAll()
    sql("CREATE TABLE metadata_only (id bigint, data string) PARTITIONED BY (part int)")
    (0 to 10).foreach(p => sql(s"ALTER TABLE metadata_only ADD PARTITION (part=$p)"))
  }

  override protected def afterAll(): Unit = {
    try {
      sql("DROP TABLE IF EXISTS metadata_only")
    } finally {
      super.afterAll()
    }
  }

  test("SPARK-23877: validate metadata-only query pushes filters to metastore") {
    withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") {
      val startCount = HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount

      // verify the number of matching partitions
      assert(sql("SELECT DISTINCT part FROM metadata_only WHERE part < 5").collect().length === 5)

      // verify that the partition predicate was pushed down to the metastore
      assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - startCount === 5)
    }
  }

  test("SPARK-23877: filter on projected expression") {
    withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") {
      val startCount = HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount

      // verify the matching partitions
      val partitions = spark.internalCreateDataFrame(Distinct(Filter(($"x" < 5).expr,
        Project(Seq(($"part" + 1).as("x").expr.asInstanceOf[NamedExpression]),
          spark.table("metadata_only").logicalPlan.asInstanceOf[SubqueryAlias].child)))
        .queryExecution.toRdd, StructType(Seq(StructField("x", IntegerType))))

      checkAnswer(partitions, Seq(1, 2, 3, 4).toDF("x"))

      // verify that the partition predicate was not pushed down to the metastore
      assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - startCount == 11)
    }
  }
}
Example 24
Source File: PruneFileSourcePartitions.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule

private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
    case op @ PhysicalOperation(projects, filters,
        logicalRelation @
          LogicalRelation(fsRelation @
            HadoopFsRelation(
              tableFileCatalog: TableFileCatalog,
              partitionSchema, _, _, _, _),
            _, _))
        if filters.nonEmpty && fsRelation.partitionSchemaOption.isDefined =>
      // An attribute in a predicate may differ from the schema attribute only by
      // case; rename it to match the schema so we no longer need to worry about
      // case sensitivity.
      val normalizedFilters = filters.map { e =>
        e transform {
          case a: AttributeReference =>
            a.withName(logicalRelation.output.find(_.semanticEquals(a)).get.name)
        }
      }

      val sparkSession = fsRelation.sparkSession
      val partitionColumns = logicalRelation.resolve(
        partitionSchema, sparkSession.sessionState.analyzer.resolver)
      val partitionSet = AttributeSet(partitionColumns)
      val partitionKeyFilters =
        ExpressionSet(normalizedFilters.filter(_.references.subsetOf(partitionSet)))

      if (partitionKeyFilters.nonEmpty) {
        val prunedFileCatalog = tableFileCatalog.filterPartitions(partitionKeyFilters.toSeq)
        val prunedFsRelation = fsRelation.copy(location = prunedFileCatalog)(sparkSession)
        val prunedLogicalRelation = logicalRelation.copy(
          relation = prunedFsRelation,
          expectedOutputAttributes = Some(logicalRelation.output))

        // Keep partition-pruning predicates so that they are visible in physical planning
        val filterExpression = filters.reduceLeft(And)
        val filter = Filter(filterExpression, prunedLogicalRelation)
        Project(projects, filter)
      } else {
        op
      }
  }
}