org.apache.spark.sql.catalyst.expressions.PredicateHelper Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.expressions.PredicateHelper.
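Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the object name PredicateHelperSketch, the column names, and the assertions are illustrative assumptions, and a Spark 2.x/3.x spark-catalyst dependency is assumed) of the two protected PredicateHelper methods these examples lean on most: splitConjunctivePredicates, which flattens a tree of And expressions into its individual conjuncts, and canEvaluate, which reports whether an expression references only attributes produced by a given plan.

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.IntegerType

// Illustrative sketch only: mixes in PredicateHelper to reach its protected helpers.
object PredicateHelperSketch extends PredicateHelper {
  def demo(): Unit = {
    val a = AttributeReference("a", IntegerType)()
    val b = AttributeReference("b", IntegerType)()
    val condition = And(And(GreaterThan(a, Literal(1)), LessThan(b, Literal(10))), EqualTo(a, b))

    // splitConjunctivePredicates recursively splits the And tree into its conjuncts:
    // Seq(a > 1, b < 10, a = b)
    val conjuncts = splitConjunctivePredicates(condition)
    assert(conjuncts.size == 3)

    // canEvaluate(expr, plan) is true when expr only references attributes in plan.outputSet.
    // Optimizer rules (such as the spatial-join pushdown in Example 10) use this to decide
    // which conjuncts can be pushed to one side of a join.
    val leftOnly = conjuncts.filter(canEvaluate(_, LocalRelation(a)))
    assert(leftOnly == Seq(GreaterThan(a, Literal(1))))
  }
}

The examples below mix in PredicateHelper in exactly this way, either directly on a class or object, or via an intermediate trait that re-exposes splitConjunctivePredicates (Example 7).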
Example 1
Source File: SimpleTextHadoopFsRelationSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.catalyst.expressions.PredicateHelper
import org.apache.spark.sql.types._

class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper {
  override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName

  // We have a very limited number of supported types at here since it is just for a
  // test relation and we do very basic testing at here.
  override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
    case _: BinaryType => false
    // We are using random data generator and the generated strings are not really valid string.
    case _: StringType => false
    case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442
    case _: CalendarIntervalType => false
    case _: DateType => false
    case _: TimestampType => false
    case _: ArrayType => false
    case _: MapType => false
    case _: StructType => false
    case _: UserDefinedType[_] => false
    case _ => true
  }

  test("save()/load() - partitioned table - simple queries - partition columns in data") {
    withTempDir { file =>
      val basePath = new Path(file.getCanonicalPath)
      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
      val qualifiedBasePath = fs.makeQualified(basePath)

      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
        sparkContext
          .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1")
          .saveAsTextFile(partitionDir.toString)
      }

      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        spark.read.format(dataSourceName)
          .option("dataSchema", dataSchemaWithPartition.json)
          .load(file.getCanonicalPath))
    }
  }

  test("test hadoop conf option propagation") {
    withTempPath { file =>
      // Test write side
      val df = spark.range(10).selectExpr("cast(id as string)")
      df.write
        .option("some-random-write-option", "hahah-WRITE")
        .option("some-null-value-option", null)  // test null robustness
        .option("dataSchema", df.schema.json)
        .format(dataSourceName).save(file.getAbsolutePath)
      assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-write-option") == "hahah-WRITE")

      // Test read side
      val df1 = spark.read
        .option("some-random-read-option", "hahah-READ")
        .option("some-null-value-option", null)  // test null robustness
        .option("dataSchema", df.schema.json)
        .format(dataSourceName)
        .load(file.getAbsolutePath)
      df1.count()
      assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-read-option") == "hahah-READ")
    }
  }
}
Example 2
Source File: SimpleTextHadoopFsRelationSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.catalyst.catalog.CatalogUtils
import org.apache.spark.sql.catalyst.expressions.PredicateHelper
import org.apache.spark.sql.types._

class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper {
  override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName

  // We have a very limited number of supported types at here since it is just for a
  // test relation and we do very basic testing at here.
  override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
    case _: BinaryType => false
    // We are using random data generator and the generated strings are not really valid string.
    case _: StringType => false
    case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442
    case _: CalendarIntervalType => false
    case _: DateType => false
    case _: TimestampType => false
    case _: ArrayType => false
    case _: MapType => false
    case _: StructType => false
    case _: UserDefinedType[_] => false
    case _ => true
  }

  test("save()/load() - partitioned table - simple queries - partition columns in data") {
    withTempDir { file =>
      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(
          CatalogUtils.URIToString(makeQualifiedPath(file.getCanonicalPath)), s"p1=$p1/p2=$p2")
        sparkContext
          .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1")
          .saveAsTextFile(partitionDir.toString)
      }

      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        spark.read.format(dataSourceName)
          .option("dataSchema", dataSchemaWithPartition.json)
          .load(file.getCanonicalPath))
    }
  }

  test("test hadoop conf option propagation") {
    withTempPath { file =>
      // Test write side
      val df = spark.range(10).selectExpr("cast(id as string)")
      df.write
        .option("some-random-write-option", "hahah-WRITE")
        .option("some-null-value-option", null)  // test null robustness
        .option("dataSchema", df.schema.json)
        .format(dataSourceName).save(file.getAbsolutePath)
      assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-write-option") == "hahah-WRITE")

      // Test read side
      val df1 = spark.read
        .option("some-random-read-option", "hahah-READ")
        .option("some-null-value-option", null)  // test null robustness
        .option("dataSchema", df.schema.json)
        .format(dataSourceName)
        .load(file.getAbsolutePath)
      df1.count()
      assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-read-option") == "hahah-READ")
    }
  }
}
Example 3
Source File: SimpleTextHadoopFsRelationSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.catalyst.expressions.PredicateHelper
import org.apache.spark.sql.types._

class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper {
  override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName

  // We have a very limited number of supported types at here since it is just for a
  // test relation and we do very basic testing at here.
  override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
    case _: BinaryType => false
    // We are using random data generator and the generated strings are not really valid string.
    case _: StringType => false
    case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442
    case _: CalendarIntervalType => false
    case _: DateType => false
    case _: TimestampType => false
    case _: ArrayType => false
    case _: MapType => false
    case _: StructType => false
    case _: UserDefinedType[_] => false
    case _ => true
  }

  test("save()/load() - partitioned table - simple queries - partition columns in data") {
    withTempDir { file =>
      val basePath = new Path(file.getCanonicalPath)
      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
      val qualifiedBasePath = fs.makeQualified(basePath)

      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
        sparkContext
          .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1")
          .saveAsTextFile(partitionDir.toString)
      }

      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        spark.read.format(dataSourceName)
          .option("dataSchema", dataSchemaWithPartition.json)
          .load(file.getCanonicalPath))
    }
  }

  test("test hadoop conf option propagation") {
    withTempPath { file =>
      // Test write side
      val df = spark.range(10).selectExpr("cast(id as string)")
      df.write
        .option("some-random-write-option", "hahah-WRITE")
        .option("some-null-value-option", null)  // test null robustness
        .option("dataSchema", df.schema.json)
        .format(dataSourceName).save(file.getAbsolutePath)
      assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-write-option") == "hahah-WRITE")

      // Test read side
      val df1 = spark.read
        .option("some-random-read-option", "hahah-READ")
        .option("some-null-value-option", null)  // test null robustness
        .option("dataSchema", df.schema.json)
        .format(dataSourceName)
        .load(file.getAbsolutePath)
      df1.count()
      assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-read-option") == "hahah-READ")
    }
  }
}
Example 4
Source File: package.scala From carbondata with Apache License 2.0
package org.apache.carbondata.mv

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.catalyst.expressions.{AttributeSet, Expression, PredicateHelper, ScalaUDF}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

import org.apache.carbondata.mv.plans.modular.ModularPlan
import org.apache.carbondata.mv.plans.util.{CheckSPJG, LogicalPlanSignatureGenerator, Signature}

  def canEvaluate(exp: ScalaUDF, exprList: Seq[Expression]): Boolean = {
    var canBeDerived = false
    exprList.forall {
      case udf: ScalaUDF =>
        if (udf.children.length == exp.children.length) {
          if (udf.children.zip(exp.children).forall(e => e._1.sql.equalsIgnoreCase(e._2.sql))) {
            canBeDerived = true
          }
        }
        canBeDerived
      case _ =>
        canBeDerived
    }
  }

  def canEvaluate(expr: Expression, exprList: Seq[Expression]): Boolean = {
    expr match {
      case exp: ScalaUDF =>
        canEvaluate(exp, exprList)
      case _ =>
        expr.references.subsetOf(AttributeSet(exprList))
    }
  }
}

  def supports(supported: Boolean, message: Any) {
    if (!supported) {
      throw new UnsupportedOperationException(s"unsupported operation: $message")
    }
  }
}
Example 5
Source File: CarbonUDFTransformRule.scala From carbondata with Apache License 2.0
package org.apache.spark.sql.optimizer

import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, PredicateHelper, ScalaUDF}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, Join, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.types.StringType

import org.apache.carbondata.core.constants.CarbonCommonConstants

class CarbonUDFTransformRule extends Rule[LogicalPlan] with PredicateHelper {
  override def apply(plan: LogicalPlan): LogicalPlan = {
    pushDownUDFToJoinLeftRelation(plan)
  }

  private def pushDownUDFToJoinLeftRelation(plan: LogicalPlan): LogicalPlan = {
    val output = plan.transform {
      case proj@Project(cols, Join(
        left, right, jointype: org.apache.spark.sql.catalyst.plans.JoinType, condition)) =>
        var projectionToBeAdded: Seq[org.apache.spark.sql.catalyst.expressions.Alias] = Seq.empty
        var udfExists = false
        val newCols = cols.map {
          case a@Alias(s: ScalaUDF, name)
            if name.equalsIgnoreCase(CarbonCommonConstants.POSITION_ID) ||
               name.equalsIgnoreCase(CarbonCommonConstants.CARBON_IMPLICIT_COLUMN_TUPLEID) =>
            udfExists = true
            projectionToBeAdded :+= a
            AttributeReference(name, StringType, nullable = true)().withExprId(a.exprId)
          case other => other
        }
        if (udfExists) {
          val newLeft = left match {
            case Project(columns, logicalPlan) =>
              Project(columns ++ projectionToBeAdded, logicalPlan)
            case filter: Filter =>
              Project(filter.output ++ projectionToBeAdded, filter)
            case relation: LogicalRelation =>
              Project(relation.output ++ projectionToBeAdded, relation)
            case other => other
          }
          Project(newCols, Join(newLeft, right, jointype, condition))
        } else {
          proj
        }
      case other => other
    }
    output
  }
}
Example 6
Source File: CarbonIUDRule.scala From carbondata with Apache License 2.0
package org.apache.spark.sql.optimizer

import org.apache.spark.sql.ProjectForUpdate
import org.apache.spark.sql.catalyst.expressions.{NamedExpression, PredicateHelper}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.command.mutation.CarbonProjectForUpdateCommand

import org.apache.carbondata.core.constants.CarbonCommonConstants

class CarbonIUDRule extends Rule[LogicalPlan] with PredicateHelper {
  override def apply(plan: LogicalPlan): LogicalPlan = {
    processPlan(plan)
  }

  private def processPlan(plan: LogicalPlan): LogicalPlan = {
    plan transform {
      case ProjectForUpdate(table, cols, Seq(updatePlan)) =>
        var isTransformed = false
        val newPlan = updatePlan transform {
          case Project(pList, child) if !isTransformed =>
            var (dest: Seq[NamedExpression], source: Seq[NamedExpression]) = pList
              .splitAt(pList.size - cols.size)
            // check complex column
            cols.foreach { col =>
              val complexExists = "\"name\":\"" + col + "\""
              if (dest.exists(m => m.dataType.json.contains(complexExists))) {
                throw new UnsupportedOperationException(
                  "Unsupported operation on Complex data type")
              }
            }
            // check updated columns exists in table
            val diff = cols.diff(dest.map(_.name.toLowerCase))
            if (diff.nonEmpty) {
              sys.error(s"Unknown column(s) ${ diff.mkString(",") } in table ${ table.tableName }")
            }
            // modify plan for updated column *in place*
            isTransformed = true
            source.foreach { col =>
              val colName = col.name.substring(0,
                col.name.lastIndexOf(CarbonCommonConstants.UPDATED_COL_EXTENSION))
              val updateIdx = dest.indexWhere(_.name.equalsIgnoreCase(colName))
              dest = dest.updated(updateIdx, col)
            }
            Project(dest, child)
        }
        CarbonProjectForUpdateCommand(
          newPlan, table.tableIdentifier.database, table.tableIdentifier.table, cols)
    }
  }
}
Example 7
Source File: PlanningTest.scala From spark-druid-olap with Apache License 2.0
package org.apache.spark.sql.sources.druid.test

import java.util.TimeZone

import com.github.nscala_time.time.Imports._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.expressions.{Expression, PredicateHelper}
import org.apache.spark.sql.catalyst.plans.logical.Filter
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.hive.test.sparklinedata.TestHive
import org.apache.spark.sql.sources.druid.DruidPlanner
import org.scalatest.BeforeAndAfterEach
import org.sparklinedata.druid._
import org.sparklinedata.druid.client.test.BaseTest
import org.sparklinedata.druid.metadata.DruidRelationInfo

trait PlanningTestHelper extends PredicateHelper {
  System.setProperty("user.timezone", "UTC")
  TimeZone.setDefault(TimeZone.getTimeZone("UTC"))

  override def splitConjunctivePredicates(condition: Expression): Seq[Expression] = {
    super.splitConjunctivePredicates(condition)
  }
}

abstract class PlanningTest extends BaseTest with BeforeAndAfterEach with PlanningTestHelper {
  val dPlanner = new DruidPlanner(TestHive)
  var tab: DataFrame = _
  var drInfo: DruidRelationInfo = _
  var dqb: DruidQueryBuilder = _
  var iCE: IntervalConditionExtractor = _
  var iCE2: SparkIntervalConditionExtractor = _

  override def beforeAll() = {
    super.beforeAll()
    tab = TestHive.table("orderLineItemPartSupplier")
    drInfo = tab.queryExecution.optimizedPlan.
      asInstanceOf[LogicalRelation].relation.asInstanceOf[DruidRelation].info
  }

  override protected def beforeEach(): Unit = {
    dqb = DruidQueryBuilder(drInfo)
    iCE = new IntervalConditionExtractor(dqb)
    iCE2 = new SparkIntervalConditionExtractor(dqb)
  }

  def validateFilter(filterStr: String,
                     pushedToDruid: Boolean = true,
                     filSpec: Option[FilterSpec] = None,
                     intervals: List[Interval] = List()
                    ): Unit = {
    val q = tab.where(filterStr)
    val filter = q.queryExecution.optimizedPlan.asInstanceOf[Filter]
    val dqbs = dPlanner.translateProjectFilter(
      Some(dqb),
      Seq(),
      splitConjunctivePredicates(filter.condition),
      true
    )
    if (pushedToDruid) {
      assert(dqbs.size == 1)
      val odqb = dqbs(0)
      assert(odqb.filterSpec == filSpec)
      assert(odqb.queryIntervals.intervals == intervals)
    }
  }
}
Example 8
Source File: SimpleTextHadoopFsRelationSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.catalyst.expressions.PredicateHelper
import org.apache.spark.sql.types._

class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper {
  override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName

  // We have a very limited number of supported types at here since it is just for a
  // test relation and we do very basic testing at here.
  override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
    case _: BinaryType => false
    // We are using random data generator and the generated strings are not really valid string.
    case _: StringType => false
    case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442
    case _: CalendarIntervalType => false
    case _: DateType => false
    case _: TimestampType => false
    case _: ArrayType => false
    case _: MapType => false
    case _: StructType => false
    case _: UserDefinedType[_] => false
    case _ => true
  }

  test("save()/load() - partitioned table - simple queries - partition columns in data") {
    withTempDir { file =>
      val basePath = new Path(file.getCanonicalPath)
      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
      val qualifiedBasePath = fs.makeQualified(basePath)

      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
        sparkContext
          .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1")
          .saveAsTextFile(partitionDir.toString)
      }

      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        spark.read.format(dataSourceName)
          .option("dataSchema", dataSchemaWithPartition.json)
          .load(file.getCanonicalPath))
    }
  }

  test("test hadoop conf option propagation") {
    withTempPath { file =>
      // Test write side
      val df = spark.range(10).selectExpr("cast(id as string)")
      df.write
        .option("some-random-write-option", "hahah-WRITE")
        .option("some-null-value-option", null)  // test null robustness
        .option("dataSchema", df.schema.json)
        .format(dataSourceName).save(file.getAbsolutePath)
      assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-write-option") == "hahah-WRITE")

      // Test read side
      val df1 = spark.read
        .option("some-random-read-option", "hahah-READ")
        .option("some-null-value-option", null)  // test null robustness
        .option("dataSchema", df.schema.json)
        .format(dataSourceName)
        .load(file.getAbsolutePath)
      df1.count()
      assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-read-option") == "hahah-READ")
    }
  }
}
Example 9
Source File: SimpleTextHadoopFsRelationSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.catalyst.catalog.CatalogUtils
import org.apache.spark.sql.catalyst.expressions.PredicateHelper
import org.apache.spark.sql.types._

class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper {
  override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName

  // We have a very limited number of supported types at here since it is just for a
  // test relation and we do very basic testing at here.
  override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
    case _: BinaryType => false
    // We are using random data generator and the generated strings are not really valid string.
    case _: StringType => false
    case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442
    case _: CalendarIntervalType => false
    case _: DateType => false
    case _: TimestampType => false
    case _: ArrayType => false
    case _: MapType => false
    case _: StructType => false
    case _: UserDefinedType[_] => false
    case _ => true
  }

  test("save()/load() - partitioned table - simple queries - partition columns in data") {
    withTempDir { file =>
      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(
          CatalogUtils.URIToString(makeQualifiedPath(file.getCanonicalPath)), s"p1=$p1/p2=$p2")
        sparkContext
          .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1")
          .saveAsTextFile(partitionDir.toString)
      }

      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        spark.read.format(dataSourceName)
          .option("dataSchema", dataSchemaWithPartition.json)
          .load(file.getCanonicalPath))
    }
  }

  test("test hadoop conf option propagation") {
    withTempPath { file =>
      // Test write side
      val df = spark.range(10).selectExpr("cast(id as string)")
      df.write
        .option("some-random-write-option", "hahah-WRITE")
        .option("some-null-value-option", null)  // test null robustness
        .option("dataSchema", df.schema.json)
        .format(dataSourceName).save(file.getAbsolutePath)
      assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-write-option") == "hahah-WRITE")

      // Test read side
      val df1 = spark.read
        .option("some-random-read-option", "hahah-READ")
        .option("some-null-value-option", null)  // test null robustness
        .option("dataSchema", df.schema.json)
        .format(dataSourceName)
        .load(file.getAbsolutePath)
      df1.count()
      assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-read-option") == "hahah-READ")
    }
  }
}
Example 10
Source File: SimbaOptimizer.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba

import org.apache.spark.sql.ExperimentalMethods
import org.apache.spark.sql.catalyst.catalog.SessionCatalog
import org.apache.spark.sql.catalyst.expressions.{And, Expression, PredicateHelper}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.SparkOptimizer
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.simba.plans.SpatialJoin

class SimbaOptimizer(catalog: SessionCatalog,
                     conf: SQLConf,
                     experimentalMethods: ExperimentalMethods)
  extends SparkOptimizer(catalog, conf, experimentalMethods) {
  override def batches: Seq[Batch] = super.batches :+
    Batch("SpatialJoinPushDown", FixedPoint(100), PushPredicateThroughSpatialJoin)
}

object PushPredicateThroughSpatialJoin extends Rule[LogicalPlan] with PredicateHelper {
  private def split(condition: Seq[Expression], left: LogicalPlan, right: LogicalPlan) = {
    val (leftEvaluateCondition, rest) =
      condition.partition(_.references subsetOf left.outputSet)
    val (rightEvaluateCondition, commonCondition) =
      rest.partition(_.references subsetOf right.outputSet)
    (leftEvaluateCondition, rightEvaluateCondition, commonCondition)
  }

  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    // push the where condition down into join filter
    case f @ Filter(filterCondition, SpatialJoin(left, right, joinType, joinCondition)) =>
      val (leftFilterConditions, rightFilterConditions, commonFilterCondition) =
        split(splitConjunctivePredicates(filterCondition), left, right)

      val newLeft = leftFilterConditions.reduceLeftOption(And).map(Filter(_, left)).getOrElse(left)
      val newRight = rightFilterConditions.reduceLeftOption(And).map(Filter(_, right)).getOrElse(right)
      val newJoinCond = (commonFilterCondition ++ joinCondition).reduceLeftOption(And)
      SpatialJoin(newLeft, newRight, joinType, newJoinCond)

    // push down the join filter into sub query scanning if applicable
    case f @ SpatialJoin(left, right, joinType, joinCondition) =>
      val (leftJoinConditions, rightJoinConditions, commonJoinCondition) =
        split(joinCondition.map(splitConjunctivePredicates).getOrElse(Nil), left, right)

      val newLeft = leftJoinConditions.reduceLeftOption(And).map(Filter(_, left)).getOrElse(left)
      val newRight = rightJoinConditions.reduceLeftOption(And).map(Filter(_, right)).getOrElse(right)
      val newJoinCond = commonJoinCondition.reduceLeftOption(And)
      SpatialJoin(newLeft, newRight, joinType, newJoinCond)
  }
}
Example 11
Source File: FilterExec.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.execution

import org.apache.spark.sql.simba.expression._
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Literal, PredicateHelper}
import org.apache.spark.sql.catalyst.expressions.{SortOrder, And => SQLAnd, Not => SQLNot, Or => SQLOr}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.SparkPlan

case class FilterExec(condition: Expression, child: SparkPlan)
  extends SimbaPlan with PredicateHelper {
  override def output: Seq[Attribute] = child.output

  private class DistanceOrdering(point: Expression, target: Point) extends Ordering[InternalRow] {
    override def compare(x: InternalRow, y: InternalRow): Int = {
      val shape_x = ShapeUtils.getShape(point, child.output, x)
      val shape_y = ShapeUtils.getShape(point, child.output, y)
      val dis_x = target.minDist(shape_x)
      val dis_y = target.minDist(shape_y)
      dis_x.compare(dis_y)
    }
  }

  // TODO change target partition from 1 to some good value
  // Note that target here must be an point literal in WHERE clause,
  // hence we can consider it as Point safely
  def knn(rdd: RDD[InternalRow], point: Expression, target: Point, k: Int): RDD[InternalRow] =
    sparkContext.parallelize(
      rdd.map(_.copy()).takeOrdered(k)(new DistanceOrdering(point, target)), 1)

  def applyCondition(rdd: RDD[InternalRow], condition: Expression): RDD[InternalRow] = {
    condition match {
      case InKNN(point, target, k) =>
        val _target = target.asInstanceOf[Literal].value.asInstanceOf[Point]
        knn(rdd, point, _target, k.value.asInstanceOf[Number].intValue())
      case now@And(left, right) =>
        if (!now.hasKNN) {
          rdd.mapPartitions{ iter => iter.filter(newPredicate(condition, child.output).eval(_))}
        } else {
          applyCondition(rdd, left).map(_.copy())
            .intersection(applyCondition(rdd, right).map(_.copy()))
        }
      case now@Or(left, right) =>
        if (!now.hasKNN) {
          rdd.mapPartitions{ iter => iter.filter(newPredicate(condition, child.output).eval(_))}
        } else {
          applyCondition(rdd, left).map(_.copy())
            .union(applyCondition(rdd, right).map(_.copy())).distinct()
        }
      case now@Not(c) =>
        if (!now.hasKNN) {
          rdd.mapPartitions{ iter => iter.filter(newPredicate(condition, child.output).eval(_))}
        } else {
          rdd.map(_.copy()).subtract(applyCondition(rdd, c).map(_.copy()))
        }
      case _ =>
        rdd.mapPartitions(iter => iter.filter(newPredicate(condition, child.output).eval(_)))
    }
  }

  protected def doExecute(): RDD[InternalRow] = {
    val root_rdd = child.execute()
    condition transformUp {
      case SQLAnd(left, right) => And(left, right)
      case SQLOr(left, right) => Or(left, right)
      case SQLNot(c) => Not(c)
    }
    applyCondition(root_rdd, condition)
  }

  override def outputOrdering: Seq[SortOrder] = child.outputOrdering

  override def children: Seq[SparkPlan] = child :: Nil

  override def outputPartitioning: Partitioning = child.outputPartitioning
}