org.apache.spark.sql.catalyst.expressions.Expression Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.expressions.Expression.
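Before the project-specific examples, here is a minimal, self-contained sketch of what a custom Catalyst Expression looks like and how it is evaluated. The class name DoubleUp is purely illustrative and does not come from any of the projects below; the sketch targets Spark 2.x Catalyst (newer Spark versions additionally require withNewChildInternal).

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.types.{DataType, IntegerType}

// Hypothetical expression: doubles its integer child.
// CodegenFallback supplies doGenCode, so no hand-written codegen is needed.
case class DoubleUp(child: Expression) extends UnaryExpression with CodegenFallback {
  override def dataType: DataType = IntegerType
  override def nullSafeEval(input: Any): Any = input.asInstanceOf[Int] * 2
}

// Interpreted evaluation; the child is a literal, so an empty row suffices.
val result = DoubleUp(Literal(21)).eval(InternalRow.empty)  // 42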
Example 1
Source File: AvroDataToCatalyst.scala, from spark-schema-registry (Apache License 2.0)
package com.hortonworks.spark.registry.avro

import java.io.ByteArrayInputStream

import com.hortonworks.registries.schemaregistry.{SchemaVersionInfo, SchemaVersionKey}
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer
import org.apache.avro.Schema
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{BinaryType, DataType}

import scala.collection.JavaConverters._

case class AvroDataToCatalyst(child: Expression, schemaName: String, version: Option[Int],
                              config: Map[String, Object])
  extends UnaryExpression with ExpectsInputTypes {

  override def inputTypes = Seq(BinaryType)

  @transient private lazy val srDeser: AvroSnapshotDeserializer = {
    val obj = new AvroSnapshotDeserializer()
    obj.init(config.asJava)
    obj
  }

  @transient private lazy val srSchema = fetchSchemaVersionInfo(schemaName, version)

  @transient private lazy val avroSchema = new Schema.Parser().parse(srSchema.getSchemaText)

  override lazy val dataType: DataType = SchemaConverters.toSqlType(avroSchema).dataType

  @transient private lazy val avroDeser = new AvroDeserializer(avroSchema, dataType)

  override def nullable: Boolean = true

  override def nullSafeEval(input: Any): Any = {
    val binary = input.asInstanceOf[Array[Byte]]
    val row = avroDeser.deserialize(srDeser.deserialize(new ByteArrayInputStream(binary), srSchema.getVersion))
    val result = row match {
      case r: InternalRow => r.copy()
      case _ => row
    }
    result
  }

  override def simpleString: String = {
    s"from_sr(${child.sql}, ${dataType.simpleString})"
  }

  override def sql: String = {
    s"from_sr(${child.sql}, ${dataType.catalogString})"
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val expr = ctx.addReferenceObj("this", this)
    defineCodeGen(ctx, ev, input => s"(${ctx.boxedType(dataType)})$expr.nullSafeEval($input)")
  }

  private def fetchSchemaVersionInfo(schemaName: String, version: Option[Int]): SchemaVersionInfo = {
    val srClient = new SchemaRegistryClient(config.asJava)
    version.map(v => srClient.getSchemaVersionInfo(new SchemaVersionKey(schemaName, v)))
      .getOrElse(srClient.getLatestSchemaVersionInfo(schemaName))
  }
}
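Expressions like this are usually exposed to users as a Column-returning function. The sketch below is a hypothetical wrapper based only on the expression's own sql/simpleString output (from_sr); the real spark-schema-registry API may differ.

import org.apache.spark.sql.Column

// Hypothetical helper: wrap the Catalyst expression in a Column so it can be used in select().
def from_sr(data: Column, schemaName: String, version: Option[Int] = None,
            config: Map[String, Object] = Map.empty): Column =
  new Column(AvroDataToCatalyst(data.expr, schemaName, version, config))

// Example (illustrative column/config names):
// df.select(from_sr($"value", "truck_events", config = srConfig).as("event"))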
Example 2
Source File: ShapeUtils.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.util

import org.apache.spark.sql.simba.{ShapeSerializer, ShapeType}
import org.apache.spark.sql.simba.expression.PointWrapper
import org.apache.spark.sql.simba.spatial.{Point, Shape}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences, Expression, UnsafeArrayData}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan

object ShapeUtils {
  def getPointFromRow(row: InternalRow, columns: List[Attribute], plan: SparkPlan,
                      isPoint: Boolean): Point = {
    if (isPoint) {
      ShapeSerializer.deserialize(BindReferences.bindReference(columns.head, plan.output)
        .eval(row).asInstanceOf[UnsafeArrayData].toByteArray).asInstanceOf[Point]
    } else {
      Point(columns.toArray.map(BindReferences.bindReference(_, plan.output).eval(row)
        .asInstanceOf[Number].doubleValue()))
    }
  }

  def getPointFromRow(row: InternalRow, columns: List[Attribute], plan: LogicalPlan,
                      isPoint: Boolean): Point = {
    if (isPoint) {
      ShapeSerializer.deserialize(BindReferences.bindReference(columns.head, plan.output)
        .eval(row).asInstanceOf[UnsafeArrayData].toByteArray).asInstanceOf[Point]
    } else {
      Point(columns.toArray.map(BindReferences.bindReference(_, plan.output).eval(row)
        .asInstanceOf[Number].doubleValue()))
    }
  }

  def getShape(expression: Expression, input: InternalRow): Shape = {
    if (!expression.isInstanceOf[PointWrapper] && expression.dataType.isInstanceOf[ShapeType]) {
      ShapeSerializer.deserialize(expression.eval(input).asInstanceOf[UnsafeArrayData].toByteArray)
    } else if (expression.isInstanceOf[PointWrapper]) {
      expression.eval(input).asInstanceOf[Shape]
    } else throw new UnsupportedOperationException("Query shape should be of ShapeType")
  }

  def getShape(expression: Expression, schema: Seq[Attribute], input: InternalRow): Shape = {
    if (!expression.isInstanceOf[PointWrapper] && expression.dataType.isInstanceOf[ShapeType]) {
      ShapeSerializer.deserialize(BindReferences.bindReference(expression, schema)
        .eval(input).asInstanceOf[UnsafeArrayData].toByteArray)
    } else if (expression.isInstanceOf[PointWrapper]) {
      BindReferences.bindReference(expression, schema).eval(input).asInstanceOf[Shape]
    } else throw new UnsupportedOperationException("Query shape should be of ShapeType")
  }
}
Example 3
Source File: SemiJoinSuite.scala, from BigDatalog (Apache License 2.0)
package org.apache.spark.sql.execution.joins

import org.apache.spark.sql.{SQLConf, DataFrame, Row}
import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
import org.apache.spark.sql.catalyst.plans.Inner
import org.apache.spark.sql.catalyst.plans.logical.Join
import org.apache.spark.sql.catalyst.expressions.{And, LessThan, Expression}
import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan, SparkPlanTest}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}

class SemiJoinSuite extends SparkPlanTest with SharedSQLContext {

  private lazy val left = sqlContext.createDataFrame(
    sparkContext.parallelize(Seq(
      Row(1, 2.0),
      Row(1, 2.0),
      Row(2, 1.0),
      Row(2, 1.0),
      Row(3, 3.0),
      Row(null, null),
      Row(null, 5.0),
      Row(6, null)
    )), new StructType().add("a", IntegerType).add("b", DoubleType))

  private lazy val right = sqlContext.createDataFrame(
    sparkContext.parallelize(Seq(
      Row(2, 3.0),
      Row(2, 3.0),
      Row(3, 2.0),
      Row(4, 1.0),
      Row(null, null),
      Row(null, 5.0),
      Row(6, null)
    )), new StructType().add("c", IntegerType).add("d", DoubleType))

  private lazy val condition = {
    And((left.col("a") === right.col("c")).expr,
      LessThan(left.col("b").expr, right.col("d").expr))
  }

  // Note: the input dataframes and expression must be evaluated lazily because
  // the SQLContext should be used only within a test to keep SQL tests stable
  private def testLeftSemiJoin(
      testName: String,
      leftRows: => DataFrame,
      rightRows: => DataFrame,
      condition: => Expression,
      expectedAnswer: Seq[Product]): Unit = {

    def extractJoinParts(): Option[ExtractEquiJoinKeys.ReturnType] = {
      val join = Join(leftRows.logicalPlan, rightRows.logicalPlan, Inner, Some(condition))
      ExtractEquiJoinKeys.unapply(join)
    }

    test(s"$testName using LeftSemiJoinHash") {
      extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) =>
        withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
          checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
            EnsureRequirements(left.sqlContext).apply(
              LeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition)),
            expectedAnswer.map(Row.fromTuple),
            sortAnswers = true)
        }
      }
    }

    test(s"$testName using BroadcastLeftSemiJoinHash") {
      extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) =>
        withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
          checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
            BroadcastLeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition),
            expectedAnswer.map(Row.fromTuple),
            sortAnswers = true)
        }
      }
    }

    test(s"$testName using LeftSemiJoinBNL") {
      withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
        checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
          LeftSemiJoinBNL(left, right, Some(condition)),
          expectedAnswer.map(Row.fromTuple),
          sortAnswers = true)
      }
    }
  }

  testLeftSemiJoin(
    "basic test",
    left,
    right,
    condition,
    Seq(
      (2, 1.0),
      (2, 1.0)
    )
  )
}
Example 4
Source File: PredicateUtil.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.util

import org.apache.spark.sql.catalyst.expressions.{Expression, And, Or}

object PredicateUtil {

  def toDNF(condition: Expression): Expression = {
    condition match {
      case Or(left, right) =>
        Or(toDNF(left), toDNF(right))
      case And(left, right) =>
        var ans: Expression = null
        val tmp_left = toDNF(left)
        val tmp_right = toDNF(right)
        tmp_left match {
          case Or(l, r) =>
            ans = Or(And(l, tmp_right), And(r, tmp_right))
          case _ =>
        }
        tmp_right match {
          case Or(l, r) =>
            if (ans == null) ans = Or(And(tmp_left, l), And(tmp_left, r))
          case _ =>
        }
        if (ans == null) And(tmp_left, tmp_right)
        else toDNF(ans)
      case exp => exp
    }
  }

  def toCNF(condition: Expression): Expression = {
    condition match {
      case And(left, right) =>
        And(toCNF(left), toCNF(right))
      case Or(left, right) =>
        var ans: Expression = null
        val tmp_left = toCNF(left)
        val tmp_right = toCNF(right)
        tmp_left match {
          case And(l, r) =>
            ans = And(Or(l, tmp_right), Or(r, tmp_right))
          case _ =>
        }
        tmp_right match {
          case And(l, r) =>
            if (ans == null) ans = And(Or(tmp_left, l), Or(tmp_left, r))
          case _ =>
        }
        if (ans == null) Or(tmp_left, tmp_right)
        else toCNF(ans)
      case exp => exp
    }
  }

  def dnfExtract(expression: Expression): Seq[Expression] = {
    expression match {
      case Or(left, right) =>
        dnfExtract(left) ++ dnfExtract(right)
      case And(left @ And(l2, r2), right) =>
        dnfExtract(And(l2, And(r2, right)))
      case other =>
        other :: Nil
    }
  }

  def cnfExtract(expression: Expression): Seq[Expression] = {
    expression match {
      case And(left, right) =>
        cnfExtract(left) ++ cnfExtract(right)
      case Or(left @ Or(l2, r2), right) =>
        cnfExtract(Or(l2, Or(r2, right)))
      case other =>
        other :: Nil
    }
  }

  def splitDNFPredicates(condition: Expression) = dnfExtract(toDNF(condition))

  def splitCNFPredicates(condition: Expression) = cnfExtract(toCNF(condition))

  def splitConjunctivePredicates(condition: Expression): Seq[Expression] = {
    condition match {
      case And(cond1, cond2) =>
        splitConjunctivePredicates(cond1) ++ splitConjunctivePredicates(cond2)
      case other => other :: Nil
    }
  }

  def splitDisjunctivePredicates(condition: Expression): Seq[Expression] = {
    condition match {
      case Or(cond1, cond2) =>
        splitDisjunctivePredicates(cond1) ++ splitDisjunctivePredicates(cond2)
      case other => other :: Nil
    }
  }
}
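A rough usage sketch for the DNF helpers above, built from hand-made Catalyst predicates. The attribute names x and y are illustrative; in practice these attributes come from a plan's output.

import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, GreaterThan, Literal, Or}
import org.apache.spark.sql.types.IntegerType

// Illustrative attributes.
val x = AttributeReference("x", IntegerType)()
val y = AttributeReference("y", IntegerType)()

// (x > 1 OR y > 2) AND y > 0
val cond = And(Or(GreaterThan(x, Literal(1)), GreaterThan(y, Literal(2))), GreaterThan(y, Literal(0)))

// toDNF distributes AND over OR: (x > 1 AND y > 0) OR (y > 2 AND y > 0)
val dnf = PredicateUtil.toDNF(cond)

// splitDNFPredicates returns the disjuncts: Seq(x > 1 AND y > 0, y > 2 AND y > 0)
val disjuncts = PredicateUtil.splitDNFPredicates(cond)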
Example 5
Source File: InRange.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.expression

import org.apache.spark.sql.simba.{ShapeSerializer, ShapeType}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, Predicate}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.simba.spatial.{MBR, Point, Shape}
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.sql.catalyst.util.GenericArrayData

case class InRange(shape: Expression, range_low: Expression, range_high: Expression)
  extends Predicate with CodegenFallback {

  override def nullable: Boolean = false

  override def eval(input: InternalRow): Any = {
    val eval_shape = ShapeUtils.getShape(shape, input)
    val eval_low = range_low.asInstanceOf[Literal].value.asInstanceOf[Point]
    val eval_high = range_high.asInstanceOf[Literal].value.asInstanceOf[Point]
    require(eval_shape.dimensions == eval_low.dimensions && eval_shape.dimensions == eval_high.dimensions)
    val mbr = MBR(eval_low, eval_high)
    mbr.intersects(eval_shape)
  }

  override def toString: String = s" **($shape) IN Rectangle ($range_low) - ($range_high)** "

  override def children: Seq[Expression] = Seq(shape, range_low, range_high)
}
Example 6
Source File: FilterExec.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution

import org.apache.spark.sql.simba.expression._
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Literal, PredicateHelper}
import org.apache.spark.sql.catalyst.expressions.{SortOrder, And => SQLAnd, Not => SQLNot, Or => SQLOr}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.SparkPlan

case class FilterExec(condition: Expression, child: SparkPlan)
  extends SimbaPlan with PredicateHelper {

  override def output: Seq[Attribute] = child.output

  private class DistanceOrdering(point: Expression, target: Point) extends Ordering[InternalRow] {
    override def compare(x: InternalRow, y: InternalRow): Int = {
      val shape_x = ShapeUtils.getShape(point, child.output, x)
      val shape_y = ShapeUtils.getShape(point, child.output, y)
      val dis_x = target.minDist(shape_x)
      val dis_y = target.minDist(shape_y)
      dis_x.compare(dis_y)
    }
  }

  // TODO change target partition from 1 to some good value
  // Note that target here must be an point literal in WHERE clause,
  // hence we can consider it as Point safely
  def knn(rdd: RDD[InternalRow], point: Expression, target: Point, k: Int): RDD[InternalRow] =
    sparkContext.parallelize(rdd.map(_.copy()).takeOrdered(k)(new DistanceOrdering(point, target)), 1)

  def applyCondition(rdd: RDD[InternalRow], condition: Expression): RDD[InternalRow] = {
    condition match {
      case InKNN(point, target, k) =>
        val _target = target.asInstanceOf[Literal].value.asInstanceOf[Point]
        knn(rdd, point, _target, k.value.asInstanceOf[Number].intValue())
      case now @ And(left, right) =>
        if (!now.hasKNN) rdd.mapPartitions { iter => iter.filter(newPredicate(condition, child.output).eval(_)) }
        else applyCondition(rdd, left).map(_.copy()).intersection(applyCondition(rdd, right).map(_.copy()))
      case now @ Or(left, right) =>
        if (!now.hasKNN) rdd.mapPartitions { iter => iter.filter(newPredicate(condition, child.output).eval(_)) }
        else applyCondition(rdd, left).map(_.copy()).union(applyCondition(rdd, right).map(_.copy())).distinct()
      case now @ Not(c) =>
        if (!now.hasKNN) rdd.mapPartitions { iter => iter.filter(newPredicate(condition, child.output).eval(_)) }
        else rdd.map(_.copy()).subtract(applyCondition(rdd, c).map(_.copy()))
      case _ =>
        rdd.mapPartitions(iter => iter.filter(newPredicate(condition, child.output).eval(_)))
    }
  }

  protected def doExecute(): RDD[InternalRow] = {
    val root_rdd = child.execute()
    condition transformUp {
      case SQLAnd(left, right) => And(left, right)
      case SQLOr(left, right) => Or(left, right)
      case SQLNot(c) => Not(c)
    }
    applyCondition(root_rdd, condition)
  }

  override def outputOrdering: Seq[SortOrder] = child.outputOrdering

  override def children: Seq[SparkPlan] = child :: Nil

  override def outputPartitioning: Partitioning = child.outputPartitioning
}
Example 7
Source File: CDJSpark.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.SparkPlan

case class CDJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SparkPlan {

  override def outputPartitioning: Partitioning = left.outputPartitioning

  override def output: Seq[Attribute] = left.output ++ right.output

  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] =
    left.execute().cartesian(right.execute()).mapPartitions { iter =>
      val joinedRow = new JoinedRow
      iter.filter { row =>
        val point1 = ShapeUtils.getShape(left_key, left.output, row._1).asInstanceOf[Point]
        val point2 = ShapeUtils.getShape(right_key, right.output, row._2).asInstanceOf[Point]
        point1.minDist(point2) <= r
      }.map(row => joinedRow(row._1, row._2))
    }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 8
Source File: CKJSpark.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.SparkPlan

case class CKJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def outputPartitioning: Partitioning = left.outputPartitioning

  override def output: Seq[Attribute] = left.output ++ right.output

  final val k = l.value.asInstanceOf[Number].intValue()

  override protected def doExecute(): RDD[InternalRow] = {
    val left_rdd = left.execute()
    val right_rdd = right.execute()
    left_rdd.map(row =>
      (ShapeUtils.getShape(left_key, left.output, row).asInstanceOf[Point], row)
    ).cartesian(right_rdd).map { case (l: (Point, InternalRow), r: InternalRow) =>
      val tmp_point = ShapeUtils.getShape(right_key, right.output, r).asInstanceOf[Point]
      l._2 -> List((tmp_point.minDist(l._1), r))
    }.reduceByKey {
      case (l_list: Seq[(Double, InternalRow)], r_list: Seq[(Double, InternalRow)]) =>
        (l_list ++ r_list).sortWith(_._1 < _._1).take(k)
    }.flatMapValues(list => list).mapPartitions { iter =>
      val joinedRow = new JoinedRow
      iter.map(r => joinedRow(r._1, r._2._2))
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 9
Source File: RDJSpark.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.index.RTree
import org.apache.spark.sql.simba.partitioner.{MapDPartition, STRPartition}
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable

case class RDJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val sample_rate = simbaSessionState.simbaConf.sampleRate
  final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode
  final val transfer_threshold = simbaSessionState.simbaConf.transferThreshold
  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] = {
    val left_rdd = left.execute().map(row =>
      (ShapeUtils.getShape(left_key, left.output, row).asInstanceOf[Point], row)
    )

    val right_rdd = right.execute().map(row =>
      (ShapeUtils.getShape(right_key, right.output, row).asInstanceOf[Point], row)
    )

    val dimension = right_rdd.first()._1.coord.length

    val (left_partitioned, left_mbr_bound) = STRPartition(left_rdd, dimension, num_partitions,
      sample_rate, transfer_threshold, max_entries_per_node)

    val left_part_size = left_partitioned.mapPartitions {
      iter => Array(iter.length).iterator
    }.collect()

    val left_rt = RTree(left_mbr_bound.zip(left_part_size).map(x => (x._1._1, x._1._2, x._2)),
      max_entries_per_node)
    val bc_rt = sparkContext.broadcast(left_rt)

    val right_dup = right_rdd.flatMap { x =>
      bc_rt.value.circleRange(x._1, r).map(now => (now._2, x))
    }

    val right_dup_partitioned = MapDPartition(right_dup, left_mbr_bound.length)

    left_partitioned.zipPartitions(right_dup_partitioned) { (leftIter, rightIter) =>
      val ans = mutable.ListBuffer[InternalRow]()
      val right_data = rightIter.map(_._2).toArray
      if (right_data.length > 0) {
        val right_index = RTree(right_data.map(_._1).zipWithIndex, max_entries_per_node)
        leftIter.foreach { now =>
          ans ++= right_index.circleRange(now._1, r)
            .map(x => new JoinedRow(now._2, right_data(x._2)._2))
        }
      }
      ans.iterator
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 10
Source File: DJSpark.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.index.RTree
import org.apache.spark.sql.simba.partitioner.{MapDPartition, STRPartition}
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable

case class DJSpark(left_key: Expression, right_key: Expression, l: Literal,
                   left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val sample_rate = simbaSessionState.simbaConf.sampleRate
  final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode
  final val transfer_threshold = simbaSessionState.simbaConf.transferThreshold
  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] = {
    val left_rdd = left.execute().map(row =>
      (ShapeUtils.getShape(left_key, left.output, row).asInstanceOf[Point], row)
    )

    val right_rdd = right.execute().map(row =>
      (ShapeUtils.getShape(right_key, right.output, row).asInstanceOf[Point], row)
    )

    val dimension = right_rdd.first()._1.coord.length

    val (left_partitioned, left_mbr_bound) = STRPartition(left_rdd, dimension, num_partitions,
      sample_rate, transfer_threshold, max_entries_per_node)
    val (right_partitioned, right_mbr_bound) = STRPartition(right_rdd, dimension, num_partitions,
      sample_rate, transfer_threshold, max_entries_per_node)

    val right_rt = RTree(right_mbr_bound.zip(Array.fill[Int](right_mbr_bound.length)(0))
      .map(x => (x._1._1, x._1._2, x._2)), max_entries_per_node)

    val left_dup = new Array[Array[Int]](left_mbr_bound.length)
    val right_dup = new Array[Array[Int]](right_mbr_bound.length)

    var tot = 0
    left_mbr_bound.foreach { now =>
      val res = right_rt.circleRange(now._1, r)
      val tmp_arr = mutable.ArrayBuffer[Int]()
      res.foreach { x =>
        if (right_dup(x._2) == null) right_dup(x._2) = Array(tot)
        else right_dup(x._2) = right_dup(x._2) :+ tot
        tmp_arr += tot
        tot += 1
      }
      left_dup(now._2) = tmp_arr.toArray
    }

    val bc_left_dup = sparkContext.broadcast(left_dup)
    val bc_right_dup = sparkContext.broadcast(right_dup)

    val left_dup_rdd = left_partitioned.mapPartitionsWithIndex { (id, iter) =>
      iter.flatMap { now =>
        val tmp_list = bc_left_dup.value(id)
        if (tmp_list != null) tmp_list.map(x => (x, now))
        else Array[(Int, (Point, InternalRow))]()
      }
    }

    val right_dup_rdd = right_partitioned.mapPartitionsWithIndex { (id, iter) =>
      iter.flatMap { now =>
        val tmp_list = bc_right_dup.value(id)
        if (tmp_list != null) tmp_list.map(x => (x, now))
        else Array[(Int, (Point, InternalRow))]()
      }
    }

    val left_dup_partitioned = MapDPartition(left_dup_rdd, tot).map(_._2)
    val right_dup_partitioned = MapDPartition(right_dup_rdd, tot).map(_._2)

    left_dup_partitioned.zipPartitions(right_dup_partitioned) { (leftIter, rightIter) =>
      val ans = mutable.ListBuffer[InternalRow]()
      val right_data = rightIter.toArray
      if (right_data.nonEmpty) {
        val right_index = RTree(right_data.map(_._1).zipWithIndex, max_entries_per_node)
        leftIter.foreach { now =>
          ans ++= right_index.circleRange(now._1, r)
            .map(x => new JoinedRow(now._2, right_data(x._2)._2))
        }
      }
      ans.iterator
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 11
Source File: BDJSparkR.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.index.RTree
import org.apache.spark.sql.simba.partitioner.MapDPartition
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable
import scala.util.Random

case class BDJSparkR(left_key: Expression, right_key: Expression, l: Literal,
                     left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val r = NumberUtil.literalToDouble(l)
  final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode

  override protected def doExecute(): RDD[InternalRow] = {
    val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _)))

    val tot_dup_rdd = tot_rdd.flatMap { x =>
      val rand_no = new Random().nextInt(num_partitions)
      var ans = mutable.ListBuffer[(Int, (Int, InternalRow))]()
      if (x._1 == 0) {
        val base = rand_no * num_partitions
        for (i <- 0 until num_partitions)
          ans += ((base + i, x))
      } else {
        for (i <- 0 until num_partitions)
          ans += ((i * num_partitions + rand_no, x))
      }
      ans
    }

    val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions)

    tot_dup_partitioned.mapPartitions { iter =>
      var left_data = mutable.ListBuffer[(Point, InternalRow)]()
      var right_data = mutable.ListBuffer[(Point, InternalRow)]()
      while (iter.hasNext) {
        val data = iter.next()
        if (data._2._1 == 0) {
          val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point]
          left_data += ((tmp_point, data._2._2))
        } else {
          val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point]
          right_data += ((tmp_point, data._2._2))
        }
      }

      val joined_ans = mutable.ListBuffer[InternalRow]()

      if (right_data.nonEmpty) {
        val right_rtree = RTree(right_data.map(_._1).zipWithIndex.toArray, max_entries_per_node)
        left_data.foreach(left =>
          right_rtree.circleRange(left._1, r)
            .foreach(x => joined_ans += new JoinedRow(left._2, right_data(x._2)._2)))
      }

      joined_ans.iterator
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 12
Source File: BKJSparkR.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.index.RTree
import org.apache.spark.sql.simba.partitioner.MapDPartition
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable
import scala.util.Random

case class BKJSparkR(left_key: Expression, right_key: Expression, l: Literal,
                     left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode
  final val k = l.value.asInstanceOf[Number].intValue()

  private class DisOrdering extends Ordering[(InternalRow, Double)] {
    override def compare(x : (InternalRow, Double), y: (InternalRow, Double)): Int =
      -x._2.compare(y._2)
  }

  override protected def doExecute(): RDD[InternalRow] = {
    val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _)))

    val tot_dup_rdd = tot_rdd.flatMap { x =>
      val rand_no = new Random().nextInt(num_partitions)
      val ans = mutable.ListBuffer[(Int, (Int, InternalRow))]()
      if (x._1 == 0) {
        val base = rand_no * num_partitions
        for (i <- 0 until num_partitions)
          ans += ((base + i, x))
      } else {
        for (i <- 0 until num_partitions)
          ans += ((i * num_partitions + rand_no, x))
      }
      ans
    }

    val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions)

    tot_dup_partitioned.mapPartitions { iter =>
      var left_data = mutable.ListBuffer[(Point, InternalRow)]()
      var right_data = mutable.ListBuffer[(Point, InternalRow)]()
      while (iter.hasNext) {
        val data = iter.next()
        if (data._2._1 == 0) {
          val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point]
          left_data += ((tmp_point, data._2._2))
        } else {
          val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point]
          right_data += ((tmp_point, data._2._2))
        }
      }

      val joined_ans = mutable.ListBuffer[(InternalRow, Array[(InternalRow, Double)])]()

      if (right_data.nonEmpty) {
        val right_rtree = RTree(right_data.map(_._1).zipWithIndex.toArray, max_entries_per_node)
        left_data.foreach(left =>
          joined_ans += ((left._2, right_rtree.kNN(left._1, k, keepSame = false)
            .map(x => (right_data(x._2)._2, x._1.minDist(left._1)))))
        )
      }

      joined_ans.iterator
    }.reduceByKey((left, right) => (left ++ right).sortWith(_._2 < _._2).take(k), num_partitions)
      .flatMap { now =>
        now._2.map(x => new JoinedRow(now._1, x._1))
      }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 13
Source File: BKJSpark.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.partitioner.MapDPartition
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.util.BoundedPriorityQueue

import scala.collection.mutable
import scala.util.Random

case class BKJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val k = l.value.asInstanceOf[Number].intValue()

  private class DisOrdering extends Ordering[(InternalRow, Double)] {
    override def compare(x : (InternalRow, Double), y: (InternalRow, Double)): Int =
      -x._2.compare(y._2)
  }

  override protected def doExecute(): RDD[InternalRow] = {
    val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _)))

    val tot_dup_rdd = tot_rdd.flatMap { x =>
      val rand_no = new Random().nextInt(num_partitions)
      val ans = mutable.ListBuffer[(Int, (Int, InternalRow))]()
      if (x._1 == 0) {
        val base = rand_no * num_partitions
        for (i <- 0 until num_partitions)
          ans += ((base + i, x))
      } else {
        for (i <- 0 until num_partitions)
          ans += ((i * num_partitions + rand_no, x))
      }
      ans
    }

    val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions)

    tot_dup_partitioned.mapPartitions { iter =>
      var left_data = mutable.ListBuffer[(Point, InternalRow)]()
      var right_data = mutable.ListBuffer[(Point, InternalRow)]()
      while (iter.hasNext) {
        val data = iter.next()
        if (data._2._1 == 0) {
          val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point]
          left_data += ((tmp_point, data._2._2))
        } else {
          val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point]
          right_data += ((tmp_point, data._2._2))
        }
      }

      val joined_ans = mutable.ListBuffer[(InternalRow, Array[(InternalRow, Double)])]()

      left_data.foreach(left => {
        var pq = new BoundedPriorityQueue[(InternalRow, Double)](k)(new DisOrdering)
        right_data.foreach(right => pq += ((right._2, right._1.minDist(left._1))))
        joined_ans += ((left._2, pq.toArray))
      })

      joined_ans.iterator
    }.reduceByKey((left, right) => (left ++ right).sortWith(_._2 < _._2).take(k), num_partitions)
      .flatMap { now =>
        now._2.map(x => new JoinedRow(now._1, x._1))
      }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 14
Source File: BDJSpark.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.partitioner.MapDPartition
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable
import scala.util.Random

case class BDJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] = {
    val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _)))

    val tot_dup_rdd = tot_rdd.flatMap { x =>
      val rand_no = new Random().nextInt(num_partitions)
      var ans = mutable.ListBuffer[(Int, (Int, InternalRow))]()
      if (x._1 == 0) {
        val base = rand_no * num_partitions
        for (i <- 0 until num_partitions)
          ans += ((base + i, x))
      } else {
        for (i <- 0 until num_partitions)
          ans += ((i * num_partitions + rand_no, x))
      }
      ans
    }

    val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions)

    tot_dup_partitioned.mapPartitions { iter =>
      var left_data = mutable.ListBuffer[(Point, InternalRow)]()
      var right_data = mutable.ListBuffer[(Point, InternalRow)]()
      while (iter.hasNext) {
        val data = iter.next()
        if (data._2._1 == 0) {
          val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point]
          left_data += ((tmp_point, data._2._2))
        } else {
          val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point]
          right_data += ((tmp_point, data._2._2))
        }
      }

      val joined_ans = mutable.ListBuffer[InternalRow]()

      left_data.foreach { left =>
        right_data.foreach { right =>
          if (left._1.minDist(right._1) <= r) {
            joined_ans += new JoinedRow(left._2, right._2)
          }
        }
      }

      joined_ans.iterator
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 15
Source File: SimbaOptimizer.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba

import org.apache.spark.sql.ExperimentalMethods
import org.apache.spark.sql.catalyst.catalog.SessionCatalog
import org.apache.spark.sql.catalyst.expressions.{And, Expression, PredicateHelper}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.SparkOptimizer
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.simba.plans.SpatialJoin

class SimbaOptimizer(catalog: SessionCatalog,
                     conf: SQLConf,
                     experimentalMethods: ExperimentalMethods)
  extends SparkOptimizer(catalog, conf, experimentalMethods) {
  override def batches: Seq[Batch] = super.batches :+
    Batch("SpatialJoinPushDown", FixedPoint(100), PushPredicateThroughSpatialJoin)
}

object PushPredicateThroughSpatialJoin extends Rule[LogicalPlan] with PredicateHelper {
  private def split(condition: Seq[Expression], left: LogicalPlan, right: LogicalPlan) = {
    val (leftEvaluateCondition, rest) =
      condition.partition(_.references subsetOf left.outputSet)
    val (rightEvaluateCondition, commonCondition) =
      rest.partition(_.references subsetOf right.outputSet)
    (leftEvaluateCondition, rightEvaluateCondition, commonCondition)
  }

  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    // push the where condition down into join filter
    case f @ Filter(filterCondition, SpatialJoin(left, right, joinType, joinCondition)) =>
      val (leftFilterConditions, rightFilterConditions, commonFilterCondition) =
        split(splitConjunctivePredicates(filterCondition), left, right)

      val newLeft = leftFilterConditions.reduceLeftOption(And).map(Filter(_, left)).getOrElse(left)
      val newRight = rightFilterConditions.reduceLeftOption(And).map(Filter(_, right)).getOrElse(right)
      val newJoinCond = (commonFilterCondition ++ joinCondition).reduceLeftOption(And)

      SpatialJoin(newLeft, newRight, joinType, newJoinCond)

    // push down the join filter into sub query scanning if applicable
    case f @ SpatialJoin(left, right, joinType, joinCondition) =>
      val (leftJoinConditions, rightJoinConditions, commonJoinCondition) =
        split(joinCondition.map(splitConjunctivePredicates).getOrElse(Nil), left, right)

      val newLeft = leftJoinConditions.reduceLeftOption(And).map(Filter(_, left)).getOrElse(left)
      val newRight = rightJoinConditions.reduceLeftOption(And).map(Filter(_, right)).getOrElse(right)
      val newJoinCond = commonJoinCondition.reduceLeftOption(And)

      SpatialJoin(newLeft, newRight, joinType, newJoinCond)
  }
}
Example 16
Source File: SpatialJoin.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.plans

import org.apache.spark.sql.simba.expression.{InCircleRange, InKNN}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{BinaryNode, LogicalPlan}
import org.apache.spark.sql.types.BooleanType

case class SpatialJoin(left: LogicalPlan, right: LogicalPlan, joinType: SpatialJoinType,
                       condition: Option[Expression]) extends BinaryNode {
  override def output: Seq[Attribute] = {
    joinType match {
      case KNNJoin =>
        require(condition.get.isInstanceOf[InKNN])
        left.output ++ right.output
      case ZKNNJoin =>
        require(condition.get.isInstanceOf[InKNN])
        left.output ++ right.output
      case DistanceJoin =>
        require(condition.get.isInstanceOf[InCircleRange])
        left.output ++ right.output.map(_.withNullability(true))
      case _ =>
        left.output ++ right.output
    }
  }

  def selfJoinResolved: Boolean = left.outputSet.intersect(right.outputSet).isEmpty

  // Joins are only resolved if they don't introduce ambiguous expression ids.
  override lazy val resolved: Boolean = {
    childrenResolved &&
      expressions.forall(_.resolved) &&
      selfJoinResolved &&
      condition.forall(_.dataType == BooleanType)
  }
}
Example 17
Source File: SqlExtensionProviderSuite.scala, from glow (Apache License 2.0)
package io.projectglow.sql

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Expression, Literal, UnaryExpression}
import org.apache.spark.sql.types.{DataType, IntegerType}

import io.projectglow.GlowSuite

class SqlExtensionProviderSuite extends GlowSuite {
  override def beforeAll(): Unit = {
    super.beforeAll()
    SqlExtensionProvider.registerFunctions(
      spark.sessionState.conf,
      spark.sessionState.functionRegistry,
      "test-functions.yml")
  }

  private lazy val sess = spark

  test("one arg function") {
    import sess.implicits._
    assert(spark.range(1).selectExpr("one_arg_test(id)").as[Int].head() == 1)

    intercept[AnalysisException] {
      spark.range(1).selectExpr("one_arg_test()").collect()
    }

    intercept[AnalysisException] {
      spark.range(1).selectExpr("one_arg_test(id, id)").collect()
    }
  }

  test("two arg function") {
    import sess.implicits._
    assert(spark.range(1).selectExpr("two_arg_test(id, id)").as[Int].head() == 1)

    intercept[AnalysisException] {
      spark.range(1).selectExpr("two_arg_test(id)").collect()
    }

    intercept[AnalysisException] {
      spark.range(1).selectExpr("two_arg_test(id, id, id)").collect()
    }
  }

  test("var args function") {
    import sess.implicits._
    assert(spark.range(1).selectExpr("var_args_test(id, id)").as[Int].head() == 1)
    assert(spark.range(1).selectExpr("var_args_test(id, id, id, id)").as[Int].head() == 1)
    assert(spark.range(1).selectExpr("var_args_test(id)").as[Int].head() == 1)

    intercept[AnalysisException] {
      spark.range(1).selectExpr("var_args_test()").collect()
    }
  }

  test("can call optional arg function") {
    import sess.implicits._
    assert(spark.range(1).selectExpr("optional_arg_test(id)").as[Int].head() == 1)
    assert(spark.range(1).selectExpr("optional_arg_test(id, id)").as[Int].head() == 1)

    intercept[AnalysisException] {
      spark.range(1).selectExpr("optional_arg_test()").collect()
    }

    intercept[AnalysisException] {
      spark.range(1).selectExpr("optional_arg_test(id, id, id)").collect()
    }
  }
}

trait TestExpr extends Expression with CodegenFallback {
  override def dataType: DataType = IntegerType

  override def nullable: Boolean = true

  override def eval(input: InternalRow): Any = 1
}

case class OneArgExpr(child: Expression) extends UnaryExpression with TestExpr

case class TwoArgExpr(left: Expression, right: Expression) extends BinaryExpression with TestExpr

case class VarArgsExpr(arg: Expression, varArgs: Seq[Expression]) extends TestExpr {
  override def children: Seq[Expression] = arg +: varArgs
}

case class OptionalArgExpr(required: Expression, optional: Expression) extends TestExpr {
  def this(required: Expression) = this(required, Literal(1))
  override def children: Seq[Expression] = Seq(required, optional)
}
Example 18
Source File: LinearRegressionExpr.scala, from glow (Apache License 2.0)
package io.projectglow.sql.expressions

import breeze.linalg.DenseVector
import org.apache.spark.TaskContext
import org.apache.spark.sql.SQLUtils
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes, TernaryExpression}
import org.apache.spark.sql.catalyst.util.ArrayData
import org.apache.spark.sql.types._

object LinearRegressionExpr {
  private val matrixUDT = SQLUtils.newMatrixUDT()
  private val state = new ThreadLocal[CovariateQRContext]

  def doLinearRegression(genotypes: Any, phenotypes: Any, covariates: Any): InternalRow = {
    if (state.get() == null) {
      // Save the QR factorization of the covariate matrix since it's the same for every row
      state.set(CovariateQRContext.computeQR(matrixUDT.deserialize(covariates).toDense))
      TaskContext.get().addTaskCompletionListener[Unit](_ => state.remove())
    }

    LinearRegressionGwas.linearRegressionGwas(
      new DenseVector[Double](genotypes.asInstanceOf[ArrayData].toDoubleArray()),
      new DenseVector[Double](phenotypes.asInstanceOf[ArrayData].toDoubleArray()),
      state.get()
    )
  }
}

case class LinearRegressionExpr(
    genotypes: Expression,
    phenotypes: Expression,
    covariates: Expression)
  extends TernaryExpression
    with ImplicitCastInputTypes {

  private val matrixUDT = SQLUtils.newMatrixUDT()

  override def dataType: DataType =
    StructType(
      Seq(
        StructField("beta", DoubleType),
        StructField("standardError", DoubleType),
        StructField("pValue", DoubleType)))

  override def inputTypes: Seq[DataType] =
    Seq(ArrayType(DoubleType), ArrayType(DoubleType), matrixUDT)

  override def children: Seq[Expression] = Seq(genotypes, phenotypes, covariates)

  override protected def nullSafeEval(genotypes: Any, phenotypes: Any, covariates: Any): Any = {
    LinearRegressionExpr.doLinearRegression(genotypes, phenotypes, covariates)
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    nullSafeCodeGen(
      ctx,
      ev,
      (genotypes, phenotypes, covariates) => {
        s"""
           |${ev.value} = io.projectglow.sql.expressions.LinearRegressionExpr.doLinearRegression($genotypes, $phenotypes, $covariates);
         """.stripMargin
      }
    )
  }
}
Example 19
Source File: HiveAcidRelation.scala, from spark-acid (Apache License 2.0)
package com.qubole.spark.hiveacid.datasource

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Column, DataFrame, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, Filter, InsertableRelation, PrunedFilteredScan}
import org.apache.spark.sql.types._
import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable, SparkAcidConf}
import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenNotInsert}
import org.apache.spark.sql.catalyst.AliasIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

import collection.JavaConversions._

case class HiveAcidRelation(sparkSession: SparkSession,
                            fullyQualifiedTableName: String,
                            parameters: Map[String, String])
  extends BaseRelation
  with InsertableRelation
  with PrunedFilteredScan
  with Logging {

  private val hiveAcidMetadata: HiveAcidMetadata = HiveAcidMetadata.fromSparkSession(
    sparkSession,
    fullyQualifiedTableName
  )
  private val hiveAcidTable: HiveAcidTable = new HiveAcidTable(sparkSession,
    hiveAcidMetadata, parameters)

  private val readOptions = SparkAcidConf(sparkSession, parameters)

  override def sqlContext: SQLContext = sparkSession.sqlContext

  override val schema: StructType = if (readOptions.includeRowIds) {
    hiveAcidMetadata.tableSchemaWithRowId
  } else {
    hiveAcidMetadata.tableSchema
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    // sql insert into and overwrite
    if (overwrite) {
      hiveAcidTable.insertOverwrite(data)
    } else {
      hiveAcidTable.insertInto(data)
    }
  }

  def update(condition: Option[Column], newValues: Map[String, Column]): Unit = {
    hiveAcidTable.update(condition, newValues)
  }

  def delete(condition: Column): Unit = {
    hiveAcidTable.delete(condition)
  }

  override def sizeInBytes: Long = {
    val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor
    (sparkSession.sessionState.conf.defaultSizeInBytes * compressionFactor).toLong
  }

  def merge(sourceDf: DataFrame,
            mergeExpression: Expression,
            matchedClause: Seq[MergeWhenClause],
            notMatched: Option[MergeWhenNotInsert],
            sourceAlias: Option[AliasIdentifier],
            targetAlias: Option[AliasIdentifier]): Unit = {
    hiveAcidTable.merge(sourceDf,
      mergeExpression, matchedClause,
      notMatched, sourceAlias, targetAlias)
  }

  def getHiveAcidTable(): HiveAcidTable = {
    hiveAcidTable
  }

  // FIXME: should it be true / false. Recommendation seems to
  // be to leave it as true
  override val needConversion: Boolean = false

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    val readOptions = SparkAcidConf(sparkSession, parameters)
    // sql "select *"
    hiveAcidTable.getRdd(requiredColumns, filters, readOptions)
  }
}
Example 20
Source File: UpdateCommand.scala, from spark-acid (Apache License 2.0)
package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command

import com.qubole.spark.hiveacid.HiveAcidErrors
import com.qubole.spark.hiveacid.datasource.HiveAcidRelation
import org.apache.spark.sql.{Column, Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.LogicalRelation

case class UpdateCommand(
    table: LogicalPlan,
    setExpressions: Map[String, Expression],
    condition: Option[Expression])
  extends RunnableCommand {

  override def children: Seq[LogicalPlan] = Seq(table)
  override def output: Seq[Attribute] = Seq.empty
  override lazy val resolved: Boolean = childrenResolved

  override def run(sparkSession: SparkSession): Seq[Row] = {
    if (children.size != 1) {
      throw new IllegalArgumentException("UPDATE command should have one table to update, whereas this has: "
        + children.size)
    }
    children(0) match {
      case LogicalRelation(relation: HiveAcidRelation, _, _ , _) => {
        val setColumns = setExpressions.mapValues(expr => new Column(expr))
        val updateFilterColumn = condition.map(new Column(_))
        relation.update(updateFilterColumn, setColumns)
      }
      case LogicalRelation(_, _, Some(catalogTable), _) =>
        throw HiveAcidErrors.tableNotAcidException(catalogTable.qualifiedName)
      case _ => throw HiveAcidErrors.tableNotAcidException(table.toString())
    }
    Seq.empty[Row]
  }
}
Example 21
Source File: MergeCommand.scala, from spark-acid (Apache License 2.0)
package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command

import com.qubole.spark.hiveacid.HiveAcidErrors
import com.qubole.spark.hiveacid.datasource.HiveAcidRelation
import com.qubole.spark.hiveacid.merge.{MergeCondition, MergeWhenClause, MergeWhenNotInsert}
import org.apache.spark.sql.catalyst.AliasIdentifier
import org.apache.spark.sql.{Row, SparkSession, SqlUtils}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.LogicalRelation

case class MergeCommand(targetTable: LogicalPlan,
                        sourceTable: LogicalPlan,
                        matched: Seq[MergeWhenClause],
                        notMatched: Option[MergeWhenClause],
                        mergeCondition: MergeCondition,
                        sourceAlias: Option[AliasIdentifier],
                        targetAlias: Option[AliasIdentifier])
  extends RunnableCommand {

  override def children: Seq[LogicalPlan] = Seq(targetTable, sourceTable)
  override def output: Seq[Attribute] = Seq.empty
  override lazy val resolved: Boolean = childrenResolved

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val insertClause: Option[MergeWhenNotInsert] = notMatched match {
      case Some(i: MergeWhenNotInsert) => Some(i)
      case None => None
      case _ => throw HiveAcidErrors.mergeValidationError("WHEN NOT Clause has to be INSERT CLAUSE")
    }

    children.head match {
      case LogicalRelation(relation: HiveAcidRelation, _, _ , _) =>
        relation.merge(SqlUtils.logicalPlanToDataFrame(sparkSession, sourceTable),
          mergeCondition.expression, matched, insertClause, sourceAlias, targetAlias)

      case SubqueryAlias(_, LogicalRelation(relation: HiveAcidRelation, _, _, _)) =>
        relation.merge(SqlUtils.logicalPlanToDataFrame(sparkSession, sourceTable),
          mergeCondition.expression, matched, insertClause, sourceAlias, targetAlias)

      case _ => throw HiveAcidErrors.tableNotAcidException(targetTable.toString())
    }

    Seq.empty
  }
}
Example 22
Source File: DeleteCommand.scala, from spark-acid (Apache License 2.0)
package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command

import com.qubole.spark.hiveacid.HiveAcidErrors
import com.qubole.spark.hiveacid.datasource.HiveAcidRelation
import org.apache.spark.sql.{Column, Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.LogicalRelation

case class DeleteCommand(
    table: LogicalPlan,
    condition: Expression)
  extends RunnableCommand {

  // We don't want `table` in children as sometimes we don't want to transform it.
  override def children: Seq[LogicalPlan] = Seq(table)
  override def output: Seq[Attribute] = Seq.empty
  override lazy val resolved: Boolean = childrenResolved

  override def run(sparkSession: SparkSession): Seq[Row] = {
    if (children.size != 1) {
      throw new IllegalArgumentException("DELETE command should specify exactly one table, whereas this has: "
        + children.size)
    }
    children(0) match {
      case LogicalRelation(relation: HiveAcidRelation, _, _ , _) => {
        relation.delete(new Column(condition))
      }
      case _ => throw HiveAcidErrors.tableNotAcidException(table.toString())
    }
    Seq.empty[Row]
  }
}
Example 23
Source File: MergePlan.scala, from spark-acid (Apache License 2.0)
package org.apache.spark.sql.catalyst.parser.plans.logical

import com.qubole.spark.hiveacid.merge.{MergeWhenClause}
import org.apache.spark.sql.{SparkSession, SqlUtils}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan}

case class MergePlan(sourcePlan: LogicalPlan,
                     targetPlan: LogicalPlan,
                     condition: Expression,
                     matched: Seq[MergeWhenClause],
                     notMatched: Option[MergeWhenClause]) extends Command {
  override def children: Seq[LogicalPlan] = Seq(sourcePlan, targetPlan)
  override def output: Seq[Attribute] = Seq.empty
}

object MergePlan {
  def resolve(sparkSession: SparkSession, mergePlan: MergePlan): MergePlan = {
    MergeWhenClause.validate(mergePlan.matched ++ mergePlan.notMatched)
    val resolvedCondition = SqlUtils.resolveReferences(sparkSession, mergePlan.condition,
      mergePlan.children, true, None)
    val resolvedMatched = MergeWhenClause.resolve(sparkSession, mergePlan, mergePlan.matched)
    val resolvedNotMatched = mergePlan.notMatched.map {
      x => x.resolve(sparkSession, mergePlan)
    }

    MergePlan(mergePlan.sourcePlan,
      mergePlan.targetPlan,
      resolvedCondition,
      resolvedMatched,
      resolvedNotMatched)
  }
}
Example 24
Source File: SqlUtils.scala, from spark-acid (Apache License 2.0)
package org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression}
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.types.StructType

object SqlUtils {
  def convertToDF(sparkSession: SparkSession, plan : LogicalPlan): DataFrame = {
    Dataset.ofRows(sparkSession, plan)
  }

  def resolveReferences(sparkSession: SparkSession,
                        expr: Expression,
                        planContaining: LogicalPlan, failIfUnresolved: Boolean,
                        exprName: Option[String] = None): Expression = {
    resolveReferences(sparkSession, expr, Seq(planContaining), failIfUnresolved, exprName)
  }

  def resolveReferences(sparkSession: SparkSession,
                        expr: Expression,
                        planContaining: Seq[LogicalPlan],
                        failIfUnresolved: Boolean,
                        exprName: Option[String]): Expression = {
    val newPlan = FakeLogicalPlan(expr, planContaining)
    val resolvedExpr = sparkSession.sessionState.analyzer.execute(newPlan) match {
      case FakeLogicalPlan(resolvedExpr: Expression, _) =>
        // Return even if it did not successfully resolve
        resolvedExpr
      case _ =>
        expr
      // This is unexpected
    }
    if (failIfUnresolved) {
      resolvedExpr.flatMap(_.references).filter(!_.resolved).foreach {
        attr => {
          val failedMsg = exprName match {
            case Some(name) =>
              s"${attr.sql} resolution in $name given these columns: " +
                planContaining.flatMap(_.output).map(_.name).mkString(",")
            case _ =>
              s"${attr.sql} resolution failed given these columns: " +
                planContaining.flatMap(_.output).map(_.name).mkString(",")
          }
          attr.failAnalysis(failedMsg)
        }
      }
    }
    resolvedExpr
  }

  def hasSparkStopped(sparkSession: SparkSession): Boolean = {
    sparkSession.sparkContext.stopped.get()
  }

  def createDataFrameUsingAttributes(sparkSession: SparkSession,
                                     rdd: RDD[Row],
                                     schema: StructType,
                                     attributes: Seq[Attribute]): DataFrame = {
    val encoder = RowEncoder(schema)
    val catalystRows = rdd.map(encoder.toRow)
    val logicalPlan = LogicalRDD(
      attributes,
      catalystRows,
      isStreaming = false)(sparkSession)
    Dataset.ofRows(sparkSession, logicalPlan)
  }

  def analysisException(cause: String): Throwable = {
    new AnalysisException(cause)
  }
}

case class FakeLogicalPlan(expr: Expression, children: Seq[LogicalPlan])
  extends LogicalPlan {
  override def output: Seq[Attribute] = children.foldLeft(Seq[Attribute]())((out, child) => out ++ child.output)
}
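A rough usage sketch for resolveReferences above: parse a raw expression and resolve its attribute references against an existing DataFrame's analyzed plan. It assumes an active SparkSession named spark; the column name is illustrative.

// Build a DataFrame whose output provides the attributes to resolve against.
val df = spark.range(10).toDF("id")

// Parse a raw expression; its references start out unresolved.
val parsed = spark.sessionState.sqlParser.parseExpression("id > 5")

// Resolve against the DataFrame's plan, failing analysis if anything stays unresolved.
val resolved = SqlUtils.resolveReferences(
  spark, parsed, df.queryExecution.analyzed, failIfUnresolved = true)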
Example 25
Source File: HiveAcidUtils.scala, from spark-acid (Apache License 2.0)
package org.apache.spark.sql.hive

import scala.collection.JavaConverters._

import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTablePartition, CatalogUtils}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BoundReference, Expression, InterpretedPredicate, PrettyAttribute}

object HiveAcidUtils {

  def prunePartitionsByFilter(
      hiveAcidMetadata: HiveAcidMetadata,
      inputPartitions: Seq[CatalogTablePartition],
      predicates: Option[Expression],
      defaultTimeZoneId: String): Seq[CatalogTablePartition] = {
    if (predicates.isEmpty) {
      inputPartitions
    } else {
      val partitionSchema = hiveAcidMetadata.partitionSchema
      val partitionColumnNames = hiveAcidMetadata.partitionSchema.fieldNames.toSet

      val nonPartitionPruningPredicates = predicates.filterNot {
        _.references.map(_.name).toSet.subsetOf(partitionColumnNames)
      }
      if (nonPartitionPruningPredicates.nonEmpty) {
        throw new AnalysisException("Expected only partition pruning predicates: " +
          nonPartitionPruningPredicates)
      }

      val boundPredicate =
        InterpretedPredicate.create(predicates.get.transform {
          case att: Attribute =>
            val index = partitionSchema.indexWhere(_.name == att.name)
            BoundReference(index, partitionSchema(index).dataType, nullable = true)
        })

      inputPartitions.filter { p =>
        boundPredicate.eval(p.toRow(partitionSchema, defaultTimeZoneId))
      }
    }
  }

  def convertToCatalogTablePartition(hp: com.qubole.shaded.hadoop.hive.ql.metadata.Partition): CatalogTablePartition = {
    val apiPartition = hp.getTPartition
    val properties: Map[String, String] = if (hp.getParameters != null) {
      hp.getParameters.asScala.toMap
    } else {
      Map.empty
    }
    CatalogTablePartition(
      spec = Option(hp.getSpec).map(_.asScala.toMap).getOrElse(Map.empty),
      storage = CatalogStorageFormat(
        locationUri = Option(CatalogUtils.stringToURI(apiPartition.getSd.getLocation)),
        inputFormat = Option(apiPartition.getSd.getInputFormat),
        outputFormat = Option(apiPartition.getSd.getOutputFormat),
        serde = Option(apiPartition.getSd.getSerdeInfo.getSerializationLib),
        compressed = apiPartition.getSd.isCompressed,
        properties = Option(apiPartition.getSd.getSerdeInfo.getParameters)
          .map(_.asScala.toMap).orNull),
      createTime = apiPartition.getCreateTime.toLong * 1000,
      lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000,
      parameters = properties,
      stats = None) // TODO: need to implement readHiveStats
  }
}
Example 26
Source File: IntervalTreeJoin.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.genApp

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.catalyst.expressions.{Expression, InterpretedProjection, UnsafeRow}
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}

@DeveloperApi
case class IntervalTreeJoin(left: SparkPlan,
                            right: SparkPlan,
                            condition: Seq[Expression],
                            context: SparkSession) extends BinaryExecNode {
  def output = left.output ++ right.output

  lazy val (buildPlan, streamedPlan) = (left, right)

  lazy val (buildKeys, streamedKeys) =
    (List(condition(0), condition(1)), List(condition(2), condition(3)))

  @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output)
  @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output)

  protected override def doExecute(): RDD[InternalRow] = {
    val v1 = left.execute()
    val v1kv = v1.map(x => {
      val v1Key = buildKeyGenerator(x)
      (new Interval[Int](v1Key.getInt(0), v1Key.getInt(1)), x.copy())
    })
    val v2 = right.execute()
    val v2kv = v2.map(x => {
      val v2Key = streamKeyGenerator(x)
      (new Interval[Int](v2Key.getInt(0), v2Key.getInt(1)), x.copy())
    })
    if (v1.count <= v2.count) {
      val v3 = IntervalTreeJoinImpl.overlapJoin(context.sparkContext, v1kv, v2kv)
        .flatMap(l => l._2.map(r => (l._1, r)))
      v3.map {
        case (l: InternalRow, r: InternalRow) => {
          val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
          joiner.join(l.asInstanceOf[UnsafeRow], r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow]
          // resultProj(joinedRow(l, r)) joiner.joiner
        }
      }
    } else {
      val v3 = IntervalTreeJoinImpl.overlapJoin(context.sparkContext, v2kv, v1kv)
        .flatMap(l => l._2.map(r => (l._1, r)))
      v3.map {
        case (r: InternalRow, l: InternalRow) => {
          val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
          joiner.join(l.asInstanceOf[UnsafeRow], r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow]
          // resultProj(joinedRow(l, r)) joiner.joiner
        }
      }
    }
  }
}
Example 27
Source File: IntervalTreeJoinChromosome.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.methods.genApp import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.catalyst.expressions.{Expression, InterpretedProjection, UnsafeRow} import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.biodatageeks.sequila.rangejoins.genApp.Interval @DeveloperApi case class IntervalTreeJoinChromosome(left: SparkPlan, right: SparkPlan, condition: Seq[Expression], context: SparkSession) extends BinaryExecNode { def output = left.output ++ right.output lazy val (buildPlan, streamedPlan) = (left, right) lazy val (buildKeys, streamedKeys) = (List(condition(0), condition(1),condition(4)), List(condition(2), condition(3),condition(5))) @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output) @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output) protected override def doExecute(): RDD[InternalRow] = { val v1 = left.execute() val v1kv = v1.map(x => { val v1Key = buildKeyGenerator(x) ((v1Key.getString(2),new Interval[Int](v1Key.getInt(0), v1Key.getInt(1))), x.copy()) }) val v2 = right.execute() val v2kv = v2.map(x => { val v2Key = streamKeyGenerator(x) ((v2Key.getString(2),new Interval[Int](v2Key.getInt(0), v2Key.getInt(1))), x.copy()) }) if (v1.count <= v2.count) { val v3 = IntervalTreeJoinChromosomeImpl.overlapJoin(context.sparkContext, v1kv, v2kv) .flatMap(l => l._2 .map(r => (l._1, r))) v3.mapPartitions( p => { val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) p.map(r => joiner.join(r._1.asInstanceOf[UnsafeRow], r._2.asInstanceOf[UnsafeRow])) } ) } else { val v3 = IntervalTreeJoinChromosomeImpl.overlapJoin(context.sparkContext, v2kv, v1kv).flatMap(l => l._2.map(r => (l._1, r))) v3.mapPartitions( p => { val joiner = GenerateUnsafeRowJoiner.create(right.schema, left.schema) p.map(r=>joiner.join(r._2.asInstanceOf[UnsafeRow],r._1.asInstanceOf[UnsafeRow])) } ) } } }
Example 28
Source File: MetastoreIndexSuite.scala From parquet-index with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.Path import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType import com.github.lightcopy.testutil.UnitTestSuite import com.github.lightcopy.testutil.implicits._ // Test catalog to check internal methods private[datasources] class TestIndex extends MetastoreIndex { private var internalIndexFilters: Seq[Filter] = Nil override def tablePath(): Path = ??? override def partitionSchema: StructType = ??? override def indexSchema: StructType = ??? override def dataSchema: StructType = ??? override def setIndexFilters(filters: Seq[Filter]) = { internalIndexFilters = filters } override def indexFilters: Seq[Filter] = internalIndexFilters override def listFilesWithIndexSupport( partitionFilters: Seq[Expression], dataFilters: Seq[Expression], indexFilters: Seq[Filter]): Seq[PartitionDirectory] = ??? override def inputFiles: Array[String] = ??? override def sizeInBytes: Long = ??? } class MetastoreIndexSuite extends UnitTestSuite { test("provide sequence of path based on table path") { val catalog = new TestIndex() { override def tablePath(): Path = new Path("test") } catalog.rootPaths should be (Seq(new Path("test"))) } test("when using listFiles directly supply empty index filter") { var indexSeq: Seq[Filter] = null var filterSeq: Seq[Expression] = null val catalog = new TestIndex() { override def listFilesWithIndexSupport( partitionFilters: Seq[Expression], dataFilters: Seq[Expression], indexFilters: Seq[Filter]): Seq[PartitionDirectory] = { indexSeq = indexFilters filterSeq = partitionFilters Seq.empty } } catalog.listFiles(Seq.empty, Seq.empty) indexSeq should be (Nil) filterSeq should be (Nil) } test("refresh should be no-op by default") { val catalog = new TestIndex() catalog.refresh() } }
Example 29
Source File: SimilarityFunctions.scala From spark-stringmetric with MIT License | 5 votes |
package com.github.mrpowers.spark.stringmetric

import com.github.mrpowers.spark.stringmetric.expressions.HammingDistance
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.functions._

import java.util.Locale

import org.apache.commons.text.similarity.{
  CosineDistance,
  JaccardSimilarity,
  JaroWinklerDistance,
  FuzzyScore
}

object SimilarityFunctions {
  private def withExpr(expr: Expression): Column = new Column(expr)

  val cosine_distance = udf[Option[Double], String, String](cosineDistanceFun)

  def cosineDistanceFun(s1: String, s2: String): Option[Double] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val cd = new CosineDistance()
    Some(cd(str1, str2))
  }

  val fuzzy_score = udf[Option[Integer], String, String](fuzzyScoreFun)

  def fuzzyScoreFun(s1: String, s2: String): Option[Integer] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val f = new FuzzyScore(Locale.ENGLISH)
    Some(f.fuzzyScore(str1, str2))
  }

  def hamming(s1: Column, s2: Column): Column = withExpr {
    HammingDistance(s1.expr, s2.expr)
  }

  val jaccard_similarity = udf[Option[Double], String, String](jaccardSimilarityFun)

  def jaccardSimilarityFun(s1: String, s2: String): Option[Double] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val j = new JaccardSimilarity()
    Some(j.apply(str1, str2))
  }

  val jaro_winkler = udf[Option[Double], String, String](jaroWinklerFun)

  def jaroWinklerFun(s1: String, s2: String): Option[Double] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val j = new JaroWinklerDistance()
    Some(j.apply(str1, str2))
  }
}
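A possible way to call these helpers from the DataFrame API is sketched below; the column names and sample strings are made up for illustration.

import org.apache.spark.sql.SparkSession
import com.github.mrpowers.spark.stringmetric.SimilarityFunctions._

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq(("night", "nacht"), ("karolin", "kathrin")).toDF("word1", "word2")

df.select(
  hamming($"word1", $"word2").as("hamming"),
  jaro_winkler($"word1", $"word2").as("jaro_winkler"),
  cosine_distance($"word1", $"word2").as("cosine_distance")
).show()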
Example 30
Source File: CatalystDataToAvro.scala From spark-schema-registry with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.registry.avro import com.hortonworks.registries.schemaregistry.{SchemaCompatibility, SchemaMetadata} import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotSerializer import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{BinaryType, DataType} import scala.collection.JavaConverters._ case class CatalystDataToAvro( child: Expression, schemaName: String, recordName: String, nameSpace: String, config: Map[String, Object] ) extends UnaryExpression { override def dataType: DataType = BinaryType private val topLevelRecordName = if (recordName == "") schemaName else recordName @transient private lazy val avroType = SchemaConverters.toAvroType(child.dataType, child.nullable, topLevelRecordName, nameSpace) @transient private lazy val avroSer = new AvroSerializer(child.dataType, avroType, child.nullable) @transient private lazy val srSer: AvroSnapshotSerializer = { val obj = new AvroSnapshotSerializer() obj.init(config.asJava) obj } @transient private lazy val srClient = new SchemaRegistryClient(config.asJava) @transient private lazy val schemaMetadata = { var schemaMetadataInfo = srClient.getSchemaMetadataInfo(schemaName) if (schemaMetadataInfo == null) { val generatedSchemaMetadata = new SchemaMetadata.Builder(schemaName). `type`(AvroSchemaProvider.TYPE) .schemaGroup("Autogenerated group") .description("Autogenerated schema") .compatibility(SchemaCompatibility.BACKWARD).build srClient.addSchemaMetadata(generatedSchemaMetadata) generatedSchemaMetadata } else { schemaMetadataInfo.getSchemaMetadata } } override def nullSafeEval(input: Any): Any = { val avroData = avroSer.serialize(input) srSer.serialize(avroData.asInstanceOf[Object], schemaMetadata) } override def simpleString: String = { s"to_sr(${child.sql}, ${child.dataType.simpleString})" } override def sql: String = { s"to_sr(${child.sql}, ${child.dataType.catalogString})" } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val expr = ctx.addReferenceObj("this", this) defineCodeGen(ctx, ev, input => s"(byte[]) $expr.nullSafeEval($input)") } }
Example 31
Source File: MyUDF.scala From spark-tools with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.FunctionIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.TimestampType

object MyUDF {

  private def myTimestampCast(xs: Seq[Expression]): Expression = {
    val expSource = xs.head
    expSource.dataType match {
      case LongType =>
        new Column(expSource).divide(Literal(1000)).cast(TimestampType).expr
      case TimestampType =>
        expSource
    }
  }

  def register(sparkSession: SparkSession): Unit =
    sparkSession.sessionState.functionRegistry
      .registerFunction(FunctionIdentifier("toTs", None), myTimestampCast)
}
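Once registered, the function is available from SQL. The snippet below is a minimal sketch (the epoch-millisecond value is arbitrary sample data) assuming MyUDF is on the classpath.

import org.apache.spark.sql.{MyUDF, SparkSession}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
MyUDF.register(spark)

// A BIGINT input is divided by 1000 and cast to TIMESTAMP by the builder above.
spark.sql("SELECT toTs(1546300800000L) AS ts").show(false)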
Example 32
Source File: MyUDF.scala From spark-tools with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.TimestampType

object MyUDF {

  private def myTimestampCast(xs: Seq[Expression]): Expression = {
    val expSource = xs.head
    expSource.dataType match {
      case LongType =>
        new Column(expSource).divide(Literal(1000)).cast(TimestampType).expr
      case TimestampType =>
        expSource
    }
  }

  def register(sparkSession: SparkSession): Unit =
    sparkSession.sessionState.functionRegistry
      .registerFunction("toTs", myTimestampCast)
}
Example 33
Source File: SQLBuilderTest.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst import scala.util.control.NonFatal import org.apache.spark.sql.{DataFrame, Dataset, QueryTest} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.hive.test.TestHiveSingleton abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton { protected def checkSQL(e: Expression, expectedSQL: String): Unit = { val actualSQL = e.sql try { assert(actualSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following expression: | |${e.prettyName} | |$cause """.stripMargin) } } protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = { val generatedSQL = try new SQLBuilder(plan).toSQL catch { case NonFatal(e) => fail( s"""Cannot convert the following logical query plan to SQL: | |${plan.treeString} """.stripMargin) } try { assert(generatedSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following logical query plan: | |${plan.treeString} | |$cause """.stripMargin) } checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan)) } protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = { checkSQL(df.queryExecution.analyzed, expectedSQL) } }
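A concrete suite built on this helper might look like the sketch below; it is hypothetical, requires the Hive test environment that TestHiveSingleton provides, and only checks that simple literals round-trip to their SQL text.

import org.apache.spark.sql.catalyst.SQLBuilderTest
import org.apache.spark.sql.catalyst.expressions.Literal

class LiteralSQLBuilderSuite extends SQLBuilderTest {
  test("literals round-trip to SQL text") {
    checkSQL(Literal(1), "1")
    checkSQL(Literal("foo"), "'foo'")
  }
}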
Example 34
Source File: ScriptTransformation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 35
Source File: CodegenFallback.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, Nondeterministic} trait CodegenFallback extends Expression { protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { foreach { case n: Nondeterministic => n.setInitialValues() case _ => } // LeafNode does not need `input` val input = if (this.isInstanceOf[LeafExpression]) "null" else ctx.INPUT_ROW val idx = ctx.references.length ctx.references += this val objectTerm = ctx.freshName("obj") val placeHolder = ctx.registerComment(this.toString) if (nullable) { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); boolean ${ev.isNull} = $objectTerm == null; ${ctx.javaType(this.dataType)} ${ev.value} = ${ctx.defaultValue(this.dataType)}; if (!${ev.isNull}) { ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; }""") } else { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); ${ctx.javaType(this.dataType)} ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; """, isNull = "false") } } }
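For comparison, a hypothetical expression that relies on this trait only needs eval-side logic; code generation falls back to calling eval through the stub produced above. The ReverseString class is an illustration, not part of the project.

import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.types.{DataType, StringType}
import org.apache.spark.unsafe.types.UTF8String

// Reverses a string value; doGenCode is inherited from CodegenFallback.
case class ReverseString(child: Expression) extends UnaryExpression with CodegenFallback {
  override def dataType: DataType = StringType
  override protected def nullSafeEval(input: Any): Any =
    UTF8String.fromString(input.asInstanceOf[UTF8String].toString.reverse)
}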
Example 36
Source File: SubstituteUnresolvedOrdinals.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.CatalystConf import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Sort} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin import org.apache.spark.sql.types.IntegerType class SubstituteUnresolvedOrdinals(conf: CatalystConf) extends Rule[LogicalPlan] { private def isIntLiteral(e: Expression) = e match { case Literal(_, IntegerType) => true case _ => false } def apply(plan: LogicalPlan): LogicalPlan = plan transform { case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) => val newOrders = s.order.map { case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _) => val newOrdinal = withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) withOrigin(order.origin)(order.copy(child = newOrdinal)) case other => other } withOrigin(s.origin)(s.copy(order = newOrders)) case a: Aggregate if conf.groupByOrdinal && a.groupingExpressions.exists(isIntLiteral) => val newGroups = a.groupingExpressions.map { case ordinal @ Literal(index: Int, IntegerType) => withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) case other => other } withOrigin(a.origin)(a.copy(groupingExpressions = newGroups)) } }
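The rule is applied automatically by the analyzer, so its effect is easiest to see through ordinal ORDER BY or GROUP BY clauses; the table and data below are made up for illustration.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

Seq((3, "c"), (1, "a"), (2, "b")).toDF("id", "name").createOrReplaceTempView("t")

// With spark.sql.orderByOrdinal enabled (the default), the literal 1 below is
// rewritten to UnresolvedOrdinal(1) and later resolved to the first column, id.
spark.sql("SELECT id, name FROM t ORDER BY 1").show()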
Example 37
Source File: ResolveTableValuedFunctions.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Range} import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.types.{DataType, IntegerType, LongType} tvf("start" -> LongType, "end" -> LongType, "step" -> LongType, "numPartitions" -> IntegerType) { case Seq(start: Long, end: Long, step: Long, numPartitions: Int) => Range(start, end, step, Some(numPartitions)) }) ) override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) => builtinFunctions.get(u.functionName) match { case Some(tvf) => val resolved = tvf.flatMap { case (argList, resolver) => argList.implicitCast(u.functionArgs) match { case Some(casted) => Some(resolver(casted.map(_.eval()))) case _ => None } } resolved.headOption.getOrElse { val argTypes = u.functionArgs.map(_.dataType.typeName).mkString(", ") u.failAnalysis( s"""error: table-valued function ${u.functionName} with alternatives: |${tvf.keys.map(_.toString).toSeq.sorted.map(x => s" ($x)").mkString("\n")} |cannot be applied to: (${argTypes})""".stripMargin) } case _ => u.failAnalysis(s"could not resolve `${u.functionName}` to a table-valued function") } } }
Example 38
Source File: RuleExecutorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } val message = intercept[TreeNodeException[LogicalPlan]] { ToFixedPoint.execute(Literal(100)) }.getMessage assert(message.contains("Max iterations (10) reached for batch fixedPoint")) } }
Example 39
Source File: ShuffledHashJoinExec.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
Example 40
Source File: CartesianProductExec.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { // We will not sort the rows, so prefixComparator and recordComparator are null. val sorter = UnsafeExternalSorter.create( context.taskMemoryManager(), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, context, null, null, 1024, SparkEnv.get.memoryManager.pageSizeBytes, SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD), false) val partition = split.asInstanceOf[CartesianPartition] for (y <- rdd2.iterator(partition.s2, context)) { sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false) } // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow] def createIter(): Iterator[UnsafeRow] = { val iter = sorter.getIterator val unsafeRow = new UnsafeRow(numFieldsOfRight) new Iterator[UnsafeRow] { override def hasNext: Boolean = { iter.hasNext } override def next(): UnsafeRow = { iter.loadNext() unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength) unsafeRow } } } val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, sorter.cleanupResources()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) pair.mapPartitionsInternal { iter => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition: (InternalRow) => Boolean = newPredicate(condition.get, left.output ++ right.output) val joined = new JoinedRow iter.filter { r => boundCondition(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 41
Source File: subquery.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions.{Expression, ExprId, InSet, Literal, PlanExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{BooleanType, DataType, StructType} case class ReuseSubquery(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val subqueries = mutable.HashMap[StructType, ArrayBuffer[SubqueryExec]]() plan transformAllExpressions { case sub: ExecSubqueryExpression => val sameSchema = subqueries.getOrElseUpdate(sub.plan.schema, ArrayBuffer[SubqueryExec]()) val sameResult = sameSchema.find(_.sameResult(sub.plan)) if (sameResult.isDefined) { sub.withNewPlan(sameResult.get) } else { sameSchema += sub.plan sub } } } }
Example 42
Source File: GroupedIterator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Expression, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateOrdering, GenerateUnsafeProjection} object GroupedIterator { def apply( input: Iterator[InternalRow], keyExpressions: Seq[Expression], inputSchema: Seq[Attribute]): Iterator[(InternalRow, Iterator[InternalRow])] = { if (input.hasNext) { new GroupedIterator(input.buffered, keyExpressions, inputSchema) } else { Iterator.empty } } } def hasNext: Boolean = currentIterator != null || fetchNextGroupIterator def next(): (InternalRow, Iterator[InternalRow]) = { assert(hasNext) // Ensure we have fetched the next iterator. val ret = (keyProjection(currentGroup), currentIterator) currentIterator = null ret } private def fetchNextGroupIterator(): Boolean = { assert(currentIterator == null) if (currentRow == null && input.hasNext) { currentRow = input.next() } if (currentRow == null) { // These is no data left, return false. false } else { // Skip to next group. // currentRow may be overwritten by `hasNext`, so we should compare them first. while (keyOrdering.compare(currentGroup, currentRow) == 0 && input.hasNext) { currentRow = input.next() } if (keyOrdering.compare(currentGroup, currentRow) == 0) { // We are in the last group, there is no more groups, return false. false } else { // Now the `currentRow` is the first row of next group. currentGroup = currentRow.copy() currentIterator = createGroupValuesIterator() true } } } private def createGroupValuesIterator(): Iterator[InternalRow] = { new Iterator[InternalRow] { def hasNext: Boolean = currentRow != null || fetchNextRowInGroup() def next(): InternalRow = { assert(hasNext) val res = currentRow currentRow = null res } private def fetchNextRowInGroup(): Boolean = { assert(currentRow == null) if (input.hasNext) { // The inner iterator should NOT consume the input into next group, here we use `head` to // peek the next input, to see if we should continue to process it. if (keyOrdering.compare(currentGroup, input.head) == 0) { // Next input is in the current group. Continue the inner iterator. currentRow = input.next() true } else { // Next input is not in the right group. End this inner iterator. false } } else { // There is no more data, return false. false } } } } }
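A rough usage sketch follows; GroupedIterator is an internal helper, so the snippet assumes it is compiled alongside code in the org.apache.spark.sql.execution package and that the input iterator is already sorted by the grouping key.

package org.apache.spark.sql.execution

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.types.IntegerType

object GroupedIteratorDemo {
  def main(args: Array[String]): Unit = {
    val key = AttributeReference("k", IntegerType, nullable = false)()
    val value = AttributeReference("v", IntegerType, nullable = false)()

    // Rows sorted by the key column k.
    val rows = Iterator(InternalRow(1, 10), InternalRow(1, 11), InternalRow(2, 20))

    GroupedIterator(rows, Seq(key), Seq(key, value)).foreach { case (groupKey, groupRows) =>
      println(s"k=${groupKey.getInt(0)} has ${groupRows.length} row(s)")
    }
  }
}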
Example 43
Source File: NativeFunctionRegistration.scala From spark-alchemy with Apache License 2.0 | 5 votes |
package com.swoop.alchemy.spark.expressions import org.apache.spark.sql.EncapsulationViolator.createAnalysisException import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription, ExpressionInfo, RuntimeReplaceable} import scala.reflect.ClassTag import scala.util.{Failure, Success, Try} // based on Spark's FunctionRegistry @ossSpark trait NativeFunctionRegistration extends FunctionRegistration { type FunctionBuilder = Seq[Expression] => Expression def expressions: Map[String, (ExpressionInfo, FunctionBuilder)] def registerFunctions(fr: FunctionRegistry): Unit = { expressions.foreach { case (name, (info, builder)) => fr.registerFunction(FunctionIdentifier(name), info, builder) } } def registerFunctions(spark: SparkSession): Unit = { registerFunctions(spark.sessionState.functionRegistry) } protected def expressionInfo[T <: Expression : ClassTag](name: String): ExpressionInfo = { val clazz = scala.reflect.classTag[T].runtimeClass val df = clazz.getAnnotation(classOf[ExpressionDescription]) if (df != null) { new ExpressionInfo(clazz.getCanonicalName, null, name, df.usage(), df.extended()) } else { new ExpressionInfo(clazz.getCanonicalName, name) } } }
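A hypothetical registration module built on this trait could look like the following; the name my_upper and the reuse of Spark's built-in Upper expression are illustrative assumptions, not part of the project.

import com.swoop.alchemy.spark.expressions.NativeFunctionRegistration
import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo, Upper}

object MyFunctionRegistration extends NativeFunctionRegistration {
  private val myUpperBuilder: FunctionBuilder = (args: Seq[Expression]) => Upper(args.head)

  // Exposes Spark's built-in Upper expression under the extra name my_upper.
  def expressions: Map[String, (ExpressionInfo, FunctionBuilder)] = Map(
    "my_upper" -> ((expressionInfo[Upper]("my_upper"), myUpperBuilder))
  )
}

Calling MyFunctionRegistration.registerFunctions(spark) would then make my_upper callable from SQL.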
Example 44
Source File: DeltaPushFilter.scala From connectors with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta

import scala.collection.immutable.HashSet
import scala.collection.JavaConverters._

import org.apache.hadoop.hive.ql.exec.{FunctionRegistry, SerializationUtilities}
import org.apache.hadoop.hive.ql.lib._
import org.apache.hadoop.hive.ql.parse.SemanticException
import org.apache.hadoop.hive.ql.plan.{ExprNodeColumnDesc, ExprNodeConstantDesc, ExprNodeGenericFuncDesc}
import org.apache.hadoop.hive.ql.udf.generic._
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{And, EqualNullSafe, EqualTo, Expression, GreaterThan, GreaterThanOrEqual, InSet, LessThan, LessThanOrEqual, Like, Literal, Not}

object DeltaPushFilter extends Logging {
  lazy val supportedPushDownUDFs = Array(
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotEqual",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualNS",
    "org.apache.hadoop.hive.ql.udf.UDFLike",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn"
  )

  def partitionFilterConverter(
      hiveFilterExprSeriablized: String): Seq[Expression] = {
    if (hiveFilterExprSeriablized != null) {
      val filterExpr = SerializationUtilities.deserializeExpression(hiveFilterExprSeriablized)
      val opRules = new java.util.LinkedHashMap[Rule, NodeProcessor]()
      val nodeProcessor = new NodeProcessor() {
        @throws[SemanticException]
        def process(nd: Node, stack: java.util.Stack[Node], procCtx: NodeProcessorCtx,
            nodeOutputs: Object*): Object = {
          nd match {
            case e: ExprNodeGenericFuncDesc if FunctionRegistry.isOpAnd(e) =>
              nodeOutputs.map(_.asInstanceOf[Expression]).reduce(And)
            case e: ExprNodeGenericFuncDesc =>
              val (columnDesc, constantDesc) =
                if (nd.getChildren.get(0).isInstanceOf[ExprNodeColumnDesc]) {
                  (nd.getChildren.get(0), nd.getChildren.get(1))
                } else {
                  (nd.getChildren.get(1), nd.getChildren.get(0))
                }
              val columnAttr = UnresolvedAttribute(
                columnDesc.asInstanceOf[ExprNodeColumnDesc].getColumn)
              val constantVal = Literal(constantDesc.asInstanceOf[ExprNodeConstantDesc].getValue)
              nd.asInstanceOf[ExprNodeGenericFuncDesc].getGenericUDF match {
                case f: GenericUDFOPNotEqualNS => Not(EqualNullSafe(columnAttr, constantVal))
                case f: GenericUDFOPNotEqual => Not(EqualTo(columnAttr, constantVal))
                case f: GenericUDFOPEqualNS => EqualNullSafe(columnAttr, constantVal)
                case f: GenericUDFOPEqual => EqualTo(columnAttr, constantVal)
                case f: GenericUDFOPGreaterThan => GreaterThan(columnAttr, constantVal)
                case f: GenericUDFOPEqualOrGreaterThan => GreaterThanOrEqual(columnAttr, constantVal)
                case f: GenericUDFOPLessThan => LessThan(columnAttr, constantVal)
                case f: GenericUDFOPEqualOrLessThan => LessThanOrEqual(columnAttr, constantVal)
                case f: GenericUDFBridge if f.getUdfName.equals("like") => Like(columnAttr, constantVal)
                case f: GenericUDFIn =>
                  val inConstantVals = nd.getChildren.asScala
                    .filter(_.isInstanceOf[ExprNodeConstantDesc])
                    .map(_.asInstanceOf[ExprNodeConstantDesc].getValue)
                    .map(Literal(_)).toSet
                  InSet(columnAttr, HashSet() ++ inConstantVals)
                case _ =>
                  throw new RuntimeException(s"Unsupported func(${nd.getName}) " +
                    s"which can not be pushed down to delta")
              }
            case _ => null
          }
        }
      }

      val disp = new DefaultRuleDispatcher(nodeProcessor, opRules, null)
      val ogw = new DefaultGraphWalker(disp)
      val topNodes = new java.util.ArrayList[Node]()
      topNodes.add(filterExpr)
      val nodeOutput = new java.util.HashMap[Node, Object]()
      try {
        ogw.startWalking(topNodes, nodeOutput)
      } catch {
        case ex: Exception =>
          throw new RuntimeException(ex)
      }
      logInfo(s"converted partition filter expr:" +
        s"${nodeOutput.get(filterExpr).asInstanceOf[Expression].toJSON}")
      Seq(nodeOutput.get(filterExpr).asInstanceOf[Expression])
    } else Seq.empty[org.apache.spark.sql.catalyst.expressions.Expression]
  }
}
Example 45
Source File: TimestampCast.scala From flint with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.codegen.{ CodegenContext, ExprCode, CodeGenerator, JavaCode, Block }
import org.apache.spark.sql.catalyst.expressions.{ Expression, NullIntolerant, UnaryExpression }
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types.{ DataType, LongType, TimestampType }

case class TimestampToNanos(child: Expression) extends TimestampCast {
  val dataType: DataType = LongType
  protected def cast(childPrim: String): String = s"$childPrim * 1000L"
  override protected def nullSafeEval(input: Any): Any = input.asInstanceOf[Long] * 1000L
}

case class NanosToTimestamp(child: Expression) extends TimestampCast {
  val dataType: DataType = TimestampType
  protected def cast(childPrim: String): String = s"$childPrim / 1000L"
  override protected def nullSafeEval(input: Any): Any = input.asInstanceOf[Long] / 1000L
}

trait TimestampCast extends UnaryExpression with NullIntolerant {
  protected def cast(childPrim: String): String

  private[this] def castCode(ctx: CodegenContext, childPrim: String, childNull: String,
      resultPrim: String, resultNull: String, resultType: DataType): Block = {
    code"""
      boolean $resultNull = $childNull;
      ${CodeGenerator.javaType(resultType)} $resultPrim = ${CodeGenerator.defaultValue(resultType)};
      if (!${childNull}) {
        $resultPrim = (long) ${cast(childPrim)};
      }
    """
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val eval = child.genCode(ctx)
    ev.copy(code = eval.code + castCode(ctx, eval.value, eval.isNull, ev.value, ev.isNull, dataType))
  }
}
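Wrapping the expression in a Column makes it usable from the DataFrame API; the sketch below is illustrative, uses made-up data, and assumes these classes are on the classpath.

import org.apache.spark.sql.{Column, SparkSession, TimestampToNanos}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq("2019-01-01 00:00:00").toDF("ts_string")
  .select($"ts_string".cast("timestamp").as("ts"))

// TimestampType values are stored as microseconds, so the cast multiplies by 1000.
df.withColumn("nanos", new Column(TimestampToNanos($"ts".expr))).show(false)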
Example 46
Source File: SparkWrapper.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} import org.apache.spark.sql.types.{DataType, Metadata} object SparkWrapper { def getVersion: String = { "SparkWrapper-2.3" } def newSubqueryAlias(identifier: String, child: LogicalPlan): SubqueryAlias = { SubqueryAlias(identifier, child) } def newAlias(child: Expression, name: String): Alias = { Alias(child, name)() } def newAttributeReference( name: String, dataType: DataType, nullable: Boolean, metadata: Metadata): AttributeReference = { AttributeReference(name, dataType, nullable, metadata)() } def callSessionCatalogCreateTable( obj: SessionCatalog, tableDefinition: CatalogTable, ignoreIfExists: Boolean): Unit = { obj.createTable(tableDefinition, ignoreIfExists) } }
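An illustrative use of the wrapper (not taken from the project) is shown below.

import com.pingcap.tispark.SparkWrapper
import org.apache.spark.sql.types.{IntegerType, Metadata}

val attr = SparkWrapper.newAttributeReference("id", IntegerType, nullable = false, Metadata.empty)
val aliased = SparkWrapper.newAlias(attr, "user_id")
println(aliased.sql)  // prints something like `id` AS `user_id`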
Example 47
Source File: SparkWrapper.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} import org.apache.spark.sql.types.{DataType, Metadata} object SparkWrapper { def getVersion: String = { "SparkWrapper-2.4" } def newSubqueryAlias(identifier: String, child: LogicalPlan): SubqueryAlias = { SubqueryAlias(identifier, child) } def newAlias(child: Expression, name: String): Alias = { Alias(child, name)() } def newAttributeReference( name: String, dataType: DataType, nullable: Boolean, metadata: Metadata): AttributeReference = { AttributeReference(name, dataType, nullable, metadata)() } def callSessionCatalogCreateTable( obj: SessionCatalog, tableDefinition: CatalogTable, ignoreIfExists: Boolean): Unit = { obj.createTable(tableDefinition, ignoreIfExists) } }
Example 48
Source File: parser.scala From tispark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.extensions import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression} import org.apache.spark.sql.catalyst.parser._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.execution.command.{ CacheTableCommand, CreateViewCommand, ExplainCommand, UncacheTableCommand } import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{SparkSession, TiContext} case class TiParser(getOrCreateTiContext: SparkSession => TiContext)( sparkSession: SparkSession, delegate: ParserInterface) extends ParserInterface { private lazy val tiContext = getOrCreateTiContext(sparkSession) private lazy val internal = new SparkSqlParser(sparkSession.sqlContext.conf) private def needQualify(tableIdentifier: TableIdentifier) = tableIdentifier.database.isEmpty && tiContext.sessionCatalog .getTempView(tableIdentifier.table) .isEmpty }
Example 49
Source File: TiAggregation.scala From tispark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import com.pingcap.tispark.TiDBRelation import com.pingcap.tispark.utils.ReflectionUtil import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression} import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.datasources.LogicalRelation object TiAggregation { type ReturnType = (Seq[NamedExpression], Seq[AggregateExpression], Seq[NamedExpression], LogicalPlan) def unapply(plan: LogicalPlan): Option[ReturnType] = ReflectionUtil.callTiAggregationImplUnapply(plan) } object TiAggregationProjection { type ReturnType = (Seq[Expression], LogicalPlan, TiDBRelation, Seq[NamedExpression]) def unapply(plan: LogicalPlan): Option[ReturnType] = plan match { // Only push down aggregates projection when all filters can be applied and // all projection expressions are column references case PhysicalOperation( projects, filters, rel @ LogicalRelation(source: TiDBRelation, _, _, _)) if projects.forall(_.isInstanceOf[Attribute]) => Some((filters, rel, source, projects)) case _ => Option.empty[ReturnType] } }
Example 50
Source File: XmlDataToCatalyst.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression} import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import com.databricks.spark.xml.parsers.StaxXmlParser case class XmlDataToCatalyst( child: Expression, schema: DataType, options: XmlOptions) extends UnaryExpression with CodegenFallback with ExpectsInputTypes { override lazy val dataType: DataType = schema @transient lazy val rowSchema: StructType = schema match { case st: StructType => st case ArrayType(st: StructType, _) => st } override def nullSafeEval(xml: Any): Any = xml match { case string: UTF8String => CatalystTypeConverters.convertToCatalyst( StaxXmlParser.parseColumn(string.toString, rowSchema, options)) case string: String => StaxXmlParser.parseColumn(string, rowSchema, options) case arr: GenericArrayData => CatalystTypeConverters.convertToCatalyst( arr.array.map(s => StaxXmlParser.parseColumn(s.toString, rowSchema, options))) case arr: Array[_] => arr.map(s => StaxXmlParser.parseColumn(s.toString, rowSchema, options)) case _ => null } override def inputTypes: Seq[DataType] = schema match { case _: StructType => Seq(StringType) case ArrayType(_: StructType, _) => Seq(ArrayType(StringType)) } }
Example 51
Source File: RedisRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.redis import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.sql.xsql.DataSourceManager._ import org.apache.spark.sql.xsql.execution.datasources.redis.RedisSpecialStrategy trait RedisRelationTrait { val parameters: Map[String, String] val schema: StructType lazy val redisConfig: RedisConfig = new RedisConfig(new RedisEndpoint(parameters.get(URL).get)) } case class RedisRelationImpl(val parameters: Map[String, String], val schema: StructType) extends RedisRelationTrait case class RedisRelation( parameters: Map[String, String], schema: StructType, filter: Seq[Expression] = Nil)(@transient val sqlContext: SQLContext) extends BaseRelation with PrunedScan with RedisRelationTrait { override def toString: String = s"RedisRelation(${filter.mkString(",")})" val partitionNum: Int = parameters.getOrElse("partitionNum", "1").toInt override def buildScan(requiredColumns: Array[String]): RDD[Row] = { val filters = filter .map(RedisSpecialStrategy.getAttr) .groupBy(_._1) .map(tup => (tup._1, tup._2.map(_._2))) new RedisRDD(sqlContext.sparkContext, this, filters, requiredColumns, partitionNum) } }
Example 52
Source File: ScriptTransformation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 53
Source File: CodegenFallback.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, Nondeterministic} import org.apache.spark.sql.catalyst.expressions.codegen.Block._ trait CodegenFallback extends Expression { protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // LeafNode does not need `input` val input = if (this.isInstanceOf[LeafExpression]) "null" else ctx.INPUT_ROW val idx = ctx.references.length ctx.references += this var childIndex = idx this.foreach { case n: Nondeterministic => // This might add the current expression twice, but it won't hurt. ctx.references += n childIndex += 1 ctx.addPartitionInitializationStatement( s""" |((Nondeterministic) references[$childIndex]) | .initialize(partitionIndex); """.stripMargin) case _ => } val objectTerm = ctx.freshName("obj") val placeHolder = ctx.registerComment(this.toString) val javaType = CodeGenerator.javaType(this.dataType) if (nullable) { ev.copy(code = code""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); boolean ${ev.isNull} = $objectTerm == null; $javaType ${ev.value} = ${CodeGenerator.defaultValue(this.dataType)}; if (!${ev.isNull}) { ${ev.value} = (${CodeGenerator.boxedType(this.dataType)}) $objectTerm; }""") } else { ev.copy(code = code""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); $javaType ${ev.value} = (${CodeGenerator.boxedType(this.dataType)}) $objectTerm; """, isNull = FalseLiteral) } } }
Example 54
Source File: SubstituteUnresolvedOrdinals.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Sort} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.IntegerType class SubstituteUnresolvedOrdinals(conf: SQLConf) extends Rule[LogicalPlan] { private def isIntLiteral(e: Expression) = e match { case Literal(_, IntegerType) => true case _ => false } def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) => val newOrders = s.order.map { case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _, _) => val newOrdinal = withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) withOrigin(order.origin)(order.copy(child = newOrdinal)) case other => other } withOrigin(s.origin)(s.copy(order = newOrders)) case a: Aggregate if conf.groupByOrdinal && a.groupingExpressions.exists(isIntLiteral) => val newGroups = a.groupingExpressions.map { case ordinal @ Literal(index: Int, IntegerType) => withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) case other => other } withOrigin(a.origin)(a.copy(groupingExpressions = newGroups)) } }
Example 55
Source File: ResolveTableValuedFunctions.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import java.util.Locale import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Alias, Expression} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Range} import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.types.{DataType, IntegerType, LongType} tvf("start" -> LongType, "end" -> LongType, "step" -> LongType, "numPartitions" -> IntegerType) { case Seq(start: Long, end: Long, step: Long, numPartitions: Int) => Range(start, end, step, Some(numPartitions)) }) ) override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) => // The whole resolution is somewhat difficult to understand here due to too much abstractions. // We should probably rewrite the following at some point. Reynold was just here to improve // error messages and didn't have time to do a proper rewrite. val resolvedFunc = builtinFunctions.get(u.functionName.toLowerCase(Locale.ROOT)) match { case Some(tvf) => def failAnalysis(): Nothing = { val argTypes = u.functionArgs.map(_.dataType.typeName).mkString(", ") u.failAnalysis( s"""error: table-valued function ${u.functionName} with alternatives: |${tvf.keys.map(_.toString).toSeq.sorted.map(x => s" ($x)").mkString("\n")} |cannot be applied to: ($argTypes)""".stripMargin) } val resolved = tvf.flatMap { case (argList, resolver) => argList.implicitCast(u.functionArgs) match { case Some(casted) => try { Some(resolver(casted.map(_.eval()))) } catch { case e: AnalysisException => failAnalysis() } case _ => None } } resolved.headOption.getOrElse { failAnalysis() } case _ => u.failAnalysis(s"could not resolve `${u.functionName}` to a table-valued function") } // If alias names assigned, add `Project` with the aliases if (u.outputNames.nonEmpty) { val outputAttrs = resolvedFunc.output // Checks if the number of the aliases is equal to expected one if (u.outputNames.size != outputAttrs.size) { u.failAnalysis(s"Number of given aliases does not match number of output columns. " + s"Function name: ${u.functionName}; number of aliases: " + s"${u.outputNames.size}; number of output columns: ${outputAttrs.size}.") } val aliases = outputAttrs.zip(u.outputNames).map { case (attr, name) => Alias(attr, name)() } Project(aliases, resolvedFunc) } else { resolvedFunc } } }
Example 56
Source File: QueryPlanSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.dsl.plans import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal, NamedExpression} import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} import org.apache.spark.sql.types.IntegerType class QueryPlanSuite extends SparkFunSuite { test("origin remains the same after mapExpressions (SPARK-23823)") { CurrentOrigin.setPosition(0, 0) val column = AttributeReference("column", IntegerType)(NamedExpression.newExprId) val query = plans.DslLogicalPlan(plans.table("table")).select(column) CurrentOrigin.reset() val mappedQuery = query mapExpressions { case _: Expression => Literal(1) } val mappedOrigin = mappedQuery.expressions.apply(0).origin assert(mappedOrigin == Origin.apply(Some(0), Some(0))) } }
Example 57
Source File: RuleExecutorSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } val message = intercept[TreeNodeException[LogicalPlan]] { ToFixedPoint.execute(Literal(100)) }.getMessage assert(message.contains("Max iterations (10) reached for batch fixedPoint")) } test("structural integrity checker") { object WithSIChecker extends RuleExecutor[Expression] { override protected def isPlanIntegral(expr: Expression): Boolean = expr match { case IntegerLiteral(_) => true case _ => false } val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(WithSIChecker.execute(Literal(10)) === Literal(9)) val message = intercept[TreeNodeException[LogicalPlan]] { WithSIChecker.execute(Literal(10.1)) }.getMessage assert(message.contains("the structural integrity of the plan is broken")) } }
Example 58
Source File: CheckCartesianProductsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer import org.scalatest.Matchers._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.internal.SQLConf.CROSS_JOINS_ENABLED class CheckCartesianProductsSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Check Cartesian Products", Once, CheckCartesianProducts) :: Nil } val testRelation1 = LocalRelation('a.int, 'b.int) val testRelation2 = LocalRelation('c.int, 'd.int) val joinTypesWithRequiredCondition = Seq(Inner, LeftOuter, RightOuter, FullOuter) val joinTypesWithoutRequiredCondition = Seq(LeftSemi, LeftAnti, ExistenceJoin('exists)) test("CheckCartesianProducts doesn't throw an exception if cross joins are enabled)") { withSQLConf(CROSS_JOINS_ENABLED.key -> "true") { noException should be thrownBy { for (joinType <- joinTypesWithRequiredCondition ++ joinTypesWithoutRequiredCondition) { performCartesianProductCheck(joinType) } } } } test("CheckCartesianProducts throws an exception for join types that require a join condition") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithRequiredCondition) { val thrownException = the [AnalysisException] thrownBy { performCartesianProductCheck(joinType) } assert(thrownException.message.contains("Detected implicit cartesian product")) } } } test("CheckCartesianProducts doesn't throw an exception if a join condition is present") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithRequiredCondition) { noException should be thrownBy { performCartesianProductCheck(joinType, Some('a === 'd)) } } } } test("CheckCartesianProducts doesn't throw an exception if join types don't require conditions") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithoutRequiredCondition) { noException should be thrownBy { performCartesianProductCheck(joinType) } } } } private def performCartesianProductCheck( joinType: JoinType, condition: Option[Expression] = None): Unit = { val analyzedPlan = testRelation1.join(testRelation2, joinType, condition).analyze val optimizedPlan = Optimize.execute(analyzedPlan) comparePlans(analyzedPlan, optimizedPlan) } }
Example 59
Source File: ShuffledHashJoinExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener[Unit](_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
Example 60
Source File: CartesianProductExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, ExternalAppendOnlyUnsafeRowArray, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.util.CompletionIterator class UnsafeCartesianRDD( left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int, inMemoryBufferThreshold: Int, spillThreshold: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { val rowArray = new ExternalAppendOnlyUnsafeRowArray(inMemoryBufferThreshold, spillThreshold) val partition = split.asInstanceOf[CartesianPartition] rdd2.iterator(partition.s2, context).foreach(rowArray.add) // Create an iterator from rowArray def createIter(): Iterator[UnsafeRow] = rowArray.generateIterator() val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, rowArray.clear()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD( leftResults, rightResults, right.output.size, sqlContext.conf.cartesianProductExecBufferInMemoryThreshold, sqlContext.conf.cartesianProductExecBufferSpillThreshold) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 61
Source File: DataSourceV2StringFormat.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2

import org.apache.commons.lang3.StringUtils

import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.sources.v2.DataSourceV2
import org.apache.spark.util.Utils

trait DataSourceV2StringFormat {

  // The enclosing trait declaration and these abstract members (source, output, options)
  // were dropped by the snippet extraction; they are restored here so the code compiles.
  def source: DataSourceV2
  def output: Seq[Attribute]
  def options: Map[String, String]

  def pushedFilters: Seq[Expression]

  private def sourceName: String = source match {
    case registered: DataSourceRegister => registered.shortName()
    // source.getClass.getSimpleName can cause Malformed class name error,
    // call safer `Utils.getSimpleName` instead
    case _ => Utils.getSimpleName(source.getClass)
  }

  def metadataString: String = {
    val entries = scala.collection.mutable.ArrayBuffer.empty[(String, String)]

    if (pushedFilters.nonEmpty) {
      entries += "Filters" -> pushedFilters.mkString("[", ", ", "]")
    }

    // TODO: we should only display some standard options like path, table, etc.
    if (options.nonEmpty) {
      entries += "Options" -> Utils.redact(options).map {
        case (k, v) => s"$k=$v"
      }.mkString("[", ",", "]")
    }

    val outputStr = Utils.truncatedString(output, "[", ", ", "]")

    val entriesStr = if (entries.nonEmpty) {
      Utils.truncatedString(entries.map {
        case (key, value) => key + ": " + StringUtils.abbreviate(value, 100)
      }, " (", ", ", ")")
    } else {
      ""
    }

    s"$sourceName$outputStr$entriesStr"
  }
}
Example 62
Source File: DataSourcePartitioning.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression} import org.apache.spark.sql.catalyst.plans.physical import org.apache.spark.sql.sources.v2.reader.partitioning.{ClusteredDistribution, Partitioning} class DataSourcePartitioning( partitioning: Partitioning, colNames: AttributeMap[String]) extends physical.Partitioning { override val numPartitions: Int = partitioning.numPartitions() override def satisfies0(required: physical.Distribution): Boolean = { super.satisfies0(required) || { required match { case d: physical.ClusteredDistribution if isCandidate(d.clustering) => val attrs = d.clustering.map(_.asInstanceOf[Attribute]) partitioning.satisfy( new ClusteredDistribution(attrs.map { a => val name = colNames.get(a) assert(name.isDefined, s"Attribute ${a.name} is not found in the data source output") name.get }.toArray)) case _ => false } } } private def isCandidate(clustering: Seq[Expression]): Boolean = { clustering.forall { case a: Attribute => colNames.contains(a) case _ => false } } }
Example 63
Source File: DataSourceV2Strategy.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import scala.collection.mutable import org.apache.spark.sql.{sources, Strategy} import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet, Expression} import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, Repartition} import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan} import org.apache.spark.sql.execution.datasources.DataSourceStrategy import org.apache.spark.sql.execution.streaming.continuous.{ContinuousCoalesceExec, WriteToContinuousDataSource, WriteToContinuousDataSourceExec} import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, SupportsPushDownFilters, SupportsPushDownRequiredColumns} import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader object DataSourceV2Strategy extends Strategy { // TODO: nested column pruning. private def pruneColumns( reader: DataSourceReader, relation: DataSourceV2Relation, exprs: Seq[Expression]): Seq[AttributeReference] = { reader match { case r: SupportsPushDownRequiredColumns => val requiredColumns = AttributeSet(exprs.flatMap(_.references)) val neededOutput = relation.output.filter(requiredColumns.contains) if (neededOutput != relation.output) { r.pruneColumns(neededOutput.toStructType) val nameToAttr = relation.output.map(_.name).zip(relation.output).toMap r.readSchema().toAttributes.map { // We have to keep the attribute id during transformation. a => a.withExprId(nameToAttr(a.name).exprId) } } else { relation.output } case _ => relation.output } } override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case PhysicalOperation(project, filters, relation: DataSourceV2Relation) => val reader = relation.newReader() // `pushedFilters` will be pushed down and evaluated in the underlying data sources. // `postScanFilters` need to be evaluated after the scan. // `postScanFilters` and `pushedFilters` can overlap, e.g. the parquet row group filter. 
val (pushedFilters, postScanFilters) = pushFilters(reader, filters) val output = pruneColumns(reader, relation, project ++ postScanFilters) logInfo( s""" |Pushing operators to ${relation.source.getClass} |Pushed Filters: ${pushedFilters.mkString(", ")} |Post-Scan Filters: ${postScanFilters.mkString(",")} |Output: ${output.mkString(", ")} """.stripMargin) val scan = DataSourceV2ScanExec( output, relation.source, relation.options, pushedFilters, reader) val filterCondition = postScanFilters.reduceLeftOption(And) val withFilter = filterCondition.map(FilterExec(_, scan)).getOrElse(scan) // always add the projection, which will produce unsafe rows required by some operators ProjectExec(project, withFilter) :: Nil case r: StreamingDataSourceV2Relation => // ensure there is a projection, which will produce unsafe rows required by some operators ProjectExec(r.output, DataSourceV2ScanExec(r.output, r.source, r.options, r.pushedFilters, r.reader)) :: Nil case WriteToDataSourceV2(writer, query) => WriteToDataSourceV2Exec(writer, planLater(query)) :: Nil case AppendData(r: DataSourceV2Relation, query, _) => WriteToDataSourceV2Exec(r.newWriter(), planLater(query)) :: Nil case WriteToContinuousDataSource(writer, query) => WriteToContinuousDataSourceExec(writer, planLater(query)) :: Nil case Repartition(1, false, child) => val isContinuous = child.collectFirst { case StreamingDataSourceV2Relation(_, _, _, r: ContinuousReader) => r }.isDefined if (isContinuous) { ContinuousCoalesceExec(1, planLater(child)) :: Nil } else { Nil } case _ => Nil } }
Example 64
Source File: Exchange.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]() plan.transformUp { case exchange: Exchange => // the exchanges that have same results usually also have same schemas (same column names). val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]()) val samePlan = sameSchema.find { e => exchange.sameResult(e) } if (samePlan.isDefined) { // Keep the output of this exchange, the following plans require that to resolve // attributes. ReusedExchangeExec(exchange.output, samePlan.get) } else { sameSchema += exchange exchange } } } }
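To see the effect of this rule from user code, a self-join over the same aggregated DataFrame produces two identical shuffles, and the physical plan replaces the second with a ReusedExchange node. A rough sketch (demo names are illustrative):

import org.apache.spark.sql.SparkSession

object ReuseExchangeDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("reuse-exchange-demo").getOrCreate()
    import spark.implicits._

    val grouped = Seq((1, "a"), (2, "b"), (1, "c")).toDF("id", "v").groupBy("id").count()
    val joined = grouped.as("l").join(grouped.as("r"), "id")

    // With spark.sql.exchange.reuse left at its default (true), the printed plan
    // should contain a ReusedExchange in place of the second identical shuffle.
    joined.explain()
    spark.stop()
  }
}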
Example 65
Source File: subquery.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions.{Expression, ExprId, InSet, Literal, PlanExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{BooleanType, DataType, StructType} case class ReuseSubquery(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of subqueries to avoid O(N*N) sameResult calls. val subqueries = mutable.HashMap[StructType, ArrayBuffer[SubqueryExec]]() plan transformAllExpressions { case sub: ExecSubqueryExpression => val sameSchema = subqueries.getOrElseUpdate(sub.plan.schema, ArrayBuffer[SubqueryExec]()) val sameResult = sameSchema.find(_.sameResult(sub.plan)) if (sameResult.isDefined) { sub.withNewPlan(sameResult.get) } else { sameSchema += sub.plan sub } } } }
Example 66
Source File: GroupedIterator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Expression, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateOrdering, GenerateUnsafeProjection} object GroupedIterator { def apply( input: Iterator[InternalRow], keyExpressions: Seq[Expression], inputSchema: Seq[Attribute]): Iterator[(InternalRow, Iterator[InternalRow])] = { if (input.hasNext) { new GroupedIterator(input.buffered, keyExpressions, inputSchema) } else { Iterator.empty } } } def hasNext: Boolean = currentIterator != null || fetchNextGroupIterator def next(): (InternalRow, Iterator[InternalRow]) = { assert(hasNext) // Ensure we have fetched the next iterator. val ret = (keyProjection(currentGroup), currentIterator) currentIterator = null ret } private def fetchNextGroupIterator(): Boolean = { assert(currentIterator == null) if (currentRow == null && input.hasNext) { currentRow = input.next() } if (currentRow == null) { // These is no data left, return false. false } else { // Skip to next group. // currentRow may be overwritten by `hasNext`, so we should compare them first. while (keyOrdering.compare(currentGroup, currentRow) == 0 && input.hasNext) { currentRow = input.next() } if (keyOrdering.compare(currentGroup, currentRow) == 0) { // We are in the last group, there is no more groups, return false. false } else { // Now the `currentRow` is the first row of next group. currentGroup = currentRow.copy() currentIterator = createGroupValuesIterator() true } } } private def createGroupValuesIterator(): Iterator[InternalRow] = { new Iterator[InternalRow] { def hasNext: Boolean = currentRow != null || fetchNextRowInGroup() def next(): InternalRow = { assert(hasNext) val res = currentRow currentRow = null res } private def fetchNextRowInGroup(): Boolean = { assert(currentRow == null) if (input.hasNext) { // The inner iterator should NOT consume the input into next group, here we use `head` to // peek the next input, to see if we should continue to process it. if (keyOrdering.compare(currentGroup, input.head) == 0) { // Next input is in the current group. Continue the inner iterator. currentRow = input.next() true } else { // Next input is not in the right group. End this inner iterator. false } } else { // There is no more data, return false. false } } } } }
Example 67
Source File: AnnotationParser.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.parser import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.AbstractSparkSQLParser import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{AnnotationReference, Expression, Literal} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String protected def toTableMetadata(metadata: Map[String, Expression]): Metadata = { val res = new MetadataBuilder() metadata.foreach { case (k, v:Literal) => v.dataType match { case StringType => if (k.equals("?")) { sys.error("column metadata key can not be ?") } if (k.equals("*")) { sys.error("column metadata key can not be *") } res.putString(k, v.value.asInstanceOf[UTF8String].toString) case LongType => res.putLong(k, v.value.asInstanceOf[Long]) case DoubleType => res.putDouble(k, v.value.asInstanceOf[Double]) case NullType => res.putString(k, null) case a:ArrayType => res.putString(k, v.value.toString) } case (k, v:AnnotationReference) => sys.error("column metadata can not have a reference to another column metadata") } res.build() } }
Example 68
Source File: ERPCurrencyConversionExpression.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.currency.erp import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes} import org.apache.spark.sql.currency.CurrencyConversionException import org.apache.spark.sql.currency.erp.ERPConversionLoader.RConversionOptionsCurried import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import scala.util.control.NonFatal case class ERPCurrencyConversionExpression( conversionFunction: RConversionOptionsCurried, children: Seq[Expression]) extends Expression with ImplicitCastInputTypes with CodegenFallback { protected val CLIENT_INDEX = 0 protected val CONVERSION_TYPE_INDEX = 1 protected val AMOUNT_INDEX = 2 protected val FROM_INDEX = 3 protected val TO_INDEX = 4 protected val DATE_INDEX = 5 protected val NUM_ARGS = 6 protected val errorMessage = "Currency conversion library encountered an internal error" override def eval(input: InternalRow): Any = { val inputArguments = children.map(_.eval(input)) require(inputArguments.length == NUM_ARGS, "wrong number of arguments") // parse arguments val client = Option(inputArguments(CLIENT_INDEX).asInstanceOf[UTF8String]).map(_.toString) val conversionType = Option(inputArguments(CONVERSION_TYPE_INDEX).asInstanceOf[UTF8String]).map(_.toString) val amount = Option(inputArguments(AMOUNT_INDEX).asInstanceOf[Decimal].toJavaBigDecimal) val sourceCurrency = Option(inputArguments(FROM_INDEX).asInstanceOf[UTF8String]).map(_.toString) val targetCurrency = Option(inputArguments(TO_INDEX).asInstanceOf[UTF8String]).map(_.toString) val date = Option(inputArguments(DATE_INDEX).asInstanceOf[UTF8String]).map(_.toString) // perform conversion val conversion = conversionFunction(client, conversionType, sourceCurrency, targetCurrency, date) val resultTry = conversion(amount) // If 'resultTry' holds a 'Failure', we have to propagate it because potential failure // handling already took place. We just wrap it in case it is a cryptic error. resultTry.recover { case NonFatal(err) => throw new CurrencyConversionException(errorMessage, err) }.get.map(Decimal.apply).orNull } override def dataType: DataType = DecimalType.forType(DoubleType) override def nullable: Boolean = true override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, DecimalType, StringType, StringType, StringType) def inputNames: Seq[String] = Seq("client", "conversion_type", "amount", "source", "target", "date") def getChild(name: String): Option[Expression] = { inputNames.zip(children).find { case (n, _) => name == n }.map(_._2) } }
Example 69
Source File: BasicCurrencyConversionFunction.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.currency.basic import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.currency._ import org.apache.spark.sql.util.ValidatingPropertyMap._ import scala.util.Try protected object BasicCurrencyConversionConfig { private def updateRatesMapByTable(ratesTable: String, sqlContext: SQLContext): Unit = { val ratesTableData = sqlContext.sql(s"SELECT * FROM $ratesTable").collect() ratesTableData.foreach { row => val from = row.getString(0) val to = row.getString(1) val date = row.getString(2).replaceAll("-", "").toInt val rate = Try(row.getDecimal(3)).recover { case ex: ClassCastException => new java.math.BigDecimal(row.getDouble(3)) }.get ratesMap.put((from, to), date, rate) } } }
Example 70
Source File: HierarchyPlan.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst._ import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.LevelMatcher import org.apache.spark.sql.hierarchy._ import org.apache.spark.sql.types._ import org.apache.spark.sql.util.RddUtils val schemaWithNode = StructType(child.schema.fields ++ Seq(StructField("", NodeType, nullable = false))) val resultInternalRdd = RDDConversions.rowToRowRdd(cachedResultRdd, schemaWithNode.fields.map(_.dataType)) resultInternalRdd } } private[sql] case class AdjacencyListHierarchyPlan(child: SparkPlan, parenthoodExp: Expression, startWhere: Option[Expression], orderBy: Seq[SortOrder], node: Attribute, dataType: DataType) extends HierarchyPlan(child, node) { override protected val builder: HierarchyBuilder[Row, Row] = HierarchyRowBroadcastBuilder(child.output, parenthoodExp, startWhere, orderBy) override protected val pathDataType = dataType } private[sql] case class LevelHierarchyPlan(child: SparkPlan, levels: Seq[Expression], startWhere: Option[Expression], orderBy: Seq[SortOrder], matcher: LevelMatcher, node: Attribute, dataType: DataType) extends HierarchyPlan(child, node) { override protected val builder: HierarchyBuilder[Row, Row] = HierarchyRowLevelBasedBuilder( child.output, levels, startWhere, orderBy, matcher) override protected val pathDataType = dataType }
Example 71
Source File: FunctionBuilders.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.expressions.{Expression, BinaryExpression, UnaryExpression} import scala.reflect.ClassTag object FunctionBuilders { type ExpressionBuilder = Seq[Expression] => Expression def expression[T <: Expression](arity: Int)(implicit tag: ClassTag[T]): ExpressionBuilder = { val argTypes = (1 to arity).map(x => classOf[Expression]) val constructor = tag.runtimeClass.getDeclaredConstructor(argTypes: _*) (expressions: Seq[Expression]) => { if (expressions.size != arity) { throw new IllegalArgumentException( s"Invalid number of arguments: ${expressions.size} (must be equal to $arity)" ) } constructor.newInstance(expressions: _*).asInstanceOf[Expression] } } def unaryExpression[T <: UnaryExpression](implicit tag: ClassTag[T]): ExpressionBuilder = expression[T](1) def binaryExpression[T <: BinaryExpression](implicit tag: ClassTag[T]): ExpressionBuilder = expression[T](2) def reverse(expressionBuilder: ExpressionBuilder): ExpressionBuilder = (expressions: Seq[Expression]) => { expressionBuilder(expressions.reverse) } }
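A hedged usage sketch, assuming the FunctionBuilders object above is on the classpath; Add and Not are stock Catalyst expressions with the matching constructor arities:

import org.apache.spark.sql.FunctionBuilders._
import org.apache.spark.sql.catalyst.expressions.{Add, Expression, Literal, Not}

object FunctionBuildersDemo {
  def main(args: Array[String]): Unit = {
    // expression[T](arity) reflects on T's (Expression, ..., Expression) constructor,
    // so the builder rejects argument lists of the wrong size at call time.
    val addBuilder: Seq[Expression] => Expression = binaryExpression[Add]
    val notBuilder: Seq[Expression] => Expression = unaryExpression[Not]

    println(addBuilder(Seq(Literal(1), Literal(2)))) // an Add(1, 2) expression
    println(notBuilder(Seq(Literal(true))))          // a Not(true) expression
  }
}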
Example 72
Source File: MetadataAccessorSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import org.apache.spark.sql.catalyst.expressions.{Literal, Expression} import org.scalatest.FunSuite // scalastyle:off magic.number class MetadataAccessorSuite extends FunSuite { test("expression map is written correctly to Metadata") { val expressionMap = Map[String, Expression] ( "stringKey" -> Literal.create("stringValue", StringType), "longKey" -> Literal.create(10L, LongType), "doubleKey" -> Literal.create(1.234, DoubleType), "nullKey" -> Literal.create(null, NullType) ) val actual = MetadataAccessor.expressionMapToMetadata(expressionMap) assertResult("stringValue")(actual.getString("stringKey")) assertResult(10)(actual.getLong("longKey")) assertResult(1.234)(actual.getDouble("doubleKey")) assertResult(null)(actual.getString("nullKey")) } test("metadata propagation works correctly") { val oldMetadata = new MetadataBuilder() .putString("key1", "value1") .putString("key2", "value2") .putLong("key3", 10L) .build() val newMetadata = new MetadataBuilder() .putString("key1", "overriden") .putString("key4", "value4") .build() val expected = new MetadataBuilder() .putString("key1", "overriden") .putString("key2", "value2") .putLong("key3", 10L) .putString("key4", "value4") .build() val actual = MetadataAccessor.propagateMetadata(oldMetadata, newMetadata) assertResult(expected)(actual) } test("filter metadata works correctly") { val metadata = new MetadataBuilder() .putString("key1", "value1") .putString("key2", "value2") .putLong("key3", 10L) .build() val expected1 = new MetadataBuilder() .putString("key1", "value1") .build() assertResult(expected1)(MetadataAccessor.filterMetadata(metadata, ("key1" :: Nil).toSet)) assertResult(metadata)(MetadataAccessor.filterMetadata(metadata, ("*" :: Nil).toSet)) } }
Example 73
Source File: SqlBuilderSuiteBase.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.sql import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.DataType import org.scalatest.FunSuite import scala.util.matching.Regex trait SqlBuilderSuiteBase { self: FunSuite => val sqlBuilder: SqlBuilder // scalastyle:ignore def testExpressionToSql(sql: String)(expr: Expression): Unit = { val cleanSql = cleanUpSql(sql) test(s"expressionToSql: $cleanSql | with $expr") { assertResult(cleanSql)(sqlBuilder.expressionToSql(expr)) } } def testBuildSelect(sql: String) (i1: SqlLikeRelation, i2: Seq[String], i3: Seq[Filter]): Unit = { val cleanSql = cleanUpSql(sql) test(s"buildSelect: $cleanSql | with $i1 $i2 $i3") { assertResult(cleanSql)(sqlBuilder.buildSelect(i1, i2, i3)) } } def testLogicalPlan(sql: String)(plan: LogicalPlan): Unit = { val cleanSql = cleanUpSql(sql) test(s"logical plan: $cleanSql | with $plan") { assertResult(cleanSql)(sqlBuilder.logicalPlanToSql(plan)) } } def testLogicalPlanInternal(sql: String)(plan: LogicalPlan): Unit = { val cleanSql = cleanUpSql(sql) test(s"logical plan (internal): $cleanSql | with $plan") { assertResult(cleanSql)(sqlBuilder.internalLogicalPlanToSql(plan, noProject = true)) } } def testUnsupportedLogicalPlan(plan: LogicalPlan): Unit = { test(s"invalid logical plan: $plan") { intercept[RuntimeException] { sqlBuilder.logicalPlanToSql(plan) } } } private def cleanUpSql(q: String): String = q.replaceAll("\\s+", " ").trim def testUnsupportedLogicalPlanInternal(plan: LogicalPlan): Unit = { test(s"invalid logical plan (internal): $plan") { intercept[RuntimeException] { sqlBuilder.internalLogicalPlanToSql(plan) } } } def testGeneratedSqlDataType(expected: String)(dataType: DataType): Unit = { test(s"The generated sql type for ${dataType.simpleString} is $expected") { val generated = sqlBuilder.typeToSql(dataType) assertResult(expected)(generated) } } }
Example 74
Source File: DummyRelationUtils.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.{ColumnName, Row, SQLContext} import org.apache.spark.sql.sources._ import org.apache.spark.sql.sources.sql.SqlLikeRelation import org.apache.spark.sql.types.{StructField, StructType} case class DummyCatalystSourceRelation( schema: StructType, isMultiplePartitionExecutionFunc: Option[Seq[CatalystSource] => Boolean] = None, supportsLogicalPlanFunc: Option[LogicalPlan => Boolean] = None, supportsExpressionFunc: Option[Expression => Boolean] = None, logicalPlanToRDDFunc: Option[LogicalPlan => RDD[Row]] = None) (@transient implicit val sqlContext: SQLContext) extends BaseRelation with CatalystSource { override def isMultiplePartitionExecution(relations: Seq[CatalystSource]): Boolean = isMultiplePartitionExecutionFunc.forall(_.apply(relations)) override def supportsLogicalPlan(plan: LogicalPlan): Boolean = supportsLogicalPlanFunc.forall(_.apply(plan)) override def supportsExpression(expr: Expression): Boolean = supportsExpressionFunc.forall(_.apply(expr)) override def logicalPlanToRDD(plan: LogicalPlan): RDD[Row] = logicalPlanToRDDFunc.getOrElse( (plan: LogicalPlan) => new LogicalPlanRDD(plan, sqlContext.sparkContext)).apply(plan) } }
Example 75
Source File: ColumnarShuffledHashJoinExec.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.sparkColumnarPlugin.execution import java.util.concurrent.TimeUnit._ import com.intel.sparkColumnarPlugin.vectorized._ import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, CodegenSupport, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import scala.collection.JavaConverters._ import org.apache.spark.internal.Logging import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} import scala.collection.mutable.ListBuffer import org.apache.arrow.vector.ipc.message.ArrowFieldNode import org.apache.arrow.vector.ipc.message.ArrowRecordBatch import org.apache.arrow.vector.types.pojo.ArrowType import org.apache.arrow.vector.types.pojo.Field import org.apache.arrow.vector.types.pojo.Schema import org.apache.arrow.gandiva.expression._ import org.apache.arrow.gandiva.evaluator._ import io.netty.buffer.ArrowBuf import com.google.common.collect.Lists; import com.intel.sparkColumnarPlugin.expression._ import com.intel.sparkColumnarPlugin.vectorized.ExpressionEvaluator import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} class ColumnarShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends ShuffledHashJoinExec( leftKeys, rightKeys, joinType, buildSide, condition, left, right) { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "joinTime" -> SQLMetrics.createTimingMetric(sparkContext, "join time"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def supportsColumnar = true //TODO() Disable code generation //override def supportCodegen: Boolean = false override def doExecuteColumnar(): RDD[ColumnarBatch] = { val numOutputRows = longMetric("numOutputRows") val joinTime = longMetric("joinTime") val buildTime = longMetric("buildTime") val resultSchema = this.schema streamedPlan.executeColumnar().zipPartitions(buildPlan.executeColumnar()) { (streamIter, buildIter) => //val hashed = buildHashedRelation(buildIter) //join(streamIter, hashed, numOutputRows) val vjoin = ColumnarShuffledHashJoin.create(leftKeys, rightKeys, resultSchema, joinType, buildSide, condition, left, right, buildTime, joinTime, numOutputRows) val vjoinResult = vjoin.columnarInnerJoin(streamIter, buildIter) TaskContext.get().addTaskCompletionListener[Unit](_ => { vjoin.close() }) new CloseableColumnBatchIterator(vjoinResult) } } }
Example 76
Source File: FileSourceScanExecAdapter.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.oap.adapter import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.HadoopFsRelation import org.apache.spark.sql.types.StructType import org.apache.spark.util.collection.BitSet object FileSourceScanExecAdapter { def createFileSourceScanExec( relation: HadoopFsRelation, output: Seq[Attribute], requiredSchema: StructType, partitionFilters: Seq[Expression], optionalBucketSets: Option[BitSet], dataFilters: Seq[Expression], metastoreTableIdentifier: Option[TableIdentifier]): FileSourceScanExec = { FileSourceScanExec( relation, output, requiredSchema, partitionFilters, optionalBucketSets, dataFilters, metastoreTableIdentifier) } }
Example 77
Source File: OapAggUtils.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Final, Partial} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources.oap.OapAggregationFileScanExec object OapAggUtils { private def createAggregate( requiredChildDistributionExpressions: Option[Seq[Expression]] = None, groupingExpressions: Seq[NamedExpression] = Nil, aggregateExpressions: Seq[AggregateExpression] = Nil, aggregateAttributes: Seq[Attribute] = Nil, initialInputBufferOffset: Int = 0, resultExpressions: Seq[NamedExpression] = Nil, child: SparkPlan): SparkPlan = { if (requiredChildDistributionExpressions.isDefined) { // final aggregate, fall back to Spark HashAggregateExec. HashAggregateExec( requiredChildDistributionExpressions = requiredChildDistributionExpressions, groupingExpressions = groupingExpressions, aggregateExpressions = aggregateExpressions, aggregateAttributes = aggregateAttributes, initialInputBufferOffset = initialInputBufferOffset, resultExpressions = resultExpressions, child = child) } else { // Apply partial aggregate optimizations. OapAggregateExec( requiredChildDistributionExpressions = None, groupingExpressions = groupingExpressions, aggregateExpressions = aggregateExpressions, aggregateAttributes = aggregateAttributes, initialInputBufferOffset = initialInputBufferOffset, resultExpressions = resultExpressions, child = child) } } def planAggregateWithoutDistinct( groupingExpressions: Seq[NamedExpression], aggregateExpressions: Seq[AggregateExpression], resultExpressions: Seq[NamedExpression], child: SparkPlan): Seq[SparkPlan] = { val useHash = HashAggregateExec.supportsAggregate( aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)) if (!child.isInstanceOf[OapAggregationFileScanExec] || !useHash) { // Child can not leverage oap optimization reading. Nil } else { // 1. Create an Aggregate Operator for partial aggregations. val groupingAttributes = groupingExpressions.map(_.toAttribute) val partialAggregateExpressions = aggregateExpressions.map(_.copy(mode = Partial)) val partialAggregateAttributes = partialAggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) val partialResultExpressions = groupingAttributes ++ partialAggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes) val partialAggregate = createAggregate( requiredChildDistributionExpressions = None, groupingExpressions = groupingExpressions, aggregateExpressions = partialAggregateExpressions, aggregateAttributes = partialAggregateAttributes, initialInputBufferOffset = 0, resultExpressions = partialResultExpressions, child = child) // 2. Create an Aggregate Operator for final aggregations. val finalAggregateExpressions = aggregateExpressions.map(_.copy(mode = Final)) // The attributes of the final aggregation buffer, which is presented as input to the result // projection: val finalAggregateAttributes = finalAggregateExpressions.map(_.resultAttribute) val finalAggregate = createAggregate( requiredChildDistributionExpressions = Some(groupingAttributes), groupingExpressions = groupingAttributes, aggregateExpressions = finalAggregateExpressions, aggregateAttributes = finalAggregateAttributes, initialInputBufferOffset = groupingExpressions.length, resultExpressions = resultExpressions, child = partialAggregate) finalAggregate :: Nil } } }
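The Partial/Final split that planAggregateWithoutDistinct builds mirrors what Spark does for any grouped aggregation. A small sketch of how to observe it (demo names are illustrative):

import org.apache.spark.sql.SparkSession

object TwoPhaseAggDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("two-phase-agg-demo").getOrCreate()
    import spark.implicits._

    val agg = Seq(("a", 1), ("a", 2), ("b", 3)).toDF("k", "v").groupBy("k").sum("v")

    // The physical plan typically shows HashAggregate(functions=[partial_sum(v)])
    // feeding HashAggregate(functions=[sum(v)]) through an Exchange on k.
    agg.explain()
    agg.show()
    spark.stop()
  }
}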
Example 78
Source File: ArrowScan.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.v2.arrow import scala.collection.JavaConverters._ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.connector.read.PartitionReaderFactory import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.v2.FileScan import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.SerializableConfiguration case class ArrowScan( sparkSession: SparkSession, fileIndex: PartitioningAwareFileIndex, readDataSchema: StructType, readPartitionSchema: StructType, pushedFilters: Array[Filter], options: CaseInsensitiveStringMap, partitionFilters: Seq[Expression] = Seq.empty, dataFilters: Seq[Expression] = Seq.empty) extends FileScan { override def createReaderFactory(): PartitionReaderFactory = { val caseSensitiveMap = options.asCaseSensitiveMap().asScala.toMap val hconf = sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) val broadcastedConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hconf)) ArrowPartitionReaderFactory( sparkSession.sessionState.conf, broadcastedConf, readDataSchema, readPartitionSchema, pushedFilters, new ArrowOptions(options.asScala.toMap)) } override def withFilters(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan = this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) }
Example 79
Source File: SparkExtension.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas.sql import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{SparkSession, SparkSessionExtensions} class SparkExtension extends (SparkSessionExtensions => Unit) { def apply(e: SparkSessionExtensions): Unit = { e.injectParser(SparkAtlasConnectorParser) } } case class SparkAtlasConnectorParser(spark: SparkSession, delegate: ParserInterface) extends ParserInterface { override def parsePlan(sqlText: String): LogicalPlan = { SQLQuery.set(sqlText) delegate.parsePlan(sqlText) } override def parseExpression(sqlText: String): Expression = delegate.parseExpression(sqlText) override def parseTableIdentifier(sqlText: String): TableIdentifier = delegate.parseTableIdentifier(sqlText) override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = delegate.parseFunctionIdentifier(sqlText) override def parseTableSchema(sqlText: String): StructType = delegate.parseTableSchema(sqlText) override def parseDataType(sqlText: String): DataType = delegate.parseDataType(sqlText) } object SQLQuery { private[this] val sqlQuery = new ThreadLocal[String] def get(): String = sqlQuery.get def set(s: String): Unit = sqlQuery.set(s) }
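One way to activate an extension like this is the standard spark.sql.extensions configuration; a minimal sketch, assuming the connector jar is on the driver classpath (demo names are illustrative):

import org.apache.spark.sql.SparkSession

object AtlasExtensionDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("atlas-extension-demo")
      .config("spark.sql.extensions", "com.hortonworks.spark.atlas.sql.SparkExtension")
      .getOrCreate()

    // Every statement parsed by this session now stores its SQL text in SQLQuery's
    // thread-local before delegating to the default parser.
    spark.sql("SELECT 1 AS x").show()
    spark.stop()
  }
}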
Example 80
Source File: SQLBuilderTest.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst import scala.util.control.NonFatal import org.apache.spark.sql.{DataFrame, Dataset, QueryTest} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.hive.test.TestHiveSingleton abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton { protected def checkSQL(e: Expression, expectedSQL: String): Unit = { val actualSQL = e.sql try { assert(actualSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following expression: | |${e.prettyName} | |$cause """.stripMargin) } } protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = { val generatedSQL = try new SQLBuilder(plan).toSQL catch { case NonFatal(e) => fail( s"""Cannot convert the following logical query plan to SQL: | |${plan.treeString} """.stripMargin) } try { assert(generatedSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following logical query plan: | |${plan.treeString} | |$cause """.stripMargin) } checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan)) } protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = { checkSQL(df.queryExecution.analyzed, expectedSQL) } }
Example 81
Source File: ScriptTransformation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 82
Source File: EventTimeWatermark.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval

object EventTimeWatermark {
  // The object body was cut off in the extracted snippet; the metadata key used below
  // ("spark.watermarkDelayMs") is restored so EventTimeWatermark.delayKey resolves.
  val delayKey = "spark.watermarkDelayMs"
}

case class EventTimeWatermark(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: LogicalPlan) extends LogicalPlan {

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override val children: Seq[LogicalPlan] = child :: Nil
}
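This logical node is produced by Dataset.withWatermark. A brief sketch, assuming a Spark build that ships the built-in rate streaming source (2.2+); names are illustrative:

import org.apache.spark.sql.SparkSession

object WatermarkDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("watermark-demo").getOrCreate()

    val events = spark.readStream.format("rate").load()   // columns: timestamp, value
      .withWatermark("timestamp", "10 minutes")            // adds an EventTimeWatermark node

    // The delay ends up in the timestamp column's metadata under EventTimeWatermark.delayKey.
    events.printSchema()
    spark.stop()
  }
}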
Example 83
Source File: CodegenFallback.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, Nondeterministic} trait CodegenFallback extends Expression { protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // LeafNode does not need `input` val input = if (this.isInstanceOf[LeafExpression]) "null" else ctx.INPUT_ROW val idx = ctx.references.length ctx.references += this var childIndex = idx this.foreach { case n: Nondeterministic => // This might add the current expression twice, but it won't hurt. ctx.references += n childIndex += 1 ctx.addPartitionInitializationStatement( s""" |((Nondeterministic) references[$childIndex]) | .initialize(partitionIndex); """.stripMargin) case _ => } val objectTerm = ctx.freshName("obj") val placeHolder = ctx.registerComment(this.toString) if (nullable) { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); boolean ${ev.isNull} = $objectTerm == null; ${ctx.javaType(this.dataType)} ${ev.value} = ${ctx.defaultValue(this.dataType)}; if (!${ev.isNull}) { ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; }""") } else { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); ${ctx.javaType(this.dataType)} ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; """, isNull = "false") } } }
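A hypothetical expression that leans on this trait: only the interpreted eval path is written, and doGenCode comes from CodegenFallback. PlusOne is made up for illustration:

import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.types.{DataType, IntegerType}

case class PlusOne(child: Expression) extends UnaryExpression with CodegenFallback {
  override def dataType: DataType = IntegerType
  // Called only when child.eval(...) is non-null; generated code reaches it via the
  // fallback doGenCode above, which simply calls back into eval.
  override protected def nullSafeEval(input: Any): Any = input.asInstanceOf[Int] + 1
}

object PlusOneDemo {
  def main(args: Array[String]): Unit = {
    println(PlusOne(Literal(41)).eval()) // 42 via interpreted evaluation
  }
}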
Example 84
Source File: SubstituteUnresolvedOrdinals.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.CatalystConf import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Sort} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin import org.apache.spark.sql.types.IntegerType class SubstituteUnresolvedOrdinals(conf: CatalystConf) extends Rule[LogicalPlan] { private def isIntLiteral(e: Expression) = e match { case Literal(_, IntegerType) => true case _ => false } def apply(plan: LogicalPlan): LogicalPlan = plan transform { case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) => val newOrders = s.order.map { case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _) => val newOrdinal = withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) withOrigin(order.origin)(order.copy(child = newOrdinal)) case other => other } withOrigin(s.origin)(s.copy(order = newOrders)) case a: Aggregate if conf.groupByOrdinal && a.groupingExpressions.exists(isIntLiteral) => val newGroups = a.groupingExpressions.map { case ordinal @ Literal(index: Int, IntegerType) => withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) case other => other } withOrigin(a.origin)(a.copy(groupingExpressions = newGroups)) } }
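The ordinal syntax this rule rewrites looks like the following: GROUP BY 1 and ORDER BY 2 refer to positions in the select list, and the integer literals are turned into UnresolvedOrdinal nodes before analysis resolves them (demo names are illustrative):

import org.apache.spark.sql.SparkSession

object OrdinalDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ordinal-demo").getOrCreate()
    import spark.implicits._

    Seq(("a", 1), ("a", 2), ("b", 3)).toDF("k", "v").createOrReplaceTempView("t")
    // GROUP BY 1 means "group by k", ORDER BY 2 means "order by total".
    spark.sql("SELECT k, sum(v) AS total FROM t GROUP BY 1 ORDER BY 2 DESC").show()
    spark.stop()
  }
}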
Example 85
Source File: RuleExecutorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } val message = intercept[TreeNodeException[LogicalPlan]] { ToFixedPoint.execute(Literal(100)) }.getMessage assert(message.contains("Max iterations (10) reached for batch fixedPoint")) } }
Example 86
Source File: ShuffledHashJoinExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
Example 87
Source File: CartesianProductExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { // We will not sort the rows, so prefixComparator and recordComparator are null. val sorter = UnsafeExternalSorter.create( context.taskMemoryManager(), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, context, null, null, 1024, SparkEnv.get.memoryManager.pageSizeBytes, SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD), false) val partition = split.asInstanceOf[CartesianPartition] for (y <- rdd2.iterator(partition.s2, context)) { sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false) } // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow] def createIter(): Iterator[UnsafeRow] = { val iter = sorter.getIterator val unsafeRow = new UnsafeRow(numFieldsOfRight) new Iterator[UnsafeRow] { override def hasNext: Boolean = { iter.hasNext } override def next(): UnsafeRow = { iter.loadNext() unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength) unsafeRow } } } val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, sorter.cleanupResources()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 88
Source File: package.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.mv import scala.language.implicitConversions import org.apache.spark.sql.catalyst._ import org.apache.spark.sql.catalyst.expressions.{Expression, NamedExpression} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.carbondata.mv.plans._ import org.apache.carbondata.mv.plans.modular.{JoinEdge, ModularPlan} import org.apache.carbondata.mv.plans.modular.Flags._ import org.apache.carbondata.mv.plans.util._ package object dsl { object Plans { implicit class DslModularPlan(val modularPlan: ModularPlan) { def select(outputExprs: NamedExpression*) (inputExprs: Expression*) (predicateExprs: Expression*) (aliasMap: Map[Int, String]) (joinEdges: JoinEdge*): ModularPlan = { modular .Select( outputExprs, inputExprs, predicateExprs, aliasMap, joinEdges, Seq(modularPlan), NoFlags, Seq.empty, Seq.empty) } def groupBy(outputExprs: NamedExpression*) (inputExprs: Expression*) (predicateExprs: Expression*): ModularPlan = { modular .GroupBy(outputExprs, inputExprs, predicateExprs, None, modularPlan, NoFlags, Seq.empty) } def harmonize: ModularPlan = modularPlan.harmonized } implicit class DslModularPlans(val modularPlans: Seq[ModularPlan]) { def select(outputExprs: NamedExpression*) (inputExprs: Expression*) (predicateList: Expression*) (aliasMap: Map[Int, String]) (joinEdges: JoinEdge*): ModularPlan = { modular .Select( outputExprs, inputExprs, predicateList, aliasMap, joinEdges, modularPlans, NoFlags, Seq.empty, Seq.empty) } def union(): ModularPlan = modular.Union(modularPlans, NoFlags, Seq.empty) } implicit class DslLogical2Modular(val logicalPlan: LogicalPlan) { def resolveonly: LogicalPlan = analysis.SimpleAnalyzer.execute(logicalPlan) def modularize: ModularPlan = modular.SimpleModularizer.modularize(logicalPlan).next def optimize: LogicalPlan = BirdcageOptimizer.execute(logicalPlan) } } }
Example 89
Source File: package.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.mv import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{AttributeSet, Expression, PredicateHelper, ScalaUDF} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.carbondata.mv.plans.modular.ModularPlan import org.apache.carbondata.mv.plans.util.{CheckSPJG, LogicalPlanSignatureGenerator, Signature} def canEvaluate(exp: ScalaUDF, exprList: Seq[Expression]): Boolean = { var canBeDerived = false exprList.forall { case udf: ScalaUDF => if (udf.children.length == exp.children.length) { if (udf.children.zip(exp.children).forall(e => e._1.sql.equalsIgnoreCase(e._2.sql))) { canBeDerived = true } } canBeDerived case _ => canBeDerived } } def canEvaluate(expr: Expression, exprList: Seq[Expression]): Boolean = { expr match { case exp: ScalaUDF => canEvaluate(exp, exprList) case _ => expr.references.subsetOf(AttributeSet(exprList)) } } } def supports(supported: Boolean, message: Any) { if (!supported) { throw new UnsupportedOperationException(s"unsupported operation: $message") } } }
Example 90
Source File: ExpressionHelper.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.mv.plans.modular import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId, Expression, NamedExpression} import org.apache.spark.sql.types.{DataType, Metadata} object ExpressionHelper { def createReference( name: String, dataType: DataType, nullable: Boolean, metadata: Metadata, exprId: ExprId, qualifier: Option[String], attrRef : NamedExpression = null): AttributeReference = { AttributeReference(name, dataType, nullable, metadata)(exprId, qualifier) } def createAlias( child: Expression, name: String, exprId: ExprId = NamedExpression.newExprId, qualifier: Option[String] = None, explicitMetadata: Option[Metadata] = None, namedExpr : Option[NamedExpression] = None ) : Alias = { Alias(child, name)(exprId, qualifier, explicitMetadata) } def getTheLastQualifier(reference: AttributeReference): String = { reference.qualifier.head } }
Example 91
Source File: ExpressionHelper.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.mv.plans.modular import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId, Expression, NamedExpression} import org.apache.spark.sql.types.{DataType, Metadata} object ExpressionHelper { def createReference( name: String, dataType: DataType, nullable: Boolean, metadata: Metadata, exprId: ExprId, qualifier: Option[String], attrRef : NamedExpression = null): AttributeReference = { val qf = if (qualifier.nonEmpty) Seq(qualifier.get) else Seq.empty AttributeReference(name, dataType, nullable, metadata)(exprId, qf) } def createAlias( child: Expression, name: String, exprId: ExprId, qualifier: Option[String]) : Alias = { val qf = if (qualifier.nonEmpty) Seq(qualifier.get) else Seq.empty Alias(child, name)(exprId, qf, None) } def getTheLastQualifier(reference: AttributeReference): String = { reference.qualifier.reverse.head } }
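A hedged usage sketch for this variant of the helper (the column and qualifier names are illustrative):

import org.apache.spark.sql.catalyst.expressions.NamedExpression
import org.apache.spark.sql.types.{Metadata, StringType}

val ref = ExpressionHelper.createReference(
  "name", StringType, true, Metadata.empty,   // nullable = true
  NamedExpression.newExprId, Some("t1"))
val aliased = ExpressionHelper.createAlias(
  ref, "customer_name", NamedExpression.newExprId, Some("t1"))

ExpressionHelper.getTheLastQualifier(ref) // "t1"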
Example 92
Source File: CarbonBoundReference.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.sources.Filter case class CastExpr(expr: Expression) extends Filter { override def references: Array[String] = null } case class FalseExpr() extends Filter { override def references: Array[String] = null } case class CarbonEndsWith(expr: Expression) extends Filter { override def references: Array[String] = null } case class CarbonContainsWith(expr: Expression) extends Filter { override def references: Array[String] = null }
Example 93
Source File: CleanFilesPostEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.spark.sql.optimizer.CarbonFilters import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.indexstore.PartitionSpec import org.apache.carbondata.core.mutate.CarbonUpdateUtil import org.apache.carbondata.core.statusmanager.SegmentStatusManager import org.apache.carbondata.events.{CleanFilesPostEvent, Event, OperationContext, OperationEventListener} class CleanFilesPostEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case cleanFilesPostEvent: CleanFilesPostEvent => LOGGER.info("Clean files post event listener called") val carbonTable = cleanFilesPostEvent.carbonTable val indexTables = CarbonIndexUtil .getIndexCarbonTables(carbonTable, cleanFilesPostEvent.sparkSession) indexTables.foreach { indexTable => val partitions: Option[Seq[PartitionSpec]] = CarbonFilters.getPartitions( Seq.empty[Expression], cleanFilesPostEvent.sparkSession, indexTable) SegmentStatusManager.deleteLoadsAndUpdateMetadata( indexTable, true, partitions.map(_.asJava).orNull) CarbonUpdateUtil.cleanUpDeltaFiles(indexTable, true) } } } }
Example 94
Source File: MergeProjection.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.mutation.merge import java.sql.{Date, Timestamp} import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, GenericRowWithSchema, InterpretedMutableProjection, Projection} import org.apache.spark.sql.catalyst.util.DateTimeUtils case class MergeProjection( @transient tableCols: Seq[String], @transient statusCol : String, @transient ds: Dataset[Row], @transient rltn: CarbonDatasourceHadoopRelation, @transient sparkSession: SparkSession, @transient mergeAction: MergeAction) { private val cutOffDate = Integer.MAX_VALUE >> 1 val isUpdate = mergeAction.isInstanceOf[UpdateAction] val isDelete = mergeAction.isInstanceOf[DeleteAction] def apply(row: GenericRowWithSchema): InternalRow = { // TODO we can avoid these multiple conversions if this is added as a SparkPlan node. val values = row.values.map { case s: String => org.apache.spark.unsafe.types.UTF8String.fromString(s) case d: java.math.BigDecimal => org.apache.spark.sql.types.Decimal.apply(d) case b: Array[Byte] => org.apache.spark.unsafe.types.UTF8String.fromBytes(b) case d: Date => DateTimeUtils.fromJavaDate(d) case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t) case value => value } projection(new GenericInternalRow(values)).asInstanceOf[GenericInternalRow] } val (projection, output) = generateProjection private def generateProjection: (Projection, Array[Expression]) = { val existingDsOutput = rltn.carbonRelation.schema.toAttributes val colsMap = mergeAction match { case UpdateAction(updateMap) => updateMap case InsertAction(insertMap) => insertMap case _ => null } if (colsMap != null) { val output = new Array[Expression](tableCols.length) val expecOutput = new Array[Expression](tableCols.length) colsMap.foreach { case (k, v) => val tableIndex = tableCols.indexOf(k.toString().toLowerCase) if (tableIndex < 0) { throw new CarbonMergeDataSetException(s"Mapping is wrong $colsMap") } output(tableIndex) = v.expr.transform { case a: Attribute if !a.resolved => ds.queryExecution.analyzed.resolveQuoted(a.name, sparkSession.sessionState.analyzer.resolver).get } expecOutput(tableIndex) = existingDsOutput.find(_.name.equalsIgnoreCase(tableCols(tableIndex))).get } if (output.contains(null)) { throw new CarbonMergeDataSetException(s"Not all columns are mapped") } (new InterpretedMutableProjection(output++Seq( ds.queryExecution.analyzed.resolveQuoted(statusCol, sparkSession.sessionState.analyzer.resolver).get), ds.queryExecution.analyzed.output), expecOutput) } else { (null, null) } } }
Example 95
Source File: CarbonExpressions.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Expression, ScalaUDF}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.execution.command.DescribeTableCommand
import org.apache.spark.sql.types.DataType

// Extractor for ScalaUDF expressions; the listing shows only this part of
// CarbonExpressions.scala, so the surrounding declarations are omitted here.
object CarbonScalaUDF {
  def unapply(expression: Expression): Option[(ScalaUDF)] = {
    expression match {
      case a: ScalaUDF => Some(a)
      case _ => None
    }
  }
}
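A minimal sketch of how such an extractor is typically used when walking an expression tree (collectScalaUDFs is a hypothetical helper, not part of the example):

import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF}

def collectScalaUDFs(e: Expression): Seq[ScalaUDF] = e.collect {
  case CarbonScalaUDF(udf) => udf
}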
Example 96
Source File: SparkSqlAdapter.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.HadoopFsRelation import org.apache.spark.sql.types.StructType import org.apache.carbondata.core.util.ThreadLocalSessionInfo object SparkSqlAdapter { def initSparkSQL(): Unit = { } def getScanForSegments( @transient relation: HadoopFsRelation, output: Seq[Attribute], outputSchema: StructType, partitionFilters: Seq[Expression], dataFilters: Seq[Expression], tableIdentifier: Option[TableIdentifier] ): FileSourceScanExec = { FileSourceScanExec( relation, output, outputSchema, partitionFilters, dataFilters, tableIdentifier) } def addSparkSessionListener(sparkSession: SparkSession): Unit = { sparkSession.sparkContext.addSparkListener(new SparkListener { override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { CarbonEnv.carbonEnvMap.remove(sparkSession) ThreadLocalSessionInfo.unsetAll() } }) } }
Example 97
Source File: SparkSqlAdapter.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.HadoopFsRelation import org.apache.spark.sql.types.StructType object SparkSqlAdapter { def initSparkSQL(): Unit = { } def getScanForSegments( @transient relation: HadoopFsRelation, output: Seq[Attribute], outputSchema: StructType, partitionFilters: Seq[Expression], dataFilters: Seq[Expression], tableIdentifier: Option[TableIdentifier] ): FileSourceScanExec = { FileSourceScanExec( relation, output, outputSchema, partitionFilters, None, dataFilters, tableIdentifier) } }
Example 98
Source File: DeltaSourceUtils.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.sources import java.util.Locale import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.sources import org.apache.spark.sql.sources.Filter object DeltaSourceUtils { val NAME = "delta" val ALT_NAME = "delta" // Batch relations don't pass partitioning columns to `CreatableRelationProvider`s, therefore // as a hack, we pass in the partitioning columns among the options. val PARTITIONING_COLUMNS_KEY = "__partition_columns" def isDeltaDataSourceName(name: String): Boolean = { name.toLowerCase(Locale.ROOT) == NAME || name.toLowerCase(Locale.ROOT) == ALT_NAME } def translateFilters(filters: Array[Filter]): Expression = filters.map { case sources.EqualTo(attribute, value) => expressions.EqualTo(UnresolvedAttribute(attribute), expressions.Literal.create(value)) case sources.EqualNullSafe(attribute, value) => expressions.EqualNullSafe(UnresolvedAttribute(attribute), expressions.Literal.create(value)) case sources.GreaterThan(attribute, value) => expressions.GreaterThan(UnresolvedAttribute(attribute), expressions.Literal.create(value)) case sources.GreaterThanOrEqual(attribute, value) => expressions.GreaterThanOrEqual( UnresolvedAttribute(attribute), expressions.Literal.create(value)) case sources.LessThan(attribute, value) => expressions.LessThanOrEqual(UnresolvedAttribute(attribute), expressions.Literal.create(value)) case sources.LessThanOrEqual(attribute, value) => expressions.LessThanOrEqual(UnresolvedAttribute(attribute), expressions.Literal.create(value)) case sources.In(attribute, values) => expressions.In(UnresolvedAttribute(attribute), values.map(createLiteral)) case sources.IsNull(attribute) => expressions.IsNull(UnresolvedAttribute(attribute)) case sources.IsNotNull(attribute) => expressions.IsNotNull(UnresolvedAttribute(attribute)) case sources.Not(otherFilter) => expressions.Not(translateFilters(Array(otherFilter))) case sources.And(filter1, filter2) => expressions.And(translateFilters(Array(filter1)), translateFilters(Array(filter2))) case sources.Or(filter1, filter2) => expressions.Or(translateFilters(Array(filter1)), translateFilters(Array(filter2))) case sources.StringStartsWith(attribute, value) => new expressions.Like( UnresolvedAttribute(attribute), expressions.Literal.create(s"${value}%")) case sources.StringEndsWith(attribute, value) => new expressions.Like( UnresolvedAttribute(attribute), expressions.Literal.create(s"%${value}")) case sources.StringContains(attribute, value) => new expressions.Like( UnresolvedAttribute(attribute), expressions.Literal.create(s"%${value}%")) case sources.AlwaysTrue() => expressions.Literal.TrueLiteral case sources.AlwaysFalse() => expressions.Literal.FalseLiteral }.reduce(expressions.And) }
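A short usage sketch (the column names and values are made up): translateFilters folds a non-empty array of data source Filters into a single unresolved catalyst predicate.

import org.apache.spark.sql.delta.sources.DeltaSourceUtils
import org.apache.spark.sql.sources

val pushed: Array[sources.Filter] = Array(
  sources.EqualTo("country", "DE"),
  sources.GreaterThan("age", 21),
  sources.IsNotNull("name"))

val predicate = DeltaSourceUtils.translateFilters(pushed)
predicate.toString // roughly: (('country = DE) && ('age > 21)) && isnotnull('name)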
Example 99
Source File: AnalysisHelper.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.util import org.apache.spark.sql.delta.DeltaErrors import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan trait AnalysisHelper { import AnalysisHelper._ protected def tryResolveReferences( sparkSession: SparkSession)( expr: Expression, planContainingExpr: LogicalPlan): Expression = { val newPlan = FakeLogicalPlan(expr, planContainingExpr.children) sparkSession.sessionState.analyzer.execute(newPlan) match { case FakeLogicalPlan(resolvedExpr, _) => // Return even if it did not successfully resolve return resolvedExpr case _ => // This is unexpected throw DeltaErrors.analysisException( s"Could not resolve expression $expr", plan = Option(planContainingExpr)) } } protected def toDataset(sparkSession: SparkSession, logicalPlan: LogicalPlan): Dataset[Row] = { Dataset.ofRows(sparkSession, logicalPlan) } protected def improveUnsupportedOpError(f: => Unit): Unit = { val possibleErrorMsgs = Seq( "is only supported with v2 table", // full error: DELETE is only supported with v2 tables "is not supported temporarily", // full error: UPDATE TABLE is not supported temporarily "Table does not support read", "Table implementation does not support writes" ).map(_.toLowerCase()) def isExtensionOrCatalogError(error: Exception): Boolean = { possibleErrorMsgs.exists(m => error.getMessage().toLowerCase().contains(m)) } try { f } catch { case e: Exception if isExtensionOrCatalogError(e) => throw DeltaErrors.configureSparkSessionWithExtensionAndCatalog(e) } } } object AnalysisHelper { case class FakeLogicalPlan(expr: Expression, children: Seq[LogicalPlan]) extends LogicalPlan { override def output: Seq[Attribute] = Nil } }
Example 100
Source File: DeltaInvariantCheckerExec.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.schema import org.apache.spark.sql.delta.DeltaErrors import org.apache.spark.sql.delta.schema.Invariants.NotNull import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BindReferences, Expression, GetStructField, Literal, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.{NullType, StructType} private def buildExtractors(invariant: Invariant): Option[Expression] = { assert(invariant.column.nonEmpty) val topLevelColumn = invariant.column.head val topLevelRefOpt = output.collectFirst { case a: AttributeReference if SchemaUtils.DELTA_COL_RESOLVER(a.name, topLevelColumn) => a } val rejectColumnNotFound = isNullNotOkay(invariant) if (topLevelRefOpt.isEmpty) { if (rejectColumnNotFound) { throw DeltaErrors.notNullInvariantException(invariant) } } if (invariant.column.length == 1) { topLevelRefOpt.map(BindReferences.bindReference[Expression](_, output)) } else { topLevelRefOpt.flatMap { topLevelRef => val boundTopLevel = BindReferences.bindReference[Expression](topLevelRef, output) try { val nested = invariant.column.tail.foldLeft(boundTopLevel) { case (e, fieldName) => e.dataType match { case StructType(fields) => val ordinal = fields.indexWhere(f => SchemaUtils.DELTA_COL_RESOLVER(f.name, fieldName)) if (ordinal == -1) { throw new IndexOutOfBoundsException(s"Not nullable column not found in struct: " + s"${fields.map(_.name).mkString("[", ",", "]")}") } GetStructField(e, ordinal, Some(fieldName)) case _ => throw new UnsupportedOperationException( "Invariants on nested fields other than StructTypes are not supported.") } } Some(nested) } catch { case i: IndexOutOfBoundsException if rejectColumnNotFound => throw InvariantViolationException(invariant, i.getMessage) case _: IndexOutOfBoundsException if !rejectColumnNotFound => None } } } } override protected def doExecute(): RDD[InternalRow] = { if (invariants.isEmpty) return child.execute() val boundRefs = invariants.map { invariant => CheckDeltaInvariant(buildExtractors(invariant).getOrElse(Literal(null, NullType)), invariant) } child.execute().mapPartitionsInternal { rows => val assertions = GenerateUnsafeProjection.generate(boundRefs) rows.map { row => assertions(row) row } } } override def outputOrdering: Seq[SortOrder] = child.outputOrdering override def outputPartitioning: Partitioning = child.outputPartitioning }
Example 101
Source File: CheckDeltaInvariant.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.schema import org.apache.spark.sql.delta.schema.Invariants.{ArbitraryExpression, NotNull} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{Expression, NonSQLExpression, UnaryExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{Block, CodegenContext, ExprCode, JavaCode, TrueLiteral} import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.types.{DataType, NullType} case class CheckDeltaInvariant( child: Expression, invariant: Invariant) extends UnaryExpression with NonSQLExpression { override def dataType: DataType = NullType override def foldable: Boolean = false override def nullable: Boolean = true override def flatArguments: Iterator[Any] = Iterator(child) private def assertRule(input: InternalRow): Unit = invariant.rule match { case NotNull if child.eval(input) == null => throw InvariantViolationException(invariant, "") case ArbitraryExpression(expr) => val resolvedExpr = expr.transform { case _: UnresolvedAttribute => child } val result = resolvedExpr.eval(input) if (result == null || result == false) { throw InvariantViolationException( invariant, s"Value ${child.eval(input)} violates requirement.") } } override def eval(input: InternalRow): Any = { assertRule(input) null } private def generateNotNullCode(ctx: CodegenContext): Block = { val childGen = child.genCode(ctx) val invariantField = ctx.addReferenceObj("errMsg", invariant) code"""${childGen.code} | |if (${childGen.isNull}) { | throw org.apache.spark.sql.delta.schema.InvariantViolationException.apply( | $invariantField, ""); |} """.stripMargin } private def generateExpressionValidationCode(expr: Expression, ctx: CodegenContext): Block = { val resolvedExpr = expr.transform { case _: UnresolvedAttribute => child } val elementValue = child.genCode(ctx) val childGen = resolvedExpr.genCode(ctx) val invariantField = ctx.addReferenceObj("errMsg", invariant) val eValue = ctx.freshName("elementResult") code"""${elementValue.code} |${childGen.code} | |if (${childGen.isNull} || ${childGen.value} == false) { | Object $eValue = "null"; | if (!${elementValue.isNull}) { | $eValue = (Object) ${elementValue.value}; | } | throw org.apache.spark.sql.delta.schema.InvariantViolationException.apply( | $invariantField, "Value " + $eValue + " violates requirement."); |} """.stripMargin } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val code = invariant.rule match { case NotNull => generateNotNullCode(ctx) case ArbitraryExpression(expr) => generateExpressionValidationCode(expr, ctx) } ev.copy(code = code, isNull = TrueLiteral, value = JavaCode.literal("null", NullType)) } }
Example 102
Source File: DeltaSourceSnapshot.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.files import org.apache.spark.sql.delta.{DeltaLog, DeltaTableUtils, Snapshot} import org.apache.spark.sql.delta.sources.IndexedFile import org.apache.spark.sql.delta.util.StateCache import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.functions.lit class DeltaSourceSnapshot( val spark: SparkSession, val snapshot: Snapshot, val filters: Seq[Expression]) extends SnapshotIterator with StateCache { protected val version = snapshot.version protected val path = snapshot.path protected lazy val (partitionFilters, dataFilters) = { val partitionCols = snapshot.metadata.partitionColumns filters.partition { e => DeltaTableUtils.isPredicatePartitionColumnsOnly(e, partitionCols, spark) } } protected def initialFiles: Dataset[IndexedFile] = { import spark.implicits._ cacheDS( snapshot.allFiles.sort("modificationTime", "path") .rdd.zipWithIndex() .toDF("add", "index") .withColumn("version", lit(version)) .withColumn("isLast", lit(false)) .as[IndexedFile], s"Delta Source Snapshot #$version - ${snapshot.redactedPath}").getDS } override def close(unpersistSnapshot: Boolean): Unit = { super.close(unpersistSnapshot) if (unpersistSnapshot) { snapshot.uncache() } } } trait SnapshotIterator { self: DeltaSourceSnapshot => private var result: Iterable[IndexedFile] = _ def iterator(): Iterator[IndexedFile] = { import spark.implicits._ if (result == null) { result = DeltaLog.filterFileList( snapshot.metadata.partitionSchema, initialFiles.toDF(), partitionFilters, Seq("add")).as[IndexedFile].collect().toIterable } // This will always start from the beginning and re-use resources. If any exceptions were to // be thrown, the stream would stop, we would call stop on the source, and that will make // sure that we clean up resources. result.toIterator } def close(unpersistSnapshot: Boolean): Unit = { } }
Example 103
Source File: DeltaTableOperations.scala From delta with Apache License 2.0 | 5 votes |
package io.delta.tables.execution import scala.collection.Map import org.apache.spark.sql.delta.{DeltaErrors, DeltaHistoryManager, DeltaLog, PreprocessTableUpdate} import org.apache.spark.sql.delta.commands.{DeleteCommand, DeltaGenerateCommand, VacuumCommand} import org.apache.spark.sql.delta.util.AnalysisHelper import io.delta.tables.DeltaTable import org.apache.spark.sql.{functions, Column, DataFrame, Dataset} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression} import org.apache.spark.sql.catalyst.plans.logical._ trait DeltaTableOperations extends AnalysisHelper { self: DeltaTable => protected def executeDelete(condition: Option[Expression]): Unit = improveUnsupportedOpError { val delete = DeleteFromTable(self.toDF.queryExecution.analyzed, condition) toDataset(sparkSession, delete) } protected def executeHistory(deltaLog: DeltaLog, limit: Option[Int]): DataFrame = { val history = new DeltaHistoryManager(deltaLog) val spark = self.toDF.sparkSession spark.createDataFrame(history.getHistory(limit)) } protected def executeGenerate(tblIdentifier: String, mode: String): Unit = { val tableId: TableIdentifier = sparkSession .sessionState .sqlParser .parseTableIdentifier(tblIdentifier) val generate = DeltaGenerateCommand(mode, tableId) generate.run(sparkSession) } protected def executeUpdate( set: Map[String, Column], condition: Option[Column]): Unit = improveUnsupportedOpError { val assignments = set.map { case (targetColName, column) => Assignment(UnresolvedAttribute.quotedString(targetColName), column.expr) }.toSeq val update = UpdateTable(self.toDF.queryExecution.analyzed, assignments, condition.map(_.expr)) toDataset(sparkSession, update) } protected def executeVacuum( deltaLog: DeltaLog, retentionHours: Option[Double]): DataFrame = { VacuumCommand.gc(sparkSession, deltaLog, false, retentionHours) sparkSession.emptyDataFrame } protected def toStrColumnMap(map: Map[String, String]): Map[String, Column] = { map.toSeq.map { case (k, v) => k -> functions.expr(v) }.toMap } protected def sparkSession = self.toDF.sparkSession }
Example 104
Source File: DruidOperatorSchema.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.druid import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, NamedExpression} import org.apache.spark.sql.types.DataType import org.sparklinedata.druid.{DruidOperatorAttribute, DruidQueryBuilder} lazy val pushedDownExprToDruidAttr : Map[Expression, DruidOperatorAttribute] = buildPushDownDruidAttrsMap private def pushDownExpressionMap : Map[String, (Expression, DataType, DataType, String)] = dqb.outputAttributeMap.filter(t => t._2._1 != null) private def buildPushDownDruidAttrsMap : Map[Expression, DruidOperatorAttribute] = (pushDownExpressionMap map { case (nm, (e, oDT, dDT, tf)) => { (e -> druidAttrMap(nm)) } }) private def buildDruidOpAttr : Map[String, DruidOperatorAttribute] = (dqb.outputAttributeMap map { case (nm, (e, oDT, dDT, tf)) => { val druidEid = e match { case null => NamedExpression.newExprId case n: NamedExpression => n.exprId case _ => NamedExpression.newExprId } (nm -> DruidOperatorAttribute(druidEid, nm, dDT, tf)) } } ) }
Example 105
Source File: PlanningTest.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.druid.test import java.util.TimeZone import com.github.nscala_time.time.Imports._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.expressions.{Expression, PredicateHelper} import org.apache.spark.sql.catalyst.plans.logical.Filter import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.hive.test.sparklinedata.TestHive import org.apache.spark.sql.sources.druid.DruidPlanner import org.scalatest.BeforeAndAfterEach import org.sparklinedata.druid._ import org.sparklinedata.druid.client.test.BaseTest import org.sparklinedata.druid.metadata.DruidRelationInfo trait PlanningTestHelper extends PredicateHelper { System.setProperty("user.timezone", "UTC") TimeZone.setDefault(TimeZone.getTimeZone("UTC")) override def splitConjunctivePredicates(condition: Expression): Seq[Expression] = { super.splitConjunctivePredicates(condition) } } abstract class PlanningTest extends BaseTest with BeforeAndAfterEach with PlanningTestHelper { val dPlanner = new DruidPlanner(TestHive) var tab: DataFrame = _ var drInfo: DruidRelationInfo = _ var dqb: DruidQueryBuilder = _ var iCE: IntervalConditionExtractor = _ var iCE2: SparkIntervalConditionExtractor = _ override def beforeAll() = { super.beforeAll() tab = TestHive.table("orderLineItemPartSupplier") drInfo = tab.queryExecution.optimizedPlan. asInstanceOf[LogicalRelation].relation.asInstanceOf[DruidRelation].info } override protected def beforeEach(): Unit = { dqb = DruidQueryBuilder(drInfo) iCE = new IntervalConditionExtractor(dqb) iCE2 = new SparkIntervalConditionExtractor(dqb) } def validateFilter(filterStr: String, pushedToDruid: Boolean = true, filSpec: Option[FilterSpec] = None, intervals: List[Interval] = List() ): Unit = { val q = tab.where(filterStr) val filter = q.queryExecution.optimizedPlan.asInstanceOf[Filter] val dqbs = dPlanner.translateProjectFilter( Some(dqb), Seq(), splitConjunctivePredicates(filter.condition), true ) if (pushedToDruid) { assert(dqbs.size == 1) val odqb = dqbs(0) assert(odqb.filterSpec == filSpec) assert(odqb.queryIntervals.intervals == intervals) } } }
Example 106
Source File: Serialize.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.impl.expressions import java.io.ByteArrayOutputStream import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, _} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.opencypher.morpheus.impl.expressions.EncodeLong.encodeLong import org.opencypher.morpheus.impl.expressions.Serialize._ import org.opencypher.okapi.impl.exception case class Serialize(children: Seq[Expression]) extends Expression { override def dataType: DataType = BinaryType override def nullable: Boolean = false // TODO: Only write length if more than one column is serialized override def eval(input: InternalRow): Any = { // TODO: Reuse from a pool instead of allocating a new one for each serialization val out = new ByteArrayOutputStream() children.foreach { child => child.dataType match { case BinaryType => write(child.eval(input).asInstanceOf[Array[Byte]], out) case StringType => write(child.eval(input).asInstanceOf[UTF8String], out) case IntegerType => write(child.eval(input).asInstanceOf[Int], out) case LongType => write(child.eval(input).asInstanceOf[Long], out) case other => throw exception.UnsupportedOperationException(s"Cannot serialize Spark data type $other.") } } out.toByteArray } override protected def doGenCode( ctx: CodegenContext, ev: ExprCode ): ExprCode = { ev.isNull = FalseLiteral val out = ctx.freshName("out") val serializeChildren = children.map { child => val childEval = child.genCode(ctx) s"""|${childEval.code} |if (!${childEval.isNull}) { | ${Serialize.getClass.getName.dropRight(1)}.write(${childEval.value}, $out); |}""".stripMargin }.mkString("\n") val baos = classOf[ByteArrayOutputStream].getName ev.copy( code = code"""|$baos $out = new $baos(); |$serializeChildren |byte[] ${ev.value} = $out.toByteArray();""".stripMargin) } } object Serialize { val supportedTypes: Set[DataType] = Set(BinaryType, StringType, IntegerType, LongType) @inline final def write(value: Array[Byte], out: ByteArrayOutputStream): Unit = { out.write(encodeLong(value.length)) out.write(value) } @inline final def write( value: Boolean, out: ByteArrayOutputStream ): Unit = write(if (value) 1.toLong else 0.toLong, out) @inline final def write(value: Byte, out: ByteArrayOutputStream): Unit = write(value.toLong, out) @inline final def write(value: Int, out: ByteArrayOutputStream): Unit = write(value.toLong, out) @inline final def write(value: Long, out: ByteArrayOutputStream): Unit = write(encodeLong(value), out) @inline final def write(value: UTF8String, out: ByteArrayOutputStream): Unit = write(value.getBytes, out) @inline final def write(value: String, out: ByteArrayOutputStream): Unit = write(value.getBytes, out) }
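A hedged sketch of evaluating the expression directly; literal children ignore the input row, so an empty InternalRow is enough here.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

val expr = Serialize(Seq(
  Literal(42L),                                      // LongType child
  Literal(UTF8String.fromString("node"), StringType) // StringType child
))
val bytes = expr.eval(InternalRow.empty).asInstanceOf[Array[Byte]]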
Example 107
Source File: EncodeLong.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.impl.expressions import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, NullIntolerant, UnaryExpression} import org.apache.spark.sql.types.{BinaryType, DataType, LongType} import org.opencypher.morpheus.api.value.MorpheusElement._ case class EncodeLong(child: Expression) extends UnaryExpression with NullIntolerant with ExpectsInputTypes { override val dataType: DataType = BinaryType override val inputTypes: Seq[LongType] = Seq(LongType) override protected def nullSafeEval(input: Any): Any = EncodeLong.encodeLong(input.asInstanceOf[Long]) override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = defineCodeGen(ctx, ev, c => s"(byte[])(${EncodeLong.getClass.getName.dropRight(1)}.encodeLong($c))") } object EncodeLong { private final val moreBytesBitMask: Long = Integer.parseInt("10000000", 2) private final val varLength7BitMask: Long = Integer.parseInt("01111111", 2) private final val otherBitsMask = ~varLength7BitMask private final val maxBytesForLongVarEncoding = 10 // Same encoding as as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding @inline final def encodeLong(l: Long): Array[Byte] = { val tempResult = new Array[Byte](maxBytesForLongVarEncoding) var remainder = l var index = 0 while ((remainder & otherBitsMask) != 0) { tempResult(index) = ((remainder & varLength7BitMask) | moreBytesBitMask).toByte remainder >>>= 7 index += 1 } tempResult(index) = remainder.toByte val result = new Array[Byte](index + 1) System.arraycopy(tempResult, 0, result, 0, index + 1) result } // Same encoding as as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding @inline final def decodeLong(input: Array[Byte]): Long = { assert(input.nonEmpty, "`decodeLong` requires a non-empty array as its input") var index = 0 var currentByte = input(index) var decoded = currentByte & varLength7BitMask var nextLeftShift = 7 while ((currentByte & moreBytesBitMask) != 0) { index += 1 currentByte = input(index) decoded |= (currentByte & varLength7BitMask) << nextLeftShift nextLeftShift += 7 } assert(index == input.length - 1, s"`decodeLong` received an input array ${input.toSeq.toHex} with extra bytes that could not be decoded.") decoded } implicit class ColumnLongOps(val c: Column) extends AnyVal { def encodeLongAsMorpheusId(name: String): Column = encodeLongAsMorpheusId.as(name) def encodeLongAsMorpheusId: Column = new Column(EncodeLong(c.expr)) } }
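A small round-trip sketch of the varint helpers, plus the implicit Column syntax (the column name is illustrative):

val bytes = EncodeLong.encodeLong(300L) // Array(-84, 2): 300 split into two 7-bit groups
val back = EncodeLong.decodeLong(bytes) // 300L

import org.apache.spark.sql.functions.col
import org.opencypher.morpheus.impl.expressions.EncodeLong._

val encodedId = col("id").encodeLongAsMorpheusId("encoded_id")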
Example 108
Source File: SQLBuilderTest.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst import scala.util.control.NonFatal import org.apache.spark.sql.{DataFrame, Dataset, QueryTest} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.hive.test.TestHiveSingleton abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton { protected def checkSQL(e: Expression, expectedSQL: String): Unit = { val actualSQL = e.sql try { assert(actualSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following expression: | |${e.prettyName} | |$cause """.stripMargin) } } protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = { val generatedSQL = try new SQLBuilder(plan).toSQL catch { case NonFatal(e) => fail( s"""Cannot convert the following logical query plan to SQL: | |${plan.treeString} """.stripMargin) } try { assert(generatedSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following logical query plan: | |${plan.treeString} | |$cause """.stripMargin) } checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan)) } protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = { checkSQL(df.queryExecution.analyzed, expectedSQL) } }
Example 109
Source File: ScriptTransformation.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 110
Source File: EventTimeWatermark.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval

object EventTimeWatermark {
  // Metadata key used to tag the event-time column with its watermark delay,
  // referenced by the case class below.
  val delayKey: String = "spark.watermarkDelayMs"
}

case class EventTimeWatermark(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: LogicalPlan) extends LogicalPlan {

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override val children: Seq[LogicalPlan] = child :: Nil
}
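For context, a hedged sketch of the public API that produces this node (assumes a local SparkSession and a socket stream; the host, port and window sizes are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, window}

val spark = SparkSession.builder().master("local[*]").appName("watermark-demo").getOrCreate()

val events = spark.readStream
  .format("socket").option("host", "localhost").option("port", 9999).load()
  .selectExpr("current_timestamp() AS eventTime", "value")

val withDelay = events.withWatermark("eventTime", "10 minutes")
val counts = withDelay.groupBy(window(col("eventTime"), "5 minutes")).count()
// withDelay.queryExecution.analyzed now contains an EventTimeWatermark node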
Example 111
Source File: CodegenFallback.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, Nondeterministic} trait CodegenFallback extends Expression { protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // LeafNode does not need `input` val input = if (this.isInstanceOf[LeafExpression]) "null" else ctx.INPUT_ROW val idx = ctx.references.length ctx.references += this var childIndex = idx this.foreach { case n: Nondeterministic => // This might add the current expression twice, but it won't hurt. ctx.references += n childIndex += 1 ctx.addPartitionInitializationStatement( s""" |((Nondeterministic) references[$childIndex]) | .initialize(partitionIndex); """.stripMargin) case _ => } val objectTerm = ctx.freshName("obj") val placeHolder = ctx.registerComment(this.toString) if (nullable) { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); boolean ${ev.isNull} = $objectTerm == null; ${ctx.javaType(this.dataType)} ${ev.value} = ${ctx.defaultValue(this.dataType)}; if (!${ev.isNull}) { ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; }""") } else { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); ${ctx.javaType(this.dataType)} ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; """, isNull = "false") } } }
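A hypothetical leaf expression that opts into CodegenFallback instead of writing its own doGenCode (PiLiteral is made up for illustration):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.LeafExpression
import org.apache.spark.sql.types.{DataType, DoubleType}

case class PiLiteral() extends LeafExpression with CodegenFallback {
  override def dataType: DataType = DoubleType
  override def nullable: Boolean = false
  // Interpreted path; CodegenFallback routes the generated code back to this eval().
  override def eval(input: InternalRow): Any = math.Pi
}

PiLiteral().eval(null) // 3.141592653589793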
Example 112
Source File: SubstituteUnresolvedOrdinals.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.CatalystConf import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Sort} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin import org.apache.spark.sql.types.IntegerType class SubstituteUnresolvedOrdinals(conf: CatalystConf) extends Rule[LogicalPlan] { private def isIntLiteral(e: Expression) = e match { case Literal(_, IntegerType) => true case _ => false } def apply(plan: LogicalPlan): LogicalPlan = plan transform { case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) => val newOrders = s.order.map { case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _) => val newOrdinal = withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) withOrigin(order.origin)(order.copy(child = newOrdinal)) case other => other } withOrigin(s.origin)(s.copy(order = newOrders)) case a: Aggregate if conf.groupByOrdinal && a.groupingExpressions.exists(isIntLiteral) => val newGroups = a.groupingExpressions.map { case ordinal @ Literal(index: Int, IntegerType) => withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) case other => other } withOrigin(a.origin)(a.copy(groupingExpressions = newGroups)) } }
Example 113
Source File: RuleExecutorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } val message = intercept[TreeNodeException[LogicalPlan]] { ToFixedPoint.execute(Literal(100)) }.getMessage assert(message.contains("Max iterations (10) reached for batch fixedPoint")) } }
Example 114
Source File: ShuffledHashJoinExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
Example 115
Source File: CartesianProductExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.hadoop.security.UserGroupInformation import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { private[this] val user = UserGroupInformation.getCurrentUser.getShortUserName override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { // We will not sort the rows, so prefixComparator and recordComparator are null. val sorter = UnsafeExternalSorter.create( context.taskMemoryManager(), SparkEnv.get(user).blockManager, SparkEnv.get(user).serializerManager, context, null, null, 1024, SparkEnv.get(user).memoryManager.pageSizeBytes, SparkEnv.get(user).conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD), false) val partition = split.asInstanceOf[CartesianPartition] for (y <- rdd2.iterator(partition.s2, context)) { sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false) } // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow] def createIter(): Iterator[UnsafeRow] = { val iter = sorter.getIterator val unsafeRow = new UnsafeRow(numFieldsOfRight) new Iterator[UnsafeRow] { override def hasNext: Boolean = { iter.hasNext } override def next(): UnsafeRow = { iter.loadNext() unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength) unsafeRow } } } val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, sorter.cleanupResources()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 116
Source File: OTBLeftSemiHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import java.util.{HashSet => JHashSet} import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId} import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent._ import scala.concurrent.duration._ case class OTBLeftSemiHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override val buildSide = BuildRight override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil override def output = left.output @transient private[this] lazy val keyGenerator: () => MutableProjection = newMutableProjection(buildKeys, buildPlan.output) val timeout = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { Duration.Inf } else { timeoutValue.seconds } } @transient private lazy val broadcastFuture = future { prevBatch match { case None => // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[Row] = buildPlan.execute() .mapPartitions(HashedSet(_, keyGenerator())).collect() val hashed = HashedSet(input.iterator) val broadcast = sparkContext.broadcast(hashed) controller.broadcasts((opId, currentBatch)) = broadcast broadcast case Some(bId) => controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]] } } override def doExecute() = { val broadcastRelation: Broadcast[JHashSet[Row]] = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamIter => val hashSet = broadcastRelation.value val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue) }) } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = { val join = OTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId) join.broadcastFuture join } }
Example 117
Source File: MTBLeftSemiHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import java.util.{HashSet => JHashSet} import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId} import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent._ import scala.concurrent.duration._ case class MTBLeftSemiHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override val buildSide = BuildRight override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil override def output = left.output @transient private[this] lazy val keyGenerator: () => MutableProjection = newMutableProjection(buildKeys, buildPlan.output) val timeout = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { Duration.Inf } else { timeoutValue.seconds } } val watcher = controller.getWatcher @transient private lazy val broadcastFuture = future { // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[Row] = buildPlan.execute() .mapPartitions(HashedSet(_, keyGenerator())).collect() prevBatch match { case None => val hashed = HashedSet(input.iterator) val broadcast = sparkContext.broadcast(hashed) controller.broadcasts((opId, currentBatch)) = broadcast broadcast case Some(bId) => // TODO: fix this integrity error by supporting join whose both branches may grow val hashed = HashedSet(input.iterator) val previous = controller.broadcasts((opId, bId)).value.asInstanceOf[JHashSet[Row]] if (!previous.containsAll(hashed)) { watcher += -1 logError(s"Integrity Error in MTBLeftSemiHashJoin(Op $opId, Batch $currentBatch)") } controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]] } } override def doExecute() = { val broadcastRelation = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamIter => val hashSet = broadcastRelation.value val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue) }) } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = { val join = MTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId) join.broadcastFuture join } }
Example 118
Source File: OTShuffledHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import org.apache.spark.SparkEnv import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning} import org.apache.spark.sql.execution.joins.{BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.ComposeRDDFunctions._ import org.apache.spark.sql.hive.online._ import org.apache.spark.storage.{OLABlockId, StorageLevel} case class OTShuffledHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil def retrieveState(): RDD[HashedRelation] = prevBatch match { case Some(bId) => val numParts = controller.olaBlocks(opId, bId) OLABlockRDD.create[HashedRelation](sparkContext, opId.id, Array((numParts, bId)), numParts) case None => sys.error(s"Unexpected prevBatch = $prevBatch") } override def doExecute() = { prevBatch match { case None => val buildRdd = buildPlan.execute() controller.olaBlocks((opId, currentBatch)) = buildRdd.partitions.length buildRdd.zipPartitionsWithIndex(streamedPlan.execute()) { (index, buildIter, streamIter) => val hashed = HashedRelation(buildIter, buildSideKeyGenerator) SparkEnv.get.blockManager.putSingle( OLABlockId(opId.id, currentBatch, index), hashed, StorageLevel.MEMORY_AND_DISK) hashJoin(streamIter, hashed) } case Some(_) => retrieveState().zipPartitionsWithIndex(streamedPlan.execute()) { (index, buildIter, streamIter) => val hashed = buildIter.next() hashJoin(streamIter, hashed) } } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = OTShuffledHashJoin(leftKeys, rightKeys, buildSide, left, right)(controller, newTrace, opId) }
Example 119
Source File: OTBroadcastHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.expressions.{Expression, Row} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.joins.{BroadcastHashJoin, BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId} import scala.concurrent._ import scala.concurrent.duration._ case class OTBroadcastHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil val timeout = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { Duration.Inf } else { timeoutValue.seconds } } @transient private lazy val broadcastFuture = future { prevBatch match { case None => // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[Row] = buildPlan.execute().map(_.copy()).collect() val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length) val broadcast = sparkContext.broadcast(hashed) controller.broadcasts((opId, currentBatch)) = broadcast broadcast case Some(bId) => controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[HashedRelation]] } }(BroadcastHashJoin.broadcastHashJoinExecutionContext) override def doExecute() = { val broadcastRelation = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamedIter => hashJoin(streamedIter, broadcastRelation.value) } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = { val join = OTBroadcastHashJoin(leftKeys, rightKeys, buildSide, left, right)( controller, newTrace, opId) join.broadcastFuture join } }
Example 120
Source File: binary.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.mathfuncs import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, BinaryExpression, Expression, Row} import org.apache.spark.sql.types._ abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String) extends BinaryExpression with Serializable with ExpectsInputTypes { self: Product => type EvaluatedType = Any override def symbol: String = null override def expectedChildTypes: Seq[DataType] = Seq(DoubleType, DoubleType) override def nullable: Boolean = left.nullable || right.nullable override def toString: String = s"$name($left, $right)" override lazy val resolved = left.resolved && right.resolved && left.dataType == right.dataType && !DecimalType.isFixed(left.dataType) override def dataType: DataType = DoubleType override def eval(input: Row): Any = { val evalE1 = left.eval(input) if (evalE1 == null) { null } else { val evalE2 = right.eval(input) if (evalE2 == null) { null } else { val result = f(evalE1.asInstanceOf[Double], evalE2.asInstanceOf[Double]) if (result.isNaN) null else result } } } } case class Atan2( left: Expression, right: Expression) extends BinaryMathExpression(math.atan2, "ATAN2") { override def eval(input: Row): Any = { val evalE1 = left.eval(input) if (evalE1 == null) { null } else { val evalE2 = right.eval(input) if (evalE2 == null) { null } else { // With codegen, the values returned by -0.0 and 0.0 are different. Handled with +0.0 val result = math.atan2(evalE1.asInstanceOf[Double] + 0.0, evalE2.asInstanceOf[Double] + 0.0) if (result.isNaN) null else result } } } } case class Hypot( left: Expression, right: Expression) extends BinaryMathExpression(math.hypot, "HYPOT") case class Pow(left: Expression, right: Expression) extends BinaryMathExpression(math.pow, "POWER")
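These Catalyst expressions back the corresponding DataFrame functions in org.apache.spark.sql.functions. A minimal usage sketch, assuming an active SparkSession named spark and illustrative column names:

import org.apache.spark.sql.functions.{atan2, col, hypot, pow}
import spark.implicits._

// Hypothetical (x, y) points; each select item resolves to one of the expressions above.
val points = Seq((3.0, 4.0), (1.0, 1.0)).toDF("x", "y")
points.select(
  atan2(col("y"), col("x")).as("angle"),      // ATAN2
  hypot(col("x"), col("y")).as("distance"),   // HYPOT
  pow(col("x"), 2.0).as("x_squared")          // POWER
).show()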
Example 121
Source File: FunctionRegistry.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.CatalystConf import org.apache.spark.sql.catalyst.expressions.Expression import scala.collection.mutable object StringKeyHashMap { def apply[T](caseSensitive: Boolean): StringKeyHashMap[T] = caseSensitive match { case false => new StringKeyHashMap[T](_.toLowerCase) case true => new StringKeyHashMap[T](identity) } } class StringKeyHashMap[T](normalizer: (String) => String) { private val base = new collection.mutable.HashMap[String, T]() def apply(key: String): T = base(normalizer(key)) def get(key: String): Option[T] = base.get(normalizer(key)) def put(key: String, value: T): Option[T] = base.put(normalizer(key), value) def remove(key: String): Option[T] = base.remove(normalizer(key)) def iterator: Iterator[(String, T)] = base.toIterator }
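The normalizer is applied on every access, which is how the function registry gets case-insensitive lookups. A small sketch of the behavior, using the StringKeyHashMap defined above (the entries are illustrative):

// Case-insensitive: both lookups hit the entry stored under "substring".
val fns = StringKeyHashMap[String](caseSensitive = false)
fns.put("Substring", "builtin substring")
assert(fns.get("SUBSTRING").contains("builtin substring"))
assert(fns.get("substring").contains("builtin substring"))

// Case-sensitive: keys are kept exactly as given.
val strict = StringKeyHashMap[String](caseSensitive = true)
strict.put("Substring", "builtin substring")
assert(strict.get("substring").isEmpty)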
Example 122
Source File: RuleExecutorSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(100)) === Literal(90)) } }
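The same Rule/RuleExecutor machinery can drive any expression-tree rewrite, not just the test rule above. A sketch of a standalone executor that folds integer additions, written against the Spark 2.x Catalyst API used in these examples (the rule and object names are made up):

import org.apache.spark.sql.catalyst.expressions.{Add, Expression, IntegerLiteral, Literal}
import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor}

// Collapse Add(Literal, Literal) into a single literal, bottom-up.
object FoldIntAdds extends Rule[Expression] {
  def apply(e: Expression): Expression = e transformUp {
    case Add(IntegerLiteral(a), IntegerLiteral(b)) => Literal(a + b)
  }
}

object Fold extends RuleExecutor[Expression] {
  val batches = Batch("fold", FixedPoint(10), FoldIntAdds) :: Nil
}

// Add(Add(1, 2), 3) folds to Literal(6) in a single bottom-up pass.
val folded = Fold.execute(Add(Add(Literal(1), Literal(2)), Literal(3)))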
Example 123
Source File: LeftSemiJoinHash.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row} import org.apache.spark.sql.catalyst.plans.physical.ClusteredDistribution import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class LeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override val buildSide: BuildSide = BuildRight override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil override def output: Seq[Attribute] = left.output protected override def doExecute(): RDD[Row] = { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) => val hashSet = new java.util.HashSet[Row]() var currentRow: Row = null // Create a Hash set of buildKeys while (buildIter.hasNext) { currentRow = buildIter.next() val rowKey = buildSideKeyGenerator(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) if (!keyExists) { hashSet.add(rowKey) } } } val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue) }) } } }
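At the user-facing API level, this operator is what a left semi join is typically planned into. A minimal sketch, assuming two illustrative DataFrames orders and customers:

// Keep the rows of `orders` that have a matching customer; no columns from
// `customers` appear in the output, which is the defining property of a semi join.
val matched = orders.join(customers, orders("customer_id") === customers("id"), "leftsemi")

// Equivalent SQL: SELECT * FROM orders o LEFT SEMI JOIN customers c ON o.customer_id = c.id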
Example 124
Source File: BroadcastHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.util.ThreadUtils import scala.concurrent._ import scala.concurrent.duration._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{Row, Expression} import org.apache.spark.sql.catalyst.plans.physical.{Distribution, Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class BroadcastHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { val timeout: Duration = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { Duration.Inf } else { timeoutValue.seconds } } override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil @transient lazy val broadcastFuture = future { // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[Row] = buildPlan.execute().map(_.copy()).collect() val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length) sparkContext.broadcast(hashed) }(BroadcastHashJoin.broadcastHashJoinExecutionContext) protected override def doExecute(): RDD[Row] = { val broadcastRelation = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamedIter => hashJoin(streamedIter, broadcastRelation.value) } } } object BroadcastHashJoin { private[sql] val broadcastHashJoinExecutionContext = ExecutionContext.fromExecutorService( ThreadUtils.newDaemonCachedThreadPool("broadcast-hash-join", 128)) }
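To steer the planner toward this operator from the DataFrame API, the usual route is the broadcast hint. A sketch, assuming illustrative DataFrames facts (large) and dims (small):

import org.apache.spark.sql.functions.broadcast

// Explicitly mark the small side for broadcasting so the join is planned as a
// broadcast hash join instead of a shuffle-based one.
val joined = facts.join(broadcast(dims), Seq("key"))

// Without the hint, broadcasting still happens automatically when the build side's
// estimated size is below spark.sql.autoBroadcastJoinThreshold (10 MB by default).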
Example 125
Source File: BroadcastLeftSemiJoinHash.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class BroadcastLeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override val buildSide: BuildSide = BuildRight override def output: Seq[Attribute] = left.output protected override def doExecute(): RDD[Row] = { val buildIter = buildPlan.execute().map(_.copy()).collect().toIterator val hashSet = new java.util.HashSet[Row]() var currentRow: Row = null // Create a Hash set of buildKeys while (buildIter.hasNext) { currentRow = buildIter.next() val rowKey = buildSideKeyGenerator(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) if (!keyExists) { hashSet.add(rowKey) } } } val broadcastedRelation = sparkContext.broadcast(hashSet) streamedPlan.execute().mapPartitions { streamIter => val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && broadcastedRelation.value.contains(joinKeys.currentValue) }) } } }
Example 126
Source File: ShuffledHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class ShuffledHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil protected override def doExecute(): RDD[Row] = { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) => val hashed = HashedRelation(buildIter, buildSideKeyGenerator) hashJoin(streamIter, hashed) } } }
Example 127
Source File: FramelessInternals.scala From frameless with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.{Alias, CreateStruct} import org.apache.spark.sql.catalyst.expressions.{Expression, NamedExpression} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.types._ import org.apache.spark.sql.types.ObjectType import scala.reflect.ClassTag object FramelessInternals { def objectTypeFor[A](implicit classTag: ClassTag[A]): ObjectType = ObjectType(classTag.runtimeClass) def resolveExpr(ds: Dataset[_], colNames: Seq[String]): NamedExpression = { ds.toDF.queryExecution.analyzed.resolve(colNames, ds.sparkSession.sessionState.analyzer.resolver).getOrElse { throw new AnalysisException( s"""Cannot resolve column name "$colNames" among (${ds.schema.fieldNames.mkString(", ")})""") } } def expr(column: Column): Expression = column.expr def column(column: Column): Expression = column.expr def logicalPlan(ds: Dataset[_]): LogicalPlan = ds.logicalPlan def executePlan(ds: Dataset[_], plan: LogicalPlan): QueryExecution = ds.sparkSession.sessionState.executePlan(plan) def joinPlan(ds: Dataset[_], plan: LogicalPlan, leftPlan: LogicalPlan, rightPlan: LogicalPlan): LogicalPlan = { val joined = executePlan(ds, plan) val leftOutput = joined.analyzed.output.take(leftPlan.output.length) val rightOutput = joined.analyzed.output.takeRight(rightPlan.output.length) Project(List( Alias(CreateStruct(leftOutput), "_1")(), Alias(CreateStruct(rightOutput), "_2")() ), joined.analyzed) } def mkDataset[T](sqlContext: SQLContext, plan: LogicalPlan, encoder: Encoder[T]): Dataset[T] = new Dataset(sqlContext, plan, encoder) def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = Dataset.ofRows(sparkSession, logicalPlan) // because org.apache.spark.sql.types.UserDefinedType is private[spark] type UserDefinedType[A >: Null] = org.apache.spark.sql.types.UserDefinedType[A] case class DisambiguateRight[T](tagged: Expression) extends Expression with NonSQLExpression { def eval(input: InternalRow): Any = tagged.eval(input) def nullable: Boolean = false def children: Seq[Expression] = tagged :: Nil def dataType: DataType = tagged.dataType protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = ??? override def genCode(ctx: CodegenContext): ExprCode = tagged.genCode(ctx) } }
Example 128
package frameless.functions import frameless.TypedEncoder import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, NonSQLExpression} import org.apache.spark.sql.types.DataType case class FramelessLit[A](obj: A, encoder: TypedEncoder[A]) extends Expression with NonSQLExpression { override def nullable: Boolean = encoder.nullable override def toString: String = s"FramelessLit($obj)" def eval(input: InternalRow): Any = { val ctx = new CodegenContext() val eval = genCode(ctx) val codeBody = s""" public scala.Function1<InternalRow, Object> generate(Object[] references) { return new FramelessLitEvalImpl(references); } class FramelessLitEvalImpl extends scala.runtime.AbstractFunction1<InternalRow, Object> { private final Object[] references; ${ctx.declareMutableStates()} ${ctx.declareAddedFunctions()} public FramelessLitEvalImpl(Object[] references) { this.references = references; ${ctx.initMutableStates()} } public java.lang.Object apply(java.lang.Object z) { InternalRow ${ctx.INPUT_ROW} = (InternalRow) z; ${eval.code} return ${eval.isNull} ? ((Object)null) : ((Object)${eval.value}); } } """ val code = CodeFormatter.stripOverlappingComments( new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())) val (clazz, _) = CodeGenerator.compile(code) val codegen = clazz.generate(ctx.references.toArray).asInstanceOf[InternalRow => AnyRef] codegen(input) } def dataType: DataType = encoder.catalystRepr def children: Seq[Expression] = Nil override def genCode(ctx: CodegenContext): ExprCode = { encoder.toCatalyst(new Literal(obj, encoder.jvmRepr)).genCode(ctx) } protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = ??? }
Example 129
Source File: Sources.scala From jgit-spark-connector with Apache License 2.0 | 5 votes |
package tech.sourced.engine import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.types.StructType import tech.sourced.engine.util.{CompiledFilter, Filter} def getSources(tableSource: Option[String], schema: StructType): Seq[String] = tableSource match { case Some(ts) => Seq(ts) case None => schema .map(_.metadata.getString(SourceKey)) .distinct .sortWith(Sources.compare(_, _) < 0) } def getFiltersBySource(filters: Seq[Expression]): Map[String, Seq[CompiledFilter]] = filters.flatMap(Filter.compile) .map(e => (e.sources.distinct, e)) .filter(_._1.lengthCompare(1) == 0) .groupBy(_._1) .map { case (k, v) => (k.head, v.map(_._2)) } }
Example 130
Source File: RuleExecutorSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(100)) === Literal(90)) } }
Example 131
Source File: ShuffledHashJoin.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics @DeveloperApi case class ShuffledHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) override def outputPartitioning: Partitioning = PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning)) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil protected override def doExecute(): RDD[InternalRow] = { val (numBuildRows, numStreamedRows) = buildSide match { case BuildLeft => (longMetric("numLeftRows"), longMetric("numRightRows")) case BuildRight => (longMetric("numRightRows"), longMetric("numLeftRows")) } val numOutputRows = longMetric("numOutputRows") buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) => val hashed = HashedRelation(buildIter, numBuildRows, buildSideKeyGenerator) hashJoin(streamIter, numStreamedRows, hashed, numOutputRows) } } }
Example 132
Source File: SemiJoinSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins

import org.apache.spark.sql.{SQLConf, DataFrame, Row}
import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
import org.apache.spark.sql.catalyst.plans.Inner
import org.apache.spark.sql.catalyst.plans.logical.Join
import org.apache.spark.sql.catalyst.expressions.{And, LessThan, Expression}
import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan, SparkPlanTest}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}

// Test suite for semi joins
class SemiJoinSuite extends SparkPlanTest with SharedSQLContext {

  private lazy val left = ctx.createDataFrame(
    ctx.sparkContext.parallelize(Seq(
      Row(1, 2.0),
      Row(1, 2.0),
      Row(2, 1.0),
      Row(2, 1.0),
      Row(3, 3.0),
      Row(null, null),
      Row(null, 5.0),
      Row(6, null)
    )), new StructType().add("a", IntegerType).add("b", DoubleType))

  private lazy val right = ctx.createDataFrame(
    ctx.sparkContext.parallelize(Seq(
      Row(2, 3.0),
      Row(2, 3.0),
      Row(3, 2.0),
      Row(4, 1.0),
      Row(null, null),
      Row(null, 5.0),
      Row(6, null)
    )), new StructType().add("c", IntegerType).add("d", DoubleType))

  private lazy val condition = {
    And((left.col("a") === right.col("c")).expr,
      LessThan(left.col("b").expr, right.col("d").expr))
  }

  // Note: the input dataframes and expression must be evaluated lazily because
  // the SQLContext should be used only within a test to keep SQL tests stable
  private def testLeftSemiJoin(
      testName: String,
      leftRows: => DataFrame,
      rightRows: => DataFrame,
      condition: => Expression,
      expectedAnswer: Seq[Product]): Unit = {

    def extractJoinParts(): Option[ExtractEquiJoinKeys.ReturnType] = {
      val join = Join(leftRows.logicalPlan, rightRows.logicalPlan, Inner, Some(condition))
      ExtractEquiJoinKeys.unapply(join)
    }

    test(s"$testName using LeftSemiJoinHash") {
      extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) =>
        withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
          checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
            EnsureRequirements(left.sqlContext).apply(
              LeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition)),
            expectedAnswer.map(Row.fromTuple),
            sortAnswers = true)
        }
      }
    }

    test(s"$testName using BroadcastLeftSemiJoinHash") {
      extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) =>
        withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
          checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
            BroadcastLeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition),
            expectedAnswer.map(Row.fromTuple),
            sortAnswers = true)
        }
      }
    }

    test(s"$testName using LeftSemiJoinBNL") {
      withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
        checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
          LeftSemiJoinBNL(left, right, Some(condition)),
          expectedAnswer.map(Row.fromTuple),
          sortAnswers = true)
      }
    }
  }

  // Test a basic left semi join
  testLeftSemiJoin(
    "basic test",
    left,
    right,
    condition,
    Seq(
      (2, 1.0),
      (2, 1.0)
    )
  )
}
Example 133
Source File: StatefulApproxQuantile.scala From deequ with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.aggregate import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile.PercentileDigest import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes, Literal} import org.apache.spark.sql.types._ private[sql] case class StatefulApproxQuantile( child: Expression, accuracyExpression: Expression, override val mutableAggBufferOffset: Int, override val inputAggBufferOffset: Int) extends TypedImperativeAggregate[PercentileDigest] with ImplicitCastInputTypes { def this(child: Expression, accuracyExpression: Expression) = { this(child, accuracyExpression, 0, 0) } def this(child: Expression) = { this(child, Literal(ApproximatePercentile.DEFAULT_PERCENTILE_ACCURACY)) } // Mark as lazy so that accuracyExpression is not evaluated during tree transformation. private lazy val accuracy: Double = accuracyExpression.eval().asInstanceOf[Double] override def inputTypes: Seq[AbstractDataType] = { Seq(DoubleType, TypeCollection(DoubleType, ArrayType(DoubleType)), IntegerType) } override def checkInputDataTypes(): TypeCheckResult = { val defaultCheck = super.checkInputDataTypes() if (defaultCheck.isFailure) { defaultCheck } else if (!accuracyExpression.foldable) { TypeCheckFailure(s"The accuracy provided must be a constant literal") } else if (accuracy <= 0) { TypeCheckFailure( s"The accuracy provided must be a positive integer literal (current value = $accuracy)") } else { TypeCheckSuccess } } override def createAggregationBuffer(): PercentileDigest = { val relativeError = 1.0D / accuracy new PercentileDigest(relativeError) } override def update(buffer: PercentileDigest, inputRow: InternalRow): PercentileDigest = { val value = child.eval(inputRow) // Ignore empty rows, for example: percentile_approx(null) if (value != null) { buffer.add(value.asInstanceOf[Double]) } buffer } override def merge(buffer: PercentileDigest, other: PercentileDigest): PercentileDigest = { buffer.merge(other) buffer } override def eval(buffer: PercentileDigest): Any = { // instead of evaluating the PercentileDigest quantile summary here, // serialize the digest and return it as byte array serialize(buffer) } override def withNewMutableAggBufferOffset(newOffset: Int): StatefulApproxQuantile = copy(mutableAggBufferOffset = newOffset) override def withNewInputAggBufferOffset(newOffset: Int): StatefulApproxQuantile = copy(inputAggBufferOffset = newOffset) override def children: Seq[Expression] = Seq(child, accuracyExpression) // Returns null for empty inputs override def nullable: Boolean = true override def dataType: DataType = BinaryType override def prettyName: String = "percentile_approx" override def serialize(digest: PercentileDigest): Array[Byte] = { ApproximatePercentile.serializer.serialize(digest) } override def deserialize(bytes: Array[Byte]): PercentileDigest = { ApproximatePercentile.serializer.deserialize(bytes) } }
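Deequ's stateful variant builds on Spark's approximate-percentile sketch; the same machinery is reachable from the public API via approxQuantile. A sketch, assuming an illustrative DataFrame df with a numeric column price:

// Approximate median and 90th percentile; relativeError plays the role of the
// `accuracy` knob above (a smaller error means a larger sketch).
val Array(median, p90) = df.stat.approxQuantile("price", Array(0.5, 0.9), 0.01)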
Example 134
Source File: ScriptTransformation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 135
Source File: CodegenFallback.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, Nondeterministic} trait CodegenFallback extends Expression { protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // LeafNode does not need `input` val input = if (this.isInstanceOf[LeafExpression]) "null" else ctx.INPUT_ROW val idx = ctx.references.length ctx.references += this var childIndex = idx this.foreach { case n: Nondeterministic => // This might add the current expression twice, but it won't hurt. ctx.references += n childIndex += 1 ctx.addPartitionInitializationStatement( s""" |((Nondeterministic) references[$childIndex]) | .initialize(partitionIndex); """.stripMargin) case _ => } val objectTerm = ctx.freshName("obj") val placeHolder = ctx.registerComment(this.toString) if (nullable) { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); boolean ${ev.isNull} = $objectTerm == null; ${ctx.javaType(this.dataType)} ${ev.value} = ${ctx.defaultValue(this.dataType)}; if (!${ev.isNull}) { ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; }""") } else { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); ${ctx.javaType(this.dataType)} ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; """, isNull = "false") } } }
Example 136
Source File: SubstituteUnresolvedOrdinals.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Sort} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.IntegerType class SubstituteUnresolvedOrdinals(conf: SQLConf) extends Rule[LogicalPlan] { private def isIntLiteral(e: Expression) = e match { case Literal(_, IntegerType) => true case _ => false } def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) => val newOrders = s.order.map { case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _, _) => val newOrdinal = withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) withOrigin(order.origin)(order.copy(child = newOrdinal)) case other => other } withOrigin(s.origin)(s.copy(order = newOrders)) case a: Aggregate if conf.groupByOrdinal && a.groupingExpressions.exists(isIntLiteral) => val newGroups = a.groupingExpressions.map { case ordinal @ Literal(index: Int, IntegerType) => withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) case other => other } withOrigin(a.origin)(a.copy(groupingExpressions = newGroups)) } }
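The rule only fires for integer literals in GROUP BY and ORDER BY positions, and only when the corresponding confs are enabled. A sketch of a query it rewrites, assuming an active SparkSession named spark and an illustrative temp view employees:

// With spark.sql.groupByOrdinal and spark.sql.orderByOrdinal enabled (the defaults),
// the literals 1 and 2 are replaced by UnresolvedOrdinal nodes and later resolved
// by position against the select list: 1 -> dept, 2 -> cnt.
spark.sql(
  """SELECT dept, count(*) AS cnt
    |FROM employees
    |GROUP BY 1
    |ORDER BY 2 DESC
  """.stripMargin).show()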
Example 137
Source File: QueryPlanSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.dsl.plans import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal, NamedExpression} import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} import org.apache.spark.sql.types.IntegerType class QueryPlanSuite extends SparkFunSuite { test("origin remains the same after mapExpressions (SPARK-23823)") { CurrentOrigin.setPosition(0, 0) val column = AttributeReference("column", IntegerType)(NamedExpression.newExprId) val query = plans.DslLogicalPlan(plans.table("table")).select(column) CurrentOrigin.reset() val mappedQuery = query mapExpressions { case _: Expression => Literal(1) } val mappedOrigin = mappedQuery.expressions.apply(0).origin assert(mappedOrigin == Origin.apply(Some(0), Some(0))) } }
Example 138
Source File: RuleExecutorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } val message = intercept[TreeNodeException[LogicalPlan]] { ToFixedPoint.execute(Literal(100)) }.getMessage assert(message.contains("Max iterations (10) reached for batch fixedPoint")) } test("structural integrity checker") { object WithSIChecker extends RuleExecutor[Expression] { override protected def isPlanIntegral(expr: Expression): Boolean = expr match { case IntegerLiteral(_) => true case _ => false } val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(WithSIChecker.execute(Literal(10)) === Literal(9)) val message = intercept[TreeNodeException[LogicalPlan]] { WithSIChecker.execute(Literal(10.1)) }.getMessage assert(message.contains("the structural integrity of the plan is broken")) } }
Example 139
Source File: CheckCartesianProductsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer import org.scalatest.Matchers._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.internal.SQLConf.CROSS_JOINS_ENABLED class CheckCartesianProductsSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Check Cartesian Products", Once, CheckCartesianProducts) :: Nil } val testRelation1 = LocalRelation('a.int, 'b.int) val testRelation2 = LocalRelation('c.int, 'd.int) val joinTypesWithRequiredCondition = Seq(Inner, LeftOuter, RightOuter, FullOuter) val joinTypesWithoutRequiredCondition = Seq(LeftSemi, LeftAnti, ExistenceJoin('exists)) test("CheckCartesianProducts doesn't throw an exception if cross joins are enabled)") { withSQLConf(CROSS_JOINS_ENABLED.key -> "true") { noException should be thrownBy { for (joinType <- joinTypesWithRequiredCondition ++ joinTypesWithoutRequiredCondition) { performCartesianProductCheck(joinType) } } } } test("CheckCartesianProducts throws an exception for join types that require a join condition") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithRequiredCondition) { val thrownException = the [AnalysisException] thrownBy { performCartesianProductCheck(joinType) } assert(thrownException.message.contains("Detected implicit cartesian product")) } } } test("CheckCartesianProducts doesn't throw an exception if a join condition is present") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithRequiredCondition) { noException should be thrownBy { performCartesianProductCheck(joinType, Some('a === 'd)) } } } } test("CheckCartesianProducts doesn't throw an exception if join types don't require conditions") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithoutRequiredCondition) { noException should be thrownBy { performCartesianProductCheck(joinType) } } } } private def performCartesianProductCheck( joinType: JoinType, condition: Option[Expression] = None): Unit = { val analyzedPlan = testRelation1.join(testRelation2, joinType, condition).analyze val optimizedPlan = Optimize.execute(analyzedPlan) comparePlans(analyzedPlan, optimizedPlan) } }
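From the user's side, the check this suite exercises surfaces as an AnalysisException on joins without an equality condition. A sketch, assuming illustrative DataFrames dfA and dfB and an active SparkSession named spark:

// With spark.sql.crossJoin.enabled=false (the default in these Spark versions),
// a condition-less join fails with "Detected implicit cartesian product".
// Either opt in for the session...
spark.conf.set("spark.sql.crossJoin.enabled", "true")
val allPairs = dfA.join(dfB)

// ...or, preferably, state the intent explicitly (works regardless of the conf):
val explicitPairs = dfA.crossJoin(dfB)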
Example 140
Source File: ShuffledHashJoinExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map"), "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) override def requiredChildDistribution: Seq[Distribution] = HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val avgHashProbe = longMetric("avgHashProbe") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows, avgHashProbe) } } }
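Whether this operator is chosen at all depends on planner heuristics. A sketch of the conf that nudges the planner toward it, assuming illustrative DataFrames big and smaller and an active SparkSession named spark:

// Sort-merge join is preferred by default; turning the preference off lets the planner
// consider a shuffled hash join when the build side is estimated small enough to hash
// per partition (but too large to broadcast).
spark.conf.set("spark.sql.join.preferSortMergeJoin", "false")
val joined = big.join(smaller, "key")
joined.explain()  // look for ShuffledHashJoin in the physical plan when the heuristics allow it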
Example 141
Source File: CartesianProductExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, ExternalAppendOnlyUnsafeRowArray, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.util.CompletionIterator class UnsafeCartesianRDD( left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int, inMemoryBufferThreshold: Int, spillThreshold: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { val rowArray = new ExternalAppendOnlyUnsafeRowArray(inMemoryBufferThreshold, spillThreshold) val partition = split.asInstanceOf[CartesianPartition] rdd2.iterator(partition.s2, context).foreach(rowArray.add) // Create an iterator from rowArray def createIter(): Iterator[UnsafeRow] = rowArray.generateIterator() val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, rowArray.clear()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD( leftResults, rightResults, right.output.size, sqlContext.conf.cartesianProductExecBufferInMemoryThreshold, sqlContext.conf.cartesianProductExecBufferSpillThreshold) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 142
Source File: DataSourcePartitioning.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression} import org.apache.spark.sql.catalyst.plans.physical import org.apache.spark.sql.sources.v2.reader.partitioning.{ClusteredDistribution, Partitioning} class DataSourcePartitioning( partitioning: Partitioning, colNames: AttributeMap[String]) extends physical.Partitioning { override val numPartitions: Int = partitioning.numPartitions() override def satisfies(required: physical.Distribution): Boolean = { super.satisfies(required) || { required match { case d: physical.ClusteredDistribution if isCandidate(d.clustering) => val attrs = d.clustering.map(_.asInstanceOf[Attribute]) partitioning.satisfy( new ClusteredDistribution(attrs.map { a => val name = colNames.get(a) assert(name.isDefined, s"Attribute ${a.name} is not found in the data source output") name.get }.toArray)) case _ => false } } } private def isCandidate(clustering: Seq[Expression]): Boolean = { clustering.forall { case a: Attribute => colNames.contains(a) case _ => false } } }
Example 143
Source File: Exchange.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]() plan.transformUp { case exchange: Exchange => // the exchanges that have same results usually also have same schemas (same column names). val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]()) val samePlan = sameSchema.find { e => exchange.sameResult(e) } if (samePlan.isDefined) { // Keep the output of this exchange, the following plans require that to resolve // attributes. ReusedExchangeExec(exchange.output, samePlan.get) } else { sameSchema += exchange exchange } } } }
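ReuseExchange is driven by an internal conf and shows up directly in physical plans. A sketch, assuming an illustrative DataFrame df with a key column and an active SparkSession named spark:

// Exchange reuse is on by default (spark.sql.exchange.reuse). The two identical
// aggregations below typically end up sharing one shuffle, visible as a
// ReusedExchange node in the output of explain().
val byKeyA = df.groupBy("key").count()
val byKeyB = df.groupBy("key").count()
byKeyA.join(byKeyB, "key").explain()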
Example 144
Source File: PythonUDF.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import org.apache.spark.api.python.PythonFunction import org.apache.spark.sql.catalyst.expressions.{Expression, NonSQLExpression, Unevaluable, UserDefinedExpression} import org.apache.spark.sql.types.DataType case class PythonUDF( name: String, func: PythonFunction, dataType: DataType, children: Seq[Expression], evalType: Int, udfDeterministic: Boolean) extends Expression with Unevaluable with NonSQLExpression with UserDefinedExpression { override lazy val deterministic: Boolean = udfDeterministic && children.forall(_.deterministic) override def toString: String = s"$name(${children.mkString(", ")})" override def nullable: Boolean = true }
Example 145
Source File: StarryHashJoinExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import com.github.passionke.starry.SparkPlanExecutor import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.JoinType import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} case class StarryHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val avgHashProbe = longMetric("avgHashProbe") val rows = SparkPlanExecutor.doExec(buildPlan) val hashed = HashedRelation(rows.iterator, buildKeys, rows.length, null) streamedPlan.execute().mapPartitions { streamedIter => join(streamedIter, hashed, numOutputRows, avgHashProbe) } } }
Example 146
Source File: GPlanExpander.scala From ingraph with Eclipse Public License 1.0 | 5 votes |
package ingraph.compiler.cypher2gplan import ingraph.model.{expr, gplan} import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.{expressions => cExpr} object GPlanExpander { def expandGPlan(rawQueryPlan: gplan.GNode): gplan.GNode = { // should there be other rule sets (partial functions), combine them using orElse, // e.g. pfunc1 orElse pfunc2 // expanding GetVertices involves creating other GetVertices, so transformUp is to avoid infinite recursion val full = rawQueryPlan.transformUp(gplanExpander) full.asInstanceOf[gplan.GNode] } val gplanExpander: PartialFunction[LogicalPlan, LogicalPlan] = { // Nullary case gplan.GetVertices(vertexAttribute) if vertexAttribute.properties.nonEmpty => { val condition: Expression = propertyMapToCondition(vertexAttribute.properties, vertexAttribute.name) gplan.Selection(condition, gplan.GetVertices(vertexAttribute)) } case gplan.Expand(srcVertexAttribute, trgVertexAttribute, edge, dir, child) if edge.properties.nonEmpty || trgVertexAttribute.properties.nonEmpty => { val selectionOnEdge = gplan.Selection(propertyMapToCondition(edge.properties, edge.name), gplan.Expand(srcVertexAttribute, trgVertexAttribute, edge, dir, child)) val selectionOnTargetVertex = gplan.Selection(propertyMapToCondition(trgVertexAttribute.properties, trgVertexAttribute.name), selectionOnEdge) selectionOnTargetVertex } } def propertyMapToCondition(properties: expr.types.TPropertyMap, baseName: String): Expression = { properties.map( (p) => cExpr.EqualTo(UnresolvedAttribute(Seq(baseName, p._1)), p._2) ) .foldLeft[Expression]( cExpr.Literal(true) )( (b, a) => cExpr.And(b, a) ) } }
Example 147
Source File: monotonicaggregates.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.aggregates

import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, Expression, Greatest, Least, Literal, Unevaluable}
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.util.TypeUtils
import org.apache.spark.sql.types.{AbstractDataType, AnyDataType, DataType}

abstract class MonotonicAggregateFunction extends DeclarativeAggregate with Serializable {}

case class MMax(child: Expression) extends MonotonicAggregateFunction {

  override def children: Seq[Expression] = child :: Nil

  override def nullable: Boolean = true

  // Return data type.
  override def dataType: DataType = child.dataType

  // Expected input data type.
  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)

  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtils.checkForOrderingExpr(child.dataType, "function mmax")

  private lazy val mmax = AttributeReference("mmax", child.dataType)()

  override lazy val aggBufferAttributes: Seq[AttributeReference] = mmax :: Nil

  override lazy val initialValues: Seq[Literal] = Seq(
    Literal.create(null, child.dataType)
  )

  override lazy val updateExpressions: Seq[Expression] = Seq(
    Greatest(Seq(mmax, child))
  )

  override lazy val mergeExpressions: Seq[Expression] = Seq(
    Greatest(Seq(mmax.left, mmax.right))
  )

  override lazy val evaluateExpression: AttributeReference = mmax
}

case class MMin(child: Expression) extends MonotonicAggregateFunction {

  override def children: Seq[Expression] = child :: Nil

  override def nullable: Boolean = true

  override def dataType: DataType = child.dataType

  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)

  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtils.checkForOrderingExpr(child.dataType, "function mmin")

  private lazy val mmin = AttributeReference("mmin", child.dataType)()

  override lazy val aggBufferAttributes: Seq[AttributeReference] = mmin :: Nil

  override lazy val initialValues: Seq[Literal] = Seq(
    Literal.create(null, child.dataType)
  )

  override lazy val updateExpressions: Seq[Expression] = Seq(
    Least(Seq(mmin, child))
  )

  override lazy val mergeExpressions: Seq[Expression] = Seq(
    Least(Seq(mmin.left, mmin.right))
  )

  override lazy val evaluateExpression: AttributeReference = mmin
}

case class MonotonicAggregateExpression(aggregateFunction: MonotonicAggregateFunction,
                                        mode: AggregateMode,
                                        isDistinct: Boolean)
  extends Expression with Unevaluable {

  override def children: Seq[Expression] = aggregateFunction :: Nil

  override def dataType: DataType = aggregateFunction.dataType

  override def foldable: Boolean = false

  override def nullable: Boolean = aggregateFunction.nullable

  override def references: AttributeSet = {
    val childReferences = mode match {
      case Partial | Complete => aggregateFunction.references.toSeq
      case PartialMerge | Final => aggregateFunction.aggBufferAttributes
    }
    AttributeSet(childReferences)
  }

  override def prettyString: String = aggregateFunction.prettyString

  override def toString: String = s"(${aggregateFunction},mode=$mode,isDistinct=$isDistinct)"
}
Example 148
Source File: ShuffleHashJoin.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution import edu.ucla.cs.wis.bigdatalog.spark.BigDatalogContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning, PartitioningCollection} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffleHashJoin(leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { @transient final protected val bigDatalogContext = SQLContext.getActive().getOrElse(null).asInstanceOf[BigDatalogContext] val cacheBuildSide = bigDatalogContext.getConf.getBoolean("spark.datalog.shufflehashjoin.cachebuildside", true) override lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) var cachedBuildPlan: RDD[HashedRelation] = null override def output: Seq[Attribute] = left.output ++ right.output override def outputPartitioning: Partitioning = PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning)) override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false protected override def doExecute(): RDD[InternalRow] = { val numStreamedRows = buildSide match { case BuildLeft => longMetric("numRightRows") case BuildRight => longMetric("numLeftRows") } val numOutputRows = longMetric("numOutputRows") if (cacheBuildSide) { if (cachedBuildPlan == null) { cachedBuildPlan = buildPlan.execute() .mapPartitionsInternal(iter => Iterator(HashedRelation(iter, SQLMetrics.nullLongMetric, buildSideKeyGenerator))) .persist() } cachedBuildPlan.zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) => hashJoin(streamedIter, numStreamedRows, buildIter.next(), numOutputRows)} } else { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) => val hashedRelation = HashedRelation(buildIter, SQLMetrics.nullLongMetric, buildSideKeyGenerator) hashJoin(streamedIter, numStreamedRows, hashedRelation, numOutputRows) } } } }
Example 149
Source File: operators.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression} import org.apache.spark.sql.catalyst.plans.logical.{BinaryNode, LeafNode, LogicalPlan, Statistics, UnaryNode} case class Recursion(name: String, isLinear: Boolean, left: LogicalPlan, right: LogicalPlan, partitioning: Seq[Int]) extends BinaryNode { // left is exitRules plan // right is recursive rules plan override def output: Seq[Attribute] = right.output } case class MutualRecursion(name: String, isLinear: Boolean, left: LogicalPlan, right: LogicalPlan, partitioning: Seq[Int]) extends BinaryNode { override def output: Seq[Attribute] = right.output override def children: Seq[LogicalPlan] = { if (left == null) Seq(right) else Seq(left, right) } override def generateTreeString(depth: Int, lastChildren: Seq[Boolean], builder: StringBuilder): StringBuilder = { if (depth > 0) { lastChildren.init.foreach { isLast => val prefixFragment = if (isLast) " " else ": " builder.append(prefixFragment) } val branch = if (lastChildren.last) "+- " else ":- " builder.append(branch) } builder.append(simpleString) builder.append("\n") if (children.nonEmpty) { val exitRule = children.init if (exitRule != null) exitRule.foreach(_.generateTreeString(depth + 1, lastChildren :+ false, builder)) children.last.generateTreeString(depth + 1, lastChildren :+ true, builder) } builder } } case class LinearRecursiveRelation(_name: String, output: Seq[Attribute], partitioning: Seq[Int]) extends LeafNode { override def statistics: Statistics = Statistics(Long.MaxValue) var name = _name } case class NonLinearRecursiveRelation(_name: String, output: Seq[Attribute], partitioning: Seq[Int]) extends LeafNode { override def statistics: Statistics = Statistics(Long.MaxValue) def name = "all_" + _name } case class MonotonicAggregate(groupingExpressions: Seq[Expression], aggregateExpressions: Seq[NamedExpression], child: LogicalPlan, partitioning: Seq[Int]) extends UnaryNode { override lazy val resolved: Boolean = !expressions.exists(!_.resolved) && childrenResolved override def output: Seq[Attribute] = aggregateExpressions.map(_.toAttribute) } case class AggregateRecursion(name: String, isLinear: Boolean, left: LogicalPlan, right: LogicalPlan, partitioning: Seq[Int]) extends BinaryNode { // left is exitRules plan // right is recursive rules plan override def output: Seq[Attribute] = right.output } case class AggregateRelation(_name: String, output: Seq[Attribute], partitioning: Seq[Int]) extends LeafNode { override def statistics: Statistics = Statistics(Long.MaxValue) var name = _name } case class CacheHint(child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output }
Example 150
Source File: RuleExecutorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(100)) === Literal(90)) } }
Example 151
Source File: FilterNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate case class FilterNode(conf: SQLConf, condition: Expression, child: LocalNode) extends UnaryLocalNode(conf) { private[this] var predicate: (InternalRow) => Boolean = _ override def output: Seq[Attribute] = child.output override def open(): Unit = { child.open() predicate = GeneratePredicate.generate(condition, child.output) } override def next(): Boolean = { var found = false while (!found && child.next()) { found = predicate.apply(child.fetch()) } found } override def fetch(): InternalRow = child.fetch() override def close(): Unit = child.close() }
Example 152
Source File: ExpandNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Projection} case class ExpandNode( conf: SQLConf, projections: Seq[Seq[Expression]], output: Seq[Attribute], child: LocalNode) extends UnaryLocalNode(conf) { assert(projections.size > 0) private[this] var result: InternalRow = _ private[this] var idx: Int = _ private[this] var input: InternalRow = _ private[this] var groups: Array[Projection] = _ override def open(): Unit = { child.open() groups = projections.map(ee => newProjection(ee, child.output)).toArray idx = groups.length } override def next(): Boolean = { if (idx >= groups.length) { if (child.next()) { input = child.fetch() idx = 0 } else { return false } } result = groups(idx)(input) idx += 1 true } override def fetch(): InternalRow = result override def close(): Unit = child.close() }
Example 153
Source File: LocalNodeTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.SparkFunSuite import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{Expression, AttributeReference} import org.apache.spark.sql.types.{IntegerType, StringType} class LocalNodeTest extends SparkFunSuite { protected val conf: SQLConf = new SQLConf protected val kvIntAttributes = Seq( AttributeReference("k", IntegerType)(), AttributeReference("v", IntegerType)()) protected val joinNameAttributes = Seq( AttributeReference("id1", IntegerType)(), AttributeReference("name", StringType)()) protected val joinNicknameAttributes = Seq( AttributeReference("id2", IntegerType)(), AttributeReference("nickname", StringType)()) protected def resolveExpressions( expressions: Seq[Expression], localNode: LocalNode): Seq[Expression] = { require(localNode.expressions.forall(_.resolved)) val inputMap = localNode.output.map { a => (a.name, a) }.toMap expressions.map { expression => expression.transformUp { case UnresolvedAttribute(Seq(u)) => inputMap.getOrElse(u, sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap")) } } } }
Example 154
Source File: BasicCurrencyConversionExpression.scala From HANAVora-Extensions with Apache License 2.0 | 4 votes |
package org.apache.spark.sql.currency.basic import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String case class BasicCurrencyConversionExpression( conversion: BasicCurrencyConversion, children: Seq[Expression]) extends Expression with ImplicitCastInputTypes with CodegenFallback { protected val AMOUNT_INDEX = 0 protected val FROM_INDEX = 1 protected val TO_INDEX = 2 protected val DATE_INDEX = 3 protected val NUM_ARGS = 4 override def eval(input: InternalRow): Any = { val inputArguments = children.map(_.eval(input)) require(inputArguments.length == NUM_ARGS, "wrong number of arguments") val sourceCurrency = Option(inputArguments(FROM_INDEX).asInstanceOf[UTF8String]).map(_.toString) val targetCurrency = Option(inputArguments(TO_INDEX).asInstanceOf[UTF8String]).map(_.toString) val amount = Option(inputArguments(AMOUNT_INDEX).asInstanceOf[Decimal].toJavaBigDecimal) val date = Option(inputArguments(DATE_INDEX).asInstanceOf[UTF8String]).map(_.toString) (amount, sourceCurrency, targetCurrency, date) match { case (Some(a), Some(s), Some(t), Some(d)) => nullSafeEval(a, s, t, d) case _ => null } } def nullSafeEval(amount: java.math.BigDecimal, sourceCurrency: String, targetCurrency: String, date: String): Any = { conversion.convert(amount, sourceCurrency, targetCurrency, date) .get .map(Decimal.apply) .orNull } override def dataType: DataType = DecimalType.forType(DoubleType) override def nullable: Boolean = true // TODO(MD, CS): use DateType but support date string override def inputTypes: Seq[AbstractDataType] = Seq(DecimalType, StringType, StringType, StringType) }