org.apache.spark.sql.catalyst.expressions.Expression Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.expressions.Expression.
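Before the project-specific examples, here is a minimal, self-contained sketch of what a custom Catalyst Expression looks like and how it is evaluated. The class name DoubleUp is purely illustrative and does not come from any of the projects below; the sketch targets Spark 2.x Catalyst (newer Spark versions additionally require withNewChildInternal).

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.types.{DataType, IntegerType}

// Hypothetical expression: doubles its integer child.
// CodegenFallback supplies doGenCode, so no hand-written codegen is needed.
case class DoubleUp(child: Expression) extends UnaryExpression with CodegenFallback {
  override def dataType: DataType = IntegerType
  override def nullSafeEval(input: Any): Any = input.asInstanceOf[Int] * 2
}

// Interpreted evaluation; the child is a literal, so an empty row suffices.
val result = DoubleUp(Literal(21)).eval(InternalRow.empty)  // 42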
Example 1
Source File: AvroDataToCatalyst.scala, from spark-schema-registry (Apache License 2.0)
package com.hortonworks.spark.registry.avro

import java.io.ByteArrayInputStream

import com.hortonworks.registries.schemaregistry.{SchemaVersionInfo, SchemaVersionKey}
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer
import org.apache.avro.Schema
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{BinaryType, DataType}

import scala.collection.JavaConverters._

case class AvroDataToCatalyst(child: Expression, schemaName: String, version: Option[Int],
                              config: Map[String, Object])
  extends UnaryExpression with ExpectsInputTypes {

  override def inputTypes = Seq(BinaryType)

  @transient private lazy val srDeser: AvroSnapshotDeserializer = {
    val obj = new AvroSnapshotDeserializer()
    obj.init(config.asJava)
    obj
  }

  @transient private lazy val srSchema = fetchSchemaVersionInfo(schemaName, version)

  @transient private lazy val avroSchema = new Schema.Parser().parse(srSchema.getSchemaText)

  override lazy val dataType: DataType = SchemaConverters.toSqlType(avroSchema).dataType

  @transient private lazy val avroDeser = new AvroDeserializer(avroSchema, dataType)

  override def nullable: Boolean = true

  override def nullSafeEval(input: Any): Any = {
    val binary = input.asInstanceOf[Array[Byte]]
    val row = avroDeser.deserialize(srDeser.deserialize(new ByteArrayInputStream(binary), srSchema.getVersion))
    val result = row match {
      case r: InternalRow => r.copy()
      case _ => row
    }
    result
  }

  override def simpleString: String = {
    s"from_sr(${child.sql}, ${dataType.simpleString})"
  }

  override def sql: String = {
    s"from_sr(${child.sql}, ${dataType.catalogString})"
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val expr = ctx.addReferenceObj("this", this)
    defineCodeGen(ctx, ev, input => s"(${ctx.boxedType(dataType)})$expr.nullSafeEval($input)")
  }

  private def fetchSchemaVersionInfo(schemaName: String, version: Option[Int]): SchemaVersionInfo = {
    val srClient = new SchemaRegistryClient(config.asJava)
    version.map(v => srClient.getSchemaVersionInfo(new SchemaVersionKey(schemaName, v)))
      .getOrElse(srClient.getLatestSchemaVersionInfo(schemaName))
  }
}
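Expressions like this are usually exposed to users as a Column-returning function. The sketch below is a hypothetical wrapper based only on the expression's own sql/simpleString output (from_sr); the real spark-schema-registry API may differ.

import org.apache.spark.sql.Column

// Hypothetical helper: wrap the Catalyst expression in a Column so it can be used in select().
def from_sr(data: Column, schemaName: String, version: Option[Int] = None,
            config: Map[String, Object] = Map.empty): Column =
  new Column(AvroDataToCatalyst(data.expr, schemaName, version, config))

// Example (illustrative column/config names):
// df.select(from_sr($"value", "truck_events", config = srConfig).as("event"))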
Example 2
Source File: ShapeUtils.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.util

import org.apache.spark.sql.simba.{ShapeSerializer, ShapeType}
import org.apache.spark.sql.simba.expression.PointWrapper
import org.apache.spark.sql.simba.spatial.{Point, Shape}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences, Expression, UnsafeArrayData}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan

object ShapeUtils {
  def getPointFromRow(row: InternalRow, columns: List[Attribute], plan: SparkPlan,
                      isPoint: Boolean): Point = {
    if (isPoint) {
      ShapeSerializer.deserialize(BindReferences.bindReference(columns.head, plan.output)
        .eval(row).asInstanceOf[UnsafeArrayData].toByteArray).asInstanceOf[Point]
    } else {
      Point(columns.toArray.map(BindReferences.bindReference(_, plan.output).eval(row)
        .asInstanceOf[Number].doubleValue()))
    }
  }

  def getPointFromRow(row: InternalRow, columns: List[Attribute], plan: LogicalPlan,
                      isPoint: Boolean): Point = {
    if (isPoint) {
      ShapeSerializer.deserialize(BindReferences.bindReference(columns.head, plan.output)
        .eval(row).asInstanceOf[UnsafeArrayData].toByteArray).asInstanceOf[Point]
    } else {
      Point(columns.toArray.map(BindReferences.bindReference(_, plan.output).eval(row)
        .asInstanceOf[Number].doubleValue()))
    }
  }

  def getShape(expression: Expression, input: InternalRow): Shape = {
    if (!expression.isInstanceOf[PointWrapper] && expression.dataType.isInstanceOf[ShapeType]) {
      ShapeSerializer.deserialize(expression.eval(input).asInstanceOf[UnsafeArrayData].toByteArray)
    } else if (expression.isInstanceOf[PointWrapper]) {
      expression.eval(input).asInstanceOf[Shape]
    } else throw new UnsupportedOperationException("Query shape should be of ShapeType")
  }

  def getShape(expression: Expression, schema: Seq[Attribute], input: InternalRow): Shape = {
    if (!expression.isInstanceOf[PointWrapper] && expression.dataType.isInstanceOf[ShapeType]) {
      ShapeSerializer.deserialize(BindReferences.bindReference(expression, schema)
        .eval(input).asInstanceOf[UnsafeArrayData].toByteArray)
    } else if (expression.isInstanceOf[PointWrapper]) {
      BindReferences.bindReference(expression, schema).eval(input).asInstanceOf[Shape]
    } else throw new UnsupportedOperationException("Query shape should be of ShapeType")
  }
}
Example 3
Source File: SemiJoinSuite.scala, from BigDatalog (Apache License 2.0)
package org.apache.spark.sql.execution.joins

import org.apache.spark.sql.{SQLConf, DataFrame, Row}
import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
import org.apache.spark.sql.catalyst.plans.Inner
import org.apache.spark.sql.catalyst.plans.logical.Join
import org.apache.spark.sql.catalyst.expressions.{And, LessThan, Expression}
import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan, SparkPlanTest}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}

class SemiJoinSuite extends SparkPlanTest with SharedSQLContext {

  private lazy val left = sqlContext.createDataFrame(
    sparkContext.parallelize(Seq(
      Row(1, 2.0),
      Row(1, 2.0),
      Row(2, 1.0),
      Row(2, 1.0),
      Row(3, 3.0),
      Row(null, null),
      Row(null, 5.0),
      Row(6, null)
    )), new StructType().add("a", IntegerType).add("b", DoubleType))

  private lazy val right = sqlContext.createDataFrame(
    sparkContext.parallelize(Seq(
      Row(2, 3.0),
      Row(2, 3.0),
      Row(3, 2.0),
      Row(4, 1.0),
      Row(null, null),
      Row(null, 5.0),
      Row(6, null)
    )), new StructType().add("c", IntegerType).add("d", DoubleType))

  private lazy val condition = {
    And((left.col("a") === right.col("c")).expr,
      LessThan(left.col("b").expr, right.col("d").expr))
  }

  // Note: the input dataframes and expression must be evaluated lazily because
  // the SQLContext should be used only within a test to keep SQL tests stable
  private def testLeftSemiJoin(
      testName: String,
      leftRows: => DataFrame,
      rightRows: => DataFrame,
      condition: => Expression,
      expectedAnswer: Seq[Product]): Unit = {

    def extractJoinParts(): Option[ExtractEquiJoinKeys.ReturnType] = {
      val join = Join(leftRows.logicalPlan, rightRows.logicalPlan, Inner, Some(condition))
      ExtractEquiJoinKeys.unapply(join)
    }

    test(s"$testName using LeftSemiJoinHash") {
      extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) =>
        withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
          checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
            EnsureRequirements(left.sqlContext).apply(
              LeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition)),
            expectedAnswer.map(Row.fromTuple),
            sortAnswers = true)
        }
      }
    }

    test(s"$testName using BroadcastLeftSemiJoinHash") {
      extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) =>
        withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
          checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
            BroadcastLeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition),
            expectedAnswer.map(Row.fromTuple),
            sortAnswers = true)
        }
      }
    }

    test(s"$testName using LeftSemiJoinBNL") {
      withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
        checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
          LeftSemiJoinBNL(left, right, Some(condition)),
          expectedAnswer.map(Row.fromTuple),
          sortAnswers = true)
      }
    }
  }

  testLeftSemiJoin(
    "basic test",
    left,
    right,
    condition,
    Seq(
      (2, 1.0),
      (2, 1.0)
    )
  )
}
Example 4
Source File: PredicateUtil.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.util

import org.apache.spark.sql.catalyst.expressions.{Expression, And, Or}

object PredicateUtil {

  def toDNF(condition: Expression): Expression = {
    condition match {
      case Or(left, right) =>
        Or(toDNF(left), toDNF(right))
      case And(left, right) =>
        var ans: Expression = null
        val tmp_left = toDNF(left)
        val tmp_right = toDNF(right)
        tmp_left match {
          case Or(l, r) =>
            ans = Or(And(l, tmp_right), And(r, tmp_right))
          case _ =>
        }
        tmp_right match {
          case Or(l, r) =>
            if (ans == null) ans = Or(And(tmp_left, l), And(tmp_left, r))
          case _ =>
        }
        if (ans == null) And(tmp_left, tmp_right)
        else toDNF(ans)
      case exp => exp
    }
  }

  def toCNF(condition: Expression): Expression = {
    condition match {
      case And(left, right) =>
        And(toCNF(left), toCNF(right))
      case Or(left, right) =>
        var ans: Expression = null
        val tmp_left = toCNF(left)
        val tmp_right = toCNF(right)
        tmp_left match {
          case And(l, r) =>
            ans = And(Or(l, tmp_right), Or(r, tmp_right))
          case _ =>
        }
        tmp_right match {
          case And(l, r) =>
            if (ans == null) ans = And(Or(tmp_left, l), Or(tmp_left, r))
          case _ =>
        }
        if (ans == null) Or(tmp_left, tmp_right)
        else toCNF(ans)
      case exp => exp
    }
  }

  def dnfExtract(expression: Expression): Seq[Expression] = {
    expression match {
      case Or(left, right) =>
        dnfExtract(left) ++ dnfExtract(right)
      case And(left @ And(l2, r2), right) =>
        dnfExtract(And(l2, And(r2, right)))
      case other =>
        other :: Nil
    }
  }

  def cnfExtract(expression: Expression): Seq[Expression] = {
    expression match {
      case And(left, right) =>
        cnfExtract(left) ++ cnfExtract(right)
      case Or(left @ Or(l2, r2), right) =>
        cnfExtract(Or(l2, Or(r2, right)))
      case other =>
        other :: Nil
    }
  }

  def splitDNFPredicates(condition: Expression) = dnfExtract(toDNF(condition))

  def splitCNFPredicates(condition: Expression) = cnfExtract(toCNF(condition))

  def splitConjunctivePredicates(condition: Expression): Seq[Expression] = {
    condition match {
      case And(cond1, cond2) =>
        splitConjunctivePredicates(cond1) ++ splitConjunctivePredicates(cond2)
      case other => other :: Nil
    }
  }

  def splitDisjunctivePredicates(condition: Expression): Seq[Expression] = {
    condition match {
      case Or(cond1, cond2) =>
        splitDisjunctivePredicates(cond1) ++ splitDisjunctivePredicates(cond2)
      case other => other :: Nil
    }
  }
}
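A rough usage sketch for the DNF helpers above, built from hand-made Catalyst predicates. The attribute names x and y are illustrative; in practice these attributes come from a plan's output.

import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, GreaterThan, Literal, Or}
import org.apache.spark.sql.types.IntegerType

// Illustrative attributes.
val x = AttributeReference("x", IntegerType)()
val y = AttributeReference("y", IntegerType)()

// (x > 1 OR y > 2) AND y > 0
val cond = And(Or(GreaterThan(x, Literal(1)), GreaterThan(y, Literal(2))), GreaterThan(y, Literal(0)))

// toDNF distributes AND over OR: (x > 1 AND y > 0) OR (y > 2 AND y > 0)
val dnf = PredicateUtil.toDNF(cond)

// splitDNFPredicates returns the disjuncts: Seq(x > 1 AND y > 0, y > 2 AND y > 0)
val disjuncts = PredicateUtil.splitDNFPredicates(cond)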
Example 5
Source File: InRange.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.expression

import org.apache.spark.sql.simba.{ShapeSerializer, ShapeType}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, Predicate}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.simba.spatial.{MBR, Point, Shape}
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.sql.catalyst.util.GenericArrayData

case class InRange(shape: Expression, range_low: Expression, range_high: Expression)
  extends Predicate with CodegenFallback {

  override def nullable: Boolean = false

  override def eval(input: InternalRow): Any = {
    val eval_shape = ShapeUtils.getShape(shape, input)
    val eval_low = range_low.asInstanceOf[Literal].value.asInstanceOf[Point]
    val eval_high = range_high.asInstanceOf[Literal].value.asInstanceOf[Point]
    require(eval_shape.dimensions == eval_low.dimensions && eval_shape.dimensions == eval_high.dimensions)
    val mbr = MBR(eval_low, eval_high)
    mbr.intersects(eval_shape)
  }

  override def toString: String = s" **($shape) IN Rectangle ($range_low) - ($range_high)** "

  override def children: Seq[Expression] = Seq(shape, range_low, range_high)
}
Example 6
Source File: FilterExec.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution

import org.apache.spark.sql.simba.expression._
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Literal, PredicateHelper}
import org.apache.spark.sql.catalyst.expressions.{SortOrder, And => SQLAnd, Not => SQLNot, Or => SQLOr}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.SparkPlan

case class FilterExec(condition: Expression, child: SparkPlan)
  extends SimbaPlan with PredicateHelper {

  override def output: Seq[Attribute] = child.output

  private class DistanceOrdering(point: Expression, target: Point) extends Ordering[InternalRow] {
    override def compare(x: InternalRow, y: InternalRow): Int = {
      val shape_x = ShapeUtils.getShape(point, child.output, x)
      val shape_y = ShapeUtils.getShape(point, child.output, y)
      val dis_x = target.minDist(shape_x)
      val dis_y = target.minDist(shape_y)
      dis_x.compare(dis_y)
    }
  }

  // TODO change target partition from 1 to some good value
  // Note that target here must be an point literal in WHERE clause,
  // hence we can consider it as Point safely
  def knn(rdd: RDD[InternalRow], point: Expression, target: Point, k: Int): RDD[InternalRow] =
    sparkContext.parallelize(rdd.map(_.copy()).takeOrdered(k)(new DistanceOrdering(point, target)), 1)

  def applyCondition(rdd: RDD[InternalRow], condition: Expression): RDD[InternalRow] = {
    condition match {
      case InKNN(point, target, k) =>
        val _target = target.asInstanceOf[Literal].value.asInstanceOf[Point]
        knn(rdd, point, _target, k.value.asInstanceOf[Number].intValue())
      case now @ And(left, right) =>
        if (!now.hasKNN) rdd.mapPartitions { iter => iter.filter(newPredicate(condition, child.output).eval(_)) }
        else applyCondition(rdd, left).map(_.copy()).intersection(applyCondition(rdd, right).map(_.copy()))
      case now @ Or(left, right) =>
        if (!now.hasKNN) rdd.mapPartitions { iter => iter.filter(newPredicate(condition, child.output).eval(_)) }
        else applyCondition(rdd, left).map(_.copy()).union(applyCondition(rdd, right).map(_.copy())).distinct()
      case now @ Not(c) =>
        if (!now.hasKNN) rdd.mapPartitions { iter => iter.filter(newPredicate(condition, child.output).eval(_)) }
        else rdd.map(_.copy()).subtract(applyCondition(rdd, c).map(_.copy()))
      case _ =>
        rdd.mapPartitions(iter => iter.filter(newPredicate(condition, child.output).eval(_)))
    }
  }

  protected def doExecute(): RDD[InternalRow] = {
    val root_rdd = child.execute()
    condition transformUp {
      case SQLAnd(left, right) => And(left, right)
      case SQLOr(left, right) => Or(left, right)
      case SQLNot(c) => Not(c)
    }
    applyCondition(root_rdd, condition)
  }

  override def outputOrdering: Seq[SortOrder] = child.outputOrdering

  override def children: Seq[SparkPlan] = child :: Nil

  override def outputPartitioning: Partitioning = child.outputPartitioning
}
Example 7
Source File: CDJSpark.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.SparkPlan

case class CDJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SparkPlan {

  override def outputPartitioning: Partitioning = left.outputPartitioning

  override def output: Seq[Attribute] = left.output ++ right.output

  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] =
    left.execute().cartesian(right.execute()).mapPartitions { iter =>
      val joinedRow = new JoinedRow
      iter.filter { row =>
        val point1 = ShapeUtils.getShape(left_key, left.output, row._1).asInstanceOf[Point]
        val point2 = ShapeUtils.getShape(right_key, right.output, row._2).asInstanceOf[Point]
        point1.minDist(point2) <= r
      }.map(row => joinedRow(row._1, row._2))
    }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 8
Source File: CKJSpark.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.SparkPlan

case class CKJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def outputPartitioning: Partitioning = left.outputPartitioning

  override def output: Seq[Attribute] = left.output ++ right.output

  final val k = l.value.asInstanceOf[Number].intValue()

  override protected def doExecute(): RDD[InternalRow] = {
    val left_rdd = left.execute()
    val right_rdd = right.execute()
    left_rdd.map(row =>
      (ShapeUtils.getShape(left_key, left.output, row).asInstanceOf[Point], row)
    ).cartesian(right_rdd).map { case (l: (Point, InternalRow), r: InternalRow) =>
      val tmp_point = ShapeUtils.getShape(right_key, right.output, r).asInstanceOf[Point]
      l._2 -> List((tmp_point.minDist(l._1), r))
    }.reduceByKey {
      case (l_list: Seq[(Double, InternalRow)], r_list: Seq[(Double, InternalRow)]) =>
        (l_list ++ r_list).sortWith(_._1 < _._1).take(k)
    }.flatMapValues(list => list).mapPartitions { iter =>
      val joinedRow = new JoinedRow
      iter.map(r => joinedRow(r._1, r._2._2))
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 9
Source File: RDJSpark.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.index.RTree
import org.apache.spark.sql.simba.partitioner.{MapDPartition, STRPartition}
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable

case class RDJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val sample_rate = simbaSessionState.simbaConf.sampleRate
  final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode
  final val transfer_threshold = simbaSessionState.simbaConf.transferThreshold
  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] = {
    val left_rdd = left.execute().map(row =>
      (ShapeUtils.getShape(left_key, left.output, row).asInstanceOf[Point], row)
    )

    val right_rdd = right.execute().map(row =>
      (ShapeUtils.getShape(right_key, right.output, row).asInstanceOf[Point], row)
    )

    val dimension = right_rdd.first()._1.coord.length

    val (left_partitioned, left_mbr_bound) = STRPartition(left_rdd, dimension, num_partitions,
      sample_rate, transfer_threshold, max_entries_per_node)

    val left_part_size = left_partitioned.mapPartitions {
      iter => Array(iter.length).iterator
    }.collect()

    val left_rt = RTree(left_mbr_bound.zip(left_part_size).map(x => (x._1._1, x._1._2, x._2)),
      max_entries_per_node)
    val bc_rt = sparkContext.broadcast(left_rt)

    val right_dup = right_rdd.flatMap { x =>
      bc_rt.value.circleRange(x._1, r).map(now => (now._2, x))
    }

    val right_dup_partitioned = MapDPartition(right_dup, left_mbr_bound.length)

    left_partitioned.zipPartitions(right_dup_partitioned) { (leftIter, rightIter) =>
      val ans = mutable.ListBuffer[InternalRow]()
      val right_data = rightIter.map(_._2).toArray
      if (right_data.length > 0) {
        val right_index = RTree(right_data.map(_._1).zipWithIndex, max_entries_per_node)
        leftIter.foreach { now =>
          ans ++= right_index.circleRange(now._1, r)
            .map(x => new JoinedRow(now._2, right_data(x._2)._2))
        }
      }
      ans.iterator
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 10
Source File: DJSpark.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.index.RTree
import org.apache.spark.sql.simba.partitioner.{MapDPartition, STRPartition}
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable

case class DJSpark(left_key: Expression, right_key: Expression, l: Literal,
                   left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val sample_rate = simbaSessionState.simbaConf.sampleRate
  final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode
  final val transfer_threshold = simbaSessionState.simbaConf.transferThreshold
  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] = {
    val left_rdd = left.execute().map(row =>
      (ShapeUtils.getShape(left_key, left.output, row).asInstanceOf[Point], row)
    )

    val right_rdd = right.execute().map(row =>
      (ShapeUtils.getShape(right_key, right.output, row).asInstanceOf[Point], row)
    )

    val dimension = right_rdd.first()._1.coord.length

    val (left_partitioned, left_mbr_bound) = STRPartition(left_rdd, dimension, num_partitions,
      sample_rate, transfer_threshold, max_entries_per_node)
    val (right_partitioned, right_mbr_bound) = STRPartition(right_rdd, dimension, num_partitions,
      sample_rate, transfer_threshold, max_entries_per_node)

    val right_rt = RTree(right_mbr_bound.zip(Array.fill[Int](right_mbr_bound.length)(0))
      .map(x => (x._1._1, x._1._2, x._2)), max_entries_per_node)

    val left_dup = new Array[Array[Int]](left_mbr_bound.length)
    val right_dup = new Array[Array[Int]](right_mbr_bound.length)

    var tot = 0
    left_mbr_bound.foreach { now =>
      val res = right_rt.circleRange(now._1, r)
      val tmp_arr = mutable.ArrayBuffer[Int]()
      res.foreach { x =>
        if (right_dup(x._2) == null) right_dup(x._2) = Array(tot)
        else right_dup(x._2) = right_dup(x._2) :+ tot
        tmp_arr += tot
        tot += 1
      }
      left_dup(now._2) = tmp_arr.toArray
    }

    val bc_left_dup = sparkContext.broadcast(left_dup)
    val bc_right_dup = sparkContext.broadcast(right_dup)

    val left_dup_rdd = left_partitioned.mapPartitionsWithIndex { (id, iter) =>
      iter.flatMap { now =>
        val tmp_list = bc_left_dup.value(id)
        if (tmp_list != null) tmp_list.map(x => (x, now))
        else Array[(Int, (Point, InternalRow))]()
      }
    }

    val right_dup_rdd = right_partitioned.mapPartitionsWithIndex { (id, iter) =>
      iter.flatMap { now =>
        val tmp_list = bc_right_dup.value(id)
        if (tmp_list != null) tmp_list.map(x => (x, now))
        else Array[(Int, (Point, InternalRow))]()
      }
    }

    val left_dup_partitioned = MapDPartition(left_dup_rdd, tot).map(_._2)
    val right_dup_partitioned = MapDPartition(right_dup_rdd, tot).map(_._2)

    left_dup_partitioned.zipPartitions(right_dup_partitioned) { (leftIter, rightIter) =>
      val ans = mutable.ListBuffer[InternalRow]()
      val right_data = rightIter.toArray
      if (right_data.nonEmpty) {
        val right_index = RTree(right_data.map(_._1).zipWithIndex, max_entries_per_node)
        leftIter.foreach { now =>
          ans ++= right_index.circleRange(now._1, r)
            .map(x => new JoinedRow(now._2, right_data(x._2)._2))
        }
      }
      ans.iterator
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 11
Source File: BDJSparkR.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.index.RTree
import org.apache.spark.sql.simba.partitioner.MapDPartition
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable
import scala.util.Random

case class BDJSparkR(left_key: Expression, right_key: Expression, l: Literal,
                     left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val r = NumberUtil.literalToDouble(l)
  final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode

  override protected def doExecute(): RDD[InternalRow] = {
    val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _)))

    val tot_dup_rdd = tot_rdd.flatMap { x =>
      val rand_no = new Random().nextInt(num_partitions)
      var ans = mutable.ListBuffer[(Int, (Int, InternalRow))]()
      if (x._1 == 0) {
        val base = rand_no * num_partitions
        for (i <- 0 until num_partitions)
          ans += ((base + i, x))
      } else {
        for (i <- 0 until num_partitions)
          ans += ((i * num_partitions + rand_no, x))
      }
      ans
    }

    val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions)

    tot_dup_partitioned.mapPartitions { iter =>
      var left_data = mutable.ListBuffer[(Point, InternalRow)]()
      var right_data = mutable.ListBuffer[(Point, InternalRow)]()
      while (iter.hasNext) {
        val data = iter.next()
        if (data._2._1 == 0) {
          val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point]
          left_data += ((tmp_point, data._2._2))
        } else {
          val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point]
          right_data += ((tmp_point, data._2._2))
        }
      }

      val joined_ans = mutable.ListBuffer[InternalRow]()

      if (right_data.nonEmpty) {
        val right_rtree = RTree(right_data.map(_._1).zipWithIndex.toArray, max_entries_per_node)
        left_data.foreach(left =>
          right_rtree.circleRange(left._1, r)
            .foreach(x => joined_ans += new JoinedRow(left._2, right_data(x._2)._2)))
      }

      joined_ans.iterator
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 12
Source File: BKJSparkR.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.index.RTree
import org.apache.spark.sql.simba.partitioner.MapDPartition
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable
import scala.util.Random

case class BKJSparkR(left_key: Expression, right_key: Expression, l: Literal,
                     left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val max_entries_per_node = simbaSessionState.simbaConf.maxEntriesPerNode
  final val k = l.value.asInstanceOf[Number].intValue()

  private class DisOrdering extends Ordering[(InternalRow, Double)] {
    override def compare(x : (InternalRow, Double), y: (InternalRow, Double)): Int =
      -x._2.compare(y._2)
  }

  override protected def doExecute(): RDD[InternalRow] = {
    val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _)))

    val tot_dup_rdd = tot_rdd.flatMap { x =>
      val rand_no = new Random().nextInt(num_partitions)
      val ans = mutable.ListBuffer[(Int, (Int, InternalRow))]()
      if (x._1 == 0) {
        val base = rand_no * num_partitions
        for (i <- 0 until num_partitions)
          ans += ((base + i, x))
      } else {
        for (i <- 0 until num_partitions)
          ans += ((i * num_partitions + rand_no, x))
      }
      ans
    }

    val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions)

    tot_dup_partitioned.mapPartitions { iter =>
      var left_data = mutable.ListBuffer[(Point, InternalRow)]()
      var right_data = mutable.ListBuffer[(Point, InternalRow)]()
      while (iter.hasNext) {
        val data = iter.next()
        if (data._2._1 == 0) {
          val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point]
          left_data += ((tmp_point, data._2._2))
        } else {
          val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point]
          right_data += ((tmp_point, data._2._2))
        }
      }

      val joined_ans = mutable.ListBuffer[(InternalRow, Array[(InternalRow, Double)])]()

      if (right_data.nonEmpty) {
        val right_rtree = RTree(right_data.map(_._1).zipWithIndex.toArray, max_entries_per_node)
        left_data.foreach(left =>
          joined_ans += ((left._2, right_rtree.kNN(left._1, k, keepSame = false)
            .map(x => (right_data(x._2)._2, x._1.minDist(left._1)))))
        )
      }

      joined_ans.iterator
    }.reduceByKey((left, right) => (left ++ right).sortWith(_._2 < _._2).take(k), num_partitions)
      .flatMap { now =>
        now._2.map(x => new JoinedRow(now._1, x._1))
      }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 13
Source File: BKJSpark.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.partitioner.MapDPartition
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.util.BoundedPriorityQueue

import scala.collection.mutable
import scala.util.Random

case class BKJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val k = l.value.asInstanceOf[Number].intValue()

  private class DisOrdering extends Ordering[(InternalRow, Double)] {
    override def compare(x : (InternalRow, Double), y: (InternalRow, Double)): Int =
      -x._2.compare(y._2)
  }

  override protected def doExecute(): RDD[InternalRow] = {
    val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _)))

    val tot_dup_rdd = tot_rdd.flatMap { x =>
      val rand_no = new Random().nextInt(num_partitions)
      val ans = mutable.ListBuffer[(Int, (Int, InternalRow))]()
      if (x._1 == 0) {
        val base = rand_no * num_partitions
        for (i <- 0 until num_partitions)
          ans += ((base + i, x))
      } else {
        for (i <- 0 until num_partitions)
          ans += ((i * num_partitions + rand_no, x))
      }
      ans
    }

    val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions)

    tot_dup_partitioned.mapPartitions { iter =>
      var left_data = mutable.ListBuffer[(Point, InternalRow)]()
      var right_data = mutable.ListBuffer[(Point, InternalRow)]()
      while (iter.hasNext) {
        val data = iter.next()
        if (data._2._1 == 0) {
          val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point]
          left_data += ((tmp_point, data._2._2))
        } else {
          val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point]
          right_data += ((tmp_point, data._2._2))
        }
      }

      val joined_ans = mutable.ListBuffer[(InternalRow, Array[(InternalRow, Double)])]()

      left_data.foreach(left => {
        var pq = new BoundedPriorityQueue[(InternalRow, Double)](k)(new DisOrdering)
        right_data.foreach(right => pq += ((right._2, right._1.minDist(left._1))))
        joined_ans += ((left._2, pq.toArray))
      })

      joined_ans.iterator
    }.reduceByKey((left, right) => (left ++ right).sortWith(_._2 < _._2).take(k), num_partitions)
      .flatMap { now =>
        now._2.map(x => new JoinedRow(now._1, x._1))
      }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 14
Source File: BDJSpark.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.execution.join

import org.apache.spark.sql.simba.execution.SimbaPlan
import org.apache.spark.sql.simba.partitioner.MapDPartition
import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.sql.simba.util.{NumberUtil, ShapeUtils}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Literal}
import org.apache.spark.sql.execution.SparkPlan

import scala.collection.mutable
import scala.util.Random

case class BDJSpark(left_key: Expression, right_key: Expression, l: Literal,
                    left: SparkPlan, right: SparkPlan) extends SimbaPlan {

  override def output: Seq[Attribute] = left.output ++ right.output

  final val num_partitions = simbaSessionState.simbaConf.joinPartitions
  final val r = NumberUtil.literalToDouble(l)

  override protected def doExecute(): RDD[InternalRow] = {
    val tot_rdd = left.execute().map((0, _)).union(right.execute().map((1, _)))

    val tot_dup_rdd = tot_rdd.flatMap { x =>
      val rand_no = new Random().nextInt(num_partitions)
      var ans = mutable.ListBuffer[(Int, (Int, InternalRow))]()
      if (x._1 == 0) {
        val base = rand_no * num_partitions
        for (i <- 0 until num_partitions)
          ans += ((base + i, x))
      } else {
        for (i <- 0 until num_partitions)
          ans += ((i * num_partitions + rand_no, x))
      }
      ans
    }

    val tot_dup_partitioned = MapDPartition(tot_dup_rdd, num_partitions * num_partitions)

    tot_dup_partitioned.mapPartitions { iter =>
      var left_data = mutable.ListBuffer[(Point, InternalRow)]()
      var right_data = mutable.ListBuffer[(Point, InternalRow)]()
      while (iter.hasNext) {
        val data = iter.next()
        if (data._2._1 == 0) {
          val tmp_point = ShapeUtils.getShape(left_key, left.output, data._2._2).asInstanceOf[Point]
          left_data += ((tmp_point, data._2._2))
        } else {
          val tmp_point = ShapeUtils.getShape(right_key, right.output, data._2._2).asInstanceOf[Point]
          right_data += ((tmp_point, data._2._2))
        }
      }

      val joined_ans = mutable.ListBuffer[InternalRow]()

      left_data.foreach { left =>
        right_data.foreach { right =>
          if (left._1.minDist(right._1) <= r) {
            joined_ans += new JoinedRow(left._2, right._2)
          }
        }
      }

      joined_ans.iterator
    }
  }

  override def children: Seq[SparkPlan] = Seq(left, right)
}
Example 15
Source File: SimbaOptimizer.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba

import org.apache.spark.sql.ExperimentalMethods
import org.apache.spark.sql.catalyst.catalog.SessionCatalog
import org.apache.spark.sql.catalyst.expressions.{And, Expression, PredicateHelper}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.SparkOptimizer
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.simba.plans.SpatialJoin

class SimbaOptimizer(catalog: SessionCatalog,
                     conf: SQLConf,
                     experimentalMethods: ExperimentalMethods)
  extends SparkOptimizer(catalog, conf, experimentalMethods) {
  override def batches: Seq[Batch] = super.batches :+
    Batch("SpatialJoinPushDown", FixedPoint(100), PushPredicateThroughSpatialJoin)
}

object PushPredicateThroughSpatialJoin extends Rule[LogicalPlan] with PredicateHelper {
  private def split(condition: Seq[Expression], left: LogicalPlan, right: LogicalPlan) = {
    val (leftEvaluateCondition, rest) =
      condition.partition(_.references subsetOf left.outputSet)
    val (rightEvaluateCondition, commonCondition) =
      rest.partition(_.references subsetOf right.outputSet)
    (leftEvaluateCondition, rightEvaluateCondition, commonCondition)
  }

  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    // push the where condition down into join filter
    case f @ Filter(filterCondition, SpatialJoin(left, right, joinType, joinCondition)) =>
      val (leftFilterConditions, rightFilterConditions, commonFilterCondition) =
        split(splitConjunctivePredicates(filterCondition), left, right)

      val newLeft = leftFilterConditions.reduceLeftOption(And).map(Filter(_, left)).getOrElse(left)
      val newRight = rightFilterConditions.reduceLeftOption(And).map(Filter(_, right)).getOrElse(right)
      val newJoinCond = (commonFilterCondition ++ joinCondition).reduceLeftOption(And)

      SpatialJoin(newLeft, newRight, joinType, newJoinCond)

    // push down the join filter into sub query scanning if applicable
    case f @ SpatialJoin(left, right, joinType, joinCondition) =>
      val (leftJoinConditions, rightJoinConditions, commonJoinCondition) =
        split(joinCondition.map(splitConjunctivePredicates).getOrElse(Nil), left, right)

      val newLeft = leftJoinConditions.reduceLeftOption(And).map(Filter(_, left)).getOrElse(left)
      val newRight = rightJoinConditions.reduceLeftOption(And).map(Filter(_, right)).getOrElse(right)
      val newJoinCond = commonJoinCondition.reduceLeftOption(And)

      SpatialJoin(newLeft, newRight, joinType, newJoinCond)
  }
}
Example 16
Source File: SpatialJoin.scala, from Simba (Apache License 2.0)
package org.apache.spark.sql.simba.plans

import org.apache.spark.sql.simba.expression.{InCircleRange, InKNN}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{BinaryNode, LogicalPlan}
import org.apache.spark.sql.types.BooleanType

case class SpatialJoin(left: LogicalPlan, right: LogicalPlan, joinType: SpatialJoinType,
                       condition: Option[Expression]) extends BinaryNode {
  override def output: Seq[Attribute] = {
    joinType match {
      case KNNJoin =>
        require(condition.get.isInstanceOf[InKNN])
        left.output ++ right.output
      case ZKNNJoin =>
        require(condition.get.isInstanceOf[InKNN])
        left.output ++ right.output
      case DistanceJoin =>
        require(condition.get.isInstanceOf[InCircleRange])
        left.output ++ right.output.map(_.withNullability(true))
      case _ =>
        left.output ++ right.output
    }
  }

  def selfJoinResolved: Boolean = left.outputSet.intersect(right.outputSet).isEmpty

  // Joins are only resolved if they don't introduce ambiguous expression ids.
  override lazy val resolved: Boolean = {
    childrenResolved &&
      expressions.forall(_.resolved) &&
      selfJoinResolved &&
      condition.forall(_.dataType == BooleanType)
  }
}
Example 17
Source File: SqlExtensionProviderSuite.scala, from glow (Apache License 2.0)
package io.projectglow.sql

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Expression, Literal, UnaryExpression}
import org.apache.spark.sql.types.{DataType, IntegerType}

import io.projectglow.GlowSuite

class SqlExtensionProviderSuite extends GlowSuite {
  override def beforeAll(): Unit = {
    super.beforeAll()
    SqlExtensionProvider.registerFunctions(
      spark.sessionState.conf,
      spark.sessionState.functionRegistry,
      "test-functions.yml")
  }

  private lazy val sess = spark

  test("one arg function") {
    import sess.implicits._
    assert(spark.range(1).selectExpr("one_arg_test(id)").as[Int].head() == 1)

    intercept[AnalysisException] {
      spark.range(1).selectExpr("one_arg_test()").collect()
    }

    intercept[AnalysisException] {
      spark.range(1).selectExpr("one_arg_test(id, id)").collect()
    }
  }

  test("two arg function") {
    import sess.implicits._
    assert(spark.range(1).selectExpr("two_arg_test(id, id)").as[Int].head() == 1)

    intercept[AnalysisException] {
      spark.range(1).selectExpr("two_arg_test(id)").collect()
    }

    intercept[AnalysisException] {
      spark.range(1).selectExpr("two_arg_test(id, id, id)").collect()
    }
  }

  test("var args function") {
    import sess.implicits._
    assert(spark.range(1).selectExpr("var_args_test(id, id)").as[Int].head() == 1)
    assert(spark.range(1).selectExpr("var_args_test(id, id, id, id)").as[Int].head() == 1)
    assert(spark.range(1).selectExpr("var_args_test(id)").as[Int].head() == 1)

    intercept[AnalysisException] {
      spark.range(1).selectExpr("var_args_test()").collect()
    }
  }

  test("can call optional arg function") {
    import sess.implicits._
    assert(spark.range(1).selectExpr("optional_arg_test(id)").as[Int].head() == 1)
    assert(spark.range(1).selectExpr("optional_arg_test(id, id)").as[Int].head() == 1)

    intercept[AnalysisException] {
      spark.range(1).selectExpr("optional_arg_test()").collect()
    }

    intercept[AnalysisException] {
      spark.range(1).selectExpr("optional_arg_test(id, id, id)").collect()
    }
  }
}

trait TestExpr extends Expression with CodegenFallback {
  override def dataType: DataType = IntegerType

  override def nullable: Boolean = true

  override def eval(input: InternalRow): Any = 1
}

case class OneArgExpr(child: Expression) extends UnaryExpression with TestExpr

case class TwoArgExpr(left: Expression, right: Expression) extends BinaryExpression with TestExpr

case class VarArgsExpr(arg: Expression, varArgs: Seq[Expression]) extends TestExpr {
  override def children: Seq[Expression] = arg +: varArgs
}

case class OptionalArgExpr(required: Expression, optional: Expression) extends TestExpr {
  def this(required: Expression) = this(required, Literal(1))
  override def children: Seq[Expression] = Seq(required, optional)
}
Example 18
Source File: LinearRegressionExpr.scala, from glow (Apache License 2.0)
package io.projectglow.sql.expressions

import breeze.linalg.DenseVector
import org.apache.spark.TaskContext
import org.apache.spark.sql.SQLUtils
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes, TernaryExpression}
import org.apache.spark.sql.catalyst.util.ArrayData
import org.apache.spark.sql.types._

object LinearRegressionExpr {
  private val matrixUDT = SQLUtils.newMatrixUDT()
  private val state = new ThreadLocal[CovariateQRContext]

  def doLinearRegression(genotypes: Any, phenotypes: Any, covariates: Any): InternalRow = {
    if (state.get() == null) {
      // Save the QR factorization of the covariate matrix since it's the same for every row
      state.set(CovariateQRContext.computeQR(matrixUDT.deserialize(covariates).toDense))
      TaskContext.get().addTaskCompletionListener[Unit](_ => state.remove())
    }

    LinearRegressionGwas.linearRegressionGwas(
      new DenseVector[Double](genotypes.asInstanceOf[ArrayData].toDoubleArray()),
      new DenseVector[Double](phenotypes.asInstanceOf[ArrayData].toDoubleArray()),
      state.get()
    )
  }
}

case class LinearRegressionExpr(
    genotypes: Expression,
    phenotypes: Expression,
    covariates: Expression)
  extends TernaryExpression
    with ImplicitCastInputTypes {

  private val matrixUDT = SQLUtils.newMatrixUDT()

  override def dataType: DataType =
    StructType(
      Seq(
        StructField("beta", DoubleType),
        StructField("standardError", DoubleType),
        StructField("pValue", DoubleType)))

  override def inputTypes: Seq[DataType] =
    Seq(ArrayType(DoubleType), ArrayType(DoubleType), matrixUDT)

  override def children: Seq[Expression] = Seq(genotypes, phenotypes, covariates)

  override protected def nullSafeEval(genotypes: Any, phenotypes: Any, covariates: Any): Any = {
    LinearRegressionExpr.doLinearRegression(genotypes, phenotypes, covariates)
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    nullSafeCodeGen(
      ctx,
      ev,
      (genotypes, phenotypes, covariates) => {
        s"""
           |${ev.value} = io.projectglow.sql.expressions.LinearRegressionExpr.doLinearRegression($genotypes, $phenotypes, $covariates);
         """.stripMargin
      }
    )
  }
}
Example 19
Source File: HiveAcidRelation.scala, from spark-acid (Apache License 2.0)
package com.qubole.spark.hiveacid.datasource

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Column, DataFrame, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, Filter, InsertableRelation, PrunedFilteredScan}
import org.apache.spark.sql.types._
import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable, SparkAcidConf}
import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenNotInsert}
import org.apache.spark.sql.catalyst.AliasIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

import collection.JavaConversions._

case class HiveAcidRelation(sparkSession: SparkSession,
                            fullyQualifiedTableName: String,
                            parameters: Map[String, String])
  extends BaseRelation
  with InsertableRelation
  with PrunedFilteredScan
  with Logging {

  private val hiveAcidMetadata: HiveAcidMetadata = HiveAcidMetadata.fromSparkSession(
    sparkSession,
    fullyQualifiedTableName
  )
  private val hiveAcidTable: HiveAcidTable = new HiveAcidTable(sparkSession,
    hiveAcidMetadata, parameters)

  private val readOptions = SparkAcidConf(sparkSession, parameters)

  override def sqlContext: SQLContext = sparkSession.sqlContext

  override val schema: StructType = if (readOptions.includeRowIds) {
    hiveAcidMetadata.tableSchemaWithRowId
  } else {
    hiveAcidMetadata.tableSchema
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    // sql insert into and overwrite
    if (overwrite) {
      hiveAcidTable.insertOverwrite(data)
    } else {
      hiveAcidTable.insertInto(data)
    }
  }

  def update(condition: Option[Column], newValues: Map[String, Column]): Unit = {
    hiveAcidTable.update(condition, newValues)
  }

  def delete(condition: Column): Unit = {
    hiveAcidTable.delete(condition)
  }

  override def sizeInBytes: Long = {
    val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor
    (sparkSession.sessionState.conf.defaultSizeInBytes * compressionFactor).toLong
  }

  def merge(sourceDf: DataFrame,
            mergeExpression: Expression,
            matchedClause: Seq[MergeWhenClause],
            notMatched: Option[MergeWhenNotInsert],
            sourceAlias: Option[AliasIdentifier],
            targetAlias: Option[AliasIdentifier]): Unit = {
    hiveAcidTable.merge(sourceDf,
      mergeExpression, matchedClause,
      notMatched, sourceAlias, targetAlias)
  }

  def getHiveAcidTable(): HiveAcidTable = {
    hiveAcidTable
  }

  // FIXME: should it be true / false. Recommendation seems to
  // be to leave it as true
  override val needConversion: Boolean = false

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    val readOptions = SparkAcidConf(sparkSession, parameters)
    // sql "select *"
    hiveAcidTable.getRdd(requiredColumns, filters, readOptions)
  }
}
Example 20
Source File: UpdateCommand.scala, from spark-acid (Apache License 2.0)
package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command

import com.qubole.spark.hiveacid.HiveAcidErrors
import com.qubole.spark.hiveacid.datasource.HiveAcidRelation
import org.apache.spark.sql.{Column, Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.LogicalRelation

case class UpdateCommand(
    table: LogicalPlan,
    setExpressions: Map[String, Expression],
    condition: Option[Expression])
  extends RunnableCommand {

  override def children: Seq[LogicalPlan] = Seq(table)
  override def output: Seq[Attribute] = Seq.empty
  override lazy val resolved: Boolean = childrenResolved

  override def run(sparkSession: SparkSession): Seq[Row] = {
    if (children.size != 1) {
      throw new IllegalArgumentException("UPDATE command should have one table to update, whereas this has: "
        + children.size)
    }
    children(0) match {
      case LogicalRelation(relation: HiveAcidRelation, _, _ , _) => {
        val setColumns = setExpressions.mapValues(expr => new Column(expr))
        val updateFilterColumn = condition.map(new Column(_))
        relation.update(updateFilterColumn, setColumns)
      }
      case LogicalRelation(_, _, Some(catalogTable), _) =>
        throw HiveAcidErrors.tableNotAcidException(catalogTable.qualifiedName)
      case _ => throw HiveAcidErrors.tableNotAcidException(table.toString())
    }
    Seq.empty[Row]
  }
}
Example 21
Source File: MergeCommand.scala, from spark-acid (Apache License 2.0)
package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command

import com.qubole.spark.hiveacid.HiveAcidErrors
import com.qubole.spark.hiveacid.datasource.HiveAcidRelation
import com.qubole.spark.hiveacid.merge.{MergeCondition, MergeWhenClause, MergeWhenNotInsert}
import org.apache.spark.sql.catalyst.AliasIdentifier
import org.apache.spark.sql.{Row, SparkSession, SqlUtils}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.LogicalRelation

case class MergeCommand(targetTable: LogicalPlan,
                        sourceTable: LogicalPlan,
                        matched: Seq[MergeWhenClause],
                        notMatched: Option[MergeWhenClause],
                        mergeCondition: MergeCondition,
                        sourceAlias: Option[AliasIdentifier],
                        targetAlias: Option[AliasIdentifier])
  extends RunnableCommand {

  override def children: Seq[LogicalPlan] = Seq(targetTable, sourceTable)
  override def output: Seq[Attribute] = Seq.empty
  override lazy val resolved: Boolean = childrenResolved

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val insertClause: Option[MergeWhenNotInsert] = notMatched match {
      case Some(i: MergeWhenNotInsert) => Some(i)
      case None => None
      case _ => throw HiveAcidErrors.mergeValidationError("WHEN NOT Clause has to be INSERT CLAUSE")
    }

    children.head match {
      case LogicalRelation(relation: HiveAcidRelation, _, _ , _) =>
        relation.merge(SqlUtils.logicalPlanToDataFrame(sparkSession, sourceTable),
          mergeCondition.expression, matched, insertClause, sourceAlias, targetAlias)

      case SubqueryAlias(_, LogicalRelation(relation: HiveAcidRelation, _, _, _)) =>
        relation.merge(SqlUtils.logicalPlanToDataFrame(sparkSession, sourceTable),
          mergeCondition.expression, matched, insertClause, sourceAlias, targetAlias)

      case _ => throw HiveAcidErrors.tableNotAcidException(targetTable.toString())
    }

    Seq.empty
  }
}
Example 22
Source File: DeleteCommand.scala, from spark-acid (Apache License 2.0)
package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command

import com.qubole.spark.hiveacid.HiveAcidErrors
import com.qubole.spark.hiveacid.datasource.HiveAcidRelation
import org.apache.spark.sql.{Column, Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.LogicalRelation

case class DeleteCommand(
    table: LogicalPlan,
    condition: Expression)
  extends RunnableCommand {

  // We don't want `table` in children as sometimes we don't want to transform it.
  override def children: Seq[LogicalPlan] = Seq(table)
  override def output: Seq[Attribute] = Seq.empty
  override lazy val resolved: Boolean = childrenResolved

  override def run(sparkSession: SparkSession): Seq[Row] = {
    if (children.size != 1) {
      throw new IllegalArgumentException("DELETE command should specify exactly one table, whereas this has: "
        + children.size)
    }
    children(0) match {
      case LogicalRelation(relation: HiveAcidRelation, _, _ , _) => {
        relation.delete(new Column(condition))
      }
      case _ => throw HiveAcidErrors.tableNotAcidException(table.toString())
    }
    Seq.empty[Row]
  }
}
Example 23
Source File: MergePlan.scala, from spark-acid (Apache License 2.0)
package org.apache.spark.sql.catalyst.parser.plans.logical

import com.qubole.spark.hiveacid.merge.{MergeWhenClause}
import org.apache.spark.sql.{SparkSession, SqlUtils}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan}

case class MergePlan(sourcePlan: LogicalPlan,
                     targetPlan: LogicalPlan,
                     condition: Expression,
                     matched: Seq[MergeWhenClause],
                     notMatched: Option[MergeWhenClause]) extends Command {
  override def children: Seq[LogicalPlan] = Seq(sourcePlan, targetPlan)
  override def output: Seq[Attribute] = Seq.empty
}

object MergePlan {
  def resolve(sparkSession: SparkSession, mergePlan: MergePlan): MergePlan = {
    MergeWhenClause.validate(mergePlan.matched ++ mergePlan.notMatched)
    val resolvedCondition = SqlUtils.resolveReferences(sparkSession, mergePlan.condition,
      mergePlan.children, true, None)
    val resolvedMatched = MergeWhenClause.resolve(sparkSession, mergePlan, mergePlan.matched)
    val resolvedNotMatched = mergePlan.notMatched.map {
      x => x.resolve(sparkSession, mergePlan)
    }

    MergePlan(mergePlan.sourcePlan,
      mergePlan.targetPlan,
      resolvedCondition,
      resolvedMatched,
      resolvedNotMatched)
  }
}
Example 24
Source File: SqlUtils.scala, from spark-acid (Apache License 2.0)
package org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression}
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.types.StructType

object SqlUtils {
  def convertToDF(sparkSession: SparkSession, plan : LogicalPlan): DataFrame = {
    Dataset.ofRows(sparkSession, plan)
  }

  def resolveReferences(sparkSession: SparkSession,
                        expr: Expression,
                        planContaining: LogicalPlan, failIfUnresolved: Boolean,
                        exprName: Option[String] = None): Expression = {
    resolveReferences(sparkSession, expr, Seq(planContaining), failIfUnresolved, exprName)
  }

  def resolveReferences(sparkSession: SparkSession,
                        expr: Expression,
                        planContaining: Seq[LogicalPlan],
                        failIfUnresolved: Boolean,
                        exprName: Option[String]): Expression = {
    val newPlan = FakeLogicalPlan(expr, planContaining)
    val resolvedExpr = sparkSession.sessionState.analyzer.execute(newPlan) match {
      case FakeLogicalPlan(resolvedExpr: Expression, _) =>
        // Return even if it did not successfully resolve
        resolvedExpr
      case _ =>
        expr
      // This is unexpected
    }
    if (failIfUnresolved) {
      resolvedExpr.flatMap(_.references).filter(!_.resolved).foreach {
        attr => {
          val failedMsg = exprName match {
            case Some(name) =>
              s"${attr.sql} resolution in $name given these columns: " +
                planContaining.flatMap(_.output).map(_.name).mkString(",")
            case _ =>
              s"${attr.sql} resolution failed given these columns: " +
                planContaining.flatMap(_.output).map(_.name).mkString(",")
          }
          attr.failAnalysis(failedMsg)
        }
      }
    }
    resolvedExpr
  }

  def hasSparkStopped(sparkSession: SparkSession): Boolean = {
    sparkSession.sparkContext.stopped.get()
  }

  def createDataFrameUsingAttributes(sparkSession: SparkSession,
                                     rdd: RDD[Row],
                                     schema: StructType,
                                     attributes: Seq[Attribute]): DataFrame = {
    val encoder = RowEncoder(schema)
    val catalystRows = rdd.map(encoder.toRow)
    val logicalPlan = LogicalRDD(
      attributes,
      catalystRows,
      isStreaming = false)(sparkSession)
    Dataset.ofRows(sparkSession, logicalPlan)
  }

  def analysisException(cause: String): Throwable = {
    new AnalysisException(cause)
  }
}

case class FakeLogicalPlan(expr: Expression, children: Seq[LogicalPlan])
  extends LogicalPlan {
  override def output: Seq[Attribute] = children.foldLeft(Seq[Attribute]())((out, child) => out ++ child.output)
}
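A rough usage sketch for resolveReferences above: parse a raw expression and resolve its attribute references against an existing DataFrame's analyzed plan. It assumes an active SparkSession named spark; the column name is illustrative.

// Build a DataFrame whose output provides the attributes to resolve against.
val df = spark.range(10).toDF("id")

// Parse a raw expression; its references start out unresolved.
val parsed = spark.sessionState.sqlParser.parseExpression("id > 5")

// Resolve against the DataFrame's plan, failing analysis if anything stays unresolved.
val resolved = SqlUtils.resolveReferences(
  spark, parsed, df.queryExecution.analyzed, failIfUnresolved = true)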
Example 25
Source File: HiveAcidUtils.scala, from spark-acid (Apache License 2.0)
package org.apache.spark.sql.hive

import scala.collection.JavaConverters._

import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTablePartition, CatalogUtils}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BoundReference, Expression, InterpretedPredicate, PrettyAttribute}

object HiveAcidUtils {

  def prunePartitionsByFilter(
      hiveAcidMetadata: HiveAcidMetadata,
      inputPartitions: Seq[CatalogTablePartition],
      predicates: Option[Expression],
      defaultTimeZoneId: String): Seq[CatalogTablePartition] = {
    if (predicates.isEmpty) {
      inputPartitions
    } else {
      val partitionSchema = hiveAcidMetadata.partitionSchema
      val partitionColumnNames = hiveAcidMetadata.partitionSchema.fieldNames.toSet

      val nonPartitionPruningPredicates = predicates.filterNot {
        _.references.map(_.name).toSet.subsetOf(partitionColumnNames)
      }
      if (nonPartitionPruningPredicates.nonEmpty) {
        throw new AnalysisException("Expected only partition pruning predicates: " +
          nonPartitionPruningPredicates)
      }

      val boundPredicate =
        InterpretedPredicate.create(predicates.get.transform {
          case att: Attribute =>
            val index = partitionSchema.indexWhere(_.name == att.name)
            BoundReference(index, partitionSchema(index).dataType, nullable = true)
        })

      inputPartitions.filter { p =>
        boundPredicate.eval(p.toRow(partitionSchema, defaultTimeZoneId))
      }
    }
  }

  def convertToCatalogTablePartition(hp: com.qubole.shaded.hadoop.hive.ql.metadata.Partition): CatalogTablePartition = {
    val apiPartition = hp.getTPartition
    val properties: Map[String, String] = if (hp.getParameters != null) {
      hp.getParameters.asScala.toMap
    } else {
      Map.empty
    }
    CatalogTablePartition(
      spec = Option(hp.getSpec).map(_.asScala.toMap).getOrElse(Map.empty),
      storage = CatalogStorageFormat(
        locationUri = Option(CatalogUtils.stringToURI(apiPartition.getSd.getLocation)),
        inputFormat = Option(apiPartition.getSd.getInputFormat),
        outputFormat = Option(apiPartition.getSd.getOutputFormat),
        serde = Option(apiPartition.getSd.getSerdeInfo.getSerializationLib),
        compressed = apiPartition.getSd.isCompressed,
        properties = Option(apiPartition.getSd.getSerdeInfo.getParameters)
          .map(_.asScala.toMap).orNull),
      createTime = apiPartition.getCreateTime.toLong * 1000,
      lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000,
      parameters = properties,
      stats = None) // TODO: need to implement readHiveStats
  }
}
Example 26
Source File: IntervalTreeJoin.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.genApp

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.catalyst.expressions.{Expression, InterpretedProjection, UnsafeRow}
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}

@DeveloperApi
case class IntervalTreeJoin(left: SparkPlan,
                            right: SparkPlan,
                            condition: Seq[Expression],
                            context: SparkSession) extends BinaryExecNode {
  def output = left.output ++ right.output

  lazy val (buildPlan, streamedPlan) = (left, right)

  lazy val (buildKeys, streamedKeys) =
    (List(condition(0), condition(1)), List(condition(2), condition(3)))

  @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output)
  @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output)

  protected override def doExecute(): RDD[InternalRow] = {
    val v1 = left.execute()
    val v1kv = v1.map(x => {
      val v1Key = buildKeyGenerator(x)
      (new Interval[Int](v1Key.getInt(0), v1Key.getInt(1)), x.copy())
    })
    val v2 = right.execute()
    val v2kv = v2.map(x => {
      val v2Key = streamKeyGenerator(x)
      (new Interval[Int](v2Key.getInt(0), v2Key.getInt(1)), x.copy())
    })
    if (v1.count <= v2.count) {
      val v3 = IntervalTreeJoinImpl.overlapJoin(context.sparkContext, v1kv, v2kv)
        .flatMap(l => l._2.map(r => (l._1, r)))
      v3.map {
        case (l: InternalRow, r: InternalRow) => {
          val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
          joiner.join(l.asInstanceOf[UnsafeRow], r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow]
          // resultProj(joinedRow(l, r)) joiner.joiner
        }
      }
    } else {
      val v3 = IntervalTreeJoinImpl.overlapJoin(context.sparkContext, v2kv, v1kv)
        .flatMap(l => l._2.map(r => (l._1, r)))
      v3.map {
        case (r: InternalRow, l: InternalRow) => {
          val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
          joiner.join(l.asInstanceOf[UnsafeRow], r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow]
          // resultProj(joinedRow(l, r)) joiner.joiner
        }
      }
    }
  }
}
Example 27
Source File: IntervalTreeJoinChromosome.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.methods.genApp import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.catalyst.expressions.{Expression, InterpretedProjection, UnsafeRow} import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.biodatageeks.sequila.rangejoins.genApp.Interval @DeveloperApi case class IntervalTreeJoinChromosome(left: SparkPlan, right: SparkPlan, condition: Seq[Expression], context: SparkSession) extends BinaryExecNode { def output = left.output ++ right.output lazy val (buildPlan, streamedPlan) = (left, right) lazy val (buildKeys, streamedKeys) = (List(condition(0), condition(1),condition(4)), List(condition(2), condition(3),condition(5))) @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output) @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output) protected override def doExecute(): RDD[InternalRow] = { val v1 = left.execute() val v1kv = v1.map(x => { val v1Key = buildKeyGenerator(x) ((v1Key.getString(2),new Interval[Int](v1Key.getInt(0), v1Key.getInt(1))), x.copy()) }) val v2 = right.execute() val v2kv = v2.map(x => { val v2Key = streamKeyGenerator(x) ((v2Key.getString(2),new Interval[Int](v2Key.getInt(0), v2Key.getInt(1))), x.copy()) }) if (v1.count <= v2.count) { val v3 = IntervalTreeJoinChromosomeImpl.overlapJoin(context.sparkContext, v1kv, v2kv) .flatMap(l => l._2 .map(r => (l._1, r))) v3.mapPartitions( p => { val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) p.map(r => joiner.join(r._1.asInstanceOf[UnsafeRow], r._2.asInstanceOf[UnsafeRow])) } ) } else { val v3 = IntervalTreeJoinChromosomeImpl.overlapJoin(context.sparkContext, v2kv, v1kv).flatMap(l => l._2.map(r => (l._1, r))) v3.mapPartitions( p => { val joiner = GenerateUnsafeRowJoiner.create(right.schema, left.schema) p.map(r=>joiner.join(r._2.asInstanceOf[UnsafeRow],r._1.asInstanceOf[UnsafeRow])) } ) } } }
Example 28
Source File: MetastoreIndexSuite.scala From parquet-index with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.Path import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType import com.github.lightcopy.testutil.UnitTestSuite import com.github.lightcopy.testutil.implicits._ // Test catalog to check internal methods private[datasources] class TestIndex extends MetastoreIndex { private var internalIndexFilters: Seq[Filter] = Nil override def tablePath(): Path = ??? override def partitionSchema: StructType = ??? override def indexSchema: StructType = ??? override def dataSchema: StructType = ??? override def setIndexFilters(filters: Seq[Filter]) = { internalIndexFilters = filters } override def indexFilters: Seq[Filter] = internalIndexFilters override def listFilesWithIndexSupport( partitionFilters: Seq[Expression], dataFilters: Seq[Expression], indexFilters: Seq[Filter]): Seq[PartitionDirectory] = ??? override def inputFiles: Array[String] = ??? override def sizeInBytes: Long = ??? } class MetastoreIndexSuite extends UnitTestSuite { test("provide sequence of path based on table path") { val catalog = new TestIndex() { override def tablePath(): Path = new Path("test") } catalog.rootPaths should be (Seq(new Path("test"))) } test("when using listFiles directly supply empty index filter") { var indexSeq: Seq[Filter] = null var filterSeq: Seq[Expression] = null val catalog = new TestIndex() { override def listFilesWithIndexSupport( partitionFilters: Seq[Expression], dataFilters: Seq[Expression], indexFilters: Seq[Filter]): Seq[PartitionDirectory] = { indexSeq = indexFilters filterSeq = partitionFilters Seq.empty } } catalog.listFiles(Seq.empty, Seq.empty) indexSeq should be (Nil) filterSeq should be (Nil) } test("refresh should be no-op by default") { val catalog = new TestIndex() catalog.refresh() } }
Example 29
Source File: SimilarityFunctions.scala From spark-stringmetric with MIT License | 5 votes |
package com.github.mrpowers.spark.stringmetric

import com.github.mrpowers.spark.stringmetric.expressions.HammingDistance
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.functions._

import java.util.Locale

import org.apache.commons.text.similarity.{
  CosineDistance,
  JaccardSimilarity,
  JaroWinklerDistance,
  FuzzyScore
}

object SimilarityFunctions {
  private def withExpr(expr: Expression): Column = new Column(expr)

  val cosine_distance = udf[Option[Double], String, String](cosineDistanceFun)

  def cosineDistanceFun(s1: String, s2: String): Option[Double] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val cd = new CosineDistance()
    Some(cd(str1, str2))
  }

  val fuzzy_score = udf[Option[Integer], String, String](fuzzyScoreFun)

  def fuzzyScoreFun(s1: String, s2: String): Option[Integer] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val f = new FuzzyScore(Locale.ENGLISH)
    Some(f.fuzzyScore(str1, str2))
  }

  def hamming(s1: Column, s2: Column): Column = withExpr {
    HammingDistance(s1.expr, s2.expr)
  }

  val jaccard_similarity = udf[Option[Double], String, String](jaccardSimilarityFun)

  def jaccardSimilarityFun(s1: String, s2: String): Option[Double] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val j = new JaccardSimilarity()
    Some(j.apply(str1, str2))
  }

  val jaro_winkler = udf[Option[Double], String, String](jaroWinklerFun)

  def jaroWinklerFun(s1: String, s2: String): Option[Double] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val j = new JaroWinklerDistance()
    Some(j.apply(str1, str2))
  }
}
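A possible way to call these helpers from the DataFrame API is sketched below; the column names and sample strings are made up for illustration.

import org.apache.spark.sql.SparkSession
import com.github.mrpowers.spark.stringmetric.SimilarityFunctions._

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq(("night", "nacht"), ("karolin", "kathrin")).toDF("word1", "word2")

df.select(
  hamming($"word1", $"word2").as("hamming"),
  jaro_winkler($"word1", $"word2").as("jaro_winkler"),
  cosine_distance($"word1", $"word2").as("cosine_distance")
).show()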
Example 30
Source File: CatalystDataToAvro.scala From spark-schema-registry with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.registry.avro import com.hortonworks.registries.schemaregistry.{SchemaCompatibility, SchemaMetadata} import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotSerializer import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{BinaryType, DataType} import scala.collection.JavaConverters._ case class CatalystDataToAvro( child: Expression, schemaName: String, recordName: String, nameSpace: String, config: Map[String, Object] ) extends UnaryExpression { override def dataType: DataType = BinaryType private val topLevelRecordName = if (recordName == "") schemaName else recordName @transient private lazy val avroType = SchemaConverters.toAvroType(child.dataType, child.nullable, topLevelRecordName, nameSpace) @transient private lazy val avroSer = new AvroSerializer(child.dataType, avroType, child.nullable) @transient private lazy val srSer: AvroSnapshotSerializer = { val obj = new AvroSnapshotSerializer() obj.init(config.asJava) obj } @transient private lazy val srClient = new SchemaRegistryClient(config.asJava) @transient private lazy val schemaMetadata = { var schemaMetadataInfo = srClient.getSchemaMetadataInfo(schemaName) if (schemaMetadataInfo == null) { val generatedSchemaMetadata = new SchemaMetadata.Builder(schemaName). `type`(AvroSchemaProvider.TYPE) .schemaGroup("Autogenerated group") .description("Autogenerated schema") .compatibility(SchemaCompatibility.BACKWARD).build srClient.addSchemaMetadata(generatedSchemaMetadata) generatedSchemaMetadata } else { schemaMetadataInfo.getSchemaMetadata } } override def nullSafeEval(input: Any): Any = { val avroData = avroSer.serialize(input) srSer.serialize(avroData.asInstanceOf[Object], schemaMetadata) } override def simpleString: String = { s"to_sr(${child.sql}, ${child.dataType.simpleString})" } override def sql: String = { s"to_sr(${child.sql}, ${child.dataType.catalogString})" } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val expr = ctx.addReferenceObj("this", this) defineCodeGen(ctx, ev, input => s"(byte[]) $expr.nullSafeEval($input)") } }
Example 31
Source File: MyUDF.scala From spark-tools with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.FunctionIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.TimestampType

object MyUDF {

  private def myTimestampCast(xs: Seq[Expression]): Expression = {
    val expSource = xs.head
    expSource.dataType match {
      case LongType =>
        new Column(expSource).divide(Literal(1000)).cast(TimestampType).expr
      case TimestampType =>
        expSource
    }
  }

  def register(sparkSession: SparkSession): Unit =
    sparkSession.sessionState.functionRegistry
      .registerFunction(FunctionIdentifier("toTs", None), myTimestampCast)
}
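Once registered, the function is available from SQL. The snippet below is a minimal sketch (the epoch-millisecond value is arbitrary sample data) assuming MyUDF is on the classpath.

import org.apache.spark.sql.{MyUDF, SparkSession}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
MyUDF.register(spark)

// A BIGINT input is divided by 1000 and cast to TIMESTAMP by the builder above.
spark.sql("SELECT toTs(1546300800000L) AS ts").show(false)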
Example 32
Source File: MyUDF.scala From spark-tools with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.TimestampType

object MyUDF {

  private def myTimestampCast(xs: Seq[Expression]): Expression = {
    val expSource = xs.head
    expSource.dataType match {
      case LongType =>
        new Column(expSource).divide(Literal(1000)).cast(TimestampType).expr
      case TimestampType =>
        expSource
    }
  }

  def register(sparkSession: SparkSession): Unit =
    sparkSession.sessionState.functionRegistry
      .registerFunction("toTs", myTimestampCast)
}
Example 33
Source File: SQLBuilderTest.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst import scala.util.control.NonFatal import org.apache.spark.sql.{DataFrame, Dataset, QueryTest} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.hive.test.TestHiveSingleton abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton { protected def checkSQL(e: Expression, expectedSQL: String): Unit = { val actualSQL = e.sql try { assert(actualSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following expression: | |${e.prettyName} | |$cause """.stripMargin) } } protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = { val generatedSQL = try new SQLBuilder(plan).toSQL catch { case NonFatal(e) => fail( s"""Cannot convert the following logical query plan to SQL: | |${plan.treeString} """.stripMargin) } try { assert(generatedSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following logical query plan: | |${plan.treeString} | |$cause """.stripMargin) } checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan)) } protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = { checkSQL(df.queryExecution.analyzed, expectedSQL) } }
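A concrete suite built on this helper might look like the sketch below; it is hypothetical, requires the Hive test environment that TestHiveSingleton provides, and only checks that simple literals round-trip to their SQL text.

import org.apache.spark.sql.catalyst.SQLBuilderTest
import org.apache.spark.sql.catalyst.expressions.Literal

class LiteralSQLBuilderSuite extends SQLBuilderTest {
  test("literals round-trip to SQL text") {
    checkSQL(Literal(1), "1")
    checkSQL(Literal("foo"), "'foo'")
  }
}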
Example 34
Source File: ScriptTransformation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 35
Source File: CodegenFallback.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, Nondeterministic} trait CodegenFallback extends Expression { protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { foreach { case n: Nondeterministic => n.setInitialValues() case _ => } // LeafNode does not need `input` val input = if (this.isInstanceOf[LeafExpression]) "null" else ctx.INPUT_ROW val idx = ctx.references.length ctx.references += this val objectTerm = ctx.freshName("obj") val placeHolder = ctx.registerComment(this.toString) if (nullable) { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); boolean ${ev.isNull} = $objectTerm == null; ${ctx.javaType(this.dataType)} ${ev.value} = ${ctx.defaultValue(this.dataType)}; if (!${ev.isNull}) { ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; }""") } else { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); ${ctx.javaType(this.dataType)} ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; """, isNull = "false") } } }
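For comparison, a hypothetical expression that relies on this trait only needs eval-side logic; code generation falls back to calling eval through the stub produced above. The ReverseString class is an illustration, not part of the project.

import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.types.{DataType, StringType}
import org.apache.spark.unsafe.types.UTF8String

// Reverses a string value; doGenCode is inherited from CodegenFallback.
case class ReverseString(child: Expression) extends UnaryExpression with CodegenFallback {
  override def dataType: DataType = StringType
  override protected def nullSafeEval(input: Any): Any =
    UTF8String.fromString(input.asInstanceOf[UTF8String].toString.reverse)
}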
Example 36
Source File: SubstituteUnresolvedOrdinals.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.CatalystConf import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Sort} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin import org.apache.spark.sql.types.IntegerType class SubstituteUnresolvedOrdinals(conf: CatalystConf) extends Rule[LogicalPlan] { private def isIntLiteral(e: Expression) = e match { case Literal(_, IntegerType) => true case _ => false } def apply(plan: LogicalPlan): LogicalPlan = plan transform { case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) => val newOrders = s.order.map { case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _) => val newOrdinal = withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) withOrigin(order.origin)(order.copy(child = newOrdinal)) case other => other } withOrigin(s.origin)(s.copy(order = newOrders)) case a: Aggregate if conf.groupByOrdinal && a.groupingExpressions.exists(isIntLiteral) => val newGroups = a.groupingExpressions.map { case ordinal @ Literal(index: Int, IntegerType) => withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) case other => other } withOrigin(a.origin)(a.copy(groupingExpressions = newGroups)) } }
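The rule is applied automatically by the analyzer, so its effect is easiest to see through ordinal ORDER BY or GROUP BY clauses; the table and data below are made up for illustration.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

Seq((3, "c"), (1, "a"), (2, "b")).toDF("id", "name").createOrReplaceTempView("t")

// With spark.sql.orderByOrdinal enabled (the default), the literal 1 below is
// rewritten to UnresolvedOrdinal(1) and later resolved to the first column, id.
spark.sql("SELECT id, name FROM t ORDER BY 1").show()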
Example 37
Source File: ResolveTableValuedFunctions.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Range} import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.types.{DataType, IntegerType, LongType} tvf("start" -> LongType, "end" -> LongType, "step" -> LongType, "numPartitions" -> IntegerType) { case Seq(start: Long, end: Long, step: Long, numPartitions: Int) => Range(start, end, step, Some(numPartitions)) }) ) override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) => builtinFunctions.get(u.functionName) match { case Some(tvf) => val resolved = tvf.flatMap { case (argList, resolver) => argList.implicitCast(u.functionArgs) match { case Some(casted) => Some(resolver(casted.map(_.eval()))) case _ => None } } resolved.headOption.getOrElse { val argTypes = u.functionArgs.map(_.dataType.typeName).mkString(", ") u.failAnalysis( s"""error: table-valued function ${u.functionName} with alternatives: |${tvf.keys.map(_.toString).toSeq.sorted.map(x => s" ($x)").mkString("\n")} |cannot be applied to: (${argTypes})""".stripMargin) } case _ => u.failAnalysis(s"could not resolve `${u.functionName}` to a table-valued function") } } }
Example 38
Source File: RuleExecutorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } val message = intercept[TreeNodeException[LogicalPlan]] { ToFixedPoint.execute(Literal(100)) }.getMessage assert(message.contains("Max iterations (10) reached for batch fixedPoint")) } }
Example 39
Source File: ShuffledHashJoinExec.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
Example 40
Source File: CartesianProductExec.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { // We will not sort the rows, so prefixComparator and recordComparator are null. val sorter = UnsafeExternalSorter.create( context.taskMemoryManager(), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, context, null, null, 1024, SparkEnv.get.memoryManager.pageSizeBytes, SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD), false) val partition = split.asInstanceOf[CartesianPartition] for (y <- rdd2.iterator(partition.s2, context)) { sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false) } // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow] def createIter(): Iterator[UnsafeRow] = { val iter = sorter.getIterator val unsafeRow = new UnsafeRow(numFieldsOfRight) new Iterator[UnsafeRow] { override def hasNext: Boolean = { iter.hasNext } override def next(): UnsafeRow = { iter.loadNext() unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength) unsafeRow } } } val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, sorter.cleanupResources()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) pair.mapPartitionsInternal { iter => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition: (InternalRow) => Boolean = newPredicate(condition.get, left.output ++ right.output) val joined = new JoinedRow iter.filter { r => boundCondition(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 41
Source File: subquery.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions.{Expression, ExprId, InSet, Literal, PlanExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{BooleanType, DataType, StructType} case class ReuseSubquery(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val subqueries = mutable.HashMap[StructType, ArrayBuffer[SubqueryExec]]() plan transformAllExpressions { case sub: ExecSubqueryExpression => val sameSchema = subqueries.getOrElseUpdate(sub.plan.schema, ArrayBuffer[SubqueryExec]()) val sameResult = sameSchema.find(_.sameResult(sub.plan)) if (sameResult.isDefined) { sub.withNewPlan(sameResult.get) } else { sameSchema += sub.plan sub } } } }
Example 42
Source File: GroupedIterator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Expression, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateOrdering, GenerateUnsafeProjection} object GroupedIterator { def apply( input: Iterator[InternalRow], keyExpressions: Seq[Expression], inputSchema: Seq[Attribute]): Iterator[(InternalRow, Iterator[InternalRow])] = { if (input.hasNext) { new GroupedIterator(input.buffered, keyExpressions, inputSchema) } else { Iterator.empty } } } def hasNext: Boolean = currentIterator != null || fetchNextGroupIterator def next(): (InternalRow, Iterator[InternalRow]) = { assert(hasNext) // Ensure we have fetched the next iterator. val ret = (keyProjection(currentGroup), currentIterator) currentIterator = null ret } private def fetchNextGroupIterator(): Boolean = { assert(currentIterator == null) if (currentRow == null && input.hasNext) { currentRow = input.next() } if (currentRow == null) { // These is no data left, return false. false } else { // Skip to next group. // currentRow may be overwritten by `hasNext`, so we should compare them first. while (keyOrdering.compare(currentGroup, currentRow) == 0 && input.hasNext) { currentRow = input.next() } if (keyOrdering.compare(currentGroup, currentRow) == 0) { // We are in the last group, there is no more groups, return false. false } else { // Now the `currentRow` is the first row of next group. currentGroup = currentRow.copy() currentIterator = createGroupValuesIterator() true } } } private def createGroupValuesIterator(): Iterator[InternalRow] = { new Iterator[InternalRow] { def hasNext: Boolean = currentRow != null || fetchNextRowInGroup() def next(): InternalRow = { assert(hasNext) val res = currentRow currentRow = null res } private def fetchNextRowInGroup(): Boolean = { assert(currentRow == null) if (input.hasNext) { // The inner iterator should NOT consume the input into next group, here we use `head` to // peek the next input, to see if we should continue to process it. if (keyOrdering.compare(currentGroup, input.head) == 0) { // Next input is in the current group. Continue the inner iterator. currentRow = input.next() true } else { // Next input is not in the right group. End this inner iterator. false } } else { // There is no more data, return false. false } } } } }
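A rough usage sketch follows; GroupedIterator is an internal helper, so the snippet assumes it is compiled alongside code in the org.apache.spark.sql.execution package and that the input iterator is already sorted by the grouping key.

package org.apache.spark.sql.execution

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.types.IntegerType

object GroupedIteratorDemo {
  def main(args: Array[String]): Unit = {
    val key = AttributeReference("k", IntegerType, nullable = false)()
    val value = AttributeReference("v", IntegerType, nullable = false)()

    // Rows sorted by the key column k.
    val rows = Iterator(InternalRow(1, 10), InternalRow(1, 11), InternalRow(2, 20))

    GroupedIterator(rows, Seq(key), Seq(key, value)).foreach { case (groupKey, groupRows) =>
      println(s"k=${groupKey.getInt(0)} has ${groupRows.length} row(s)")
    }
  }
}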
Example 43
Source File: NativeFunctionRegistration.scala From spark-alchemy with Apache License 2.0 | 5 votes |
package com.swoop.alchemy.spark.expressions import org.apache.spark.sql.EncapsulationViolator.createAnalysisException import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription, ExpressionInfo, RuntimeReplaceable} import scala.reflect.ClassTag import scala.util.{Failure, Success, Try} // based on Spark's FunctionRegistry @ossSpark trait NativeFunctionRegistration extends FunctionRegistration { type FunctionBuilder = Seq[Expression] => Expression def expressions: Map[String, (ExpressionInfo, FunctionBuilder)] def registerFunctions(fr: FunctionRegistry): Unit = { expressions.foreach { case (name, (info, builder)) => fr.registerFunction(FunctionIdentifier(name), info, builder) } } def registerFunctions(spark: SparkSession): Unit = { registerFunctions(spark.sessionState.functionRegistry) } protected def expressionInfo[T <: Expression : ClassTag](name: String): ExpressionInfo = { val clazz = scala.reflect.classTag[T].runtimeClass val df = clazz.getAnnotation(classOf[ExpressionDescription]) if (df != null) { new ExpressionInfo(clazz.getCanonicalName, null, name, df.usage(), df.extended()) } else { new ExpressionInfo(clazz.getCanonicalName, name) } } }
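A hypothetical registration module built on this trait could look like the following; the name my_upper and the reuse of Spark's built-in Upper expression are illustrative assumptions, not part of the project.

import com.swoop.alchemy.spark.expressions.NativeFunctionRegistration
import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo, Upper}

object MyFunctionRegistration extends NativeFunctionRegistration {
  private val myUpperBuilder: FunctionBuilder = (args: Seq[Expression]) => Upper(args.head)

  // Exposes Spark's built-in Upper expression under the extra name my_upper.
  def expressions: Map[String, (ExpressionInfo, FunctionBuilder)] = Map(
    "my_upper" -> ((expressionInfo[Upper]("my_upper"), myUpperBuilder))
  )
}

Calling MyFunctionRegistration.registerFunctions(spark) would then make my_upper callable from SQL.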
Example 44
Source File: DeltaPushFilter.scala From connectors with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta

import scala.collection.immutable.HashSet
import scala.collection.JavaConverters._

import org.apache.hadoop.hive.ql.exec.{FunctionRegistry, SerializationUtilities}
import org.apache.hadoop.hive.ql.lib._
import org.apache.hadoop.hive.ql.parse.SemanticException
import org.apache.hadoop.hive.ql.plan.{ExprNodeColumnDesc, ExprNodeConstantDesc, ExprNodeGenericFuncDesc}
import org.apache.hadoop.hive.ql.udf.generic._
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{And, EqualNullSafe, EqualTo, Expression, GreaterThan, GreaterThanOrEqual, InSet, LessThan, LessThanOrEqual, Like, Literal, Not}

object DeltaPushFilter extends Logging {
  lazy val supportedPushDownUDFs = Array(
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotEqual",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualNS",
    "org.apache.hadoop.hive.ql.udf.UDFLike",
    "org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn"
  )

  def partitionFilterConverter(
      hiveFilterExprSeriablized: String): Seq[Expression] = {
    if (hiveFilterExprSeriablized != null) {
      val filterExpr = SerializationUtilities.deserializeExpression(hiveFilterExprSeriablized)
      val opRules = new java.util.LinkedHashMap[Rule, NodeProcessor]()
      val nodeProcessor = new NodeProcessor() {
        @throws[SemanticException]
        def process(nd: Node, stack: java.util.Stack[Node], procCtx: NodeProcessorCtx,
            nodeOutputs: Object*): Object = {
          nd match {
            case e: ExprNodeGenericFuncDesc if FunctionRegistry.isOpAnd(e) =>
              nodeOutputs.map(_.asInstanceOf[Expression]).reduce(And)
            case e: ExprNodeGenericFuncDesc =>
              val (columnDesc, constantDesc) =
                if (nd.getChildren.get(0).isInstanceOf[ExprNodeColumnDesc]) {
                  (nd.getChildren.get(0), nd.getChildren.get(1))
                } else {
                  (nd.getChildren.get(1), nd.getChildren.get(0))
                }
              val columnAttr = UnresolvedAttribute(
                columnDesc.asInstanceOf[ExprNodeColumnDesc].getColumn)
              val constantVal = Literal(constantDesc.asInstanceOf[ExprNodeConstantDesc].getValue)
              nd.asInstanceOf[ExprNodeGenericFuncDesc].getGenericUDF match {
                case f: GenericUDFOPNotEqualNS => Not(EqualNullSafe(columnAttr, constantVal))
                case f: GenericUDFOPNotEqual => Not(EqualTo(columnAttr, constantVal))
                case f: GenericUDFOPEqualNS => EqualNullSafe(columnAttr, constantVal)
                case f: GenericUDFOPEqual => EqualTo(columnAttr, constantVal)
                case f: GenericUDFOPGreaterThan => GreaterThan(columnAttr, constantVal)
                case f: GenericUDFOPEqualOrGreaterThan => GreaterThanOrEqual(columnAttr, constantVal)
                case f: GenericUDFOPLessThan => LessThan(columnAttr, constantVal)
                case f: GenericUDFOPEqualOrLessThan => LessThanOrEqual(columnAttr, constantVal)
                case f: GenericUDFBridge if f.getUdfName.equals("like") => Like(columnAttr, constantVal)
                case f: GenericUDFIn =>
                  val inConstantVals = nd.getChildren.asScala
                    .filter(_.isInstanceOf[ExprNodeConstantDesc])
                    .map(_.asInstanceOf[ExprNodeConstantDesc].getValue)
                    .map(Literal(_)).toSet
                  InSet(columnAttr, HashSet() ++ inConstantVals)
                case _ =>
                  throw new RuntimeException(s"Unsupported func(${nd.getName}) " +
                    s"which can not be pushed down to delta")
              }
            case _ => null
          }
        }
      }

      val disp = new DefaultRuleDispatcher(nodeProcessor, opRules, null)
      val ogw = new DefaultGraphWalker(disp)
      val topNodes = new java.util.ArrayList[Node]()
      topNodes.add(filterExpr)
      val nodeOutput = new java.util.HashMap[Node, Object]()
      try {
        ogw.startWalking(topNodes, nodeOutput)
      } catch {
        case ex: Exception =>
          throw new RuntimeException(ex)
      }
      logInfo(s"converted partition filter expr:" +
        s"${nodeOutput.get(filterExpr).asInstanceOf[Expression].toJSON}")
      Seq(nodeOutput.get(filterExpr).asInstanceOf[Expression])
    } else Seq.empty[org.apache.spark.sql.catalyst.expressions.Expression]
  }
}
Example 45
Source File: TimestampCast.scala From flint with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.codegen.{ CodegenContext, ExprCode, CodeGenerator, JavaCode, Block }
import org.apache.spark.sql.catalyst.expressions.{ Expression, NullIntolerant, UnaryExpression }
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types.{ DataType, LongType, TimestampType }

case class TimestampToNanos(child: Expression) extends TimestampCast {
  val dataType: DataType = LongType
  protected def cast(childPrim: String): String = s"$childPrim * 1000L"
  override protected def nullSafeEval(input: Any): Any = input.asInstanceOf[Long] * 1000L
}

case class NanosToTimestamp(child: Expression) extends TimestampCast {
  val dataType: DataType = TimestampType
  protected def cast(childPrim: String): String = s"$childPrim / 1000L"
  override protected def nullSafeEval(input: Any): Any = input.asInstanceOf[Long] / 1000L
}

trait TimestampCast extends UnaryExpression with NullIntolerant {
  protected def cast(childPrim: String): String

  private[this] def castCode(ctx: CodegenContext, childPrim: String, childNull: String,
      resultPrim: String, resultNull: String, resultType: DataType): Block = {
    code"""
      boolean $resultNull = $childNull;
      ${CodeGenerator.javaType(resultType)} $resultPrim = ${CodeGenerator.defaultValue(resultType)};
      if (!${childNull}) {
        $resultPrim = (long) ${cast(childPrim)};
      }
    """
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val eval = child.genCode(ctx)
    ev.copy(code = eval.code + castCode(ctx, eval.value, eval.isNull, ev.value, ev.isNull, dataType))
  }
}
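Wrapping the expression in a Column makes it usable from the DataFrame API; the sketch below is illustrative, uses made-up data, and assumes these classes are on the classpath.

import org.apache.spark.sql.{Column, SparkSession, TimestampToNanos}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq("2019-01-01 00:00:00").toDF("ts_string")
  .select($"ts_string".cast("timestamp").as("ts"))

// TimestampType values are stored as microseconds, so the cast multiplies by 1000.
df.withColumn("nanos", new Column(TimestampToNanos($"ts".expr))).show(false)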
Example 46
Source File: SparkWrapper.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} import org.apache.spark.sql.types.{DataType, Metadata} object SparkWrapper { def getVersion: String = { "SparkWrapper-2.3" } def newSubqueryAlias(identifier: String, child: LogicalPlan): SubqueryAlias = { SubqueryAlias(identifier, child) } def newAlias(child: Expression, name: String): Alias = { Alias(child, name)() } def newAttributeReference( name: String, dataType: DataType, nullable: Boolean, metadata: Metadata): AttributeReference = { AttributeReference(name, dataType, nullable, metadata)() } def callSessionCatalogCreateTable( obj: SessionCatalog, tableDefinition: CatalogTable, ignoreIfExists: Boolean): Unit = { obj.createTable(tableDefinition, ignoreIfExists) } }
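An illustrative use of the wrapper (not taken from the project) is shown below.

import com.pingcap.tispark.SparkWrapper
import org.apache.spark.sql.types.{IntegerType, Metadata}

val attr = SparkWrapper.newAttributeReference("id", IntegerType, nullable = false, Metadata.empty)
val aliased = SparkWrapper.newAlias(attr, "user_id")
println(aliased.sql)  // prints something like `id` AS `user_id`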
Example 47
Source File: SparkWrapper.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} import org.apache.spark.sql.types.{DataType, Metadata} object SparkWrapper { def getVersion: String = { "SparkWrapper-2.4" } def newSubqueryAlias(identifier: String, child: LogicalPlan): SubqueryAlias = { SubqueryAlias(identifier, child) } def newAlias(child: Expression, name: String): Alias = { Alias(child, name)() } def newAttributeReference( name: String, dataType: DataType, nullable: Boolean, metadata: Metadata): AttributeReference = { AttributeReference(name, dataType, nullable, metadata)() } def callSessionCatalogCreateTable( obj: SessionCatalog, tableDefinition: CatalogTable, ignoreIfExists: Boolean): Unit = { obj.createTable(tableDefinition, ignoreIfExists) } }
Example 48
Source File: parser.scala From tispark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.extensions import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression} import org.apache.spark.sql.catalyst.parser._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.execution.command.{ CacheTableCommand, CreateViewCommand, ExplainCommand, UncacheTableCommand } import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{SparkSession, TiContext} case class TiParser(getOrCreateTiContext: SparkSession => TiContext)( sparkSession: SparkSession, delegate: ParserInterface) extends ParserInterface { private lazy val tiContext = getOrCreateTiContext(sparkSession) private lazy val internal = new SparkSqlParser(sparkSession.sqlContext.conf) private def needQualify(tableIdentifier: TableIdentifier) = tableIdentifier.database.isEmpty && tiContext.sessionCatalog .getTempView(tableIdentifier.table) .isEmpty }
Example 49
Source File: TiAggregation.scala From tispark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import com.pingcap.tispark.TiDBRelation import com.pingcap.tispark.utils.ReflectionUtil import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression} import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.datasources.LogicalRelation object TiAggregation { type ReturnType = (Seq[NamedExpression], Seq[AggregateExpression], Seq[NamedExpression], LogicalPlan) def unapply(plan: LogicalPlan): Option[ReturnType] = ReflectionUtil.callTiAggregationImplUnapply(plan) } object TiAggregationProjection { type ReturnType = (Seq[Expression], LogicalPlan, TiDBRelation, Seq[NamedExpression]) def unapply(plan: LogicalPlan): Option[ReturnType] = plan match { // Only push down aggregates projection when all filters can be applied and // all projection expressions are column references case PhysicalOperation( projects, filters, rel @ LogicalRelation(source: TiDBRelation, _, _, _)) if projects.forall(_.isInstanceOf[Attribute]) => Some((filters, rel, source, projects)) case _ => Option.empty[ReturnType] } }
Example 50
Source File: XmlDataToCatalyst.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression} import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import com.databricks.spark.xml.parsers.StaxXmlParser case class XmlDataToCatalyst( child: Expression, schema: DataType, options: XmlOptions) extends UnaryExpression with CodegenFallback with ExpectsInputTypes { override lazy val dataType: DataType = schema @transient lazy val rowSchema: StructType = schema match { case st: StructType => st case ArrayType(st: StructType, _) => st } override def nullSafeEval(xml: Any): Any = xml match { case string: UTF8String => CatalystTypeConverters.convertToCatalyst( StaxXmlParser.parseColumn(string.toString, rowSchema, options)) case string: String => StaxXmlParser.parseColumn(string, rowSchema, options) case arr: GenericArrayData => CatalystTypeConverters.convertToCatalyst( arr.array.map(s => StaxXmlParser.parseColumn(s.toString, rowSchema, options))) case arr: Array[_] => arr.map(s => StaxXmlParser.parseColumn(s.toString, rowSchema, options)) case _ => null } override def inputTypes: Seq[DataType] = schema match { case _: StructType => Seq(StringType) case ArrayType(_: StructType, _) => Seq(ArrayType(StringType)) } }
Example 51
Source File: RedisRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.redis import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.sql.xsql.DataSourceManager._ import org.apache.spark.sql.xsql.execution.datasources.redis.RedisSpecialStrategy trait RedisRelationTrait { val parameters: Map[String, String] val schema: StructType lazy val redisConfig: RedisConfig = new RedisConfig(new RedisEndpoint(parameters.get(URL).get)) } case class RedisRelationImpl(val parameters: Map[String, String], val schema: StructType) extends RedisRelationTrait case class RedisRelation( parameters: Map[String, String], schema: StructType, filter: Seq[Expression] = Nil)(@transient val sqlContext: SQLContext) extends BaseRelation with PrunedScan with RedisRelationTrait { override def toString: String = s"RedisRelation(${filter.mkString(",")})" val partitionNum: Int = parameters.getOrElse("partitionNum", "1").toInt override def buildScan(requiredColumns: Array[String]): RDD[Row] = { val filters = filter .map(RedisSpecialStrategy.getAttr) .groupBy(_._1) .map(tup => (tup._1, tup._2.map(_._2))) new RedisRDD(sqlContext.sparkContext, this, filters, requiredColumns, partitionNum) } }
Example 52
Source File: ScriptTransformation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 53
Source File: CodegenFallback.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, Nondeterministic} import org.apache.spark.sql.catalyst.expressions.codegen.Block._ trait CodegenFallback extends Expression { protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // LeafNode does not need `input` val input = if (this.isInstanceOf[LeafExpression]) "null" else ctx.INPUT_ROW val idx = ctx.references.length ctx.references += this var childIndex = idx this.foreach { case n: Nondeterministic => // This might add the current expression twice, but it won't hurt. ctx.references += n childIndex += 1 ctx.addPartitionInitializationStatement( s""" |((Nondeterministic) references[$childIndex]) | .initialize(partitionIndex); """.stripMargin) case _ => } val objectTerm = ctx.freshName("obj") val placeHolder = ctx.registerComment(this.toString) val javaType = CodeGenerator.javaType(this.dataType) if (nullable) { ev.copy(code = code""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); boolean ${ev.isNull} = $objectTerm == null; $javaType ${ev.value} = ${CodeGenerator.defaultValue(this.dataType)}; if (!${ev.isNull}) { ${ev.value} = (${CodeGenerator.boxedType(this.dataType)}) $objectTerm; }""") } else { ev.copy(code = code""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); $javaType ${ev.value} = (${CodeGenerator.boxedType(this.dataType)}) $objectTerm; """, isNull = FalseLiteral) } } }
Example 54
Source File: SubstituteUnresolvedOrdinals.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Sort} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.IntegerType class SubstituteUnresolvedOrdinals(conf: SQLConf) extends Rule[LogicalPlan] { private def isIntLiteral(e: Expression) = e match { case Literal(_, IntegerType) => true case _ => false } def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) => val newOrders = s.order.map { case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _, _) => val newOrdinal = withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) withOrigin(order.origin)(order.copy(child = newOrdinal)) case other => other } withOrigin(s.origin)(s.copy(order = newOrders)) case a: Aggregate if conf.groupByOrdinal && a.groupingExpressions.exists(isIntLiteral) => val newGroups = a.groupingExpressions.map { case ordinal @ Literal(index: Int, IntegerType) => withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) case other => other } withOrigin(a.origin)(a.copy(groupingExpressions = newGroups)) } }
Example 55
Source File: ResolveTableValuedFunctions.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import java.util.Locale import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Alias, Expression} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Range} import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.types.{DataType, IntegerType, LongType} tvf("start" -> LongType, "end" -> LongType, "step" -> LongType, "numPartitions" -> IntegerType) { case Seq(start: Long, end: Long, step: Long, numPartitions: Int) => Range(start, end, step, Some(numPartitions)) }) ) override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) => // The whole resolution is somewhat difficult to understand here due to too much abstractions. // We should probably rewrite the following at some point. Reynold was just here to improve // error messages and didn't have time to do a proper rewrite. val resolvedFunc = builtinFunctions.get(u.functionName.toLowerCase(Locale.ROOT)) match { case Some(tvf) => def failAnalysis(): Nothing = { val argTypes = u.functionArgs.map(_.dataType.typeName).mkString(", ") u.failAnalysis( s"""error: table-valued function ${u.functionName} with alternatives: |${tvf.keys.map(_.toString).toSeq.sorted.map(x => s" ($x)").mkString("\n")} |cannot be applied to: ($argTypes)""".stripMargin) } val resolved = tvf.flatMap { case (argList, resolver) => argList.implicitCast(u.functionArgs) match { case Some(casted) => try { Some(resolver(casted.map(_.eval()))) } catch { case e: AnalysisException => failAnalysis() } case _ => None } } resolved.headOption.getOrElse { failAnalysis() } case _ => u.failAnalysis(s"could not resolve `${u.functionName}` to a table-valued function") } // If alias names assigned, add `Project` with the aliases if (u.outputNames.nonEmpty) { val outputAttrs = resolvedFunc.output // Checks if the number of the aliases is equal to expected one if (u.outputNames.size != outputAttrs.size) { u.failAnalysis(s"Number of given aliases does not match number of output columns. " + s"Function name: ${u.functionName}; number of aliases: " + s"${u.outputNames.size}; number of output columns: ${outputAttrs.size}.") } val aliases = outputAttrs.zip(u.outputNames).map { case (attr, name) => Alias(attr, name)() } Project(aliases, resolvedFunc) } else { resolvedFunc } } }
Example 56
Source File: QueryPlanSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.dsl.plans import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal, NamedExpression} import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} import org.apache.spark.sql.types.IntegerType class QueryPlanSuite extends SparkFunSuite { test("origin remains the same after mapExpressions (SPARK-23823)") { CurrentOrigin.setPosition(0, 0) val column = AttributeReference("column", IntegerType)(NamedExpression.newExprId) val query = plans.DslLogicalPlan(plans.table("table")).select(column) CurrentOrigin.reset() val mappedQuery = query mapExpressions { case _: Expression => Literal(1) } val mappedOrigin = mappedQuery.expressions.apply(0).origin assert(mappedOrigin == Origin.apply(Some(0), Some(0))) } }
Example 57
Source File: RuleExecutorSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } val message = intercept[TreeNodeException[LogicalPlan]] { ToFixedPoint.execute(Literal(100)) }.getMessage assert(message.contains("Max iterations (10) reached for batch fixedPoint")) } test("structural integrity checker") { object WithSIChecker extends RuleExecutor[Expression] { override protected def isPlanIntegral(expr: Expression): Boolean = expr match { case IntegerLiteral(_) => true case _ => false } val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(WithSIChecker.execute(Literal(10)) === Literal(9)) val message = intercept[TreeNodeException[LogicalPlan]] { WithSIChecker.execute(Literal(10.1)) }.getMessage assert(message.contains("the structural integrity of the plan is broken")) } }
Example 58
Source File: CheckCartesianProductsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer import org.scalatest.Matchers._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.internal.SQLConf.CROSS_JOINS_ENABLED class CheckCartesianProductsSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Check Cartesian Products", Once, CheckCartesianProducts) :: Nil } val testRelation1 = LocalRelation('a.int, 'b.int) val testRelation2 = LocalRelation('c.int, 'd.int) val joinTypesWithRequiredCondition = Seq(Inner, LeftOuter, RightOuter, FullOuter) val joinTypesWithoutRequiredCondition = Seq(LeftSemi, LeftAnti, ExistenceJoin('exists)) test("CheckCartesianProducts doesn't throw an exception if cross joins are enabled)") { withSQLConf(CROSS_JOINS_ENABLED.key -> "true") { noException should be thrownBy { for (joinType <- joinTypesWithRequiredCondition ++ joinTypesWithoutRequiredCondition) { performCartesianProductCheck(joinType) } } } } test("CheckCartesianProducts throws an exception for join types that require a join condition") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithRequiredCondition) { val thrownException = the [AnalysisException] thrownBy { performCartesianProductCheck(joinType) } assert(thrownException.message.contains("Detected implicit cartesian product")) } } } test("CheckCartesianProducts doesn't throw an exception if a join condition is present") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithRequiredCondition) { noException should be thrownBy { performCartesianProductCheck(joinType, Some('a === 'd)) } } } } test("CheckCartesianProducts doesn't throw an exception if join types don't require conditions") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithoutRequiredCondition) { noException should be thrownBy { performCartesianProductCheck(joinType) } } } } private def performCartesianProductCheck( joinType: JoinType, condition: Option[Expression] = None): Unit = { val analyzedPlan = testRelation1.join(testRelation2, joinType, condition).analyze val optimizedPlan = Optimize.execute(analyzedPlan) comparePlans(analyzedPlan, optimizedPlan) } }
Example 59
Source File: ShuffledHashJoinExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener[Unit](_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
Example 60
Source File: CartesianProductExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, ExternalAppendOnlyUnsafeRowArray, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.util.CompletionIterator class UnsafeCartesianRDD( left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int, inMemoryBufferThreshold: Int, spillThreshold: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { val rowArray = new ExternalAppendOnlyUnsafeRowArray(inMemoryBufferThreshold, spillThreshold) val partition = split.asInstanceOf[CartesianPartition] rdd2.iterator(partition.s2, context).foreach(rowArray.add) // Create an iterator from rowArray def createIter(): Iterator[UnsafeRow] = rowArray.generateIterator() val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, rowArray.clear()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD( leftResults, rightResults, right.output.size, sqlContext.conf.cartesianProductExecBufferInMemoryThreshold, sqlContext.conf.cartesianProductExecBufferSpillThreshold) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 61
Source File: DataSourceV2StringFormat.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2

import org.apache.commons.lang3.StringUtils

import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.sources.v2.DataSourceV2
import org.apache.spark.util.Utils

trait DataSourceV2StringFormat {

  // The enclosing trait declaration and these abstract members (source, output, options)
  // were dropped by the snippet extraction; they are restored here so the code compiles.
  def source: DataSourceV2
  def output: Seq[Attribute]
  def options: Map[String, String]

  def pushedFilters: Seq[Expression]

  private def sourceName: String = source match {
    case registered: DataSourceRegister => registered.shortName()
    // source.getClass.getSimpleName can cause Malformed class name error,
    // call safer `Utils.getSimpleName` instead
    case _ => Utils.getSimpleName(source.getClass)
  }

  def metadataString: String = {
    val entries = scala.collection.mutable.ArrayBuffer.empty[(String, String)]

    if (pushedFilters.nonEmpty) {
      entries += "Filters" -> pushedFilters.mkString("[", ", ", "]")
    }

    // TODO: we should only display some standard options like path, table, etc.
    if (options.nonEmpty) {
      entries += "Options" -> Utils.redact(options).map {
        case (k, v) => s"$k=$v"
      }.mkString("[", ",", "]")
    }

    val outputStr = Utils.truncatedString(output, "[", ", ", "]")

    val entriesStr = if (entries.nonEmpty) {
      Utils.truncatedString(entries.map {
        case (key, value) => key + ": " + StringUtils.abbreviate(value, 100)
      }, " (", ", ", ")")
    } else {
      ""
    }

    s"$sourceName$outputStr$entriesStr"
  }
}
Example 62
Source File: DataSourcePartitioning.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression} import org.apache.spark.sql.catalyst.plans.physical import org.apache.spark.sql.sources.v2.reader.partitioning.{ClusteredDistribution, Partitioning} class DataSourcePartitioning( partitioning: Partitioning, colNames: AttributeMap[String]) extends physical.Partitioning { override val numPartitions: Int = partitioning.numPartitions() override def satisfies0(required: physical.Distribution): Boolean = { super.satisfies0(required) || { required match { case d: physical.ClusteredDistribution if isCandidate(d.clustering) => val attrs = d.clustering.map(_.asInstanceOf[Attribute]) partitioning.satisfy( new ClusteredDistribution(attrs.map { a => val name = colNames.get(a) assert(name.isDefined, s"Attribute ${a.name} is not found in the data source output") name.get }.toArray)) case _ => false } } } private def isCandidate(clustering: Seq[Expression]): Boolean = { clustering.forall { case a: Attribute => colNames.contains(a) case _ => false } } }
Example 63
Source File: DataSourceV2Strategy.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import scala.collection.mutable import org.apache.spark.sql.{sources, Strategy} import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet, Expression} import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, Repartition} import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan} import org.apache.spark.sql.execution.datasources.DataSourceStrategy import org.apache.spark.sql.execution.streaming.continuous.{ContinuousCoalesceExec, WriteToContinuousDataSource, WriteToContinuousDataSourceExec} import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, SupportsPushDownFilters, SupportsPushDownRequiredColumns} import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader object DataSourceV2Strategy extends Strategy { // TODO: nested column pruning. private def pruneColumns( reader: DataSourceReader, relation: DataSourceV2Relation, exprs: Seq[Expression]): Seq[AttributeReference] = { reader match { case r: SupportsPushDownRequiredColumns => val requiredColumns = AttributeSet(exprs.flatMap(_.references)) val neededOutput = relation.output.filter(requiredColumns.contains) if (neededOutput != relation.output) { r.pruneColumns(neededOutput.toStructType) val nameToAttr = relation.output.map(_.name).zip(relation.output).toMap r.readSchema().toAttributes.map { // We have to keep the attribute id during transformation. a => a.withExprId(nameToAttr(a.name).exprId) } } else { relation.output } case _ => relation.output } } override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case PhysicalOperation(project, filters, relation: DataSourceV2Relation) => val reader = relation.newReader() // `pushedFilters` will be pushed down and evaluated in the underlying data sources. // `postScanFilters` need to be evaluated after the scan. // `postScanFilters` and `pushedFilters` can overlap, e.g. the parquet row group filter. 
val (pushedFilters, postScanFilters) = pushFilters(reader, filters) val output = pruneColumns(reader, relation, project ++ postScanFilters) logInfo( s""" |Pushing operators to ${relation.source.getClass} |Pushed Filters: ${pushedFilters.mkString(", ")} |Post-Scan Filters: ${postScanFilters.mkString(",")} |Output: ${output.mkString(", ")} """.stripMargin) val scan = DataSourceV2ScanExec( output, relation.source, relation.options, pushedFilters, reader) val filterCondition = postScanFilters.reduceLeftOption(And) val withFilter = filterCondition.map(FilterExec(_, scan)).getOrElse(scan) // always add the projection, which will produce unsafe rows required by some operators ProjectExec(project, withFilter) :: Nil case r: StreamingDataSourceV2Relation => // ensure there is a projection, which will produce unsafe rows required by some operators ProjectExec(r.output, DataSourceV2ScanExec(r.output, r.source, r.options, r.pushedFilters, r.reader)) :: Nil case WriteToDataSourceV2(writer, query) => WriteToDataSourceV2Exec(writer, planLater(query)) :: Nil case AppendData(r: DataSourceV2Relation, query, _) => WriteToDataSourceV2Exec(r.newWriter(), planLater(query)) :: Nil case WriteToContinuousDataSource(writer, query) => WriteToContinuousDataSourceExec(writer, planLater(query)) :: Nil case Repartition(1, false, child) => val isContinuous = child.collectFirst { case StreamingDataSourceV2Relation(_, _, _, r: ContinuousReader) => r }.isDefined if (isContinuous) { ContinuousCoalesceExec(1, planLater(child)) :: Nil } else { Nil } case _ => Nil } }
Example 64
Source File: Exchange.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]() plan.transformUp { case exchange: Exchange => // the exchanges that have same results usually also have same schemas (same column names). val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]()) val samePlan = sameSchema.find { e => exchange.sameResult(e) } if (samePlan.isDefined) { // Keep the output of this exchange, the following plans require that to resolve // attributes. ReusedExchangeExec(exchange.output, samePlan.get) } else { sameSchema += exchange exchange } } } }
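To see the effect of this rule from user code, a self-join over the same aggregated DataFrame produces two identical shuffles, and the physical plan replaces the second with a ReusedExchange node. A rough sketch (demo names are illustrative):

import org.apache.spark.sql.SparkSession

object ReuseExchangeDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("reuse-exchange-demo").getOrCreate()
    import spark.implicits._

    val grouped = Seq((1, "a"), (2, "b"), (1, "c")).toDF("id", "v").groupBy("id").count()
    val joined = grouped.as("l").join(grouped.as("r"), "id")

    // With spark.sql.exchange.reuse left at its default (true), the printed plan
    // should contain a ReusedExchange in place of the second identical shuffle.
    joined.explain()
    spark.stop()
  }
}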
Example 65
Source File: subquery.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions.{Expression, ExprId, InSet, Literal, PlanExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{BooleanType, DataType, StructType} case class ReuseSubquery(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of subqueries to avoid O(N*N) sameResult calls. val subqueries = mutable.HashMap[StructType, ArrayBuffer[SubqueryExec]]() plan transformAllExpressions { case sub: ExecSubqueryExpression => val sameSchema = subqueries.getOrElseUpdate(sub.plan.schema, ArrayBuffer[SubqueryExec]()) val sameResult = sameSchema.find(_.sameResult(sub.plan)) if (sameResult.isDefined) { sub.withNewPlan(sameResult.get) } else { sameSchema += sub.plan sub } } } }
Example 66
Source File: GroupedIterator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Expression, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateOrdering, GenerateUnsafeProjection} object GroupedIterator { def apply( input: Iterator[InternalRow], keyExpressions: Seq[Expression], inputSchema: Seq[Attribute]): Iterator[(InternalRow, Iterator[InternalRow])] = { if (input.hasNext) { new GroupedIterator(input.buffered, keyExpressions, inputSchema) } else { Iterator.empty } } } def hasNext: Boolean = currentIterator != null || fetchNextGroupIterator def next(): (InternalRow, Iterator[InternalRow]) = { assert(hasNext) // Ensure we have fetched the next iterator. val ret = (keyProjection(currentGroup), currentIterator) currentIterator = null ret } private def fetchNextGroupIterator(): Boolean = { assert(currentIterator == null) if (currentRow == null && input.hasNext) { currentRow = input.next() } if (currentRow == null) { // These is no data left, return false. false } else { // Skip to next group. // currentRow may be overwritten by `hasNext`, so we should compare them first. while (keyOrdering.compare(currentGroup, currentRow) == 0 && input.hasNext) { currentRow = input.next() } if (keyOrdering.compare(currentGroup, currentRow) == 0) { // We are in the last group, there is no more groups, return false. false } else { // Now the `currentRow` is the first row of next group. currentGroup = currentRow.copy() currentIterator = createGroupValuesIterator() true } } } private def createGroupValuesIterator(): Iterator[InternalRow] = { new Iterator[InternalRow] { def hasNext: Boolean = currentRow != null || fetchNextRowInGroup() def next(): InternalRow = { assert(hasNext) val res = currentRow currentRow = null res } private def fetchNextRowInGroup(): Boolean = { assert(currentRow == null) if (input.hasNext) { // The inner iterator should NOT consume the input into next group, here we use `head` to // peek the next input, to see if we should continue to process it. if (keyOrdering.compare(currentGroup, input.head) == 0) { // Next input is in the current group. Continue the inner iterator. currentRow = input.next() true } else { // Next input is not in the right group. End this inner iterator. false } } else { // There is no more data, return false. false } } } } }
Example 67
Source File: AnnotationParser.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.parser import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.AbstractSparkSQLParser import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{AnnotationReference, Expression, Literal} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String protected def toTableMetadata(metadata: Map[String, Expression]): Metadata = { val res = new MetadataBuilder() metadata.foreach { case (k, v:Literal) => v.dataType match { case StringType => if (k.equals("?")) { sys.error("column metadata key can not be ?") } if (k.equals("*")) { sys.error("column metadata key can not be *") } res.putString(k, v.value.asInstanceOf[UTF8String].toString) case LongType => res.putLong(k, v.value.asInstanceOf[Long]) case DoubleType => res.putDouble(k, v.value.asInstanceOf[Double]) case NullType => res.putString(k, null) case a:ArrayType => res.putString(k, v.value.toString) } case (k, v:AnnotationReference) => sys.error("column metadata can not have a reference to another column metadata") } res.build() } }
Example 68
Source File: ERPCurrencyConversionExpression.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.currency.erp import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes} import org.apache.spark.sql.currency.CurrencyConversionException import org.apache.spark.sql.currency.erp.ERPConversionLoader.RConversionOptionsCurried import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import scala.util.control.NonFatal case class ERPCurrencyConversionExpression( conversionFunction: RConversionOptionsCurried, children: Seq[Expression]) extends Expression with ImplicitCastInputTypes with CodegenFallback { protected val CLIENT_INDEX = 0 protected val CONVERSION_TYPE_INDEX = 1 protected val AMOUNT_INDEX = 2 protected val FROM_INDEX = 3 protected val TO_INDEX = 4 protected val DATE_INDEX = 5 protected val NUM_ARGS = 6 protected val errorMessage = "Currency conversion library encountered an internal error" override def eval(input: InternalRow): Any = { val inputArguments = children.map(_.eval(input)) require(inputArguments.length == NUM_ARGS, "wrong number of arguments") // parse arguments val client = Option(inputArguments(CLIENT_INDEX).asInstanceOf[UTF8String]).map(_.toString) val conversionType = Option(inputArguments(CONVERSION_TYPE_INDEX).asInstanceOf[UTF8String]).map(_.toString) val amount = Option(inputArguments(AMOUNT_INDEX).asInstanceOf[Decimal].toJavaBigDecimal) val sourceCurrency = Option(inputArguments(FROM_INDEX).asInstanceOf[UTF8String]).map(_.toString) val targetCurrency = Option(inputArguments(TO_INDEX).asInstanceOf[UTF8String]).map(_.toString) val date = Option(inputArguments(DATE_INDEX).asInstanceOf[UTF8String]).map(_.toString) // perform conversion val conversion = conversionFunction(client, conversionType, sourceCurrency, targetCurrency, date) val resultTry = conversion(amount) // If 'resultTry' holds a 'Failure', we have to propagate it because potential failure // handling already took place. We just wrap it in case it is a cryptic error. resultTry.recover { case NonFatal(err) => throw new CurrencyConversionException(errorMessage, err) }.get.map(Decimal.apply).orNull } override def dataType: DataType = DecimalType.forType(DoubleType) override def nullable: Boolean = true override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, DecimalType, StringType, StringType, StringType) def inputNames: Seq[String] = Seq("client", "conversion_type", "amount", "source", "target", "date") def getChild(name: String): Option[Expression] = { inputNames.zip(children).find { case (n, _) => name == n }.map(_._2) } }
Example 69
Source File: BasicCurrencyConversionFunction.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.currency.basic import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.currency._ import org.apache.spark.sql.util.ValidatingPropertyMap._ import scala.util.Try protected object BasicCurrencyConversionConfig { private def updateRatesMapByTable(ratesTable: String, sqlContext: SQLContext): Unit = { val ratesTableData = sqlContext.sql(s"SELECT * FROM $ratesTable").collect() ratesTableData.foreach { row => val from = row.getString(0) val to = row.getString(1) val date = row.getString(2).replaceAll("-", "").toInt val rate = Try(row.getDecimal(3)).recover { case ex: ClassCastException => new java.math.BigDecimal(row.getDouble(3)) }.get ratesMap.put((from, to), date, rate) } } }
Example 70
Source File: HierarchyPlan.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst._ import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.LevelMatcher import org.apache.spark.sql.hierarchy._ import org.apache.spark.sql.types._ import org.apache.spark.sql.util.RddUtils val schemaWithNode = StructType(child.schema.fields ++ Seq(StructField("", NodeType, nullable = false))) val resultInternalRdd = RDDConversions.rowToRowRdd(cachedResultRdd, schemaWithNode.fields.map(_.dataType)) resultInternalRdd } } private[sql] case class AdjacencyListHierarchyPlan(child: SparkPlan, parenthoodExp: Expression, startWhere: Option[Expression], orderBy: Seq[SortOrder], node: Attribute, dataType: DataType) extends HierarchyPlan(child, node) { override protected val builder: HierarchyBuilder[Row, Row] = HierarchyRowBroadcastBuilder(child.output, parenthoodExp, startWhere, orderBy) override protected val pathDataType = dataType } private[sql] case class LevelHierarchyPlan(child: SparkPlan, levels: Seq[Expression], startWhere: Option[Expression], orderBy: Seq[SortOrder], matcher: LevelMatcher, node: Attribute, dataType: DataType) extends HierarchyPlan(child, node) { override protected val builder: HierarchyBuilder[Row, Row] = HierarchyRowLevelBasedBuilder( child.output, levels, startWhere, orderBy, matcher) override protected val pathDataType = dataType }
Example 71
Source File: FunctionBuilders.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.expressions.{Expression, BinaryExpression, UnaryExpression} import scala.reflect.ClassTag object FunctionBuilders { type ExpressionBuilder = Seq[Expression] => Expression def expression[T <: Expression](arity: Int)(implicit tag: ClassTag[T]): ExpressionBuilder = { val argTypes = (1 to arity).map(x => classOf[Expression]) val constructor = tag.runtimeClass.getDeclaredConstructor(argTypes: _*) (expressions: Seq[Expression]) => { if (expressions.size != arity) { throw new IllegalArgumentException( s"Invalid number of arguments: ${expressions.size} (must be equal to $arity)" ) } constructor.newInstance(expressions: _*).asInstanceOf[Expression] } } def unaryExpression[T <: UnaryExpression](implicit tag: ClassTag[T]): ExpressionBuilder = expression[T](1) def binaryExpression[T <: BinaryExpression](implicit tag: ClassTag[T]): ExpressionBuilder = expression[T](2) def reverse(expressionBuilder: ExpressionBuilder): ExpressionBuilder = (expressions: Seq[Expression]) => { expressionBuilder(expressions.reverse) } }
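A hedged usage sketch, assuming the FunctionBuilders object above is on the classpath; Add and Not are stock Catalyst expressions with the matching constructor arities:

import org.apache.spark.sql.FunctionBuilders._
import org.apache.spark.sql.catalyst.expressions.{Add, Expression, Literal, Not}

object FunctionBuildersDemo {
  def main(args: Array[String]): Unit = {
    // expression[T](arity) reflects on T's (Expression, ..., Expression) constructor,
    // so the builder rejects argument lists of the wrong size at call time.
    val addBuilder: Seq[Expression] => Expression = binaryExpression[Add]
    val notBuilder: Seq[Expression] => Expression = unaryExpression[Not]

    println(addBuilder(Seq(Literal(1), Literal(2)))) // an Add(1, 2) expression
    println(notBuilder(Seq(Literal(true))))          // a Not(true) expression
  }
}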
Example 72
Source File: MetadataAccessorSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import org.apache.spark.sql.catalyst.expressions.{Literal, Expression} import org.scalatest.FunSuite // scalastyle:off magic.number class MetadataAccessorSuite extends FunSuite { test("expression map is written correctly to Metadata") { val expressionMap = Map[String, Expression] ( "stringKey" -> Literal.create("stringValue", StringType), "longKey" -> Literal.create(10L, LongType), "doubleKey" -> Literal.create(1.234, DoubleType), "nullKey" -> Literal.create(null, NullType) ) val actual = MetadataAccessor.expressionMapToMetadata(expressionMap) assertResult("stringValue")(actual.getString("stringKey")) assertResult(10)(actual.getLong("longKey")) assertResult(1.234)(actual.getDouble("doubleKey")) assertResult(null)(actual.getString("nullKey")) } test("metadata propagation works correctly") { val oldMetadata = new MetadataBuilder() .putString("key1", "value1") .putString("key2", "value2") .putLong("key3", 10L) .build() val newMetadata = new MetadataBuilder() .putString("key1", "overriden") .putString("key4", "value4") .build() val expected = new MetadataBuilder() .putString("key1", "overriden") .putString("key2", "value2") .putLong("key3", 10L) .putString("key4", "value4") .build() val actual = MetadataAccessor.propagateMetadata(oldMetadata, newMetadata) assertResult(expected)(actual) } test("filter metadata works correctly") { val metadata = new MetadataBuilder() .putString("key1", "value1") .putString("key2", "value2") .putLong("key3", 10L) .build() val expected1 = new MetadataBuilder() .putString("key1", "value1") .build() assertResult(expected1)(MetadataAccessor.filterMetadata(metadata, ("key1" :: Nil).toSet)) assertResult(metadata)(MetadataAccessor.filterMetadata(metadata, ("*" :: Nil).toSet)) } }
Example 73
Source File: SqlBuilderSuiteBase.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.sql import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.DataType import org.scalatest.FunSuite import scala.util.matching.Regex trait SqlBuilderSuiteBase { self: FunSuite => val sqlBuilder: SqlBuilder // scalastyle:ignore def testExpressionToSql(sql: String)(expr: Expression): Unit = { val cleanSql = cleanUpSql(sql) test(s"expressionToSql: $cleanSql | with $expr") { assertResult(cleanSql)(sqlBuilder.expressionToSql(expr)) } } def testBuildSelect(sql: String) (i1: SqlLikeRelation, i2: Seq[String], i3: Seq[Filter]): Unit = { val cleanSql = cleanUpSql(sql) test(s"buildSelect: $cleanSql | with $i1 $i2 $i3") { assertResult(cleanSql)(sqlBuilder.buildSelect(i1, i2, i3)) } } def testLogicalPlan(sql: String)(plan: LogicalPlan): Unit = { val cleanSql = cleanUpSql(sql) test(s"logical plan: $cleanSql | with $plan") { assertResult(cleanSql)(sqlBuilder.logicalPlanToSql(plan)) } } def testLogicalPlanInternal(sql: String)(plan: LogicalPlan): Unit = { val cleanSql = cleanUpSql(sql) test(s"logical plan (internal): $cleanSql | with $plan") { assertResult(cleanSql)(sqlBuilder.internalLogicalPlanToSql(plan, noProject = true)) } } def testUnsupportedLogicalPlan(plan: LogicalPlan): Unit = { test(s"invalid logical plan: $plan") { intercept[RuntimeException] { sqlBuilder.logicalPlanToSql(plan) } } } private def cleanUpSql(q: String): String = q.replaceAll("\\s+", " ").trim def testUnsupportedLogicalPlanInternal(plan: LogicalPlan): Unit = { test(s"invalid logical plan (internal): $plan") { intercept[RuntimeException] { sqlBuilder.internalLogicalPlanToSql(plan) } } } def testGeneratedSqlDataType(expected: String)(dataType: DataType): Unit = { test(s"The generated sql type for ${dataType.simpleString} is $expected") { val generated = sqlBuilder.typeToSql(dataType) assertResult(expected)(generated) } } }
Example 74
Source File: DummyRelationUtils.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.{ColumnName, Row, SQLContext} import org.apache.spark.sql.sources._ import org.apache.spark.sql.sources.sql.SqlLikeRelation import org.apache.spark.sql.types.{StructField, StructType} case class DummyCatalystSourceRelation( schema: StructType, isMultiplePartitionExecutionFunc: Option[Seq[CatalystSource] => Boolean] = None, supportsLogicalPlanFunc: Option[LogicalPlan => Boolean] = None, supportsExpressionFunc: Option[Expression => Boolean] = None, logicalPlanToRDDFunc: Option[LogicalPlan => RDD[Row]] = None) (@transient implicit val sqlContext: SQLContext) extends BaseRelation with CatalystSource { override def isMultiplePartitionExecution(relations: Seq[CatalystSource]): Boolean = isMultiplePartitionExecutionFunc.forall(_.apply(relations)) override def supportsLogicalPlan(plan: LogicalPlan): Boolean = supportsLogicalPlanFunc.forall(_.apply(plan)) override def supportsExpression(expr: Expression): Boolean = supportsExpressionFunc.forall(_.apply(expr)) override def logicalPlanToRDD(plan: LogicalPlan): RDD[Row] = logicalPlanToRDDFunc.getOrElse( (plan: LogicalPlan) => new LogicalPlanRDD(plan, sqlContext.sparkContext)).apply(plan) } }
Example 75
Source File: ColumnarShuffledHashJoinExec.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.sparkColumnarPlugin.execution import java.util.concurrent.TimeUnit._ import com.intel.sparkColumnarPlugin.vectorized._ import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, CodegenSupport, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import scala.collection.JavaConverters._ import org.apache.spark.internal.Logging import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} import scala.collection.mutable.ListBuffer import org.apache.arrow.vector.ipc.message.ArrowFieldNode import org.apache.arrow.vector.ipc.message.ArrowRecordBatch import org.apache.arrow.vector.types.pojo.ArrowType import org.apache.arrow.vector.types.pojo.Field import org.apache.arrow.vector.types.pojo.Schema import org.apache.arrow.gandiva.expression._ import org.apache.arrow.gandiva.evaluator._ import io.netty.buffer.ArrowBuf import com.google.common.collect.Lists; import com.intel.sparkColumnarPlugin.expression._ import com.intel.sparkColumnarPlugin.vectorized.ExpressionEvaluator import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} class ColumnarShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends ShuffledHashJoinExec( leftKeys, rightKeys, joinType, buildSide, condition, left, right) { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "joinTime" -> SQLMetrics.createTimingMetric(sparkContext, "join time"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def supportsColumnar = true //TODO() Disable code generation //override def supportCodegen: Boolean = false override def doExecuteColumnar(): RDD[ColumnarBatch] = { val numOutputRows = longMetric("numOutputRows") val joinTime = longMetric("joinTime") val buildTime = longMetric("buildTime") val resultSchema = this.schema streamedPlan.executeColumnar().zipPartitions(buildPlan.executeColumnar()) { (streamIter, buildIter) => //val hashed = buildHashedRelation(buildIter) //join(streamIter, hashed, numOutputRows) val vjoin = ColumnarShuffledHashJoin.create(leftKeys, rightKeys, resultSchema, joinType, buildSide, condition, left, right, buildTime, joinTime, numOutputRows) val vjoinResult = vjoin.columnarInnerJoin(streamIter, buildIter) TaskContext.get().addTaskCompletionListener[Unit](_ => { vjoin.close() }) new CloseableColumnBatchIterator(vjoinResult) } } }
Example 76
Source File: FileSourceScanExecAdapter.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.oap.adapter import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.HadoopFsRelation import org.apache.spark.sql.types.StructType import org.apache.spark.util.collection.BitSet object FileSourceScanExecAdapter { def createFileSourceScanExec( relation: HadoopFsRelation, output: Seq[Attribute], requiredSchema: StructType, partitionFilters: Seq[Expression], optionalBucketSets: Option[BitSet], dataFilters: Seq[Expression], metastoreTableIdentifier: Option[TableIdentifier]): FileSourceScanExec = { FileSourceScanExec( relation, output, requiredSchema, partitionFilters, optionalBucketSets, dataFilters, metastoreTableIdentifier) } }
Example 77
Source File: OapAggUtils.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Final, Partial} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources.oap.OapAggregationFileScanExec object OapAggUtils { private def createAggregate( requiredChildDistributionExpressions: Option[Seq[Expression]] = None, groupingExpressions: Seq[NamedExpression] = Nil, aggregateExpressions: Seq[AggregateExpression] = Nil, aggregateAttributes: Seq[Attribute] = Nil, initialInputBufferOffset: Int = 0, resultExpressions: Seq[NamedExpression] = Nil, child: SparkPlan): SparkPlan = { if (requiredChildDistributionExpressions.isDefined) { // final aggregate, fall back to Spark HashAggregateExec. HashAggregateExec( requiredChildDistributionExpressions = requiredChildDistributionExpressions, groupingExpressions = groupingExpressions, aggregateExpressions = aggregateExpressions, aggregateAttributes = aggregateAttributes, initialInputBufferOffset = initialInputBufferOffset, resultExpressions = resultExpressions, child = child) } else { // Apply partial aggregate optimizations. OapAggregateExec( requiredChildDistributionExpressions = None, groupingExpressions = groupingExpressions, aggregateExpressions = aggregateExpressions, aggregateAttributes = aggregateAttributes, initialInputBufferOffset = initialInputBufferOffset, resultExpressions = resultExpressions, child = child) } } def planAggregateWithoutDistinct( groupingExpressions: Seq[NamedExpression], aggregateExpressions: Seq[AggregateExpression], resultExpressions: Seq[NamedExpression], child: SparkPlan): Seq[SparkPlan] = { val useHash = HashAggregateExec.supportsAggregate( aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)) if (!child.isInstanceOf[OapAggregationFileScanExec] || !useHash) { // Child can not leverage oap optimization reading. Nil } else { // 1. Create an Aggregate Operator for partial aggregations. val groupingAttributes = groupingExpressions.map(_.toAttribute) val partialAggregateExpressions = aggregateExpressions.map(_.copy(mode = Partial)) val partialAggregateAttributes = partialAggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) val partialResultExpressions = groupingAttributes ++ partialAggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes) val partialAggregate = createAggregate( requiredChildDistributionExpressions = None, groupingExpressions = groupingExpressions, aggregateExpressions = partialAggregateExpressions, aggregateAttributes = partialAggregateAttributes, initialInputBufferOffset = 0, resultExpressions = partialResultExpressions, child = child) // 2. Create an Aggregate Operator for final aggregations. val finalAggregateExpressions = aggregateExpressions.map(_.copy(mode = Final)) // The attributes of the final aggregation buffer, which is presented as input to the result // projection: val finalAggregateAttributes = finalAggregateExpressions.map(_.resultAttribute) val finalAggregate = createAggregate( requiredChildDistributionExpressions = Some(groupingAttributes), groupingExpressions = groupingAttributes, aggregateExpressions = finalAggregateExpressions, aggregateAttributes = finalAggregateAttributes, initialInputBufferOffset = groupingExpressions.length, resultExpressions = resultExpressions, child = partialAggregate) finalAggregate :: Nil } } }
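The Partial/Final split that planAggregateWithoutDistinct builds mirrors what Spark does for any grouped aggregation. A small sketch of how to observe it (demo names are illustrative):

import org.apache.spark.sql.SparkSession

object TwoPhaseAggDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("two-phase-agg-demo").getOrCreate()
    import spark.implicits._

    val agg = Seq(("a", 1), ("a", 2), ("b", 3)).toDF("k", "v").groupBy("k").sum("v")

    // The physical plan typically shows HashAggregate(functions=[partial_sum(v)])
    // feeding HashAggregate(functions=[sum(v)]) through an Exchange on k.
    agg.explain()
    agg.show()
    spark.stop()
  }
}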
Example 78
Source File: ArrowScan.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.v2.arrow import scala.collection.JavaConverters._ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.connector.read.PartitionReaderFactory import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.v2.FileScan import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.SerializableConfiguration case class ArrowScan( sparkSession: SparkSession, fileIndex: PartitioningAwareFileIndex, readDataSchema: StructType, readPartitionSchema: StructType, pushedFilters: Array[Filter], options: CaseInsensitiveStringMap, partitionFilters: Seq[Expression] = Seq.empty, dataFilters: Seq[Expression] = Seq.empty) extends FileScan { override def createReaderFactory(): PartitionReaderFactory = { val caseSensitiveMap = options.asCaseSensitiveMap().asScala.toMap val hconf = sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) val broadcastedConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hconf)) ArrowPartitionReaderFactory( sparkSession.sessionState.conf, broadcastedConf, readDataSchema, readPartitionSchema, pushedFilters, new ArrowOptions(options.asScala.toMap)) } override def withFilters(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan = this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) }
Example 79
Source File: SparkExtension.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas.sql import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{SparkSession, SparkSessionExtensions} class SparkExtension extends (SparkSessionExtensions => Unit) { def apply(e: SparkSessionExtensions): Unit = { e.injectParser(SparkAtlasConnectorParser) } } case class SparkAtlasConnectorParser(spark: SparkSession, delegate: ParserInterface) extends ParserInterface { override def parsePlan(sqlText: String): LogicalPlan = { SQLQuery.set(sqlText) delegate.parsePlan(sqlText) } override def parseExpression(sqlText: String): Expression = delegate.parseExpression(sqlText) override def parseTableIdentifier(sqlText: String): TableIdentifier = delegate.parseTableIdentifier(sqlText) override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = delegate.parseFunctionIdentifier(sqlText) override def parseTableSchema(sqlText: String): StructType = delegate.parseTableSchema(sqlText) override def parseDataType(sqlText: String): DataType = delegate.parseDataType(sqlText) } object SQLQuery { private[this] val sqlQuery = new ThreadLocal[String] def get(): String = sqlQuery.get def set(s: String): Unit = sqlQuery.set(s) }
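One way to activate an extension like this is the standard spark.sql.extensions configuration; a minimal sketch, assuming the connector jar is on the driver classpath (demo names are illustrative):

import org.apache.spark.sql.SparkSession

object AtlasExtensionDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("atlas-extension-demo")
      .config("spark.sql.extensions", "com.hortonworks.spark.atlas.sql.SparkExtension")
      .getOrCreate()

    // Every statement parsed by this session now stores its SQL text in SQLQuery's
    // thread-local before delegating to the default parser.
    spark.sql("SELECT 1 AS x").show()
    spark.stop()
  }
}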
Example 80
Source File: SQLBuilderTest.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst import scala.util.control.NonFatal import org.apache.spark.sql.{DataFrame, Dataset, QueryTest} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.hive.test.TestHiveSingleton abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton { protected def checkSQL(e: Expression, expectedSQL: String): Unit = { val actualSQL = e.sql try { assert(actualSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following expression: | |${e.prettyName} | |$cause """.stripMargin) } } protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = { val generatedSQL = try new SQLBuilder(plan).toSQL catch { case NonFatal(e) => fail( s"""Cannot convert the following logical query plan to SQL: | |${plan.treeString} """.stripMargin) } try { assert(generatedSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following logical query plan: | |${plan.treeString} | |$cause """.stripMargin) } checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan)) } protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = { checkSQL(df.queryExecution.analyzed, expectedSQL) } }
Example 81
Source File: ScriptTransformation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 82
Source File: EventTimeWatermark.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval

object EventTimeWatermark {
  // The object body was cut off in the extracted snippet; the metadata key used below
  // ("spark.watermarkDelayMs") is restored so EventTimeWatermark.delayKey resolves.
  val delayKey = "spark.watermarkDelayMs"
}

case class EventTimeWatermark(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: LogicalPlan) extends LogicalPlan {

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override val children: Seq[LogicalPlan] = child :: Nil
}
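This logical node is produced by Dataset.withWatermark. A brief sketch, assuming a Spark build that ships the built-in rate streaming source (2.2+); names are illustrative:

import org.apache.spark.sql.SparkSession

object WatermarkDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("watermark-demo").getOrCreate()

    val events = spark.readStream.format("rate").load()   // columns: timestamp, value
      .withWatermark("timestamp", "10 minutes")            // adds an EventTimeWatermark node

    // The delay ends up in the timestamp column's metadata under EventTimeWatermark.delayKey.
    events.printSchema()
    spark.stop()
  }
}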
Example 83
Source File: CodegenFallback.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, Nondeterministic} trait CodegenFallback extends Expression { protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // LeafNode does not need `input` val input = if (this.isInstanceOf[LeafExpression]) "null" else ctx.INPUT_ROW val idx = ctx.references.length ctx.references += this var childIndex = idx this.foreach { case n: Nondeterministic => // This might add the current expression twice, but it won't hurt. ctx.references += n childIndex += 1 ctx.addPartitionInitializationStatement( s""" |((Nondeterministic) references[$childIndex]) | .initialize(partitionIndex); """.stripMargin) case _ => } val objectTerm = ctx.freshName("obj") val placeHolder = ctx.registerComment(this.toString) if (nullable) { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); boolean ${ev.isNull} = $objectTerm == null; ${ctx.javaType(this.dataType)} ${ev.value} = ${ctx.defaultValue(this.dataType)}; if (!${ev.isNull}) { ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; }""") } else { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); ${ctx.javaType(this.dataType)} ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; """, isNull = "false") } } }
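A hypothetical expression that leans on this trait: only the interpreted eval path is written, and doGenCode comes from CodegenFallback. PlusOne is made up for illustration:

import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.types.{DataType, IntegerType}

case class PlusOne(child: Expression) extends UnaryExpression with CodegenFallback {
  override def dataType: DataType = IntegerType
  // Called only when child.eval(...) is non-null; generated code reaches it via the
  // fallback doGenCode above, which simply calls back into eval.
  override protected def nullSafeEval(input: Any): Any = input.asInstanceOf[Int] + 1
}

object PlusOneDemo {
  def main(args: Array[String]): Unit = {
    println(PlusOne(Literal(41)).eval()) // 42 via interpreted evaluation
  }
}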
Example 84
Source File: SubstituteUnresolvedOrdinals.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.CatalystConf import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Sort} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin import org.apache.spark.sql.types.IntegerType class SubstituteUnresolvedOrdinals(conf: CatalystConf) extends Rule[LogicalPlan] { private def isIntLiteral(e: Expression) = e match { case Literal(_, IntegerType) => true case _ => false } def apply(plan: LogicalPlan): LogicalPlan = plan transform { case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) => val newOrders = s.order.map { case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _) => val newOrdinal = withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) withOrigin(order.origin)(order.copy(child = newOrdinal)) case other => other } withOrigin(s.origin)(s.copy(order = newOrders)) case a: Aggregate if conf.groupByOrdinal && a.groupingExpressions.exists(isIntLiteral) => val newGroups = a.groupingExpressions.map { case ordinal @ Literal(index: Int, IntegerType) => withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) case other => other } withOrigin(a.origin)(a.copy(groupingExpressions = newGroups)) } }
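The ordinal syntax this rule rewrites looks like the following: GROUP BY 1 and ORDER BY 2 refer to positions in the select list, and the integer literals are turned into UnresolvedOrdinal nodes before analysis resolves them (demo names are illustrative):

import org.apache.spark.sql.SparkSession

object OrdinalDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ordinal-demo").getOrCreate()
    import spark.implicits._

    Seq(("a", 1), ("a", 2), ("b", 3)).toDF("k", "v").createOrReplaceTempView("t")
    // GROUP BY 1 means "group by k", ORDER BY 2 means "order by total".
    spark.sql("SELECT k, sum(v) AS total FROM t GROUP BY 1 ORDER BY 2 DESC").show()
    spark.stop()
  }
}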
Example 85
Source File: RuleExecutorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } val message = intercept[TreeNodeException[LogicalPlan]] { ToFixedPoint.execute(Literal(100)) }.getMessage assert(message.contains("Max iterations (10) reached for batch fixedPoint")) } }
Example 86
Source File: ShuffledHashJoinExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
Example 87
Source File: CartesianProductExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { // We will not sort the rows, so prefixComparator and recordComparator are null. val sorter = UnsafeExternalSorter.create( context.taskMemoryManager(), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, context, null, null, 1024, SparkEnv.get.memoryManager.pageSizeBytes, SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD), false) val partition = split.asInstanceOf[CartesianPartition] for (y <- rdd2.iterator(partition.s2, context)) { sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false) } // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow] def createIter(): Iterator[UnsafeRow] = { val iter = sorter.getIterator val unsafeRow = new UnsafeRow(numFieldsOfRight) new Iterator[UnsafeRow] { override def hasNext: Boolean = { iter.hasNext } override def next(): UnsafeRow = { iter.loadNext() unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength) unsafeRow } } } val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, sorter.cleanupResources()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 88
Source File: package.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.mv import scala.language.implicitConversions import org.apache.spark.sql.catalyst._ import org.apache.spark.sql.catalyst.expressions.{Expression, NamedExpression} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.carbondata.mv.plans._ import org.apache.carbondata.mv.plans.modular.{JoinEdge, ModularPlan} import org.apache.carbondata.mv.plans.modular.Flags._ import org.apache.carbondata.mv.plans.util._ package object dsl { object Plans { implicit class DslModularPlan(val modularPlan: ModularPlan) { def select(outputExprs: NamedExpression*) (inputExprs: Expression*) (predicateExprs: Expression*) (aliasMap: Map[Int, String]) (joinEdges: JoinEdge*): ModularPlan = { modular .Select( outputExprs, inputExprs, predicateExprs, aliasMap, joinEdges, Seq(modularPlan), NoFlags, Seq.empty, Seq.empty) } def groupBy(outputExprs: NamedExpression*) (inputExprs: Expression*) (predicateExprs: Expression*): ModularPlan = { modular .GroupBy(outputExprs, inputExprs, predicateExprs, None, modularPlan, NoFlags, Seq.empty) } def harmonize: ModularPlan = modularPlan.harmonized } implicit class DslModularPlans(val modularPlans: Seq[ModularPlan]) { def select(outputExprs: NamedExpression*) (inputExprs: Expression*) (predicateList: Expression*) (aliasMap: Map[Int, String]) (joinEdges: JoinEdge*): ModularPlan = { modular .Select( outputExprs, inputExprs, predicateList, aliasMap, joinEdges, modularPlans, NoFlags, Seq.empty, Seq.empty) } def union(): ModularPlan = modular.Union(modularPlans, NoFlags, Seq.empty) } implicit class DslLogical2Modular(val logicalPlan: LogicalPlan) { def resolveonly: LogicalPlan = analysis.SimpleAnalyzer.execute(logicalPlan) def modularize: ModularPlan = modular.SimpleModularizer.modularize(logicalPlan).next def optimize: LogicalPlan = BirdcageOptimizer.execute(logicalPlan) } } }
Example 89
Source File: package.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.mv import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{AttributeSet, Expression, PredicateHelper, ScalaUDF} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.carbondata.mv.plans.modular.ModularPlan import org.apache.carbondata.mv.plans.util.{CheckSPJG, LogicalPlanSignatureGenerator, Signature} def canEvaluate(exp: ScalaUDF, exprList: Seq[Expression]): Boolean = { var canBeDerived = false exprList.forall { case udf: ScalaUDF => if (udf.children.length == exp.children.length) { if (udf.children.zip(exp.children).forall(e => e._1.sql.equalsIgnoreCase(e._2.sql))) { canBeDerived = true } } canBeDerived case _ => canBeDerived } } def canEvaluate(expr: Expression, exprList: Seq[Expression]): Boolean = { expr match { case exp: ScalaUDF => canEvaluate(exp, exprList) case _ => expr.references.subsetOf(AttributeSet(exprList)) } } } def supports(supported: Boolean, message: Any) { if (!supported) { throw new UnsupportedOperationException(s"unsupported operation: $message") } } }
Example 90
Source File: ExpressionHelper.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.mv.plans.modular import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId, Expression, NamedExpression} import org.apache.spark.sql.types.{DataType, Metadata} object ExpressionHelper { def createReference( name: String, dataType: DataType, nullable: Boolean, metadata: Metadata, exprId: ExprId, qualifier: Option[String], attrRef : NamedExpression = null): AttributeReference = { AttributeReference(name, dataType, nullable, metadata)(exprId, qualifier) } def createAlias( child: Expression, name: String, exprId: ExprId = NamedExpression.newExprId, qualifier: Option[String] = None, explicitMetadata: Option[Metadata] = None, namedExpr : Option[NamedExpression] = None ) : Alias = { Alias(child, name)(exprId, qualifier, explicitMetadata) } def getTheLastQualifier(reference: AttributeReference): String = { reference.qualifier.head } }
Example 91
Source File: ExpressionHelper.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.mv.plans.modular import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId, Expression, NamedExpression} import org.apache.spark.sql.types.{DataType, Metadata} object ExpressionHelper { def createReference( name: String, dataType: DataType, nullable: Boolean, metadata: Metadata, exprId: ExprId, qualifier: Option[String], attrRef : NamedExpression = null): AttributeReference = { val qf = if (qualifier.nonEmpty) Seq(qualifier.get) else Seq.empty AttributeReference(name, dataType, nullable, metadata)(exprId, qf) } def createAlias( child: Expression, name: String, exprId: ExprId, qualifier: Option[String]) : Alias = { val qf = if (qualifier.nonEmpty) Seq(qualifier.get) else Seq.empty Alias(child, name)(exprId, qf, None) } def getTheLastQualifier(reference: AttributeReference): String = { reference.qualifier.reverse.head } }
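A hedged usage sketch for this variant of the helper (the column and qualifier names are illustrative):

import org.apache.spark.sql.catalyst.expressions.NamedExpression
import org.apache.spark.sql.types.{Metadata, StringType}

val ref = ExpressionHelper.createReference(
  "name", StringType, true, Metadata.empty,   // nullable = true
  NamedExpression.newExprId, Some("t1"))
val aliased = ExpressionHelper.createAlias(
  ref, "customer_name", NamedExpression.newExprId, Some("t1"))

ExpressionHelper.getTheLastQualifier(ref) // "t1"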
Example 92
Source File: CarbonBoundReference.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.sources.Filter case class CastExpr(expr: Expression) extends Filter { override def references: Array[String] = null } case class FalseExpr() extends Filter { override def references: Array[String] = null } case class CarbonEndsWith(expr: Expression) extends Filter { override def references: Array[String] = null } case class CarbonContainsWith(expr: Expression) extends Filter { override def references: Array[String] = null }
Example 93
Source File: CleanFilesPostEventListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.secondaryindex.events import scala.collection.JavaConverters._ import org.apache.log4j.Logger import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.index.CarbonIndexUtil import org.apache.spark.sql.optimizer.CarbonFilters import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.indexstore.PartitionSpec import org.apache.carbondata.core.mutate.CarbonUpdateUtil import org.apache.carbondata.core.statusmanager.SegmentStatusManager import org.apache.carbondata.events.{CleanFilesPostEvent, Event, OperationContext, OperationEventListener} class CleanFilesPostEventListener extends OperationEventListener with Logging { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) override def onEvent(event: Event, operationContext: OperationContext): Unit = { event match { case cleanFilesPostEvent: CleanFilesPostEvent => LOGGER.info("Clean files post event listener called") val carbonTable = cleanFilesPostEvent.carbonTable val indexTables = CarbonIndexUtil .getIndexCarbonTables(carbonTable, cleanFilesPostEvent.sparkSession) indexTables.foreach { indexTable => val partitions: Option[Seq[PartitionSpec]] = CarbonFilters.getPartitions( Seq.empty[Expression], cleanFilesPostEvent.sparkSession, indexTable) SegmentStatusManager.deleteLoadsAndUpdateMetadata( indexTable, true, partitions.map(_.asJava).orNull) CarbonUpdateUtil.cleanUpDeltaFiles(indexTable, true) } } } }
Example 94
Source File: MergeProjection.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.mutation.merge import java.sql.{Date, Timestamp} import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, GenericRowWithSchema, InterpretedMutableProjection, Projection} import org.apache.spark.sql.catalyst.util.DateTimeUtils case class MergeProjection( @transient tableCols: Seq[String], @transient statusCol : String, @transient ds: Dataset[Row], @transient rltn: CarbonDatasourceHadoopRelation, @transient sparkSession: SparkSession, @transient mergeAction: MergeAction) { private val cutOffDate = Integer.MAX_VALUE >> 1 val isUpdate = mergeAction.isInstanceOf[UpdateAction] val isDelete = mergeAction.isInstanceOf[DeleteAction] def apply(row: GenericRowWithSchema): InternalRow = { // TODO we can avoid these multiple conversions if this is added as a SparkPlan node. val values = row.values.map { case s: String => org.apache.spark.unsafe.types.UTF8String.fromString(s) case d: java.math.BigDecimal => org.apache.spark.sql.types.Decimal.apply(d) case b: Array[Byte] => org.apache.spark.unsafe.types.UTF8String.fromBytes(b) case d: Date => DateTimeUtils.fromJavaDate(d) case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t) case value => value } projection(new GenericInternalRow(values)).asInstanceOf[GenericInternalRow] } val (projection, output) = generateProjection private def generateProjection: (Projection, Array[Expression]) = { val existingDsOutput = rltn.carbonRelation.schema.toAttributes val colsMap = mergeAction match { case UpdateAction(updateMap) => updateMap case InsertAction(insertMap) => insertMap case _ => null } if (colsMap != null) { val output = new Array[Expression](tableCols.length) val expecOutput = new Array[Expression](tableCols.length) colsMap.foreach { case (k, v) => val tableIndex = tableCols.indexOf(k.toString().toLowerCase) if (tableIndex < 0) { throw new CarbonMergeDataSetException(s"Mapping is wrong $colsMap") } output(tableIndex) = v.expr.transform { case a: Attribute if !a.resolved => ds.queryExecution.analyzed.resolveQuoted(a.name, sparkSession.sessionState.analyzer.resolver).get } expecOutput(tableIndex) = existingDsOutput.find(_.name.equalsIgnoreCase(tableCols(tableIndex))).get } if (output.contains(null)) { throw new CarbonMergeDataSetException(s"Not all columns are mapped") } (new InterpretedMutableProjection(output++Seq( ds.queryExecution.analyzed.resolveQuoted(statusCol, sparkSession.sessionState.analyzer.resolver).get), ds.queryExecution.analyzed.output), expecOutput) } else { (null, null) } } }
Example 95
Source File: CarbonExpressions.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Expression, ScalaUDF}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.execution.command.DescribeTableCommand
import org.apache.spark.sql.types.DataType

// Extractor for ScalaUDF expressions; the listing shows only this part of
// CarbonExpressions.scala, so the surrounding declarations are omitted here.
object CarbonScalaUDF {
  def unapply(expression: Expression): Option[(ScalaUDF)] = {
    expression match {
      case a: ScalaUDF => Some(a)
      case _ => None
    }
  }
}
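A minimal sketch of how such an extractor is typically used when walking an expression tree (collectScalaUDFs is a hypothetical helper, not part of the example):

import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF}

def collectScalaUDFs(e: Expression): Seq[ScalaUDF] = e.collect {
  case CarbonScalaUDF(udf) => udf
}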
Example 96
Source File: SparkSqlAdapter.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.HadoopFsRelation import org.apache.spark.sql.types.StructType import org.apache.carbondata.core.util.ThreadLocalSessionInfo object SparkSqlAdapter { def initSparkSQL(): Unit = { } def getScanForSegments( @transient relation: HadoopFsRelation, output: Seq[Attribute], outputSchema: StructType, partitionFilters: Seq[Expression], dataFilters: Seq[Expression], tableIdentifier: Option[TableIdentifier] ): FileSourceScanExec = { FileSourceScanExec( relation, output, outputSchema, partitionFilters, dataFilters, tableIdentifier) } def addSparkSessionListener(sparkSession: SparkSession): Unit = { sparkSession.sparkContext.addSparkListener(new SparkListener { override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { CarbonEnv.carbonEnvMap.remove(sparkSession) ThreadLocalSessionInfo.unsetAll() } }) } }
Example 97
Source File: SparkSqlAdapter.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.HadoopFsRelation import org.apache.spark.sql.types.StructType object SparkSqlAdapter { def initSparkSQL(): Unit = { } def getScanForSegments( @transient relation: HadoopFsRelation, output: Seq[Attribute], outputSchema: StructType, partitionFilters: Seq[Expression], dataFilters: Seq[Expression], tableIdentifier: Option[TableIdentifier] ): FileSourceScanExec = { FileSourceScanExec( relation, output, outputSchema, partitionFilters, None, dataFilters, tableIdentifier) } }
Example 98
Source File: DeltaSourceUtils.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.sources import java.util.Locale import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.sources import org.apache.spark.sql.sources.Filter object DeltaSourceUtils { val NAME = "delta" val ALT_NAME = "delta" // Batch relations don't pass partitioning columns to `CreatableRelationProvider`s, therefore // as a hack, we pass in the partitioning columns among the options. val PARTITIONING_COLUMNS_KEY = "__partition_columns" def isDeltaDataSourceName(name: String): Boolean = { name.toLowerCase(Locale.ROOT) == NAME || name.toLowerCase(Locale.ROOT) == ALT_NAME } def translateFilters(filters: Array[Filter]): Expression = filters.map { case sources.EqualTo(attribute, value) => expressions.EqualTo(UnresolvedAttribute(attribute), expressions.Literal.create(value)) case sources.EqualNullSafe(attribute, value) => expressions.EqualNullSafe(UnresolvedAttribute(attribute), expressions.Literal.create(value)) case sources.GreaterThan(attribute, value) => expressions.GreaterThan(UnresolvedAttribute(attribute), expressions.Literal.create(value)) case sources.GreaterThanOrEqual(attribute, value) => expressions.GreaterThanOrEqual( UnresolvedAttribute(attribute), expressions.Literal.create(value)) case sources.LessThan(attribute, value) => expressions.LessThanOrEqual(UnresolvedAttribute(attribute), expressions.Literal.create(value)) case sources.LessThanOrEqual(attribute, value) => expressions.LessThanOrEqual(UnresolvedAttribute(attribute), expressions.Literal.create(value)) case sources.In(attribute, values) => expressions.In(UnresolvedAttribute(attribute), values.map(createLiteral)) case sources.IsNull(attribute) => expressions.IsNull(UnresolvedAttribute(attribute)) case sources.IsNotNull(attribute) => expressions.IsNotNull(UnresolvedAttribute(attribute)) case sources.Not(otherFilter) => expressions.Not(translateFilters(Array(otherFilter))) case sources.And(filter1, filter2) => expressions.And(translateFilters(Array(filter1)), translateFilters(Array(filter2))) case sources.Or(filter1, filter2) => expressions.Or(translateFilters(Array(filter1)), translateFilters(Array(filter2))) case sources.StringStartsWith(attribute, value) => new expressions.Like( UnresolvedAttribute(attribute), expressions.Literal.create(s"${value}%")) case sources.StringEndsWith(attribute, value) => new expressions.Like( UnresolvedAttribute(attribute), expressions.Literal.create(s"%${value}")) case sources.StringContains(attribute, value) => new expressions.Like( UnresolvedAttribute(attribute), expressions.Literal.create(s"%${value}%")) case sources.AlwaysTrue() => expressions.Literal.TrueLiteral case sources.AlwaysFalse() => expressions.Literal.FalseLiteral }.reduce(expressions.And) }
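A short usage sketch (the column names and values are made up): translateFilters folds a non-empty array of data source Filters into a single unresolved catalyst predicate.

import org.apache.spark.sql.delta.sources.DeltaSourceUtils
import org.apache.spark.sql.sources

val pushed: Array[sources.Filter] = Array(
  sources.EqualTo("country", "DE"),
  sources.GreaterThan("age", 21),
  sources.IsNotNull("name"))

val predicate = DeltaSourceUtils.translateFilters(pushed)
predicate.toString // roughly: (('country = DE) && ('age > 21)) && isnotnull('name)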
Example 99
Source File: AnalysisHelper.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.util import org.apache.spark.sql.delta.DeltaErrors import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan trait AnalysisHelper { import AnalysisHelper._ protected def tryResolveReferences( sparkSession: SparkSession)( expr: Expression, planContainingExpr: LogicalPlan): Expression = { val newPlan = FakeLogicalPlan(expr, planContainingExpr.children) sparkSession.sessionState.analyzer.execute(newPlan) match { case FakeLogicalPlan(resolvedExpr, _) => // Return even if it did not successfully resolve return resolvedExpr case _ => // This is unexpected throw DeltaErrors.analysisException( s"Could not resolve expression $expr", plan = Option(planContainingExpr)) } } protected def toDataset(sparkSession: SparkSession, logicalPlan: LogicalPlan): Dataset[Row] = { Dataset.ofRows(sparkSession, logicalPlan) } protected def improveUnsupportedOpError(f: => Unit): Unit = { val possibleErrorMsgs = Seq( "is only supported with v2 table", // full error: DELETE is only supported with v2 tables "is not supported temporarily", // full error: UPDATE TABLE is not supported temporarily "Table does not support read", "Table implementation does not support writes" ).map(_.toLowerCase()) def isExtensionOrCatalogError(error: Exception): Boolean = { possibleErrorMsgs.exists(m => error.getMessage().toLowerCase().contains(m)) } try { f } catch { case e: Exception if isExtensionOrCatalogError(e) => throw DeltaErrors.configureSparkSessionWithExtensionAndCatalog(e) } } } object AnalysisHelper { case class FakeLogicalPlan(expr: Expression, children: Seq[LogicalPlan]) extends LogicalPlan { override def output: Seq[Attribute] = Nil } }
Example 100
Source File: DeltaInvariantCheckerExec.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.schema import org.apache.spark.sql.delta.DeltaErrors import org.apache.spark.sql.delta.schema.Invariants.NotNull import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BindReferences, Expression, GetStructField, Literal, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.{NullType, StructType} private def buildExtractors(invariant: Invariant): Option[Expression] = { assert(invariant.column.nonEmpty) val topLevelColumn = invariant.column.head val topLevelRefOpt = output.collectFirst { case a: AttributeReference if SchemaUtils.DELTA_COL_RESOLVER(a.name, topLevelColumn) => a } val rejectColumnNotFound = isNullNotOkay(invariant) if (topLevelRefOpt.isEmpty) { if (rejectColumnNotFound) { throw DeltaErrors.notNullInvariantException(invariant) } } if (invariant.column.length == 1) { topLevelRefOpt.map(BindReferences.bindReference[Expression](_, output)) } else { topLevelRefOpt.flatMap { topLevelRef => val boundTopLevel = BindReferences.bindReference[Expression](topLevelRef, output) try { val nested = invariant.column.tail.foldLeft(boundTopLevel) { case (e, fieldName) => e.dataType match { case StructType(fields) => val ordinal = fields.indexWhere(f => SchemaUtils.DELTA_COL_RESOLVER(f.name, fieldName)) if (ordinal == -1) { throw new IndexOutOfBoundsException(s"Not nullable column not found in struct: " + s"${fields.map(_.name).mkString("[", ",", "]")}") } GetStructField(e, ordinal, Some(fieldName)) case _ => throw new UnsupportedOperationException( "Invariants on nested fields other than StructTypes are not supported.") } } Some(nested) } catch { case i: IndexOutOfBoundsException if rejectColumnNotFound => throw InvariantViolationException(invariant, i.getMessage) case _: IndexOutOfBoundsException if !rejectColumnNotFound => None } } } } override protected def doExecute(): RDD[InternalRow] = { if (invariants.isEmpty) return child.execute() val boundRefs = invariants.map { invariant => CheckDeltaInvariant(buildExtractors(invariant).getOrElse(Literal(null, NullType)), invariant) } child.execute().mapPartitionsInternal { rows => val assertions = GenerateUnsafeProjection.generate(boundRefs) rows.map { row => assertions(row) row } } } override def outputOrdering: Seq[SortOrder] = child.outputOrdering override def outputPartitioning: Partitioning = child.outputPartitioning }
Example 101
Source File: CheckDeltaInvariant.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.schema import org.apache.spark.sql.delta.schema.Invariants.{ArbitraryExpression, NotNull} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{Expression, NonSQLExpression, UnaryExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{Block, CodegenContext, ExprCode, JavaCode, TrueLiteral} import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.types.{DataType, NullType} case class CheckDeltaInvariant( child: Expression, invariant: Invariant) extends UnaryExpression with NonSQLExpression { override def dataType: DataType = NullType override def foldable: Boolean = false override def nullable: Boolean = true override def flatArguments: Iterator[Any] = Iterator(child) private def assertRule(input: InternalRow): Unit = invariant.rule match { case NotNull if child.eval(input) == null => throw InvariantViolationException(invariant, "") case ArbitraryExpression(expr) => val resolvedExpr = expr.transform { case _: UnresolvedAttribute => child } val result = resolvedExpr.eval(input) if (result == null || result == false) { throw InvariantViolationException( invariant, s"Value ${child.eval(input)} violates requirement.") } } override def eval(input: InternalRow): Any = { assertRule(input) null } private def generateNotNullCode(ctx: CodegenContext): Block = { val childGen = child.genCode(ctx) val invariantField = ctx.addReferenceObj("errMsg", invariant) code"""${childGen.code} | |if (${childGen.isNull}) { | throw org.apache.spark.sql.delta.schema.InvariantViolationException.apply( | $invariantField, ""); |} """.stripMargin } private def generateExpressionValidationCode(expr: Expression, ctx: CodegenContext): Block = { val resolvedExpr = expr.transform { case _: UnresolvedAttribute => child } val elementValue = child.genCode(ctx) val childGen = resolvedExpr.genCode(ctx) val invariantField = ctx.addReferenceObj("errMsg", invariant) val eValue = ctx.freshName("elementResult") code"""${elementValue.code} |${childGen.code} | |if (${childGen.isNull} || ${childGen.value} == false) { | Object $eValue = "null"; | if (!${elementValue.isNull}) { | $eValue = (Object) ${elementValue.value}; | } | throw org.apache.spark.sql.delta.schema.InvariantViolationException.apply( | $invariantField, "Value " + $eValue + " violates requirement."); |} """.stripMargin } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val code = invariant.rule match { case NotNull => generateNotNullCode(ctx) case ArbitraryExpression(expr) => generateExpressionValidationCode(expr, ctx) } ev.copy(code = code, isNull = TrueLiteral, value = JavaCode.literal("null", NullType)) } }
Example 102
Source File: DeltaSourceSnapshot.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.files import org.apache.spark.sql.delta.{DeltaLog, DeltaTableUtils, Snapshot} import org.apache.spark.sql.delta.sources.IndexedFile import org.apache.spark.sql.delta.util.StateCache import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.functions.lit class DeltaSourceSnapshot( val spark: SparkSession, val snapshot: Snapshot, val filters: Seq[Expression]) extends SnapshotIterator with StateCache { protected val version = snapshot.version protected val path = snapshot.path protected lazy val (partitionFilters, dataFilters) = { val partitionCols = snapshot.metadata.partitionColumns filters.partition { e => DeltaTableUtils.isPredicatePartitionColumnsOnly(e, partitionCols, spark) } } protected def initialFiles: Dataset[IndexedFile] = { import spark.implicits._ cacheDS( snapshot.allFiles.sort("modificationTime", "path") .rdd.zipWithIndex() .toDF("add", "index") .withColumn("version", lit(version)) .withColumn("isLast", lit(false)) .as[IndexedFile], s"Delta Source Snapshot #$version - ${snapshot.redactedPath}").getDS } override def close(unpersistSnapshot: Boolean): Unit = { super.close(unpersistSnapshot) if (unpersistSnapshot) { snapshot.uncache() } } } trait SnapshotIterator { self: DeltaSourceSnapshot => private var result: Iterable[IndexedFile] = _ def iterator(): Iterator[IndexedFile] = { import spark.implicits._ if (result == null) { result = DeltaLog.filterFileList( snapshot.metadata.partitionSchema, initialFiles.toDF(), partitionFilters, Seq("add")).as[IndexedFile].collect().toIterable } // This will always start from the beginning and re-use resources. If any exceptions were to // be thrown, the stream would stop, we would call stop on the source, and that will make // sure that we clean up resources. result.toIterator } def close(unpersistSnapshot: Boolean): Unit = { } }
Example 103
Source File: DeltaTableOperations.scala From delta with Apache License 2.0 | 5 votes |
package io.delta.tables.execution import scala.collection.Map import org.apache.spark.sql.delta.{DeltaErrors, DeltaHistoryManager, DeltaLog, PreprocessTableUpdate} import org.apache.spark.sql.delta.commands.{DeleteCommand, DeltaGenerateCommand, VacuumCommand} import org.apache.spark.sql.delta.util.AnalysisHelper import io.delta.tables.DeltaTable import org.apache.spark.sql.{functions, Column, DataFrame, Dataset} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression} import org.apache.spark.sql.catalyst.plans.logical._ trait DeltaTableOperations extends AnalysisHelper { self: DeltaTable => protected def executeDelete(condition: Option[Expression]): Unit = improveUnsupportedOpError { val delete = DeleteFromTable(self.toDF.queryExecution.analyzed, condition) toDataset(sparkSession, delete) } protected def executeHistory(deltaLog: DeltaLog, limit: Option[Int]): DataFrame = { val history = new DeltaHistoryManager(deltaLog) val spark = self.toDF.sparkSession spark.createDataFrame(history.getHistory(limit)) } protected def executeGenerate(tblIdentifier: String, mode: String): Unit = { val tableId: TableIdentifier = sparkSession .sessionState .sqlParser .parseTableIdentifier(tblIdentifier) val generate = DeltaGenerateCommand(mode, tableId) generate.run(sparkSession) } protected def executeUpdate( set: Map[String, Column], condition: Option[Column]): Unit = improveUnsupportedOpError { val assignments = set.map { case (targetColName, column) => Assignment(UnresolvedAttribute.quotedString(targetColName), column.expr) }.toSeq val update = UpdateTable(self.toDF.queryExecution.analyzed, assignments, condition.map(_.expr)) toDataset(sparkSession, update) } protected def executeVacuum( deltaLog: DeltaLog, retentionHours: Option[Double]): DataFrame = { VacuumCommand.gc(sparkSession, deltaLog, false, retentionHours) sparkSession.emptyDataFrame } protected def toStrColumnMap(map: Map[String, String]): Map[String, Column] = { map.toSeq.map { case (k, v) => k -> functions.expr(v) }.toMap } protected def sparkSession = self.toDF.sparkSession }
Example 104
Source File: DruidOperatorSchema.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.druid import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, NamedExpression} import org.apache.spark.sql.types.DataType import org.sparklinedata.druid.{DruidOperatorAttribute, DruidQueryBuilder} lazy val pushedDownExprToDruidAttr : Map[Expression, DruidOperatorAttribute] = buildPushDownDruidAttrsMap private def pushDownExpressionMap : Map[String, (Expression, DataType, DataType, String)] = dqb.outputAttributeMap.filter(t => t._2._1 != null) private def buildPushDownDruidAttrsMap : Map[Expression, DruidOperatorAttribute] = (pushDownExpressionMap map { case (nm, (e, oDT, dDT, tf)) => { (e -> druidAttrMap(nm)) } }) private def buildDruidOpAttr : Map[String, DruidOperatorAttribute] = (dqb.outputAttributeMap map { case (nm, (e, oDT, dDT, tf)) => { val druidEid = e match { case null => NamedExpression.newExprId case n: NamedExpression => n.exprId case _ => NamedExpression.newExprId } (nm -> DruidOperatorAttribute(druidEid, nm, dDT, tf)) } } ) }
Example 105
Source File: PlanningTest.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.druid.test import java.util.TimeZone import com.github.nscala_time.time.Imports._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.expressions.{Expression, PredicateHelper} import org.apache.spark.sql.catalyst.plans.logical.Filter import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.hive.test.sparklinedata.TestHive import org.apache.spark.sql.sources.druid.DruidPlanner import org.scalatest.BeforeAndAfterEach import org.sparklinedata.druid._ import org.sparklinedata.druid.client.test.BaseTest import org.sparklinedata.druid.metadata.DruidRelationInfo trait PlanningTestHelper extends PredicateHelper { System.setProperty("user.timezone", "UTC") TimeZone.setDefault(TimeZone.getTimeZone("UTC")) override def splitConjunctivePredicates(condition: Expression): Seq[Expression] = { super.splitConjunctivePredicates(condition) } } abstract class PlanningTest extends BaseTest with BeforeAndAfterEach with PlanningTestHelper { val dPlanner = new DruidPlanner(TestHive) var tab: DataFrame = _ var drInfo: DruidRelationInfo = _ var dqb: DruidQueryBuilder = _ var iCE: IntervalConditionExtractor = _ var iCE2: SparkIntervalConditionExtractor = _ override def beforeAll() = { super.beforeAll() tab = TestHive.table("orderLineItemPartSupplier") drInfo = tab.queryExecution.optimizedPlan. asInstanceOf[LogicalRelation].relation.asInstanceOf[DruidRelation].info } override protected def beforeEach(): Unit = { dqb = DruidQueryBuilder(drInfo) iCE = new IntervalConditionExtractor(dqb) iCE2 = new SparkIntervalConditionExtractor(dqb) } def validateFilter(filterStr: String, pushedToDruid: Boolean = true, filSpec: Option[FilterSpec] = None, intervals: List[Interval] = List() ): Unit = { val q = tab.where(filterStr) val filter = q.queryExecution.optimizedPlan.asInstanceOf[Filter] val dqbs = dPlanner.translateProjectFilter( Some(dqb), Seq(), splitConjunctivePredicates(filter.condition), true ) if (pushedToDruid) { assert(dqbs.size == 1) val odqb = dqbs(0) assert(odqb.filterSpec == filSpec) assert(odqb.queryIntervals.intervals == intervals) } } }
Example 106
Source File: Serialize.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.impl.expressions import java.io.ByteArrayOutputStream import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, _} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.opencypher.morpheus.impl.expressions.EncodeLong.encodeLong import org.opencypher.morpheus.impl.expressions.Serialize._ import org.opencypher.okapi.impl.exception case class Serialize(children: Seq[Expression]) extends Expression { override def dataType: DataType = BinaryType override def nullable: Boolean = false // TODO: Only write length if more than one column is serialized override def eval(input: InternalRow): Any = { // TODO: Reuse from a pool instead of allocating a new one for each serialization val out = new ByteArrayOutputStream() children.foreach { child => child.dataType match { case BinaryType => write(child.eval(input).asInstanceOf[Array[Byte]], out) case StringType => write(child.eval(input).asInstanceOf[UTF8String], out) case IntegerType => write(child.eval(input).asInstanceOf[Int], out) case LongType => write(child.eval(input).asInstanceOf[Long], out) case other => throw exception.UnsupportedOperationException(s"Cannot serialize Spark data type $other.") } } out.toByteArray } override protected def doGenCode( ctx: CodegenContext, ev: ExprCode ): ExprCode = { ev.isNull = FalseLiteral val out = ctx.freshName("out") val serializeChildren = children.map { child => val childEval = child.genCode(ctx) s"""|${childEval.code} |if (!${childEval.isNull}) { | ${Serialize.getClass.getName.dropRight(1)}.write(${childEval.value}, $out); |}""".stripMargin }.mkString("\n") val baos = classOf[ByteArrayOutputStream].getName ev.copy( code = code"""|$baos $out = new $baos(); |$serializeChildren |byte[] ${ev.value} = $out.toByteArray();""".stripMargin) } } object Serialize { val supportedTypes: Set[DataType] = Set(BinaryType, StringType, IntegerType, LongType) @inline final def write(value: Array[Byte], out: ByteArrayOutputStream): Unit = { out.write(encodeLong(value.length)) out.write(value) } @inline final def write( value: Boolean, out: ByteArrayOutputStream ): Unit = write(if (value) 1.toLong else 0.toLong, out) @inline final def write(value: Byte, out: ByteArrayOutputStream): Unit = write(value.toLong, out) @inline final def write(value: Int, out: ByteArrayOutputStream): Unit = write(value.toLong, out) @inline final def write(value: Long, out: ByteArrayOutputStream): Unit = write(encodeLong(value), out) @inline final def write(value: UTF8String, out: ByteArrayOutputStream): Unit = write(value.getBytes, out) @inline final def write(value: String, out: ByteArrayOutputStream): Unit = write(value.getBytes, out) }
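A hedged sketch of evaluating the expression directly; literal children ignore the input row, so an empty InternalRow is enough here.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

val expr = Serialize(Seq(
  Literal(42L),                                      // LongType child
  Literal(UTF8String.fromString("node"), StringType) // StringType child
))
val bytes = expr.eval(InternalRow.empty).asInstanceOf[Array[Byte]]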
Example 107
Source File: EncodeLong.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.impl.expressions import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, NullIntolerant, UnaryExpression} import org.apache.spark.sql.types.{BinaryType, DataType, LongType} import org.opencypher.morpheus.api.value.MorpheusElement._ case class EncodeLong(child: Expression) extends UnaryExpression with NullIntolerant with ExpectsInputTypes { override val dataType: DataType = BinaryType override val inputTypes: Seq[LongType] = Seq(LongType) override protected def nullSafeEval(input: Any): Any = EncodeLong.encodeLong(input.asInstanceOf[Long]) override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = defineCodeGen(ctx, ev, c => s"(byte[])(${EncodeLong.getClass.getName.dropRight(1)}.encodeLong($c))") } object EncodeLong { private final val moreBytesBitMask: Long = Integer.parseInt("10000000", 2) private final val varLength7BitMask: Long = Integer.parseInt("01111111", 2) private final val otherBitsMask = ~varLength7BitMask private final val maxBytesForLongVarEncoding = 10 // Same encoding as as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding @inline final def encodeLong(l: Long): Array[Byte] = { val tempResult = new Array[Byte](maxBytesForLongVarEncoding) var remainder = l var index = 0 while ((remainder & otherBitsMask) != 0) { tempResult(index) = ((remainder & varLength7BitMask) | moreBytesBitMask).toByte remainder >>>= 7 index += 1 } tempResult(index) = remainder.toByte val result = new Array[Byte](index + 1) System.arraycopy(tempResult, 0, result, 0, index + 1) result } // Same encoding as as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding @inline final def decodeLong(input: Array[Byte]): Long = { assert(input.nonEmpty, "`decodeLong` requires a non-empty array as its input") var index = 0 var currentByte = input(index) var decoded = currentByte & varLength7BitMask var nextLeftShift = 7 while ((currentByte & moreBytesBitMask) != 0) { index += 1 currentByte = input(index) decoded |= (currentByte & varLength7BitMask) << nextLeftShift nextLeftShift += 7 } assert(index == input.length - 1, s"`decodeLong` received an input array ${input.toSeq.toHex} with extra bytes that could not be decoded.") decoded } implicit class ColumnLongOps(val c: Column) extends AnyVal { def encodeLongAsMorpheusId(name: String): Column = encodeLongAsMorpheusId.as(name) def encodeLongAsMorpheusId: Column = new Column(EncodeLong(c.expr)) } }
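A small round-trip sketch of the varint helpers, plus the implicit Column syntax (the column name is illustrative):

val bytes = EncodeLong.encodeLong(300L) // Array(-84, 2): 300 split into two 7-bit groups
val back = EncodeLong.decodeLong(bytes) // 300L

import org.apache.spark.sql.functions.col
import org.opencypher.morpheus.impl.expressions.EncodeLong._

val encodedId = col("id").encodeLongAsMorpheusId("encoded_id")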
Example 108
Source File: SQLBuilderTest.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst import scala.util.control.NonFatal import org.apache.spark.sql.{DataFrame, Dataset, QueryTest} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.hive.test.TestHiveSingleton abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton { protected def checkSQL(e: Expression, expectedSQL: String): Unit = { val actualSQL = e.sql try { assert(actualSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following expression: | |${e.prettyName} | |$cause """.stripMargin) } } protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = { val generatedSQL = try new SQLBuilder(plan).toSQL catch { case NonFatal(e) => fail( s"""Cannot convert the following logical query plan to SQL: | |${plan.treeString} """.stripMargin) } try { assert(generatedSQL === expectedSQL) } catch { case cause: Throwable => fail( s"""Wrong SQL generated for the following logical query plan: | |${plan.treeString} | |$cause """.stripMargin) } checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan)) } protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = { checkSQL(df.queryExecution.analyzed, expectedSQL) } }
Example 109
Source File: ScriptTransformation.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 110
Source File: EventTimeWatermark.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval

object EventTimeWatermark {
  // Metadata key used to tag the event-time column with its watermark delay,
  // referenced by the case class below.
  val delayKey: String = "spark.watermarkDelayMs"
}

case class EventTimeWatermark(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: LogicalPlan) extends LogicalPlan {

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override val children: Seq[LogicalPlan] = child :: Nil
}
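For context, a hedged sketch of the public API that produces this node (assumes a local SparkSession and a socket stream; the host, port and window sizes are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, window}

val spark = SparkSession.builder().master("local[*]").appName("watermark-demo").getOrCreate()

val events = spark.readStream
  .format("socket").option("host", "localhost").option("port", 9999).load()
  .selectExpr("current_timestamp() AS eventTime", "value")

val withDelay = events.withWatermark("eventTime", "10 minutes")
val counts = withDelay.groupBy(window(col("eventTime"), "5 minutes")).count()
// withDelay.queryExecution.analyzed now contains an EventTimeWatermark node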
Example 111
Source File: CodegenFallback.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, Nondeterministic} trait CodegenFallback extends Expression { protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // LeafNode does not need `input` val input = if (this.isInstanceOf[LeafExpression]) "null" else ctx.INPUT_ROW val idx = ctx.references.length ctx.references += this var childIndex = idx this.foreach { case n: Nondeterministic => // This might add the current expression twice, but it won't hurt. ctx.references += n childIndex += 1 ctx.addPartitionInitializationStatement( s""" |((Nondeterministic) references[$childIndex]) | .initialize(partitionIndex); """.stripMargin) case _ => } val objectTerm = ctx.freshName("obj") val placeHolder = ctx.registerComment(this.toString) if (nullable) { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); boolean ${ev.isNull} = $objectTerm == null; ${ctx.javaType(this.dataType)} ${ev.value} = ${ctx.defaultValue(this.dataType)}; if (!${ev.isNull}) { ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; }""") } else { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); ${ctx.javaType(this.dataType)} ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; """, isNull = "false") } } }
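A hypothetical leaf expression that opts into CodegenFallback instead of writing its own doGenCode (PiLiteral is made up for illustration):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.LeafExpression
import org.apache.spark.sql.types.{DataType, DoubleType}

case class PiLiteral() extends LeafExpression with CodegenFallback {
  override def dataType: DataType = DoubleType
  override def nullable: Boolean = false
  // Interpreted path; CodegenFallback routes the generated code back to this eval().
  override def eval(input: InternalRow): Any = math.Pi
}

PiLiteral().eval(null) // 3.141592653589793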
Example 112
Source File: SubstituteUnresolvedOrdinals.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.CatalystConf import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Sort} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin import org.apache.spark.sql.types.IntegerType class SubstituteUnresolvedOrdinals(conf: CatalystConf) extends Rule[LogicalPlan] { private def isIntLiteral(e: Expression) = e match { case Literal(_, IntegerType) => true case _ => false } def apply(plan: LogicalPlan): LogicalPlan = plan transform { case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) => val newOrders = s.order.map { case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _) => val newOrdinal = withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) withOrigin(order.origin)(order.copy(child = newOrdinal)) case other => other } withOrigin(s.origin)(s.copy(order = newOrders)) case a: Aggregate if conf.groupByOrdinal && a.groupingExpressions.exists(isIntLiteral) => val newGroups = a.groupingExpressions.map { case ordinal @ Literal(index: Int, IntegerType) => withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) case other => other } withOrigin(a.origin)(a.copy(groupingExpressions = newGroups)) } }
Example 113
Source File: RuleExecutorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } val message = intercept[TreeNodeException[LogicalPlan]] { ToFixedPoint.execute(Literal(100)) }.getMessage assert(message.contains("Max iterations (10) reached for batch fixedPoint")) } }
Example 114
Source File: ShuffledHashJoinExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
Example 115
Source File: CartesianProductExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.hadoop.security.UserGroupInformation import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { private[this] val user = UserGroupInformation.getCurrentUser.getShortUserName override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { // We will not sort the rows, so prefixComparator and recordComparator are null. val sorter = UnsafeExternalSorter.create( context.taskMemoryManager(), SparkEnv.get(user).blockManager, SparkEnv.get(user).serializerManager, context, null, null, 1024, SparkEnv.get(user).memoryManager.pageSizeBytes, SparkEnv.get(user).conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD), false) val partition = split.asInstanceOf[CartesianPartition] for (y <- rdd2.iterator(partition.s2, context)) { sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false) } // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow] def createIter(): Iterator[UnsafeRow] = { val iter = sorter.getIterator val unsafeRow = new UnsafeRow(numFieldsOfRight) new Iterator[UnsafeRow] { override def hasNext: Boolean = { iter.hasNext } override def next(): UnsafeRow = { iter.loadNext() unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength) unsafeRow } } } val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, sorter.cleanupResources()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 116
Source File: OTBLeftSemiHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import java.util.{HashSet => JHashSet} import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId} import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent._ import scala.concurrent.duration._ case class OTBLeftSemiHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override val buildSide = BuildRight override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil override def output = left.output @transient private[this] lazy val keyGenerator: () => MutableProjection = newMutableProjection(buildKeys, buildPlan.output) val timeout = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { Duration.Inf } else { timeoutValue.seconds } } @transient private lazy val broadcastFuture = future { prevBatch match { case None => // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[Row] = buildPlan.execute() .mapPartitions(HashedSet(_, keyGenerator())).collect() val hashed = HashedSet(input.iterator) val broadcast = sparkContext.broadcast(hashed) controller.broadcasts((opId, currentBatch)) = broadcast broadcast case Some(bId) => controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]] } } override def doExecute() = { val broadcastRelation: Broadcast[JHashSet[Row]] = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamIter => val hashSet = broadcastRelation.value val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue) }) } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = { val join = OTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId) join.broadcastFuture join } }
Example 117
Source File: MTBLeftSemiHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import java.util.{HashSet => JHashSet} import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId} import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent._ import scala.concurrent.duration._ case class MTBLeftSemiHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override val buildSide = BuildRight override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil override def output = left.output @transient private[this] lazy val keyGenerator: () => MutableProjection = newMutableProjection(buildKeys, buildPlan.output) val timeout = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { Duration.Inf } else { timeoutValue.seconds } } val watcher = controller.getWatcher @transient private lazy val broadcastFuture = future { // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[Row] = buildPlan.execute() .mapPartitions(HashedSet(_, keyGenerator())).collect() prevBatch match { case None => val hashed = HashedSet(input.iterator) val broadcast = sparkContext.broadcast(hashed) controller.broadcasts((opId, currentBatch)) = broadcast broadcast case Some(bId) => // TODO: fix this integrity error by supporting join whose both branches may grow val hashed = HashedSet(input.iterator) val previous = controller.broadcasts((opId, bId)).value.asInstanceOf[JHashSet[Row]] if (!previous.containsAll(hashed)) { watcher += -1 logError(s"Integrity Error in MTBLeftSemiHashJoin(Op $opId, Batch $currentBatch)") } controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]] } } override def doExecute() = { val broadcastRelation = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamIter => val hashSet = broadcastRelation.value val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue) }) } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = { val join = MTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId) join.broadcastFuture join } }
Example 118
Source File: OTShuffledHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import org.apache.spark.SparkEnv import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning} import org.apache.spark.sql.execution.joins.{BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.ComposeRDDFunctions._ import org.apache.spark.sql.hive.online._ import org.apache.spark.storage.{OLABlockId, StorageLevel} case class OTShuffledHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil def retrieveState(): RDD[HashedRelation] = prevBatch match { case Some(bId) => val numParts = controller.olaBlocks(opId, bId) OLABlockRDD.create[HashedRelation](sparkContext, opId.id, Array((numParts, bId)), numParts) case None => sys.error(s"Unexpected prevBatch = $prevBatch") } override def doExecute() = { prevBatch match { case None => val buildRdd = buildPlan.execute() controller.olaBlocks((opId, currentBatch)) = buildRdd.partitions.length buildRdd.zipPartitionsWithIndex(streamedPlan.execute()) { (index, buildIter, streamIter) => val hashed = HashedRelation(buildIter, buildSideKeyGenerator) SparkEnv.get.blockManager.putSingle( OLABlockId(opId.id, currentBatch, index), hashed, StorageLevel.MEMORY_AND_DISK) hashJoin(streamIter, hashed) } case Some(_) => retrieveState().zipPartitionsWithIndex(streamedPlan.execute()) { (index, buildIter, streamIter) => val hashed = buildIter.next() hashJoin(streamIter, hashed) } } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = OTShuffledHashJoin(leftKeys, rightKeys, buildSide, left, right)(controller, newTrace, opId) }
Example 119
Source File: OTBroadcastHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.expressions.{Expression, Row} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.joins.{BroadcastHashJoin, BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId} import scala.concurrent._ import scala.concurrent.duration._ case class OTBroadcastHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil val timeout = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { Duration.Inf } else { timeoutValue.seconds } } @transient private lazy val broadcastFuture = future { prevBatch match { case None => // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[Row] = buildPlan.execute().map(_.copy()).collect() val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length) val broadcast = sparkContext.broadcast(hashed) controller.broadcasts((opId, currentBatch)) = broadcast broadcast case Some(bId) => controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[HashedRelation]] } }(BroadcastHashJoin.broadcastHashJoinExecutionContext) override def doExecute() = { val broadcastRelation = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamedIter => hashJoin(streamedIter, broadcastRelation.value) } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = { val join = OTBroadcastHashJoin(leftKeys, rightKeys, buildSide, left, right)( controller, newTrace, opId) join.broadcastFuture join } }
Example 120
Source File: binary.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.mathfuncs import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, BinaryExpression, Expression, Row} import org.apache.spark.sql.types._ abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String) extends BinaryExpression with Serializable with ExpectsInputTypes { self: Product => type EvaluatedType = Any override def symbol: String = null override def expectedChildTypes: Seq[DataType] = Seq(DoubleType, DoubleType) override def nullable: Boolean = left.nullable || right.nullable override def toString: String = s"$name($left, $right)" override lazy val resolved = left.resolved && right.resolved && left.dataType == right.dataType && !DecimalType.isFixed(left.dataType) override def dataType: DataType = DoubleType override def eval(input: Row): Any = { val evalE1 = left.eval(input) if (evalE1 == null) { null } else { val evalE2 = right.eval(input) if (evalE2 == null) { null } else { val result = f(evalE1.asInstanceOf[Double], evalE2.asInstanceOf[Double]) if (result.isNaN) null else result } } } } case class Atan2( left: Expression, right: Expression) extends BinaryMathExpression(math.atan2, "ATAN2") { override def eval(input: Row): Any = { val evalE1 = left.eval(input) if (evalE1 == null) { null } else { val evalE2 = right.eval(input) if (evalE2 == null) { null } else { // With codegen, the values returned by -0.0 and 0.0 are different. Handled with +0.0 val result = math.atan2(evalE1.asInstanceOf[Double] + 0.0, evalE2.asInstanceOf[Double] + 0.0) if (result.isNaN) null else result } } } } case class Hypot( left: Expression, right: Expression) extends BinaryMathExpression(math.hypot, "HYPOT") case class Pow(left: Expression, right: Expression) extends BinaryMathExpression(math.pow, "POWER")
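These Catalyst expressions back the corresponding DataFrame functions in org.apache.spark.sql.functions. A minimal usage sketch, assuming an active SparkSession named spark and illustrative column names:

import org.apache.spark.sql.functions.{atan2, col, hypot, pow}
import spark.implicits._

// Hypothetical (x, y) points; each select item resolves to one of the expressions above.
val points = Seq((3.0, 4.0), (1.0, 1.0)).toDF("x", "y")
points.select(
  atan2(col("y"), col("x")).as("angle"),      // ATAN2
  hypot(col("x"), col("y")).as("distance"),   // HYPOT
  pow(col("x"), 2.0).as("x_squared")          // POWER
).show()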
Example 121
Source File: FunctionRegistry.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.CatalystConf import org.apache.spark.sql.catalyst.expressions.Expression import scala.collection.mutable object StringKeyHashMap { def apply[T](caseSensitive: Boolean): StringKeyHashMap[T] = caseSensitive match { case false => new StringKeyHashMap[T](_.toLowerCase) case true => new StringKeyHashMap[T](identity) } } class StringKeyHashMap[T](normalizer: (String) => String) { private val base = new collection.mutable.HashMap[String, T]() def apply(key: String): T = base(normalizer(key)) def get(key: String): Option[T] = base.get(normalizer(key)) def put(key: String, value: T): Option[T] = base.put(normalizer(key), value) def remove(key: String): Option[T] = base.remove(normalizer(key)) def iterator: Iterator[(String, T)] = base.toIterator }
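The normalizer is applied on every access, which is how the function registry gets case-insensitive lookups. A small sketch of the behavior, using the StringKeyHashMap defined above (the entries are illustrative):

// Case-insensitive: both lookups hit the entry stored under "substring".
val fns = StringKeyHashMap[String](caseSensitive = false)
fns.put("Substring", "builtin substring")
assert(fns.get("SUBSTRING").contains("builtin substring"))
assert(fns.get("substring").contains("builtin substring"))

// Case-sensitive: keys are kept exactly as given.
val strict = StringKeyHashMap[String](caseSensitive = true)
strict.put("Substring", "builtin substring")
assert(strict.get("substring").isEmpty)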
Example 122
Source File: RuleExecutorSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(100)) === Literal(90)) } }
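The same Rule/RuleExecutor machinery can drive any expression-tree rewrite, not just the test rule above. A sketch of a standalone executor that folds integer additions, written against the Spark 2.x Catalyst API used in these examples (the rule and object names are made up):

import org.apache.spark.sql.catalyst.expressions.{Add, Expression, IntegerLiteral, Literal}
import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor}

// Collapse Add(Literal, Literal) into a single literal, bottom-up.
object FoldIntAdds extends Rule[Expression] {
  def apply(e: Expression): Expression = e transformUp {
    case Add(IntegerLiteral(a), IntegerLiteral(b)) => Literal(a + b)
  }
}

object Fold extends RuleExecutor[Expression] {
  val batches = Batch("fold", FixedPoint(10), FoldIntAdds) :: Nil
}

// Add(Add(1, 2), 3) folds to Literal(6) in a single bottom-up pass.
val folded = Fold.execute(Add(Add(Literal(1), Literal(2)), Literal(3)))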
Example 123
Source File: LeftSemiJoinHash.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row} import org.apache.spark.sql.catalyst.plans.physical.ClusteredDistribution import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class LeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override val buildSide: BuildSide = BuildRight override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil override def output: Seq[Attribute] = left.output protected override def doExecute(): RDD[Row] = { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) => val hashSet = new java.util.HashSet[Row]() var currentRow: Row = null // Create a Hash set of buildKeys while (buildIter.hasNext) { currentRow = buildIter.next() val rowKey = buildSideKeyGenerator(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) if (!keyExists) { hashSet.add(rowKey) } } } val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue) }) } } }
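At the user-facing API level, this operator is what a left semi join is typically planned into. A minimal sketch, assuming two illustrative DataFrames orders and customers:

// Keep the rows of `orders` that have a matching customer; no columns from
// `customers` appear in the output, which is the defining property of a semi join.
val matched = orders.join(customers, orders("customer_id") === customers("id"), "leftsemi")

// Equivalent SQL: SELECT * FROM orders o LEFT SEMI JOIN customers c ON o.customer_id = c.id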
Example 124
Source File: BroadcastHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.util.ThreadUtils import scala.concurrent._ import scala.concurrent.duration._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{Row, Expression} import org.apache.spark.sql.catalyst.plans.physical.{Distribution, Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class BroadcastHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { val timeout: Duration = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { Duration.Inf } else { timeoutValue.seconds } } override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil @transient lazy val broadcastFuture = future { // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[Row] = buildPlan.execute().map(_.copy()).collect() val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length) sparkContext.broadcast(hashed) }(BroadcastHashJoin.broadcastHashJoinExecutionContext) protected override def doExecute(): RDD[Row] = { val broadcastRelation = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamedIter => hashJoin(streamedIter, broadcastRelation.value) } } } object BroadcastHashJoin { private[sql] val broadcastHashJoinExecutionContext = ExecutionContext.fromExecutorService( ThreadUtils.newDaemonCachedThreadPool("broadcast-hash-join", 128)) }
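To steer the planner toward this operator from the DataFrame API, the usual route is the broadcast hint. A sketch, assuming illustrative DataFrames facts (large) and dims (small):

import org.apache.spark.sql.functions.broadcast

// Explicitly mark the small side for broadcasting so the join is planned as a
// broadcast hash join instead of a shuffle-based one.
val joined = facts.join(broadcast(dims), Seq("key"))

// Without the hint, broadcasting still happens automatically when the build side's
// estimated size is below spark.sql.autoBroadcastJoinThreshold (10 MB by default).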
Example 125
Source File: BroadcastLeftSemiJoinHash.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class BroadcastLeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override val buildSide: BuildSide = BuildRight override def output: Seq[Attribute] = left.output protected override def doExecute(): RDD[Row] = { val buildIter = buildPlan.execute().map(_.copy()).collect().toIterator val hashSet = new java.util.HashSet[Row]() var currentRow: Row = null // Create a Hash set of buildKeys while (buildIter.hasNext) { currentRow = buildIter.next() val rowKey = buildSideKeyGenerator(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) if (!keyExists) { hashSet.add(rowKey) } } } val broadcastedRelation = sparkContext.broadcast(hashSet) streamedPlan.execute().mapPartitions { streamIter => val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && broadcastedRelation.value.contains(joinKeys.currentValue) }) } } }
Example 126
Source File: ShuffledHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class ShuffledHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil protected override def doExecute(): RDD[Row] = { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) => val hashed = HashedRelation(buildIter, buildSideKeyGenerator) hashJoin(streamIter, hashed) } } }
Example 127
Source File: FramelessInternals.scala From frameless with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.{Alias, CreateStruct} import org.apache.spark.sql.catalyst.expressions.{Expression, NamedExpression} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.types._ import org.apache.spark.sql.types.ObjectType import scala.reflect.ClassTag object FramelessInternals { def objectTypeFor[A](implicit classTag: ClassTag[A]): ObjectType = ObjectType(classTag.runtimeClass) def resolveExpr(ds: Dataset[_], colNames: Seq[String]): NamedExpression = { ds.toDF.queryExecution.analyzed.resolve(colNames, ds.sparkSession.sessionState.analyzer.resolver).getOrElse { throw new AnalysisException( s"""Cannot resolve column name "$colNames" among (${ds.schema.fieldNames.mkString(", ")})""") } } def expr(column: Column): Expression = column.expr def column(column: Column): Expression = column.expr def logicalPlan(ds: Dataset[_]): LogicalPlan = ds.logicalPlan def executePlan(ds: Dataset[_], plan: LogicalPlan): QueryExecution = ds.sparkSession.sessionState.executePlan(plan) def joinPlan(ds: Dataset[_], plan: LogicalPlan, leftPlan: LogicalPlan, rightPlan: LogicalPlan): LogicalPlan = { val joined = executePlan(ds, plan) val leftOutput = joined.analyzed.output.take(leftPlan.output.length) val rightOutput = joined.analyzed.output.takeRight(rightPlan.output.length) Project(List( Alias(CreateStruct(leftOutput), "_1")(), Alias(CreateStruct(rightOutput), "_2")() ), joined.analyzed) } def mkDataset[T](sqlContext: SQLContext, plan: LogicalPlan, encoder: Encoder[T]): Dataset[T] = new Dataset(sqlContext, plan, encoder) def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = Dataset.ofRows(sparkSession, logicalPlan) // because org.apache.spark.sql.types.UserDefinedType is private[spark] type UserDefinedType[A >: Null] = org.apache.spark.sql.types.UserDefinedType[A] case class DisambiguateRight[T](tagged: Expression) extends Expression with NonSQLExpression { def eval(input: InternalRow): Any = tagged.eval(input) def nullable: Boolean = false def children: Seq[Expression] = tagged :: Nil def dataType: DataType = tagged.dataType protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = ??? override def genCode(ctx: CodegenContext): ExprCode = tagged.genCode(ctx) } }
Example 128
package frameless.functions import frameless.TypedEncoder import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, NonSQLExpression} import org.apache.spark.sql.types.DataType case class FramelessLit[A](obj: A, encoder: TypedEncoder[A]) extends Expression with NonSQLExpression { override def nullable: Boolean = encoder.nullable override def toString: String = s"FramelessLit($obj)" def eval(input: InternalRow): Any = { val ctx = new CodegenContext() val eval = genCode(ctx) val codeBody = s""" public scala.Function1<InternalRow, Object> generate(Object[] references) { return new FramelessLitEvalImpl(references); } class FramelessLitEvalImpl extends scala.runtime.AbstractFunction1<InternalRow, Object> { private final Object[] references; ${ctx.declareMutableStates()} ${ctx.declareAddedFunctions()} public FramelessLitEvalImpl(Object[] references) { this.references = references; ${ctx.initMutableStates()} } public java.lang.Object apply(java.lang.Object z) { InternalRow ${ctx.INPUT_ROW} = (InternalRow) z; ${eval.code} return ${eval.isNull} ? ((Object)null) : ((Object)${eval.value}); } } """ val code = CodeFormatter.stripOverlappingComments( new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())) val (clazz, _) = CodeGenerator.compile(code) val codegen = clazz.generate(ctx.references.toArray).asInstanceOf[InternalRow => AnyRef] codegen(input) } def dataType: DataType = encoder.catalystRepr def children: Seq[Expression] = Nil override def genCode(ctx: CodegenContext): ExprCode = { encoder.toCatalyst(new Literal(obj, encoder.jvmRepr)).genCode(ctx) } protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = ??? }
Example 129
Source File: Sources.scala From jgit-spark-connector with Apache License 2.0 | 5 votes |
package tech.sourced.engine import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.types.StructType import tech.sourced.engine.util.{CompiledFilter, Filter} def getSources(tableSource: Option[String], schema: StructType): Seq[String] = tableSource match { case Some(ts) => Seq(ts) case None => schema .map(_.metadata.getString(SourceKey)) .distinct .sortWith(Sources.compare(_, _) < 0) } def getFiltersBySource(filters: Seq[Expression]): Map[String, Seq[CompiledFilter]] = filters.flatMap(Filter.compile) .map(e => (e.sources.distinct, e)) .filter(_._1.lengthCompare(1) == 0) .groupBy(_._1) .map { case (k, v) => (k.head, v.map(_._2)) } }
Example 130
Source File: RuleExecutorSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(100)) === Literal(90)) } }
Example 131
Source File: ShuffledHashJoin.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics @DeveloperApi case class ShuffledHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) override def outputPartitioning: Partitioning = PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning)) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil protected override def doExecute(): RDD[InternalRow] = { val (numBuildRows, numStreamedRows) = buildSide match { case BuildLeft => (longMetric("numLeftRows"), longMetric("numRightRows")) case BuildRight => (longMetric("numRightRows"), longMetric("numLeftRows")) } val numOutputRows = longMetric("numOutputRows") buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) => val hashed = HashedRelation(buildIter, numBuildRows, buildSideKeyGenerator) hashJoin(streamIter, numStreamedRows, hashed, numOutputRows) } } }
Example 132
Source File: SemiJoinSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins

import org.apache.spark.sql.{SQLConf, DataFrame, Row}
import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
import org.apache.spark.sql.catalyst.plans.Inner
import org.apache.spark.sql.catalyst.plans.logical.Join
import org.apache.spark.sql.catalyst.expressions.{And, LessThan, Expression}
import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan, SparkPlanTest}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}

// Test suite for semi joins
class SemiJoinSuite extends SparkPlanTest with SharedSQLContext {

  private lazy val left = ctx.createDataFrame(
    ctx.sparkContext.parallelize(Seq(
      Row(1, 2.0),
      Row(1, 2.0),
      Row(2, 1.0),
      Row(2, 1.0),
      Row(3, 3.0),
      Row(null, null),
      Row(null, 5.0),
      Row(6, null)
    )), new StructType().add("a", IntegerType).add("b", DoubleType))

  private lazy val right = ctx.createDataFrame(
    ctx.sparkContext.parallelize(Seq(
      Row(2, 3.0),
      Row(2, 3.0),
      Row(3, 2.0),
      Row(4, 1.0),
      Row(null, null),
      Row(null, 5.0),
      Row(6, null)
    )), new StructType().add("c", IntegerType).add("d", DoubleType))

  private lazy val condition = {
    And((left.col("a") === right.col("c")).expr,
      LessThan(left.col("b").expr, right.col("d").expr))
  }

  // Note: the input dataframes and expression must be evaluated lazily because
  // the SQLContext should be used only within a test to keep SQL tests stable
  private def testLeftSemiJoin(
      testName: String,
      leftRows: => DataFrame,
      rightRows: => DataFrame,
      condition: => Expression,
      expectedAnswer: Seq[Product]): Unit = {

    def extractJoinParts(): Option[ExtractEquiJoinKeys.ReturnType] = {
      val join = Join(leftRows.logicalPlan, rightRows.logicalPlan, Inner, Some(condition))
      ExtractEquiJoinKeys.unapply(join)
    }

    test(s"$testName using LeftSemiJoinHash") {
      extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) =>
        withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
          checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
            EnsureRequirements(left.sqlContext).apply(
              LeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition)),
            expectedAnswer.map(Row.fromTuple),
            sortAnswers = true)
        }
      }
    }

    test(s"$testName using BroadcastLeftSemiJoinHash") {
      extractJoinParts().foreach { case (joinType, leftKeys, rightKeys, boundCondition, _, _) =>
        withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
          checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
            BroadcastLeftSemiJoinHash(leftKeys, rightKeys, left, right, boundCondition),
            expectedAnswer.map(Row.fromTuple),
            sortAnswers = true)
        }
      }
    }

    test(s"$testName using LeftSemiJoinBNL") {
      withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
        checkAnswer2(leftRows, rightRows, (left: SparkPlan, right: SparkPlan) =>
          LeftSemiJoinBNL(left, right, Some(condition)),
          expectedAnswer.map(Row.fromTuple),
          sortAnswers = true)
      }
    }
  }

  // Test a basic left semi join
  testLeftSemiJoin(
    "basic test",
    left,
    right,
    condition,
    Seq(
      (2, 1.0),
      (2, 1.0)
    )
  )
}
Example 133
Source File: StatefulApproxQuantile.scala From deequ with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.aggregate import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile.PercentileDigest import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes, Literal} import org.apache.spark.sql.types._ private[sql] case class StatefulApproxQuantile( child: Expression, accuracyExpression: Expression, override val mutableAggBufferOffset: Int, override val inputAggBufferOffset: Int) extends TypedImperativeAggregate[PercentileDigest] with ImplicitCastInputTypes { def this(child: Expression, accuracyExpression: Expression) = { this(child, accuracyExpression, 0, 0) } def this(child: Expression) = { this(child, Literal(ApproximatePercentile.DEFAULT_PERCENTILE_ACCURACY)) } // Mark as lazy so that accuracyExpression is not evaluated during tree transformation. private lazy val accuracy: Double = accuracyExpression.eval().asInstanceOf[Double] override def inputTypes: Seq[AbstractDataType] = { Seq(DoubleType, TypeCollection(DoubleType, ArrayType(DoubleType)), IntegerType) } override def checkInputDataTypes(): TypeCheckResult = { val defaultCheck = super.checkInputDataTypes() if (defaultCheck.isFailure) { defaultCheck } else if (!accuracyExpression.foldable) { TypeCheckFailure(s"The accuracy provided must be a constant literal") } else if (accuracy <= 0) { TypeCheckFailure( s"The accuracy provided must be a positive integer literal (current value = $accuracy)") } else { TypeCheckSuccess } } override def createAggregationBuffer(): PercentileDigest = { val relativeError = 1.0D / accuracy new PercentileDigest(relativeError) } override def update(buffer: PercentileDigest, inputRow: InternalRow): PercentileDigest = { val value = child.eval(inputRow) // Ignore empty rows, for example: percentile_approx(null) if (value != null) { buffer.add(value.asInstanceOf[Double]) } buffer } override def merge(buffer: PercentileDigest, other: PercentileDigest): PercentileDigest = { buffer.merge(other) buffer } override def eval(buffer: PercentileDigest): Any = { // instead of evaluating the PercentileDigest quantile summary here, // serialize the digest and return it as byte array serialize(buffer) } override def withNewMutableAggBufferOffset(newOffset: Int): StatefulApproxQuantile = copy(mutableAggBufferOffset = newOffset) override def withNewInputAggBufferOffset(newOffset: Int): StatefulApproxQuantile = copy(inputAggBufferOffset = newOffset) override def children: Seq[Expression] = Seq(child, accuracyExpression) // Returns null for empty inputs override def nullable: Boolean = true override def dataType: DataType = BinaryType override def prettyName: String = "percentile_approx" override def serialize(digest: PercentileDigest): Array[Byte] = { ApproximatePercentile.serializer.serialize(digest) } override def deserialize(bytes: Array[Byte]): PercentileDigest = { ApproximatePercentile.serializer.deserialize(bytes) } }
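Deequ's stateful variant builds on Spark's approximate-percentile sketch; the same machinery is reachable from the public API via approxQuantile. A sketch, assuming an illustrative DataFrame df with a numeric column price:

// Approximate median and 90th percentile; relativeError plays the role of the
// `accuracy` knob above (a smaller error means a larger sketch).
val Array(median, p90) = df.stat.approxQuantile("price", Array(0.5, 0.9), 0.01)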
Example 134
Source File: ScriptTransformation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} private def getRowFormatSQL( rowFormat: Seq[(String, String)], serdeClass: Option[String], serdeProps: Seq[(String, String)]): Option[String] = { if (schemaLess) return Some("") val rowFormatDelimited = rowFormat.map { case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value case o => return None } val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("") val serdePropsSQL = if (serdeClass.nonEmpty) { val props = serdeProps.map{p => s"'${p._1}' = '${p._2}'"}.mkString(", ") if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else "" } else { "" } if (rowFormat.nonEmpty) { Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" ")) } else { Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL) } } }
Example 135
Source File: CodegenFallback.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, Nondeterministic} trait CodegenFallback extends Expression { protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // LeafNode does not need `input` val input = if (this.isInstanceOf[LeafExpression]) "null" else ctx.INPUT_ROW val idx = ctx.references.length ctx.references += this var childIndex = idx this.foreach { case n: Nondeterministic => // This might add the current expression twice, but it won't hurt. ctx.references += n childIndex += 1 ctx.addPartitionInitializationStatement( s""" |((Nondeterministic) references[$childIndex]) | .initialize(partitionIndex); """.stripMargin) case _ => } val objectTerm = ctx.freshName("obj") val placeHolder = ctx.registerComment(this.toString) if (nullable) { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); boolean ${ev.isNull} = $objectTerm == null; ${ctx.javaType(this.dataType)} ${ev.value} = ${ctx.defaultValue(this.dataType)}; if (!${ev.isNull}) { ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; }""") } else { ev.copy(code = s""" $placeHolder Object $objectTerm = ((Expression) references[$idx]).eval($input); ${ctx.javaType(this.dataType)} ${ev.value} = (${ctx.boxedType(this.dataType)}) $objectTerm; """, isNull = "false") } } }
Example 136
Source File: SubstituteUnresolvedOrdinals.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Sort} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.IntegerType class SubstituteUnresolvedOrdinals(conf: SQLConf) extends Rule[LogicalPlan] { private def isIntLiteral(e: Expression) = e match { case Literal(_, IntegerType) => true case _ => false } def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) => val newOrders = s.order.map { case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _, _) => val newOrdinal = withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) withOrigin(order.origin)(order.copy(child = newOrdinal)) case other => other } withOrigin(s.origin)(s.copy(order = newOrders)) case a: Aggregate if conf.groupByOrdinal && a.groupingExpressions.exists(isIntLiteral) => val newGroups = a.groupingExpressions.map { case ordinal @ Literal(index: Int, IntegerType) => withOrigin(ordinal.origin)(UnresolvedOrdinal(index)) case other => other } withOrigin(a.origin)(a.copy(groupingExpressions = newGroups)) } }
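The rule only fires for integer literals in GROUP BY and ORDER BY positions, and only when the corresponding confs are enabled. A sketch of a query it rewrites, assuming an active SparkSession named spark and an illustrative temp view employees:

// With spark.sql.groupByOrdinal and spark.sql.orderByOrdinal enabled (the defaults),
// the literals 1 and 2 are replaced by UnresolvedOrdinal nodes and later resolved
// by position against the select list: 1 -> dept, 2 -> cnt.
spark.sql(
  """SELECT dept, count(*) AS cnt
    |FROM employees
    |GROUP BY 1
    |ORDER BY 2 DESC
  """.stripMargin).show()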
Example 137
Source File: QueryPlanSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.dsl.plans import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal, NamedExpression} import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} import org.apache.spark.sql.types.IntegerType class QueryPlanSuite extends SparkFunSuite { test("origin remains the same after mapExpressions (SPARK-23823)") { CurrentOrigin.setPosition(0, 0) val column = AttributeReference("column", IntegerType)(NamedExpression.newExprId) val query = plans.DslLogicalPlan(plans.table("table")).select(column) CurrentOrigin.reset() val mappedQuery = query mapExpressions { case _: Expression => Literal(1) } val mappedOrigin = mappedQuery.expressions.apply(0).origin assert(mappedOrigin == Origin.apply(Some(0), Some(0))) } }
Example 138
Source File: RuleExecutorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } val message = intercept[TreeNodeException[LogicalPlan]] { ToFixedPoint.execute(Literal(100)) }.getMessage assert(message.contains("Max iterations (10) reached for batch fixedPoint")) } test("structural integrity checker") { object WithSIChecker extends RuleExecutor[Expression] { override protected def isPlanIntegral(expr: Expression): Boolean = expr match { case IntegerLiteral(_) => true case _ => false } val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(WithSIChecker.execute(Literal(10)) === Literal(9)) val message = intercept[TreeNodeException[LogicalPlan]] { WithSIChecker.execute(Literal(10.1)) }.getMessage assert(message.contains("the structural integrity of the plan is broken")) } }
Example 139
Source File: CheckCartesianProductsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer import org.scalatest.Matchers._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.internal.SQLConf.CROSS_JOINS_ENABLED class CheckCartesianProductsSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Check Cartesian Products", Once, CheckCartesianProducts) :: Nil } val testRelation1 = LocalRelation('a.int, 'b.int) val testRelation2 = LocalRelation('c.int, 'd.int) val joinTypesWithRequiredCondition = Seq(Inner, LeftOuter, RightOuter, FullOuter) val joinTypesWithoutRequiredCondition = Seq(LeftSemi, LeftAnti, ExistenceJoin('exists)) test("CheckCartesianProducts doesn't throw an exception if cross joins are enabled)") { withSQLConf(CROSS_JOINS_ENABLED.key -> "true") { noException should be thrownBy { for (joinType <- joinTypesWithRequiredCondition ++ joinTypesWithoutRequiredCondition) { performCartesianProductCheck(joinType) } } } } test("CheckCartesianProducts throws an exception for join types that require a join condition") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithRequiredCondition) { val thrownException = the [AnalysisException] thrownBy { performCartesianProductCheck(joinType) } assert(thrownException.message.contains("Detected implicit cartesian product")) } } } test("CheckCartesianProducts doesn't throw an exception if a join condition is present") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithRequiredCondition) { noException should be thrownBy { performCartesianProductCheck(joinType, Some('a === 'd)) } } } } test("CheckCartesianProducts doesn't throw an exception if join types don't require conditions") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithoutRequiredCondition) { noException should be thrownBy { performCartesianProductCheck(joinType) } } } } private def performCartesianProductCheck( joinType: JoinType, condition: Option[Expression] = None): Unit = { val analyzedPlan = testRelation1.join(testRelation2, joinType, condition).analyze val optimizedPlan = Optimize.execute(analyzedPlan) comparePlans(analyzedPlan, optimizedPlan) } }
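From the user's side, the check this suite exercises surfaces as an AnalysisException on joins without an equality condition. A sketch, assuming illustrative DataFrames dfA and dfB and an active SparkSession named spark:

// With spark.sql.crossJoin.enabled=false (the default in these Spark versions),
// a condition-less join fails with "Detected implicit cartesian product".
// Either opt in for the session...
spark.conf.set("spark.sql.crossJoin.enabled", "true")
val allPairs = dfA.join(dfB)

// ...or, preferably, state the intent explicitly (works regardless of the conf):
val explicitPairs = dfA.crossJoin(dfB)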
Example 140
Source File: ShuffledHashJoinExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map"), "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) override def requiredChildDistribution: Seq[Distribution] = HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val avgHashProbe = longMetric("avgHashProbe") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows, avgHashProbe) } } }
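Whether this operator is chosen at all depends on planner heuristics. A sketch of the conf that nudges the planner toward it, assuming illustrative DataFrames big and smaller and an active SparkSession named spark:

// Sort-merge join is preferred by default; turning the preference off lets the planner
// consider a shuffled hash join when the build side is estimated small enough to hash
// per partition (but too large to broadcast).
spark.conf.set("spark.sql.join.preferSortMergeJoin", "false")
val joined = big.join(smaller, "key")
joined.explain()  // look for ShuffledHashJoin in the physical plan when the heuristics allow it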
Example 141
Source File: CartesianProductExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, ExternalAppendOnlyUnsafeRowArray, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.util.CompletionIterator class UnsafeCartesianRDD( left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int, inMemoryBufferThreshold: Int, spillThreshold: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { val rowArray = new ExternalAppendOnlyUnsafeRowArray(inMemoryBufferThreshold, spillThreshold) val partition = split.asInstanceOf[CartesianPartition] rdd2.iterator(partition.s2, context).foreach(rowArray.add) // Create an iterator from rowArray def createIter(): Iterator[UnsafeRow] = rowArray.generateIterator() val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, rowArray.clear()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD( leftResults, rightResults, right.output.size, sqlContext.conf.cartesianProductExecBufferInMemoryThreshold, sqlContext.conf.cartesianProductExecBufferSpillThreshold) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 142
Source File: DataSourcePartitioning.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression} import org.apache.spark.sql.catalyst.plans.physical import org.apache.spark.sql.sources.v2.reader.partitioning.{ClusteredDistribution, Partitioning} class DataSourcePartitioning( partitioning: Partitioning, colNames: AttributeMap[String]) extends physical.Partitioning { override val numPartitions: Int = partitioning.numPartitions() override def satisfies(required: physical.Distribution): Boolean = { super.satisfies(required) || { required match { case d: physical.ClusteredDistribution if isCandidate(d.clustering) => val attrs = d.clustering.map(_.asInstanceOf[Attribute]) partitioning.satisfy( new ClusteredDistribution(attrs.map { a => val name = colNames.get(a) assert(name.isDefined, s"Attribute ${a.name} is not found in the data source output") name.get }.toArray)) case _ => false } } } private def isCandidate(clustering: Seq[Expression]): Boolean = { clustering.forall { case a: Attribute => colNames.contains(a) case _ => false } } }
Example 143
Source File: Exchange.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]() plan.transformUp { case exchange: Exchange => // the exchanges that have same results usually also have same schemas (same column names). val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]()) val samePlan = sameSchema.find { e => exchange.sameResult(e) } if (samePlan.isDefined) { // Keep the output of this exchange, the following plans require that to resolve // attributes. ReusedExchangeExec(exchange.output, samePlan.get) } else { sameSchema += exchange exchange } } } }
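ReuseExchange is driven by an internal conf and shows up directly in physical plans. A sketch, assuming an illustrative DataFrame df with a key column and an active SparkSession named spark:

// Exchange reuse is on by default (spark.sql.exchange.reuse). The two identical
// aggregations below typically end up sharing one shuffle, visible as a
// ReusedExchange node in the output of explain().
val byKeyA = df.groupBy("key").count()
val byKeyB = df.groupBy("key").count()
byKeyA.join(byKeyB, "key").explain()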
Example 144
Source File: PythonUDF.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import org.apache.spark.api.python.PythonFunction import org.apache.spark.sql.catalyst.expressions.{Expression, NonSQLExpression, Unevaluable, UserDefinedExpression} import org.apache.spark.sql.types.DataType case class PythonUDF( name: String, func: PythonFunction, dataType: DataType, children: Seq[Expression], evalType: Int, udfDeterministic: Boolean) extends Expression with Unevaluable with NonSQLExpression with UserDefinedExpression { override lazy val deterministic: Boolean = udfDeterministic && children.forall(_.deterministic) override def toString: String = s"$name(${children.mkString(", ")})" override def nullable: Boolean = true }
Example 145
Source File: StarryHashJoinExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import com.github.passionke.starry.SparkPlanExecutor import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.JoinType import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} case class StarryHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val avgHashProbe = longMetric("avgHashProbe") val rows = SparkPlanExecutor.doExec(buildPlan) val hashed = HashedRelation(rows.iterator, buildKeys, rows.length, null) streamedPlan.execute().mapPartitions { streamedIter => join(streamedIter, hashed, numOutputRows, avgHashProbe) } } }
Example 146
Source File: GPlanExpander.scala From ingraph with Eclipse Public License 1.0 | 5 votes |
package ingraph.compiler.cypher2gplan import ingraph.model.{expr, gplan} import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.{expressions => cExpr} object GPlanExpander { def expandGPlan(rawQueryPlan: gplan.GNode): gplan.GNode = { // should there be other rule sets (partial functions), combine them using orElse, // e.g. pfunc1 orElse pfunc2 // expanding GetVertices involves creating other GetVertices, so transformUp is to avoid infinite recursion val full = rawQueryPlan.transformUp(gplanExpander) full.asInstanceOf[gplan.GNode] } val gplanExpander: PartialFunction[LogicalPlan, LogicalPlan] = { // Nullary case gplan.GetVertices(vertexAttribute) if vertexAttribute.properties.nonEmpty => { val condition: Expression = propertyMapToCondition(vertexAttribute.properties, vertexAttribute.name) gplan.Selection(condition, gplan.GetVertices(vertexAttribute)) } case gplan.Expand(srcVertexAttribute, trgVertexAttribute, edge, dir, child) if edge.properties.nonEmpty || trgVertexAttribute.properties.nonEmpty => { val selectionOnEdge = gplan.Selection(propertyMapToCondition(edge.properties, edge.name), gplan.Expand(srcVertexAttribute, trgVertexAttribute, edge, dir, child)) val selectionOnTargetVertex = gplan.Selection(propertyMapToCondition(trgVertexAttribute.properties, trgVertexAttribute.name), selectionOnEdge) selectionOnTargetVertex } } def propertyMapToCondition(properties: expr.types.TPropertyMap, baseName: String): Expression = { properties.map( (p) => cExpr.EqualTo(UnresolvedAttribute(Seq(baseName, p._1)), p._2) ) .foldLeft[Expression]( cExpr.Literal(true) )( (b, a) => cExpr.And(b, a) ) } }
Example 147
Source File: monotonicaggregates.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.aggregates

import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, Expression, Greatest, Least, Literal, Unevaluable}
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.util.TypeUtils
import org.apache.spark.sql.types.{AbstractDataType, AnyDataType, DataType}

abstract class MonotonicAggregateFunction extends DeclarativeAggregate with Serializable {}

case class MMax(child: Expression) extends MonotonicAggregateFunction {

  override def children: Seq[Expression] = child :: Nil

  override def nullable: Boolean = true

  // Return data type.
  override def dataType: DataType = child.dataType

  // Expected input data type.
  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)

  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtils.checkForOrderingExpr(child.dataType, "function mmax")

  private lazy val mmax = AttributeReference("mmax", child.dataType)()

  override lazy val aggBufferAttributes: Seq[AttributeReference] = mmax :: Nil

  override lazy val initialValues: Seq[Literal] = Seq(
    Literal.create(null, child.dataType)
  )

  override lazy val updateExpressions: Seq[Expression] = Seq(
    Greatest(Seq(mmax, child))
  )

  override lazy val mergeExpressions: Seq[Expression] = Seq(
    Greatest(Seq(mmax.left, mmax.right))
  )

  override lazy val evaluateExpression: AttributeReference = mmax
}

case class MMin(child: Expression) extends MonotonicAggregateFunction {

  override def children: Seq[Expression] = child :: Nil

  override def nullable: Boolean = true

  override def dataType: DataType = child.dataType

  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)

  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtils.checkForOrderingExpr(child.dataType, "function mmin")

  private lazy val mmin = AttributeReference("mmin", child.dataType)()

  override lazy val aggBufferAttributes: Seq[AttributeReference] = mmin :: Nil

  override lazy val initialValues: Seq[Literal] = Seq(
    Literal.create(null, child.dataType)
  )

  override lazy val updateExpressions: Seq[Expression] = Seq(
    Least(Seq(mmin, child))
  )

  override lazy val mergeExpressions: Seq[Expression] = Seq(
    Least(Seq(mmin.left, mmin.right))
  )

  override lazy val evaluateExpression: AttributeReference = mmin
}

case class MonotonicAggregateExpression(aggregateFunction: MonotonicAggregateFunction,
                                        mode: AggregateMode,
                                        isDistinct: Boolean)
  extends Expression with Unevaluable {

  override def children: Seq[Expression] = aggregateFunction :: Nil

  override def dataType: DataType = aggregateFunction.dataType

  override def foldable: Boolean = false

  override def nullable: Boolean = aggregateFunction.nullable

  override def references: AttributeSet = {
    val childReferences = mode match {
      case Partial | Complete => aggregateFunction.references.toSeq
      case PartialMerge | Final => aggregateFunction.aggBufferAttributes
    }
    AttributeSet(childReferences)
  }

  override def prettyString: String = aggregateFunction.prettyString

  override def toString: String = s"(${aggregateFunction},mode=$mode,isDistinct=$isDistinct)"
}
Example 148
Source File: ShuffleHashJoin.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution import edu.ucla.cs.wis.bigdatalog.spark.BigDatalogContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning, PartitioningCollection} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffleHashJoin(leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { @transient final protected val bigDatalogContext = SQLContext.getActive().getOrElse(null).asInstanceOf[BigDatalogContext] val cacheBuildSide = bigDatalogContext.getConf.getBoolean("spark.datalog.shufflehashjoin.cachebuildside", true) override lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) var cachedBuildPlan: RDD[HashedRelation] = null override def output: Seq[Attribute] = left.output ++ right.output override def outputPartitioning: Partitioning = PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning)) override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false protected override def doExecute(): RDD[InternalRow] = { val numStreamedRows = buildSide match { case BuildLeft => longMetric("numRightRows") case BuildRight => longMetric("numLeftRows") } val numOutputRows = longMetric("numOutputRows") if (cacheBuildSide) { if (cachedBuildPlan == null) { cachedBuildPlan = buildPlan.execute() .mapPartitionsInternal(iter => Iterator(HashedRelation(iter, SQLMetrics.nullLongMetric, buildSideKeyGenerator))) .persist() } cachedBuildPlan.zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) => hashJoin(streamedIter, numStreamedRows, buildIter.next(), numOutputRows)} } else { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) => val hashedRelation = HashedRelation(buildIter, SQLMetrics.nullLongMetric, buildSideKeyGenerator) hashJoin(streamedIter, numStreamedRows, hashedRelation, numOutputRows) } } } }
Example 149
Source File: operators.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.logical import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression} import org.apache.spark.sql.catalyst.plans.logical.{BinaryNode, LeafNode, LogicalPlan, Statistics, UnaryNode} case class Recursion(name: String, isLinear: Boolean, left: LogicalPlan, right: LogicalPlan, partitioning: Seq[Int]) extends BinaryNode { // left is exitRules plan // right is recursive rules plan override def output: Seq[Attribute] = right.output } case class MutualRecursion(name: String, isLinear: Boolean, left: LogicalPlan, right: LogicalPlan, partitioning: Seq[Int]) extends BinaryNode { override def output: Seq[Attribute] = right.output override def children: Seq[LogicalPlan] = { if (left == null) Seq(right) else Seq(left, right) } override def generateTreeString(depth: Int, lastChildren: Seq[Boolean], builder: StringBuilder): StringBuilder = { if (depth > 0) { lastChildren.init.foreach { isLast => val prefixFragment = if (isLast) " " else ": " builder.append(prefixFragment) } val branch = if (lastChildren.last) "+- " else ":- " builder.append(branch) } builder.append(simpleString) builder.append("\n") if (children.nonEmpty) { val exitRule = children.init if (exitRule != null) exitRule.foreach(_.generateTreeString(depth + 1, lastChildren :+ false, builder)) children.last.generateTreeString(depth + 1, lastChildren :+ true, builder) } builder } } case class LinearRecursiveRelation(_name: String, output: Seq[Attribute], partitioning: Seq[Int]) extends LeafNode { override def statistics: Statistics = Statistics(Long.MaxValue) var name = _name } case class NonLinearRecursiveRelation(_name: String, output: Seq[Attribute], partitioning: Seq[Int]) extends LeafNode { override def statistics: Statistics = Statistics(Long.MaxValue) def name = "all_" + _name } case class MonotonicAggregate(groupingExpressions: Seq[Expression], aggregateExpressions: Seq[NamedExpression], child: LogicalPlan, partitioning: Seq[Int]) extends UnaryNode { override lazy val resolved: Boolean = !expressions.exists(!_.resolved) && childrenResolved override def output: Seq[Attribute] = aggregateExpressions.map(_.toAttribute) } case class AggregateRecursion(name: String, isLinear: Boolean, left: LogicalPlan, right: LogicalPlan, partitioning: Seq[Int]) extends BinaryNode { // left is exitRules plan // right is recursive rules plan override def output: Seq[Attribute] = right.output } case class AggregateRelation(_name: String, output: Seq[Attribute], partitioning: Seq[Int]) extends LeafNode { override def statistics: Statistics = Statistics(Long.MaxValue) var name = _name } case class CacheHint(child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output }
Example 150
Source File: RuleExecutorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.trees import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Expression, IntegerLiteral, Literal} import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} class RuleExecutorSuite extends SparkFunSuite { object DecrementLiterals extends Rule[Expression] { def apply(e: Expression): Expression = e transform { case IntegerLiteral(i) if i > 0 => Literal(i - 1) } } test("only once") { object ApplyOnce extends RuleExecutor[Expression] { val batches = Batch("once", Once, DecrementLiterals) :: Nil } assert(ApplyOnce.execute(Literal(10)) === Literal(9)) } test("to fixed point") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(100), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(10)) === Literal(0)) } test("to maxIterations") { object ToFixedPoint extends RuleExecutor[Expression] { val batches = Batch("fixedPoint", FixedPoint(10), DecrementLiterals) :: Nil } assert(ToFixedPoint.execute(Literal(100)) === Literal(90)) } }
Example 151
Source File: FilterNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate case class FilterNode(conf: SQLConf, condition: Expression, child: LocalNode) extends UnaryLocalNode(conf) { private[this] var predicate: (InternalRow) => Boolean = _ override def output: Seq[Attribute] = child.output override def open(): Unit = { child.open() predicate = GeneratePredicate.generate(condition, child.output) } override def next(): Boolean = { var found = false while (!found && child.next()) { found = predicate.apply(child.fetch()) } found } override def fetch(): InternalRow = child.fetch() override def close(): Unit = child.close() }
Example 152
Source File: ExpandNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Projection} case class ExpandNode( conf: SQLConf, projections: Seq[Seq[Expression]], output: Seq[Attribute], child: LocalNode) extends UnaryLocalNode(conf) { assert(projections.size > 0) private[this] var result: InternalRow = _ private[this] var idx: Int = _ private[this] var input: InternalRow = _ private[this] var groups: Array[Projection] = _ override def open(): Unit = { child.open() groups = projections.map(ee => newProjection(ee, child.output)).toArray idx = groups.length } override def next(): Boolean = { if (idx >= groups.length) { if (child.next()) { input = child.fetch() idx = 0 } else { return false } } result = groups(idx)(input) idx += 1 true } override def fetch(): InternalRow = result override def close(): Unit = child.close() }
Example 153
Source File: LocalNodeTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.SparkFunSuite import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{Expression, AttributeReference} import org.apache.spark.sql.types.{IntegerType, StringType} class LocalNodeTest extends SparkFunSuite { protected val conf: SQLConf = new SQLConf protected val kvIntAttributes = Seq( AttributeReference("k", IntegerType)(), AttributeReference("v", IntegerType)()) protected val joinNameAttributes = Seq( AttributeReference("id1", IntegerType)(), AttributeReference("name", StringType)()) protected val joinNicknameAttributes = Seq( AttributeReference("id2", IntegerType)(), AttributeReference("nickname", StringType)()) protected def resolveExpressions( expressions: Seq[Expression], localNode: LocalNode): Seq[Expression] = { require(localNode.expressions.forall(_.resolved)) val inputMap = localNode.output.map { a => (a.name, a) }.toMap expressions.map { expression => expression.transformUp { case UnresolvedAttribute(Seq(u)) => inputMap.getOrElse(u, sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap")) } } } }
Example 154
Source File: BasicCurrencyConversionExpression.scala From HANAVora-Extensions with Apache License 2.0 | 4 votes |
package org.apache.spark.sql.currency.basic import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String case class BasicCurrencyConversionExpression( conversion: BasicCurrencyConversion, children: Seq[Expression]) extends Expression with ImplicitCastInputTypes with CodegenFallback { protected val AMOUNT_INDEX = 0 protected val FROM_INDEX = 1 protected val TO_INDEX = 2 protected val DATE_INDEX = 3 protected val NUM_ARGS = 4 override def eval(input: InternalRow): Any = { val inputArguments = children.map(_.eval(input)) require(inputArguments.length == NUM_ARGS, "wrong number of arguments") val sourceCurrency = Option(inputArguments(FROM_INDEX).asInstanceOf[UTF8String]).map(_.toString) val targetCurrency = Option(inputArguments(TO_INDEX).asInstanceOf[UTF8String]).map(_.toString) val amount = Option(inputArguments(AMOUNT_INDEX).asInstanceOf[Decimal].toJavaBigDecimal) val date = Option(inputArguments(DATE_INDEX).asInstanceOf[UTF8String]).map(_.toString) (amount, sourceCurrency, targetCurrency, date) match { case (Some(a), Some(s), Some(t), Some(d)) => nullSafeEval(a, s, t, d) case _ => null } } def nullSafeEval(amount: java.math.BigDecimal, sourceCurrency: String, targetCurrency: String, date: String): Any = { conversion.convert(amount, sourceCurrency, targetCurrency, date) .get .map(Decimal.apply) .orNull } override def dataType: DataType = DecimalType.forType(DoubleType) override def nullable: Boolean = true // TODO(MD, CS): use DateType but support date string override def inputTypes: Seq[AbstractDataType] = Seq(DecimalType, StringType, StringType, StringType) }