org.apache.spark.sql.catalyst.expressions.AttributeSet Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.expressions.AttributeSet.
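AttributeSet is Catalyst's set of attributes that compares elements by expression ID rather than by name. A minimal standalone sketch of the basic operations (the attribute names here are invented for illustration):

import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet}
import org.apache.spark.sql.types.IntegerType

object AttributeSetBasics {
  def main(args: Array[String]): Unit = {
    // Every AttributeReference gets a fresh exprId when constructed.
    val a = AttributeReference("a", IntegerType)()
    val b = AttributeReference("b", IntegerType)()
    val aCopy = AttributeReference("a", IntegerType)() // same name, different exprId

    val set = AttributeSet(a :: b :: Nil)
    println(set.contains(a))                      // true
    println(set.contains(aCopy))                  // false: equality is by exprId, not name
    println(AttributeSet(a :: Nil).subsetOf(set)) // true
    println((set -- AttributeSet(b :: Nil)).size) // 1
  }
}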
Example 1
Source File: ScriptTransformation.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression}

  // Note: in the full file this helper is a member of the ScriptInputOutputSchema
  // case class (elided in this listing), which supplies `schemaLess`.
  private def getRowFormatSQL(
      rowFormat: Seq[(String, String)],
      serdeClass: Option[String],
      serdeProps: Seq[(String, String)]): Option[String] = {
    if (schemaLess) return Some("")

    val rowFormatDelimited =
      rowFormat.map {
        case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value
        case o => return None
      }

    val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("")
    val serdePropsSQL =
      if (serdeClass.nonEmpty) {
        val props = serdeProps.map { p => s"'${p._1}' = '${p._2}'" }.mkString(", ")
        if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else ""
      } else {
        ""
      }
    if (rowFormat.nonEmpty) {
      Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" "))
    } else {
      Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL)
    }
  }
}
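The helper above does not itself use AttributeSet; in the full file the import serves the ScriptTransformation node's producedAttributes. A rough sketch of that usage (field list and body abridged, not the verbatim source):

case class ScriptTransformation(
    input: Seq[Expression],
    script: String,
    output: Seq[Attribute],
    child: LogicalPlan,
    ioschema: ScriptInputOutputSchema) extends UnaryNode {
  // The script produces every output attribute that is not simply forwarded
  // from the child, hence the set difference.
  override def producedAttributes: AttributeSet = outputSet -- inputSet
}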
Example 2
Source File: StreamingIncrementCommand.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.xsql.execution.command

import java.util.Locale

import org.apache.spark.SparkException
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.execution.streaming.StreamingRelationV2
import org.apache.spark.sql.sources.v2.StreamWriteSupport
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.xsql.DataSourceManager._
import org.apache.spark.sql.xsql.StreamingSinkType

case class StreamingIncrementCommand(plan: LogicalPlan) extends RunnableCommand {

  private var outputMode: OutputMode = OutputMode.Append

  // dummy
  override def output: Seq[AttributeReference] = Seq.empty

  // dummy
  override def producedAttributes: AttributeSet = plan.producedAttributes

  override def run(sparkSession: SparkSession): Seq[Row] = {
    import StreamingSinkType._

    val qe = new QueryExecution(sparkSession, new ConstructedStreaming(plan))
    val df = new Dataset(sparkSession, qe, RowEncoder(qe.analyzed.schema))
    plan.collectLeaves.head match {
      case StreamingRelationV2(_, _, extraOptions, _, _) =>
        val source = extraOptions.getOrElse(STREAMING_SINK_TYPE, DEFAULT_STREAMING_SINK)
        val sinkOptions = extraOptions.filter(_._1.startsWith(STREAMING_SINK_PREFIX)).map { kv =>
          val key = kv._1.substring(STREAMING_SINK_PREFIX.length)
          (key, kv._2)
        }
        StreamingSinkType.withName(source.toUpperCase(Locale.ROOT)) match {
          case CONSOLE =>
          case TEXT | PARQUET | ORC | JSON | CSV =>
            if (sinkOptions.get(STREAMING_SINK_PATH) == None) {
              throw new SparkException("Sink type is file, must config path")
            }
          case KAFKA =>
            if (sinkOptions.get(STREAMING_SINK_BOOTSTRAP_SERVERS) == None) {
              throw new SparkException("Sink type is kafka, must config bootstrap servers")
            }
            if (sinkOptions.get(STREAMING_SINK_TOPIC) == None) {
              throw new SparkException("Sink type is kafka, must config kafka topic")
            }
          case _ =>
            throw new SparkException(
              "Sink type is invalid, " +
                s"select from ${StreamingSinkType.values}")
        }
        val ds = DataSource.lookupDataSource(source, sparkSession.sessionState.conf)
        val disabledSources = sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",")
        val sink = ds.newInstance() match {
          case w: StreamWriteSupport if !disabledSources.contains(w.getClass.getCanonicalName) =>
            w
          case _ =>
            val ds = DataSource(
              sparkSession,
              className = source,
              options = sinkOptions.toMap,
              partitionColumns = Nil)
            ds.createSink(InternalOutputModes.Append)
        }
        val outputMode = InternalOutputModes(
          extraOptions.getOrElse(STREAMING_OUTPUT_MODE, DEFAULT_STREAMING_OUTPUT_MODE))
        val duration =
          extraOptions.getOrElse(STREAMING_TRIGGER_DURATION, DEFAULT_STREAMING_TRIGGER_DURATION)
        val trigger =
          extraOptions.getOrElse(STREAMING_TRIGGER_TYPE, DEFAULT_STREAMING_TRIGGER_TYPE) match {
            case STREAMING_MICRO_BATCH_TRIGGER => Trigger.ProcessingTime(duration)
            case STREAMING_ONCE_TRIGGER => Trigger.Once()
            case STREAMING_CONTINUOUS_TRIGGER => Trigger.Continuous(duration)
          }
        val query = sparkSession.sessionState.streamingQueryManager.startQuery(
          extraOptions.get("queryName"),
          extraOptions.get(STREAMING_CHECKPOINT_LOCATION),
          df,
          sinkOptions.toMap,
          sink,
          outputMode,
          useTempCheckpointLocation = source == DEFAULT_STREAMING_SINK,
          recoverFromCheckpointLocation = true,
          trigger = trigger)
        query.awaitTermination()
    }
    // dummy
    Seq.empty
  }
}

case class ConstructedStreaming(child: LogicalPlan) extends UnaryNode {
  override def output: Seq[Attribute] = child.output
}
Example 3
Source File: ScriptTransformation.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression}

  private def getRowFormatSQL(
      rowFormat: Seq[(String, String)],
      serdeClass: Option[String],
      serdeProps: Seq[(String, String)]): Option[String] = {
    if (schemaLess) return Some("")

    val rowFormatDelimited =
      rowFormat.map {
        case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value
        case o => return None
      }

    val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("")
    val serdePropsSQL =
      if (serdeClass.nonEmpty) {
        val props = serdeProps.map { p => s"'${p._1}' = '${p._2}'" }.mkString(", ")
        if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else ""
      } else {
        ""
      }
    if (rowFormat.nonEmpty) {
      Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" "))
    } else {
      Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL)
    }
  }
}
Example 4
Source File: DataSourceV2Strategy.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources.v2

import scala.collection.mutable

import org.apache.spark.sql.{sources, Strategy}
import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet, Expression}
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, Repartition}
import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan}
import org.apache.spark.sql.execution.datasources.DataSourceStrategy
import org.apache.spark.sql.execution.streaming.continuous.{ContinuousCoalesceExec, WriteToContinuousDataSource, WriteToContinuousDataSourceExec}
import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, SupportsPushDownFilters, SupportsPushDownRequiredColumns}
import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader

object DataSourceV2Strategy extends Strategy {

  // TODO: nested column pruning.
  private def pruneColumns(
      reader: DataSourceReader,
      relation: DataSourceV2Relation,
      exprs: Seq[Expression]): Seq[AttributeReference] = {
    reader match {
      case r: SupportsPushDownRequiredColumns =>
        val requiredColumns = AttributeSet(exprs.flatMap(_.references))
        val neededOutput = relation.output.filter(requiredColumns.contains)
        if (neededOutput != relation.output) {
          r.pruneColumns(neededOutput.toStructType)
          val nameToAttr = relation.output.map(_.name).zip(relation.output).toMap
          r.readSchema().toAttributes.map {
            // We have to keep the attribute id during transformation.
            a => a.withExprId(nameToAttr(a.name).exprId)
          }
        } else {
          relation.output
        }
      case _ => relation.output
    }
  }

  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case PhysicalOperation(project, filters, relation: DataSourceV2Relation) =>
      val reader = relation.newReader()
      // `pushedFilters` will be pushed down and evaluated in the underlying data sources.
      // `postScanFilters` need to be evaluated after the scan.
      // `postScanFilters` and `pushedFilters` can overlap, e.g. the parquet row group filter.
      // (`pushFilters` is defined elsewhere in this object and elided in this listing.)
      val (pushedFilters, postScanFilters) = pushFilters(reader, filters)
      val output = pruneColumns(reader, relation, project ++ postScanFilters)
      logInfo(
        s"""
           |Pushing operators to ${relation.source.getClass}
           |Pushed Filters: ${pushedFilters.mkString(", ")}
           |Post-Scan Filters: ${postScanFilters.mkString(",")}
           |Output: ${output.mkString(", ")}
         """.stripMargin)

      val scan = DataSourceV2ScanExec(
        output, relation.source, relation.options, pushedFilters, reader)

      val filterCondition = postScanFilters.reduceLeftOption(And)
      val withFilter = filterCondition.map(FilterExec(_, scan)).getOrElse(scan)

      // always add the projection, which will produce unsafe rows required by some operators
      ProjectExec(project, withFilter) :: Nil

    case r: StreamingDataSourceV2Relation =>
      // ensure there is a projection, which will produce unsafe rows required by some operators
      ProjectExec(r.output,
        DataSourceV2ScanExec(r.output, r.source, r.options, r.pushedFilters, r.reader)) :: Nil

    case WriteToDataSourceV2(writer, query) =>
      WriteToDataSourceV2Exec(writer, planLater(query)) :: Nil

    case AppendData(r: DataSourceV2Relation, query, _) =>
      WriteToDataSourceV2Exec(r.newWriter(), planLater(query)) :: Nil

    case WriteToContinuousDataSource(writer, query) =>
      WriteToContinuousDataSourceExec(writer, planLater(query)) :: Nil

    case Repartition(1, false, child) =>
      val isContinuous = child.collectFirst {
        case StreamingDataSourceV2Relation(_, _, _, r: ContinuousReader) => r
      }.isDefined

      if (isContinuous) {
        ContinuousCoalesceExec(1, planLater(child)) :: Nil
      } else {
        Nil
      }

    case _ => Nil
  }
}
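The heart of pruneColumns above is simple AttributeSet bookkeeping: collect the references of every expression that survives pushdown, then keep only the relation columns contained in that set. A standalone sketch of the same idea (attribute names and types invented for illustration):

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType}

object ColumnPruningSketch {
  def main(args: Array[String]): Unit = {
    val a = AttributeReference("a", IntegerType)()
    val b = AttributeReference("b", StringType)()
    val c = AttributeReference("c", DoubleType)()
    val relationOutput = Seq(a, b, c)

    // Suppose the query projects `a` and filters on `c`; `b` is never referenced.
    val exprs: Seq[Expression] = Seq(a, GreaterThan(c, Literal(0.0)))

    val requiredColumns = AttributeSet(exprs.flatMap(_.references))
    val neededOutput = relationOutput.filter(requiredColumns.contains)
    println(neededOutput.map(_.name)) // List(a, c)
  }
}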
Example 5
Source File: ScriptTransformation.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression}

  private def getRowFormatSQL(
      rowFormat: Seq[(String, String)],
      serdeClass: Option[String],
      serdeProps: Seq[(String, String)]): Option[String] = {
    if (schemaLess) return Some("")

    val rowFormatDelimited =
      rowFormat.map {
        case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value
        case o => return None
      }

    val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("")
    val serdePropsSQL =
      if (serdeClass.nonEmpty) {
        val props = serdeProps.map { p => s"'${p._1}' = '${p._2}'" }.mkString(", ")
        if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else ""
      } else {
        ""
      }
    if (rowFormat.nonEmpty) {
      Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" "))
    } else {
      Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL)
    }
  }
}
Example 6
Source File: package.scala From carbondata with Apache License 2.0
package org.apache.carbondata.mv

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.catalyst.expressions.{AttributeSet, Expression, PredicateHelper, ScalaUDF}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

import org.apache.carbondata.mv.plans.modular.ModularPlan
import org.apache.carbondata.mv.plans.util.{CheckSPJG, LogicalPlanSignatureGenerator, Signature}

  // Note: the enclosing object declarations are elided in this listing.
  def canEvaluate(exp: ScalaUDF, exprList: Seq[Expression]): Boolean = {
    var canBeDerived = false
    exprList.forall {
      case udf: ScalaUDF =>
        if (udf.children.length == exp.children.length) {
          if (udf.children.zip(exp.children).forall(e => e._1.sql.equalsIgnoreCase(e._2.sql))) {
            canBeDerived = true
          }
        }
        canBeDerived
      case _ =>
        canBeDerived
    }
  }

  def canEvaluate(expr: Expression, exprList: Seq[Expression]): Boolean = {
    expr match {
      case exp: ScalaUDF => canEvaluate(exp, exprList)
      case _ => expr.references.subsetOf(AttributeSet(exprList))
    }
  }
}

  def supports(supported: Boolean, message: Any) {
    if (!supported) {
      throw new UnsupportedOperationException(s"unsupported operation: $message")
    }
  }
}
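For anything other than a ScalaUDF, canEvaluate above reduces to an AttributeSet containment check: the expression can be evaluated over exprList only if everything it references is available there. A small sketch of that check in isolation (names invented for illustration):

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.IntegerType

object CanEvaluateSketch {
  def main(args: Array[String]): Unit = {
    val x = AttributeReference("x", IntegerType)()
    val y = AttributeReference("y", IntegerType)()
    val z = AttributeReference("z", IntegerType)()

    val available: Seq[Expression] = Seq(x, y)

    // `x + 1` references only x, which is available.
    println(Add(x, Literal(1)).references.subsetOf(AttributeSet(available))) // true
    // `x + z` references z, which is not available.
    println(Add(x, z).references.subsetOf(AttributeSet(available)))          // false
  }
}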
Example 7
Source File: ScriptTransformation.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression}

  private def getRowFormatSQL(
      rowFormat: Seq[(String, String)],
      serdeClass: Option[String],
      serdeProps: Seq[(String, String)]): Option[String] = {
    if (schemaLess) return Some("")

    val rowFormatDelimited =
      rowFormat.map {
        case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value
        case o => return None
      }

    val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("")
    val serdePropsSQL =
      if (serdeClass.nonEmpty) {
        val props = serdeProps.map { p => s"'${p._1}' = '${p._2}'" }.mkString(", ")
        if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else ""
      } else {
        ""
      }
    if (rowFormat.nonEmpty) {
      Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" "))
    } else {
      Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL)
    }
  }
}
Example 8
Source File: ScriptTransformation.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression}

  private def getRowFormatSQL(
      rowFormat: Seq[(String, String)],
      serdeClass: Option[String],
      serdeProps: Seq[(String, String)]): Option[String] = {
    if (schemaLess) return Some("")

    val rowFormatDelimited =
      rowFormat.map {
        case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value
        case o => return None
      }

    val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("")
    val serdePropsSQL =
      if (serdeClass.nonEmpty) {
        val props = serdeProps.map { p => s"'${p._1}' = '${p._2}'" }.mkString(", ")
        if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else ""
      } else {
        ""
      }
    if (rowFormat.nonEmpty) {
      Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" "))
    } else {
      Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL)
    }
  }
}
Example 9
Source File: monotonicaggregates.scala From BigDatalog with Apache License 2.0
package edu.ucla.cs.wis.bigdatalog.spark.execution.aggregates

import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, Expression, Greatest, Least, Literal, Unevaluable}
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.util.TypeUtils
import org.apache.spark.sql.types.{AbstractDataType, AnyDataType, DataType}

abstract class MonotonicAggregateFunction extends DeclarativeAggregate with Serializable {}

case class MMax(child: Expression) extends MonotonicAggregateFunction {

  override def children: Seq[Expression] = child :: Nil

  override def nullable: Boolean = true

  // Return data type.
  override def dataType: DataType = child.dataType

  // Expected input data type.
  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)

  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtils.checkForOrderingExpr(child.dataType, "function mmax")

  private lazy val mmax = AttributeReference("mmax", child.dataType)()

  override lazy val aggBufferAttributes: Seq[AttributeReference] = mmax :: Nil

  // The original listing is garbled from here: it mixes MMax with fragments of an
  // analogous MMin (which uses Least and evaluates to an `mmin` buffer attribute).
  // The expressions below reconstruct MMax along the lines of Spark's built-in Max.
  override lazy val initialValues: Seq[Literal] = Seq(
    Literal.create(null, child.dataType)
  )

  override lazy val updateExpressions: Seq[Expression] = Seq(
    Greatest(Seq(mmax, child))
  )

  override lazy val mergeExpressions: Seq[Expression] = Seq(
    Greatest(Seq(mmax.left, mmax.right))
  )

  override lazy val evaluateExpression: AttributeReference = mmax
}

case class MonotonicAggregateExpression(
    aggregateFunction: MonotonicAggregateFunction,
    mode: AggregateMode,
    isDistinct: Boolean)
  extends Expression with Unevaluable {

  override def children: Seq[Expression] = aggregateFunction :: Nil

  override def dataType: DataType = aggregateFunction.dataType

  override def foldable: Boolean = false

  override def nullable: Boolean = aggregateFunction.nullable

  override def references: AttributeSet = {
    val childReferences = mode match {
      // Before the shuffle the function still reads its input columns.
      case Partial | Complete => aggregateFunction.references.toSeq
      // After the shuffle it only reads the aggregation buffer attributes.
      case PartialMerge | Final => aggregateFunction.aggBufferAttributes
    }
    AttributeSet(childReferences)
  }

  override def prettyString: String = aggregateFunction.prettyString

  override def toString: String = s"(${aggregateFunction},mode=$mode,isDistinct=$isDistinct)"
}