org.apache.spark.sql.catalyst.expressions.AttributeSet Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.expressions.AttributeSet.
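AttributeSet is Catalyst's set of attributes that compares elements by expression ID rather than by name. A minimal standalone sketch of the basic operations (the attribute names here are invented for illustration):

import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet}
import org.apache.spark.sql.types.IntegerType

object AttributeSetBasics {
  def main(args: Array[String]): Unit = {
    // Every AttributeReference gets a fresh exprId when constructed.
    val a = AttributeReference("a", IntegerType)()
    val b = AttributeReference("b", IntegerType)()
    val aCopy = AttributeReference("a", IntegerType)() // same name, different exprId

    val set = AttributeSet(a :: b :: Nil)
    println(set.contains(a))                      // true
    println(set.contains(aCopy))                  // false: equality is by exprId, not name
    println(AttributeSet(a :: Nil).subsetOf(set)) // true
    println((set -- AttributeSet(b :: Nil)).size) // 1
  }
}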
Example 1
Source File: ScriptTransformation.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression}

  // Note: in the full file this helper is a member of the ScriptInputOutputSchema
  // case class (elided in this listing), which supplies `schemaLess`.
  private def getRowFormatSQL(
      rowFormat: Seq[(String, String)],
      serdeClass: Option[String],
      serdeProps: Seq[(String, String)]): Option[String] = {
    if (schemaLess) return Some("")

    val rowFormatDelimited =
      rowFormat.map {
        case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value
        case o => return None
      }

    val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("")
    val serdePropsSQL =
      if (serdeClass.nonEmpty) {
        val props = serdeProps.map { p => s"'${p._1}' = '${p._2}'" }.mkString(", ")
        if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else ""
      } else {
        ""
      }
    if (rowFormat.nonEmpty) {
      Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" "))
    } else {
      Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL)
    }
  }
}
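The helper above does not itself use AttributeSet; in the full file the import serves the ScriptTransformation node's producedAttributes. A rough sketch of that usage (field list and body abridged, not the verbatim source):

case class ScriptTransformation(
    input: Seq[Expression],
    script: String,
    output: Seq[Attribute],
    child: LogicalPlan,
    ioschema: ScriptInputOutputSchema) extends UnaryNode {
  // The script produces every output attribute that is not simply forwarded
  // from the child, hence the set difference.
  override def producedAttributes: AttributeSet = outputSet -- inputSet
}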
Example 2
Source File: StreamingIncrementCommand.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.xsql.execution.command

import java.util.Locale

import org.apache.spark.SparkException
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.execution.streaming.StreamingRelationV2
import org.apache.spark.sql.sources.v2.StreamWriteSupport
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.xsql.DataSourceManager._
import org.apache.spark.sql.xsql.StreamingSinkType

case class StreamingIncrementCommand(plan: LogicalPlan) extends RunnableCommand {

  private var outputMode: OutputMode = OutputMode.Append

  // dummy
  override def output: Seq[AttributeReference] = Seq.empty

  // dummy
  override def producedAttributes: AttributeSet = plan.producedAttributes

  override def run(sparkSession: SparkSession): Seq[Row] = {
    import StreamingSinkType._

    val qe = new QueryExecution(sparkSession, new ConstructedStreaming(plan))
    val df = new Dataset(sparkSession, qe, RowEncoder(qe.analyzed.schema))
    plan.collectLeaves.head match {
      case StreamingRelationV2(_, _, extraOptions, _, _) =>
        val source = extraOptions.getOrElse(STREAMING_SINK_TYPE, DEFAULT_STREAMING_SINK)
        val sinkOptions = extraOptions.filter(_._1.startsWith(STREAMING_SINK_PREFIX)).map { kv =>
          val key = kv._1.substring(STREAMING_SINK_PREFIX.length)
          (key, kv._2)
        }
        StreamingSinkType.withName(source.toUpperCase(Locale.ROOT)) match {
          case CONSOLE =>
          case TEXT | PARQUET | ORC | JSON | CSV =>
            if (sinkOptions.get(STREAMING_SINK_PATH) == None) {
              throw new SparkException("Sink type is file, must config path")
            }
          case KAFKA =>
            if (sinkOptions.get(STREAMING_SINK_BOOTSTRAP_SERVERS) == None) {
              throw new SparkException("Sink type is kafka, must config bootstrap servers")
            }
            if (sinkOptions.get(STREAMING_SINK_TOPIC) == None) {
              throw new SparkException("Sink type is kafka, must config kafka topic")
            }
          case _ =>
            throw new SparkException(
              "Sink type is invalid, " +
                s"select from ${StreamingSinkType.values}")
        }
        val ds = DataSource.lookupDataSource(source, sparkSession.sessionState.conf)
        val disabledSources = sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",")
        val sink = ds.newInstance() match {
          case w: StreamWriteSupport if !disabledSources.contains(w.getClass.getCanonicalName) =>
            w
          case _ =>
            val ds = DataSource(
              sparkSession,
              className = source,
              options = sinkOptions.toMap,
              partitionColumns = Nil)
            ds.createSink(InternalOutputModes.Append)
        }
        val outputMode = InternalOutputModes(
          extraOptions.getOrElse(STREAMING_OUTPUT_MODE, DEFAULT_STREAMING_OUTPUT_MODE))
        val duration =
          extraOptions.getOrElse(STREAMING_TRIGGER_DURATION, DEFAULT_STREAMING_TRIGGER_DURATION)
        val trigger =
          extraOptions.getOrElse(STREAMING_TRIGGER_TYPE, DEFAULT_STREAMING_TRIGGER_TYPE) match {
            case STREAMING_MICRO_BATCH_TRIGGER => Trigger.ProcessingTime(duration)
            case STREAMING_ONCE_TRIGGER => Trigger.Once()
            case STREAMING_CONTINUOUS_TRIGGER => Trigger.Continuous(duration)
          }
        val query = sparkSession.sessionState.streamingQueryManager.startQuery(
          extraOptions.get("queryName"),
          extraOptions.get(STREAMING_CHECKPOINT_LOCATION),
          df,
          sinkOptions.toMap,
          sink,
          outputMode,
          useTempCheckpointLocation = source == DEFAULT_STREAMING_SINK,
          recoverFromCheckpointLocation = true,
          trigger = trigger)
        query.awaitTermination()
    }
    // dummy
    Seq.empty
  }
}

case class ConstructedStreaming(child: LogicalPlan) extends UnaryNode {
  override def output: Seq[Attribute] = child.output
}
Example 3
Source File: ScriptTransformation.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression}

  private def getRowFormatSQL(
      rowFormat: Seq[(String, String)],
      serdeClass: Option[String],
      serdeProps: Seq[(String, String)]): Option[String] = {
    if (schemaLess) return Some("")

    val rowFormatDelimited =
      rowFormat.map {
        case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value
        case o => return None
      }

    val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("")
    val serdePropsSQL =
      if (serdeClass.nonEmpty) {
        val props = serdeProps.map { p => s"'${p._1}' = '${p._2}'" }.mkString(", ")
        if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else ""
      } else {
        ""
      }
    if (rowFormat.nonEmpty) {
      Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" "))
    } else {
      Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL)
    }
  }
}
Example 4
Source File: DataSourceV2Strategy.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources.v2

import scala.collection.mutable

import org.apache.spark.sql.{sources, Strategy}
import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet, Expression}
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, Repartition}
import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan}
import org.apache.spark.sql.execution.datasources.DataSourceStrategy
import org.apache.spark.sql.execution.streaming.continuous.{ContinuousCoalesceExec, WriteToContinuousDataSource, WriteToContinuousDataSourceExec}
import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, SupportsPushDownFilters, SupportsPushDownRequiredColumns}
import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader

object DataSourceV2Strategy extends Strategy {

  // TODO: nested column pruning.
  private def pruneColumns(
      reader: DataSourceReader,
      relation: DataSourceV2Relation,
      exprs: Seq[Expression]): Seq[AttributeReference] = {
    reader match {
      case r: SupportsPushDownRequiredColumns =>
        val requiredColumns = AttributeSet(exprs.flatMap(_.references))
        val neededOutput = relation.output.filter(requiredColumns.contains)
        if (neededOutput != relation.output) {
          r.pruneColumns(neededOutput.toStructType)
          val nameToAttr = relation.output.map(_.name).zip(relation.output).toMap
          r.readSchema().toAttributes.map {
            // We have to keep the attribute id during transformation.
            a => a.withExprId(nameToAttr(a.name).exprId)
          }
        } else {
          relation.output
        }
      case _ => relation.output
    }
  }

  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case PhysicalOperation(project, filters, relation: DataSourceV2Relation) =>
      val reader = relation.newReader()
      // `pushedFilters` will be pushed down and evaluated in the underlying data sources.
      // `postScanFilters` need to be evaluated after the scan.
      // `postScanFilters` and `pushedFilters` can overlap, e.g. the parquet row group filter.
      // (`pushFilters` is defined elsewhere in this object and elided in this listing.)
      val (pushedFilters, postScanFilters) = pushFilters(reader, filters)
      val output = pruneColumns(reader, relation, project ++ postScanFilters)
      logInfo(
        s"""
           |Pushing operators to ${relation.source.getClass}
           |Pushed Filters: ${pushedFilters.mkString(", ")}
           |Post-Scan Filters: ${postScanFilters.mkString(",")}
           |Output: ${output.mkString(", ")}
         """.stripMargin)

      val scan = DataSourceV2ScanExec(
        output, relation.source, relation.options, pushedFilters, reader)

      val filterCondition = postScanFilters.reduceLeftOption(And)
      val withFilter = filterCondition.map(FilterExec(_, scan)).getOrElse(scan)

      // always add the projection, which will produce unsafe rows required by some operators
      ProjectExec(project, withFilter) :: Nil

    case r: StreamingDataSourceV2Relation =>
      // ensure there is a projection, which will produce unsafe rows required by some operators
      ProjectExec(r.output,
        DataSourceV2ScanExec(r.output, r.source, r.options, r.pushedFilters, r.reader)) :: Nil

    case WriteToDataSourceV2(writer, query) =>
      WriteToDataSourceV2Exec(writer, planLater(query)) :: Nil

    case AppendData(r: DataSourceV2Relation, query, _) =>
      WriteToDataSourceV2Exec(r.newWriter(), planLater(query)) :: Nil

    case WriteToContinuousDataSource(writer, query) =>
      WriteToContinuousDataSourceExec(writer, planLater(query)) :: Nil

    case Repartition(1, false, child) =>
      val isContinuous = child.collectFirst {
        case StreamingDataSourceV2Relation(_, _, _, r: ContinuousReader) => r
      }.isDefined

      if (isContinuous) {
        ContinuousCoalesceExec(1, planLater(child)) :: Nil
      } else {
        Nil
      }

    case _ => Nil
  }
}
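The heart of pruneColumns above is simple AttributeSet bookkeeping: collect the references of every expression that survives pushdown, then keep only the relation columns contained in that set. A standalone sketch of the same idea (attribute names and types invented for illustration):

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType}

object ColumnPruningSketch {
  def main(args: Array[String]): Unit = {
    val a = AttributeReference("a", IntegerType)()
    val b = AttributeReference("b", StringType)()
    val c = AttributeReference("c", DoubleType)()
    val relationOutput = Seq(a, b, c)

    // Suppose the query projects `a` and filters on `c`; `b` is never referenced.
    val exprs: Seq[Expression] = Seq(a, GreaterThan(c, Literal(0.0)))

    val requiredColumns = AttributeSet(exprs.flatMap(_.references))
    val neededOutput = relationOutput.filter(requiredColumns.contains)
    println(neededOutput.map(_.name)) // List(a, c)
  }
}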
Example 5
Source File: ScriptTransformation.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression}

  private def getRowFormatSQL(
      rowFormat: Seq[(String, String)],
      serdeClass: Option[String],
      serdeProps: Seq[(String, String)]): Option[String] = {
    if (schemaLess) return Some("")

    val rowFormatDelimited =
      rowFormat.map {
        case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value
        case o => return None
      }

    val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("")
    val serdePropsSQL =
      if (serdeClass.nonEmpty) {
        val props = serdeProps.map { p => s"'${p._1}' = '${p._2}'" }.mkString(", ")
        if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else ""
      } else {
        ""
      }
    if (rowFormat.nonEmpty) {
      Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" "))
    } else {
      Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL)
    }
  }
}
Example 6
Source File: package.scala From carbondata with Apache License 2.0
package org.apache.carbondata.mv

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.catalyst.expressions.{AttributeSet, Expression, PredicateHelper, ScalaUDF}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

import org.apache.carbondata.mv.plans.modular.ModularPlan
import org.apache.carbondata.mv.plans.util.{CheckSPJG, LogicalPlanSignatureGenerator, Signature}

  // Note: the enclosing object declarations are elided in this listing.
  def canEvaluate(exp: ScalaUDF, exprList: Seq[Expression]): Boolean = {
    var canBeDerived = false
    exprList.forall {
      case udf: ScalaUDF =>
        if (udf.children.length == exp.children.length) {
          if (udf.children.zip(exp.children).forall(e => e._1.sql.equalsIgnoreCase(e._2.sql))) {
            canBeDerived = true
          }
        }
        canBeDerived
      case _ =>
        canBeDerived
    }
  }

  def canEvaluate(expr: Expression, exprList: Seq[Expression]): Boolean = {
    expr match {
      case exp: ScalaUDF => canEvaluate(exp, exprList)
      case _ => expr.references.subsetOf(AttributeSet(exprList))
    }
  }
}

  def supports(supported: Boolean, message: Any) {
    if (!supported) {
      throw new UnsupportedOperationException(s"unsupported operation: $message")
    }
  }
}
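For anything other than a ScalaUDF, canEvaluate above reduces to an AttributeSet containment check: the expression can be evaluated over exprList only if everything it references is available there. A small sketch of that check in isolation (names invented for illustration):

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.IntegerType

object CanEvaluateSketch {
  def main(args: Array[String]): Unit = {
    val x = AttributeReference("x", IntegerType)()
    val y = AttributeReference("y", IntegerType)()
    val z = AttributeReference("z", IntegerType)()

    val available: Seq[Expression] = Seq(x, y)

    // `x + 1` references only x, which is available.
    println(Add(x, Literal(1)).references.subsetOf(AttributeSet(available))) // true
    // `x + z` references z, which is not available.
    println(Add(x, z).references.subsetOf(AttributeSet(available)))          // false
  }
}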
Example 7
Source File: ScriptTransformation.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression}

  private def getRowFormatSQL(
      rowFormat: Seq[(String, String)],
      serdeClass: Option[String],
      serdeProps: Seq[(String, String)]): Option[String] = {
    if (schemaLess) return Some("")

    val rowFormatDelimited =
      rowFormat.map {
        case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value
        case o => return None
      }

    val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("")
    val serdePropsSQL =
      if (serdeClass.nonEmpty) {
        val props = serdeProps.map { p => s"'${p._1}' = '${p._2}'" }.mkString(", ")
        if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else ""
      } else {
        ""
      }
    if (rowFormat.nonEmpty) {
      Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" "))
    } else {
      Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL)
    }
  }
}
Example 8
Source File: ScriptTransformation.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression}

  private def getRowFormatSQL(
      rowFormat: Seq[(String, String)],
      serdeClass: Option[String],
      serdeProps: Seq[(String, String)]): Option[String] = {
    if (schemaLess) return Some("")

    val rowFormatDelimited =
      rowFormat.map {
        case ("TOK_TABLEROWFORMATFIELD", value) => "FIELDS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATCOLLITEMS", value) => "COLLECTION ITEMS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATMAPKEYS", value) => "MAP KEYS TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATLINES", value) => "LINES TERMINATED BY " + value
        case ("TOK_TABLEROWFORMATNULL", value) => "NULL DEFINED AS " + value
        case o => return None
      }

    val serdeClassSQL = serdeClass.map("'" + _ + "'").getOrElse("")
    val serdePropsSQL =
      if (serdeClass.nonEmpty) {
        val props = serdeProps.map { p => s"'${p._1}' = '${p._2}'" }.mkString(", ")
        if (props.nonEmpty) " WITH SERDEPROPERTIES(" + props + ")" else ""
      } else {
        ""
      }
    if (rowFormat.nonEmpty) {
      Some("ROW FORMAT DELIMITED " + rowFormatDelimited.mkString(" "))
    } else {
      Some("ROW FORMAT SERDE " + serdeClassSQL + serdePropsSQL)
    }
  }
}
Example 9
Source File: monotonicaggregates.scala From BigDatalog with Apache License 2.0
package edu.ucla.cs.wis.bigdatalog.spark.execution.aggregates

import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, Expression, Greatest, Least, Literal, Unevaluable}
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.util.TypeUtils
import org.apache.spark.sql.types.{AbstractDataType, AnyDataType, DataType}

abstract class MonotonicAggregateFunction extends DeclarativeAggregate with Serializable {}

case class MMax(child: Expression) extends MonotonicAggregateFunction {

  override def children: Seq[Expression] = child :: Nil

  override def nullable: Boolean = true

  // Return data type.
  override def dataType: DataType = child.dataType

  // Expected input data type.
  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)

  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtils.checkForOrderingExpr(child.dataType, "function mmax")

  private lazy val mmax = AttributeReference("mmax", child.dataType)()

  override lazy val aggBufferAttributes: Seq[AttributeReference] = mmax :: Nil

  // The original listing is garbled from here: it mixes MMax with fragments of an
  // analogous MMin (which uses Least and evaluates to an `mmin` buffer attribute).
  // The expressions below reconstruct MMax along the lines of Spark's built-in Max.
  override lazy val initialValues: Seq[Literal] = Seq(
    Literal.create(null, child.dataType)
  )

  override lazy val updateExpressions: Seq[Expression] = Seq(
    Greatest(Seq(mmax, child))
  )

  override lazy val mergeExpressions: Seq[Expression] = Seq(
    Greatest(Seq(mmax.left, mmax.right))
  )

  override lazy val evaluateExpression: AttributeReference = mmax
}

case class MonotonicAggregateExpression(
    aggregateFunction: MonotonicAggregateFunction,
    mode: AggregateMode,
    isDistinct: Boolean)
  extends Expression with Unevaluable {

  override def children: Seq[Expression] = aggregateFunction :: Nil

  override def dataType: DataType = aggregateFunction.dataType

  override def foldable: Boolean = false

  override def nullable: Boolean = aggregateFunction.nullable

  override def references: AttributeSet = {
    val childReferences = mode match {
      // Before the shuffle the function still reads its input columns.
      case Partial | Complete => aggregateFunction.references.toSeq
      // After the shuffle it only reads the aggregation buffer attributes.
      case PartialMerge | Final => aggregateFunction.aggBufferAttributes
    }
    AttributeSet(childReferences)
  }

  override def prettyString: String = aggregateFunction.prettyString

  override def toString: String = s"(${aggregateFunction},mode=$mode,isDistinct=$isDistinct)"
}