org.apache.spark.sql.execution.SQLExecution Scala Examples
The following examples show how to use org.apache.spark.sql.execution.SQLExecution.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: S2SparkSqlStreamingSink.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.spark.sql.streaming import java.util.UUID import com.typesafe.config.{Config, ConfigRenderOptions} import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.streaming.{MetadataLog, Sink} import org.apache.spark.sql.{DataFrame, SparkSession} class S2SparkSqlStreamingSink( sparkSession: SparkSession, config:Config ) extends Sink with Logger { import S2SinkConfigs._ private val APP_NAME = "s2graph" private val writeLog: MetadataLog[Array[S2SinkStatus]] = { val logPath = getCommitLogPath(config) logger.info(s"MetaDataLogPath: $logPath") new S2SinkMetadataLog(sparkSession, config, logPath) } override def addBatch(batchId: Long, data: DataFrame): Unit = { logger.debug(s"addBatch : $batchId, getLatest : ${writeLog.getLatest()}") if (batchId <= writeLog.getLatest().map(_._1).getOrElse(-1L)) { logger.info(s"Skipping already committed batch [$batchId]") } else { val queryName = getConfigStringOpt(config, "queryname").getOrElse(UUID.randomUUID().toString) val commitProtocol = new S2CommitProtocol(writeLog) val jobState = JobState(queryName, batchId) val serializedConfig = config.root().render(ConfigRenderOptions.concise()) val queryExecution = data.queryExecution val schema = data.schema SQLExecution.withNewExecutionId(sparkSession, queryExecution) { try { val taskCommits = sparkSession.sparkContext.runJob(queryExecution.toRdd, (taskContext: TaskContext, iter: Iterator[InternalRow]) => { new S2StreamQueryWriter(serializedConfig, schema, commitProtocol).run(taskContext, iter) } ) commitProtocol.commitJob(jobState, taskCommits) } catch { case t: Throwable => commitProtocol.abortJob(jobState) throw t; } } } } private def getCommitLogPath(config:Config): String = { val logPathOpt = getConfigStringOpt(config, S2_SINK_LOG_PATH) val userCheckpointLocationOpt = getConfigStringOpt(config, S2_SINK_CHECKPOINT_LOCATION) (logPathOpt, userCheckpointLocationOpt) match { case (Some(logPath), _) => logPath case (None, Some(userCheckpoint)) => s"$userCheckpoint/sinks/$APP_NAME" case _ => throw new IllegalArgumentException(s"failed to get commit log path") } } override def toString(): String = "S2GraphSink" }
Example 2
Source File: SparkSQLDriver.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.util.{ArrayList => JArrayList, Arrays, List => JList} import scala.collection.JavaConverters._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.isEmpty) { new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, attr.dataType.catalogString, "") } new Schema(fieldSchemas.asJava, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { context.sparkContext.setJobDescription(command) val execution = context.sessionState.executePlan(context.sql(command).logicalPlan) hiveResponse = SQLExecution.withNewExecutionId(context.sparkSession, execution) { execution.hiveResultString() } tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause) } } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getResults(res: JList[_]): Boolean = { if (hiveResponse == null) { false } else { res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava) hiveResponse = null true } } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } }
Example 3
Source File: BasicWriteStatsTracker.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.FileNotFoundException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.{SparkContext, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.util.SerializableConfiguration class BasicWriteJobStatsTracker( serializableHadoopConf: SerializableConfiguration, @transient val metrics: Map[String, SQLMetric]) extends WriteJobStatsTracker { override def newTaskInstance(): WriteTaskStatsTracker = { new BasicWriteTaskStatsTracker(serializableHadoopConf.value) } override def processStats(stats: Seq[WriteTaskStats]): Unit = { val sparkContext = SparkContext.getActive.get var numPartitions: Long = 0L var numFiles: Long = 0L var totalNumBytes: Long = 0L var totalNumOutput: Long = 0L val basicStats = stats.map(_.asInstanceOf[BasicWriteTaskStats]) basicStats.foreach { summary => numPartitions += summary.numPartitions numFiles += summary.numFiles totalNumBytes += summary.numBytes totalNumOutput += summary.numRows } metrics(BasicWriteJobStatsTracker.NUM_FILES_KEY).add(numFiles) metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_BYTES_KEY).add(totalNumBytes) metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_ROWS_KEY).add(totalNumOutput) metrics(BasicWriteJobStatsTracker.NUM_PARTS_KEY).add(numPartitions) val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toList) } } object BasicWriteJobStatsTracker { private val NUM_FILES_KEY = "numFiles" private val NUM_OUTPUT_BYTES_KEY = "numOutputBytes" private val NUM_OUTPUT_ROWS_KEY = "numOutputRows" private val NUM_PARTS_KEY = "numParts" def metrics: Map[String, SQLMetric] = { val sparkContext = SparkContext.getActive.get Map( NUM_FILES_KEY -> SQLMetrics.createMetric(sparkContext, "number of written files"), NUM_OUTPUT_BYTES_KEY -> SQLMetrics.createMetric(sparkContext, "bytes of written output"), NUM_OUTPUT_ROWS_KEY -> SQLMetrics.createMetric(sparkContext, "number of output rows"), NUM_PARTS_KEY -> SQLMetrics.createMetric(sparkContext, "number of dynamic part") ) } }
Example 4
Source File: DeltaSink.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.sources import org.apache.spark.sql.delta._ import org.apache.spark.sql.delta.actions.SetTransaction import org.apache.spark.sql.delta.metering.DeltaLogging import org.apache.spark.sql.delta.schema.{ImplicitMetadataOperation, SchemaUtils} import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric import org.apache.spark.sql.execution.streaming.{Sink, StreamExecution} import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.NullType class DeltaSink( sqlContext: SQLContext, path: Path, partitionColumns: Seq[String], outputMode: OutputMode, options: DeltaOptions) extends Sink with ImplicitMetadataOperation with DeltaLogging { private val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, path) private val sqlConf = sqlContext.sparkSession.sessionState.conf override protected val canOverwriteSchema: Boolean = outputMode == OutputMode.Complete() && options.canOverwriteSchema override protected val canMergeSchema: Boolean = options.canMergeSchema override def addBatch(batchId: Long, data: DataFrame): Unit = deltaLog.withNewTransaction { txn => val sc = data.sparkSession.sparkContext val metrics = Map[String, SQLMetric]( "numAddedFiles" -> createMetric(sc, "number of files added"), "numRemovedFiles" -> createMetric(sc, "number of files removed") ) val queryId = sqlContext.sparkContext.getLocalProperty(StreamExecution.QUERY_ID_KEY) assert(queryId != null) if (SchemaUtils.typeExistsRecursively(data.schema)(_.isInstanceOf[NullType])) { throw DeltaErrors.streamWriteNullTypeException } // If the batch reads the same Delta table as this sink is going to write to, then this // write has dependencies. Then make sure that this commit set hasDependencies to true // by injecting a read on the whole table. This needs to be done explicitly because // MicroBatchExecution has already enforced all the data skipping (by forcing the generation // of the executed plan) even before the transaction was started. val selfScan = data.queryExecution.analyzed.collectFirst { case DeltaTable(index) if index.deltaLog.isSameLogAs(txn.deltaLog) => true }.nonEmpty if (selfScan) { txn.readWholeTable() } // Streaming sinks can't blindly overwrite schema. See Schema Management design doc for details updateMetadata( txn, data, partitionColumns, configuration = Map.empty, outputMode == OutputMode.Complete()) val currentVersion = txn.txnVersion(queryId) if (currentVersion >= batchId) { logInfo(s"Skipping already complete epoch $batchId, in query $queryId") return } val deletedFiles = outputMode match { case o if o == OutputMode.Complete() => deltaLog.assertRemovable() txn.filterFiles().map(_.remove) case _ => Nil } val newFiles = txn.writeFiles(data, Some(options)) val setTxn = SetTransaction(queryId, batchId, Some(deltaLog.clock.getTimeMillis())) :: Nil val info = DeltaOperations.StreamingUpdate(outputMode, queryId, batchId, options.userMetadata) metrics("numRemovedFiles").set(deletedFiles.size) metrics("numAddedFiles").set(newFiles.size) txn.registerSQLMetrics(sqlContext.sparkSession, metrics) txn.commit(setTxn ++ newFiles ++ deletedFiles, info) // This is needed to make the SQL metrics visible in the Spark UI val executionId = sqlContext.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) SQLMetrics.postDriverMetricUpdates( sqlContext.sparkContext, executionId, metrics.values.toSeq) } override def toString(): String = s"DeltaSink[$path]" }
Example 5
Source File: KafkaWriter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010 import java.{util => ju} import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} import org.apache.spark.sql.types.{BinaryType, StringType} import org.apache.spark.util.Utils private[kafka010] object KafkaWriter extends Logging { val TOPIC_ATTRIBUTE_NAME: String = "topic" val KEY_ATTRIBUTE_NAME: String = "key" val VALUE_ATTRIBUTE_NAME: String = "value" override def toString: String = "KafkaWriter" def validateQuery( schema: Seq[Attribute], kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse( if (topic.isEmpty) { throw new AnalysisException(s"topic option required when no " + s"'$TOPIC_ATTRIBUTE_NAME' attribute is present. Use the " + s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a topic.") } else { Literal(topic.get, StringType) } ).dataType match { case StringType => // good case _ => throw new AnalysisException(s"Topic type must be a String") } schema.find(_.name == KEY_ATTRIBUTE_NAME).getOrElse( Literal(null, StringType) ).dataType match { case StringType | BinaryType => // good case _ => throw new AnalysisException(s"$KEY_ATTRIBUTE_NAME attribute type " + s"must be a String or BinaryType") } schema.find(_.name == VALUE_ATTRIBUTE_NAME).getOrElse( throw new AnalysisException(s"Required attribute '$VALUE_ATTRIBUTE_NAME' not found") ).dataType match { case StringType | BinaryType => // good case _ => throw new AnalysisException(s"$VALUE_ATTRIBUTE_NAME attribute type " + s"must be a String or BinaryType") } } def write( sparkSession: SparkSession, queryExecution: QueryExecution, kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { val schema = queryExecution.analyzed.output validateQuery(schema, kafkaParameters, topic) queryExecution.toRdd.foreachPartition { iter => val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic) Utils.tryWithSafeFinally(block = writeTask.execute(iter))( finallyBlock = writeTask.close()) } } }
Example 6
Source File: SparkSQLDriver.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.util.{ArrayList => JArrayList, Arrays, List => JList} import scala.collection.JavaConverters._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.isEmpty) { new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, attr.dataType.catalogString, "") } new Schema(fieldSchemas.asJava, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { context.sparkContext.setJobDescription(command) val execution = context.sessionState.executePlan(context.sql(command).logicalPlan) hiveResponse = SQLExecution.withNewExecutionId(context.sparkSession, execution) { execution.hiveResultString() } tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause) } } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getResults(res: JList[_]): Boolean = { if (hiveResponse == null) { false } else { res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava) hiveResponse = null true } } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } }
Example 7
Source File: KinesisWriter.scala From kinesis-sql with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kinesis import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} import org.apache.spark.util.Utils private[kinesis] object KinesisWriter extends Logging { val DATA_ATTRIBUTE_NAME: String = "data" val PARTITION_KEY_ATTRIBUTE_NAME: String = "partitionKey" override def toString: String = "KinesisWriter" def write(sparkSession: SparkSession, queryExecution: QueryExecution, kinesisParameters: Map[String, String]): Unit = { val schema = queryExecution.analyzed.output SQLExecution.withNewExecutionId(sparkSession, queryExecution) { queryExecution.toRdd.foreachPartition { iter => val writeTask = new KinesisWriteTask(kinesisParameters, schema) Utils.tryWithSafeFinally(block = writeTask.execute(iter))( finallyBlock = writeTask.close()) } } } }