org.apache.spark.sql.execution.streaming.StreamExecution Scala Examples
The following examples show how to use org.apache.spark.sql.execution.streaming.StreamExecution.
Example 1
Source File: StreamingQueryException.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming

import org.apache.spark.annotation.Experimental
import org.apache.spark.sql.execution.streaming.{Offset, StreamExecution}

// NOTE(review): the declaration of the enclosing class is not part of this excerpt. The members
// below read `cause` and `query`, which must be supplied by that missing declaration.

  /** Value of `System.currentTimeMillis` captured when this member is initialised. */
  val time: Long = System.currentTimeMillis

  /**
   * Builds a multi-line description consisting of the cause's message, the first ten frames of
   * the cause's stack trace, and the debug string of the underlying [[StreamExecution]].
   *
   * The stack frames are joined with "\n|\t" so that, once interpolated into the margin-stripped
   * template below, every frame starts on its own line behind a `|` margin marker.
   *
   * Note: `query` is cast to [[StreamExecution]] unconditionally, so a query of any other runtime
   * type makes this method throw a `ClassCastException`.
   */
  override def toString(): String = {
    val causeStr =
      s"${cause.getMessage} ${cause.getStackTrace.take(10).mkString("", "\n|\t", "\n")}"
    // The template must span several lines: stripMargin only removes a `|` margin that is the
    // first non-blank character of a line.
    s"""
       |$causeStr
       |
       |${query.asInstanceOf[StreamExecution].toDebugString}
       """.stripMargin
  }
}
Example 2
Source File: StateStoreRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.state

import java.util.UUID

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.EpochTracker
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration

// NOTE(review): the declaration of the enclosing RDD class is not part of this excerpt. The
// methods below rely on members it must provide (checkpointLocation, operatorId, queryRunId,
// storeCoordinator, storeVersion, keySchema, valueSchema, indexOrdinal, storeConf,
// hadoopConfBroadcast, dataRDD, storeUpdateFunction and the type parameter U).

  /**
   * Asks the store coordinator (when one is present) where the state store provider for this
   * partition is located and returns that location, or an empty sequence if none is reported.
   */
  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val providerId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index),
      queryRunId)
    storeCoordinator.flatMap(_.getLocation(providerId)).toSeq
  }

  /**
   * Obtains the state store for this partition and hands it, together with the parent RDD's
   * iterator for the same partition, to `storeUpdateFunction`.
   *
   * @param partition partition whose index selects the state store
   * @param ctxt      task context; its local properties tell whether the task belongs to a
   *                  continuous-processing query
   * @return whatever `storeUpdateFunction` produces
   */
  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    val providerId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index),
      queryRunId)

    // If we're in continuous processing mode, we should get the store version for the current
    // epoch rather than the one at planning time.
    val continuousFlag = Option(ctxt.getLocalProperty(StreamExecution.IS_CONTINUOUS_PROCESSING))
    val versionToLoad =
      if (continuousFlag.exists(_.toBoolean)) {
        val epoch = EpochTracker.getCurrentEpoch
        assert(epoch.isDefined, "Current epoch must be defined for continuous processing streams.")
        epoch.get
      } else {
        storeVersion
      }

    // The store is fetched before the input iterator is created, as in the original ordering.
    val store = StateStore.get(
      providerId,
      keySchema,
      valueSchema,
      indexOrdinal,
      versionToLoad,
      storeConf,
      hadoopConfBroadcast.value.value)
    val rows = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, rows)
  }
}
Example 3
Source File: WriteToContinuousDataSourceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous

import scala.util.control.NonFatal

import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter

/**
 * Physical plan node that runs `query` through a [[ContinuousWriteRDD]] built from the factory
 * of the given `writer`. The node itself produces no output attributes and returns an empty RDD.
 *
 * @param writer stream writer whose factory is handed to the write RDD
 * @param query  child plan whose execution result is written
 */
case class WriteToContinuousDataSourceExec(writer: StreamWriter, query: SparkPlan)
    extends SparkPlan with Logging {

  override def children: Seq[SparkPlan] = Seq(query)

  override def output: Seq[Attribute] = Nil

  /**
   * Builds the write RDD, reports its partition count to the epoch coordinator, and then forces
   * the RDD to run.
   *
   * An `InterruptedException` is swallowed. Exceptions recognised by
   * `StreamExecution.isInterruptionException` and fatal errors propagate unchanged. Any other
   * non-fatal exception is wrapped in a [[SparkException]].
   */
  override protected def doExecute(): RDD[InternalRow] = {
    val factory = writer.createWriterFactory()
    val writeRdd = new ContinuousWriteRDD(query.execute(), factory)

    logInfo(s"Start processing data source writer: $writer. " +
      s"The input RDD has ${writeRdd.partitions.length} partitions.")

    val coordinator = EpochCoordinatorRef.get(
      sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY),
      sparkContext.env)
    coordinator.askSync[Unit](SetWriterPartitions(writeRdd.getNumPartitions))

    try {
      // Force the RDD to run so continuous processing starts; no data is actually being collected
      // to the driver, as ContinuousWriteRDD outputs nothing.
      writeRdd.collect()
    } catch {
      // Interruption is how continuous queries are ended, so accept and ignore the exception.
      case _: InterruptedException =>
      // Do not wrap interruption exceptions that will be handled by streaming specially.
      case cause: Throwable if StreamExecution.isInterruptionException(cause) =>
        throw cause
      // Only wrap non fatal exceptions; anything else is not caught here and propagates as is.
      case NonFatal(e) =>
        throw new SparkException("Writing job aborted.", e)
    }

    sparkContext.emptyRDD
  }
}
Example 4
Source File: StateStoreMetricsTest.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming

import org.apache.spark.sql.execution.streaming.StreamExecution

/**
 * Test mix-in with assertions on the state-operator row counts reported in a streaming query's
 * recent progress.
 *
 * The trait remembers which query it last inspected and how far into that query's progress list
 * it got, so that "updated rows" are summed only over progress entries not seen before.
 */
trait StateStoreMetricsTest extends StreamTest {

  // Index of the last progress entry consumed by assertNumStateRows; -1 means "none yet".
  private var lastCheckedRecentProgressIndex = -1
  // Query inspected by the previous assertNumStateRows call (null before the first call).
  private var lastQuery: StreamExecution = null

  override def beforeEach(): Unit = {
    super.beforeEach()
    lastCheckedRecentProgressIndex = -1
  }

  /**
   * Asserts, per state operator, the total row count of the latest progress and the number of
   * rows updated since the previous check.
   *
   * @param total   expected total rows, one element per state operator
   * @param updated expected updated rows since the last check, one element per state operator
   */
  def assertNumStateRows(total: Seq[Long], updated: Seq[Long]): AssertOnQuery =
    AssertOnQuery(s"Check total state rows = $total, updated state rows = $updated") { q =>
      // This assumes that the streaming query will not make any progress while the eventually
      // is being executed.
      eventually(timeout(streamingTimeout)) {
        val recentProgress = q.recentProgress
        require(recentProgress.nonEmpty, "No progress made, cannot check num state rows")
        require(recentProgress.length < spark.sessionState.conf.streamingProgressRetention,
          "This test assumes that all progresses are present in q.recentProgress but " +
            "some may have been dropped due to retention limits")

        // A different query instance starts with a fresh progress list.
        if (q.ne(lastQuery)) lastCheckedRecentProgressIndex = -1
        lastQuery = q

        val numStateOperators = recentProgress.last.stateOperators.length
        // Only entries that report the same number of state operators as the latest one are
        // considered.
        val progressesSinceLastCheck = recentProgress
          .slice(lastCheckedRecentProgressIndex + 1, recentProgress.length)
          .filter(_.stateOperators.length == numStateOperators)

        val allNumUpdatedRowsSinceLastCheck =
          progressesSinceLastCheck.map(_.stateOperators.map(_.numRowsUpdated))

        // Lazy: only rendered when one of the assertions below fails.
        lazy val debugString = "recent progresses:\n" +
          progressesSinceLastCheck.map(_.prettyJson).mkString("\n\n")

        val numTotalRows = recentProgress.last.stateOperators.map(_.numRowsTotal)
        assert(numTotalRows === total, s"incorrect total rows, $debugString")

        val numUpdatedRows = arraySum(allNumUpdatedRowsSinceLastCheck, numStateOperators)
        assert(numUpdatedRows === updated, s"incorrect updates rows, $debugString")

        lastCheckedRecentProgressIndex = recentProgress.length - 1
      }
      true
    }

  /** Single-operator convenience overload of [[assertNumStateRows]]. */
  def assertNumStateRows(total: Long, updated: Long): AssertOnQuery =
    assertNumStateRows(Seq(total), Seq(updated))

  /**
   * Sums the given arrays position by position.
   *
   * @param arraySeq    arrays to add up; each must have exactly `arrayLength` elements
   * @param arrayLength length of every array and of the result
   * @return element-wise sums, or `arrayLength` zeros when `arraySeq` is empty
   */
  def arraySum(arraySeq: Seq[Array[Long]], arrayLength: Int): Seq[Long] = {
    if (arraySeq.isEmpty) {
      Seq.fill(arrayLength)(0L)
    } else {
      assert(arraySeq.forall(_.length == arrayLength),
        "Arrays are of different lengths:\n" + arraySeq.map(_.toSeq).mkString("\n"))
      (0 until arrayLength).map { index => arraySeq.map(_(index)).sum }
    }
  }
}
Example 5
Source File: SparkAtlasStreamingQueryEventTracker.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas

import com.hortonworks.spark.atlas.sql.{QueryDetail, SparkExecutionPlanProcessor}
import scala.collection.mutable
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener._
import com.hortonworks.spark.atlas.utils.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}

/**
 * Streaming query listener that, on every progress event, looks up the running query in the
 * active Spark session and pushes its [[QueryDetail]] to a [[SparkExecutionPlanProcessor]].
 *
 * The processor thread is started as soon as an instance is constructed.
 *
 * @param atlasClient     client handed to the execution plan processor
 * @param atlasClientConf configuration; also decides whether progress tracking is enabled
 */
class SparkAtlasStreamingQueryEventTracker(
    atlasClient: AtlasClient,
    atlasClientConf: AtlasClientConf)
  extends StreamingQueryListener with Logging {

  def this(atlasClientConf: AtlasClientConf) = {
    this(AtlasClient.atlasClient(atlasClientConf), atlasClientConf)
  }

  def this() = this(new AtlasClientConf)

  private val enabled: Boolean = AtlasUtils.isSacEnabled(atlasClientConf)

  private val executionPlanTracker = new SparkExecutionPlanProcessor(atlasClient, atlasClientConf)
  executionPlanTracker.startThread()

  override def onQueryStarted(event: QueryStartedEvent): Unit = {
    logDebug(s"Start to track the Spark Streaming query in the Spark Atlas $event")
  }

  /**
   * Resolves the query behind `event` and forwards its details to the plan processor.
   *
   * Nothing happens when SAC is disabled. A query that cannot be found, has an unexpected
   * runtime type, or has no query execution yet is only logged.
   */
  override def onQueryProgress(event: QueryProgressEvent): Unit = {
    // No op if SAC is disabled
    if (enabled) {
      logInfo(s"Track running Spark Streaming query in the Spark Atlas: $event")
      val queryId = event.progress.id

      // `streams.get` yields null for an unknown id; Option(...) turns that into None.
      Option(SparkSession.active.streams.get(queryId)) match {
        case None =>
          logWarn(s"Cannot find query ${queryId} from active spark session!")

        case Some(query) =>
          val detail = query match {
            case wrapper: StreamingQueryWrapper =>
              Some(QueryDetail.fromStreamingQueryListener(wrapper.streamingQuery, event))
            case execution: StreamExecution =>
              Some(QueryDetail.fromStreamingQueryListener(execution, event))
            case other =>
              logWarn(s"Unexpected type of streaming query: ${other.getClass}")
              None
          }

          detail.foreach { q =>
            if (q.qe != null) {
              executionPlanTracker.pushEvent(q)
            } else {
              logInfo(s"Can't retrieve query execution information for query ${queryId}" +
                " - skip and wait for next batch.")
            }
          }
      }
    }
  }

  override def onQueryTerminated(event: QueryTerminatedEvent): Unit = {
    logDebug(s"Tack Spark Streaming query in the Spark Atlas Terminated: $event")
  }
}
Example 6
Source File: AtlasStreamingQueryProgressListener.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas.sql.testhelper

import com.hortonworks.spark.atlas.sql.QueryDetail
import com.hortonworks.spark.atlas.utils.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}
import scala.collection.mutable
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent}

/**
 * Test helper listener that records a [[QueryDetail]] for every progress event whose query can
 * be resolved in the active Spark session. Recorded details are exposed through `queryDetails`
 * and can be discarded with `clear()`.
 */
class AtlasStreamingQueryProgressListener extends StreamingQueryListener with Logging {

  /** Details collected so far, in the order the progress events arrived. */
  val queryDetails = new mutable.MutableList[QueryDetail]()

  def onQueryStarted(event: QueryStartedEvent): Unit = {}

  /**
   * Looks up the query named by the event and appends its detail to `queryDetails`. A query
   * that is missing or of an unexpected runtime type only produces a warning.
   */
  def onQueryProgress(event: QueryProgressEvent): Unit = {
    // FIXME: this is totally duplicated with SparkAtlasStreamingQueryEventTracker...
    //  Extract into somewhere...
    val queryId = event.progress.id

    // `streams.get` yields null for an unknown id; Option(...) turns that into None.
    Option(SparkSession.active.streams.get(queryId)) match {
      case Some(wrapper: StreamingQueryWrapper) =>
        queryDetails += QueryDetail.fromStreamingQueryListener(wrapper.streamingQuery, event)

      case Some(execution: StreamExecution) =>
        queryDetails += QueryDetail.fromStreamingQueryListener(execution, event)

      case Some(other) =>
        logWarn(s"Unexpected type of streaming query: ${other.getClass}")

      case None =>
        logWarn(s"Cannot find query ${queryId} from active spark session!")
    }
  }

  def onQueryTerminated(event: QueryTerminatedEvent): Unit = {}

  /** Drops every recorded detail. */
  def clear(): Unit = {
    queryDetails.clear()
  }
}
Example 7
Source File: CarbonStreamingQueryListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.streaming

import java.util
import java.util.UUID

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{CarbonAppendableStreamSink, StreamExecution}
import org.apache.spark.sql.streaming.StreamingQueryListener

import org.apache.carbondata.common.logging.LogServiceFactory

/**
 * Listener that remembers, for each started streaming query writing to a
 * [[CarbonAppendableStreamSink]], the unique name of the target table, and calls
 * `StreamSinkFactory.unLock` with that name once the query terminates.
 *
 * @param spark session whose stream manager is used to resolve started queries
 */
class CarbonStreamingQueryListener(spark: SparkSession) extends StreamingQueryListener {

  private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  // Query id -> unique name of the table the query's sink writes to.
  private val cache = new util.HashMap[UUID, String]()

  /**
   * Resolves the started query to its [[StreamExecution]] and, if its sink is a Carbon sink,
   * records the sink's table name under the query id.
   */
  override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = {
    val qry = spark.streams.get(event.id) match {
      // adapt spark 2.1
      case execution: StreamExecution =>
        execution
      // adapt spark 2.2 and later version
      case wrapped =>
        // The wrapper class is loaded by name and unwrapped reflectively, so this file has no
        // compile-time reference to it.
        val clazz = Class.forName("org.apache.spark.sql.execution.streaming.StreamingQueryWrapper")
        val method = clazz.getMethod("streamingQuery")
        method.invoke(wrapped).asInstanceOf[StreamExecution]
    }

    qry.sink match {
      case sink: CarbonAppendableStreamSink =>
        LOGGER.info("Carbon streaming query started: " + event.id)
        val carbonTable = sink.carbonTable
        cache.put(event.id, carbonTable.getTableUniqueName)
      case _ =>
        // Queries with any other sink are not tracked.
    }
  }

  override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = {
  }

  /** Forgets the terminated query and, if it was tracked, unlocks its table. */
  override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = {
    // HashMap.remove returns null for an untracked id; Option(...) turns that into None.
    Option(cache.remove(event.id)).foreach { tableUniqueName =>
      LOGGER.info("Carbon streaming query End: " + event.id)
      StreamSinkFactory.unLock(tableUniqueName)
    }
  }
}
Example 8
Source File: DeltaSink.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.sources

import org.apache.spark.sql.delta._
import org.apache.spark.sql.delta.actions.SetTransaction
import org.apache.spark.sql.delta.metering.DeltaLogging
import org.apache.spark.sql.delta.schema.{ImplicitMetadataOperation, SchemaUtils}
import org.apache.hadoop.fs.Path

import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.execution.SQLExecution
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric
import org.apache.spark.sql.execution.streaming.{Sink, StreamExecution}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.NullType

/**
 * Streaming sink that commits each micro-batch to the Delta table at `path` inside a single
 * transaction.
 *
 * @param sqlContext       context providing the Spark session, its conf and local properties
 * @param path             location of the Delta table
 * @param partitionColumns partition columns passed on to the metadata update
 * @param outputMode       streaming output mode; `Complete` additionally removes existing files
 * @param options          write options (schema merge/overwrite flags, user metadata)
 */
class DeltaSink(
    sqlContext: SQLContext,
    path: Path,
    partitionColumns: Seq[String],
    outputMode: OutputMode,
    options: DeltaOptions)
  extends Sink with ImplicitMetadataOperation with DeltaLogging {

  private val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, path)

  private val sqlConf = sqlContext.sparkSession.sessionState.conf

  override protected val canOverwriteSchema: Boolean =
    outputMode == OutputMode.Complete() && options.canOverwriteSchema

  override protected val canMergeSchema: Boolean = options.canMergeSchema

  /**
   * Writes `data` as batch `batchId` of the current streaming query.
   *
   * The query id is read from the Spark context's local properties and must be present. If the
   * transaction already records a version at or beyond `batchId` for that query, the batch is
   * logged as skipped and nothing is written or committed.
   *
   * @throws Throwable `DeltaErrors.streamWriteNullTypeException` when the schema contains a
   *                   `NullType` anywhere
   */
  override def addBatch(batchId: Long, data: DataFrame): Unit =
    deltaLog.withNewTransaction { txn =>
      val sc = data.sparkSession.sparkContext
      val metrics = Map[String, SQLMetric](
        "numAddedFiles" -> createMetric(sc, "number of files added"),
        "numRemovedFiles" -> createMetric(sc, "number of files removed")
      )
      val queryId = sqlContext.sparkContext.getLocalProperty(StreamExecution.QUERY_ID_KEY)
      assert(queryId != null)

      if (SchemaUtils.typeExistsRecursively(data.schema)(_.isInstanceOf[NullType])) {
        throw DeltaErrors.streamWriteNullTypeException
      }

      // If the batch reads the same Delta table as this sink is going to write to, then this
      // write has dependencies. Then make sure that this commit set hasDependencies to true
      // by injecting a read on the whole table. This needs to be done explicitly because
      // MicroBatchExecution has already enforced all the data skipping (by forcing the generation
      // of the executed plan) even before the transaction was started.
      val selfScan = data.queryExecution.analyzed.collectFirst {
        case DeltaTable(index) if index.deltaLog.isSameLogAs(txn.deltaLog) => true
      }.nonEmpty
      if (selfScan) {
        txn.readWholeTable()
      }

      // Streaming sinks can't blindly overwrite schema. See Schema Management design doc for
      // details
      updateMetadata(
        txn,
        data,
        partitionColumns,
        configuration = Map.empty,
        outputMode == OutputMode.Complete())

      val currentVersion = txn.txnVersion(queryId)
      // Expressed as if/else rather than `return`: a `return` inside this lambda would be a
      // non-local return implemented by throwing through withNewTransaction.
      if (currentVersion >= batchId) {
        logInfo(s"Skipping already complete epoch $batchId, in query $queryId")
      } else {
        val deletedFiles = outputMode match {
          case o if o == OutputMode.Complete() =>
            deltaLog.assertRemovable()
            txn.filterFiles().map(_.remove)
          case _ => Nil
        }
        val newFiles = txn.writeFiles(data, Some(options))
        val setTxn =
          SetTransaction(queryId, batchId, Some(deltaLog.clock.getTimeMillis())) :: Nil
        val info =
          DeltaOperations.StreamingUpdate(outputMode, queryId, batchId, options.userMetadata)
        metrics("numRemovedFiles").set(deletedFiles.size)
        metrics("numAddedFiles").set(newFiles.size)
        txn.registerSQLMetrics(sqlContext.sparkSession, metrics)
        txn.commit(setTxn ++ newFiles ++ deletedFiles, info)

        // This is needed to make the SQL metrics visible in the Spark UI
        val executionId =
          sqlContext.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
        SQLMetrics.postDriverMetricUpdates(
          sqlContext.sparkContext, executionId, metrics.values.toSeq)
      }
    }

  override def toString(): String = s"DeltaSink[$path]"
}
Example 9
Source File: KafkaContinuousTest.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010

import java.util.concurrent.atomic.AtomicInteger

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.test.TestSparkSession

/**
 * Configures the Kafka source tests to run with a continuous trigger and a V2 sink, and checks
 * after every test that each task that was started has also ended.
 */
trait KafkaContinuousTest extends KafkaSourceTest {
  override val defaultTrigger = Trigger.Continuous(1000)
  override val defaultUseV2Sink = true

  // We need more than the default local[2] to be able to schedule all partitions simultaneously.
  override protected def createSparkSession = new TestSparkSession(
    new SparkContext(
      "local[10]",
      "continuous-stream-test-sql-context",
      sparkConf.set("spark.sql.testkey", "true")))

  // In addition to setting the partitions in Kafka, we have to wait until the query has
  // reconfigured to the new count so the test framework can hook in properly.
  override protected def setTopicPartitions(
      topic: String, newCount: Int, query: StreamExecution) = {
    testUtils.addPartitions(topic, newCount)
    eventually(timeout(streamingTimeout)) {
      assert(
        query.lastExecution.logical.collectFirst {
          case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
        }.exists(_.knownPartitions.size == newCount),
        s"query never reconfigured to $newCount partitions")
    }
  }

  // Number of tasks that have started but not yet ended.
  private val runningTaskCount = new AtomicInteger(0)

  // Continuous processing tasks end asynchronously, so test that they actually end.
  private val tasksEndedListener = new SparkListener() {
    override def onTaskStart(start: SparkListenerTaskStart): Unit = {
      runningTaskCount.incrementAndGet()
    }
    override def onTaskEnd(end: SparkListenerTaskEnd): Unit = {
      runningTaskCount.decrementAndGet()
    }
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    spark.sparkContext.addSparkListener(tasksEndedListener)
  }

  override def afterEach(): Unit = {
    // Wait until the started/ended task events balance out before detaching the listener.
    eventually(timeout(streamingTimeout)) {
      assert(runningTaskCount.get() == 0)
    }
    spark.sparkContext.removeSparkListener(tasksEndedListener)
    super.afterEach()
  }

  test("ensure continuous stream is being used") {
    val rateStream = spark.readStream
      .format("rate")
      .option("numPartitions", "1")
      .option("rowsPerSecond", "1")
      .load()

    testStream(rateStream)(
      Execute(q => assert(q.isInstanceOf[ContinuousExecution]))
    )
  }
}
Example 10
Source File: KafkaContinuousSourceSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010

import java.util.Properties
import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._
import scala.collection.mutable
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.{StreamTest, Trigger}
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}

// Run tests in KafkaSourceSuiteBase in continuous execution mode.
class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest

/**
 * Checks that a pattern subscription keeps working when a matching topic is deleted and another
 * matching topic is created while the continuous query is running. Automatic topic creation is
 * switched off on the broker for this suite.
 */
class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest {
  import testImplicits._

  override val brokerProps = Map("auto.create.topics.enable" -> "false")

  test("subscribing topic by pattern with topic deletions") {
    val topicPrefix = newTopic()
    val topic = topicPrefix + "-seems"
    val topic2 = topicPrefix + "-bad"
    testUtils.createTopic(topic, partitions = 5)
    testUtils.sendMessages(topic, Array("-1"))
    require(testUtils.getLatestOffsets(Set(topic)).size === 5)

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", testUtils.brokerAddress)
      // Option key restored after extraction stripped it.
      // NOTE(review): presumably a 1 ms metadata max age so topic changes are noticed quickly -
      // confirm against the Kafka consumer configuration docs.
      .option("kafka.metadata.max.age.ms", "1")
      .option("subscribePattern", s"$topicPrefix-.*")
      .option("failOnDataLoss", "false")

    val kafka = reader.load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]
    // Each record's value is parsed as an Int and incremented, hence the +1 offsets between the
    // data added and the answers checked below.
    val mapped = kafka.map(kv => kv._2.toInt + 1)

    testStream(mapped)(
      makeSureGetOffsetCalled,
      AddKafkaData(Set(topic), 1, 2, 3),
      CheckAnswer(2, 3, 4),
      Execute { query =>
        testUtils.deleteTopic(topic)
        testUtils.createTopic(topic2, partitions = 5)
        eventually(timeout(streamingTimeout)) {
          assert(
            query.lastExecution.logical.collectFirst {
              case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
            }.exists { r =>
              // Ensure the new topic is present and the old topic is gone.
              // NOTE(review): only the presence of `topic2` is actually asserted here.
              r.knownPartitions.exists(_.topic == topic2)
            },
            s"query never reconfigured to new topic $topic2")
        }
      },
      AddKafkaData(Set(topic2), 4, 5, 6),
      CheckAnswer(2, 3, 4, 5, 6, 7)
    )
  }
}

/**
 * Runs the inherited "don't fail on data loss" stress tests with the stream started on a
 * one-second continuous trigger and written to the in-memory sink.
 */
class KafkaContinuousSourceStressForDontFailOnDataLossSuite
    extends KafkaSourceStressForDontFailOnDataLossSuite {

  override protected def startStream(ds: Dataset[Int]) = {
    ds.writeStream
      .format("memory")
      .queryName("memory")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
}