org.apache.spark.sql.execution.streaming.StreamExecution Scala Examples
The following examples show how to use org.apache.spark.sql.execution.streaming.StreamExecution.
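Before the examples, here is a minimal sketch of how the internal StreamExecution can be reached from the public StreamingQuery handle, since several of the examples below rely on this unwrapping. It assumes a local session and a trivial rate-source query purely for illustration; the object name is hypothetical.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}

object StreamExecutionDemo {  // hypothetical object name
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("stream-execution-demo")
      .getOrCreate()

    // Start a trivial query against the built-in rate source.
    val query = spark.readStream.format("rate").load()
      .writeStream.format("console").start()

    // On Spark 2.2+ the public handle is a StreamingQueryWrapper around the
    // internal StreamExecution; older versions exposed StreamExecution directly.
    val exec: StreamExecution = query match {
      case w: StreamingQueryWrapper => w.streamingQuery
      case other => other.asInstanceOf[StreamExecution]
    }
    println(s"runId = ${exec.runId}")

    query.stop()
    spark.stop()
  }
}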
Example 1
Source File: StreamingQueryException.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.annotation.Experimental
import org.apache.spark.sql.execution.streaming.{Offset, StreamExecution}

// NOTE: the enclosing StreamingQueryException class declaration is not shown in this excerpt.

  val time: Long = System.currentTimeMillis

  override def toString(): String = {
    val causeStr =
      s"${cause.getMessage} ${cause.getStackTrace.take(10).mkString("", "\n|\t", "\n")}"
    s"""
       |$causeStr
       |
       |${query.asInstanceOf[StreamExecution].toDebugString}
       """.stripMargin
  }
}
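For context, a StreamingQueryException like the one built above typically surfaces through awaitTermination() on the query handle. The sketch below is illustrative only; the helper name is hypothetical and the query is assumed to be an already started StreamingQuery.

import org.apache.spark.sql.streaming.{StreamingQuery, StreamingQueryException}

def waitAndReport(query: StreamingQuery): Unit = {  // hypothetical helper
  try {
    query.awaitTermination()
  } catch {
    case e: StreamingQueryException =>
      // toString includes the cause's truncated stack trace and the
      // query's debug string, as assembled in the example above.
      println(e)
  }
}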
Example 2
Source File: StateStoreRDD.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming.state

import java.util.UUID

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.EpochTracker
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration

// NOTE: the enclosing StateStoreRDD class declaration is not shown in this excerpt.

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val stateStoreProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index), queryRunId)
    storeCoordinator.flatMap(_.getLocation(stateStoreProviderId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null
    val storeProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index), queryRunId)

    // If we're in continuous processing mode, we should get the store version for the current
    // epoch rather than the one at planning time.
    val isContinuous = Option(ctxt.getLocalProperty(StreamExecution.IS_CONTINUOUS_PROCESSING))
      .map(_.toBoolean).getOrElse(false)
    val currentVersion = if (isContinuous) {
      val epoch = EpochTracker.getCurrentEpoch
      assert(epoch.isDefined, "Current epoch must be defined for continuous processing streams.")
      epoch.get
    } else {
      storeVersion
    }

    store = StateStore.get(
      storeProviderId, keySchema, valueSchema, indexOrdinal, currentVersion,
      storeConf, hadoopConfBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
}
Example 3
Source File: WriteToContinuousDataSourceExec.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import scala.util.control.NonFatal

import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter

case class WriteToContinuousDataSourceExec(writer: StreamWriter, query: SparkPlan)
  extends SparkPlan with Logging {

  override def children: Seq[SparkPlan] = Seq(query)
  override def output: Seq[Attribute] = Nil

  override protected def doExecute(): RDD[InternalRow] = {
    val writerFactory = writer.createWriterFactory()
    val rdd = new ContinuousWriteRDD(query.execute(), writerFactory)

    logInfo(s"Start processing data source writer: $writer. " +
      s"The input RDD has ${rdd.partitions.length} partitions.")
    EpochCoordinatorRef.get(
      sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY),
      sparkContext.env)
      .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions))

    try {
      // Force the RDD to run so continuous processing starts; no data is actually being collected
      // to the driver, as ContinuousWriteRDD outputs nothing.
      rdd.collect()
    } catch {
      case _: InterruptedException =>
        // Interruption is how continuous queries are ended, so accept and ignore the exception.
      case cause: Throwable =>
        cause match {
          // Do not wrap interruption exceptions that will be handled by streaming specially.
          case _ if StreamExecution.isInterruptionException(cause) => throw cause
          // Only wrap non fatal exceptions.
          case NonFatal(e) => throw new SparkException("Writing job aborted.", e)
          case _ => throw cause
        }
    }

    sparkContext.emptyRDD
  }
}
Example 4
Source File: StateStoreMetricsTest.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.sql.execution.streaming.StreamExecution

trait StateStoreMetricsTest extends StreamTest {

  private var lastCheckedRecentProgressIndex = -1
  private var lastQuery: StreamExecution = null

  override def beforeEach(): Unit = {
    super.beforeEach()
    lastCheckedRecentProgressIndex = -1
  }

  def assertNumStateRows(total: Seq[Long], updated: Seq[Long]): AssertOnQuery =
    AssertOnQuery(s"Check total state rows = $total, updated state rows = $updated") { q =>
      // This assumes that the streaming query will not make any progress while the eventually
      // is being executed.
      eventually(timeout(streamingTimeout)) {
        val recentProgress = q.recentProgress
        require(recentProgress.nonEmpty, "No progress made, cannot check num state rows")
        require(recentProgress.length < spark.sessionState.conf.streamingProgressRetention,
          "This test assumes that all progresses are present in q.recentProgress but " +
            "some may have been dropped due to retention limits")

        if (q.ne(lastQuery)) lastCheckedRecentProgressIndex = -1
        lastQuery = q

        val numStateOperators = recentProgress.last.stateOperators.length
        val progressesSinceLastCheck = recentProgress
          .slice(lastCheckedRecentProgressIndex + 1, recentProgress.length)
          .filter(_.stateOperators.length == numStateOperators)

        val allNumUpdatedRowsSinceLastCheck =
          progressesSinceLastCheck.map(_.stateOperators.map(_.numRowsUpdated))

        lazy val debugString = "recent progresses:\n" +
          progressesSinceLastCheck.map(_.prettyJson).mkString("\n\n")

        val numTotalRows = recentProgress.last.stateOperators.map(_.numRowsTotal)
        assert(numTotalRows === total, s"incorrect total rows, $debugString")

        val numUpdatedRows = arraySum(allNumUpdatedRowsSinceLastCheck, numStateOperators)
        assert(numUpdatedRows === updated, s"incorrect updates rows, $debugString")

        lastCheckedRecentProgressIndex = recentProgress.length - 1
      }
      true
    }

  def assertNumStateRows(total: Long, updated: Long): AssertOnQuery =
    assertNumStateRows(Seq(total), Seq(updated))

  def arraySum(arraySeq: Seq[Array[Long]], arrayLength: Int): Seq[Long] = {
    if (arraySeq.isEmpty) return Seq.fill(arrayLength)(0L)

    assert(arraySeq.forall(_.length == arrayLength),
      "Arrays are of different lengths:\n" + arraySeq.map(_.toSeq).mkString("\n"))
    (0 until arrayLength).map { index => arraySeq.map(_.apply(index)).sum }
  }
}
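As a hedged illustration of how this trait is meant to be used: assertNumStateRows is interleaved with data actions inside testStream. The snippet below is assumed to live inside a suite that extends StreamTest with StateStoreMetricsTest (with testImplicits in scope); the input stream and aggregation are hypothetical.

import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.OutputMode

// Hypothetical test body: a simple count aggregation over an in-memory stream.
val inputData = MemoryStream[Int]
val aggregated = inputData.toDF().groupBy("value").count()

testStream(aggregated, OutputMode.Update)(
  AddData(inputData, 1, 2, 2),
  CheckLastBatch((1, 1), (2, 2)),
  // Two distinct keys in the state store, both updated in this batch.
  assertNumStateRows(total = 2, updated = 2)
)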
Example 5
Source File: SparkAtlasStreamingQueryEventTracker.scala From spark-atlas-connector with Apache License 2.0
package com.hortonworks.spark.atlas

import com.hortonworks.spark.atlas.sql.{QueryDetail, SparkExecutionPlanProcessor}

import scala.collection.mutable

import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener._
import com.hortonworks.spark.atlas.utils.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}

class SparkAtlasStreamingQueryEventTracker(
    atlasClient: AtlasClient,
    atlasClientConf: AtlasClientConf) extends StreamingQueryListener with Logging {

  def this(atlasClientConf: AtlasClientConf) = {
    this(AtlasClient.atlasClient(atlasClientConf), atlasClientConf)
  }

  def this() {
    this(new AtlasClientConf)
  }

  private val enabled: Boolean = AtlasUtils.isSacEnabled(atlasClientConf)

  private val executionPlanTracker = new SparkExecutionPlanProcessor(atlasClient, atlasClientConf)
  executionPlanTracker.startThread()

  override def onQueryStarted(event: QueryStartedEvent): Unit = {
    logDebug(s"Start to track the Spark Streaming query in the Spark Atlas $event")
  }

  override def onQueryProgress(event: QueryProgressEvent): Unit = {
    if (!enabled) {
      // No op if SAC is disabled
      return
    }

    logInfo(s"Track running Spark Streaming query in the Spark Atlas: $event")
    val query = SparkSession.active.streams.get(event.progress.id)
    if (query != null) {
      val qd = query match {
        case query: StreamingQueryWrapper =>
          Some(QueryDetail.fromStreamingQueryListener(query.streamingQuery, event))

        case query: StreamExecution =>
          Some(QueryDetail.fromStreamingQueryListener(query, event))

        case _ =>
          logWarn(s"Unexpected type of streaming query: ${query.getClass}")
          None
      }

      qd.foreach { q =>
        if (q.qe != null) {
          executionPlanTracker.pushEvent(q)
        } else {
          logInfo(s"Can't retrieve query execution information for query ${event.progress.id}" +
            " - skip and wait for next batch.")
        }
      }
    } else {
      logWarn(s"Cannot find query ${event.progress.id} from active spark session!")
    }
  }

  override def onQueryTerminated(event: QueryTerminatedEvent): Unit = {
    logDebug(s"Tack Spark Streaming query in the Spark Atlas Terminated: $event")
  }
}
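A hedged sketch of how a listener like this would typically be attached to a session, so its onQueryStarted/onQueryProgress/onQueryTerminated callbacks fire for every streaming query; the application name is an assumption.

import org.apache.spark.sql.SparkSession
import com.hortonworks.spark.atlas.SparkAtlasStreamingQueryEventTracker

val spark = SparkSession.builder()
  .appName("atlas-tracking-demo")  // hypothetical app name
  .getOrCreate()

// Register the tracker with the session's StreamingQueryManager.
spark.streams.addListener(new SparkAtlasStreamingQueryEventTracker())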
Example 6
Source File: AtlasStreamingQueryProgressListener.scala From spark-atlas-connector with Apache License 2.0
package com.hortonworks.spark.atlas.sql.testhelper

import com.hortonworks.spark.atlas.sql.QueryDetail
import com.hortonworks.spark.atlas.utils.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}

import scala.collection.mutable

import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent}

class AtlasStreamingQueryProgressListener extends StreamingQueryListener with Logging {
  val queryDetails = new mutable.MutableList[QueryDetail]()

  def onQueryStarted(event: QueryStartedEvent): Unit = {}

  def onQueryProgress(event: QueryProgressEvent): Unit = {
    // FIXME: this is totally duplicated with SparkAtlasStreamingQueryEventTracker...
    // Extract into somewhere...
    val query = SparkSession.active.streams.get(event.progress.id)
    if (query != null) {
      query match {
        case query: StreamingQueryWrapper =>
          val qd = QueryDetail.fromStreamingQueryListener(query.streamingQuery, event)
          queryDetails += qd

        case query: StreamExecution =>
          val qd = QueryDetail.fromStreamingQueryListener(query, event)
          queryDetails += qd

        case _ => logWarn(s"Unexpected type of streaming query: ${query.getClass}")
      }
    } else {
      logWarn(s"Cannot find query ${event.progress.id} from active spark session!")
    }
  }

  def onQueryTerminated(event: QueryTerminatedEvent): Unit = {}

  def clear(): Unit = {
    queryDetails.clear()
  }
}
Example 7
Source File: CarbonStreamingQueryListener.scala From carbondata with Apache License 2.0
package org.apache.carbondata.streaming

import java.util
import java.util.UUID

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{CarbonAppendableStreamSink, StreamExecution}
import org.apache.spark.sql.streaming.StreamingQueryListener

import org.apache.carbondata.common.logging.LogServiceFactory

class CarbonStreamingQueryListener(spark: SparkSession) extends StreamingQueryListener {

  private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  private val cache = new util.HashMap[UUID, String]()

  override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = {
    val streamQuery = spark.streams.get(event.id)
    val qry = if (streamQuery.isInstanceOf[StreamExecution]) {
      // adapt spark 2.1
      streamQuery.asInstanceOf[StreamExecution]
    } else {
      // adapt spark 2.2 and later version
      val clazz = Class.forName("org.apache.spark.sql.execution.streaming.StreamingQueryWrapper")
      val method = clazz.getMethod("streamingQuery")
      method.invoke(streamQuery).asInstanceOf[StreamExecution]
    }
    if (qry.sink.isInstanceOf[CarbonAppendableStreamSink]) {
      LOGGER.info("Carbon streaming query started: " + event.id)
      val sink = qry.sink.asInstanceOf[CarbonAppendableStreamSink]
      val carbonTable = sink.carbonTable
      cache.put(event.id, carbonTable.getTableUniqueName)
    }
  }

  override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = {
  }

  override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = {
    val tableUniqueName = cache.remove(event.id)
    if (null != tableUniqueName) {
      LOGGER.info("Carbon streaming query End: " + event.id)
      StreamSinkFactory.unLock(tableUniqueName)
    }
  }
}
Example 8
Source File: DeltaSink.scala From delta with Apache License 2.0
package org.apache.spark.sql.delta.sources

import org.apache.spark.sql.delta._
import org.apache.spark.sql.delta.actions.SetTransaction
import org.apache.spark.sql.delta.metering.DeltaLogging
import org.apache.spark.sql.delta.schema.{ImplicitMetadataOperation, SchemaUtils}
import org.apache.hadoop.fs.Path

import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.execution.SQLExecution
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric
import org.apache.spark.sql.execution.streaming.{Sink, StreamExecution}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.NullType

class DeltaSink(
    sqlContext: SQLContext,
    path: Path,
    partitionColumns: Seq[String],
    outputMode: OutputMode,
    options: DeltaOptions)
  extends Sink with ImplicitMetadataOperation with DeltaLogging {

  private val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, path)

  private val sqlConf = sqlContext.sparkSession.sessionState.conf

  override protected val canOverwriteSchema: Boolean =
    outputMode == OutputMode.Complete() && options.canOverwriteSchema

  override protected val canMergeSchema: Boolean = options.canMergeSchema

  override def addBatch(batchId: Long, data: DataFrame): Unit = deltaLog.withNewTransaction { txn =>
    val sc = data.sparkSession.sparkContext
    val metrics = Map[String, SQLMetric](
      "numAddedFiles" -> createMetric(sc, "number of files added"),
      "numRemovedFiles" -> createMetric(sc, "number of files removed")
    )
    val queryId = sqlContext.sparkContext.getLocalProperty(StreamExecution.QUERY_ID_KEY)
    assert(queryId != null)

    if (SchemaUtils.typeExistsRecursively(data.schema)(_.isInstanceOf[NullType])) {
      throw DeltaErrors.streamWriteNullTypeException
    }

    // If the batch reads the same Delta table as this sink is going to write to, then this
    // write has dependencies. Then make sure that this commit set hasDependencies to true
    // by injecting a read on the whole table. This needs to be done explicitly because
    // MicroBatchExecution has already enforced all the data skipping (by forcing the generation
    // of the executed plan) even before the transaction was started.
    val selfScan = data.queryExecution.analyzed.collectFirst {
      case DeltaTable(index) if index.deltaLog.isSameLogAs(txn.deltaLog) => true
    }.nonEmpty
    if (selfScan) {
      txn.readWholeTable()
    }

    // Streaming sinks can't blindly overwrite schema.
    // See Schema Management design doc for details.
    updateMetadata(
      txn,
      data,
      partitionColumns,
      configuration = Map.empty,
      outputMode == OutputMode.Complete())

    val currentVersion = txn.txnVersion(queryId)
    if (currentVersion >= batchId) {
      logInfo(s"Skipping already complete epoch $batchId, in query $queryId")
      return
    }

    val deletedFiles = outputMode match {
      case o if o == OutputMode.Complete() =>
        deltaLog.assertRemovable()
        txn.filterFiles().map(_.remove)
      case _ => Nil
    }
    val newFiles = txn.writeFiles(data, Some(options))
    val setTxn = SetTransaction(queryId, batchId, Some(deltaLog.clock.getTimeMillis())) :: Nil
    val info = DeltaOperations.StreamingUpdate(outputMode, queryId, batchId, options.userMetadata)
    metrics("numRemovedFiles").set(deletedFiles.size)
    metrics("numAddedFiles").set(newFiles.size)
    txn.registerSQLMetrics(sqlContext.sparkSession, metrics)
    txn.commit(setTxn ++ newFiles ++ deletedFiles, info)
    // This is needed to make the SQL metrics visible in the Spark UI
    val executionId = sqlContext.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
    SQLMetrics.postDriverMetricUpdates(
      sqlContext.sparkContext, executionId, metrics.values.toSeq)
  }

  override def toString(): String = s"DeltaSink[$path]"
}
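As a hedged usage sketch, a streaming write in the delta format is what exercises DeltaSink.addBatch above; the query id that addBatch reads via StreamExecution.QUERY_ID_KEY is expected to be set as a local property by the streaming engine before the sink is invoked (which is what the assert relies on). The rate source, application name, and paths below are assumptions for illustration.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("delta-sink-demo")  // hypothetical app name
  .getOrCreate()

// A trivial streaming source stands in for real data.
val events = spark.readStream.format("rate").load()

val query = events.writeStream
  .format("delta")
  .outputMode("append")
  .option("checkpointLocation", "/tmp/checkpoints/events")  // hypothetical path
  .start("/tmp/delta/events")                               // hypothetical path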
Example 9
Source File: KafkaContinuousTest.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.concurrent.atomic.AtomicInteger

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.test.TestSparkSession

// Trait to configure StreamTest for kafka continuous execution tests.
trait KafkaContinuousTest extends KafkaSourceTest {
  override val defaultTrigger = Trigger.Continuous(1000)
  override val defaultUseV2Sink = true

  // We need more than the default local[2] to be able to schedule all partitions simultaneously.
  override protected def createSparkSession = new TestSparkSession(
    new SparkContext(
      "local[10]",
      "continuous-stream-test-sql-context",
      sparkConf.set("spark.sql.testkey", "true")))

  // In addition to setting the partitions in Kafka, we have to wait until the query has
  // reconfigured to the new count so the test framework can hook in properly.
  override protected def setTopicPartitions(
      topic: String, newCount: Int, query: StreamExecution) = {
    testUtils.addPartitions(topic, newCount)
    eventually(timeout(streamingTimeout)) {
      assert(
        query.lastExecution.logical.collectFirst {
          case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
        }.exists(_.knownPartitions.size == newCount),
        s"query never reconfigured to $newCount partitions")
    }
  }

  // Continuous processing tasks end asynchronously, so test that they actually end.
  private val tasksEndedListener = new SparkListener() {
    val activeTaskIdCount = new AtomicInteger(0)

    override def onTaskStart(start: SparkListenerTaskStart): Unit = {
      activeTaskIdCount.incrementAndGet()
    }

    override def onTaskEnd(end: SparkListenerTaskEnd): Unit = {
      activeTaskIdCount.decrementAndGet()
    }
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    spark.sparkContext.addSparkListener(tasksEndedListener)
  }

  override def afterEach(): Unit = {
    eventually(timeout(streamingTimeout)) {
      assert(tasksEndedListener.activeTaskIdCount.get() == 0)
    }
    spark.sparkContext.removeSparkListener(tasksEndedListener)
    super.afterEach()
  }

  test("ensure continuous stream is being used") {
    val query = spark.readStream
      .format("rate")
      .option("numPartitions", "1")
      .option("rowsPerSecond", "1")
      .load()

    testStream(query)(
      Execute(q => assert(q.isInstanceOf[ContinuousExecution]))
    )
  }
}
Example 10
Source File: KafkaContinuousSourceSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.Properties
import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._
import scala.collection.mutable
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.{StreamTest, Trigger}
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}

// Run tests in KafkaSourceSuiteBase in continuous execution mode.
class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest

class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest {
  import testImplicits._

  override val brokerProps = Map("auto.create.topics.enable" -> "false")

  test("subscribing topic by pattern with topic deletions") {
    val topicPrefix = newTopic()
    val topic = topicPrefix + "-seems"
    val topic2 = topicPrefix + "-bad"
    testUtils.createTopic(topic, partitions = 5)
    testUtils.sendMessages(topic, Array("-1"))
    require(testUtils.getLatestOffsets(Set(topic)).size === 5)

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", testUtils.brokerAddress)
      .option("kafka.metadata.max.age.ms", "1")
      .option("subscribePattern", s"$topicPrefix-.*")
      .option("failOnDataLoss", "false")

    val kafka = reader.load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]
    val mapped = kafka.map(kv => kv._2.toInt + 1)

    testStream(mapped)(
      makeSureGetOffsetCalled,
      AddKafkaData(Set(topic), 1, 2, 3),
      CheckAnswer(2, 3, 4),
      Execute { query =>
        testUtils.deleteTopic(topic)
        testUtils.createTopic(topic2, partitions = 5)
        eventually(timeout(streamingTimeout)) {
          assert(
            query.lastExecution.logical.collectFirst {
              case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
            }.exists { r =>
              // Ensure the new topic is present and the old topic is gone.
              r.knownPartitions.exists(_.topic == topic2)
            },
            s"query never reconfigured to new topic $topic2")
        }
      },
      AddKafkaData(Set(topic2), 4, 5, 6),
      CheckAnswer(2, 3, 4, 5, 6, 7)
    )
  }
}

class KafkaContinuousSourceStressForDontFailOnDataLossSuite
    extends KafkaSourceStressForDontFailOnDataLossSuite {

  override protected def startStream(ds: Dataset[Int]) = {
    ds.writeStream
      .format("memory")
      .queryName("memory")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
}
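To close, a hedged sketch of the kind of query these suites drive end to end: a Kafka read executed under Trigger.Continuous, which is run by ContinuousExecution, the continuous-mode subclass of StreamExecution. The application name, broker address, and topic below are assumptions for illustration.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

val spark = SparkSession.builder()
  .appName("kafka-continuous-demo")  // hypothetical app name
  .getOrCreate()

val query = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")  // assumed broker address
  .option("subscribe", "events")                        // assumed topic
  .load()
  .selectExpr("CAST(value AS STRING)")
  .writeStream
  .format("console")
  .trigger(Trigger.Continuous("1 second"))
  .start()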