org.apache.spark.sql.execution.streaming.StreamExecution Scala Examples

The following examples show how to use org.apache.spark.sql.execution.streaming.StreamExecution. The source project and license for each example are noted in the heading above it.
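Most of the examples below first need a handle on the internal StreamExecution. Since Spark 2.2 the StreamingQueryManager hands out a StreamingQueryWrapper facade instead, so the usual pattern is to unwrap it (or, on older versions, cast directly). A minimal sketch of that pattern, assuming Spark 2.2+; the helper name toStreamExecution is illustrative and not part of Spark:

import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}
import org.apache.spark.sql.streaming.StreamingQuery

object StreamExecutionAccess {
  // Recover the internal StreamExecution behind a public StreamingQuery handle.
  // Spark 2.2+ wraps it in StreamingQueryWrapper; older releases exposed it directly.
  def toStreamExecution(query: StreamingQuery): Option[StreamExecution] = query match {
    case wrapper: StreamingQueryWrapper => Some(wrapper.streamingQuery)
    case exec: StreamExecution          => Some(exec)
    case _                              => None
  }
}

With the StreamExecution in hand, the examples below reach internals such as toDebugString, lastExecution, and the sink.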
Example 1
Source File: StreamingQueryException.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.annotation.Experimental
import org.apache.spark.sql.execution.streaming.{Offset, StreamExecution}


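// NOTE: the snippet as shown omits the enclosing class declaration. A reconstruction of the
// Spark 2.0-era declaration this example appears to come from (defaults may differ) is roughly:
@Experimental
class StreamingQueryException private[sql](
    @transient val query: StreamingQuery,
    val message: String,
    val cause: Throwable,
    val startOffset: Option[Offset] = None,
    val endOffset: Option[Offset] = None)
  extends Exception(message, cause) {
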
  val time: Long = System.currentTimeMillis

  override def toString(): String = {
    val causeStr =
      s"${cause.getMessage} ${cause.getStackTrace.take(10).mkString("", "\n|\t", "\n")}"
    s"""
       |$causeStr
       |
       |${query.asInstanceOf[StreamExecution].toDebugString}
       """.stripMargin
  }
} 
Example 2
Source File: StateStoreRDD.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming.state

import java.util.UUID

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.EpochTracker
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration


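// NOTE: the snippet omits the class declaration and the private fields it relies on. A
// reconstruction based on the Spark 2.4-era source this example tracks (exact parameters
// may differ from the original file) is roughly:
class StateStoreRDD[T: ClassTag, U: ClassTag](
    dataRDD: RDD[T],
    storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U],
    checkpointLocation: String,
    queryRunId: UUID,
    operatorId: Long,
    storeVersion: Long,
    keySchema: StructType,
    valueSchema: StructType,
    indexOrdinal: Option[Int],
    sessionState: SessionState,
    @transient private val storeCoordinator: Option[StateStoreCoordinatorRef])
  extends RDD[U](dataRDD) {

  private val storeConf = new StateStoreConf(sessionState.conf)

  // A Hadoop Configuration is comparatively heavy, so broadcast it once rather than per task.
  private val hadoopConfBroadcast = dataRDD.context.broadcast(
    new SerializableConfiguration(sessionState.newHadoopConf()))

  override protected def getPartitions: Array[Partition] = dataRDD.partitions
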
  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val stateStoreProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index),
      queryRunId)
    storeCoordinator.flatMap(_.getLocation(stateStoreProviderId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null
    val storeProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index),
      queryRunId)

    // If we're in continuous processing mode, we should get the store version for the current
    // epoch rather than the one at planning time.
    val isContinuous = Option(ctxt.getLocalProperty(StreamExecution.IS_CONTINUOUS_PROCESSING))
      .map(_.toBoolean).getOrElse(false)
    val currentVersion = if (isContinuous) {
      val epoch = EpochTracker.getCurrentEpoch
      assert(epoch.isDefined, "Current epoch must be defined for continuous processing streams.")
      epoch.get
    } else {
      storeVersion
    }

    store = StateStore.get(
      storeProviderId, keySchema, valueSchema, indexOrdinal, currentVersion,
      storeConf, hadoopConfBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
} 
Example 3
Source File: WriteToContinuousDataSourceExec.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import scala.util.control.NonFatal

import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter


case class WriteToContinuousDataSourceExec(writer: StreamWriter, query: SparkPlan)
    extends SparkPlan with Logging {
  override def children: Seq[SparkPlan] = Seq(query)
  override def output: Seq[Attribute] = Nil

  override protected def doExecute(): RDD[InternalRow] = {
    val writerFactory = writer.createWriterFactory()
    val rdd = new ContinuousWriteRDD(query.execute(), writerFactory)

    logInfo(s"Start processing data source writer: $writer. " +
      s"The input RDD has ${rdd.partitions.length} partitions.")
    EpochCoordinatorRef.get(
      sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY),
      sparkContext.env)
      .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions))

    try {
      // Force the RDD to run so continuous processing starts; no data is actually being collected
      // to the driver, as ContinuousWriteRDD outputs nothing.
      rdd.collect()
    } catch {
      case _: InterruptedException =>
        // Interruption is how continuous queries are ended, so accept and ignore the exception.
      case cause: Throwable =>
        cause match {
          // Do not wrap interruption exceptions that will be handled by streaming specially.
          case _ if StreamExecution.isInterruptionException(cause) => throw cause
          // Only wrap non fatal exceptions.
          case NonFatal(e) => throw new SparkException("Writing job aborted.", e)
          case _ => throw cause
        }
    }

    sparkContext.emptyRDD
  }
} 
Example 4
Source File: StateStoreMetricsTest.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.sql.execution.streaming.StreamExecution

trait StateStoreMetricsTest extends StreamTest {

  private var lastCheckedRecentProgressIndex = -1
  private var lastQuery: StreamExecution = null

  override def beforeEach(): Unit = {
    super.beforeEach()
    lastCheckedRecentProgressIndex = -1
  }

  def assertNumStateRows(total: Seq[Long], updated: Seq[Long]): AssertOnQuery =
    AssertOnQuery(s"Check total state rows = $total, updated state rows = $updated") { q =>
      // This assumes that the streaming query will not make any progress while the eventually
      // block is being executed.
      eventually(timeout(streamingTimeout)) {
        val recentProgress = q.recentProgress
        require(recentProgress.nonEmpty, "No progress made, cannot check num state rows")
        require(recentProgress.length < spark.sessionState.conf.streamingProgressRetention,
          "This test assumes that all progresses are present in q.recentProgress but " +
            "some may have been dropped due to retention limits")

        if (q.ne(lastQuery)) lastCheckedRecentProgressIndex = -1
        lastQuery = q

        val numStateOperators = recentProgress.last.stateOperators.length
        val progressesSinceLastCheck = recentProgress
          .slice(lastCheckedRecentProgressIndex + 1, recentProgress.length)
          .filter(_.stateOperators.length == numStateOperators)

        val allNumUpdatedRowsSinceLastCheck =
          progressesSinceLastCheck.map(_.stateOperators.map(_.numRowsUpdated))

        lazy val debugString = "recent progresses:\n" +
          progressesSinceLastCheck.map(_.prettyJson).mkString("\n\n")

        val numTotalRows = recentProgress.last.stateOperators.map(_.numRowsTotal)
        assert(numTotalRows === total, s"incorrect total rows, $debugString")

        val numUpdatedRows = arraySum(allNumUpdatedRowsSinceLastCheck, numStateOperators)
        assert(numUpdatedRows === updated, s"incorrect updated rows, $debugString")

        lastCheckedRecentProgressIndex = recentProgress.length - 1
      }
      true
    }

  def assertNumStateRows(total: Long, updated: Long): AssertOnQuery =
    assertNumStateRows(Seq(total), Seq(updated))

  def arraySum(arraySeq: Seq[Array[Long]], arrayLength: Int): Seq[Long] = {
    if (arraySeq.isEmpty) return Seq.fill(arrayLength)(0L)

    assert(arraySeq.forall(_.length == arrayLength),
      "Arrays are of different lengths:\n" + arraySeq.map(_.toSeq).mkString("\n"))
    (0 until arrayLength).map { index => arraySeq.map(_.apply(index)).sum }
  }
} 
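A hedged sketch of how this trait is typically used, closely following Spark's own StreamingAggregationSuite (the suite and query names here are illustrative):

import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.streaming.OutputMode

class SimpleCountSuite extends StateStoreMetricsTest {
  import testImplicits._

  test("simple count tracked in the state store") {
    val inputData = MemoryStream[Int]
    val aggregated = inputData.toDF().groupBy($"value").agg(count("*")).as[(Int, Long)]

    testStream(aggregated, OutputMode.Update)(
      AddData(inputData, 3),
      CheckLastBatch((3, 1)),
      // One distinct key so far, and one row was updated in the last trigger.
      assertNumStateRows(total = 1, updated = 1)
    )
  }
}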
Example 5
Source File: SparkAtlasStreamingQueryEventTracker.scala    From spark-atlas-connector   with Apache License 2.0
package com.hortonworks.spark.atlas

import com.hortonworks.spark.atlas.sql.{QueryDetail, SparkExecutionPlanProcessor}

import scala.collection.mutable
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener._
import com.hortonworks.spark.atlas.utils.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}

class SparkAtlasStreamingQueryEventTracker(
     atlasClient: AtlasClient,
     atlasClientConf: AtlasClientConf)
  extends StreamingQueryListener with Logging {

  def this(atlasClientConf: AtlasClientConf) = {
    this(AtlasClient.atlasClient(atlasClientConf), atlasClientConf)
  }

  def this() {
    this(new AtlasClientConf)
  }

  private val enabled: Boolean = AtlasUtils.isSacEnabled(atlasClientConf)

  private val executionPlanTracker = new SparkExecutionPlanProcessor(atlasClient, atlasClientConf)
  executionPlanTracker.startThread()

  override def onQueryStarted(event: QueryStartedEvent): Unit = {
    logDebug(s"Start to track the Spark Streaming query in the Spark Atlas $event")
  }

  override def onQueryProgress(event: QueryProgressEvent): Unit = {
    if (!enabled) {
      // No op if SAC is disabled
      return
    }
    logInfo(s"Track running Spark Streaming query in the Spark Atlas: $event")
    val query = SparkSession.active.streams.get(event.progress.id)
    if (query != null) {
      val qd = query match {
        case query: StreamingQueryWrapper =>
          Some(QueryDetail.fromStreamingQueryListener(query.streamingQuery, event))

        case query: StreamExecution =>
          Some(QueryDetail.fromStreamingQueryListener(query, event))

        case _ =>
          logWarn(s"Unexpected type of streaming query: ${query.getClass}")
          None
      }

      qd.foreach { q =>
        if (q.qe != null) {
          executionPlanTracker.pushEvent(q)
        } else {
          logInfo(s"Can't retrieve query execution information for query ${event.progress.id}" +
            " - skip and wait for next batch.")
        }
      }
    } else {
      logWarn(s"Cannot find query ${event.progress.id} from active spark session!")
    }
  }

  override def onQueryTerminated(event: QueryTerminatedEvent): Unit = {
    logDebug(s"Tack Spark Streaming query in the Spark Atlas Terminated: $event")
  }
} 
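The no-argument constructor above lets Spark instantiate the tracker itself. A minimal sketch of registering it on a session, for example from spark-shell, assuming the spark-atlas-connector jar and Atlas configuration are on the classpath:

import org.apache.spark.sql.SparkSession
import com.hortonworks.spark.atlas.SparkAtlasStreamingQueryEventTracker

val spark = SparkSession.builder().appName("atlas-tracked-stream").getOrCreate()

// Receive onQueryStarted/onQueryProgress/onQueryTerminated callbacks for all streaming queries.
// Alternatively, set spark.sql.streaming.streamingQueryListeners to the class name and let
// Spark create the listener on its own.
spark.streams.addListener(new SparkAtlasStreamingQueryEventTracker())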
Example 6
Source File: AtlasStreamingQueryProgressListener.scala    From spark-atlas-connector   with Apache License 2.0
package com.hortonworks.spark.atlas.sql.testhelper

import com.hortonworks.spark.atlas.sql.QueryDetail
import com.hortonworks.spark.atlas.utils.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}

import scala.collection.mutable
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent}

class AtlasStreamingQueryProgressListener extends StreamingQueryListener with Logging {
  val queryDetails = new mutable.MutableList[QueryDetail]()

  def onQueryStarted(event: QueryStartedEvent): Unit = {}

  def onQueryProgress(event: QueryProgressEvent): Unit = {
    // FIXME: this is totally duplicated with SparkAtlasStreamingQueryEventTracker...
    //  Extract into somewhere...
    val query = SparkSession.active.streams.get(event.progress.id)
    if (query != null) {
      query match {
        case query: StreamingQueryWrapper =>
          val qd = QueryDetail.fromStreamingQueryListener(query.streamingQuery, event)
          queryDetails += qd

        case query: StreamExecution =>
          val qd = QueryDetail.fromStreamingQueryListener(query, event)
          queryDetails += qd

        case _ => logWarn(s"Unexpected type of streaming query: ${query.getClass}")
      }
    } else {
      logWarn(s"Cannot find query ${event.progress.id} from active spark session!")
    }
  }

  def onQueryTerminated(event: QueryTerminatedEvent): Unit = {}

  def clear(): Unit = {
    queryDetails.clear()
  }
} 
Example 7
Source File: CarbonStreamingQueryListener.scala    From carbondata   with Apache License 2.0
package org.apache.carbondata.streaming

import java.util
import java.util.UUID

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{CarbonAppendableStreamSink, StreamExecution}
import org.apache.spark.sql.streaming.StreamingQueryListener

import org.apache.carbondata.common.logging.LogServiceFactory

class CarbonStreamingQueryListener(spark: SparkSession) extends StreamingQueryListener {

  private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  private val cache = new util.HashMap[UUID, String]()

  override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = {
    val streamQuery = spark.streams.get(event.id)
    val qry = if (streamQuery.isInstanceOf[StreamExecution]) {
      // Spark 2.1: the StreamingQueryManager returns the StreamExecution directly
      streamQuery.asInstanceOf[StreamExecution]
    } else {
      // Spark 2.2 and later: unwrap the StreamingQueryWrapper via reflection to get the StreamExecution
      val clazz = Class.forName("org.apache.spark.sql.execution.streaming.StreamingQueryWrapper")
      val method = clazz.getMethod("streamingQuery")
      method.invoke(streamQuery).asInstanceOf[StreamExecution]
    }
    if (qry.sink.isInstanceOf[CarbonAppendableStreamSink]) {
      LOGGER.info("Carbon streaming query started: " + event.id)
      val sink = qry.sink.asInstanceOf[CarbonAppendableStreamSink]
      val carbonTable = sink.carbonTable
      cache.put(event.id, carbonTable.getTableUniqueName)
    }
  }

  override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = {
  }

  override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = {
    val tableUniqueName = cache.remove(event.id)
    if (null != tableUniqueName) {
      LOGGER.info("Carbon streaming query End: " + event.id)
      StreamSinkFactory.unLock(tableUniqueName)
    }
  }
} 
Example 8
Source File: DeltaSink.scala    From delta   with Apache License 2.0
package org.apache.spark.sql.delta.sources

import org.apache.spark.sql.delta._
import org.apache.spark.sql.delta.actions.SetTransaction
import org.apache.spark.sql.delta.metering.DeltaLogging
import org.apache.spark.sql.delta.schema.{ImplicitMetadataOperation, SchemaUtils}
import org.apache.hadoop.fs.Path

import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.execution.SQLExecution
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric
import org.apache.spark.sql.execution.streaming.{Sink, StreamExecution}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.NullType


class DeltaSink(
    sqlContext: SQLContext,
    path: Path,
    partitionColumns: Seq[String],
    outputMode: OutputMode,
    options: DeltaOptions)
  extends Sink with ImplicitMetadataOperation with DeltaLogging {

  private val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, path)

  private val sqlConf = sqlContext.sparkSession.sessionState.conf

  override protected val canOverwriteSchema: Boolean =
    outputMode == OutputMode.Complete() && options.canOverwriteSchema

  override protected val canMergeSchema: Boolean = options.canMergeSchema

  override def addBatch(batchId: Long, data: DataFrame): Unit = deltaLog.withNewTransaction { txn =>
    val sc = data.sparkSession.sparkContext
    val metrics = Map[String, SQLMetric](
      "numAddedFiles" -> createMetric(sc, "number of files added"),
      "numRemovedFiles" -> createMetric(sc, "number of files removed")
    )
    val queryId = sqlContext.sparkContext.getLocalProperty(StreamExecution.QUERY_ID_KEY)
    assert(queryId != null)

    if (SchemaUtils.typeExistsRecursively(data.schema)(_.isInstanceOf[NullType])) {
      throw DeltaErrors.streamWriteNullTypeException
    }

    // If the batch reads the same Delta table that this sink is going to write to, then this
    // write has dependencies. In that case make sure the commit sets hasDependencies to true
    // by injecting a read on the whole table. This needs to be done explicitly because
    // MicroBatchExecution has already enforced all the data skipping (by forcing the generation
    // of the executed plan) even before the transaction was started.
    val selfScan = data.queryExecution.analyzed.collectFirst {
      case DeltaTable(index) if index.deltaLog.isSameLogAs(txn.deltaLog) => true
    }.nonEmpty
    if (selfScan) {
      txn.readWholeTable()
    }

    // Streaming sinks can't blindly overwrite schema. See Schema Management design doc for details
    updateMetadata(
      txn,
      data,
      partitionColumns,
      configuration = Map.empty,
      outputMode == OutputMode.Complete())

    val currentVersion = txn.txnVersion(queryId)
    if (currentVersion >= batchId) {
      logInfo(s"Skipping already complete epoch $batchId, in query $queryId")
      return
    }

    val deletedFiles = outputMode match {
      case o if o == OutputMode.Complete() =>
        deltaLog.assertRemovable()
        txn.filterFiles().map(_.remove)
      case _ => Nil
    }
    val newFiles = txn.writeFiles(data, Some(options))
    val setTxn = SetTransaction(queryId, batchId, Some(deltaLog.clock.getTimeMillis())) :: Nil
    val info = DeltaOperations.StreamingUpdate(outputMode, queryId, batchId, options.userMetadata)
    metrics("numRemovedFiles").set(deletedFiles.size)
    metrics("numAddedFiles").set(newFiles.size)
    txn.registerSQLMetrics(sqlContext.sparkSession, metrics)
    txn.commit(setTxn ++ newFiles ++ deletedFiles, info)
    // This is needed to make the SQL metrics visible in the Spark UI
    val executionId = sqlContext.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
    SQLMetrics.postDriverMetricUpdates(
      sqlContext.sparkContext, executionId, metrics.values.toSeq)
  }

  override def toString(): String = s"DeltaSink[$path]"
} 
Example 9
Source File: KafkaContinuousTest.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.concurrent.atomic.AtomicInteger

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.test.TestSparkSession

// Trait to configure StreamTest for kafka continuous execution tests.
trait KafkaContinuousTest extends KafkaSourceTest {
  override val defaultTrigger = Trigger.Continuous(1000)
  override val defaultUseV2Sink = true

  // We need more than the default local[2] to be able to schedule all partitions simultaneously.
  override protected def createSparkSession = new TestSparkSession(
    new SparkContext(
      "local[10]",
      "continuous-stream-test-sql-context",
      sparkConf.set("spark.sql.testkey", "true")))

  // In addition to setting the partitions in Kafka, we have to wait until the query has
  // reconfigured to the new count so the test framework can hook in properly.
  override protected def setTopicPartitions(
      topic: String, newCount: Int, query: StreamExecution) = {
    testUtils.addPartitions(topic, newCount)
    eventually(timeout(streamingTimeout)) {
      assert(
        query.lastExecution.logical.collectFirst {
          case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
        }.exists(_.knownPartitions.size == newCount),
        s"query never reconfigured to $newCount partitions")
    }
  }

  // Continuous processing tasks end asynchronously, so test that they actually end.
  private val tasksEndedListener = new SparkListener() {
    val activeTaskIdCount = new AtomicInteger(0)

    override def onTaskStart(start: SparkListenerTaskStart): Unit = {
      activeTaskIdCount.incrementAndGet()
    }

    override def onTaskEnd(end: SparkListenerTaskEnd): Unit = {
      activeTaskIdCount.decrementAndGet()
    }
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    spark.sparkContext.addSparkListener(tasksEndedListener)
  }

  override def afterEach(): Unit = {
    eventually(timeout(streamingTimeout)) {
      assert(tasksEndedListener.activeTaskIdCount.get() == 0)
    }
    spark.sparkContext.removeSparkListener(tasksEndedListener)
    super.afterEach()
  }


  test("ensure continuous stream is being used") {
    val query = spark.readStream
      .format("rate")
      .option("numPartitions", "1")
      .option("rowsPerSecond", "1")
      .load()

    testStream(query)(
      Execute(q => assert(q.isInstanceOf[ContinuousExecution]))
    )
  }
} 
Example 10
Source File: KafkaContinuousSourceSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.Properties
import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._
import scala.collection.mutable
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.{StreamTest, Trigger}
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}

// Run tests in KafkaSourceSuiteBase in continuous execution mode.
class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest

class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest {
  import testImplicits._

  override val brokerProps = Map("auto.create.topics.enable" -> "false")

  test("subscribing topic by pattern with topic deletions") {
    val topicPrefix = newTopic()
    val topic = topicPrefix + "-seems"
    val topic2 = topicPrefix + "-bad"
    testUtils.createTopic(topic, partitions = 5)
    testUtils.sendMessages(topic, Array("-1"))
    require(testUtils.getLatestOffsets(Set(topic)).size === 5)

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", testUtils.brokerAddress)
      .option("kafka.metadata.max.age.ms", "1")
      .option("subscribePattern", s"$topicPrefix-.*")
      .option("failOnDataLoss", "false")

    val kafka = reader.load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]
    val mapped = kafka.map(kv => kv._2.toInt + 1)

    testStream(mapped)(
      makeSureGetOffsetCalled,
      AddKafkaData(Set(topic), 1, 2, 3),
      CheckAnswer(2, 3, 4),
      Execute { query =>
        testUtils.deleteTopic(topic)
        testUtils.createTopic(topic2, partitions = 5)
        eventually(timeout(streamingTimeout)) {
          assert(
            query.lastExecution.logical.collectFirst {
              case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
            }.exists { r =>
              // Ensure the new topic is present and the old topic is gone.
              r.knownPartitions.exists(_.topic == topic2)
            },
            s"query never reconfigured to new topic $topic2")
        }
      },
      AddKafkaData(Set(topic2), 4, 5, 6),
      CheckAnswer(2, 3, 4, 5, 6, 7)
    )
  }
}

class KafkaContinuousSourceStressForDontFailOnDataLossSuite
    extends KafkaSourceStressForDontFailOnDataLossSuite {
  override protected def startStream(ds: Dataset[Int]) = {
    ds.writeStream
      .format("memory")
      .queryName("memory")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
}