org.apache.spark.sql.execution.streaming.StreamExecution Scala Examples
The following examples show how to use org.apache.spark.sql.execution.streaming.StreamExecution.
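Before the examples, here is a minimal sketch of how the internal StreamExecution can be reached from the public StreamingQuery handle, since several of the examples below rely on this unwrapping. It assumes a local session and a trivial rate-source query purely for illustration; the object name is hypothetical.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}

object StreamExecutionDemo {  // hypothetical object name
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("stream-execution-demo")
      .getOrCreate()

    // Start a trivial query against the built-in rate source.
    val query = spark.readStream.format("rate").load()
      .writeStream.format("console").start()

    // On Spark 2.2+ the public handle is a StreamingQueryWrapper around the
    // internal StreamExecution; older versions exposed StreamExecution directly.
    val exec: StreamExecution = query match {
      case w: StreamingQueryWrapper => w.streamingQuery
      case other => other.asInstanceOf[StreamExecution]
    }
    println(s"runId = ${exec.runId}")

    query.stop()
    spark.stop()
  }
}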
Example 1
Source File: StreamingQueryException.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.annotation.Experimental
import org.apache.spark.sql.execution.streaming.{Offset, StreamExecution}

// NOTE: the enclosing StreamingQueryException class declaration is not shown in this excerpt.

  val time: Long = System.currentTimeMillis

  override def toString(): String = {
    val causeStr =
      s"${cause.getMessage} ${cause.getStackTrace.take(10).mkString("", "\n|\t", "\n")}"
    s"""
       |$causeStr
       |
       |${query.asInstanceOf[StreamExecution].toDebugString}
       """.stripMargin
  }
}
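For context, a StreamingQueryException like the one built above typically surfaces through awaitTermination() on the query handle. The sketch below is illustrative only; the helper name is hypothetical and the query is assumed to be an already started StreamingQuery.

import org.apache.spark.sql.streaming.{StreamingQuery, StreamingQueryException}

def waitAndReport(query: StreamingQuery): Unit = {  // hypothetical helper
  try {
    query.awaitTermination()
  } catch {
    case e: StreamingQueryException =>
      // toString includes the cause's truncated stack trace and the
      // query's debug string, as assembled in the example above.
      println(e)
  }
}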
Example 2
Source File: StateStoreRDD.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming.state

import java.util.UUID

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.EpochTracker
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration

// NOTE: the enclosing StateStoreRDD class declaration is not shown in this excerpt.

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val stateStoreProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index), queryRunId)
    storeCoordinator.flatMap(_.getLocation(stateStoreProviderId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null
    val storeProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index), queryRunId)

    // If we're in continuous processing mode, we should get the store version for the current
    // epoch rather than the one at planning time.
    val isContinuous = Option(ctxt.getLocalProperty(StreamExecution.IS_CONTINUOUS_PROCESSING))
      .map(_.toBoolean).getOrElse(false)
    val currentVersion = if (isContinuous) {
      val epoch = EpochTracker.getCurrentEpoch
      assert(epoch.isDefined, "Current epoch must be defined for continuous processing streams.")
      epoch.get
    } else {
      storeVersion
    }

    store = StateStore.get(
      storeProviderId, keySchema, valueSchema, indexOrdinal, currentVersion,
      storeConf, hadoopConfBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
}
Example 3
Source File: WriteToContinuousDataSourceExec.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import scala.util.control.NonFatal

import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter

case class WriteToContinuousDataSourceExec(writer: StreamWriter, query: SparkPlan)
  extends SparkPlan with Logging {

  override def children: Seq[SparkPlan] = Seq(query)
  override def output: Seq[Attribute] = Nil

  override protected def doExecute(): RDD[InternalRow] = {
    val writerFactory = writer.createWriterFactory()
    val rdd = new ContinuousWriteRDD(query.execute(), writerFactory)

    logInfo(s"Start processing data source writer: $writer. " +
      s"The input RDD has ${rdd.partitions.length} partitions.")
    EpochCoordinatorRef.get(
      sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY),
      sparkContext.env)
      .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions))

    try {
      // Force the RDD to run so continuous processing starts; no data is actually being collected
      // to the driver, as ContinuousWriteRDD outputs nothing.
      rdd.collect()
    } catch {
      case _: InterruptedException =>
        // Interruption is how continuous queries are ended, so accept and ignore the exception.
      case cause: Throwable =>
        cause match {
          // Do not wrap interruption exceptions that will be handled by streaming specially.
          case _ if StreamExecution.isInterruptionException(cause) => throw cause
          // Only wrap non fatal exceptions.
          case NonFatal(e) => throw new SparkException("Writing job aborted.", e)
          case _ => throw cause
        }
    }

    sparkContext.emptyRDD
  }
}
Example 4
Source File: StateStoreMetricsTest.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.sql.execution.streaming.StreamExecution

trait StateStoreMetricsTest extends StreamTest {

  private var lastCheckedRecentProgressIndex = -1
  private var lastQuery: StreamExecution = null

  override def beforeEach(): Unit = {
    super.beforeEach()
    lastCheckedRecentProgressIndex = -1
  }

  def assertNumStateRows(total: Seq[Long], updated: Seq[Long]): AssertOnQuery =
    AssertOnQuery(s"Check total state rows = $total, updated state rows = $updated") { q =>
      // This assumes that the streaming query will not make any progress while the eventually
      // is being executed.
      eventually(timeout(streamingTimeout)) {
        val recentProgress = q.recentProgress
        require(recentProgress.nonEmpty, "No progress made, cannot check num state rows")
        require(recentProgress.length < spark.sessionState.conf.streamingProgressRetention,
          "This test assumes that all progresses are present in q.recentProgress but " +
            "some may have been dropped due to retention limits")

        if (q.ne(lastQuery)) lastCheckedRecentProgressIndex = -1
        lastQuery = q

        val numStateOperators = recentProgress.last.stateOperators.length
        val progressesSinceLastCheck = recentProgress
          .slice(lastCheckedRecentProgressIndex + 1, recentProgress.length)
          .filter(_.stateOperators.length == numStateOperators)

        val allNumUpdatedRowsSinceLastCheck =
          progressesSinceLastCheck.map(_.stateOperators.map(_.numRowsUpdated))

        lazy val debugString = "recent progresses:\n" +
          progressesSinceLastCheck.map(_.prettyJson).mkString("\n\n")

        val numTotalRows = recentProgress.last.stateOperators.map(_.numRowsTotal)
        assert(numTotalRows === total, s"incorrect total rows, $debugString")

        val numUpdatedRows = arraySum(allNumUpdatedRowsSinceLastCheck, numStateOperators)
        assert(numUpdatedRows === updated, s"incorrect updates rows, $debugString")

        lastCheckedRecentProgressIndex = recentProgress.length - 1
      }
      true
    }

  def assertNumStateRows(total: Long, updated: Long): AssertOnQuery =
    assertNumStateRows(Seq(total), Seq(updated))

  def arraySum(arraySeq: Seq[Array[Long]], arrayLength: Int): Seq[Long] = {
    if (arraySeq.isEmpty) return Seq.fill(arrayLength)(0L)

    assert(arraySeq.forall(_.length == arrayLength),
      "Arrays are of different lengths:\n" + arraySeq.map(_.toSeq).mkString("\n"))
    (0 until arrayLength).map { index => arraySeq.map(_.apply(index)).sum }
  }
}
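As a hedged illustration of how this trait is meant to be used: assertNumStateRows is interleaved with data actions inside testStream. The snippet below is assumed to live inside a suite that extends StreamTest with StateStoreMetricsTest (with testImplicits in scope); the input stream and aggregation are hypothetical.

import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.OutputMode

// Hypothetical test body: a simple count aggregation over an in-memory stream.
val inputData = MemoryStream[Int]
val aggregated = inputData.toDF().groupBy("value").count()

testStream(aggregated, OutputMode.Update)(
  AddData(inputData, 1, 2, 2),
  CheckLastBatch((1, 1), (2, 2)),
  // Two distinct keys in the state store, both updated in this batch.
  assertNumStateRows(total = 2, updated = 2)
)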
Example 5
Source File: SparkAtlasStreamingQueryEventTracker.scala From spark-atlas-connector with Apache License 2.0
package com.hortonworks.spark.atlas

import com.hortonworks.spark.atlas.sql.{QueryDetail, SparkExecutionPlanProcessor}

import scala.collection.mutable

import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener._
import com.hortonworks.spark.atlas.utils.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}

class SparkAtlasStreamingQueryEventTracker(
    atlasClient: AtlasClient,
    atlasClientConf: AtlasClientConf) extends StreamingQueryListener with Logging {

  def this(atlasClientConf: AtlasClientConf) = {
    this(AtlasClient.atlasClient(atlasClientConf), atlasClientConf)
  }

  def this() {
    this(new AtlasClientConf)
  }

  private val enabled: Boolean = AtlasUtils.isSacEnabled(atlasClientConf)

  private val executionPlanTracker = new SparkExecutionPlanProcessor(atlasClient, atlasClientConf)
  executionPlanTracker.startThread()

  override def onQueryStarted(event: QueryStartedEvent): Unit = {
    logDebug(s"Start to track the Spark Streaming query in the Spark Atlas $event")
  }

  override def onQueryProgress(event: QueryProgressEvent): Unit = {
    if (!enabled) {
      // No op if SAC is disabled
      return
    }

    logInfo(s"Track running Spark Streaming query in the Spark Atlas: $event")
    val query = SparkSession.active.streams.get(event.progress.id)
    if (query != null) {
      val qd = query match {
        case query: StreamingQueryWrapper =>
          Some(QueryDetail.fromStreamingQueryListener(query.streamingQuery, event))

        case query: StreamExecution =>
          Some(QueryDetail.fromStreamingQueryListener(query, event))

        case _ =>
          logWarn(s"Unexpected type of streaming query: ${query.getClass}")
          None
      }

      qd.foreach { q =>
        if (q.qe != null) {
          executionPlanTracker.pushEvent(q)
        } else {
          logInfo(s"Can't retrieve query execution information for query ${event.progress.id}" +
            " - skip and wait for next batch.")
        }
      }
    } else {
      logWarn(s"Cannot find query ${event.progress.id} from active spark session!")
    }
  }

  override def onQueryTerminated(event: QueryTerminatedEvent): Unit = {
    logDebug(s"Tack Spark Streaming query in the Spark Atlas Terminated: $event")
  }
}
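A hedged sketch of how a listener like this would typically be attached to a session, so its onQueryStarted/onQueryProgress/onQueryTerminated callbacks fire for every streaming query; the application name is an assumption.

import org.apache.spark.sql.SparkSession
import com.hortonworks.spark.atlas.SparkAtlasStreamingQueryEventTracker

val spark = SparkSession.builder()
  .appName("atlas-tracking-demo")  // hypothetical app name
  .getOrCreate()

// Register the tracker with the session's StreamingQueryManager.
spark.streams.addListener(new SparkAtlasStreamingQueryEventTracker())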
Example 6
Source File: AtlasStreamingQueryProgressListener.scala From spark-atlas-connector with Apache License 2.0
package com.hortonworks.spark.atlas.sql.testhelper

import com.hortonworks.spark.atlas.sql.QueryDetail
import com.hortonworks.spark.atlas.utils.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}

import scala.collection.mutable

import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent}

class AtlasStreamingQueryProgressListener extends StreamingQueryListener with Logging {
  val queryDetails = new mutable.MutableList[QueryDetail]()

  def onQueryStarted(event: QueryStartedEvent): Unit = {}

  def onQueryProgress(event: QueryProgressEvent): Unit = {
    // FIXME: this is totally duplicated with SparkAtlasStreamingQueryEventTracker...
    // Extract into somewhere...
    val query = SparkSession.active.streams.get(event.progress.id)
    if (query != null) {
      query match {
        case query: StreamingQueryWrapper =>
          val qd = QueryDetail.fromStreamingQueryListener(query.streamingQuery, event)
          queryDetails += qd

        case query: StreamExecution =>
          val qd = QueryDetail.fromStreamingQueryListener(query, event)
          queryDetails += qd

        case _ => logWarn(s"Unexpected type of streaming query: ${query.getClass}")
      }
    } else {
      logWarn(s"Cannot find query ${event.progress.id} from active spark session!")
    }
  }

  def onQueryTerminated(event: QueryTerminatedEvent): Unit = {}

  def clear(): Unit = {
    queryDetails.clear()
  }
}
Example 7
Source File: CarbonStreamingQueryListener.scala From carbondata with Apache License 2.0
package org.apache.carbondata.streaming

import java.util
import java.util.UUID

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.{CarbonAppendableStreamSink, StreamExecution}
import org.apache.spark.sql.streaming.StreamingQueryListener

import org.apache.carbondata.common.logging.LogServiceFactory

class CarbonStreamingQueryListener(spark: SparkSession) extends StreamingQueryListener {

  private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)

  private val cache = new util.HashMap[UUID, String]()

  override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = {
    val streamQuery = spark.streams.get(event.id)
    val qry = if (streamQuery.isInstanceOf[StreamExecution]) {
      // adapt spark 2.1
      streamQuery.asInstanceOf[StreamExecution]
    } else {
      // adapt spark 2.2 and later version
      val clazz = Class.forName("org.apache.spark.sql.execution.streaming.StreamingQueryWrapper")
      val method = clazz.getMethod("streamingQuery")
      method.invoke(streamQuery).asInstanceOf[StreamExecution]
    }
    if (qry.sink.isInstanceOf[CarbonAppendableStreamSink]) {
      LOGGER.info("Carbon streaming query started: " + event.id)
      val sink = qry.sink.asInstanceOf[CarbonAppendableStreamSink]
      val carbonTable = sink.carbonTable
      cache.put(event.id, carbonTable.getTableUniqueName)
    }
  }

  override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = {
  }

  override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = {
    val tableUniqueName = cache.remove(event.id)
    if (null != tableUniqueName) {
      LOGGER.info("Carbon streaming query End: " + event.id)
      StreamSinkFactory.unLock(tableUniqueName)
    }
  }
}
Example 8
Source File: DeltaSink.scala From delta with Apache License 2.0
package org.apache.spark.sql.delta.sources

import org.apache.spark.sql.delta._
import org.apache.spark.sql.delta.actions.SetTransaction
import org.apache.spark.sql.delta.metering.DeltaLogging
import org.apache.spark.sql.delta.schema.{ImplicitMetadataOperation, SchemaUtils}
import org.apache.hadoop.fs.Path

import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.execution.SQLExecution
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric
import org.apache.spark.sql.execution.streaming.{Sink, StreamExecution}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.NullType

class DeltaSink(
    sqlContext: SQLContext,
    path: Path,
    partitionColumns: Seq[String],
    outputMode: OutputMode,
    options: DeltaOptions)
  extends Sink with ImplicitMetadataOperation with DeltaLogging {

  private val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, path)

  private val sqlConf = sqlContext.sparkSession.sessionState.conf

  override protected val canOverwriteSchema: Boolean =
    outputMode == OutputMode.Complete() && options.canOverwriteSchema

  override protected val canMergeSchema: Boolean = options.canMergeSchema

  override def addBatch(batchId: Long, data: DataFrame): Unit = deltaLog.withNewTransaction { txn =>
    val sc = data.sparkSession.sparkContext
    val metrics = Map[String, SQLMetric](
      "numAddedFiles" -> createMetric(sc, "number of files added"),
      "numRemovedFiles" -> createMetric(sc, "number of files removed")
    )
    val queryId = sqlContext.sparkContext.getLocalProperty(StreamExecution.QUERY_ID_KEY)
    assert(queryId != null)

    if (SchemaUtils.typeExistsRecursively(data.schema)(_.isInstanceOf[NullType])) {
      throw DeltaErrors.streamWriteNullTypeException
    }

    // If the batch reads the same Delta table as this sink is going to write to, then this
    // write has dependencies. Then make sure that this commit set hasDependencies to true
    // by injecting a read on the whole table. This needs to be done explicitly because
    // MicroBatchExecution has already enforced all the data skipping (by forcing the generation
    // of the executed plan) even before the transaction was started.
    val selfScan = data.queryExecution.analyzed.collectFirst {
      case DeltaTable(index) if index.deltaLog.isSameLogAs(txn.deltaLog) => true
    }.nonEmpty
    if (selfScan) {
      txn.readWholeTable()
    }

    // Streaming sinks can't blindly overwrite schema.
    // See Schema Management design doc for details.
    updateMetadata(
      txn,
      data,
      partitionColumns,
      configuration = Map.empty,
      outputMode == OutputMode.Complete())

    val currentVersion = txn.txnVersion(queryId)
    if (currentVersion >= batchId) {
      logInfo(s"Skipping already complete epoch $batchId, in query $queryId")
      return
    }

    val deletedFiles = outputMode match {
      case o if o == OutputMode.Complete() =>
        deltaLog.assertRemovable()
        txn.filterFiles().map(_.remove)
      case _ => Nil
    }
    val newFiles = txn.writeFiles(data, Some(options))
    val setTxn = SetTransaction(queryId, batchId, Some(deltaLog.clock.getTimeMillis())) :: Nil
    val info = DeltaOperations.StreamingUpdate(outputMode, queryId, batchId, options.userMetadata)
    metrics("numRemovedFiles").set(deletedFiles.size)
    metrics("numAddedFiles").set(newFiles.size)
    txn.registerSQLMetrics(sqlContext.sparkSession, metrics)
    txn.commit(setTxn ++ newFiles ++ deletedFiles, info)
    // This is needed to make the SQL metrics visible in the Spark UI
    val executionId = sqlContext.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
    SQLMetrics.postDriverMetricUpdates(
      sqlContext.sparkContext, executionId, metrics.values.toSeq)
  }

  override def toString(): String = s"DeltaSink[$path]"
}
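As a hedged usage sketch, a streaming write in the delta format is what exercises DeltaSink.addBatch above; the query id that addBatch reads via StreamExecution.QUERY_ID_KEY is expected to be set as a local property by the streaming engine before the sink is invoked (which is what the assert relies on). The rate source, application name, and paths below are assumptions for illustration.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("delta-sink-demo")  // hypothetical app name
  .getOrCreate()

// A trivial streaming source stands in for real data.
val events = spark.readStream.format("rate").load()

val query = events.writeStream
  .format("delta")
  .outputMode("append")
  .option("checkpointLocation", "/tmp/checkpoints/events")  // hypothetical path
  .start("/tmp/delta/events")                               // hypothetical path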
Example 9
Source File: KafkaContinuousTest.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.concurrent.atomic.AtomicInteger

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.test.TestSparkSession

// Trait to configure StreamTest for kafka continuous execution tests.
trait KafkaContinuousTest extends KafkaSourceTest {
  override val defaultTrigger = Trigger.Continuous(1000)
  override val defaultUseV2Sink = true

  // We need more than the default local[2] to be able to schedule all partitions simultaneously.
  override protected def createSparkSession = new TestSparkSession(
    new SparkContext(
      "local[10]",
      "continuous-stream-test-sql-context",
      sparkConf.set("spark.sql.testkey", "true")))

  // In addition to setting the partitions in Kafka, we have to wait until the query has
  // reconfigured to the new count so the test framework can hook in properly.
  override protected def setTopicPartitions(
      topic: String, newCount: Int, query: StreamExecution) = {
    testUtils.addPartitions(topic, newCount)
    eventually(timeout(streamingTimeout)) {
      assert(
        query.lastExecution.logical.collectFirst {
          case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
        }.exists(_.knownPartitions.size == newCount),
        s"query never reconfigured to $newCount partitions")
    }
  }

  // Continuous processing tasks end asynchronously, so test that they actually end.
  private val tasksEndedListener = new SparkListener() {
    val activeTaskIdCount = new AtomicInteger(0)

    override def onTaskStart(start: SparkListenerTaskStart): Unit = {
      activeTaskIdCount.incrementAndGet()
    }

    override def onTaskEnd(end: SparkListenerTaskEnd): Unit = {
      activeTaskIdCount.decrementAndGet()
    }
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    spark.sparkContext.addSparkListener(tasksEndedListener)
  }

  override def afterEach(): Unit = {
    eventually(timeout(streamingTimeout)) {
      assert(tasksEndedListener.activeTaskIdCount.get() == 0)
    }
    spark.sparkContext.removeSparkListener(tasksEndedListener)
    super.afterEach()
  }

  test("ensure continuous stream is being used") {
    val query = spark.readStream
      .format("rate")
      .option("numPartitions", "1")
      .option("rowsPerSecond", "1")
      .load()

    testStream(query)(
      Execute(q => assert(q.isInstanceOf[ContinuousExecution]))
    )
  }
}
Example 10
Source File: KafkaContinuousSourceSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.Properties
import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._
import scala.collection.mutable
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.{StreamTest, Trigger}
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}

// Run tests in KafkaSourceSuiteBase in continuous execution mode.
class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest

class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest {
  import testImplicits._

  override val brokerProps = Map("auto.create.topics.enable" -> "false")

  test("subscribing topic by pattern with topic deletions") {
    val topicPrefix = newTopic()
    val topic = topicPrefix + "-seems"
    val topic2 = topicPrefix + "-bad"
    testUtils.createTopic(topic, partitions = 5)
    testUtils.sendMessages(topic, Array("-1"))
    require(testUtils.getLatestOffsets(Set(topic)).size === 5)

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", testUtils.brokerAddress)
      .option("kafka.metadata.max.age.ms", "1")
      .option("subscribePattern", s"$topicPrefix-.*")
      .option("failOnDataLoss", "false")

    val kafka = reader.load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]
    val mapped = kafka.map(kv => kv._2.toInt + 1)

    testStream(mapped)(
      makeSureGetOffsetCalled,
      AddKafkaData(Set(topic), 1, 2, 3),
      CheckAnswer(2, 3, 4),
      Execute { query =>
        testUtils.deleteTopic(topic)
        testUtils.createTopic(topic2, partitions = 5)
        eventually(timeout(streamingTimeout)) {
          assert(
            query.lastExecution.logical.collectFirst {
              case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
            }.exists { r =>
              // Ensure the new topic is present and the old topic is gone.
              r.knownPartitions.exists(_.topic == topic2)
            },
            s"query never reconfigured to new topic $topic2")
        }
      },
      AddKafkaData(Set(topic2), 4, 5, 6),
      CheckAnswer(2, 3, 4, 5, 6, 7)
    )
  }
}

class KafkaContinuousSourceStressForDontFailOnDataLossSuite
    extends KafkaSourceStressForDontFailOnDataLossSuite {

  override protected def startStream(ds: Dataset[Int]) = {
    ds.writeStream
      .format("memory")
      .queryName("memory")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
}
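To close, a hedged sketch of the kind of query these suites drive end to end: a Kafka read executed under Trigger.Continuous, which is run by ContinuousExecution, the continuous-mode subclass of StreamExecution. The application name, broker address, and topic below are assumptions for illustration.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

val spark = SparkSession.builder()
  .appName("kafka-continuous-demo")  // hypothetical app name
  .getOrCreate()

val query = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")  // assumed broker address
  .option("subscribe", "events")                        // assumed topic
  .load()
  .selectExpr("CAST(value AS STRING)")
  .writeStream
  .format("console")
  .trigger(Trigger.Continuous("1 second"))
  .start()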