org.apache.spark.sql.streaming.StreamTest Scala Examples
The following examples show how to use org.apache.spark.sql.streaming.StreamTest, Spark's test harness for Structured Streaming queries.
You can go to the original project or source file by following the links above each example.
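Before the project examples, here is a minimal, self-contained sketch of the pattern most of them follow: a suite mixes in StreamTest, drives a MemoryStream as the source, and checks the sink with actions such as AddData, CheckAnswer, and StopStream. The suite name, the mapping, and the values below are illustrative only and are not taken from any of the projects that follow.

package org.apache.spark.sql.execution.streaming

import org.apache.spark.sql.streaming.StreamTest

// Illustrative suite name; any StreamTest subclass follows the same pattern.
class StreamTestSketchSuite extends StreamTest {
  import testImplicits._

  test("increment values from a memory stream") {
    val inputData = MemoryStream[Int]             // in-memory source driven by the test
    val incremented = inputData.toDS().map(_ + 1)

    testStream(incremented)(
      AddData(inputData, 1, 2, 3),                // push one batch of input
      CheckAnswer(2, 3, 4),                       // verify what reached the sink
      StopStream
    )
  }
}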
Example 1
Source File: MemorySinkV2Suite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.streaming.sources._
import org.apache.spark.sql.streaming.{OutputMode, StreamTest}
import org.apache.spark.sql.types.StructType

class MemorySinkV2Suite extends StreamTest with BeforeAndAfter {
  test("data writer") {
    val partition = 1234
    val writer = new MemoryDataWriter(
      partition, OutputMode.Append(), new StructType().add("i", "int"))
    writer.write(InternalRow(1))
    writer.write(InternalRow(2))
    writer.write(InternalRow(44))
    val msg = writer.commit()
    assert(msg.data.map(_.getInt(0)) == Seq(1, 2, 44))
    assert(msg.partition == partition)

    // Buffer should be cleared, so repeated commits should give empty.
    assert(writer.commit().data.isEmpty)
  }

  test("streaming writer") {
    val sink = new MemorySinkV2
    val writeSupport = new MemoryStreamWriter(
      sink, OutputMode.Append(), new StructType().add("i", "int"))
    writeSupport.commit(0, Array(
      MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))),
      MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))),
      MemoryWriterCommitMessage(2, Seq(Row(6), Row(7)))
    ))
    assert(sink.latestBatchId.contains(0))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7))

    writeSupport.commit(19, Array(
      MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))),
      MemoryWriterCommitMessage(0, Seq(Row(33)))
    ))
    assert(sink.latestBatchId.contains(19))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33))
    assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33))
  }
}
Example 2
Source File: MicroBatchExecutionSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.functions.{count, window}
import org.apache.spark.sql.streaming.StreamTest

class MicroBatchExecutionSuite extends StreamTest with BeforeAndAfter {

  import testImplicits._

  after {
    sqlContext.streams.active.foreach(_.stop())
  }

  test("SPARK-24156: do not plan a no-data batch again after it has already been planned") {
    val inputData = MemoryStream[Int]
    val df = inputData.toDF()
      .withColumn("eventTime", $"value".cast("timestamp"))
      .withWatermark("eventTime", "10 seconds")
      .groupBy(window($"eventTime", "5 seconds") as 'window)
      .agg(count("*") as 'count)
      .select($"window".getField("start").cast("long").as[Long], $"count".as[Long])

    testStream(df)(
      AddData(inputData, 10, 11, 12, 13, 14, 15), // Set watermark to 5
      CheckAnswer(),
      AddData(inputData, 25), // Set watermark to 15 to make MicroBatchExecution run no-data batch
      CheckAnswer((10, 5)),   // Last batch should be a no-data batch
      StopStream,
      Execute { q =>
        // Delete the last committed batch from the commit log to signify that the last batch
        // (a no-data batch) never completed
        val commit = q.commitLog.getLatest().map(_._1).getOrElse(-1L)
        q.commitLog.purgeAfter(commit - 1)
      },
      // Add data before start so that MicroBatchExecution can plan a batch. It should not,
      // it should first re-run the incomplete no-data batch and then run a new batch to process
      // new data.
      AddData(inputData, 30),
      StartStream(),
      CheckNewAnswer((15, 1)), // This should not throw the error reported in SPARK-24156
      StopStream,
      Execute { q =>
        // Delete the entire commit log
        val commit = q.commitLog.getLatest().map(_._1).getOrElse(-1L)
        q.commitLog.purge(commit + 1)
      },
      AddData(inputData, 50),
      StartStream(),
      CheckNewAnswer((25, 1), (30, 1)) // This should not throw the error reported in SPARK-24156
    )
  }
}
Example 3
Source File: StreamMetadataSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.util.UUID

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.streaming.StreamTest

class StreamMetadataSuite extends StreamTest {

  test("writing and reading") {
    withTempDir { dir =>
      val id = UUID.randomUUID.toString
      val metadata = StreamMetadata(id)
      val file = new Path(new File(dir, "test").toString)
      StreamMetadata.write(metadata, file, hadoopConf)
      val readMetadata = StreamMetadata.read(file, hadoopConf)
      assert(readMetadata.nonEmpty)
      assert(readMetadata.get.id === id)
    }
  }

  test("read Spark 2.1.0 format") {
    // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0
    assert(
      readForResource("query-metadata-logs-version-2.1.0.txt") ===
        StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e"))
  }

  private def readForResource(fileName: String): StreamMetadata = {
    val input = getClass.getResource(s"/structured-streaming/$fileName")
    StreamMetadata.read(new Path(input.toString), hadoopConf).get
  }

  private val hadoopConf = new Configuration()
}
Example 4
Source File: StreamMetadataSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.util.UUID

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.streaming.StreamTest

class StreamMetadataSuite extends StreamTest {

  test("writing and reading") {
    withTempDir { dir =>
      val id = UUID.randomUUID.toString
      val metadata = StreamMetadata(id)
      val file = new Path(new File(dir, "test").toString)
      StreamMetadata.write(metadata, file, hadoopConf)
      val readMetadata = StreamMetadata.read(file, hadoopConf)
      assert(readMetadata.nonEmpty)
      assert(readMetadata.get.id === id)
    }
  }

  test("read Spark 2.1.0 format") {
    // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0
    assert(
      readForResource("query-metadata-logs-version-2.1.0.txt") ===
        StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e"))
  }

  private def readForResource(fileName: String): StreamMetadata = {
    val input = getClass.getResource(s"/structured-streaming/$fileName")
    StreamMetadata.read(new Path(input.toString), hadoopConf).get
  }

  private val hadoopConf = new Configuration()
}
Example 5
Source File: DeltaSourceSuiteBase.scala From delta with Apache License 2.0
package org.apache.spark.sql.delta

import java.io.File

import org.apache.spark.sql.delta.actions.Format

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.StreamTest
import org.apache.spark.sql.types.StructType

trait DeltaSourceSuiteBase extends StreamTest {

  protected def withMetadata(
      deltaLog: DeltaLog,
      schema: StructType,
      format: String = "parquet",
      tableId: Option[String] = None): Unit = {
    val txn = deltaLog.startTransaction()
    val baseMetadata = tableId.map { tId => txn.metadata.copy(id = tId) }.getOrElse(txn.metadata)
    txn.commit(baseMetadata.copy(
      schemaString = schema.json,
      format = Format(format)
    ) :: Nil, DeltaOperations.ManualUpdate)
  }

  object AddToReservoir {
    def apply(path: File, data: DataFrame): AssertOnQuery =
      AssertOnQuery { _ =>
        data.write.format("delta").mode("append").save(path.getAbsolutePath)
        true
      }
  }
}
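DeltaSourceSuiteBase only defines helpers; a concrete suite mixes it in and combines AddToReservoir with the usual StreamTest actions. The sketch below is illustrative and not part of the original source: the suite name, table path, schema, and data are hypothetical, and depending on the Delta Lake version the suite may also need to mix in Delta's shared-session test trait.

package org.apache.spark.sql.delta

import org.apache.spark.sql.types.StructType

// Hypothetical suite showing one way the helpers above might be used.
class DeltaSourceUsageSketch extends DeltaSourceSuiteBase {
  import testImplicits._

  test("stream rows appended to a Delta table") {
    withTempDir { dir =>
      // Initialize the table's metadata with a single string column.
      val deltaLog = DeltaLog.forTable(spark, dir.getAbsolutePath)
      withMetadata(deltaLog, StructType.fromDDL("value STRING"))

      val stream = spark.readStream
        .format("delta")
        .load(dir.getCanonicalPath)

      testStream(stream)(
        AddToReservoir(dir, Seq("a", "b").toDF("value")), // append via the helper above
        ProcessAllAvailable(),
        CheckAnswer("a", "b")
      )
    }
  }
}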
Example 6
Source File: StreamMetadataSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.util.UUID

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.streaming.StreamTest

class StreamMetadataSuite extends StreamTest {

  test("writing and reading") {
    withTempDir { dir =>
      val id = UUID.randomUUID.toString
      val metadata = StreamMetadata(id)
      val file = new Path(new File(dir, "test").toString)
      StreamMetadata.write(metadata, file, hadoopConf)
      val readMetadata = StreamMetadata.read(file, hadoopConf)
      assert(readMetadata.nonEmpty)
      assert(readMetadata.get.id === id)
    }
  }

  test("read Spark 2.1.0 format") {
    // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0
    assert(
      readForResource("query-metadata-logs-version-2.1.0.txt") ===
        StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e"))
  }

  private def readForResource(fileName: String): StreamMetadata = {
    val input = getClass.getResource(s"/structured-streaming/$fileName")
    StreamMetadata.read(new Path(input.toString), hadoopConf).get
  }

  private val hadoopConf = new Configuration()
}
Example 7
Source File: KafkaContinuousSourceSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.Properties
import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._
import scala.collection.mutable
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.{StreamTest, Trigger}
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}

// Run tests in KafkaSourceSuiteBase in continuous execution mode.
class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest

class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest {
  import testImplicits._

  override val brokerProps = Map("auto.create.topics.enable" -> "false")

  test("subscribing topic by pattern with topic deletions") {
    val topicPrefix = newTopic()
    val topic = topicPrefix + "-seems"
    val topic2 = topicPrefix + "-bad"
    testUtils.createTopic(topic, partitions = 5)
    testUtils.sendMessages(topic, Array("-1"))
    require(testUtils.getLatestOffsets(Set(topic)).size === 5)

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", testUtils.brokerAddress)
      .option("kafka.metadata.max.age.ms", "1")
      .option("subscribePattern", s"$topicPrefix-.*")
      .option("failOnDataLoss", "false")

    val kafka = reader.load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]
    val mapped = kafka.map(kv => kv._2.toInt + 1)

    testStream(mapped)(
      makeSureGetOffsetCalled,
      AddKafkaData(Set(topic), 1, 2, 3),
      CheckAnswer(2, 3, 4),
      Execute { query =>
        testUtils.deleteTopic(topic)
        testUtils.createTopic(topic2, partitions = 5)
        eventually(timeout(streamingTimeout)) {
          assert(
            query.lastExecution.logical.collectFirst {
              case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
            }.exists { r =>
              // Ensure the new topic is present and the old topic is gone.
              r.knownPartitions.exists(_.topic == topic2)
            },
            s"query never reconfigured to new topic $topic2")
        }
      },
      AddKafkaData(Set(topic2), 4, 5, 6),
      CheckAnswer(2, 3, 4, 5, 6, 7)
    )
  }
}

class KafkaContinuousSourceStressForDontFailOnDataLossSuite
    extends KafkaSourceStressForDontFailOnDataLossSuite {
  override protected def startStream(ds: Dataset[Int]) = {
    ds.writeStream
      .format("memory")
      .queryName("memory")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
}
Example 8
Source File: MemorySinkV2Suite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.streaming.sources._
import org.apache.spark.sql.streaming.{OutputMode, StreamTest}

class MemorySinkV2Suite extends StreamTest with BeforeAndAfter {
  test("data writer") {
    val partition = 1234
    val writer = new MemoryDataWriter(partition, OutputMode.Append())
    writer.write(Row(1))
    writer.write(Row(2))
    writer.write(Row(44))
    val msg = writer.commit()
    assert(msg.data.map(_.getInt(0)) == Seq(1, 2, 44))
    assert(msg.partition == partition)

    // Buffer should be cleared, so repeated commits should give empty.
    assert(writer.commit().data.isEmpty)
  }

  test("continuous writer") {
    val sink = new MemorySinkV2
    val writer = new MemoryStreamWriter(sink, OutputMode.Append())
    writer.commit(0, Array(
      MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))),
      MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))),
      MemoryWriterCommitMessage(2, Seq(Row(6), Row(7)))
    ))
    assert(sink.latestBatchId.contains(0))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7))

    writer.commit(19, Array(
      MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))),
      MemoryWriterCommitMessage(0, Seq(Row(33)))
    ))
    assert(sink.latestBatchId.contains(19))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33))
    assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33))
  }

  test("microbatch writer") {
    val sink = new MemorySinkV2
    new MemoryWriter(sink, 0, OutputMode.Append()).commit(
      Array(
        MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))),
        MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))),
        MemoryWriterCommitMessage(2, Seq(Row(6), Row(7)))
      ))
    assert(sink.latestBatchId.contains(0))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7))

    new MemoryWriter(sink, 19, OutputMode.Append()).commit(
      Array(
        MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))),
        MemoryWriterCommitMessage(0, Seq(Row(33)))
      ))
    assert(sink.latestBatchId.contains(19))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33))
    assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33))
  }
}
Example 9
Source File: StreamMetadataSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.util.UUID

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.streaming.StreamTest

class StreamMetadataSuite extends StreamTest {

  test("writing and reading") {
    withTempDir { dir =>
      val id = UUID.randomUUID.toString
      val metadata = StreamMetadata(id)
      val file = new Path(new File(dir, "test").toString)
      StreamMetadata.write(metadata, file, hadoopConf)
      val readMetadata = StreamMetadata.read(file, hadoopConf)
      assert(readMetadata.nonEmpty)
      assert(readMetadata.get.id === id)
    }
  }

  test("read Spark 2.1.0 format") {
    // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0
    assert(
      readForResource("query-metadata-logs-version-2.1.0.txt") ===
        StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e"))
  }

  private def readForResource(fileName: String): StreamMetadata = {
    val input = getClass.getResource(s"/structured-streaming/$fileName")
    StreamMetadata.read(new Path(input.toString), hadoopConf).get
  }

  private val hadoopConf = new Configuration()
}