org.apache.spark.sql.streaming.StreamTest Scala Examples
The following examples show how to use org.apache.spark.sql.streaming.StreamTest, Spark's test harness for Structured Streaming queries.
You can go to the original project or source file by following the links above each example.
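Before the project examples, here is a minimal, self-contained sketch of the pattern most of them follow: a suite mixes in StreamTest, drives a MemoryStream as the source, and checks the sink with actions such as AddData, CheckAnswer, and StopStream. The suite name, the mapping, and the values below are illustrative only and are not taken from any of the projects that follow.

package org.apache.spark.sql.execution.streaming

import org.apache.spark.sql.streaming.StreamTest

// Illustrative suite name; any StreamTest subclass follows the same pattern.
class StreamTestSketchSuite extends StreamTest {
  import testImplicits._

  test("increment values from a memory stream") {
    val inputData = MemoryStream[Int]             // in-memory source driven by the test
    val incremented = inputData.toDS().map(_ + 1)

    testStream(incremented)(
      AddData(inputData, 1, 2, 3),                // push one batch of input
      CheckAnswer(2, 3, 4),                       // verify what reached the sink
      StopStream
    )
  }
}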
Example 1
Source File: MemorySinkV2Suite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.streaming.sources._
import org.apache.spark.sql.streaming.{OutputMode, StreamTest}
import org.apache.spark.sql.types.StructType

class MemorySinkV2Suite extends StreamTest with BeforeAndAfter {
  test("data writer") {
    val partition = 1234
    val writer = new MemoryDataWriter(
      partition, OutputMode.Append(), new StructType().add("i", "int"))
    writer.write(InternalRow(1))
    writer.write(InternalRow(2))
    writer.write(InternalRow(44))
    val msg = writer.commit()
    assert(msg.data.map(_.getInt(0)) == Seq(1, 2, 44))
    assert(msg.partition == partition)

    // Buffer should be cleared, so repeated commits should give empty.
    assert(writer.commit().data.isEmpty)
  }

  test("streaming writer") {
    val sink = new MemorySinkV2
    val writeSupport = new MemoryStreamWriter(
      sink, OutputMode.Append(), new StructType().add("i", "int"))
    writeSupport.commit(0, Array(
      MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))),
      MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))),
      MemoryWriterCommitMessage(2, Seq(Row(6), Row(7)))
    ))
    assert(sink.latestBatchId.contains(0))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7))

    writeSupport.commit(19, Array(
      MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))),
      MemoryWriterCommitMessage(0, Seq(Row(33)))
    ))
    assert(sink.latestBatchId.contains(19))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33))
    assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33))
  }
}
Example 2
Source File: MicroBatchExecutionSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.functions.{count, window}
import org.apache.spark.sql.streaming.StreamTest

class MicroBatchExecutionSuite extends StreamTest with BeforeAndAfter {

  import testImplicits._

  after {
    sqlContext.streams.active.foreach(_.stop())
  }

  test("SPARK-24156: do not plan a no-data batch again after it has already been planned") {
    val inputData = MemoryStream[Int]
    val df = inputData.toDF()
      .withColumn("eventTime", $"value".cast("timestamp"))
      .withWatermark("eventTime", "10 seconds")
      .groupBy(window($"eventTime", "5 seconds") as 'window)
      .agg(count("*") as 'count)
      .select($"window".getField("start").cast("long").as[Long], $"count".as[Long])

    testStream(df)(
      AddData(inputData, 10, 11, 12, 13, 14, 15), // Set watermark to 5
      CheckAnswer(),
      AddData(inputData, 25), // Set watermark to 15 to make MicroBatchExecution run no-data batch
      CheckAnswer((10, 5)),   // Last batch should be a no-data batch
      StopStream,
      Execute { q =>
        // Delete the last committed batch from the commit log to signify that the last batch
        // (a no-data batch) never completed
        val commit = q.commitLog.getLatest().map(_._1).getOrElse(-1L)
        q.commitLog.purgeAfter(commit - 1)
      },
      // Add data before start so that MicroBatchExecution can plan a batch. It should not,
      // it should first re-run the incomplete no-data batch and then run a new batch to process
      // new data.
      AddData(inputData, 30),
      StartStream(),
      CheckNewAnswer((15, 1)), // This should not throw the error reported in SPARK-24156
      StopStream,
      Execute { q =>
        // Delete the entire commit log
        val commit = q.commitLog.getLatest().map(_._1).getOrElse(-1L)
        q.commitLog.purge(commit + 1)
      },
      AddData(inputData, 50),
      StartStream(),
      CheckNewAnswer((25, 1), (30, 1)) // This should not throw the error reported in SPARK-24156
    )
  }
}
Example 3
Source File: StreamMetadataSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.util.UUID

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.streaming.StreamTest

class StreamMetadataSuite extends StreamTest {

  test("writing and reading") {
    withTempDir { dir =>
      val id = UUID.randomUUID.toString
      val metadata = StreamMetadata(id)
      val file = new Path(new File(dir, "test").toString)
      StreamMetadata.write(metadata, file, hadoopConf)
      val readMetadata = StreamMetadata.read(file, hadoopConf)
      assert(readMetadata.nonEmpty)
      assert(readMetadata.get.id === id)
    }
  }

  test("read Spark 2.1.0 format") {
    // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0
    assert(
      readForResource("query-metadata-logs-version-2.1.0.txt") ===
        StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e"))
  }

  private def readForResource(fileName: String): StreamMetadata = {
    val input = getClass.getResource(s"/structured-streaming/$fileName")
    StreamMetadata.read(new Path(input.toString), hadoopConf).get
  }

  private val hadoopConf = new Configuration()
}
Example 4
Source File: StreamMetadataSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.util.UUID

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.streaming.StreamTest

class StreamMetadataSuite extends StreamTest {

  test("writing and reading") {
    withTempDir { dir =>
      val id = UUID.randomUUID.toString
      val metadata = StreamMetadata(id)
      val file = new Path(new File(dir, "test").toString)
      StreamMetadata.write(metadata, file, hadoopConf)
      val readMetadata = StreamMetadata.read(file, hadoopConf)
      assert(readMetadata.nonEmpty)
      assert(readMetadata.get.id === id)
    }
  }

  test("read Spark 2.1.0 format") {
    // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0
    assert(
      readForResource("query-metadata-logs-version-2.1.0.txt") ===
        StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e"))
  }

  private def readForResource(fileName: String): StreamMetadata = {
    val input = getClass.getResource(s"/structured-streaming/$fileName")
    StreamMetadata.read(new Path(input.toString), hadoopConf).get
  }

  private val hadoopConf = new Configuration()
}
Example 5
Source File: DeltaSourceSuiteBase.scala From delta with Apache License 2.0
package org.apache.spark.sql.delta

import java.io.File

import org.apache.spark.sql.delta.actions.Format

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.StreamTest
import org.apache.spark.sql.types.StructType

trait DeltaSourceSuiteBase extends StreamTest {

  protected def withMetadata(
      deltaLog: DeltaLog,
      schema: StructType,
      format: String = "parquet",
      tableId: Option[String] = None): Unit = {
    val txn = deltaLog.startTransaction()
    val baseMetadata = tableId.map { tId => txn.metadata.copy(id = tId) }.getOrElse(txn.metadata)
    txn.commit(baseMetadata.copy(
      schemaString = schema.json,
      format = Format(format)
    ) :: Nil, DeltaOperations.ManualUpdate)
  }

  object AddToReservoir {
    def apply(path: File, data: DataFrame): AssertOnQuery =
      AssertOnQuery { _ =>
        data.write.format("delta").mode("append").save(path.getAbsolutePath)
        true
      }
  }
}
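DeltaSourceSuiteBase only defines helpers; a concrete suite mixes it in and combines AddToReservoir with the usual StreamTest actions. The sketch below is illustrative and not part of the original source: the suite name, table path, schema, and data are hypothetical, and depending on the Delta Lake version the suite may also need to mix in Delta's shared-session test trait.

package org.apache.spark.sql.delta

import org.apache.spark.sql.types.StructType

// Hypothetical suite showing one way the helpers above might be used.
class DeltaSourceUsageSketch extends DeltaSourceSuiteBase {
  import testImplicits._

  test("stream rows appended to a Delta table") {
    withTempDir { dir =>
      // Initialize the table's metadata with a single string column.
      val deltaLog = DeltaLog.forTable(spark, dir.getAbsolutePath)
      withMetadata(deltaLog, StructType.fromDDL("value STRING"))

      val stream = spark.readStream
        .format("delta")
        .load(dir.getCanonicalPath)

      testStream(stream)(
        AddToReservoir(dir, Seq("a", "b").toDF("value")), // append via the helper above
        ProcessAllAvailable(),
        CheckAnswer("a", "b")
      )
    }
  }
}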
Example 6
Source File: StreamMetadataSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.util.UUID

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.streaming.StreamTest

class StreamMetadataSuite extends StreamTest {

  test("writing and reading") {
    withTempDir { dir =>
      val id = UUID.randomUUID.toString
      val metadata = StreamMetadata(id)
      val file = new Path(new File(dir, "test").toString)
      StreamMetadata.write(metadata, file, hadoopConf)
      val readMetadata = StreamMetadata.read(file, hadoopConf)
      assert(readMetadata.nonEmpty)
      assert(readMetadata.get.id === id)
    }
  }

  test("read Spark 2.1.0 format") {
    // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0
    assert(
      readForResource("query-metadata-logs-version-2.1.0.txt") ===
        StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e"))
  }

  private def readForResource(fileName: String): StreamMetadata = {
    val input = getClass.getResource(s"/structured-streaming/$fileName")
    StreamMetadata.read(new Path(input.toString), hadoopConf).get
  }

  private val hadoopConf = new Configuration()
}
Example 7
Source File: KafkaContinuousSourceSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.Properties
import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._
import scala.collection.mutable
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.{StreamTest, Trigger}
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}

// Run tests in KafkaSourceSuiteBase in continuous execution mode.
class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest

class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest {
  import testImplicits._

  override val brokerProps = Map("auto.create.topics.enable" -> "false")

  test("subscribing topic by pattern with topic deletions") {
    val topicPrefix = newTopic()
    val topic = topicPrefix + "-seems"
    val topic2 = topicPrefix + "-bad"
    testUtils.createTopic(topic, partitions = 5)
    testUtils.sendMessages(topic, Array("-1"))
    require(testUtils.getLatestOffsets(Set(topic)).size === 5)

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", testUtils.brokerAddress)
      .option("kafka.metadata.max.age.ms", "1")
      .option("subscribePattern", s"$topicPrefix-.*")
      .option("failOnDataLoss", "false")

    val kafka = reader.load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]
    val mapped = kafka.map(kv => kv._2.toInt + 1)

    testStream(mapped)(
      makeSureGetOffsetCalled,
      AddKafkaData(Set(topic), 1, 2, 3),
      CheckAnswer(2, 3, 4),
      Execute { query =>
        testUtils.deleteTopic(topic)
        testUtils.createTopic(topic2, partitions = 5)
        eventually(timeout(streamingTimeout)) {
          assert(
            query.lastExecution.logical.collectFirst {
              case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
            }.exists { r =>
              // Ensure the new topic is present and the old topic is gone.
              r.knownPartitions.exists(_.topic == topic2)
            },
            s"query never reconfigured to new topic $topic2")
        }
      },
      AddKafkaData(Set(topic2), 4, 5, 6),
      CheckAnswer(2, 3, 4, 5, 6, 7)
    )
  }
}

class KafkaContinuousSourceStressForDontFailOnDataLossSuite
    extends KafkaSourceStressForDontFailOnDataLossSuite {
  override protected def startStream(ds: Dataset[Int]) = {
    ds.writeStream
      .format("memory")
      .queryName("memory")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
}
Example 8
Source File: MemorySinkV2Suite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.streaming.sources._
import org.apache.spark.sql.streaming.{OutputMode, StreamTest}

class MemorySinkV2Suite extends StreamTest with BeforeAndAfter {
  test("data writer") {
    val partition = 1234
    val writer = new MemoryDataWriter(partition, OutputMode.Append())
    writer.write(Row(1))
    writer.write(Row(2))
    writer.write(Row(44))
    val msg = writer.commit()
    assert(msg.data.map(_.getInt(0)) == Seq(1, 2, 44))
    assert(msg.partition == partition)

    // Buffer should be cleared, so repeated commits should give empty.
    assert(writer.commit().data.isEmpty)
  }

  test("continuous writer") {
    val sink = new MemorySinkV2
    val writer = new MemoryStreamWriter(sink, OutputMode.Append())
    writer.commit(0, Array(
      MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))),
      MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))),
      MemoryWriterCommitMessage(2, Seq(Row(6), Row(7)))
    ))
    assert(sink.latestBatchId.contains(0))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7))

    writer.commit(19, Array(
      MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))),
      MemoryWriterCommitMessage(0, Seq(Row(33)))
    ))
    assert(sink.latestBatchId.contains(19))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33))
    assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33))
  }

  test("microbatch writer") {
    val sink = new MemorySinkV2
    new MemoryWriter(sink, 0, OutputMode.Append()).commit(
      Array(
        MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))),
        MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))),
        MemoryWriterCommitMessage(2, Seq(Row(6), Row(7)))
      ))
    assert(sink.latestBatchId.contains(0))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7))

    new MemoryWriter(sink, 19, OutputMode.Append()).commit(
      Array(
        MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))),
        MemoryWriterCommitMessage(0, Seq(Row(33)))
      ))
    assert(sink.latestBatchId.contains(19))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33))
    assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33))
  }
}
Example 9
Source File: StreamMetadataSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.util.UUID

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.streaming.StreamTest

class StreamMetadataSuite extends StreamTest {

  test("writing and reading") {
    withTempDir { dir =>
      val id = UUID.randomUUID.toString
      val metadata = StreamMetadata(id)
      val file = new Path(new File(dir, "test").toString)
      StreamMetadata.write(metadata, file, hadoopConf)
      val readMetadata = StreamMetadata.read(file, hadoopConf)
      assert(readMetadata.nonEmpty)
      assert(readMetadata.get.id === id)
    }
  }

  test("read Spark 2.1.0 format") {
    // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0
    assert(
      readForResource("query-metadata-logs-version-2.1.0.txt") ===
        StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e"))
  }

  private def readForResource(fileName: String): StreamMetadata = {
    val input = getClass.getResource(s"/structured-streaming/$fileName")
    StreamMetadata.read(new Path(input.toString), hadoopConf).get
  }

  private val hadoopConf = new Configuration()
}