org.apache.spark.sql.streaming.ProcessingTime Scala Examples
The following examples show how to use org.apache.spark.sql.streaming.ProcessingTime.
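Before the examples, here is a minimal, self-contained sketch of the API itself: ProcessingTime describes how often a streaming query's micro-batches should fire and is passed to DataStreamWriter.trigger. This sketch is not taken from any of the projects below; it assumes Spark 2.2 or later (where ProcessingTime is still available, though deprecated in favor of Trigger.ProcessingTime), and the object name and rate-source settings are purely illustrative.

package example

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.ProcessingTime

object ProcessingTimeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[*]")
      .appName("ProcessingTimeSketch")
      .getOrCreate()

    // Three equivalent ways to describe a 10-second micro-batch interval:
    val fromDuration = ProcessingTime(10.seconds)
    val fromString   = ProcessingTime("10 seconds")
    val fromUnits    = ProcessingTime.create(10, TimeUnit.SECONDS)

    // A rate source keeps the sketch self-contained; any streaming source works the same way.
    val stream = spark.readStream
      .format("rate")
      .option("rowsPerSecond", "5")
      .load()

    val query = stream.writeStream
      .format("console")
      .trigger(fromDuration) // fire a micro-batch at most every 10 seconds
      .outputMode("append")
      .start()

    query.awaitTermination()
  }
}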
Example 1
Source File: ProcessingTimeSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.ProcessingTime

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    assert(ProcessingTime(10.seconds).intervalMs === 10 * 1000)
    assert(ProcessingTime.create(10, TimeUnit.SECONDS).intervalMs === 10 * 1000)
    assert(ProcessingTime("1 minute").intervalMs === 60 * 1000)
    assert(ProcessingTime("interval 1 minute").intervalMs === 60 * 1000)

    intercept[IllegalArgumentException] { ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { ProcessingTime("") }
    intercept[IllegalArgumentException] { ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { ProcessingTime("1 year") }
  }
}
Example 2
Source File: DataFrameStream.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter13

import java.util.concurrent.TimeUnit

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.ProcessingTime

object DataFrameStream {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("DataFrame Stream")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    import spark.implicits._

    val df = spark.read
      .format("json")
      .option("inferSchema", "true")
      .load("../data/sparkml2/chapter13/person.json")

    df.printSchema()
    df.show()

    val stream = spark.readStream
      .schema(df.schema)
      .option("maxFilesPerTrigger", "1")
      .json("../data/sparkml2/chapter13/people")

    stream.printSchema()

    val people = stream.select("name", "age").where("age > 60")

    val query = people.writeStream
      .outputMode("append")
      .trigger(ProcessingTime.create(1, TimeUnit.SECONDS)) // fire a micro-batch every second
      .format("console")

    query.start().awaitTermination()
  }
}
Example 3
Source File: VoteCountStream.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter13

import java.util.concurrent.TimeUnit

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.ProcessingTime

object VoteCountStream {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Test Stream")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    import spark.implicits._

    val stream = spark.readStream
      .format("socket")
      .option("host", "localhost")
      .option("port", 9999)
      .load()

    // Generate vote count
    val villiansVote = stream.groupBy("value").count()

    // Start triggering the query that prints the running counts to the console
    val query = villiansVote.orderBy("count").writeStream
      .outputMode("complete")
      .format("console")
      .trigger(ProcessingTime.create(10, TimeUnit.SECONDS))
      .start()

    query.awaitTermination()
  }
}
Example 4
Source File: DatasetStreamCSV.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter13

import java.util.concurrent.TimeUnit

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.ProcessingTime
import org.apache.spark.SparkConf

case class StockPrice(date: String, open: Double, high: Double, low: Double,
    close: Double, volume: Integer, adjclose: Double)

object DatasetStreamCSV {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Dataset Stream")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    import spark.implicits._

    val s = spark.read
      .format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load("../data/sparkml2/chapter13/GE.csv")

    s.printSchema()
    s.show()

    val conf = new SparkConf()

    val streamDataset = spark.readStream
      .schema(s.schema)
      .option("sep", ",")
      .option("header", "true")
      .csv("../data/sparkml2/chapter13/ge").as[StockPrice]

    streamDataset.printSchema()

    val ge = streamDataset.filter("close > 100.00")

    val query = ge.writeStream
      .outputMode("append")
      .trigger(ProcessingTime.create(1, TimeUnit.SECONDS)) // fire a micro-batch every second
      .format("console")

    query.start().awaitTermination()
  }
}
Example 5
Source File: CurrentPersistenceIdsQuerySourceTest.scala From apache-spark-test with Apache License 2.0
package com.github.dnvriend.spark.sstreaming

import java.util.UUID
import java.util.concurrent.atomic.AtomicLong

import akka.actor.{ ActorRef, Props }
import akka.persistence.PersistentActor
import akka.testkit.TestProbe
import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark.datasources.SparkImplicits._
import com.github.dnvriend.spark.datasources.person.Person
import org.apache.spark.sql.streaming.{ OutputMode, ProcessingTime }
import org.scalatest.Ignore

import scala.concurrent.ExecutionContext
import scala.concurrent.duration._
import scala.language.implicitConversions

object PersonActor {
  final case class BlogPost(id: Long, text: String)
}

class PersonActor(val persistenceId: String, schedule: Boolean)(implicit ec: ExecutionContext) extends PersistentActor {
  val counter = new AtomicLong()
  def ping() = context.system.scheduler.scheduleOnce(200.millis, self, "persist")
  def randomId: String = UUID.randomUUID.toString
  override val receiveRecover: Receive = PartialFunction.empty
  override val receiveCommand: Receive = {
    case "persist" =>
      persist(Person(counter.incrementAndGet(), s"foo-$randomId", 20)) { _ =>
        sender() ! "ack"
      }
      if (schedule) ping()
  }
  if (schedule) ping()
}

@Ignore
class CurrentPersistenceIdsQuerySourceTest extends TestSpec {
  def withPersistentActor(pid: String = randomId, schedule: Boolean = false)(f: ActorRef => TestProbe => Unit): Unit = {
    val tp = TestProbe()
    val ref = system.actorOf(Props(new PersonActor(pid, schedule)))
    try f(ref)(tp) finally killActors(ref)
  }

  it should "query read journal" in withSparkSession { spark =>
    withPersistentActor() { ref => tp =>
      tp.send(ref, "persist")
      tp.expectMsg("ack")

      val jdbcReadJournal = spark.readStream
        .currentPersistenceIds("jdbc-read-journal")

      jdbcReadJournal.printSchema()

      println("Is the query streaming: " + jdbcReadJournal.isStreaming)
      println("Are there any streaming queries? " + spark.streams.active.isEmpty)

      val query = jdbcReadJournal
        .writeStream
        .format("console")
        .trigger(ProcessingTime(1.seconds))
        .queryName("consoleStream")
        .outputMode(OutputMode.Append())
        .start()

      query.awaitTermination(10.seconds)
    }
  }
}
Example 6
Source File: QueryCsvTest.scala From apache-spark-test with Apache License 2.0
package com.github.dnvriend.spark.sstreaming

import com.github.dnvriend.TestSpec
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.streaming.{ OutputMode, ProcessingTime }
import org.apache.spark.sql.types._
import org.scalatest.Ignore

import scala.concurrent.duration._
import scala.language.implicitConversions

@Ignore
class QueryCsvTest extends TestSpec {
  def copyFiles(nrTimes: Int = 10): Unit = {
    FileUtils.deleteDirectory("/tmp/csv")
    FileUtils.forceMkdir("/tmp/csv")
    (1 to nrTimes).foreach { x =>
      FileUtils.copyFile(TestSpec.PeopleCsv, s"/tmp/csv/people-$x")
    }
  }

  val schema: StructType = StructType(Array(
    StructField("id", LongType, nullable = false),
    StructField("name", StringType, nullable = true),
    StructField("age", IntegerType, nullable = true)
  ))

  it should "query csv file" in withSparkSession { spark =>
    copyFiles()

    val csv = spark.readStream
      .schema(schema)
      .format("csv")
      .option("maxFilesPerTrigger", 1)
      .option("header", "false") // Use first line of all files as header
      .option("inferSchema", "false") // Automatically infer data types
      .option("delimiter", ";")
      .load("/tmp/csv")

    csv.printSchema()

    println("Is the query streaming: " + csv.isStreaming)
    println("Are there any streaming queries? " + spark.streams.active.isEmpty)

    val query = csv
      .writeStream
      .format("console")
      .trigger(ProcessingTime(5.seconds))
      .queryName("consoleStream")
      .outputMode(OutputMode.Append())
      .start()

    // waiting for data
    sleep(3.seconds)

    spark.streams
      .active
      .foreach(println)

    spark.streams
      .active
      .foreach(_.explain(extended = true))

    query.awaitTermination(20.seconds)
  }
}
Example 7
Source File: CurrentEventsByPersistenceIdQueryTest.scala From apache-spark-test with Apache License 2.0
package com.github.dnvriend.spark.sstreaming

import akka.actor.{ ActorRef, Props }
import akka.testkit.TestProbe
import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark.datasources.SparkImplicits._
import com.github.dnvriend.spark.mapper.PersonEventMapper
import org.apache.spark.sql.streaming.{ OutputMode, ProcessingTime }
import org.apache.spark.sql.functions._
import org.scalatest.Ignore

import scala.concurrent.duration._

@Ignore
class CurrentEventsByPersistenceIdQueryTest extends TestSpec {
  def withPersistentActor(pid: String = randomId, schedule: Boolean = false)(f: ActorRef => TestProbe => Unit): Unit = {
    val tp = TestProbe()
    val ref = system.actorOf(Props(new PersonActor(pid, schedule)))
    try f(ref)(tp) finally killActors(ref)
  }

  it should "read events for pid" in withSparkSession { spark =>
    import spark.implicits._

    withPersistentActor("person", schedule = true) { ref => tp =>
      tp.send(ref, "persist")
      tp.expectMsg("ack")

      val jdbcReadJournal = spark.readStream
        .schema(PersonEventMapper.schema)
        .option("pid", "person")
        .option("event-mapper", "com.github.dnvriend.spark.mapper.PersonEventMapper")
        .eventsByPersistenceId("jdbc-read-journal")

      jdbcReadJournal.printSchema()

      // val numOfEvents = jdbcReadJournal
      //   .groupBy('persistence_id)
      //   .agg(count('sequence_number).as("number_of_events"))

      val query = jdbcReadJournal
        .writeStream
        .format("console")
        .trigger(ProcessingTime(1.seconds))
        .queryName("consoleStream")
        // .outputMode(OutputMode.Complete())
        .outputMode(OutputMode.Append())
        .start()

      query.awaitTermination(20.seconds)
    }
  }
}
Example 8
Source File: ProcessingTimeSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    def getIntervalMs(trigger: Trigger): Long = trigger.asInstanceOf[ProcessingTime].intervalMs

    assert(getIntervalMs(Trigger.ProcessingTime(10.seconds)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime(10, TimeUnit.SECONDS)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("1 minute")) === 60 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("interval 1 minute")) === 60 * 1000)

    intercept[IllegalArgumentException] { Trigger.ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 year") }
  }
}
Example 9
Source File: ContinuousTrigger.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import java.util.concurrent.TimeUnit

import scala.concurrent.duration.Duration

import org.apache.commons.lang3.StringUtils

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}
import org.apache.spark.unsafe.types.CalendarInterval

@InterfaceStability.Evolving
case class ContinuousTrigger(intervalMs: Long) extends Trigger {
  require(intervalMs >= 0, "the interval of trigger should not be negative")
}

private[sql] object ContinuousTrigger {
  def apply(interval: String): ContinuousTrigger = {
    if (StringUtils.isBlank(interval)) {
      throw new IllegalArgumentException(
        "interval cannot be null or blank.")
    }
    val cal = if (interval.startsWith("interval")) {
      CalendarInterval.fromString(interval)
    } else {
      CalendarInterval.fromString("interval " + interval)
    }
    if (cal == null) {
      throw new IllegalArgumentException(s"Invalid interval: $interval")
    }
    if (cal.months > 0) {
      throw new IllegalArgumentException(s"Doesn't support month or year interval: $interval")
    }
    new ContinuousTrigger(cal.microseconds / 1000)
  }

  def apply(interval: Duration): ContinuousTrigger = {
    ContinuousTrigger(interval.toMillis)
  }

  def create(interval: String): ContinuousTrigger = {
    apply(interval)
  }

  def create(interval: Long, unit: TimeUnit): ContinuousTrigger = {
    ContinuousTrigger(unit.toMillis(interval))
  }
}
Example 10
Source File: StructuredStreamingOffset.scala From BigData-News with Apache License 2.0
package com.vita.spark.streaming

import com.vita.Constants
import com.vita.redies.RedisSingle
import com.vita.spark.streaming.writer.RedisWriteKafkaOffset
import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

object StructuredStreamingOffset {

  val LOGGER: Logger = LogManager.getLogger("StructuredStreamingOffset")

  // topic
  val SUBSCRIBE = "log"

  case class readLogs(context: String, offset: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("StructuredStreamingOffset")
      .getOrCreate()

    // starting offset
    var startOffset = -1

    // init Redis connection
    val redisSingle: RedisSingle = new RedisSingle()
    redisSingle.init(Constants.IP, Constants.PORT)

    // get the last committed offset from Redis
    if (redisSingle.exists(Constants.REDIDS_KEY) && redisSingle.getTime(Constants.REDIDS_KEY) != -1) {
      startOffset = redisSingle.get(Constants.REDIDS_KEY).toInt
    }

    // source: read from Kafka starting at the recovered offset
    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", SUBSCRIBE)
      .option("startingOffsets", "{\"" + SUBSCRIBE + "\":{\"0\":" + startOffset + "}}")
      .load()

    import spark.implicits._

    // each row contains: key, value, topic, partition, offset, timestamp, timestampType
    val lines = df.selectExpr("CAST(value AS STRING)", "CAST(offset AS LONG)").as[(String, Long)]
    val content = lines.map(x => readLogs(x._1, x._2.toString))
    val count = content.toDF("context", "offset")

    // sink: the foreach writer records the Kafka offset in Redis
    val query = count
      .writeStream
      .foreach(new RedisWriteKafkaOffset)
      .outputMode("update")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .format("console")
      .start()

    query.awaitTermination()
  }
}
Example 11
Source File: ProcessingTimeExecutorSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.util.concurrent.{CountDownLatch, TimeUnit}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.ProcessingTime
import org.apache.spark.util.{Clock, ManualClock, SystemClock}

class ProcessingTimeExecutorSuite extends SparkFunSuite {

  test("nextBatchTime") {
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(100))
    assert(processingTimeExecutor.nextBatchTime(0) === 100)
    assert(processingTimeExecutor.nextBatchTime(1) === 100)
    assert(processingTimeExecutor.nextBatchTime(99) === 100)
    assert(processingTimeExecutor.nextBatchTime(100) === 200)
    assert(processingTimeExecutor.nextBatchTime(101) === 200)
    assert(processingTimeExecutor.nextBatchTime(150) === 200)
  }

  test("calling nextBatchTime with the result of a previous call should return the next interval") {
    val intervalMS = 100
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(intervalMS))

    val ITERATION = 10
    var nextBatchTime: Long = 0
    for (it <- 1 to ITERATION) {
      nextBatchTime = processingTimeExecutor.nextBatchTime(nextBatchTime)
    }

    // nextBatchTime should be 1000
    assert(nextBatchTime === intervalMS * ITERATION)
  }

  private def testBatchTermination(intervalMs: Long): Unit = {
    var batchCounts = 0
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(intervalMs))
    processingTimeExecutor.execute(() => {
      batchCounts += 1
      // If the batch termination works well, batchCounts should be 3 after `execute`
      batchCounts < 3
    })
    assert(batchCounts === 3)
  }

  test("batch termination") {
    testBatchTermination(0)
    testBatchTermination(10)
  }

  test("notifyBatchFallingBehind") {
    val clock = new ManualClock()
    @volatile var batchFallingBehindCalled = false
    val latch = new CountDownLatch(1)
    val t = new Thread() {
      override def run(): Unit = {
        val processingTimeExecutor = new ProcessingTimeExecutor(ProcessingTime(100), clock) {
          override def notifyBatchFallingBehind(realElapsedTimeMs: Long): Unit = {
            batchFallingBehindCalled = true
          }
        }
        processingTimeExecutor.execute(() => {
          latch.countDown()
          clock.waitTillTime(200)
          false
        })
      }
    }
    t.start()
    // Wait until the batch is running so that we don't call `advance` too early
    assert(latch.await(10, TimeUnit.SECONDS), "the batch has not yet started in 10 seconds")
    clock.advance(200)
    t.join()
    assert(batchFallingBehindCalled === true)
  }
}
Example 12
Source File: StreamingOption.scala From carbondata with Apache License 2.0
package org.apache.carbondata.spark

import scala.collection.mutable

import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

import org.apache.carbondata.common.exceptions.sql.MalformedCarbonCommandException
import org.apache.carbondata.core.constants.{CarbonCommonConstants, CarbonLoadOptionConstants}
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.core.util.path.CarbonTablePath
import org.apache.carbondata.streaming.parser.CarbonStreamParser

class StreamingOption(val userInputMap: Map[String, String]) {

  lazy val trigger: Trigger = {
    val trigger = userInputMap.getOrElse(
      "trigger", throw new MalformedCarbonCommandException("trigger must be specified"))
    val interval = userInputMap.getOrElse(
      "interval", throw new MalformedCarbonCommandException("interval must be specified"))
    trigger match {
      case "ProcessingTime" => ProcessingTime(interval)
      case others => throw new MalformedCarbonCommandException("invalid trigger: " + trigger)
    }
  }

  def checkpointLocation(tablePath: String): String =
    userInputMap.getOrElse(
      "checkpointLocation",
      CarbonTablePath.getStreamingCheckpointDir(tablePath))

  lazy val timeStampFormat: String =
    userInputMap.getOrElse("timestampformat", CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT)

  lazy val dateFormat: String =
    userInputMap.getOrElse("dateformat", CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT)

  lazy val rowParser: String =
    userInputMap.getOrElse(CarbonStreamParser.CARBON_STREAM_PARSER,
      CarbonStreamParser.CARBON_STREAM_PARSER_ROW_PARSER)

  lazy val badRecordsPath: String =
    userInputMap
      .getOrElse("bad_record_path", CarbonProperties.getInstance()
        .getProperty(CarbonCommonConstants.CARBON_BADRECORDS_LOC,
          CarbonCommonConstants.CARBON_BADRECORDS_LOC_DEFAULT_VAL))

  lazy val badRecordsAction: String =
    userInputMap
      .getOrElse("bad_records_action", CarbonProperties.getInstance()
        .getProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION,
          CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION_DEFAULT))

  lazy val badRecordsLogger: String =
    userInputMap
      .getOrElse("bad_records_logger_enable", CarbonProperties.getInstance()
        .getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE,
          CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE_DEFAULT))

  lazy val isEmptyBadRecord: String =
    userInputMap
      .getOrElse("is_empty_bad_record", CarbonProperties.getInstance()
        .getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD,
          CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD_DEFAULT))

  lazy val remainingOption: Map[String, String] = {
    // copy the user input map and remove the fix options
    val mutableMap = mutable.Map[String, String]() ++= userInputMap
    mutableMap.remove("checkpointLocation")
    mutableMap.remove("timestampformat")
    mutableMap.remove("dateformat")
    mutableMap.remove("trigger")
    mutableMap.remove("interval")
    mutableMap.remove(CarbonStreamParser.CARBON_STREAM_PARSER)
    mutableMap.toMap
  }
}
Example 13
Source File: ProcessingTimeSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.ProcessingTime

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    assert(ProcessingTime(10.seconds).intervalMs === 10 * 1000)
    assert(ProcessingTime.create(10, TimeUnit.SECONDS).intervalMs === 10 * 1000)
    assert(ProcessingTime("1 minute").intervalMs === 60 * 1000)
    assert(ProcessingTime("interval 1 minute").intervalMs === 60 * 1000)

    intercept[IllegalArgumentException] { ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { ProcessingTime("") }
    intercept[IllegalArgumentException] { ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { ProcessingTime("1 year") }
  }
}
Example 14
Source File: ProcessingTimeExecutorSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.util.concurrent.{CountDownLatch, TimeUnit}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.ProcessingTime
import org.apache.spark.util.{Clock, ManualClock, SystemClock}

class ProcessingTimeExecutorSuite extends SparkFunSuite {

  test("nextBatchTime") {
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(100))
    assert(processingTimeExecutor.nextBatchTime(0) === 100)
    assert(processingTimeExecutor.nextBatchTime(1) === 100)
    assert(processingTimeExecutor.nextBatchTime(99) === 100)
    assert(processingTimeExecutor.nextBatchTime(100) === 200)
    assert(processingTimeExecutor.nextBatchTime(101) === 200)
    assert(processingTimeExecutor.nextBatchTime(150) === 200)
  }

  test("calling nextBatchTime with the result of a previous call should return the next interval") {
    val intervalMS = 100
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(intervalMS))

    val ITERATION = 10
    var nextBatchTime: Long = 0
    for (it <- 1 to ITERATION) {
      nextBatchTime = processingTimeExecutor.nextBatchTime(nextBatchTime)
    }

    // nextBatchTime should be 1000
    assert(nextBatchTime === intervalMS * ITERATION)
  }

  private def testBatchTermination(intervalMs: Long): Unit = {
    var batchCounts = 0
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(intervalMs))
    processingTimeExecutor.execute(() => {
      batchCounts += 1
      // If the batch termination works well, batchCounts should be 3 after `execute`
      batchCounts < 3
    })
    assert(batchCounts === 3)
  }

  test("batch termination") {
    testBatchTermination(0)
    testBatchTermination(10)
  }

  test("notifyBatchFallingBehind") {
    val clock = new ManualClock()
    @volatile var batchFallingBehindCalled = false
    val latch = new CountDownLatch(1)
    val t = new Thread() {
      override def run(): Unit = {
        val processingTimeExecutor = new ProcessingTimeExecutor(ProcessingTime(100), clock) {
          override def notifyBatchFallingBehind(realElapsedTimeMs: Long): Unit = {
            batchFallingBehindCalled = true
          }
        }
        processingTimeExecutor.execute(() => {
          latch.countDown()
          clock.waitTillTime(200)
          false
        })
      }
    }
    t.start()
    // Wait until the batch is running so that we don't call `advance` too early
    assert(latch.await(10, TimeUnit.SECONDS), "the batch has not yet started in 10 seconds")
    clock.advance(200)
    t.join()
    assert(batchFallingBehindCalled === true)
  }
}
Example 15
Source File: ProcessingTimeSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    def getIntervalMs(trigger: Trigger): Long = trigger.asInstanceOf[ProcessingTime].intervalMs

    assert(getIntervalMs(Trigger.ProcessingTime(10.seconds)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime(10, TimeUnit.SECONDS)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("1 minute")) === 60 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("interval 1 minute")) === 60 * 1000)

    intercept[IllegalArgumentException] { Trigger.ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 year") }
  }
}
Example 16
Source File: ContinuousTrigger.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import java.util.concurrent.TimeUnit

import scala.concurrent.duration.Duration

import org.apache.commons.lang3.StringUtils

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}
import org.apache.spark.unsafe.types.CalendarInterval

@InterfaceStability.Evolving
case class ContinuousTrigger(intervalMs: Long) extends Trigger {
  require(intervalMs >= 0, "the interval of trigger should not be negative")
}

private[sql] object ContinuousTrigger {
  def apply(interval: String): ContinuousTrigger = {
    if (StringUtils.isBlank(interval)) {
      throw new IllegalArgumentException(
        "interval cannot be null or blank.")
    }
    val cal = if (interval.startsWith("interval")) {
      CalendarInterval.fromString(interval)
    } else {
      CalendarInterval.fromString("interval " + interval)
    }
    if (cal == null) {
      throw new IllegalArgumentException(s"Invalid interval: $interval")
    }
    if (cal.months > 0) {
      throw new IllegalArgumentException(s"Doesn't support month or year interval: $interval")
    }
    new ContinuousTrigger(cal.microseconds / 1000)
  }

  def apply(interval: Duration): ContinuousTrigger = {
    ContinuousTrigger(interval.toMillis)
  }

  def create(interval: String): ContinuousTrigger = {
    apply(interval)
  }

  def create(interval: Long, unit: TimeUnit): ContinuousTrigger = {
    ContinuousTrigger(unit.toMillis(interval))
  }
}
Example 17
Source File: ProcessingTimeSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.ProcessingTime

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    assert(ProcessingTime(10.seconds).intervalMs === 10 * 1000)
    assert(ProcessingTime.create(10, TimeUnit.SECONDS).intervalMs === 10 * 1000)
    assert(ProcessingTime("1 minute").intervalMs === 60 * 1000)
    assert(ProcessingTime("interval 1 minute").intervalMs === 60 * 1000)

    intercept[IllegalArgumentException] { ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { ProcessingTime("") }
    intercept[IllegalArgumentException] { ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { ProcessingTime("1 year") }
  }
}
Example 18
Source File: ProcessingTimeExecutorSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.util.concurrent.{CountDownLatch, TimeUnit}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.ProcessingTime
import org.apache.spark.util.{Clock, ManualClock, SystemClock}

class ProcessingTimeExecutorSuite extends SparkFunSuite {

  test("nextBatchTime") {
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(100))
    assert(processingTimeExecutor.nextBatchTime(0) === 100)
    assert(processingTimeExecutor.nextBatchTime(1) === 100)
    assert(processingTimeExecutor.nextBatchTime(99) === 100)
    assert(processingTimeExecutor.nextBatchTime(100) === 200)
    assert(processingTimeExecutor.nextBatchTime(101) === 200)
    assert(processingTimeExecutor.nextBatchTime(150) === 200)
  }

  test("calling nextBatchTime with the result of a previous call should return the next interval") {
    val intervalMS = 100
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(intervalMS))

    val ITERATION = 10
    var nextBatchTime: Long = 0
    for (it <- 1 to ITERATION) {
      nextBatchTime = processingTimeExecutor.nextBatchTime(nextBatchTime)
    }

    // nextBatchTime should be 1000
    assert(nextBatchTime === intervalMS * ITERATION)
  }

  private def testBatchTermination(intervalMs: Long): Unit = {
    var batchCounts = 0
    val processingTimeExecutor = ProcessingTimeExecutor(ProcessingTime(intervalMs))
    processingTimeExecutor.execute(() => {
      batchCounts += 1
      // If the batch termination works well, batchCounts should be 3 after `execute`
      batchCounts < 3
    })
    assert(batchCounts === 3)
  }

  test("batch termination") {
    testBatchTermination(0)
    testBatchTermination(10)
  }

  test("notifyBatchFallingBehind") {
    val clock = new ManualClock()
    @volatile var batchFallingBehindCalled = false
    val latch = new CountDownLatch(1)
    val t = new Thread() {
      override def run(): Unit = {
        val processingTimeExecutor = new ProcessingTimeExecutor(ProcessingTime(100), clock) {
          override def notifyBatchFallingBehind(realElapsedTimeMs: Long): Unit = {
            batchFallingBehindCalled = true
          }
        }
        processingTimeExecutor.execute(() => {
          latch.countDown()
          clock.waitTillTime(200)
          false
        })
      }
    }
    t.start()
    // Wait until the batch is running so that we don't call `advance` too early
    assert(latch.await(10, TimeUnit.SECONDS), "the batch has not yet started in 10 seconds")
    clock.advance(200)
    t.join()
    assert(batchFallingBehindCalled === true)
  }
}