org.apache.spark.sql.streaming.Trigger Scala Examples
The following examples show how to use org.apache.spark.sql.streaming.Trigger.
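Before the individual examples, here is a minimal orientation sketch of the Trigger factory methods they exercise (standard Spark Structured Streaming API; the interval values below are arbitrary, and Trigger.Continuous requires Spark 2.3 or later):

import java.util.concurrent.TimeUnit
import scala.concurrent.duration._
import org.apache.spark.sql.streaming.Trigger

// Micro-batch triggers: the string, Duration, and (long, TimeUnit) forms are equivalent.
val tenSeconds  = Trigger.ProcessingTime("10 seconds")
val tenSeconds2 = Trigger.ProcessingTime(10.seconds)
val tenSeconds3 = Trigger.ProcessingTime(10, TimeUnit.SECONDS)

// Process whatever data is available once, then stop.
val once = Trigger.Once()

// Continuous processing with a 1-second checkpoint interval.
val continuous = Trigger.Continuous("1 second")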
Example 1
Source File: JdbcSinkDemo.scala From bahir with Apache License 2.0
package org.apache.bahir.examples.sql.streaming.jdbc

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

object JdbcSinkDemo {

  private case class Person(name: String, age: Int)

  def main(args: Array[String]): Unit = {
    if (args.length < 4) {
      // scalastyle:off println
      System.err.println("Usage: JdbcSinkDemo <jdbcUrl> <tableName> <username> <password>")
      // scalastyle:on
      System.exit(1)
    }

    val jdbcUrl = args(0)
    val tableName = args(1)
    val username = args(2)
    val password = args(3)

    val spark = SparkSession
      .builder()
      .appName("JdbcSinkDemo")
      .getOrCreate()

    // load data source
    val df = spark.readStream
      .format("rate")
      .option("numPartitions", "5")
      .option("rowsPerSecond", "100")
      .load()

    // change input value to a person object.
    import spark.implicits._
    val lines = df.select("value").as[Long].map { value =>
      Person(s"name_${value}", value.toInt % 30)
    }

    lines.printSchema()

    // write result
    val query = lines.writeStream
      .outputMode("append")
      .format("streaming-jdbc")
      .outputMode(OutputMode.Append)
      .option(JDBCOptions.JDBC_URL, jdbcUrl)
      .option(JDBCOptions.JDBC_TABLE_NAME, tableName)
      .option(JDBCOptions.JDBC_DRIVER_CLASS, "com.mysql.jdbc.Driver")
      .option(JDBCOptions.JDBC_BATCH_INSERT_SIZE, "5")
      .option("user", username)
      .option("password", password)
      .trigger(Trigger.ProcessingTime("10 seconds"))
      .start()

    query.awaitTermination()
  }
}
Example 2
Source File: SparkRandomGenDataIngress.scala From pipelines-examples with Apache License 2.0
package pipelines.example

import java.sql.Timestamp

import scala.util.Random

import pipelines.streamlets.{ DurationConfigParameter, IntegerConfigParameter, StreamletShape }
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.{ OutputMode, Trigger }
import pipelines.spark.sql.SQLImplicits._

case class Rate(timestamp: Timestamp, value: Long)

class SparkRandomGenDataIngress extends SparkStreamlet {
  val out = AvroOutlet[Data]("out", d ⇒ d.src)
  val shape = StreamletShape(out)

  val RecordsPerSecond = IntegerConfigParameter(
    "records-per-second",
    "Records per second to produce.",
    Some(50))

  val RampUpTime = DurationConfigParameter(
    "ramp-up-time",
    "Time to reach max records per second.",
    Some("0 seconds"))

  override def configParameters = Vector(RecordsPerSecond, RampUpTime)

  override def createLogic() = new SparkStreamletLogic {

    override def buildStreamingQueries = {
      writeStream(process, out, OutputMode.Append).toQueryExecution
    }

    private def process: Dataset[Data] = {
      val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key)
      val rampUpTime = context.streamletConfig.getDuration(RampUpTime.key, java.util.concurrent.TimeUnit.SECONDS)
      println(s"Using rampup time of $rampUpTime seconds")

      val gaugeGen: () ⇒ String = () ⇒ if (Random.nextDouble() < 0.5) "oil" else "gas"

      val rateStream = session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .option("rampUpTime", s"${rampUpTime}s")
        .load()
        .as[Rate]

      rateStream.map {
        case Rate(timestamp, value) ⇒ Data(s"src-${value % 1000}", timestamp.getTime, None, None, gaugeGen(), value)
      }
    }
  }
}
Example 3
Source File: SparkConsoleEgress.scala From pipelines-examples with Apache License 2.0
package pipelines.example

import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic, StreamletQueryExecution }
import pipelines.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrame

class SparkConsoleEgress extends SparkStreamlet {
  val in1 = AvroInlet[Data]("in1")
  val in2 = AvroInlet[Data]("in2")
  val shape = StreamletShape.withInlets(in1, in2)

  def asTimestamp = udf((t: Long) ⇒ new java.sql.Timestamp(t))
  def elapsedTime = udf((t1: Long, t0: Long) ⇒ t1 - t0)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val stream1 = readStream(in1).withColumn("source", lit("spark")).withColumn("elapsed", elapsedTime($"t2", $"t1"))
      val stream2 = readStream(in2).withColumn("source", lit("akka")).withColumn("elapsed", elapsedTime($"t2", $"t1"))

      // commented-out process: simple stats to compute min/max/mean on a window
      // val dataCount = stream1.union(stream2).withColumn("ts", asTimestamp($"timestamp"))
      // val stats = dataCount
      //   .withWatermark("ts", "1 second")
      //   .groupBy(window($"ts", "5 minutes", "1 minute"), $"source")
      //   //.agg(max($"elapsed"), min($"elapsed"), avg($"elapsed"), count($"source"))

      val quantiles: (String ⇒ Long ⇒ (DataFrame, Long) ⇒ Unit) = { name ⇒ period ⇒ (df, time) ⇒
        df.cache()
        val count = df.count()
        val cps = count.toDouble / period
        val quans = df.stat.approxQuantile("elapsed", Array(0.1, 0.5, 0.9, 0.99), 0.01)
        println(s"$time, $name, $count, $cps, " + quans.mkString(", "))
      }

      val period = 60 * 5 // seconds

      val q1 = stream1.writeStream.foreachBatch(quantiles("spark")(period))
        .trigger(Trigger.ProcessingTime(s"$period seconds"))
        .option("checkpointLocation", context.checkpointDir("console-egress-q1"))
        .start()
      val q2 = stream2.writeStream.foreachBatch(quantiles("akka")(period))
        .trigger(Trigger.ProcessingTime(s"$period seconds"))
        .option("checkpointLocation", context.checkpointDir("console-egress-q2"))
        .start()

      new Thread() {
        override def run(): Unit = {
          while (true) {
            val progress = q1.lastProgress
            if (progress != null) {
              println("***************** [PROGRESS] *********************")
              println(progress.toString())
              println("**************************************************")
            }
            Thread.sleep(60 * 1000)
          }
        }
      } //.start // uncomment to enable the query progress

      StreamletQueryExecution(q1, q2)
    }
  }
}
Example 4
Source File: PulsarContinuousTest.scala From pulsar-spark with Apache License 2.0
package org.apache.spark.sql.pulsar

import java.util.concurrent.atomic.AtomicInteger

import scala.language.reflectiveCalls

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.test.TestSparkSession

trait PulsarContinuousTest extends PulsarSourceTest {

  override val defaultTrigger = Trigger.Continuous(1000)
  override val defaultUseV2Sink = true

  // We need more than the default local[2] to be able to schedule all partitions simultaneously.
  override protected def createSparkSession = new TestSparkSession(
    new SparkContext(
      "local[10]",
      "continuous-stream-test-sql-context",
      sparkConf.set("spark.sql.testkey", "true")))

  // Continuous processing tasks end asynchronously, so test that they actually end.
  private val tasksEndedListener = new SparkListener() {
    val activeTaskIdCount = new AtomicInteger(0)

    override def onTaskStart(start: SparkListenerTaskStart): Unit = {
      activeTaskIdCount.incrementAndGet()
    }

    override def onTaskEnd(end: SparkListenerTaskEnd): Unit = {
      activeTaskIdCount.decrementAndGet()
    }
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    spark.sparkContext.addSparkListener(tasksEndedListener)
  }

  override def afterEach(): Unit = {
    eventually(timeout(streamingTimeout)) {
      assert(tasksEndedListener.activeTaskIdCount.get() == 0)
    }
    spark.sparkContext.removeSparkListener(tasksEndedListener)
    super.afterEach()
  }

  test("ensure continuous stream is being used") {
    val query = spark.readStream
      .format("rate")
      .option("numPartitions", "1")
      .option("rowsPerSecond", "1")
      .load()

    testStream(query)(
      Execute(q => assert(q.isInstanceOf[ContinuousExecution]))
    )
  }
}
Example 5
Source File: TestSparkStreamletContext.scala From cloudflow with Apache License 2.0
package cloudflow.spark
package testkit

import java.nio.file.attribute.FileAttribute

import com.typesafe.config._

import scala.reflect.runtime.universe._
import scala.concurrent.duration._

import org.apache.spark.sql.{ Dataset, Encoder, SparkSession }
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.{ OutputMode, StreamingQuery, Trigger }

import cloudflow.streamlets._
import org.apache.spark.sql.catalyst.InternalRow

class TestSparkStreamletContext(override val streamletRef: String,
                                session: SparkSession,
                                inletTaps: Seq[SparkInletTap[_]],
                                outletTaps: Seq[SparkOutletTap[_]],
                                override val config: Config = ConfigFactory.empty)
    extends SparkStreamletContext(
      StreamletDefinition("appId", "appVersion", streamletRef, "streamletClass", List(), List(), config),
      session) {

  val ProcessingTimeInterval = 1500.milliseconds

  override def readStream[In](inPort: CodecInlet[In])(implicit encoder: Encoder[In], typeTag: TypeTag[In]): Dataset[In] =
    inletTaps
      .find(_.portName == inPort.name)
      .map(_.instream.asInstanceOf[MemoryStream[In]].toDF.as[In])
      .getOrElse(throw TestContextException(inPort.name, s"Bad test context, could not find source for inlet ${inPort.name}"))

  override def writeStream[Out](stream: Dataset[Out],
                                outPort: CodecOutlet[Out],
                                outputMode: OutputMode)(implicit encoder: Encoder[Out], typeTag: TypeTag[Out]): StreamingQuery = {
    // RateSource can only work with a microBatch query because it contains no data at time zero.
    // Trigger.Once requires data at start to work.
    val trigger = if (isRateSource(stream)) {
      Trigger.ProcessingTime(ProcessingTimeInterval)
    } else {
      Trigger.Once()
    }

    val streamingQuery = outletTaps
      .find(_.portName == outPort.name)
      .map { outletTap ⇒
        stream.writeStream
          .outputMode(outputMode)
          .format("memory")
          .trigger(trigger)
          .queryName(outletTap.queryName)
          .start()
      }
      .getOrElse(throw TestContextException(outPort.name, s"Bad test context, could not find destination for outlet ${outPort.name}"))

    streamingQuery
  }

  override def checkpointDir(dirName: String): String = {
    val fileAttibutes: Array[FileAttribute[_]] = Array()
    val tmpDir = java.nio.file.Files.createTempDirectory("spark-test", fileAttibutes: _*)
    tmpDir.toFile.getAbsolutePath
  }

  private def isRateSource(stream: Dataset[_]): Boolean = {
    import org.apache.spark.sql.execution.command.ExplainCommand
    val explain = ExplainCommand(stream.queryExecution.logical, true)
    val res = session.sessionState.executePlan(explain).executedPlan.executeCollect()
    res.exists((row: InternalRow) => row.getString(0).contains("org.apache.spark.sql.execution.streaming.sources.RateStreamProvider"))
  }
}

case class TestContextException(portName: String, msg: String) extends RuntimeException(msg)
Example 6
Source File: SparkEgressSpec.scala From cloudflow with Apache License 2.0
package cloudflow.spark

import org.apache.spark.sql.{ Dataset, Encoder, SparkSession }
import org.apache.spark.sql.streaming.{ OutputMode, Trigger }

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.avro._
import cloudflow.spark.testkit._
import cloudflow.spark.sql.SQLImplicits._

class SparkEgressSpec extends SparkScalaTestSupport {

  "SparkEgress" should {
    "materialize streaming data to sink" in {

      val testKit = SparkStreamletTestkit(session)

      def asCollection[T: Encoder](session: SparkSession, queryName: String): List[T] =
        session.sql(s"select * from $queryName").as[T].collect().toList

      val instance = new MySparkEgress()

      // setup inlet tap on inlet port
      val in: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in)

      // build data and send to inlet tap
      val data = (1 to 10).map(i ⇒ Data(i, s"name$i"))
      in.addData(data)

      val run = testKit.run(instance, Seq(in), Seq.empty)
      run.failures mustBe ('empty)
      run.totalRows mustBe (20)
      val r1 = asCollection[String](session, "allNames")
      val r2 = asCollection[String](session, "allNamesUpper")

      // assert
      r1 must contain("name1")
      r2 must contain("NAME1")
    }
  }
}

class MySparkEgress extends SparkStreamlet {
  val in = AvroInlet[Data]("in")
  val shape = StreamletShape(in)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries =
      process(readStream(in))

    private def process(inDataset: Dataset[Data]): StreamletQueryExecution = {
      val q1 = inDataset
        .map { d ⇒ d.name }
        .writeStream
        .format("memory")
        .option("truncate", false)
        .queryName("allNames")
        .outputMode(OutputMode.Append())
        .trigger(Trigger.Once)
        .start()

      val q2 = inDataset
        .map { d ⇒ d.name.toUpperCase }
        .writeStream
        .format("memory")
        .option("truncate", false)
        .queryName("allNamesUpper")
        .outputMode(OutputMode.Append())
        .trigger(Trigger.Once)
        .start()

      StreamletQueryExecution(q1, q2)
    }
  }
}
Example 7
Source File: IntCountByStructured.scala From wow-spark with MIT License
package com.sev7e0.wow.kafka

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

object IntCountByStructured {

  val master = "local"
  val serverList = "localhost:9092"
  val kafka = "kafka"

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master(master)
      .appName(IntCountByStructured.toString)
      .getOrCreate()

    import spark.implicits._

    // Note: Structured Streaming's Kafka integration requires different dependencies
    // than Spark Streaming's Kafka integration.
    val query = spark
      .readStream
      // input stage: read from Kafka
      .format(kafka)
      .option("kafka.bootstrap.servers", serverList)
      .option("subscribe", "randomCount")
      .load()
      // convert to Dataset[String]
      .selectExpr("CAST(value as STRING)")
      .as[String]
      // the usual word count
      .flatMap(_.split(" "))
      .groupBy("value")
      .count()
      // output stage
      .writeStream
      .outputMode("complete")
      // trigger every ten seconds
      .trigger(Trigger.ProcessingTime("10 seconds"))
      .format("console")
      .start()

    query.awaitTermination()
  }
}
Example 8
Source File: A_6_ContinuousProcessing.scala From wow-spark with MIT License
package com.sev7e0.wow.structured_streaming

import org.apache.spark.sql.SparkSession

object A_6_ContinuousProcessing {

  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .appName("ContinuousProcessing")
      .master("local")
      .getOrCreate()

    import org.apache.spark.sql.streaming.Trigger

    session.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
      .option("subscribe", "topic1")
      .load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
      // the Kafka sink takes "topic"; "subscribe" is a read-side option
      .option("topic", "outPutTopic")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
}
Example 9
Source File: SchemaRegistryAvroReader.scala From spark-schema-registry with Apache License 2.0
package com.hortonworks.spark.registry.examples

import java.util.UUID

import com.hortonworks.spark.registry.util._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

object SchemaRegistryAvroReader {

  def main(args: Array[String]): Unit = {

    val schemaRegistryUrl = if (args.length > 0) args(0) else "http://localhost:9090/api/v1/"
    val bootstrapServers = if (args.length > 1) args(1) else "localhost:9092"
    val topic = if (args.length > 2) args(2) else "topic1-out"
    val checkpointLocation =
      if (args.length > 3) args(3) else "/tmp/temporary-" + UUID.randomUUID.toString
    val securityProtocol =
      if (args.length > 4) Option(args(4)) else None

    val spark = SparkSession
      .builder
      .appName("SchemaRegistryAvroReader")
      .getOrCreate()

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("subscribe", topic)

    val messages = securityProtocol
      .map(p => reader.option("kafka.security.protocol", p).load())
      .getOrElse(reader.load())

    import spark.implicits._

    // the schema registry client config
    val config = Map[String, Object]("schema.registry.url" -> schemaRegistryUrl)

    // the schema registry config that will be implicitly passed
    implicit val srConfig: SchemaRegistryConfig = SchemaRegistryConfig(config)

    // Read messages from kafka and deserialize.
    // This uses the schema registry schema associated with the topic.
    val df = messages
      .select(from_sr($"value", topic).alias("message"))

    // write the output to console
    // should produce events like {"driverId":14,"truckId":25,"miles":373}
    val query = df
      .writeStream
      .format("console")
      .trigger(Trigger.ProcessingTime(10000))
      .outputMode(OutputMode.Append())
      .start()

    query.awaitTermination()
  }
}
Example 10
Source File: SchemaJsonExample.scala From spark-schema-registry with Apache License 2.0
package com.hortonworks.spark.registry.examples

import java.util.UUID

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{from_json, struct, to_json}
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.types._

object SchemaJsonExample {

  def main(args: Array[String]): Unit = {

    val bootstrapServers = if (args.length > 0) args(0) else "localhost:9092"
    val topic = if (args.length > 1) args(1) else "topic1"
    val outTopic = if (args.length > 2) args(2) else "topic1-out"
    val checkpointLocation =
      if (args.length > 3) args(3) else "/tmp/temporary-" + UUID.randomUUID.toString

    val spark = SparkSession
      .builder
      .appName("SchemaExample")
      .getOrCreate()

    val messages = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("subscribe", topic)
      .load()

    import spark.implicits._

    // the schema for truck events
    val schema = StructType(Seq(
      StructField("driverId", IntegerType, nullable = false),
      StructField("truckId", IntegerType, nullable = false),
      StructField("eventTime", StringType, nullable = false),
      StructField("eventType", StringType, nullable = false),
      StructField("longitude", DoubleType, nullable = false),
      StructField("latitude", DoubleType, nullable = false),
      StructField("eventKey", StringType, nullable = false),
      StructField("correlationId", StringType, nullable = false),
      StructField("driverName", StringType, nullable = false),
      StructField("routeId", IntegerType, nullable = false),
      StructField("routeName", StringType, nullable = false),
      StructField("eventDate", StringType, nullable = false),
      StructField("miles", IntegerType, nullable = false)
    ))

    // read messages from kafka and parse it using the above schema
    val df = messages
      .select(from_json($"value".cast("string"), schema).alias("value"))

    // project (driverId, truckId, miles) for the events where miles > 300
    val filtered = df.select($"value.driverId", $"value.truckId", $"value.miles")
      .where("value.miles > 300")

    // write the output to a kafka topic serialized as a JSON string.
    // should produce events like {"driverId":14,"truckId":25,"miles":373}
    val query = filtered
      .select(to_json(struct($"*")).alias("value"))
      .writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("topic", outTopic)
      .option("checkpointLocation", checkpointLocation)
      .trigger(Trigger.ProcessingTime(10000))
      .outputMode(OutputMode.Append())
      .start()

    query.awaitTermination()
  }
}
Example 11
Source File: KinesisContinuousTest.scala From kinesis-sql with Apache License 2.0
package org.apache.spark.sql.kinesis

import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.test.TestSparkSession

trait KinesisContinuousTest extends KinesisSourceTest {
  override val defaultTrigger = Trigger.Continuous("1 hour")
  override val defaultUseV2Sink = true

  override val streamingTimeout = 120.seconds

  override protected def createSparkSession = new TestSparkSession(
    new SparkContext(
      "local[10]",
      "continuous-stream-test-sql-context",
      sparkConf.set("spark.sql.testkey", "true")))

  // Continuous processing tasks end asynchronously, so test that they actually end.
  private val tasksEndedListener = new SparkListener() {
    val activeTaskIdCount = new AtomicInteger(0)

    override def onTaskStart(start: SparkListenerTaskStart): Unit = {
      activeTaskIdCount.incrementAndGet()
    }

    override def onTaskEnd(end: SparkListenerTaskEnd): Unit = {
      activeTaskIdCount.decrementAndGet()
    }
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    spark.sparkContext.addSparkListener(tasksEndedListener)
  }

  override def afterEach(): Unit = {
    eventually(timeout(streamingTimeout)) {
      assert(tasksEndedListener.activeTaskIdCount.get() == 0)
    }
    spark.sparkContext.removeSparkListener(tasksEndedListener)
    super.afterEach()
  }
}
Example 12
Source File: ProcessingTimeSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    def getIntervalMs(trigger: Trigger): Long = trigger.asInstanceOf[ProcessingTime].intervalMs

    assert(getIntervalMs(Trigger.ProcessingTime(10.seconds)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime(10, TimeUnit.SECONDS)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("1 minute")) === 60 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("interval 1 minute")) === 60 * 1000)

    intercept[IllegalArgumentException] { Trigger.ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 year") }
  }
}
Example 13
Source File: ContinuousTrigger.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import java.util.concurrent.TimeUnit

import scala.concurrent.duration.Duration

import org.apache.commons.lang3.StringUtils

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}
import org.apache.spark.unsafe.types.CalendarInterval

@InterfaceStability.Evolving
case class ContinuousTrigger(intervalMs: Long) extends Trigger {
  require(intervalMs >= 0, "the interval of trigger should not be negative")
}

private[sql] object ContinuousTrigger {
  def apply(interval: String): ContinuousTrigger = {
    if (StringUtils.isBlank(interval)) {
      throw new IllegalArgumentException(
        "interval cannot be null or blank.")
    }
    val cal = if (interval.startsWith("interval")) {
      CalendarInterval.fromString(interval)
    } else {
      CalendarInterval.fromString("interval " + interval)
    }
    if (cal == null) {
      throw new IllegalArgumentException(s"Invalid interval: $interval")
    }
    if (cal.months > 0) {
      throw new IllegalArgumentException(s"Doesn't support month or year interval: $interval")
    }
    new ContinuousTrigger(cal.microseconds / 1000)
  }

  def apply(interval: Duration): ContinuousTrigger = {
    ContinuousTrigger(interval.toMillis)
  }

  def create(interval: String): ContinuousTrigger = {
    apply(interval)
  }

  def create(interval: Long, unit: TimeUnit): ContinuousTrigger = {
    ContinuousTrigger(unit.toMillis(interval))
  }
}
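As a quick illustration of how the factory above resolves interval strings (the values are arbitrary; the behavior follows directly from the code shown):

ContinuousTrigger("1 second").intervalMs            // 1000
ContinuousTrigger("interval 2 minutes").intervalMs  // 120000
// ContinuousTrigger("1 month") throws IllegalArgumentException: month/year intervals are rejected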
Example 14
Source File: KafkaContinuousSourceSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.Properties
import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._
import scala.collection.mutable
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.{StreamTest, Trigger}
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}

// Run tests in KafkaSourceSuiteBase in continuous execution mode.
class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest

class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest {
  import testImplicits._

  override val brokerProps = Map("auto.create.topics.enable" -> "false")

  test("subscribing topic by pattern with topic deletions") {
    val topicPrefix = newTopic()
    val topic = topicPrefix + "-seems"
    val topic2 = topicPrefix + "-bad"
    testUtils.createTopic(topic, partitions = 5)
    testUtils.sendMessages(topic, Array("-1"))
    require(testUtils.getLatestOffsets(Set(topic)).size === 5)

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", testUtils.brokerAddress)
      .option("kafka.metadata.max.age.ms", "1")
      .option("subscribePattern", s"$topicPrefix-.*")
      .option("failOnDataLoss", "false")

    val kafka = reader.load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]
    val mapped = kafka.map(kv => kv._2.toInt + 1)

    testStream(mapped)(
      makeSureGetOffsetCalled,
      AddKafkaData(Set(topic), 1, 2, 3),
      CheckAnswer(2, 3, 4),
      Execute { query =>
        testUtils.deleteTopic(topic)
        testUtils.createTopic(topic2, partitions = 5)
        eventually(timeout(streamingTimeout)) {
          assert(
            query.lastExecution.logical.collectFirst {
              case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
            }.exists { r =>
              // Ensure the new topic is present and the old topic is gone.
              r.knownPartitions.exists(_.topic == topic2)
            },
            s"query never reconfigured to new topic $topic2")
        }
      },
      AddKafkaData(Set(topic2), 4, 5, 6),
      CheckAnswer(2, 3, 4, 5, 6, 7)
    )
  }
}

class KafkaContinuousSourceStressForDontFailOnDataLossSuite
    extends KafkaSourceStressForDontFailOnDataLossSuite {
  override protected def startStream(ds: Dataset[Int]) = {
    ds.writeStream
      .format("memory")
      .queryName("memory")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
}
Example 15
Source File: KafkaContinuousTest.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.concurrent.atomic.AtomicInteger

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.test.TestSparkSession

// Trait to configure StreamTest for kafka continuous execution tests.
trait KafkaContinuousTest extends KafkaSourceTest {
  override val defaultTrigger = Trigger.Continuous(1000)
  override val defaultUseV2Sink = true

  // We need more than the default local[2] to be able to schedule all partitions simultaneously.
  override protected def createSparkSession = new TestSparkSession(
    new SparkContext(
      "local[10]",
      "continuous-stream-test-sql-context",
      sparkConf.set("spark.sql.testkey", "true")))

  // In addition to setting the partitions in Kafka, we have to wait until the query has
  // reconfigured to the new count so the test framework can hook in properly.
  override protected def setTopicPartitions(
      topic: String, newCount: Int, query: StreamExecution) = {
    testUtils.addPartitions(topic, newCount)
    eventually(timeout(streamingTimeout)) {
      assert(
        query.lastExecution.logical.collectFirst {
          case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
        }.exists(_.knownPartitions.size == newCount),
        s"query never reconfigured to $newCount partitions")
    }
  }

  // Continuous processing tasks end asynchronously, so test that they actually end.
  private val tasksEndedListener = new SparkListener() {
    val activeTaskIdCount = new AtomicInteger(0)

    override def onTaskStart(start: SparkListenerTaskStart): Unit = {
      activeTaskIdCount.incrementAndGet()
    }

    override def onTaskEnd(end: SparkListenerTaskEnd): Unit = {
      activeTaskIdCount.decrementAndGet()
    }
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    spark.sparkContext.addSparkListener(tasksEndedListener)
  }

  override def afterEach(): Unit = {
    eventually(timeout(streamingTimeout)) {
      assert(tasksEndedListener.activeTaskIdCount.get() == 0)
    }
    spark.sparkContext.removeSparkListener(tasksEndedListener)
    super.afterEach()
  }

  test("ensure continuous stream is being used") {
    val query = spark.readStream
      .format("rate")
      .option("numPartitions", "1")
      .option("rowsPerSecond", "1")
      .load()

    testStream(query)(
      Execute(q => assert(q.isInstanceOf[ContinuousExecution]))
    )
  }
}
Example 16
Source File: EventMemoryStreamSpec.scala From odsc-west-streaming-trends with GNU General Public License v3.0
package com.twilio.open.streaming.trend.discovery

import java.nio.charset.StandardCharsets
import java.nio.file.Files

import com.twilio.open.protocol.Calls.CallEvent
import com.twilio.open.streaming.trend.discovery.config.{AppConfig, AppConfiguration}
import com.twilio.open.streaming.trend.discovery.streams.EventAggregation
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.Trigger
import org.scalatest.{FunSuite, Matchers}
import org.slf4j.{Logger, LoggerFactory}

import scala.concurrent.duration._

class EventMemoryStreamSpec extends FunSuite with Matchers with SparkSqlTest {

  val log: Logger = LoggerFactory.getLogger(classOf[EventAggregation])
  private val pathToTestScenarios = "src/test/resources/scenarios"

  lazy val session: SparkSession = sparkSql

  override def conf: SparkConf = {
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("aggregation-test-app")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)
      .set("spark.driver.host", "localhost")
      .set("spark.sql.shuffle.partitions", "32")
      .set("spark.executor.cores", "4")
      .set("spark.executor.memory", "1g")
      .set("spark.ui.enabled", "false")
      .setJars(SparkContext.jarOfClass(classOf[EventAggregation]).toList)
  }

  protected val checkpointDir: String = Files.createTempDirectory(appID).toString

  def appConfigForTest(): AppConfiguration = {
    val baseConfig = AppConfig("src/test/resources/app.yaml")
    baseConfig.copy(
      checkpointPath = checkpointDir
    )
    baseConfig
  }

  test("Should aggregate call events") {
    implicit val sqlContext: SQLContext = session.sqlContext
    import session.implicits._

    val appConfig = appConfigForTest()
    val scenario = TestHelper.loadScenario[CallEvent](s"$pathToTestScenarios/pdd_events.json")
    val scenarioIter = scenario.toIterator
    scenario.nonEmpty shouldBe true

    val trendDiscoveryApp = new TrendDiscoveryApp(appConfigForTest(), session)

    val kafkaData = MemoryStream[MockKafkaDataFrame]
    val processingTimeTrigger = Trigger.ProcessingTime(2.seconds)
    val eventAggregation = EventAggregation(appConfig)

    val processingStream = eventAggregation.process(kafkaData.toDF())(session)
      .writeStream
      .format("memory")
      .queryName("calleventaggs")
      .outputMode(eventAggregation.outputMode)
      .trigger(processingTimeTrigger)
      .start()

    // 22 events
    kafkaData.addData(scenarioIter.take(11).map(TestHelper.asMockKafkaDataFrame))
    processingStream.processAllAvailable()

    kafkaData.addData(scenarioIter.take(10).map(TestHelper.asMockKafkaDataFrame))
    processingStream.processAllAvailable()

    kafkaData.addData(scenarioIter.take(1).map(TestHelper.asMockKafkaDataFrame))
    processingStream.processAllAvailable()

    val df = session.sql("select * from calleventaggs")
    df.printSchema()
    df.show

    val res = session
      .sql("select avg(stats.p99) from calleventaggs")
      .collect()
      .map { r => r.getAs[Double](0) }
      .head

    DiscoveryUtils.round(res) shouldEqual 7.56

    processingStream.stop()
  }
}
Example 17
Source File: StructuredStreamingOffset.scala From BigData-News with Apache License 2.0
package com.vita.spark.streaming

import com.vita.Constants
import com.vita.redies.RedisSingle
import com.vita.spark.streaming.writer.RedisWriteKafkaOffset
import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

object StructuredStreamingOffset {

  val LOGGER: Logger = LogManager.getLogger("StructuredStreamingOffset")

  // topic
  val SUBSCRIBE = "log"

  case class readLogs(context: String, offset: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("StructuredStreamingOffset")
      .getOrCreate()

    // starting offset
    var startOffset = -1

    // init
    val redisSingle: RedisSingle = new RedisSingle()
    redisSingle.init(Constants.IP, Constants.PORT)

    // get the stored offset from Redis
    if (redisSingle.exists(Constants.REDIDS_KEY) && redisSingle.getTime(Constants.REDIDS_KEY) != -1) {
      startOffset = redisSingle.get(Constants.REDIDS_KEY).toInt
    }

    // source
    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", SUBSCRIBE)
      .option("startingOffsets", "{\"" + SUBSCRIBE + "\":{\"0\":" + startOffset + "}}")
      .load()

    import spark.implicits._

    // each row contains: key, value, topic, partition, offset, timestamp, timestampType
    val lines = df.selectExpr("CAST(value AS STRING)", "CAST(offset AS LONG)").as[(String, Long)]

    val content = lines.map(x => readLogs(x._1, x._2.toString))

    val count = content.toDF("context", "offset")

    // sink: record the Kafka offset via foreach
    val query = count
      .writeStream
      .foreach(new RedisWriteKafkaOffset)
      .outputMode("update")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .format("console")
      .start()

    query.awaitTermination()
  }
}
Example 18
Source File: CountingInAStreamExpWindowing.scala From spark_training with Apache License 2.0
package com.malaska.spark.training.streaming.structured

import com.malaska.spark.training.streaming.{Message, MessageBuilder}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.FileStreamSource.Timestamp
import org.apache.spark.sql.functions._
import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

object CountingInAStreamExpWindowing {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[5]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[5]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .option("includeTimestamp", true)
      .load()

    val messageDsDStream = socketLines.as[(String, Timestamp)].map(line => {
      MessageBuilder.build(line._1, line._2)
    }).filter(r => r != null).as[Message]

    val tickerCount = messageDsDStream.withColumn("eventTime", $"tradeTs".cast("timestamp"))
      .withWatermark("eventTime", "30 seconds")
      .groupBy(window($"eventTime", "30 seconds", "5 seconds"), $"ticker")
      .agg(max($"tradeTs") as "max_time",
        sum($"price") as "total_price",
        avg($"price") as "avg_price",
        count($"price") as "number_of_trades") //.orderBy("window")

    val ticketOutput = tickerCount.writeStream
      .format("Console")
      .option("checkpointLocation", checkpointFolder)
      .outputMode("update")
      //.outputMode("complete")
      .format("console")
      .option("truncate", false)
      .option("numRows", 40)
      .start()

    ticketOutput.awaitTermination()
  }
}
Example 19
Source File: CountingInAStreamDatasetExpGroupBy.scala From spark_training with Apache License 2.0
package com.malaska.spark.training.streaming.structured

import com.malaska.spark.training.streaming.{Message, MessageBuilder}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.functions._

object CountingInAStreamDatasetExpGroupBy {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[3]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    val messageDs = socketLines.as[String].map(line => {
      MessageBuilder.build(line)
    }).as[Message]

    val tickerCount = messageDs.groupBy("ticker", "destUser").agg(sum($"price"), avg($"price"))

    val ticketOutput = tickerCount.writeStream
      .format("Console")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .option("checkpointLocation", checkpointFolder)
      .outputMode("complete")
      .format("console")
      .start()

    ticketOutput.awaitTermination()
  }
}
Example 20
Source File: CountingInAStreamExpGroupBy.scala From spark_training with Apache License 2.0
package com.malaska.spark.training.streaming.structured

import com.malaska.spark.training.streaming.{Message, MessageBuilder}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.functions._

object CountingInAStreamExpGroupBy {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {
    val host = args(0)
    val port = args(1)
    val checkpointFolder = args(2)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .master("local[3]")
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .master("local[3]")
        .getOrCreate()
    }

    import sparkSession.implicits._

    val socketLines = sparkSession.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    val messageDs = socketLines.as[String]
      .flatMap(line => line.toLowerCase().split(" "))

    // Generate running word count
    val wordCounts = messageDs.groupBy("value").count()

    // Start running the query that prints the running counts to the console
    val query = wordCounts.writeStream
      .outputMode("complete")
      .format("console")
      .start()

    query.awaitTermination()
  }
}
Example 21
Source File: StreamingOption.scala From carbondata with Apache License 2.0
package org.apache.carbondata.spark

import scala.collection.mutable

import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

import org.apache.carbondata.common.exceptions.sql.MalformedCarbonCommandException
import org.apache.carbondata.core.constants.{CarbonCommonConstants, CarbonLoadOptionConstants}
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.core.util.path.CarbonTablePath
import org.apache.carbondata.streaming.parser.CarbonStreamParser

class StreamingOption(val userInputMap: Map[String, String]) {

  lazy val trigger: Trigger = {
    val trigger = userInputMap.getOrElse(
      "trigger", throw new MalformedCarbonCommandException("trigger must be specified"))
    val interval = userInputMap.getOrElse(
      "interval", throw new MalformedCarbonCommandException("interval must be specified"))
    trigger match {
      case "ProcessingTime" => ProcessingTime(interval)
      case others => throw new MalformedCarbonCommandException("invalid trigger: " + trigger)
    }
  }

  def checkpointLocation(tablePath: String): String =
    userInputMap.getOrElse(
      "checkpointLocation",
      CarbonTablePath.getStreamingCheckpointDir(tablePath))

  lazy val timeStampFormat: String =
    userInputMap.getOrElse("timestampformat", CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT)

  lazy val dateFormat: String =
    userInputMap.getOrElse("dateformat", CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT)

  lazy val rowParser: String =
    userInputMap.getOrElse(CarbonStreamParser.CARBON_STREAM_PARSER,
      CarbonStreamParser.CARBON_STREAM_PARSER_ROW_PARSER)

  lazy val badRecordsPath: String =
    userInputMap
      .getOrElse("bad_record_path", CarbonProperties.getInstance()
        .getProperty(CarbonCommonConstants.CARBON_BADRECORDS_LOC,
          CarbonCommonConstants.CARBON_BADRECORDS_LOC_DEFAULT_VAL))

  lazy val badRecordsAction: String =
    userInputMap
      .getOrElse("bad_records_action", CarbonProperties.getInstance()
        .getProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION,
          CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION_DEFAULT))

  lazy val badRecordsLogger: String =
    userInputMap
      .getOrElse("bad_records_logger_enable", CarbonProperties.getInstance()
        .getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE,
          CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE_DEFAULT))

  lazy val isEmptyBadRecord: String =
    userInputMap
      .getOrElse("is_empty_bad_record", CarbonProperties.getInstance()
        .getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD,
          CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD_DEFAULT))

  lazy val remainingOption: Map[String, String] = {
    // copy the user input map and remove the fix options
    val mutableMap = mutable.Map[String, String]() ++= userInputMap
    mutableMap.remove("checkpointLocation")
    mutableMap.remove("timestampformat")
    mutableMap.remove("dateformat")
    mutableMap.remove("trigger")
    mutableMap.remove("interval")
    mutableMap.remove(CarbonStreamParser.CARBON_STREAM_PARSER)
    mutableMap.toMap
  }
}
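For context, a hypothetical sketch of how such an option map might be constructed and consumed; the keys mirror the ones the class reads above, and the values are illustrative only:

val opts = new StreamingOption(Map(
  "trigger" -> "ProcessingTime",
  "interval" -> "10 seconds"))

opts.trigger                          // a ProcessingTime trigger with a 10-second interval
opts.checkpointLocation("/store/t1")  // falls back to the table's streaming checkpoint dir
opts.remainingOption                  // empty: both keys above are consumed as fixed options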
Example 22
Source File: EventAggregationSpec.scala From spark-summit-2018 with GNU General Public License v3.0
package com.twilio.open.streaming.trend.discovery

import java.util

import com.twilio.open.protocol.Calls.CallEvent
import com.twilio.open.protocol.Metrics
import com.twilio.open.streaming.trend.discovery.streams.EventAggregation
import org.apache.kafka.common.serialization.{Deserializer, Serializer, StringDeserializer, StringSerializer}
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql._
import org.apache.spark.sql.kafka010.KafkaTestUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.{Logger, LoggerFactory}

class EventAggregationSpec extends KafkaBackedTest[String, CallEvent] {

  override val testUtils = new KafkaTestUtils[String, CallEvent] {
    override val keySerializer: Serializer[String] = new StringSerializer
    override val keyDeserializer: Deserializer[String] = new StringDeserializer
    override val valueSerializer: Serializer[CallEvent] = new CallEventSerializer
    override val valueDeserializer: Deserializer[CallEvent] = new CallEventDeserializer
  }

  override protected val kafkaTopic = "spark.summit.call.events"
  override protected val partitions = 8

  private val pathToTestScenarios = "src/test/resources/scenarios"

  val log: Logger = LoggerFactory.getLogger(classOf[EventAggregation])

  lazy val session: SparkSession = sparkSql

  override def conf: SparkConf = {
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("aggregation-test-app")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)
      .set("spark.driver.host", "localhost")
      .set("spark.sql.shuffle.partitions", "32")
      .set("spark.executor.cores", "4")
      .set("spark.executor.memory", "1g")
      .set("spark.ui.enabled", "false")
      .setJars(SparkContext.jarOfClass(classOf[EventAggregation]).toList)
  }

  test("Should aggregate call events") {
    import session.implicits._

    val appConfig = appConfigForTest()
    val scenario = TestHelper.loadScenario[CallEvent](s"$pathToTestScenarios/pdd_events.json")
    val scenarioIter = scenario.toIterator
    scenario.nonEmpty shouldBe true

    testUtils.createTopic(kafkaTopic, partitions, overwrite = true)
    sendNextMessages(scenarioIter, 30, _.getEventId, _.getLoggedEventTime)

    val trendDiscoveryApp = new TrendDiscoveryApp(appConfigForTest(), session)
    val eventAggregation = EventAggregation(appConfig)

    eventAggregation.process(trendDiscoveryApp.readKafkaStream())(session)
      .writeStream
      .queryName("calleventaggs")
      .format("memory")
      .outputMode(eventAggregation.outputMode)
      .start()
      .processAllAvailable()

    val df = session.sql("select * from calleventaggs")
    df.printSchema()
    df.show

    val res = session
      .sql("select avg(stats.p99) from calleventaggs")
      .collect()
      .map { r => r.getAs[Double](0) }
      .head

    DiscoveryUtils.round(res) shouldEqual 7.13
  }
}

class CallEventSerializer extends Serializer[CallEvent] {
  override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = {}
  override def serialize(topic: String, data: CallEvent): Array[Byte] = data.toByteArray
  override def close(): Unit = {}
}

class CallEventDeserializer extends Deserializer[CallEvent] {
  override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = {}
  override def deserialize(topic: String, data: Array[Byte]): CallEvent = CallEvent.parseFrom(data)
  override def close(): Unit = {}
}
Example 23
Source File: SparkStreamingKustoSink.scala From azure-kusto-spark with Apache License 2.0
import java.util.concurrent.TimeUnit

import com.microsoft.kusto.spark.datasink.KustoSinkOptions
import org.apache.spark.sql._
import org.apache.spark.eventhubs.{ConnectionStringBuilder, EventHubsConf, EventPosition}
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.functions._

// COMMAND ----------

// To enable faster ingestion into kusto, set a minimal value for the batching ingestion policy:
// .alter table <table name> policy ingestionbatching @'{"MaximumBatchingTimeSpan": "00:00:10",}'

object SparkStreamingKustoSink {
  def main(args: Array[String]): Unit = {
    // COMMAND ----------
    // Note! This command is not required if you run in a Databricks notebook
    val spark: SparkSession = SparkSession.builder()
      .appName("SparkStreamingKustoSink")
      .master(f"local[4]")
      .getOrCreate()

    // read messages from Azure Event Hub
    val connectionString = ConnectionStringBuilder("Event Hub Connection String")
      .setEventHubName("Event Hub Name")
      .build

    val eventHubsConf = EventHubsConf(connectionString)
      .setStartingPosition(EventPosition.fromEndOfStream)

    val eventhubs = spark.readStream
      .format("eventhubs")
      .options(eventHubsConf.toMap)
      .option("checkpointLocation", "/checkpoint")
      .load()

    val toString = udf((payload: Array[Byte]) => new String(payload))
    val df = eventhubs.withColumn("body", toString(eventhubs("body")))

    spark.conf.set("spark.sql.streaming.checkpointLocation", "target/temp/checkpoint/")
    spark.conf.set("spark.sql.codegen.wholeStage", "false")

    // Write to a Kusto table from a streaming source
    val df1 = df
      .writeStream
      .format("com.microsoft.kusto.spark.datasink.KustoSinkProvider")
      .option(KustoSinkOptions.KUSTO_CLUSTER, "Your Kusto Cluster")
      .option(KustoSinkOptions.KUSTO_DATABASE, "Your Kusto Database")
      .option(KustoSinkOptions.KUSTO_TABLE, "Your Kusto Destination Table")
      .option(KustoSinkOptions.KUSTO_AAD_APP_ID, "Your Client ID")
      .option(KustoSinkOptions.KUSTO_AAD_APP_SECRET, "Your secret")
      .trigger(Trigger.ProcessingTime(0))
      .start()

    df1.awaitTermination(TimeUnit.MINUTES.toMillis(8))
  }
}
Example 24
Source File: ProcessingTimeSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql

import java.util.concurrent.TimeUnit

import scala.concurrent.duration._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

class ProcessingTimeSuite extends SparkFunSuite {

  test("create") {
    def getIntervalMs(trigger: Trigger): Long = trigger.asInstanceOf[ProcessingTime].intervalMs

    assert(getIntervalMs(Trigger.ProcessingTime(10.seconds)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime(10, TimeUnit.SECONDS)) === 10 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("1 minute")) === 60 * 1000)
    assert(getIntervalMs(Trigger.ProcessingTime("interval 1 minute")) === 60 * 1000)

    intercept[IllegalArgumentException] { Trigger.ProcessingTime(null: String) }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("invalid") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 month") }
    intercept[IllegalArgumentException] { Trigger.ProcessingTime("1 year") }
  }
}
Example 25
Source File: ContinuousTrigger.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import java.util.concurrent.TimeUnit

import scala.concurrent.duration.Duration

import org.apache.commons.lang3.StringUtils

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}
import org.apache.spark.unsafe.types.CalendarInterval

@InterfaceStability.Evolving
case class ContinuousTrigger(intervalMs: Long) extends Trigger {
  require(intervalMs >= 0, "the interval of trigger should not be negative")
}

private[sql] object ContinuousTrigger {
  def apply(interval: String): ContinuousTrigger = {
    if (StringUtils.isBlank(interval)) {
      throw new IllegalArgumentException(
        "interval cannot be null or blank.")
    }
    val cal = if (interval.startsWith("interval")) {
      CalendarInterval.fromString(interval)
    } else {
      CalendarInterval.fromString("interval " + interval)
    }
    if (cal == null) {
      throw new IllegalArgumentException(s"Invalid interval: $interval")
    }
    if (cal.months > 0) {
      throw new IllegalArgumentException(s"Doesn't support month or year interval: $interval")
    }
    new ContinuousTrigger(cal.microseconds / 1000)
  }

  def apply(interval: Duration): ContinuousTrigger = {
    ContinuousTrigger(interval.toMillis)
  }

  def create(interval: String): ContinuousTrigger = {
    apply(interval)
  }

  def create(interval: Long, unit: TimeUnit): ContinuousTrigger = {
    ContinuousTrigger(unit.toMillis(interval))
  }
}
Example 26
Source File: StreamingIncrementCommand.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.xsql.execution.command

import java.util.Locale

import org.apache.spark.SparkException
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.execution.streaming.StreamingRelationV2
import org.apache.spark.sql.sources.v2.StreamWriteSupport
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.xsql.DataSourceManager._
import org.apache.spark.sql.xsql.StreamingSinkType

case class StreamingIncrementCommand(plan: LogicalPlan) extends RunnableCommand {

  private var outputMode: OutputMode = OutputMode.Append
  // dummy
  override def output: Seq[AttributeReference] = Seq.empty
  // dummy
  override def producedAttributes: AttributeSet = plan.producedAttributes

  override def run(sparkSession: SparkSession): Seq[Row] = {
    import StreamingSinkType._

    val qe = new QueryExecution(sparkSession, new ConstructedStreaming(plan))
    val df = new Dataset(sparkSession, qe, RowEncoder(qe.analyzed.schema))

    plan.collectLeaves.head match {
      case StreamingRelationV2(_, _, extraOptions, _, _) =>
        val source = extraOptions.getOrElse(STREAMING_SINK_TYPE, DEFAULT_STREAMING_SINK)
        val sinkOptions = extraOptions.filter(_._1.startsWith(STREAMING_SINK_PREFIX)).map { kv =>
          val key = kv._1.substring(STREAMING_SINK_PREFIX.length)
          (key, kv._2)
        }
        StreamingSinkType.withName(source.toUpperCase(Locale.ROOT)) match {
          case CONSOLE =>
          case TEXT | PARQUET | ORC | JSON | CSV =>
            if (sinkOptions.get(STREAMING_SINK_PATH) == None) {
              throw new SparkException("Sink type is file, must config path")
            }
          case KAFKA =>
            if (sinkOptions.get(STREAMING_SINK_BOOTSTRAP_SERVERS) == None) {
              throw new SparkException("Sink type is kafka, must config bootstrap servers")
            }
            if (sinkOptions.get(STREAMING_SINK_TOPIC) == None) {
              throw new SparkException("Sink type is kafka, must config kafka topic")
            }
          case _ =>
            throw new SparkException(
              "Sink type is invalid, " +
                s"select from ${StreamingSinkType.values}")
        }
        val ds = DataSource.lookupDataSource(source, sparkSession.sessionState.conf)
        val disabledSources = sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",")
        val sink = ds.newInstance() match {
          case w: StreamWriteSupport if !disabledSources.contains(w.getClass.getCanonicalName) => w
          case _ =>
            val ds = DataSource(
              sparkSession,
              className = source,
              options = sinkOptions.toMap,
              partitionColumns = Nil)
            ds.createSink(InternalOutputModes.Append)
        }
        val outputMode = InternalOutputModes(
          extraOptions.getOrElse(STREAMING_OUTPUT_MODE, DEFAULT_STREAMING_OUTPUT_MODE))
        val duration =
          extraOptions.getOrElse(STREAMING_TRIGGER_DURATION, DEFAULT_STREAMING_TRIGGER_DURATION)
        val trigger =
          extraOptions.getOrElse(STREAMING_TRIGGER_TYPE, DEFAULT_STREAMING_TRIGGER_TYPE) match {
            case STREAMING_MICRO_BATCH_TRIGGER => Trigger.ProcessingTime(duration)
            case STREAMING_ONCE_TRIGGER => Trigger.Once()
            case STREAMING_CONTINUOUS_TRIGGER => Trigger.Continuous(duration)
          }
        val query = sparkSession.sessionState.streamingQueryManager.startQuery(
          extraOptions.get("queryName"),
          extraOptions.get(STREAMING_CHECKPOINT_LOCATION),
          df,
          sinkOptions.toMap,
          sink,
          outputMode,
          useTempCheckpointLocation = source == DEFAULT_STREAMING_SINK,
          recoverFromCheckpointLocation = true,
          trigger = trigger)
        query.awaitTermination()
    }
    // dummy
    Seq.empty
  }
}

case class ConstructedStreaming(child: LogicalPlan) extends UnaryNode {
  override def output: Seq[Attribute] = child.output
}
Example 27
Source File: ConsoleSink.scala From spark-structured-streaming-examples with Apache License 2.0
package com.phylosoft.spark.learning.sql.streaming.sink.console

import com.phylosoft.spark.learning.sql.streaming.sink.StreamingSink
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger}

class ConsoleSink(trigger: Trigger = Trigger.Once(),
                  outputMode: OutputMode = OutputMode.Update())
  extends StreamingSink {

  override def writeStream(data: DataFrame): StreamingQuery = {
    data.writeStream
      .format("console")
      .trigger(trigger)
      .outputMode(outputMode)
      .option("checkpointLocation", checkpointLocation + "/console")
      .start()
  }
}
Example 28
Source File: DeltaSink.scala From spark-structured-streaming-examples with Apache License 2.0
package com.phylosoft.spark.learning.sql.streaming.sink.delta

import com.phylosoft.spark.learning.sql.streaming.sink.StreamingSink
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger}

class DeltaSink(trigger: Trigger = Trigger.Once(),
                outputMode: OutputMode = OutputMode.Append())
  extends StreamingSink {

  override def writeStream(data: DataFrame): StreamingQuery = {
    data.writeStream
      .format("delta")
      .trigger(trigger)
      .outputMode(outputMode)
      .option("checkpointLocation", checkpointLocation + "/tmp/delta/events")
      .start("/tmp/delta/events")
  }
}
Example 29
Source File: MemorySink.scala From spark-structured-streaming-examples with Apache License 2.0
package com.phylosoft.spark.learning.sql.streaming.sink.memory

import com.phylosoft.spark.learning.sql.streaming.sink.StreamingSink
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger}

class MemorySink(trigger: Trigger = Trigger.Once(),
                 outputMode: OutputMode = OutputMode.Update())
  extends StreamingSink {

  override def writeStream(data: DataFrame): StreamingQuery = {
    data.writeStream
      .format("memory")
      .trigger(trigger)
      .outputMode(outputMode)
      .option("checkpointLocation", checkpointLocation + "/memory")
      .start()
  }
}
Example 30
Source File: MapGroupsWithStateApp.scala From spark-structured-streaming-examples with Apache License 2.0
package com.phylosoft.spark.learning.sql.streaming.operations.stateful

import com.phylosoft.spark.learning.sql.streaming.domain.Model.{Event, SessionInfo, SessionUpdate}
import com.phylosoft.spark.learning.sql.streaming.monitoring.Monitoring
import com.phylosoft.spark.learning.sql.streaming.sink.StreamingSink
import com.phylosoft.spark.learning.sql.streaming.sink.console.ConsoleSink
import com.phylosoft.spark.learning.sql.streaming.source.rate.UserActionsRateSource
import com.phylosoft.spark.learning.{Logger, SparkSessionConfiguration}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode, StreamingQuery, Trigger}

object MapGroupsWithStateApp
  extends App
    with SparkSessionConfiguration
    with GroupsWithStateFunction
    with Monitoring
    with Logger {

  val settings = Map("spark.app.name" -> "MapGroupsWithStateApp")

  spark.streams.addListener(simpleListener)

  val source = new UserActionsRateSource(spark)

  val userActions = source.loadUserActions()
  userActions.printSchema()

  import spark.implicits._

  val events = userActions
    .withColumnRenamed("userId", "sessionId")
    .withColumnRenamed("actionTime", "timestamp")
    .as[Event]
  events.printSchema()

  // Sessionize the events. Track number of events, start and end timestamps of the session,
  // and report session updates.
  val timeTimeoutMode = "ProcessingTime"

  val sessionUpdates = timeTimeoutMode match {
    case "ProcessingTime" =>
      events
        .groupByKey(event => event.sessionId)
        .mapGroupsWithState[SessionInfo, SessionUpdate](GroupStateTimeout.ProcessingTimeTimeout) {
          sessionUpdate
        }
    case _ =>
      events
        .withWatermark("timestamp", "2 seconds")
        .groupByKey(event => event.sessionId)
        .mapGroupsWithState[SessionInfo, SessionUpdate](GroupStateTimeout.EventTimeTimeout) {
          sessionUpdate
        }
  }

  val sessions = sessionUpdates
    .select($"*")
    .where("expired == true")
  sessions.printSchema()

  // Start running the query that prints the session updates to the console
  val query = startStreamingSink(sessions, initStreamingSink)

  query.awaitTermination()

  private def startStreamingSink[T <: StreamingSink](data: DataFrame, sink: T): StreamingQuery = {
    sink.writeStream(data)
  }

  private def initStreamingSink: StreamingSink = {
    import scala.concurrent.duration._
    new ConsoleSink(trigger = Trigger.ProcessingTime(2.seconds), outputMode = OutputMode.Append())
  }
}
Example 31
Source File: StreamingPredictionsSpec.scala From odsc-east-realish-predictions with Apache License 2.0
package com.twilio.open.odsc.realish

import java.sql.Timestamp
import java.time.Instant
import java.util.{Random, UUID}

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoders, SQLContext, SparkSession}
import org.scalatest.{FunSuite, Matchers}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

import scala.concurrent.duration._

class StreamingPredictionsSpec extends FunSuite with Matchers with SharedSparkSql {

  override def conf: SparkConf = {
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("odsc-spark-utils")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)
      .set("spark.driver.host", "localhost")
      .set("spark.sql.session.timeZone", "UTC")
  }

  final val notRandomRandom = {
    val generator = new Random
    generator.setSeed(100L)
    generator
  }

  test("should stream in some mock data for fun") {
    implicit val spark: SparkSession = sparkSql
    import spark.implicits._
    implicit val sqlContext: SQLContext = spark.sqlContext

    implicit val metricEncoder = Encoders.product[Metric]
    val metricData = MemoryStream[Metric]
    val startingInstant = Instant.now()

    val backingData = (1 to 10000).map(offset => {
      val metric = if (offset % 2 == 0) "loss_percentage" else "connect_duration"
      val nextLoss = notRandomRandom.nextDouble() * notRandomRandom.nextInt(100)
      Metric(
        Timestamp.from(startingInstant.minusSeconds(offset)),
        UUID.randomUUID().toString,
        metric,
        value = if (metric == "loss_percentage") nextLoss else notRandomRandom.nextDouble() * notRandomRandom.nextInt(240),
        countryCode = if (offset % 8 == 0) "US" else "BR",
        callDirection = if (metric == "loss_percentage") "inbound" else "outbound"
      )
    })
    val processingTimeTrigger = Trigger.ProcessingTime(2.seconds)

    val streamingQuery = metricData.toDF()
      .withWatermark("timestamp", "2 hours")
      .groupBy(col("metric"), col("countryCode"), window($"timestamp", "5 minutes"))
      .agg(
        min("value") as "min",
        avg("value") as "mean",
        max("value") as "max",
        count("*") as "total"
      )
      .writeStream
      .format("memory")
      .queryName("datastream")
      .outputMode(OutputMode.Append())
      .trigger(processingTimeTrigger)
      .start()

    metricData.addData(backingData)
    streamingQuery.processAllAvailable()

    spark.sql("select * from datastream").show(20, false)

    val checkChange = spark.sql("select * from datastream")
      .groupBy("metric", "countryCode")
      .agg(
        sum("total") as "total",
        avg("mean") as "mean"
      )

    checkChange.show(20, false)

    // now can do interesting things with minor back tracking...

    streamingQuery.stop()
  }
}