org.apache.spark.sql.streaming.OutputMode Scala Examples
The following examples show how to use org.apache.spark.sql.streaming.OutputMode.
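Before the examples, here is a minimal, self-contained sketch of where OutputMode fits in a Structured Streaming query. It is not taken from any of the projects below: it only assumes a local SparkSession and Spark's built-in rate source, and the object name and bucketing logic are illustrative.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.OutputMode

object OutputModeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("output-mode-sketch")
      .getOrCreate()
    import spark.implicits._

    // Built-in test source: one row per second with (timestamp, value) columns.
    val rate = spark.readStream
      .format("rate")
      .option("rowsPerSecond", 1)
      .load()

    // A running aggregation: Complete and Update modes emit the evolving counts;
    // Append would require a watermark on an event-time column.
    val counts = rate.groupBy(($"value" % 10).as("bucket")).count()

    val query = counts.writeStream
      .format("console")
      .outputMode(OutputMode.Complete()) // alternatives: OutputMode.Update(), OutputMode.Append()
      .start()

    query.awaitTermination()
  }
}

Append emits only rows that will never change again, Update emits rows that changed since the last trigger, and Complete rewrites the full result table on every trigger; the examples below use all three modes.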
Example 1
Source File: DefaultSource.scala From spark-snowflake with Apache License 2.0 | 7 votes |
package net.snowflake.spark.snowflake

import net.snowflake.spark.snowflake.streaming.SnowflakeSink
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_SHORT_NAME
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.slf4j.LoggerFactory

  override def createRelation(sqlContext: SQLContext,
                              saveMode: SaveMode,
                              parameters: Map[String, String],
                              data: DataFrame): BaseRelation = {

    val params = Parameters.mergeParameters(parameters)
    // check spark version for push down
    if (params.autoPushdown) {
      SnowflakeConnectorUtils.checkVersionAndEnablePushdown(
        sqlContext.sparkSession
      )
    }
    // pass parameters to pushdown functions
    pushdowns.setGlobalParameter(params)

    val table = params.table.getOrElse {
      throw new IllegalArgumentException(
        "For save operations you must specify a Snowflake table name with the 'dbtable' parameter"
      )
    }

    def tableExists: Boolean = {
      val conn = jdbcWrapper.getConnector(params)
      try {
        jdbcWrapper.tableExists(conn, table.toString)
      } finally {
        conn.close()
      }
    }

    val (doSave, dropExisting) = saveMode match {
      case SaveMode.Append => (true, false)
      case SaveMode.Overwrite => (true, true)
      case SaveMode.ErrorIfExists =>
        if (tableExists) {
          sys.error(
            s"Table $table already exists! (SaveMode is set to ErrorIfExists)"
          )
        } else {
          (true, false)
        }
      case SaveMode.Ignore =>
        if (tableExists) {
          log.info(s"Table $table already exists -- ignoring save request.")
          (false, false)
        } else {
          (true, false)
        }
    }

    if (doSave) {
      val updatedParams = parameters.updated("overwrite", dropExisting.toString)
      new SnowflakeWriter(jdbcWrapper)
        .save(
          sqlContext,
          data,
          saveMode,
          Parameters.mergeParameters(updatedParams)
        )
    }

    createRelation(sqlContext, parameters)
  }

  override def createSink(sqlContext: SQLContext,
                          parameters: Map[String, String],
                          partitionColumns: Seq[String],
                          outputMode: OutputMode): Sink =
    new SnowflakeSink(sqlContext, parameters, partitionColumns, outputMode)
}
Example 2
Source File: SparkEgressSpec.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.spark

import org.apache.spark.sql.{ Dataset, Encoder, SparkSession }
import org.apache.spark.sql.streaming.{ OutputMode, Trigger }

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.avro._
import cloudflow.spark.testkit._
import cloudflow.spark.sql.SQLImplicits._

class SparkEgressSpec extends SparkScalaTestSupport {

  "SparkEgress" should {
    "materialize streaming data to sink" in {

      val testKit = SparkStreamletTestkit(session)

      def asCollection[T: Encoder](session: SparkSession, queryName: String): List[T] =
        session.sql(s"select * from $queryName").as[T].collect().toList

      val instance = new MySparkEgress()

      // setup inlet tap on inlet port
      val in: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in)

      // build data and send to inlet tap
      val data = (1 to 10).map(i ⇒ Data(i, s"name$i"))
      in.addData(data)

      val run = testKit.run(instance, Seq(in), Seq.empty)
      run.failures mustBe ('empty)
      run.totalRows mustBe (20)

      val r1 = asCollection[String](session, "allNames")
      val r2 = asCollection[String](session, "allNamesUpper")

      // assert
      r1 must contain("name1")
      r2 must contain("NAME1")
    }
  }
}

class MySparkEgress extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val shape = StreamletShape(in)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries =
      process(readStream(in))

    private def process(inDataset: Dataset[Data]): StreamletQueryExecution = {
      val q1 = inDataset
        .map { d ⇒ d.name }
        .writeStream
        .format("memory")
        .option("truncate", false)
        .queryName("allNames")
        .outputMode(OutputMode.Append())
        .trigger(Trigger.Once)
        .start()

      val q2 = inDataset
        .map { d ⇒ d.name.toUpperCase }
        .writeStream
        .format("memory")
        .option("truncate", false)
        .queryName("allNamesUpper")
        .outputMode(OutputMode.Append())
        .trigger(Trigger.Once)
        .start()

      StreamletQueryExecution(q1, q2)
    }
  }
}
Example 3
Source File: SparkStreamletContextImpl.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.spark.kafka

import java.io.File

import com.typesafe.config.Config
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.encoders.{ ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.streaming.{ OutputMode, StreamingQuery }

import cloudflow.spark.SparkStreamletContext
import cloudflow.spark.avro.{ SparkAvroDecoder, SparkAvroEncoder }
import cloudflow.spark.sql.SQLImplicits._
import cloudflow.streamlets._

import scala.reflect.runtime.universe._

class SparkStreamletContextImpl(
    private[cloudflow] override val streamletDefinition: StreamletDefinition,
    session: SparkSession,
    override val config: Config
) extends SparkStreamletContext(streamletDefinition, session) {

  val storageDir = config.getString("storage.mountPath")
  val maxOffsetsPerTrigger = config.getLong("cloudflow.spark.read.options.max-offsets-per-trigger")

  def readStream[In](inPort: CodecInlet[In])(implicit encoder: Encoder[In], typeTag: TypeTag[In]): Dataset[In] = {

    implicit val inRowEncoder: ExpressionEncoder[Row] = RowEncoder(encoder.schema)
    val schema = inPort.schemaAsString
    val topic = findTopicForPort(inPort)
    val srcTopic = topic.name
    val brokers = topic.bootstrapServers.getOrElse(internalKafkaBootstrapServers)

    val src: DataFrame = session.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", brokers)
      .options(kafkaConsumerMap(topic))
      .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger)
      .option("subscribe", srcTopic)
      // Allow restart of stateful streamlets that may have been offline for longer than the kafka retention period.
      // This setting may result in data loss in some cases but allows for continuity of the runtime
      .option("failOnDataLoss", false)
      .option("startingOffsets", "earliest")
      .load()

    val rawDataset = src.select($"value").as[Array[Byte]]

    val dataframe: Dataset[Row] = rawDataset.mapPartitions { iter ⇒
      val avroDecoder = new SparkAvroDecoder[In](schema)
      iter.map(avroDecoder.decode)
    }(inRowEncoder)

    dataframe.as[In]
  }

  def kafkaConsumerMap(topic: Topic) = topic.kafkaConsumerProperties.map {
    case (key, value) => s"kafka.$key" -> value
  }

  def kafkaProducerMap(topic: Topic) = topic.kafkaProducerProperties.map {
    case (key, value) => s"kafka.$key" -> value
  }

  def writeStream[Out](stream: Dataset[Out], outPort: CodecOutlet[Out], outputMode: OutputMode)(
      implicit encoder: Encoder[Out],
      typeTag: TypeTag[Out]
  ): StreamingQuery = {

    val avroEncoder = new SparkAvroEncoder[Out](outPort.schemaAsString)
    val encodedStream = avroEncoder.encodeWithKey(stream, outPort.partitioner)

    val topic = findTopicForPort(outPort)
    val destTopic = topic.name
    val brokers = topic.bootstrapServers.getOrElse(internalKafkaBootstrapServers)

    // metadata checkpoint directory on mount
    val checkpointLocation = checkpointDir(outPort.name)
    val queryName = s"$streamletRef.$outPort"

    encodedStream.writeStream
      .outputMode(outputMode)
      .format("kafka")
      .queryName(queryName)
      .option("kafka.bootstrap.servers", brokers)
      .options(kafkaProducerMap(topic))
      .option("topic", destTopic)
      .option("checkpointLocation", checkpointLocation)
      .start()
  }

  def checkpointDir(dirName: String): String = {
    val baseCheckpointDir = new File(storageDir, streamletRef)
    val dir = new File(baseCheckpointDir, dirName)
    if (!dir.exists()) {
      val created = dir.mkdirs()
      require(created, s"Could not create checkpoint directory: $dir")
    }
    dir.getAbsolutePath
  }
}
Example 4
Source File: SparkProcessorSpec.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.spark

import scala.collection.immutable.Seq

import org.apache.spark.sql.streaming.OutputMode

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.avro._
import cloudflow.spark.testkit._
import cloudflow.spark.sql.SQLImplicits._

class SparkProcessorSpec extends SparkScalaTestSupport {

  "SparkProcessor" should {
    "process streaming data" in {

      val testKit = SparkStreamletTestkit(session)

      // create an instance of the streamlet under test
      val instance = new TestSparkProcessor()

      // setup inlet tap on inlet port
      val in: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in)

      // setup outlet tap on outlet port
      val out: SparkOutletTap[Simple] = testKit.outletAsTap[Simple](instance.out)

      // build data and send to inlet tap
      val data = (1 to 10).map(i ⇒ Data(i, s"name$i"))
      in.addData(data)

      val run = testKit.run(instance, Seq(in), Seq(out))
      run.totalRows must be(10)

      // get data from outlet tap
      val results = out.asCollection(session)

      // assert
      results must contain(Simple("name1"))
    }
  }
}

// Test sparkStreamlet
class TestSparkProcessor extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Simple]("out", _.name)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = dataset.select($"name").as[Simple]
      val query = writeStream(outStream, out, OutputMode.Append)
      StreamletQueryExecution(query)
    }
  }
}
Example 5
Source File: SparkJoin3Spec.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.spark

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.avro._
import cloudflow.spark.testkit._
import cloudflow.spark.sql.SQLImplicits._

class SparkJoin3Spec extends SparkScalaTestSupport {

  "SparkJoin3" should {
    "process streaming data" in {

      val testKit = SparkStreamletTestkit(session)
      val instance = new MySparkJoin3()

      // setup inlet tap on inlet port
      val in0: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in0)
      val in1: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in1)
      val in2: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in2)

      // setup outlet tap on outlet port
      val out: SparkOutletTap[Simple] = testKit.outletAsTap[Simple](instance.out)

      // build data and send to inlet tap
      val List(d1, d2, d3) = (1 to 30).map(i ⇒ Data(i, s"name$i")).sliding(10, 10).toList
      in0.addData(d1)
      in1.addData(d2)
      in2.addData(d3)

      val run = testKit.run(instance, Seq(in0, in1, in2), Seq(out))
      run.totalRows must be(30)

      // get data from outlet tap
      val results = out.asCollection(session)

      // assert
      results must contain(Simple("name1"))
      results must contain(Simple("name11"))
      results must contain(Simple("name21"))
      (results must have).length(30)
    }
  }
}

// create sparkStreamlet
class MySparkJoin3 extends SparkStreamlet {
  // comment: all inlets could be in different formats, one proto, one avro, one csv..
  val in0 = AvroInlet[Data]("in0")
  val in1 = AvroInlet[Data]("in1")
  val in2 = AvroInlet[Data]("in2")
  val out = AvroOutlet[Simple]("out", _.name)

  val shape = StreamletShape(out).withInlets(in0, in1, in2)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val dataset0 = readStream(in0)
      val dataset1 = readStream(in1)
      val dataset2 = readStream(in2)

      val outStream: Dataset[Simple] = process(dataset0, dataset1, dataset2)
      val query = writeStream(outStream, out, OutputMode.Append)
      StreamletQueryExecution(query)
    }

    private def process(in0: Dataset[Data], in1: Dataset[Data], in2: Dataset[Data]): Dataset[Simple] =
      in0.union(in1.union(in2)).select($"name").as[Simple]
  }
}
Example 6
Source File: SparkIngressSpec.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.spark

import scala.collection.immutable.Seq

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.execution.streaming.MemoryStream

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.avro._
import cloudflow.spark.testkit._
import cloudflow.spark.sql.SQLImplicits._

class SparkIngressSpec extends SparkScalaTestSupport {

  "SparkIngress" should {
    "produce elements to its outlet" in {

      val testKit = SparkStreamletTestkit(session)
      val instance = new MySparkIngress()

      // setup outlet tap on outlet port
      val out: SparkOutletTap[Data] = testKit.outletAsTap[Data](instance.out)

      val run = testKit.run(instance, Seq.empty, Seq(out))

      // get processed rows from the run
      run.totalRows must be(10)

      // get data from outlet tap
      val results = out.asCollection(session)

      // assert
      results must contain(Data(1, "name1"))
    }
  }
}

// create sparkStreamlet
class MySparkIngress extends SparkStreamlet {

  val out = AvroOutlet[Data]("out", d ⇒ d.id.toString)
  val shape = StreamletShape(out)

  override def createLogic() = new SparkStreamletLogic {
    private def process: Dataset[Data] = {
      implicit val sqlCtx = session.sqlContext
      val data = (1 to 10).map(i ⇒ Data(i, s"name$i"))
      val m = MemoryStream[Data]
      m.addData(data)
      m.toDF.as[Data]
    }

    override def buildStreamingQueries = {
      val outStream: Dataset[Data] = process
      require(outStream.isStreaming, "The Dataset created by an Ingress must be a Streaming Dataset")
      val query = writeStream(outStream, out, OutputMode.Append)
      StreamletQueryExecution(query)
    }
  }
}
Example 7
Source File: SparkProcessor.scala From cloudflow with Apache License 2.0 | 5 votes |
package com.example

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark._
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

//tag::processor[]
// create Spark Streamlet
class SparkProcessor extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.id.toString)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = dataset.filter(_.id % 2 == 0)
      val query = writeStream(outStream, out, OutputMode.Append)
      query.toQueryExecution
    }
  }
}
//end::processor[]
Example 8
Source File: MovingAverageSparklet.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.sparkdoc

import cloudflow.spark._
import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import org.apache.spark.sql.functions._
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.types.TimestampType
import org.apache.spark.sql.streaming.OutputMode

//tag::spark-streamlet-example[]
class MovingAverageSparklet extends SparkStreamlet { // <1>

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.key)
  val shape = StreamletShape(in, out) // <2>

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = { // <3>
      val groupedData = readStream(in) // <4>
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withWatermark("ts", "1 minutes")
        .groupBy(window($"ts", "1 minute", "30 seconds"), $"key")
        .agg(avg($"value").as("avg"))
      val query = groupedData.select($"key", $"avg".as("value")).as[Data]
      writeStream(query, out, OutputMode.Append).toQueryExecution
    }
  }
}
//end::spark-streamlet-example[]
Example 9
Source File: SparkRandomGenIngress.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.sparkdoc

import scala.util.Random

import cloudflow.spark._
import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode

import java.sql.Timestamp

class SparkRandomGenDataIngress extends SparkStreamlet {

  val out = AvroOutlet[Data]("out", d ⇒ d.key)
  val shape = StreamletShape(out)

  case class Rate(timestamp: Timestamp, value: Long)

  override def createLogic() = new SparkStreamletLogic {

    override def buildStreamingQueries =
      writeStream(process, out, OutputMode.Append).toQueryExecution

    private def process: Dataset[Data] = {

      val recordsPerSecond = 10

      val keyGen: () ⇒ String = () ⇒ if (Random.nextDouble() < 0.5) "keyOne" else "keyTwo"

      val rateStream = session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .load()
        .as[Rate]

      rateStream.map {
        case Rate(_, value) ⇒ Data(keyGen(), value.toInt)
      }
    }
  }
}
Example 10
Source File: SparkConsoleEgress.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.sparkdoc

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

class SparkConsoleEgress extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val shape = StreamletShape(in)

  override def createLogic() = new SparkStreamletLogic {
    //tag::docs-checkpointDir-example[]
    override def buildStreamingQueries =
      readStream(in).writeStream
        .format("console")
        .option("checkpointLocation", context.checkpointDir("console-egress"))
        .outputMode(OutputMode.Append())
        .start()
        .toQueryExecution
    //end::docs-checkpointDir-example[]
  }
}
Example 11
Source File: CallStatsAggregator.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.callrecordaggregator

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.streaming.OutputMode
import cloudflow.spark.sql.SQLImplicits._
import org.apache.log4j.{ Level, Logger }

import carly.data._

class CallStatsAggregator extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  //tag::docs-schemaAware-example[]
  val in = AvroInlet[CallRecord]("in")
  val out = AvroOutlet[AggregatedCallStats]("out", _.startTime.toString)
  val shape = StreamletShape(in, out)
  //end::docs-schemaAware-example[]

  val GroupByWindow = DurationConfigParameter("group-by-window", "Window duration for the moving average computation", Some("1 minute"))

  val Watermark = DurationConfigParameter("watermark", "Late events watermark duration: how long to wait for late events", Some("1 minute"))

  override def configParameters = Vector(GroupByWindow, Watermark)

  override def createLogic = new SparkStreamletLogic {
    val watermark = Watermark.value
    val groupByWindow = GroupByWindow.value

    //tag::docs-aggregationQuery-example[]
    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Update).toQueryExecution
    }
    //end::docs-aggregationQuery-example[]

    private def process(inDataset: Dataset[CallRecord]): Dataset[AggregatedCallStats] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withWatermark("ts", s"${watermark.toMillis()} milliseconds")
        .groupBy(window($"ts", s"${groupByWindow.toMillis()} milliseconds"))
        .agg(avg($"duration").as("avgCallDuration"), sum($"duration").as("totalCallDuration"))
        .withColumn("windowDuration", $"window.end".cast(LongType) - $"window.start".cast(LongType))

      query
        .select($"window.start".cast(LongType).as("startTime"), $"windowDuration", $"avgCallDuration", $"totalCallDuration")
        .as[AggregatedCallStats]
    }
  }
}
Example 12
Source File: RecordSumFlow.scala From cloudflow with Apache License 2.0 | 5 votes |
package com.example

import cloudflow.streamlets._
import cloudflow.spark._
import cloudflow.streamlets.avro._
import cloudflow.spark.sql.SQLImplicits._
import cloudflow.sparkdoc.Data
import org.apache.spark.sql.streaming.OutputMode

object RecordSumFlow extends SparkStreamlet {

  val recordsInWindowParameter = IntegerConfigParameter(
    "records-in-window",
    "This value describes how many records of data should be processed together, default 64 records",
    Some(64)
  )

  override def configParameters = Vector(recordsInWindowParameter)

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.key)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = dataset.filter(_.value % 2 == 0)
      val query = writeStream(outStream, out, OutputMode.Append)
      query.toQueryExecution
    }
  }
}
Example 13
Source File: MovingAverageSparklet.scala From cloudflow with Apache License 2.0 | 5 votes |
package sensors

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

class MovingAverageSparklet extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Agg]("out", _.src)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }

    private def process(inDataset: Dataset[Data]): Dataset[Agg] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withWatermark("ts", "1 minutes")
        .groupBy(window($"ts", "1 minute", "30 seconds"), $"src", $"gauge")
        .agg(avg($"value").as("avg"))
      query.select($"src", $"gauge", $"avg".as("value")).as[Agg]
    }
  }
}
Example 14
Source File: SparkConsoleEgress.scala From cloudflow with Apache License 2.0 | 5 votes |
package sensors

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

class SparkConsoleEgress extends SparkStreamlet {

  val in = AvroInlet[Agg]("in")
  val shape = StreamletShape(in)

  override def createLogic() = new SparkStreamletLogic {
    //tag::docs-checkpointDir-example[]
    override def buildStreamingQueries =
      readStream(in).writeStream
        .format("console")
        .option("checkpointLocation", context.checkpointDir("console-egress"))
        .outputMode(OutputMode.Append())
        .start()
        .toQueryExecution
    //end::docs-checkpointDir-example[]
  }
}
Example 15
Source File: CallAggregatorConsoleEgress.scala From cloudflow with Apache License 2.0 | 5 votes |
package carly.aggregator

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

import org.apache.log4j.{ Level, Logger }

import carly.data._

class CallAggregatorConsoleEgress extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  val in = AvroInlet[AggregatedCallStats]("in")
  val shape = StreamletShape(in)

  override def createLogic = new SparkStreamletLogic {
    override def buildStreamingQueries =
      readStream(in).writeStream
        .format("console")
        .outputMode(OutputMode.Append())
        .start()
        .toQueryExecution
  }
}
Example 16
Source File: CallStatsAggregator.scala From cloudflow with Apache License 2.0 | 5 votes |
package carly.aggregator

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.streaming.OutputMode
import cloudflow.spark.sql.SQLImplicits._
import org.apache.log4j.{ Level, Logger }

import carly.data._

class CallStatsAggregator extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  //tag::docs-schemaAware-example[]
  val in = AvroInlet[CallRecord]("in")
  val out = AvroOutlet[AggregatedCallStats]("out", _.startTime.toString)
  val shape = StreamletShape(in, out)
  //end::docs-schemaAware-example[]

  val GroupByWindow = DurationConfigParameter("group-by-window", "Window duration for the moving average computation", Some("1 minute"))

  val Watermark = DurationConfigParameter("watermark", "Late events watermark duration: how long to wait for late events", Some("1 minute"))

  override def configParameters = Vector(GroupByWindow, Watermark)

  override def createLogic = new SparkStreamletLogic {
    val watermark = Watermark.value
    val groupByWindow = GroupByWindow.value

    // val t0 = System.currentTimeMillis() // serialization error!

    //tag::docs-aggregationQuery-example[]
    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Update).toQueryExecution
    }

    private def process(inDataset: Dataset[CallRecord]): Dataset[AggregatedCallStats] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withWatermark("ts", s"${watermark.toMillis()} milliseconds")
        .groupBy(window($"ts", s"${groupByWindow.toMillis()} milliseconds"))
        .agg(avg($"duration").as("avgCallDuration"), sum($"duration").as("totalCallDuration"))
        .withColumn("windowDuration", $"window.end".cast(LongType) - $"window.start".cast(LongType))

      query
        .select($"window.start".cast(LongType).as("startTime"), $"windowDuration", $"avgCallDuration", $"totalCallDuration")
        .as[AggregatedCallStats]
    }
    //end::docs-aggregationQuery-example[]
  }
}
Example 17
Source File: CallRecordGeneratorIngress.scala From cloudflow with Apache License 2.0 | 5 votes |
package carly.aggregator

import java.sql.Timestamp

import scala.util.Random
import scala.concurrent.duration._

import org.apache.spark.sql.{ Dataset, SparkSession }
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.LongType

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.sql.SQLImplicits._
import carly.data.CallRecord
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.log4j.{ Level, Logger }

case class Rate(timestamp: Timestamp, value: Long)

class CallRecordGeneratorIngress extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  val RecordsPerSecond = IntegerConfigParameter("records-per-second", "Records per second to process.", Some(50))

  override def configParameters = Vector(RecordsPerSecond)

  val out = AvroOutlet[CallRecord]("out", _.user)
  val shape = StreamletShape(out)

  override def createLogic() = new SparkStreamletLogic {
    val recordsPerSecond = RecordsPerSecond.value

    override def buildStreamingQueries = {
      val outStream = DataGenerator.mkData(super.session, recordsPerSecond)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }
  }
}

object DataGenerator {
  def mkData(session: SparkSession, recordsPerSecond: Int): Dataset[CallRecord] = {
    // do we need to expose this through configuration?
    val MaxTime = 2.hours.toMillis
    val MaxUsers = 100000
    val TS0 = new java.sql.Timestamp(0)
    val ZeroTimestampProb = 0.05 // error rate

    // Random Data Generator
    val usersUdf = udf(() ⇒ "user-" + Random.nextInt(MaxUsers))
    val directionUdf = udf(() ⇒ if (Random.nextDouble() < 0.5) "incoming" else "outgoing")

    // Time-biased randomized filter - 1/2 hour cycles
    val sinTime: Long ⇒ Double = t ⇒ Math.sin((t / 1000 % 1800) * 1.0 / 1800 * Math.PI)
    val timeBoundFilter: Long ⇒ Double ⇒ Boolean = t ⇒ prob ⇒ (sinTime(t) + 0.5) > prob
    val timeFilterUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ timeBoundFilter(ts.getTime)(rng))

    val zeroTimestampUdf = udf { (ts: java.sql.Timestamp, rng: Double) ⇒
      if (rng < ZeroTimestampProb) {
        TS0
      } else {
        ts
      }
    }

    val rateStream = session.readStream
      .format("rate")
      .option("rowsPerSecond", recordsPerSecond)
      .load()
      .as[Rate]

    val randomDataset = rateStream.withColumn("rng", rand()).withColumn("tsRng", rand())

    val sampledData = randomDataset
      .where(timeFilterUdf($"timestamp", $"rng"))
      .withColumn("user", usersUdf())
      .withColumn("other", usersUdf())
      .withColumn("direction", directionUdf())
      .withColumn("duration", (round(abs(rand()) * MaxTime)).cast(LongType))
      .withColumn("updatedTimestamp", zeroTimestampUdf($"timestamp", $"tsRng"))
      .select($"user", $"other", $"direction", $"duration", $"updatedTimestamp".as("timestamp"))
      .as[CallRecord]

    sampledData
  }
}
Example 18
Source File: SparkOutput.scala From cloudflow with Apache License 2.0 | 5 votes |
package swissknife.spark

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

import swissknife.data.Data

class SparkOutput extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val shape = StreamletShape(in)

  override def createLogic() = new SparkStreamletLogic {
    val sparkLocality = context.session.conf.getOption("spark.locality.wait").getOrElse("")
    val feedbackMsg = s"locality=[$sparkLocality]"

    override def buildStreamingQueries = {
      val query = readStream(in)
        // we add this to the output to make it observable from the outside
        .withColumn("payload", lit(feedbackMsg)) // we add this to the output to make it observable from the outside
        .writeStream
        .format("console")
        .option("truncate", "false")
        .start
      query.toQueryExecution
    }
  }
}
Example 19
Source File: SparkCounter.scala From cloudflow with Apache License 2.0 | 5 votes |
package swissknife.spark

import cloudflow.streamlets.{StreamletShape, StringConfigParameter}
import cloudflow.streamlets.avro._
import cloudflow.spark.{SparkStreamlet, SparkStreamletLogic}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

import swissknife.data.Data

class SparkCounter extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.src)
  val shape = StreamletShape(in, out)

  val configurableMessage = StringConfigParameter("configurable-message", "Configurable message.", Some("spark-original"))

  override def configParameters = Vector(configurableMessage)

  override def createLogic() = new SparkStreamletLogic {
    val msg = configurableMessage.value

    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = process(dataset, msg)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }

    private def process(inDataset: Dataset[Data], message: String): Dataset[Data] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withColumn("updated_src", concat($"src", lit("-spark")))
        .withWatermark("ts", "0 seconds")
        .groupBy(window($"ts", "5 seconds"), $"updated_src")
        .agg(max($"count").as("count"))
      query.select($"updated_src".as("src"), $"window.start".as("timestamp"), lit(message).as("payload"), $"count").as[Data]
    }
  }
}
Example 20
Source File: SparkDataGenerator.scala From cloudflow with Apache License 2.0 | 5 votes |
package swissknife.spark

import java.sql.Timestamp

import cloudflow.streamlets.{ IntegerConfigParameter, StreamletShape }
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.functions._
import cloudflow.spark.sql.SQLImplicits._

import swissknife.data.Data

case class Rate(timestamp: Timestamp, value: Long)

class SparkDataGenerator extends SparkStreamlet {

  val out = AvroOutlet[Data]("out", d ⇒ d.src)
  val shape = StreamletShape(out)

  val RecordsPerSecond = IntegerConfigParameter("records-per-second", "Records per second to produce.", Some(1))

  override def configParameters = Vector(RecordsPerSecond)

  override def createLogic() = new SparkStreamletLogic {

    override def buildStreamingQueries =
      writeStream(process, out, OutputMode.Append).toQueryExecution

    private def process: Dataset[Data] = {
      val recordsPerSecond = RecordsPerSecond.value
      session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .load()
        .select(lit("origin").as("src"), $"timestamp", lit("").as("payload"), $"value".as("count"))
        .as[Data]
    }
  }
}
Example 21
Source File: A_1_WindowOperation.scala From wow-spark with MIT License | 5 votes |
package com.sev7e0.wow.structured_streaming

import java.sql.Timestamp

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.OutputMode

object A_1_WindowOperation {

  def main(args: Array[String]): Unit = {

    if (args.length < 3) {
      println(s" Usage: StructuredNetworkWordCountWindowed <hostname> <port>" +
        " <window duration in seconds> [<slide duration in seconds>]")
      System.exit(1)
    }

    val host = args(0)
    val port = args(1).toInt
    val windowSize = args(2).toInt
    val slideSize = if (args.length == 3) windowSize else args(3).toInt
    if (slideSize > windowSize) {
      System.err.println("<slide duration> must be less than or equal to <window duration>")
    }

    val windowDuration = s"$windowSize seconds"
    val slideDuration = s"$slideSize seconds"

    val spark = SparkSession.builder()
      .master("local")
      .appName(A_1_WindowOperation.getClass.getName)
      .getOrCreate()

    val lines = spark.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    import spark.implicits._
    val words = lines.as[(String, Timestamp)]
      .flatMap(line => line._1.split(" ").map(word => (word, line._2))).toDF()

    val windowCount = words
      .groupBy(window($"timestamp", windowDuration, slideDuration), $"word")
      .count()
      .orderBy("window")

    val query = windowCount.writeStream
      .outputMode(OutputMode.Complete())
      .format("console")
      .option("truncate", "false")
      .start()

    query.awaitTermination()
  }
}
Example 22
Source File: ElasticSink.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
package elastic

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
import radio.{SimpleSongAggregation, Song}
import org.elasticsearch.spark.sql.streaming._
import org.elasticsearch.spark.sql._
import org.elasticsearch.spark.sql.streaming.EsSparkSqlStreamingSink

object ElasticSink {
  def writeStream(ds: Dataset[Song]): StreamingQuery = {
    ds //Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark
      .writeStream
      .outputMode(OutputMode.Append) //Only mode for ES
      .format("org.elasticsearch.spark.sql") //es
      .queryName("ElasticSink")
      .start("test/broadcast") //ES index
  }
}
Example 23
Source File: MapGroupsWithState.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
package mapGroupsWithState

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.types.StringType
import spark.SparkHelper
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}
import radio.{ArtistAggregationState, SimpleSongAggregation, SimpleSongAggregationKafka}

object MapGroupsWithState extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  def updateArtistStateWithEvent(state: ArtistAggregationState, artistCount: SimpleSongAggregation) = {
    log.warn("MapGroupsWithState - updateArtistStateWithEvent")
    if (state.artist == artistCount.artist) {
      ArtistAggregationState(state.artist, state.count + artistCount.count)
    } else {
      state
    }
  }

  def updateAcrossEvents(artist: String,
                         inputs: Iterator[SimpleSongAggregation],
                         oldState: GroupState[ArtistAggregationState]): ArtistAggregationState = {

    var state: ArtistAggregationState = if (oldState.exists) oldState.get else ArtistAggregationState(artist, 1L)

    // for every rows, let's count by artist the number of broadcast, instead of counting by artist, title and radio
    for (input <- inputs) {
      state = updateArtistStateWithEvent(state, input)
      oldState.update(state)
    }

    state
  }

  def write(ds: Dataset[SimpleSongAggregationKafka]) = {
    ds.select($"radioCount.title", $"radioCount.artist", $"radioCount.radio", $"radioCount.count")
      .as[SimpleSongAggregation]
      .groupByKey(_.artist)
      .mapGroupsWithState(GroupStateTimeout.NoTimeout)(updateAcrossEvents) //we can control what should be done with the state when no update is received after a timeout.
      .writeStream
      .outputMode(OutputMode.Update())
      .format("console")
      .queryName("mapGroupsWithState - counting artist broadcast")
      .start()
  }
}
Example 24
Source File: SchemaRegistryAvroReader.scala From spark-schema-registry with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.registry.examples

import java.util.UUID

import com.hortonworks.spark.registry.util._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

object SchemaRegistryAvroReader {

  def main(args: Array[String]): Unit = {

    val schemaRegistryUrl = if (args.length > 0) args(0) else "http://localhost:9090/api/v1/"
    val bootstrapServers = if (args.length > 1) args(1) else "localhost:9092"
    val topic = if (args.length > 2) args(2) else "topic1-out"
    val checkpointLocation =
      if (args.length > 3) args(3) else "/tmp/temporary-" + UUID.randomUUID.toString
    val securityProtocol =
      if (args.length > 4) Option(args(4)) else None

    val spark = SparkSession
      .builder
      .appName("SchemaRegistryAvroReader")
      .getOrCreate()

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("subscribe", topic)

    val messages = securityProtocol
      .map(p => reader.option("kafka.security.protocol", p).load())
      .getOrElse(reader.load())

    import spark.implicits._

    // the schema registry client config
    val config = Map[String, Object]("schema.registry.url" -> schemaRegistryUrl)

    // the schema registry config that will be implicitly passed
    implicit val srConfig: SchemaRegistryConfig = SchemaRegistryConfig(config)

    // Read messages from kafka and deserialize.
    // This uses the schema registry schema associated with the topic.
    val df = messages
      .select(from_sr($"value", topic).alias("message"))

    // write the output to console
    // should produce events like {"driverId":14,"truckId":25,"miles":373}
    val query = df
      .writeStream
      .format("console")
      .trigger(Trigger.ProcessingTime(10000))
      .outputMode(OutputMode.Append())
      .start()

    query.awaitTermination()
  }
}
Example 25
Source File: SparkRandomGenDataIngress.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example

import java.sql.Timestamp

import scala.util.Random

import pipelines.streamlets.{ DurationConfigParameter, IntegerConfigParameter, StreamletShape }
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.{ OutputMode, Trigger }
import pipelines.spark.sql.SQLImplicits._

case class Rate(timestamp: Timestamp, value: Long)

class SparkRandomGenDataIngress extends SparkStreamlet {

  val out = AvroOutlet[Data]("out", d ⇒ d.src)
  val shape = StreamletShape(out)

  val RecordsPerSecond = IntegerConfigParameter(
    "records-per-second",
    "Records per second to produce.",
    Some(50))

  val RampUpTime = DurationConfigParameter(
    "ramp-up-time",
    "Time to reach max records per second.",
    Some("0 seconds"))

  override def configParameters = Vector(RecordsPerSecond, RampUpTime)

  override def createLogic() = new SparkStreamletLogic {

    override def buildStreamingQueries = {
      writeStream(process, out, OutputMode.Append).toQueryExecution
    }

    private def process: Dataset[Data] = {

      val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key)
      val rampUpTime = context.streamletConfig.getDuration(RampUpTime.key, java.util.concurrent.TimeUnit.SECONDS)
      println(s"Using rampup time of $rampUpTime seconds")

      val gaugeGen: () ⇒ String = () ⇒ if (Random.nextDouble() < 0.5) "oil" else "gas"

      val rateStream = session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .option("rampUpTime", s"${rampUpTime}s")
        .load()
        .as[Rate]

      rateStream.map {
        case Rate(timestamp, value) ⇒ Data(s"src-${value % 1000}", timestamp.getTime, None, None, gaugeGen(), value)
      }
    }
  }
}
Example 26
Source File: SuicidalMonkeyProcessor.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode

import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet }
import pipelines.spark.sql.SQLImplicits._

class SuicidalMonkeyProcessor extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.key.toString)
  val shape = StreamletShape(in, out)

  val rng = scala.util.Random

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val outStream = process(readStream(in))
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }

    private def process(inDataset: Dataset[Data]): Dataset[Data] = {
      inDataset.mapPartitions { iter ⇒
        // monkey business
        // The logic in this processor causes the current executor to crash with a certain probability.
        // comment out to see the process working
        if (rng.nextDouble() < SequenceSettings.FailureProbability) {
          sys.exit(-1)
        }
        iter
      }
    }
  }
}
Example 27
Source File: SparkSequenceGeneratorIngress.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.streaming.OutputMode

import pipelines.streamlets._
import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet }
import pipelines.spark.sql.SQLImplicits._

class SparkSequenceGeneratorIngress extends SparkStreamlet {

  val out = AvroOutlet[Data]("out", d ⇒ d.key.toString)
  val shape = StreamletShape(out)

  val RecordsPerSecond = IntegerConfigParameter(
    "records-per-second",
    "Records per second to process.",
    Some(50))

  override def configParameters = Vector(RecordsPerSecond)

  override def createLogic() = new SparkStreamletLogic {
    val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key)

    override def buildStreamingQueries = {
      writeStream(process, out, OutputMode.Append).toQueryExecution
    }

    private def process: Dataset[Data] = {
      session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .load()
        .withColumn("key", ($"value" / SequenceSettings.GroupSize).cast(LongType))
        .as[Data]
    }
  }
}
Example 28
Source File: MovingAverageSparklet.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example

import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import pipelines.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

class MovingAverageSparklet extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Agg]("out", _.src)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }

    private def process(inDataset: Dataset[Data]): Dataset[Agg] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withWatermark("ts", "1 minutes")
        .groupBy(window($"ts", "1 minute", "30 seconds"), $"src", $"gauge")
        .agg(avg($"value") as "avg")
      query.select($"src", $"gauge", $"avg" as "value").as[Agg]
    }
  }
}
Example 29
Source File: SparkRandomGenDataIngress.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example

import java.sql.Timestamp

import scala.util.Random

import pipelines.streamlets.{ IntegerConfigParameter, StreamletShape }
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode
import pipelines.spark.sql.SQLImplicits._

case class Rate(timestamp: Timestamp, value: Long)

class SparkRandomGenDataIngress extends SparkStreamlet {

  val out = AvroOutlet[Data]("out", d ⇒ d.src)
  val shape = StreamletShape(out)

  val RecordsPerSecond = IntegerConfigParameter(
    "records-per-second",
    "Records per second to produce.",
    Some(50))

  override def configParameters = Vector(RecordsPerSecond)

  override def createLogic() = new SparkStreamletLogic {

    override def buildStreamingQueries = {
      writeStream(process, out, OutputMode.Append).toQueryExecution
    }

    private def process: Dataset[Data] = {

      val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key)

      val gaugeGen: () ⇒ String = () ⇒ if (Random.nextDouble() < 0.5) "oil" else "gas"

      val rateStream = session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .load()
        .as[Rate]

      rateStream.map {
        case Rate(timestamp, value) ⇒ Data(s"src-${value % 100}", timestamp.getTime, gaugeGen(), Random.nextDouble() * value)
      }
    }
  }
}
Example 30
Source File: SparkConsoleEgress.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example

import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet }
import pipelines.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

class SparkConsoleEgress extends SparkStreamlet {

  val in = AvroInlet[Agg]("in")
  val shape = StreamletShape(in)

  override def createLogic() = new SparkStreamletLogic {
    //tag::docs-checkpointDir-example[]
    override def buildStreamingQueries = {
      readStream(in).writeStream
        .format("console")
        .option("checkpointLocation", context.checkpointDir("console-egress"))
        .outputMode(OutputMode.Append())
        .start()
        .toQueryExecution
    }
    //end::docs-checkpointDir-example[]
  }
}
Example 31
Source File: CallAggregatorConsoleEgress.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.examples.carly.aggregator

import pipelines.streamlets._
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import pipelines.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

import org.apache.log4j.{ Level, Logger }

import pipelines.examples.carly.data._

class CallAggregatorConsoleEgress extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  val in = AvroInlet[AggregatedCallStats]("in")
  val shape = StreamletShape(in)

  override def createLogic = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      readStream(in).writeStream
        .format("console")
        .outputMode(OutputMode.Append())
        .start()
        .toQueryExecution
    }
  }
}
Example 32
Source File: CallStatsAggregator.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.examples.carly.aggregator

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

import pipelines.streamlets._
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.streaming.OutputMode
import pipelines.spark.sql.SQLImplicits._
import org.apache.log4j.{ Level, Logger }

import pipelines.examples.carly.data._

class CallStatsAggregator extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  //tag::docs-schemaAware-example[]
  val in = AvroInlet[CallRecord]("in")
  val out = AvroOutlet[AggregatedCallStats]("out", _.startTime.toString)
  val shape = StreamletShape(in, out)
  //end::docs-schemaAware-example[]

  val GroupByWindow = DurationConfigParameter(
    "group-by-window",
    "Window duration for the moving average computation",
    Some("1 minute"))

  val Watermark = DurationConfigParameter(
    "watermark",
    "Late events watermark duration: how long to wait for late events",
    Some("1 minute"))

  override def configParameters = Vector(GroupByWindow, Watermark)

  override def createLogic = new SparkStreamletLogic {
    val watermark = context.streamletConfig.getDuration(Watermark.key)
    val groupByWindow = context.streamletConfig.getDuration(GroupByWindow.key)

    //tag::docs-aggregationQuery-example[]
    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Update).toQueryExecution
    }

    private def process(inDataset: Dataset[CallRecord]): Dataset[AggregatedCallStats] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withWatermark("ts", s"${watermark.toMillis()} milliseconds")
        .groupBy(window($"ts", s"${groupByWindow.toMillis()} milliseconds"))
        .agg(avg($"duration") as "avgCallDuration", sum($"duration") as "totalCallDuration")
        .withColumn("windowDuration", $"window.end".cast(LongType) - $"window.start".cast(LongType))

      query
        .select($"window.start".cast(LongType) as "startTime", $"windowDuration", $"avgCallDuration", $"totalCallDuration")
        .as[AggregatedCallStats]
    }
    //end::docs-aggregationQuery-example[]
  }
}
Example 33
Source File: CallRecordGeneratorIngress.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.examples.carly.aggregator

import java.sql.Timestamp

import scala.util.Random
import scala.concurrent.duration._

import org.apache.spark.sql.{ Dataset, SparkSession }
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.LongType

import pipelines.streamlets._
import pipelines.streamlets.avro._
import pipelines.spark.sql.SQLImplicits._
import pipelines.examples.carly.data.CallRecord
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.log4j.{ Level, Logger }

case class Rate(timestamp: Timestamp, value: Long)

class CallRecordGeneratorIngress extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  val RecordsPerSecond = IntegerConfigParameter(
    "records-per-second",
    "Records per second to process.",
    Some(50))

  override def configParameters = Vector(RecordsPerSecond)

  val out = AvroOutlet[CallRecord]("out", _.user)
  val shape = StreamletShape(out)

  override def createLogic() = new SparkStreamletLogic {
    val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key)

    override def buildStreamingQueries = {
      val outStream = DataGenerator.mkData(super.session, recordsPerSecond)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }
  }
}

object DataGenerator {
  def mkData(session: SparkSession, recordsPerSecond: Int): Dataset[CallRecord] = {
    // do we need to expose this through configuration?
    val MaxTime = 2.hours.toMillis
    val MaxUsers = 100000
    val TS0 = new java.sql.Timestamp(0)
    val ZeroTimestampProb = 0.05 // error rate

    // Random Data Generator
    val usersUdf = udf(() ⇒ "user-" + Random.nextInt(MaxUsers))
    val directionUdf = udf(() ⇒ if (Random.nextDouble() < 0.5) "incoming" else "outgoing")

    // Time-biased randomized filter - 1/2 hour cycles
    val sinTime: Long ⇒ Double = t ⇒ Math.sin((t / 1000 % 1800) * 1.0 / 1800 * Math.PI)
    val timeBoundFilter: Long ⇒ Double ⇒ Boolean = t ⇒ prob ⇒ (sinTime(t) + 0.5) > prob
    val timeFilterUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ timeBoundFilter(ts.getTime)(rng))

    val zeroTimestampUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ {
      if (rng < ZeroTimestampProb) {
        TS0
      } else {
        ts
      }
    })

    val rateStream = session.readStream
      .format("rate")
      .option("rowsPerSecond", recordsPerSecond)
      .load()
      .as[Rate]

    val randomDataset = rateStream.withColumn("rng", rand()).withColumn("tsRng", rand())

    val sampledData = randomDataset.where(timeFilterUdf($"timestamp", $"rng"))
      .withColumn("user", usersUdf())
      .withColumn("other", usersUdf())
      .withColumn("direction", directionUdf())
      .withColumn("duration", (round(abs(rand()) * MaxTime)).cast(LongType))
      .withColumn("updatedTimestamp", zeroTimestampUdf($"timestamp", $"tsRng"))
      .select($"user", $"other", $"direction", $"duration", $"updatedTimestamp" as "timestamp")
      .as[CallRecord]

    sampledData
  }
}
Example 34
Source File: IdentitySparkProcessor2.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example

import org.apache.spark.sql.streaming.OutputMode

import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import pipelines.spark.sql.SQLImplicits._
import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._

class IdentitySparkProcessor2 extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.src)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      writeStream(readStream(in).map(d ⇒ d.copy(t2 = TimeOps.nowAsOption)), out, OutputMode.Append).toQueryExecution
    }
  }
}
Example 35
Source File: IdentitySparkProcessor1.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example

import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import pipelines.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

class IdentitySparkProcessor1 extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.src)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      writeStream(readStream(in).map(d ⇒ d.copy(t1 = TimeOps.nowAsOption)), out, OutputMode.Append).toQueryExecution
    }
  }
}
Example 36
Source File: IdentitySparkProcessor0.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example

import org.apache.spark.sql.streaming.OutputMode

import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import pipelines.spark.sql.SQLImplicits._
import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._

class IdentitySparkProcessor0 extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.src)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      writeStream(readStream(in).map { d ⇒ Thread.sleep(200); d }, out, OutputMode.Append).toQueryExecution
    }
  }
}
Example 37
Source File: KinesisSink.scala From kinesis-sql with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kinesis

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.streaming.OutputMode

private[kinesis] class KinesisSink(sqlContext: SQLContext,
                                   sinkOptions: Map[String, String],
                                   outputMode: OutputMode)
  extends Sink with Logging {

  @volatile private var latestBatchId = -1L

  override def toString: String = "KinesisSink"

  override def addBatch(batchId: Long, data: DataFrame): Unit = {
    if (batchId <= latestBatchId) {
      logInfo(s"Skipping already committed batch $batchId")
    } else {
      KinesisWriter.write(sqlContext.sparkSession, data.queryExecution, sinkOptions)
      latestBatchId = batchId
    }
  }
}
Example 38
Source File: CurrentPersistenceIdsQuerySourceTest.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.sstreaming

import java.util.UUID
import java.util.concurrent.atomic.AtomicLong

import akka.actor.{ ActorRef, Props }
import akka.persistence.PersistentActor
import akka.testkit.TestProbe
import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark.datasources.SparkImplicits._
import com.github.dnvriend.spark.datasources.person.Person
import org.apache.spark.sql.streaming.{ OutputMode, ProcessingTime }
import org.scalatest.Ignore

import scala.concurrent.ExecutionContext
import scala.concurrent.duration._
import scala.language.implicitConversions

object PersonActor {
  final case class BlogPost(id: Long, text: String)
}

class PersonActor(val persistenceId: String, schedule: Boolean)(implicit ec: ExecutionContext) extends PersistentActor {
  val counter = new AtomicLong()
  def ping() = context.system.scheduler.scheduleOnce(200.millis, self, "persist")
  def randomId: String = UUID.randomUUID.toString
  override val receiveRecover: Receive = PartialFunction.empty
  override val receiveCommand: Receive = {
    case "persist" =>
      persist(Person(counter.incrementAndGet(), s"foo-$randomId", 20)) { _ =>
        sender() ! "ack"
      }
      if (schedule) ping()
  }
  if (schedule) ping()
}

@Ignore
class CurrentPersistenceIdsQuerySourceTest extends TestSpec {
  def withPersistentActor(pid: String = randomId, schedule: Boolean = false)(f: ActorRef => TestProbe => Unit): Unit = {
    val tp = TestProbe()
    val ref = system.actorOf(Props(new PersonActor(pid, schedule)))
    try f(ref)(tp) finally killActors(ref)
  }

  it should "query read journal" in withSparkSession { spark =>
    withPersistentActor() { ref => tp =>
      tp.send(ref, "persist")
      tp.expectMsg("ack")

      val jdbcReadJournal = spark.readStream
        .currentPersistenceIds("jdbc-read-journal")

      jdbcReadJournal.printSchema()

      println("Is the query streaming: " + jdbcReadJournal.isStreaming)
      println("Are there any streaming queries? " + spark.streams.active.isEmpty)

      val query = jdbcReadJournal
        .writeStream
        .format("console")
        .trigger(ProcessingTime(1.seconds))
        .queryName("consoleStream")
        .outputMode(OutputMode.Append())
        .start()

      query.awaitTermination(10.seconds)
    }
  }
}
Example 39
Source File: QueryCsvTest.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.sstreaming

import com.github.dnvriend.TestSpec
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.streaming.{ OutputMode, ProcessingTime }
import org.apache.spark.sql.types._
import org.scalatest.Ignore

import scala.concurrent.duration._
import scala.language.implicitConversions

@Ignore
class QueryCsvTest extends TestSpec {
  def copyFiles(nrTimes: Int = 10): Unit = {
    FileUtils.deleteDirectory("/tmp/csv")
    FileUtils.forceMkdir("/tmp/csv")
    (1 to nrTimes).foreach { x =>
      FileUtils.copyFile(TestSpec.PeopleCsv, s"/tmp/csv/people-$x")
    }
  }

  val schema: StructType = StructType(Array(
    StructField("id", LongType, nullable = false),
    StructField("name", StringType, nullable = true),
    StructField("age", IntegerType, nullable = true)
  ))

  it should "query csv file" in withSparkSession { spark =>
    copyFiles()

    val csv = spark.readStream
      .schema(schema)
      .format("csv")
      .option("maxFilesPerTrigger", 1)
      .option("header", "false") // Use first line of all files as header
      .option("inferSchema", "false") // Automatically infer data types
      .option("delimiter", ";")
      .load("/tmp/csv")

    csv.printSchema()

    println("Is the query streaming: " + csv.isStreaming)
    println("Are there any streaming queries? " + spark.streams.active.isEmpty)

    val query = csv
      .writeStream
      .format("console")
      .trigger(ProcessingTime(5.seconds))
      .queryName("consoleStream")
      .outputMode(OutputMode.Append())
      .start()

    // waiting for data
    sleep(3.seconds)

    spark.streams
      .active
      .foreach(println)

    spark.streams
      .active
      .foreach(_.explain(extended = true))

    query.awaitTermination(20.seconds)
  }
}
Example 40
Source File: CurrentEventsByPersistenceIdQueryTest.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.sstreaming

import akka.actor.{ ActorRef, Props }
import akka.testkit.TestProbe
import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark.datasources.SparkImplicits._
import com.github.dnvriend.spark.mapper.PersonEventMapper
import org.apache.spark.sql.streaming.{ OutputMode, ProcessingTime }
import org.apache.spark.sql.functions._
import org.scalatest.Ignore

import scala.concurrent.duration._

@Ignore
class CurrentEventsByPersistenceIdQueryTest extends TestSpec {
  def withPersistentActor(pid: String = randomId, schedule: Boolean = false)(f: ActorRef => TestProbe => Unit): Unit = {
    val tp = TestProbe()
    val ref = system.actorOf(Props(new PersonActor(pid, schedule)))
    try f(ref)(tp) finally killActors(ref)
  }

  it should "read events for pid" in withSparkSession { spark =>
    import spark.implicits._
    withPersistentActor("person", schedule = true) { ref => tp =>

      tp.send(ref, "persist")
      tp.expectMsg("ack")

      val jdbcReadJournal = spark.readStream
        .schema(PersonEventMapper.schema)
        .option("pid", "person")
        .option("event-mapper", "com.github.dnvriend.spark.mapper.PersonEventMapper")
        .eventsByPersistenceId("jdbc-read-journal")

      jdbcReadJournal.printSchema()

      // val numOfEvents = jdbcReadJournal
      //   .groupBy('persistence_id)
      //   .agg(count('sequence_number).as("number_of_events"))

      val query = jdbcReadJournal
        .writeStream
        .format("console")
        .trigger(ProcessingTime(1.seconds))
        .queryName("consoleStream")
        // .outputMode(OutputMode.Complete())
        .outputMode(OutputMode.Append())
        .start()

      query.awaitTermination(20.seconds)
    }
  }
}
Example 41
Source File: KuduSinkProvider.scala From kafka-examples with Apache License 2.0 | 5 votes |
package com.cloudera.streaming.refapp.kudu import org.apache.spark.sql.SQLContext import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider} import org.apache.spark.sql.streaming.OutputMode class KuduSinkProvider extends StreamSinkProvider with DataSourceRegister { override def createSink(sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { require(outputMode == OutputMode.Update, "only 'update' OutputMode is supported") KuduSink.withDefaultContext(sqlContext, parameters) } override def shortName(): String = "kudu" }
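Since the provider registers the short name "kudu" and rejects anything other than OutputMode.Update, wiring it into a streaming query would look roughly like the sketch below; the kudu.master and kudu.table option keys are assumptions for illustration, not taken from the example above.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}

object KuduSinkUsage {
  // Sketch only: option keys and values below are illustrative placeholders.
  def startKuduQuery(updates: DataFrame): StreamingQuery =
    updates.writeStream
      .format("kudu")                                   // resolved via shortName()
      .outputMode(OutputMode.Update())                  // anything else fails the require(...)
      .option("kudu.master", "kudu-master:7051")        // assumed option key
      .option("kudu.table", "impala::default.events")   // assumed option key
      .option("checkpointLocation", "/tmp/checkpoints/kudu")
      .start()
}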
Example 42
Source File: Memory.scala From kafka-examples with Apache License 2.0 | 5 votes |
package com.cloudera.streaming.refapp import org.apache.spark.sql.streaming.{DataStreamWriter, OutputMode} import org.apache.spark.sql.{DataFrame, Row} object Memory { def memorySink(sinkName: String) = new Sink { override def createDataStreamWriter(df: DataFrame): DataStreamWriter[Row] = df .writeStream .outputMode(OutputMode.Append) .queryName(name) .format("memory") override val name: String = sinkName } }
Example 44
Source File: StreamingTestHelper.scala From spark-acid with Apache License 2.0 | 5 votes |
package com.qubole.spark.hiveacid.streaming import java.io.{File, IOException} import java.util.UUID import com.qubole.spark.hiveacid.TestHelper import org.apache.spark.network.util.JavaUtils import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery} import org.scalatest.concurrent.TimeLimits import org.scalatest.time.SpanSugar class StreamingTestHelper extends TestHelper with TimeLimits { import StreamingTestHelper._ def runStreaming(tableName: String, outputMode: OutputMode, cols: Seq[String], inputRange: Range, options: List[(String, String)] = List.empty): Unit = { val inputData = MemoryStream[Int] val ds = inputData.toDS() val checkpointDir = createCheckpointDir(namePrefix = "stream.checkpoint").getCanonicalPath var query: StreamingQuery = null try { // Starting streaming query val writerDf = ds.map(i => (i*100, i*10, i)) .toDF(cols:_*) .writeStream .format("HiveAcid") .option("table", tableName) .outputMode(outputMode) .option("checkpointLocation", checkpointDir) //.start() query = options.map { option => writerDf.option(option._1, option._2) }.lastOption.getOrElse(writerDf).start() // Adding data for streaming query inputData.addData(inputRange) failAfter(STREAMING_TIMEOUT) { query.processAllAvailable() } } finally { if (query != null) { // Terminating streaming query query.stop() deleteCheckpointDir(checkpointDir) } } } def deleteCheckpointDir(fileStr: String): Unit = { val file = new File(fileStr) if (file != null) { JavaUtils.deleteRecursively(file) } } def createCheckpointDir(root: String = System.getProperty("java.io.tmpdir"), namePrefix: String = "spark"): File = { var attempts = 0 val maxAttempts = MAX_DIR_CREATION_ATTEMPTS var dir: File = null while (dir == null) { attempts += 1 if (attempts > maxAttempts) { throw new IOException("Failed to create a temp directory (under " + root + ") after " + maxAttempts + " attempts!") } try { dir = new File(root, namePrefix + "-" + UUID.randomUUID.toString) if (dir.exists() || !dir.mkdirs()) { dir = null } } catch { case e: SecurityException => dir = null; } } dir.getCanonicalFile } } object StreamingTestHelper extends TestHelper with SpanSugar { val MAX_DIR_CREATION_ATTEMPTS = 10 val STREAMING_TIMEOUT = 60.seconds }
Example 45
Source File: HiveAcidSinkOptionsSuite.scala From spark-acid with Apache License 2.0 | 5 votes |
package com.qubole.spark.hiveacid.streaming import java.util.Locale import com.qubole.spark.hiveacid.Table import org.apache.spark.sql.streaming.OutputMode class HiveAcidSinkOptionsSuite extends HiveAcidStreamingFunSuite { import HiveAcidSinkOptions._ test("bad sink options") { def testBadOptions(options: List[(String, String)])(expectedMsg: String): Unit = { val tableName = "tempTable" val tType = Table.orcFullACIDTable val cols = Map( ("value1","int"), ("value2", "int") ) val tableHive = new Table(DEFAULT_DBNAME, tableName, cols, tType, false) // creating table helper.recreate(tableHive) val errorMessage = intercept[IllegalArgumentException] { helper.runStreaming( tableHive.hiveTname, OutputMode.Append(), tableHive.getColMap.keys.toSeq, Range(1, 4), options) }.getMessage assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) } testBadOptions(List(CLEANUP_DELAY_KEY -> "-2"))("Invalid value '-2' " + s"for option '$CLEANUP_DELAY_KEY', must be a positive integer") testBadOptions(List(COMPACT_INTERVAL_KEY -> "-5"))("Invalid value '-5' " + s"for option '$COMPACT_INTERVAL_KEY', must be a positive integer") testBadOptions(List(MIN_BATCHES_TO_RETAIN_KEY -> "-5"))("Invalid value '-5' " + s"for option '$MIN_BATCHES_TO_RETAIN_KEY', must be a positive integer") testBadOptions(List(LOG_DELETION_KEY -> "x"))("Invalid value 'x' " + s"for option '$LOG_DELETION_KEY', must be true or false") } }
Example 46
Source File: HiveAcidDataSource.scala From spark-acid with Apache License 2.0 | 5 votes |
package com.qubole.spark.hiveacid.datasource import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable} import com.qubole.spark.hiveacid.streaming.HiveAcidSink import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.sources._ import org.apache.spark.sql.streaming.OutputMode class HiveAcidDataSource extends RelationProvider // USING HiveAcid with CreatableRelationProvider // Insert into/overwrite with DataSourceRegister // FORMAT("HiveAcid") with StreamSinkProvider with Logging { // returns relation for passed in table name override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { HiveAcidRelation(sqlContext.sparkSession, getFullyQualifiedTableName(parameters), parameters) } // returns relation after writing passed in data frame. Table name is part of parameter override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { val hiveAcidTable: HiveAcidTable = HiveAcidTable.fromSparkSession( sqlContext.sparkSession, getFullyQualifiedTableName(parameters), parameters) mode match { case SaveMode.Overwrite => hiveAcidTable.insertOverwrite(df) case SaveMode.Append => hiveAcidTable.insertInto(df) // TODO: Add support for these case SaveMode.ErrorIfExists | SaveMode.Ignore => HiveAcidErrors.unsupportedSaveMode(mode) } createRelation(sqlContext, parameters) } override def shortName(): String = { HiveAcidDataSource.NAME } override def createSink(sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { tableSinkAssertions(partitionColumns, outputMode) new HiveAcidSink(sqlContext.sparkSession, parameters) } private def tableSinkAssertions(partitionColumns: Seq[String], outputMode: OutputMode): Unit = { if (partitionColumns.nonEmpty) { throw HiveAcidErrors.unsupportedFunction("partitionBy", "HiveAcidSink") } if (outputMode != OutputMode.Append) { throw HiveAcidErrors.unsupportedStreamingOutputMode(s"$outputMode") } } private def getFullyQualifiedTableName(parameters: Map[String, String]): String = { parameters.getOrElse("table", { throw HiveAcidErrors.tableNotSpecifiedException() }) } } object HiveAcidDataSource { val NAME = "HiveAcid" }
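Reading tableSinkAssertions together with getFullyQualifiedTableName, a streaming write through this source has to use the "HiveAcid" format, name the target table, skip partitionBy, and stay in Append mode. A minimal sketch under those constraints (table name and checkpoint path are placeholders):

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}

object HiveAcidSinkUsage {
  // Sketch only: table name and checkpoint path are placeholders.
  def startAcidWrite(events: DataFrame): StreamingQuery =
    events.writeStream
      .format("HiveAcid")                          // HiveAcidDataSource.NAME
      .option("table", "default.acid_events")      // required, else tableNotSpecifiedException
      .outputMode(OutputMode.Append())             // only Append passes tableSinkAssertions
      .option("checkpointLocation", "/tmp/checkpoints/hiveacid")
      .start()
}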
Example 47
Source File: TestSparkStreamletContext.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.spark package testkit import java.nio.file.attribute.FileAttribute import com.typesafe.config._ import scala.reflect.runtime.universe._ import scala.concurrent.duration._ import org.apache.spark.sql.{ Dataset, Encoder, SparkSession } import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.streaming.{ OutputMode, StreamingQuery, Trigger } import cloudflow.streamlets._ import org.apache.spark.sql.catalyst.InternalRow class TestSparkStreamletContext(override val streamletRef: String, session: SparkSession, inletTaps: Seq[SparkInletTap[_]], outletTaps: Seq[SparkOutletTap[_]], override val config: Config = ConfigFactory.empty) extends SparkStreamletContext(StreamletDefinition("appId", "appVersion", streamletRef, "streamletClass", List(), List(), config), session) { val ProcessingTimeInterval = 1500.milliseconds override def readStream[In](inPort: CodecInlet[In])(implicit encoder: Encoder[In], typeTag: TypeTag[In]): Dataset[In] = inletTaps .find(_.portName == inPort.name) .map(_.instream.asInstanceOf[MemoryStream[In]].toDF.as[In]) .getOrElse(throw TestContextException(inPort.name, s"Bad test context, could not find source for inlet ${inPort.name}")) override def writeStream[Out](stream: Dataset[Out], outPort: CodecOutlet[Out], outputMode: OutputMode)(implicit encoder: Encoder[Out], typeTag: TypeTag[Out]): StreamingQuery = { // RateSource can only work with a microBatch query because it contains no data at time zero. // Trigger.Once requires data at start to work. val trigger = if (isRateSource(stream)) { Trigger.ProcessingTime(ProcessingTimeInterval) } else { Trigger.Once() } val streamingQuery = outletTaps .find(_.portName == outPort.name) .map { outletTap ⇒ stream.writeStream .outputMode(outputMode) .format("memory") .trigger(trigger) .queryName(outletTap.queryName) .start() } .getOrElse(throw TestContextException(outPort.name, s"Bad test context, could not find destination for outlet ${outPort.name}")) streamingQuery } override def checkpointDir(dirName: String): String = { val fileAttibutes: Array[FileAttribute[_]] = Array() val tmpDir = java.nio.file.Files.createTempDirectory("spark-test", fileAttibutes: _*) tmpDir.toFile.getAbsolutePath } private def isRateSource(stream: Dataset[_]): Boolean = { import org.apache.spark.sql.execution.command.ExplainCommand val explain = ExplainCommand(stream.queryExecution.logical, true) val res = session.sessionState.executePlan(explain).executedPlan.executeCollect() res.exists((row: InternalRow) => row.getString(0).contains("org.apache.spark.sql.execution.streaming.sources.RateStreamProvider")) } } case class TestContextException(portName: String, msg: String) extends RuntimeException(msg)
Example 48
Source File: InternalOutputModes.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.streaming

import java.util.Locale

import org.apache.spark.sql.streaming.OutputMode

object InternalOutputModes {

  case object Append extends OutputMode

  case object Complete extends OutputMode

  case object Update extends OutputMode

  def apply(outputMode: String): OutputMode = {
    outputMode.toLowerCase(Locale.ROOT) match {
      case "append" =>
        OutputMode.Append
      case "complete" =>
        OutputMode.Complete
      case "update" =>
        OutputMode.Update
      case _ =>
        throw new IllegalArgumentException(s"Unknown output mode $outputMode. " +
          "Accepted output modes are 'append', 'complete', 'update'")
    }
  }
}
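The mapping is case-insensitive and anything outside append/complete/update fails fast, which is exactly what InternalOutputModesSuite below exercises. A small self-contained illustration (kept in the same package because the object is Spark-internal):

package org.apache.spark.sql.catalyst.streaming

import org.apache.spark.sql.streaming.OutputMode

// Demo of the string-to-OutputMode mapping shown above.
object InternalOutputModesDemo extends App {
  // Case-insensitive lookups resolve to the public OutputMode singletons.
  assert(InternalOutputModes("Append") == OutputMode.Append)
  assert(InternalOutputModes("UPDATE") == OutputMode.Update)
  assert(InternalOutputModes("complete") == OutputMode.Complete)

  // Unknown names fail fast with an IllegalArgumentException.
  try InternalOutputModes("sliding")
  catch { case e: IllegalArgumentException => println(e.getMessage) }
}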
Example 49
Source File: SchemaJsonExample.scala From spark-schema-registry with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.registry.examples import java.util.UUID import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.{from_json, struct, to_json} import org.apache.spark.sql.streaming.{OutputMode, Trigger} import org.apache.spark.sql.types._ object SchemaJsonExample { def main(args: Array[String]): Unit = { val bootstrapServers = if (args.length > 0) args(0) else "localhost:9092" val topic = if (args.length > 1) args(1) else "topic1" val outTopic = if (args.length > 2) args(2) else "topic1-out" val checkpointLocation = if (args.length > 3) args(3) else "/tmp/temporary-" + UUID.randomUUID.toString val spark = SparkSession .builder .appName("SchemaExample") .getOrCreate() val messages = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServers) .option("subscribe", topic) .load() import spark.implicits._ // the schema for truck events val schema = StructType(Seq( StructField("driverId", IntegerType, nullable = false), StructField("truckId", IntegerType, nullable = false), StructField("eventTime", StringType, nullable = false), StructField("eventType", StringType, nullable = false), StructField("longitude", DoubleType, nullable = false), StructField("latitude", DoubleType, nullable = false), StructField("eventKey", StringType, nullable = false), StructField("correlationId", StringType, nullable = false), StructField("driverName", StringType, nullable = false), StructField("routeId", IntegerType, nullable = false), StructField("routeName", StringType, nullable = false), StructField("eventDate", StringType, nullable = false), StructField("miles", IntegerType, nullable = false) )) // read messages from kafka and parse it using the above schema val df = messages .select(from_json($"value".cast("string"), schema).alias("value")) // project (driverId, truckId, miles) for the events where miles > 300 val filtered = df.select($"value.driverId", $"value.truckId", $"value.miles") .where("value.miles > 300") // write the output to a kafka topic serialized as a JSON string. // should produce events like {"driverId":14,"truckId":25,"miles":373} val query = filtered .select(to_json(struct($"*")).alias("value")) .writeStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServers) .option("topic", outTopic) .option("checkpointLocation", checkpointLocation) .trigger(Trigger.ProcessingTime(10000)) .outputMode(OutputMode.Append()) .start() query.awaitTermination() } }
Example 50
Source File: CustomSink.scala From spark-structured-streaming-ml with Apache License 2.0 | 5 votes |
package com.highperformancespark.examples.structuredstreaming

import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql._
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
import org.apache.spark.sql.execution.streaming.Sink

//tag::basicSink[]
// The enclosing class declaration was dropped from the scraped snippet; a minimal
// BasicSink is reconstructed here so the example compiles. The BasicSinkProvider
// referenced below lives in the same file in the original project.
class BasicSink extends Sink {
  override def addBatch(batchId: Long, data: DataFrame) = {
    val batchDistinctCount = data.rdd.distinct.count()
    println(s"Batch ${batchId}'s distinct count is ${batchDistinctCount}")
  }
}
//end::basicSink[]

object CustomSinkDemo {
  def write(ds: Dataset[_]) = {
    //tag::customSinkDemo[]
    ds.writeStream.format(
      "com.highperformancespark.examples.structuredstreaming." +
        "BasicSinkProvider")
      .queryName("customSinkDemo")
      .start()
    //end::customSinkDemo[]
  }
}
Example 51
Source File: KustoSinkProvider.scala From azure-kusto-spark with Apache License 2.0 | 5 votes |
package com.microsoft.kusto.spark.datasink import com.microsoft.kusto.spark.utils.{KeyVaultUtils, KustoDataSourceUtils} import org.apache.spark.sql.SQLContext import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider} import org.apache.spark.sql.streaming.OutputMode class KustoSinkProvider extends StreamSinkProvider with DataSourceRegister { override def shortName(): String = "KustoSink" override def createSink(sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { val sinkParameters = KustoDataSourceUtils.parseSinkParameters(parameters) new KustoSink( sqlContext, sinkParameters.sourceParametersResults.kustoCoordinates, if(sinkParameters.sourceParametersResults.keyVaultAuth.isDefined){ val paramsFromKeyVault = KeyVaultUtils.getAadAppParametersFromKeyVault(sinkParameters.sourceParametersResults.keyVaultAuth.get) KustoDataSourceUtils.mergeKeyVaultAndOptionsAuthentication(paramsFromKeyVault, Some(sinkParameters.sourceParametersResults.authenticationParameters)) } else sinkParameters.sourceParametersResults.authenticationParameters, sinkParameters.writeOptions ) } }
Example 52
Source File: StructuredStreamingWordCount.scala From structured-streaming-application with Apache License 2.0 | 5 votes |
package knolx.spark

import com.datastax.driver.core.Cluster
import knolx.Config._
import knolx.KnolXLogger
import knolx.spark.CassandraForeachWriter.writeToCassandra
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StringType

object StructuredStreamingWordCount extends App with KnolXLogger {
  val cluster = Cluster.builder.addContactPoints(cassandraHosts).build
  val session = cluster.newSession()

  info("Creating Keyspace and tables in Cassandra...")
  session.execute(s"CREATE KEYSPACE IF NOT EXISTS $keyspace WITH " +
    "replication = {'class':'SimpleStrategy','replication_factor':1};")
  session.execute(s"CREATE TABLE IF NOT EXISTS $keyspace.wordcount ( word text PRIMARY KEY,count int );")

  info("Closing DB connection...")
  session.close()
  session.getCluster.close()

  info("Creating Spark Session")
  val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  info("Creating Streaming DF...")
  val dataStream = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", bootstrapServer)
    .option("subscribe", topic)
    .load()

  info("Writing data to Cassandra...")
  val query = dataStream
    .select(col("value").cast(StringType).as("word"), lit(1).as("count"))
    .groupBy(col("word"))
    .agg(sum("count").as("count"))
    .writeStream
    .outputMode(OutputMode.Update())
    .foreach(writeToCassandra)
    .option("checkpointLocation", checkPointDir)
    .start()

  info("Waiting for the query to terminate...")
  query.awaitTermination()
  query.stop()
}
Example 53
Source File: MultiStreamHandler.scala From structured-streaming-application with Apache License 2.0 | 5 votes |
package knolx.spark import knolx.Config._ import knolx.KnolXLogger import org.apache.spark.sql.functions.col import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode} import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{Encoders, SparkSession} case class CurrentPowerConsumption(kwh: Double) case class PowerConsumptionStatus(numOfReadings: Long, total: Double, avg: Double, status: String) { def compute(newReadings: List[Double]) = { val newTotal = newReadings.sum + total val newNumOfReadings = numOfReadings + newReadings.size val newAvg = newTotal / newNumOfReadings.toDouble PowerConsumptionStatus(newNumOfReadings, newTotal, newAvg, "ON") } } object MultiStreamHandler extends App with KnolXLogger { info("Creating Spark Session") val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() spark.sparkContext.setLogLevel("WARN") val updateStateFunc = (deviceId: String, newReadings: Iterator[(String, CurrentPowerConsumption)], state: GroupState[PowerConsumptionStatus]) => { val data = newReadings.toList.map { case(_, reading) => reading }.map(_.kwh) lazy val initialPowerConsumptionStatus = PowerConsumptionStatus(0L, 0D, 0D, "OFF") val currentState = state.getOption.fold(initialPowerConsumptionStatus.compute(data))(_.compute(data)) val currentStatus = if(state.hasTimedOut) { // If we do not receive any reading, for a device, we will assume that it is OFF. currentState.copy(status = "OFF") } else { state.setTimeoutDuration("10 seconds") currentState } state.update(currentStatus) (deviceId, currentStatus) } info("Creating Streaming DF...") val dataStream = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", bootstrapServer) .option("subscribe", topic) .option("failOnDataLoss", false) .option("includeTimestamp", true) .load() info("Writing data to Console...") import spark.implicits._ implicit val currentPowerConsumptionEncoder = Encoders.kryo[CurrentPowerConsumption] implicit val powerConsumptionStatusEncoder = Encoders.kryo[PowerConsumptionStatus] val query = dataStream .select(col("key").cast(StringType).as("key"), col("value").cast(StringType).as("value")) .as[(String, String)] .map { case(deviceId, unit) => (deviceId, CurrentPowerConsumption(Option(unit).fold(0D)(_.toDouble))) } .groupByKey { case(deviceId, _) => deviceId } .mapGroupsWithState[PowerConsumptionStatus, (String, PowerConsumptionStatus)](GroupStateTimeout.ProcessingTimeTimeout())(updateStateFunc) .toDF("deviceId", "current_status") .writeStream .format("console") .option("truncate", false) .outputMode(OutputMode.Update()) .option("checkpointLocation", checkPointDir) .start() info("Waiting for the query to terminate...") query.awaitTermination() query.stop() }
Example 54
Source File: HttpStreamSink.scala From spark-http-stream with BSD 2-Clause "Simplified" License | 5 votes |
package org.apache.spark.sql.execution.streaming.http

import org.apache.spark.internal.Logging
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.sources.StreamSinkProvider
import org.apache.spark.sql.streaming.OutputMode

import Params.map2Params

class HttpStreamSinkProvider extends StreamSinkProvider with DataSourceRegister {
  def createSink(
    sqlContext: SQLContext,
    parameters: Map[String, String],
    partitionColumns: Seq[String],
    outputMode: OutputMode): Sink = {
    new HttpStreamSink(parameters.getRequiredString("httpServletUrl"),
      parameters.getRequiredString("topic"),
      parameters.getInt("maxPacketSize", 10 * 1024 * 1024));
  }

  def shortName(): String = "httpStream"
}

class HttpStreamSink(httpPostURL: String, topic: String, maxPacketSize: Int) extends Sink with Logging {
  val producer = HttpStreamClient.connect(httpPostURL);
  val RETRY_TIMES = 5;
  val SLEEP_TIME = 100;

  override def addBatch(batchId: Long, data: DataFrame) {
    //send data to the HTTP server
    var success = false;
    var retried = 0;
    while (!success && retried < RETRY_TIMES) {
      try {
        retried += 1;
        producer.sendDataFrame(topic, batchId, data, maxPacketSize);
        success = true;
      } catch {
        case e: Throwable ⇒ {
          success = false;
          super.logWarning(s"failed to send", e);
          if (retried < RETRY_TIMES) {
            val sleepTime = SLEEP_TIME * retried;
            super.logWarning(s"will retry to send after ${sleepTime}ms");
            Thread.sleep(sleepTime);
          } else {
            throw e;
          }
        }
      }
    }
  }
}
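The provider above pulls httpServletUrl and topic as required parameters and maxPacketSize as an optional one, so a caller would wire it up roughly as in this sketch; the URL, topic, and checkpoint path are placeholders.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.StreamingQuery

object HttpStreamSinkUsage {
  // Sketch only: URL, topic and checkpoint path are placeholders.
  def startHttpWrite(df: DataFrame): StreamingQuery =
    df.writeStream
      .format("httpStream")                                      // shortName() above
      .option("httpServletUrl", "http://localhost:8080/stream")  // required parameter
      .option("topic", "sensor-readings")                        // required parameter
      .option("maxPacketSize", (10 * 1024 * 1024).toString)      // optional, defaults to 10 MB
      .option("checkpointLocation", "/tmp/checkpoints/http")
      .start()
}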
Example 55
Source File: BlockingSource.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.util import java.util.concurrent.CountDownLatch import org.apache.spark.sql.{SQLContext, _} import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source} import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider} import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{IntegerType, StructField, StructType} class BlockingSource extends StreamSourceProvider with StreamSinkProvider { private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil) override def sourceSchema( spark: SQLContext, schema: Option[StructType], providerName: String, parameters: Map[String, String]): (String, StructType) = { ("dummySource", fakeSchema) } override def createSource( spark: SQLContext, metadataPath: String, schema: Option[StructType], providerName: String, parameters: Map[String, String]): Source = { BlockingSource.latch.await() new Source { override def schema: StructType = fakeSchema override def getOffset: Option[Offset] = Some(new LongOffset(0)) override def getBatch(start: Option[Offset], end: Offset): DataFrame = { import spark.implicits._ Seq[Int]().toDS().toDF() } override def stop() {} } } override def createSink( spark: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { new Sink { override def addBatch(batchId: Long, data: DataFrame): Unit = {} } } } object BlockingSource { var latch: CountDownLatch = null }
Example 56
Source File: MemorySinkV2Suite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.streaming.sources._
import org.apache.spark.sql.streaming.{OutputMode, StreamTest}
import org.apache.spark.sql.types.StructType

class MemorySinkV2Suite extends StreamTest with BeforeAndAfter {
  test("data writer") {
    val partition = 1234
    val writer = new MemoryDataWriter(
      partition, OutputMode.Append(), new StructType().add("i", "int"))
    writer.write(InternalRow(1))
    writer.write(InternalRow(2))
    writer.write(InternalRow(44))
    val msg = writer.commit()
    assert(msg.data.map(_.getInt(0)) == Seq(1, 2, 44))
    assert(msg.partition == partition)

    // Buffer should be cleared, so repeated commits should give empty.
    assert(writer.commit().data.isEmpty)
  }

  test("streaming writer") {
    val sink = new MemorySinkV2
    val writeSupport = new MemoryStreamWriter(
      sink, OutputMode.Append(), new StructType().add("i", "int"))
    writeSupport.commit(0, Array(
      MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))),
      MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))),
      MemoryWriterCommitMessage(2, Seq(Row(6), Row(7)))
    ))
    assert(sink.latestBatchId.contains(0))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7))
    writeSupport.commit(19, Array(
      MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))),
      MemoryWriterCommitMessage(0, Seq(Row(33)))
    ))
    assert(sink.latestBatchId.contains(19))
    assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33))
    assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33))
  }
}
Example 57
Source File: console.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.spark.sql._
import org.apache.spark.sql.execution.streaming.sources.ConsoleWriter
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister}
import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport}
import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType

case class ConsoleRelation(override val sqlContext: SQLContext, data: DataFrame) extends BaseRelation {
  override def schema: StructType = data.schema
}

class ConsoleSinkProvider extends DataSourceV2
  with StreamWriteSupport
  with DataSourceRegister
  with CreatableRelationProvider {

  override def createStreamWriter(
      queryId: String,
      schema: StructType,
      mode: OutputMode,
      options: DataSourceOptions): StreamWriter = {
    new ConsoleWriter(schema, options)
  }

  def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {
    // Number of rows to display, by default 20 rows
    val numRowsToShow = parameters.get("numRows").map(_.toInt).getOrElse(20)

    // Truncate the displayed data if it is too long, by default it is true
    val isTruncated = parameters.get("truncate").map(_.toBoolean).getOrElse(true)

    data.show(numRowsToShow, isTruncated)

    ConsoleRelation(sqlContext, data)
  }

  def shortName(): String = "console"
}
Example 58
Source File: StreamingGlobalLimitExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.util.concurrent.TimeUnit.NANOSECONDS import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.streaming.state.StateStoreOps import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType} import org.apache.spark.util.CompletionIterator case class StreamingGlobalLimitExec( streamLimit: Long, child: SparkPlan, stateInfo: Option[StatefulOperatorStateInfo] = None, outputMode: Option[OutputMode] = None) extends UnaryExecNode with StateStoreWriter { private val keySchema = StructType(Array(StructField("key", NullType))) private val valueSchema = StructType(Array(StructField("value", LongType))) override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver assert(outputMode.isDefined && outputMode.get == InternalOutputModes.Append, "StreamingGlobalLimitExec is only valid for streams in Append output mode") child.execute().mapPartitionsWithStateStore( getStateInfo, keySchema, valueSchema, indexOrdinal = None, sqlContext.sessionState, Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) => val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null))) val numOutputRows = longMetric("numOutputRows") val numUpdatedStateRows = longMetric("numUpdatedStateRows") val allUpdatesTimeMs = longMetric("allUpdatesTimeMs") val commitTimeMs = longMetric("commitTimeMs") val updatesStartTimeNs = System.nanoTime val preBatchRowCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L) var cumulativeRowCount = preBatchRowCount val result = iter.filter { r => val x = cumulativeRowCount < streamLimit if (x) { cumulativeRowCount += 1 } x } CompletionIterator[InternalRow, Iterator[InternalRow]](result, { if (cumulativeRowCount > preBatchRowCount) { numUpdatedStateRows += 1 numOutputRows += cumulativeRowCount - preBatchRowCount store.put(key, getValueRow(cumulativeRowCount)) } allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs) commitTimeMs += timeTakenMs { store.commit() } setStoreMetrics(store) }) } } override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = child.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = AllTuples :: Nil private def getValueRow(value: Long): UnsafeRow = { UnsafeProjection.create(valueSchema)(new GenericInternalRow(Array[Any](value))) } }
Example 59
Source File: InternalOutputModesSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.streaming import java.util.Locale import org.apache.spark.SparkFunSuite import org.apache.spark.sql.streaming.OutputMode class InternalOutputModesSuite extends SparkFunSuite { test("supported strings") { def testMode(outputMode: String, expected: OutputMode): Unit = { assert(InternalOutputModes(outputMode) === expected) } testMode("append", OutputMode.Append) testMode("Append", OutputMode.Append) testMode("complete", OutputMode.Complete) testMode("Complete", OutputMode.Complete) testMode("update", OutputMode.Update) testMode("Update", OutputMode.Update) } test("unsupported strings") { def testMode(outputMode: String): Unit = { val acceptedModes = Seq("append", "update", "complete") val e = intercept[IllegalArgumentException](InternalOutputModes(outputMode)) (Seq("output mode", "unknown", outputMode) ++ acceptedModes).foreach { s => assert(e.getMessage.toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT))) } } testMode("Xyz") } }
Example 60
Source File: EventAggregationSpec.scala From spark-summit-2018 with GNU General Public License v3.0 | 5 votes |
package com.twilio.open.streaming.trend.discovery import java.util import com.twilio.open.protocol.Calls.CallEvent import com.twilio.open.protocol.Metrics import com.twilio.open.streaming.trend.discovery.streams.EventAggregation import org.apache.kafka.common.serialization.{Deserializer, Serializer, StringDeserializer, StringSerializer} import org.apache.spark.sql.streaming.{OutputMode, Trigger} import org.apache.spark.sql._ import org.apache.spark.sql.kafka010.KafkaTestUtils import org.apache.spark.{SparkConf, SparkContext} import org.slf4j.{Logger, LoggerFactory} class EventAggregationSpec extends KafkaBackedTest[String, CallEvent] { override val testUtils = new KafkaTestUtils[String, CallEvent] { override val keySerializer: Serializer[String] = new StringSerializer override val keyDeserializer: Deserializer[String] = new StringDeserializer override val valueSerializer: Serializer[CallEvent] = new CallEventSerializer override val valueDeserializer: Deserializer[CallEvent] = new CallEventDeserializer } override protected val kafkaTopic = "spark.summit.call.events" override protected val partitions = 8 private val pathToTestScenarios = "src/test/resources/scenarios" val log: Logger = LoggerFactory.getLogger(classOf[EventAggregation]) lazy val session: SparkSession = sparkSql override def conf: SparkConf = { new SparkConf() .setMaster("local[*]") .setAppName("aggregation-test-app") .set("spark.ui.enabled", "false") .set("spark.app.id", appID) .set("spark.driver.host", "localhost") .set("spark.sql.shuffle.partitions", "32") .set("spark.executor.cores", "4") .set("spark.executor.memory", "1g") .set("spark.ui.enabled", "false") .setJars(SparkContext.jarOfClass(classOf[EventAggregation]).toList) } test("Should aggregate call events") { import session.implicits._ val appConfig = appConfigForTest() val scenario = TestHelper.loadScenario[CallEvent](s"$pathToTestScenarios/pdd_events.json") val scenarioIter = scenario.toIterator scenario.nonEmpty shouldBe true testUtils.createTopic(kafkaTopic, partitions, overwrite = true) sendNextMessages(scenarioIter, 30, _.getEventId, _.getLoggedEventTime) val trendDiscoveryApp = new TrendDiscoveryApp(appConfigForTest(), session) val eventAggregation = EventAggregation(appConfig) eventAggregation.process(trendDiscoveryApp.readKafkaStream())(session) .writeStream .queryName("calleventaggs") .format("memory") .outputMode(eventAggregation.outputMode) .start() .processAllAvailable() val df = session.sql("select * from calleventaggs") df.printSchema() df.show val res = session .sql("select avg(stats.p99) from calleventaggs") .collect() .map { r => r.getAs[Double](0) } .head DiscoveryUtils.round(res) shouldEqual 7.13 } } class CallEventSerializer extends Serializer[CallEvent] { override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = {} override def serialize(topic: String, data: CallEvent): Array[Byte] = data.toByteArray override def close(): Unit = {} } class CallEventDeserializer extends Deserializer[CallEvent] { override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = {} override def deserialize(topic: String, data: Array[Byte]): CallEvent = CallEvent.parseFrom(data) override def close(): Unit = {} }
Example 61
Source File: StreamingIncrementCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql.execution.command import java.util.Locale import org.apache.spark.SparkException import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.StreamingRelationV2 import org.apache.spark.sql.sources.v2.StreamWriteSupport import org.apache.spark.sql.streaming.{OutputMode, Trigger} import org.apache.spark.sql.xsql.DataSourceManager._ import org.apache.spark.sql.xsql.StreamingSinkType case class StreamingIncrementCommand(plan: LogicalPlan) extends RunnableCommand { private var outputMode: OutputMode = OutputMode.Append // dummy override def output: Seq[AttributeReference] = Seq.empty // dummy override def producedAttributes: AttributeSet = plan.producedAttributes override def run(sparkSession: SparkSession): Seq[Row] = { import StreamingSinkType._ val qe = new QueryExecution(sparkSession, new ConstructedStreaming(plan)) val df = new Dataset(sparkSession, qe, RowEncoder(qe.analyzed.schema)) plan.collectLeaves.head match { case StreamingRelationV2(_, _, extraOptions, _, _) => val source = extraOptions.getOrElse(STREAMING_SINK_TYPE, DEFAULT_STREAMING_SINK) val sinkOptions = extraOptions.filter(_._1.startsWith(STREAMING_SINK_PREFIX)).map { kv => val key = kv._1.substring(STREAMING_SINK_PREFIX.length) (key, kv._2) } StreamingSinkType.withName(source.toUpperCase(Locale.ROOT)) match { case CONSOLE => case TEXT | PARQUET | ORC | JSON | CSV => if (sinkOptions.get(STREAMING_SINK_PATH) == None) { throw new SparkException("Sink type is file, must config path") } case KAFKA => if (sinkOptions.get(STREAMING_SINK_BOOTSTRAP_SERVERS) == None) { throw new SparkException("Sink type is kafka, must config bootstrap servers") } if (sinkOptions.get(STREAMING_SINK_TOPIC) == None) { throw new SparkException("Sink type is kafka, must config kafka topic") } case _ => throw new SparkException( "Sink type is invalid, " + s"select from ${StreamingSinkType.values}") } val ds = DataSource.lookupDataSource(source, sparkSession.sessionState.conf) val disabledSources = sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",") val sink = ds.newInstance() match { case w: StreamWriteSupport if !disabledSources.contains(w.getClass.getCanonicalName) => w case _ => val ds = DataSource( sparkSession, className = source, options = sinkOptions.toMap, partitionColumns = Nil) ds.createSink(InternalOutputModes.Append) } val outputMode = InternalOutputModes( extraOptions.getOrElse(STREAMING_OUTPUT_MODE, DEFAULT_STREAMING_OUTPUT_MODE)) val duration = extraOptions.getOrElse(STREAMING_TRIGGER_DURATION, DEFAULT_STREAMING_TRIGGER_DURATION) val trigger = extraOptions.getOrElse(STREAMING_TRIGGER_TYPE, DEFAULT_STREAMING_TRIGGER_TYPE) match { case STREAMING_MICRO_BATCH_TRIGGER => Trigger.ProcessingTime(duration) case STREAMING_ONCE_TRIGGER => Trigger.Once() case STREAMING_CONTINUOUS_TRIGGER => Trigger.Continuous(duration) } val query = sparkSession.sessionState.streamingQueryManager.startQuery( extraOptions.get("queryName"), extraOptions.get(STREAMING_CHECKPOINT_LOCATION), df, sinkOptions.toMap, sink, 
outputMode, useTempCheckpointLocation = source == DEFAULT_STREAMING_SINK, recoverFromCheckpointLocation = true, trigger = trigger) query.awaitTermination() } // dummy Seq.empty } } case class ConstructedStreaming(child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output }
Example 62
Source File: ConsoleSink.scala From spark-structured-streaming-examples with Apache License 2.0 | 5 votes |
package com.phylosoft.spark.learning.sql.streaming.sink.console import com.phylosoft.spark.learning.sql.streaming.sink.StreamingSink import org.apache.spark.sql.DataFrame import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger} class ConsoleSink(trigger: Trigger = Trigger.Once(), outputMode: OutputMode = OutputMode.Update()) extends StreamingSink { override def writeStream(data: DataFrame): StreamingQuery = { data.writeStream .format("console") .trigger(trigger) .outputMode(outputMode) .option("checkpointLocation", checkpointLocation + "/console") .start() } }
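Because the constructor defaults to Trigger.Once() with OutputMode.Update(), callers only pass arguments when they need different behaviour. A small sketch, assuming the ConsoleSink above is on the classpath and events is a streaming DataFrame:

import com.phylosoft.spark.learning.sql.streaming.sink.console.ConsoleSink
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger}

object ConsoleSinkUsage {
  // Default behaviour: one-shot console dump in Update mode.
  def dumpOnce(events: DataFrame): StreamingQuery =
    new ConsoleSink().writeStream(events)

  // Continuous variant: micro-batch every 5 seconds, appending new rows only.
  def dumpEvery5s(events: DataFrame): StreamingQuery =
    new ConsoleSink(
      trigger = Trigger.ProcessingTime("5 seconds"),
      outputMode = OutputMode.Append()
    ).writeStream(events)
}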
Example 63
Source File: DeltaSink.scala From spark-structured-streaming-examples with Apache License 2.0 | 5 votes |
package com.phylosoft.spark.learning.sql.streaming.sink.delta import com.phylosoft.spark.learning.sql.streaming.sink.StreamingSink import org.apache.spark.sql.DataFrame import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger} class DeltaSink(trigger: Trigger = Trigger.Once(), outputMode: OutputMode = OutputMode.Append()) extends StreamingSink { override def writeStream(data: DataFrame): StreamingQuery = { data.writeStream .format("delta") .trigger(trigger) .outputMode(outputMode) .option("checkpointLocation", checkpointLocation + "/tmp/delta/events") .start("/tmp/delta/events") } }
Example 64
Source File: MemorySink.scala From spark-structured-streaming-examples with Apache License 2.0 | 5 votes |
package com.phylosoft.spark.learning.sql.streaming.sink.memory import com.phylosoft.spark.learning.sql.streaming.sink.StreamingSink import org.apache.spark.sql.DataFrame import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, Trigger} class MemorySink(trigger: Trigger = Trigger.Once(), outputMode: OutputMode = OutputMode.Update()) extends StreamingSink { override def writeStream(data: DataFrame): StreamingQuery = { data.writeStream .format("memory") .trigger(trigger) .outputMode(outputMode) .option("checkpointLocation", checkpointLocation + "/memory") .start() } }
Example 65
Source File: MapGroupsWithStateApp.scala From spark-structured-streaming-examples with Apache License 2.0 | 5 votes |
package com.phylosoft.spark.learning.sql.streaming.operations.stateful import com.phylosoft.spark.learning.sql.streaming.domain.Model.{Event, SessionInfo, SessionUpdate} import com.phylosoft.spark.learning.sql.streaming.monitoring.Monitoring import com.phylosoft.spark.learning.sql.streaming.sink.StreamingSink import com.phylosoft.spark.learning.sql.streaming.sink.console.ConsoleSink import com.phylosoft.spark.learning.sql.streaming.source.rate.UserActionsRateSource import com.phylosoft.spark.learning.{Logger, SparkSessionConfiguration} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode, StreamingQuery, Trigger} object MapGroupsWithStateApp extends App with SparkSessionConfiguration with GroupsWithStateFunction with Monitoring with Logger { val settings = Map("spark.app.name" -> "MapGroupsWithStateApp") spark.streams.addListener(simpleListener) val source = new UserActionsRateSource(spark) val userActions = source.loadUserActions() userActions.printSchema() import spark.implicits._ val events = userActions .withColumnRenamed("userId", "sessionId") .withColumnRenamed("actionTime", "timestamp") .as[Event] events.printSchema() // Sessionize the events. Track number of events, start and end timestamps of session, and // and report session updates. val timeTimeoutMode = "ProcessingTime" val sessionUpdates = timeTimeoutMode match { case "ProcessingTime" => events .groupByKey(event => event.sessionId) .mapGroupsWithState[SessionInfo, SessionUpdate](GroupStateTimeout.ProcessingTimeTimeout) { sessionUpdate } case _ => events .withWatermark("timestamp", "2 seconds") .groupByKey(event => event.sessionId) .mapGroupsWithState[SessionInfo, SessionUpdate](GroupStateTimeout.EventTimeTimeout) { sessionUpdate } } val sessions = sessionUpdates .select($"*") .where("expired == true") sessions.printSchema() // Start running the query that prints the session updates to the console val query = startStreamingSink(sessions, initStreamingSink) query.awaitTermination() private def startStreamingSink[T <: StreamingSink](data: DataFrame, sink: T) : StreamingQuery = { sink.writeStream(data) } private def initStreamingSink: StreamingSink = { import scala.concurrent.duration._ new ConsoleSink(trigger = Trigger.ProcessingTime(2.seconds), outputMode = OutputMode.Append()) } }
Example 66
Source File: DefaultSource.scala From spark-solr with Apache License 2.0 | 5 votes |
package solr import com.lucidworks.spark.{SolrRelation, SolrStreamWriter} import com.lucidworks.spark.util.Constants import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} import org.apache.spark.sql.sources._ import org.apache.spark.sql.streaming.OutputMode class DefaultSource extends RelationProvider with CreatableRelationProvider with StreamSinkProvider with DataSourceRegister { override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { try { new SolrRelation(parameters, sqlContext.sparkSession) } catch { case re: RuntimeException => throw re case e: Exception => throw new RuntimeException(e) } } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { try { // TODO: What to do with the saveMode? val solrRelation: SolrRelation = new SolrRelation(parameters, Some(df), sqlContext.sparkSession) solrRelation.insert(df, overwrite = true) solrRelation } catch { case re: RuntimeException => throw re case e: Exception => throw new RuntimeException(e) } } override def shortName(): String = Constants.SOLR_FORMAT override def createSink( sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { new SolrStreamWriter(sqlContext.sparkSession, parameters, partitionColumns, outputMode) } }
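A streaming write through this source just needs the Solr format plus connection options routed to SolrRelation/SolrStreamWriter. The sketch below assumes Constants.SOLR_FORMAT resolves to "solr" and uses conventional zkhost/collection option keys, which are not shown in this snippet and should be treated as assumptions:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}

object SolrStreamUsage {
  // Sketch only: format name and option keys below are assumptions, not taken from the snippet.
  def startSolrWrite(docs: DataFrame): StreamingQuery =
    docs.writeStream
      .format("solr")                               // assumed value of Constants.SOLR_FORMAT
      .option("zkhost", "zk1:2181,zk2:2181/solr")   // assumed key for the ZooKeeper ensemble
      .option("collection", "events")               // assumed key for the target collection
      .option("checkpointLocation", "/tmp/checkpoints/solr")
      .outputMode(OutputMode.Append())
      .start()
}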
Example 67
Source File: SolrStreamWriter.scala From spark-solr with Apache License 2.0 | 5 votes |
package com.lucidworks.spark import com.lucidworks.spark.util.{SolrQuerySupport, SolrSupport} import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.streaming.OutputMode import com.lucidworks.spark.util.ConfigurationConstants._ import org.apache.spark.sql.types.StructType import scala.collection.mutable class SolrStreamWriter( val sparkSession: SparkSession, parameters: Map[String, String], val partitionColumns: Seq[String], val outputMode: OutputMode)( implicit val solrConf : SolrConf = new SolrConf(parameters)) extends Sink with LazyLogging { require(solrConf.getZkHost.isDefined, s"Parameter ${SOLR_ZK_HOST_PARAM} not defined") require(solrConf.getCollection.isDefined, s"Parameter ${SOLR_COLLECTION_PARAM} not defined") val collection : String = solrConf.getCollection.get val zkhost: String = solrConf.getZkHost.get lazy val solrVersion : String = SolrSupport.getSolrVersion(solrConf.getZkHost.get) lazy val uniqueKey: String = SolrQuerySupport.getUniqueKey(zkhost, collection.split(",")(0)) lazy val dynamicSuffixes: Set[String] = SolrQuerySupport.getFieldTypes( Set.empty, SolrSupport.getSolrBaseUrl(zkhost), SolrSupport.getCachedCloudClient(zkhost), collection, skipDynamicExtensions = false) .keySet .filter(f => f.startsWith("*_") || f.endsWith("_*")) .map(f => if (f.startsWith("*_")) f.substring(1) else f.substring(0, f.length-1)) @volatile private var latestBatchId: Long = -1L val acc: SparkSolrAccumulator = new SparkSolrAccumulator val accName = if (solrConf.getAccumulatorName.isDefined) solrConf.getAccumulatorName.get else "Records Written" sparkSession.sparkContext.register(acc, accName) SparkSolrAccumulatorContext.add(accName, acc.id) override def addBatch(batchId: Long, df: DataFrame): Unit = { if (batchId <= latestBatchId) { logger.info(s"Skipping already processed batch $batchId") } else { val rows = df.collect() if (rows.nonEmpty) { val schema: StructType = df.schema val solrClient = SolrSupport.getCachedCloudClient(zkhost) // build up a list of updates to send to the Solr Schema API val fieldsToAddToSolr = SolrRelation.getFieldsToAdd(schema, solrConf, solrVersion, dynamicSuffixes) if (fieldsToAddToSolr.nonEmpty) { SolrRelation.addFieldsForInsert(fieldsToAddToSolr, collection, solrClient) } val solrDocs = rows.toStream.map(row => SolrRelation.convertRowToSolrInputDocument(row, solrConf, uniqueKey)) acc.add(solrDocs.length.toLong) SolrSupport.sendBatchToSolrWithRetry(zkhost, solrClient, collection, solrDocs, solrConf.commitWithin) logger.info(s"Written ${solrDocs.length} documents to Solr collection $collection from batch $batchId") latestBatchId = batchId } } } }
Example 68
Source File: S2SinkProvider.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.spark.sql.streaming import com.typesafe.config.{Config, ConfigFactory, ConfigRenderOptions} import org.apache.spark.sql.SQLContext import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider} import org.apache.spark.sql.streaming.OutputMode import scala.collection.JavaConversions._ class S2SinkProvider extends StreamSinkProvider with DataSourceRegister with Logger { override def createSink( sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { logger.info(s"S2SinkProvider options : ${parameters}") val jobConf:Config = ConfigFactory.parseMap(parameters).withFallback(ConfigFactory.load()) logger.info(s"S2SinkProvider Configuration : ${jobConf.root().render(ConfigRenderOptions.concise())}") new S2SparkSqlStreamingSink(sqlContext.sparkSession, jobConf) } override def shortName(): String = "s2graph" }
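Every option passed to writeStream is folded into the Typesafe Config handed to S2SparkSqlStreamingSink, so a caller mostly selects the s2graph format and supplies its deployment's connection keys. In the sketch below the db.default.url key is a placeholder, not something defined by this snippet:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}

object S2GraphSinkUsage {
  // Sketch only: the db.default.url key is an illustrative placeholder.
  def startGraphWrite(edges: DataFrame): StreamingQuery =
    edges.writeStream
      .format("s2graph")                          // shortName() above
      .option("db.default.url", "jdbc:mysql://localhost:3306/graph_dev") // placeholder key/value
      .option("checkpointLocation", "/tmp/checkpoints/s2graph")
      .outputMode(OutputMode.Append())
      .start()
}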
Example 69
Source File: console.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
import org.apache.spark.sql.streaming.OutputMode

class ConsoleSink(options: Map[String, String]) extends Sink with Logging {
  // Number of rows to display, by default 20 rows
  private val numRowsToShow = options.get("numRows").map(_.toInt).getOrElse(20)

  // Truncate the displayed data if it is too long, by default it is true
  private val isTruncated = options.get("truncate").map(_.toBoolean).getOrElse(true)

  // Track the batch id
  private var lastBatchId = -1L

  override def addBatch(batchId: Long, data: DataFrame): Unit = synchronized {
    val batchIdStr = if (batchId <= lastBatchId) {
      s"Rerun batch: $batchId"
    } else {
      lastBatchId = batchId
      s"Batch: $batchId"
    }

    // scalastyle:off println
    println("-------------------------------------------")
    println(batchIdStr)
    println("-------------------------------------------")
    // scalastyle:off println
    data.sparkSession.createDataFrame(
      data.sparkSession.sparkContext.parallelize(data.collect()), data.schema)
      .show(numRowsToShow, isTruncated)
  }
}

class ConsoleSinkProvider extends StreamSinkProvider with DataSourceRegister {
  def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new ConsoleSink(parameters)
  }

  def shortName(): String = "console"
}
Example 70
Source File: commands.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.debug._ import org.apache.spark.sql.execution.streaming.IncrementalExecution import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types._ case class ExplainCommand( logicalPlan: LogicalPlan, override val output: Seq[Attribute] = Seq(AttributeReference("plan", StringType, nullable = true)()), extended: Boolean = false, codegen: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sparkSession: SparkSession): Seq[Row] = try { val queryExecution = if (logicalPlan.isStreaming) { // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the // output mode does not matter since there is no `Sink`. new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0) } else { sparkSession.sessionState.executePlan(logicalPlan) } val outputString = if (codegen) { codegenString(queryExecution.executedPlan) } else if (extended) { queryExecution.toString } else { queryExecution.simpleString } Seq(Row(outputString)) } catch { case cause: TreeNodeException[_] => ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) } }
Example 71
Source File: DeltaSink.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.sources

import org.apache.spark.sql.delta._
import org.apache.spark.sql.delta.actions.SetTransaction
import org.apache.spark.sql.delta.metering.DeltaLogging
import org.apache.spark.sql.delta.schema.{ImplicitMetadataOperation, SchemaUtils}
import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.execution.SQLExecution
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric
import org.apache.spark.sql.execution.streaming.{Sink, StreamExecution}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.NullType

class DeltaSink(
    sqlContext: SQLContext,
    path: Path,
    partitionColumns: Seq[String],
    outputMode: OutputMode,
    options: DeltaOptions)
  extends Sink
    with ImplicitMetadataOperation
    with DeltaLogging {

  private val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, path)

  private val sqlConf = sqlContext.sparkSession.sessionState.conf

  override protected val canOverwriteSchema: Boolean =
    outputMode == OutputMode.Complete() && options.canOverwriteSchema

  override protected val canMergeSchema: Boolean = options.canMergeSchema

  override def addBatch(batchId: Long, data: DataFrame): Unit = deltaLog.withNewTransaction { txn =>
    val sc = data.sparkSession.sparkContext
    val metrics = Map[String, SQLMetric](
      "numAddedFiles" -> createMetric(sc, "number of files added"),
      "numRemovedFiles" -> createMetric(sc, "number of files removed")
    )
    val queryId = sqlContext.sparkContext.getLocalProperty(StreamExecution.QUERY_ID_KEY)
    assert(queryId != null)

    if (SchemaUtils.typeExistsRecursively(data.schema)(_.isInstanceOf[NullType])) {
      throw DeltaErrors.streamWriteNullTypeException
    }

    // If the batch reads the same Delta table as this sink is going to write to, then this
    // write has dependencies. Then make sure that this commit set hasDependencies to true
    // by injecting a read on the whole table. This needs to be done explicitly because
    // MicroBatchExecution has already enforced all the data skipping (by forcing the generation
    // of the executed plan) even before the transaction was started.
    val selfScan = data.queryExecution.analyzed.collectFirst {
      case DeltaTable(index) if index.deltaLog.isSameLogAs(txn.deltaLog) => true
    }.nonEmpty
    if (selfScan) {
      txn.readWholeTable()
    }

    // Streaming sinks can't blindly overwrite schema. See Schema Management design doc for details.
    updateMetadata(
      txn,
      data,
      partitionColumns,
      configuration = Map.empty,
      outputMode == OutputMode.Complete())

    val currentVersion = txn.txnVersion(queryId)
    if (currentVersion >= batchId) {
      logInfo(s"Skipping already complete epoch $batchId, in query $queryId")
      return
    }

    val deletedFiles = outputMode match {
      case o if o == OutputMode.Complete() =>
        deltaLog.assertRemovable()
        txn.filterFiles().map(_.remove)
      case _ => Nil
    }
    val newFiles = txn.writeFiles(data, Some(options))
    val setTxn = SetTransaction(queryId, batchId, Some(deltaLog.clock.getTimeMillis())) :: Nil
    val info = DeltaOperations.StreamingUpdate(outputMode, queryId, batchId, options.userMetadata)
    metrics("numRemovedFiles").set(deletedFiles.size)
    metrics("numAddedFiles").set(newFiles.size)
    txn.registerSQLMetrics(sqlContext.sparkSession, metrics)
    txn.commit(setTxn ++ newFiles ++ deletedFiles, info)
    // This is needed to make the SQL metrics visible in the Spark UI
    val executionId = sqlContext.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
    SQLMetrics.postDriverMetricUpdates(
      sqlContext.sparkContext, executionId, metrics.values.toSeq)
  }

  override def toString(): String = s"DeltaSink[$path]"
}
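For orientation, a minimal usage sketch (not part of the Delta sources): it assumes the delta-core artifact is on the classpath and uses the built-in rate source; the checkpoint and table paths are placeholders.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

object DeltaSinkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("delta-sink-sketch").getOrCreate()
    val query = spark.readStream
      .format("rate")                        // built-in test source
      .option("rowsPerSecond", "10")
      .load()
      .writeStream
      .format("delta")                       // assumes delta-core is on the classpath
      .outputMode(OutputMode.Append())
      .option("checkpointLocation", "/tmp/delta/_checkpoints/rate") // placeholder path
      .trigger(Trigger.ProcessingTime("10 seconds"))
      .start("/tmp/delta/rate-table")        // placeholder table path
    query.awaitTermination()
  }
}

Writing through .format("delta") is what routes each micro-batch into a Sink implementation such as the one above.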
Example 72
Source File: BlockingSource.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.util import java.util.concurrent.CountDownLatch import org.apache.spark.sql.{SQLContext, _} import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source} import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider} import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{IntegerType, StructField, StructType} class BlockingSource extends StreamSourceProvider with StreamSinkProvider { private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil) override def sourceSchema( spark: SQLContext, schema: Option[StructType], providerName: String, parameters: Map[String, String]): (String, StructType) = { ("dummySource", fakeSchema) } override def createSource( spark: SQLContext, metadataPath: String, schema: Option[StructType], providerName: String, parameters: Map[String, String]): Source = { BlockingSource.latch.await() new Source { override def schema: StructType = fakeSchema override def getOffset: Option[Offset] = Some(new LongOffset(0)) override def getBatch(start: Option[Offset], end: Offset): DataFrame = { import spark.implicits._ Seq[Int]().toDS().toDF() } override def stop() {} } } override def createSink( spark: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { new Sink { override def addBatch(batchId: Long, data: DataFrame): Unit = {} } } } object BlockingSource { var latch: CountDownLatch = null }
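A hedged sketch of how a test might drive this provider, mirroring the way such blocking sources are typically exercised in Spark's own suites; the class is referenced by its fully qualified name and the checkpoint path is a placeholder.

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.util.BlockingSource

object BlockingSourceSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("blocking-source-sketch").getOrCreate()
    BlockingSource.latch = new CountDownLatch(1)   // must be set before the query starts
    val query = spark.readStream
      .format(classOf[BlockingSource].getName)     // custom StreamSourceProvider by class name
      .load()
      .writeStream
      .format(classOf[BlockingSource].getName)     // the same class also provides the no-op sink
      .option("checkpointLocation", "/tmp/blocking-source-checkpoint") // placeholder
      .start()
    BlockingSource.latch.countDown()               // release createSource so batches can run
    query.processAllAvailable()
    query.stop()
    spark.stop()
  }
}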
Example 73
Source File: MemorySinkV2Suite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.scalatest.BeforeAndAfter import org.apache.spark.sql.Row import org.apache.spark.sql.execution.streaming.sources._ import org.apache.spark.sql.streaming.{OutputMode, StreamTest} class MemorySinkV2Suite extends StreamTest with BeforeAndAfter { test("data writer") { val partition = 1234 val writer = new MemoryDataWriter(partition, OutputMode.Append()) writer.write(Row(1)) writer.write(Row(2)) writer.write(Row(44)) val msg = writer.commit() assert(msg.data.map(_.getInt(0)) == Seq(1, 2, 44)) assert(msg.partition == partition) // Buffer should be cleared, so repeated commits should give empty. assert(writer.commit().data.isEmpty) } test("continuous writer") { val sink = new MemorySinkV2 val writer = new MemoryStreamWriter(sink, OutputMode.Append()) writer.commit(0, Array( MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))), MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))), MemoryWriterCommitMessage(2, Seq(Row(6), Row(7))) )) assert(sink.latestBatchId.contains(0)) assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7)) writer.commit(19, Array( MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))), MemoryWriterCommitMessage(0, Seq(Row(33))) )) assert(sink.latestBatchId.contains(19)) assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33)) assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33)) } test("microbatch writer") { val sink = new MemorySinkV2 new MemoryWriter(sink, 0, OutputMode.Append()).commit( Array( MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))), MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))), MemoryWriterCommitMessage(2, Seq(Row(6), Row(7))) )) assert(sink.latestBatchId.contains(0)) assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7)) new MemoryWriter(sink, 19, OutputMode.Append()).commit( Array( MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))), MemoryWriterCommitMessage(0, Seq(Row(33))) )) assert(sink.latestBatchId.contains(19)) assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33)) assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33)) } }
Example 74
Source File: console.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.sql._ import org.apache.spark.sql.execution.streaming.sources.ConsoleWriter import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType case class ConsoleRelation(override val sqlContext: SQLContext, data: DataFrame) extends BaseRelation { override def schema: StructType = data.schema } class ConsoleSinkProvider extends DataSourceV2 with StreamWriteSupport with DataSourceRegister with CreatableRelationProvider { override def createStreamWriter( queryId: String, schema: StructType, mode: OutputMode, options: DataSourceOptions): StreamWriter = { new ConsoleWriter(schema, options) } def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { // Number of rows to display, by default 20 rows val numRowsToShow = parameters.get("numRows").map(_.toInt).getOrElse(20) // Truncate the displayed data if it is too long, by default it is true val isTruncated = parameters.get("truncate").map(_.toBoolean).getOrElse(true) data.show(numRowsToShow, isTruncated) ConsoleRelation(sqlContext, data) } def shortName(): String = "console" }
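A short usage sketch for the console sink registered above; the rate source is only there to produce data, and numRows/truncate are the same options the provider reads.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.OutputMode

object ConsoleSinkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("console-sink-sketch").getOrCreate()
    val query = spark.readStream
      .format("rate")
      .load()
      .writeStream
      .format("console")          // resolves to ConsoleSinkProvider via DataSourceRegister
      .option("numRows", "5")
      .option("truncate", "false")
      .outputMode(OutputMode.Append())
      .start()
    query.awaitTermination()
  }
}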
Example 75
Source File: InternalOutputModesSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.streaming import java.util.Locale import org.apache.spark.SparkFunSuite import org.apache.spark.sql.streaming.OutputMode class InternalOutputModesSuite extends SparkFunSuite { test("supported strings") { def testMode(outputMode: String, expected: OutputMode): Unit = { assert(InternalOutputModes(outputMode) === expected) } testMode("append", OutputMode.Append) testMode("Append", OutputMode.Append) testMode("complete", OutputMode.Complete) testMode("Complete", OutputMode.Complete) testMode("update", OutputMode.Update) testMode("Update", OutputMode.Update) } test("unsupported strings") { def testMode(outputMode: String): Unit = { val acceptedModes = Seq("append", "update", "complete") val e = intercept[IllegalArgumentException](InternalOutputModes(outputMode)) (Seq("output mode", "unknown", outputMode) ++ acceptedModes).foreach { s => assert(e.getMessage.toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT))) } } testMode("Xyz") } }
Example 76
Source File: InternalOutputModes.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.streaming

import java.util.Locale

import org.apache.spark.sql.streaming.OutputMode

private[sql] object InternalOutputModes {

  case object Append extends OutputMode

  case object Complete extends OutputMode

  case object Update extends OutputMode

  def apply(outputMode: String): OutputMode = {
    outputMode.toLowerCase(Locale.ROOT) match {
      case "append" => OutputMode.Append
      case "complete" => OutputMode.Complete
      case "update" => OutputMode.Update
      case _ =>
        throw new IllegalArgumentException(s"Unknown output mode $outputMode. " +
          "Accepted output modes are 'append', 'complete', 'update'")
    }
  }
}
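As a quick illustration, the string passed to DataStreamWriter.outputMode is resolved through this helper, so the enum and string forms are interchangeable; a minimal sketch with the built-in rate source and the console sink:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.OutputMode

object OutputModeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("output-mode-sketch").getOrCreate()
    import spark.implicits._
    val counts = spark.readStream.format("rate").load()
      .groupBy($"value" % 10 as "bucket")
      .count()
    val query = counts.writeStream
      .format("console")
      .outputMode(OutputMode.Complete())  // equivalent to .outputMode("complete")
      .start()
    query.awaitTermination()
  }
}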
Example 77
Source File: CustomSinkProvider.scala From spark-highcharts with Apache License 2.0 | 5 votes |
package com.knockdata.spark.highcharts import com.knockdata.spark.highcharts.model.Highcharts import org.apache.spark.sql._ import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.sources.StreamSinkProvider import org.apache.spark.sql.streaming.OutputMode class CustomSinkProvider extends StreamSinkProvider { def createSink( sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { new Sink { override def addBatch(batchId: Long, data: DataFrame): Unit = { val chartId = parameters("chartId") val chartParagraphId = parameters("chartParagraphId") println(s"batchId: $batchId, chartId: $chartId, chartParagraphId: $chartParagraphId") // data.show(3) val z = Registry.get(s"$chartId-z").asInstanceOf[ZeppelinContextHolder] val seriesHolder = Registry.get(s"$chartId-seriesHolder").asInstanceOf[SeriesHolder] val outputMode = Registry.get(s"$chartId-outputMode").asInstanceOf[CustomOutputMode] seriesHolder.dataFrame = data val result = seriesHolder.result val (normalSeriesList, drilldownSeriesList) = outputMode.result(result._1, result._2) val chart = new Highcharts(normalSeriesList, seriesHolder.chartId) .drilldown(drilldownSeriesList) val plotData = chart.plotData // val escaped = plotData.replace("%angular", "") // println(s" put $chartParagraphId $escaped") z.put(chartParagraphId, plotData) println(s"run $chartParagraphId") z.run(chartParagraphId) } } } }
Example 78
Source File: CustomOutputMode.scala From spark-highcharts with Apache License 2.0 | 5 votes |
package com.knockdata.spark.highcharts import com.knockdata.spark.highcharts.model.{Drilldown, Series} import org.apache.spark.sql.streaming.OutputMode import org.apache.zeppelin.spark.ZeppelinContext import scala.collection.mutable abstract class CustomOutputMode() extends OutputMode { val values = mutable.Map[String, String]() def put(key: String, value: String): Unit = values.put(key, value) def get(key: String): Option[String] = values.get(key) def apply(key: String): String = values(key) def result(normalSeries: List[Series], drilldownSeries: List[Series]): (List[Series], List[Series]) = (normalSeries, drilldownSeries) // def onFinish(result: String) } class AppendOutputMode(maxPoints: Int) extends CustomOutputMode() { var currentNormalSeries = mutable.Map[String, Series]() var currentDrilldownSeries = mutable.Map[String, Series]() def merge(previous: mutable.Map[String, Series], currentSeriesList: List[Series]): mutable.Map[String, Series] = { val current = mutable.Map[String, Series]() for (series <- currentSeriesList) { current += series.id -> series } // for the existing series, if there are more point need be added for ((key, series) <- previous) { if (current.contains(key)) { // println("\nprevious") // println(series.values.mkString("\n")) // println("\ncurrent") // println(current(key).values.mkString("\n")) current(key).vs = (series.values ::: current(key).values).takeRight(maxPoints) // println("\nvs") // println(current(key).vs.mkString("\n")) } else { current += key -> series } } current } override def result(normalSeries: List[Series], drilldownSeries: List[Series]): (List[Series], List[Series]) = { currentNormalSeries = merge(currentNormalSeries, normalSeries) currentDrilldownSeries = merge(currentDrilldownSeries, drilldownSeries) (currentNormalSeries.values.toList, currentDrilldownSeries.values.toList) } } class CompleteOutputMode() extends CustomOutputMode() { }
Example 79
Source File: BlockingSource.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.util import java.util.concurrent.CountDownLatch import org.apache.spark.sql.{SQLContext, _} import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source} import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider} import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{IntegerType, StructField, StructType} class BlockingSource extends StreamSourceProvider with StreamSinkProvider { private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil) override def sourceSchema( spark: SQLContext, schema: Option[StructType], providerName: String, parameters: Map[String, String]): (String, StructType) = { ("dummySource", fakeSchema) } override def createSource( spark: SQLContext, metadataPath: String, schema: Option[StructType], providerName: String, parameters: Map[String, String]): Source = { BlockingSource.latch.await() new Source { override def schema: StructType = fakeSchema override def getOffset: Option[Offset] = Some(new LongOffset(0)) override def getBatch(start: Option[Offset], end: Offset): DataFrame = { import spark.implicits._ Seq[Int]().toDS().toDF() } override def stop() {} } } override def createSink( spark: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { new Sink { override def addBatch(batchId: Long, data: DataFrame): Unit = {} } } } object BlockingSource { var latch: CountDownLatch = null }
Example 80
Source File: console.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider} import org.apache.spark.sql.streaming.OutputMode class ConsoleSink(options: Map[String, String]) extends Sink with Logging { // Number of rows to display, by default 20 rows private val numRowsToShow = options.get("numRows").map(_.toInt).getOrElse(20) // Truncate the displayed data if it is too long, by default it is true private val isTruncated = options.get("truncate").map(_.toBoolean).getOrElse(true) // Track the batch id private var lastBatchId = -1L override def addBatch(batchId: Long, data: DataFrame): Unit = synchronized { val batchIdStr = if (batchId <= lastBatchId) { s"Rerun batch: $batchId" } else { lastBatchId = batchId s"Batch: $batchId" } // scalastyle:off println println("-------------------------------------------") println(batchIdStr) println("-------------------------------------------") // scalastyle:off println data.sparkSession.createDataFrame( data.sparkSession.sparkContext.parallelize(data.collect()), data.schema) .show(numRowsToShow, isTruncated) } } class ConsoleSinkProvider extends StreamSinkProvider with DataSourceRegister { def createSink( sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { new ConsoleSink(parameters) } def shortName(): String = "console" }
Example 81
Source File: commands.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.debug._ import org.apache.spark.sql.execution.streaming.IncrementalExecution import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types._ case class ExplainCommand( logicalPlan: LogicalPlan, override val output: Seq[Attribute] = Seq(AttributeReference("plan", StringType, nullable = true)()), extended: Boolean = false, codegen: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sparkSession: SparkSession): Seq[Row] = try { val queryExecution = if (logicalPlan.isStreaming) { // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the // output mode does not matter since there is no `Sink`. new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0, 0) } else { sparkSession.sessionState.executePlan(logicalPlan) } val outputString = if (codegen) { codegenString(queryExecution.executedPlan) } else if (extended) { queryExecution.toString } else { queryExecution.simpleString } Seq(Row(outputString)) } catch { case cause: TreeNodeException[_] => ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) } }
Example 82
Source File: commands.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.debug._ import org.apache.spark.sql.execution.streaming.IncrementalExecution import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types._ case class ExplainCommand( logicalPlan: LogicalPlan, override val output: Seq[Attribute] = Seq(AttributeReference("plan", StringType, nullable = true)()), extended: Boolean = false, codegen: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sparkSession: SparkSession): Seq[Row] = try { val queryExecution = if (logicalPlan.isStreaming) { // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the // output mode does not matter since there is no `Sink`. new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0, 0) } else { sparkSession.sessionState.executePlan(logicalPlan) } val outputString = if (codegen) { codegenString(queryExecution.executedPlan) } else if (extended) { queryExecution.toString } else { queryExecution.simpleString } Seq(Row(outputString)) } catch { case cause: TreeNodeException[_] => ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) } }
Example 83
Source File: JdbcSourceProvider.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.bahir.sql.streaming.jdbc import scala.collection.JavaConverters._ import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions import org.apache.spark.sql.sources.DataSourceRegister import org.apache.spark.sql.sources.v2.{DataSourceOptions, StreamWriteSupport} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType class JdbcSourceProvider extends StreamWriteSupport with DataSourceRegister{ override def createStreamWriter(queryId: String, schema: StructType, mode: OutputMode, options: DataSourceOptions): StreamWriter = { val optionMap = options.asMap().asScala.toMap // add this for parameter check. new JDBCOptions(optionMap) new JdbcStreamWriter(schema, optionMap) } // short name 'jdbc' is used for batch, chose a different name for streaming. override def shortName(): String = "streaming-jdbc" }
Example 84
Source File: JdbcSinkDemo.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.bahir.examples.sql.streaming.jdbc import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions import org.apache.spark.sql.streaming.{OutputMode, Trigger} object JdbcSinkDemo { private case class Person(name: String, age: Int) def main(args: Array[String]): Unit = { if (args.length < 4) { // scalastyle:off println System.err.println("Usage: JdbcSinkDemo <jdbcUrl> <tableName> <username> <password>") // scalastyle:on System.exit(1) } val jdbcUrl = args(0) val tableName = args(1) val username = args(2) val password = args(3) val spark = SparkSession .builder() .appName("JdbcSinkDemo") .getOrCreate() // load data source val df = spark.readStream .format("rate") .option("numPartitions", "5") .option("rowsPerSecond", "100") .load() // change input value to a person object. import spark.implicits._ val lines = df.select("value").as[Long].map{ value => Person(s"name_${value}", value.toInt % 30) } lines.printSchema() // write result val query = lines.writeStream .outputMode("append") .format("streaming-jdbc") .outputMode(OutputMode.Append) .option(JDBCOptions.JDBC_URL, jdbcUrl) .option(JDBCOptions.JDBC_TABLE_NAME, tableName) .option(JDBCOptions.JDBC_DRIVER_CLASS, "com.mysql.jdbc.Driver") .option(JDBCOptions.JDBC_BATCH_INSERT_SIZE, "5") .option("user", username) .option("password", password) .trigger(Trigger.ProcessingTime("10 seconds")) .start() query.awaitTermination() } }
Example 85
Source File: MQTTStreamSink.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.bahir.sql.streaming.mqtt import scala.collection.JavaConverters._ import scala.collection.mutable import org.eclipse.paho.client.mqttv3.MqttException import org.apache.spark.SparkEnv import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport} import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.bahir.utils.Logging import org.apache.bahir.utils.Retry class MQTTStreamWriter (schema: StructType, parameters: DataSourceOptions) extends StreamWriter with Logging { override def createWriterFactory(): DataWriterFactory[InternalRow] = { // Skipping client identifier as single batch can be distributed to multiple // Spark worker process. MQTT server does not support two connections // declaring same client ID at given point in time. val params = parameters.asMap().asScala.filterNot( _._1.equalsIgnoreCase("clientId") ) MQTTDataWriterFactory(params) } override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} } case class MQTTDataWriterFactory(config: mutable.Map[String, String]) extends DataWriterFactory[InternalRow] { override def createDataWriter( partitionId: Int, taskId: Long, epochId: Long ): DataWriter[InternalRow] = new MQTTDataWriter(config) } case object MQTTWriterCommitMessage extends WriterCommitMessage class MQTTDataWriter(config: mutable.Map[String, String]) extends DataWriter[InternalRow] { private lazy val publishAttempts: Int = SparkEnv.get.conf.getInt("spark.mqtt.client.publish.attempts", -1) private lazy val publishBackoff: Long = SparkEnv.get.conf.getTimeAsMs("spark.mqtt.client.publish.backoff", "5s") private lazy val (_, _, topic, _, _, qos, _, _, _) = MQTTUtils.parseConfigParams(config.toMap) override def write(record: InternalRow): Unit = { val client = CachedMQTTClient.getOrCreate(config.toMap) val message = record.getBinary(0) Retry(publishAttempts, publishBackoff, classOf[MqttException]) { // In case of errors, retry sending the message. client.publish(topic, message, qos, false) } } override def commit(): WriterCommitMessage = MQTTWriterCommitMessage override def abort(): Unit = {} } case class MQTTRelation(override val sqlContext: SQLContext, data: DataFrame) extends BaseRelation { override def schema: StructType = data.schema } class MQTTStreamSinkProvider extends DataSourceV2 with StreamWriteSupport with DataSourceRegister with CreatableRelationProvider { override def createStreamWriter(queryId: String, schema: StructType, mode: OutputMode, options: DataSourceOptions): StreamWriter = { new MQTTStreamWriter(schema, options) } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { MQTTRelation(sqlContext, data) } override def shortName(): String = "mqtt" }
Example 86
Source File: DefaultSource.scala From spark-bigquery with Apache License 2.0 | 5 votes |
package com.samelamin.spark.bigquery import com.google.cloud.hadoop.io.bigquery.BigQueryStrings import com.samelamin.spark.bigquery.converters.SchemaConverters import com.samelamin.spark.bigquery.streaming.{BigQuerySink, BigQuerySource} import org.apache.spark.sql.SQLContext import org.apache.spark.sql.execution.streaming.{Sink, Source} import org.apache.spark.sql.sources._ import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.spark.sql.sources.RelationProvider class DefaultSource extends StreamSinkProvider with StreamSourceProvider with RelationProvider{ override def createSink(sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { val path = parameters.get("transaction_log").getOrElse("transaction_log") new BigQuerySink(sqlContext.sparkSession, path, parameters) } def getConvertedSchema(sqlContext: SQLContext,options: Map[String, String]): StructType = { val bigqueryClient = BigQueryClient.getInstance(sqlContext) val tableReference = BigQueryStrings.parseTableReference(options.get("tableReferenceSource").get) SchemaConverters.BQToSQLSchema(bigqueryClient.getTableSchema(tableReference)) } override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType], providerName: String, options: Map[String, String]): (String, StructType) = { val convertedSchema = getConvertedSchema(sqlContext,options) ("bigquery", schema.getOrElse(convertedSchema)) } override def createSource(sqlContext: SQLContext, metadataPath: String, schema: Option[StructType], providerName: String, parameters: Map[String, String]): Source = { new BigQuerySource(sqlContext, schema, parameters) } override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BigQueryRelation = { val tableName = parameters.get("tableReferenceSource").get new BigQueryRelation(tableName)(sqlContext) } }
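A hedged read-side sketch for this provider: tableReferenceSource is the only option taken from the code above, the table reference value and checkpoint path are placeholders, and any credentials or project configuration the connector needs are omitted.

import com.samelamin.spark.bigquery.DefaultSource
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.OutputMode

object BigQueryStreamSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("bigquery-stream-sketch").getOrCreate()
    val stream = spark.readStream
      .format(classOf[DefaultSource].getName)                           // provider shown above
      .option("tableReferenceSource", "my-project:my_dataset.my_table") // placeholder table reference
      .load()
    val query = stream.writeStream
      .format("console")
      .outputMode(OutputMode.Append())
      .option("checkpointLocation", "/tmp/bigquery-source-checkpoint")  // placeholder
      .start()
    query.awaitTermination()
  }
}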
Example 87
Source File: EnrichmentInAStream.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import com.malaska.spark.training.streaming.{Message, MessageBuilder} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions._ import org.apache.spark.sql.streaming.OutputMode object EnrichmentInAStream { def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) var checkpointDir = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .option("checkpointLocation", checkpointDir) .load() val messageDs = socketLines.as[String].map(line => { MessageBuilder.build(line) }).as[Message] val upperMessageDs = messageDs.map(message => { message.toString.toUpperCase() }).as[String] upperMessageDs.foreachPartition(messageIt => { //make connection to storage layer // May use static connection messageIt.foreach(message => { //write to storage location }) }) val messageOutput = upperMessageDs.writeStream.outputMode(OutputMode.Complete()) .start() messageOutput.awaitTermination() } }
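Note that foreachPartition, as used above, is an action and fails on a streaming Dataset with an AnalysisException (streaming queries must be started via writeStream), and Complete mode requires an aggregation. A sketch of the more idiomatic ForeachWriter route, with the storage calls left as placeholder comments:

import org.apache.spark.sql.{ForeachWriter, SparkSession}
import org.apache.spark.sql.streaming.OutputMode

object EnrichmentWithForeachWriter {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("enrichment-foreach-writer").getOrCreate()
    import spark.implicits._
    val upper = spark.readStream
      .format("socket").option("host", "localhost").option("port", "9999").load()
      .as[String]
      .map(_.toUpperCase())
    val query = upper.writeStream
      .outputMode(OutputMode.Append())        // Append, not Complete: there is no aggregation here
      .foreach(new ForeachWriter[String] {
        override def open(partitionId: Long, version: Long): Boolean = {
          // open the connection to the storage layer here
          true
        }
        override def process(value: String): Unit = {
          // write one record to the storage layer
        }
        override def close(errorOrNull: Throwable): Unit = {
          // close the connection
        }
      })
      .start()
    query.awaitTermination()
  }
}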
Example 88
Source File: CountingInAStreamExpWindowing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import com.malaska.spark.training.streaming.{Message, MessageBuilder} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.streaming.FileStreamSource.Timestamp import org.apache.spark.sql.functions._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.streaming.{OutputMode, Trigger} object CountingInAStreamExpWindowing { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[5]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[5]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .option("includeTimestamp", true) .load() val messageDsDStream = socketLines.as[(String, Timestamp)].map(line => { MessageBuilder.build(line._1, line._2) }).filter(r => r != null).as[Message] val tickerCount = messageDsDStream.withColumn("eventTime", $"tradeTs".cast("timestamp")) .withWatermark("eventTime", "30 seconds") .groupBy(window($"eventTime", "30 seconds", "5 seconds"), $"ticker") .agg(max($"tradeTs") as "max_time", sum($"price") as "total_price", avg($"price") as "avg_price", count($"price") as "number_of_trades")//.orderBy("window") val ticketOutput = tickerCount.writeStream .format("Console") .option("checkpointLocation", checkpointFolder) .outputMode("update") //.outputMode("complete") .format("console") .option("truncate", false) .option("numRows", 40) .start() ticketOutput.awaitTermination() } }
Example 89
Source File: CountingInAStreamExpQueringResults.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import com.malaska.spark.training.streaming.{Message, MessageBuilder} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.OutputMode object CountingInAStreamExpQueringResults { def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host", "127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[3]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .load() val messageDsDStream = socketLines.as[String].map(line => { MessageBuilder.build(line) }).as[Message] val tickerCount = messageDsDStream.groupBy("ticker").count() val destCount = messageDsDStream.groupBy("destUser").count() val ticketOutput = tickerCount.writeStream.outputMode(OutputMode.Complete()) .format("memory") .queryName("ticker_counts") .start() val destOutput = destCount.writeStream .format("memory") .queryName("dest_counts") .start() while (true) { println("ticker_counts") sparkSession.sql("select * from ticker_counts").collect().foreach(println) println("dest_counts") sparkSession.sql("select * from dest_counts").collect().foreach(println) } destOutput.awaitTermination() ticketOutput.awaitTermination() } }
Example 90
Source File: CountingInAStreamExpGroupBy.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.structured import com.malaska.spark.training.streaming.{Message, MessageBuilder} import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.streaming.{OutputMode, Trigger} import org.apache.spark.sql.functions._ object CountingInAStreamExpGroupBy { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .master("local[3]") .getOrCreate() } import sparkSession.implicits._ val socketLines = sparkSession.readStream .format("socket") .option("host", host) .option("port", port) .load() val messageDs = socketLines.as[String]. flatMap(line => line.toLowerCase().split(" ")) // Generate running word count val wordCounts = messageDs.groupBy("value").count() // Start running the query that prints the running counts to the console val query = wordCounts.writeStream .outputMode("complete") .format("console") .start() query.awaitTermination() } }
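A small variant sketch: with Update mode the console sink prints only the counts that changed in each micro-batch, whereas complete re-emits the full result table every trigger; host and port are hard-coded placeholders here.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

object WordCountUpdateMode {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[3]").appName("word-count-update").getOrCreate()
    import spark.implicits._
    val words = spark.readStream
      .format("socket").option("host", "localhost").option("port", "9999").load()
      .as[String]
      .flatMap(_.toLowerCase().split(" "))
    val counts = words.groupBy("value").count()
    val query = counts.writeStream
      .outputMode(OutputMode.Update())        // only rows updated since the last trigger are printed
      .format("console")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .start()
    query.awaitTermination()
  }
}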
Example 91
Source File: BlockingSource.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.util import java.util.concurrent.CountDownLatch import org.apache.spark.sql.{SQLContext, _} import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source} import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider} import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{IntegerType, StructField, StructType} class BlockingSource extends StreamSourceProvider with StreamSinkProvider { private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil) override def sourceSchema( spark: SQLContext, schema: Option[StructType], providerName: String, parameters: Map[String, String]): (String, StructType) = { ("dummySource", fakeSchema) } override def createSource( spark: SQLContext, metadataPath: String, schema: Option[StructType], providerName: String, parameters: Map[String, String]): Source = { BlockingSource.latch.await() new Source { override def schema: StructType = fakeSchema override def getOffset: Option[Offset] = Some(new LongOffset(0)) override def getBatch(start: Option[Offset], end: Offset): DataFrame = { import spark.implicits._ Seq[Int]().toDS().toDF() } override def stop() {} } } override def createSink( spark: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { new Sink { override def addBatch(batchId: Long, data: DataFrame): Unit = {} } } } object BlockingSource { var latch: CountDownLatch = null }
Example 92
Source File: console.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider} import org.apache.spark.sql.streaming.OutputMode class ConsoleSink(options: Map[String, String]) extends Sink with Logging { // Number of rows to display, by default 20 rows private val numRowsToShow = options.get("numRows").map(_.toInt).getOrElse(20) // Truncate the displayed data if it is too long, by default it is true private val isTruncated = options.get("truncate").map(_.toBoolean).getOrElse(true) // Track the batch id private var lastBatchId = -1L override def addBatch(batchId: Long, data: DataFrame): Unit = synchronized { val batchIdStr = if (batchId <= lastBatchId) { s"Rerun batch: $batchId" } else { lastBatchId = batchId s"Batch: $batchId" } // scalastyle:off println println("-------------------------------------------") println(batchIdStr) println("-------------------------------------------") // scalastyle:off println data.sparkSession.createDataFrame( data.sparkSession.sparkContext.parallelize(data.collect()), data.schema) .show(numRowsToShow, isTruncated) } } class ConsoleSinkProvider extends StreamSinkProvider with DataSourceRegister { def createSink( sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { new ConsoleSink(parameters) } def shortName(): String = "console" }
Example 93
Source File: StreamingPredictionsSpec.scala From odsc-east-realish-predictions with Apache License 2.0 | 4 votes |
package com.twilio.open.odsc.realish import java.sql.Timestamp import java.time.Instant import java.util.{Random, UUID} import org.apache.spark.SparkConf import org.apache.spark.sql.{Encoders, SQLContext, SparkSession} import org.scalatest.{FunSuite, Matchers} import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.functions._ import org.apache.spark.sql.streaming.{OutputMode, Trigger} import scala.concurrent.duration._ class StreamingPredictionsSpec extends FunSuite with Matchers with SharedSparkSql { override def conf: SparkConf = { new SparkConf() .setMaster("local[*]") .setAppName("odsc-spark-utils") .set("spark.ui.enabled", "false") .set("spark.app.id", appID) .set("spark.driver.host", "localhost") .set("spark.sql.session.timeZone", "UTC") } final val notRandomRandom = { val generator = new Random generator.setSeed(100L) generator } test("should stream in some mock data for fun") { implicit val spark: SparkSession = sparkSql import spark.implicits._ implicit val sqlContext: SQLContext = spark.sqlContext implicit val metricEncoder = Encoders.product[Metric] val metricData = MemoryStream[Metric] val startingInstant = Instant.now() val backingData = (1 to 10000).map(offset => { val metric = if (offset % 2 == 0) "loss_percentage" else "connect_duration" val nextLoss = notRandomRandom.nextDouble() * notRandomRandom.nextInt(100) Metric( Timestamp.from(startingInstant.minusSeconds(offset)), UUID.randomUUID().toString, metric, value = if (metric == "loss_percentage") nextLoss else notRandomRandom.nextDouble() * notRandomRandom.nextInt(240), countryCode = if (offset % 8 == 0) "US" else "BR", callDirection = if (metric == "loss_percentage") "inbound" else "outbound" ) }) val processingTimeTrigger = Trigger.ProcessingTime(2.seconds) val streamingQuery = metricData.toDF() .withWatermark("timestamp", "2 hours") .groupBy(col("metric"), col("countryCode"), window($"timestamp", "5 minutes")) .agg( min("value") as "min", avg("value") as "mean", max("value") as "max", count("*") as "total" ) .writeStream .format("memory") .queryName("datastream") .outputMode(OutputMode.Append()) .trigger(processingTimeTrigger) .start() metricData.addData(backingData) streamingQuery.processAllAvailable() spark.sql("select * from datastream").show(20, false) val checkChange = spark.sql("select * from datastream") .groupBy("metric","countryCode") .agg( sum("total") as "total", avg("mean") as "mean" ) checkChange.show(20, false) // now can do interesting things with minor back tracking... streamingQuery.stop() } }