org.apache.spark.sql.sources.StreamSinkProvider Scala Examples
The following examples show how to use org.apache.spark.sql.sources.StreamSinkProvider.
Each example comes from an open-source project; the source file, project, and license are noted above each listing.
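All of the examples implement the same two small interfaces from Spark 2.x's streaming API. For orientation, here is the shape of those interfaces as it can be read off the implementations that follow (a paraphrased sketch, not the exact Spark source):

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.streaming.OutputMode

// org.apache.spark.sql.sources.StreamSinkProvider: builds a Sink from the options passed
// to DataStreamWriter, the partitioning columns, and the requested OutputMode.
trait StreamSinkProvider {
  def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink
}

// org.apache.spark.sql.execution.streaming.Sink receives one DataFrame per micro-batch;
// every example below overrides its single method:
//   def addBatch(batchId: Long, data: DataFrame): Unit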
Example 1
Source File: console.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
import org.apache.spark.sql.streaming.OutputMode

class ConsoleSink(options: Map[String, String]) extends Sink with Logging {
  // Number of rows to display, by default 20 rows
  private val numRowsToShow = options.get("numRows").map(_.toInt).getOrElse(20)

  // Truncate the displayed data if it is too long, by default it is true
  private val isTruncated = options.get("truncate").map(_.toBoolean).getOrElse(true)

  // Track the batch id
  private var lastBatchId = -1L

  override def addBatch(batchId: Long, data: DataFrame): Unit = synchronized {
    val batchIdStr = if (batchId <= lastBatchId) {
      s"Rerun batch: $batchId"
    } else {
      lastBatchId = batchId
      s"Batch: $batchId"
    }

    // scalastyle:off println
    println("-------------------------------------------")
    println(batchIdStr)
    println("-------------------------------------------")
    // scalastyle:off println

    data.sparkSession.createDataFrame(
      data.sparkSession.sparkContext.parallelize(data.collect()), data.schema)
      .show(numRowsToShow, isTruncated)
  }
}

class ConsoleSinkProvider extends StreamSinkProvider with DataSourceRegister {
  def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new ConsoleSink(parameters)
  }

  def shortName(): String = "console"
}
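In this Spark fork the class above is the provider behind format("console") itself, resolved through its DataSourceRegister short name. A minimal, self-contained usage sketch from the writer side (it assumes Spark 2.2+ for the built-in "rate" test source; the option names are the ones read by ConsoleSink above):

import org.apache.spark.sql.SparkSession

// Minimal usage sketch for the console sink (assumes Spark 2.2+ for the "rate" source).
object ConsoleSinkExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("console-sink-example")
      .getOrCreate()

    val query = spark.readStream
      .format("rate")              // built-in test source: emits (timestamp, value) rows
      .load()
      .writeStream
      .format("console")           // resolved via DataSourceRegister.shortName()
      .option("numRows", "50")     // read by ConsoleSink
      .option("truncate", "false") // read by ConsoleSink
      .outputMode("append")
      .start()

    query.awaitTermination()
  }
}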
Example 2
Source File: S2SinkProvider.scala From incubator-s2graph with Apache License 2.0
package org.apache.s2graph.spark.sql.streaming

import com.typesafe.config.{Config, ConfigFactory, ConfigRenderOptions}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
import org.apache.spark.sql.streaming.OutputMode

import scala.collection.JavaConversions._

class S2SinkProvider extends StreamSinkProvider with DataSourceRegister with Logger {

  override def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {

    logger.info(s"S2SinkProvider options : ${parameters}")
    val jobConf: Config = ConfigFactory.parseMap(parameters).withFallback(ConfigFactory.load())
    logger.info(s"S2SinkProvider Configuration : ${jobConf.root().render(ConfigRenderOptions.concise())}")

    new S2SparkSqlStreamingSink(sqlContext.sparkSession, jobConf)
  }

  override def shortName(): String = "s2graph"
}
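A usage sketch for the provider above. The format name "s2graph" is the shortName() it registers, and createSink forwards every writer option into a Typesafe Config, so S2Graph settings can be passed as plain options. Here, df stands for an existing streaming DataFrame and the S2Graph-specific option key is a placeholder, not a documented name:

// Hypothetical usage sketch; `df` is an existing streaming DataFrame and the
// S2Graph-specific option key below is a placeholder.
val query = df.writeStream
  .format("s2graph")                                 // DataSourceRegister short name
  .option("checkpointLocation", "/tmp/s2graph-ckpt") // standard Structured Streaming option
  .option("some.s2graph.setting", "value")           // placeholder; forwarded into the Config
  .outputMode("append")
  .start()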
Example 3
Source File: BlockingSource.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.streaming.util

import java.util.concurrent.CountDownLatch

import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source}
import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class BlockingSource extends StreamSourceProvider with StreamSinkProvider {

  private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    BlockingSource.latch.await()
    new Source {
      override def schema: StructType = fakeSchema
      override def getOffset: Option[Offset] = Some(new LongOffset(0))
      override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
        import spark.implicits._
        Seq[Int]().toDS().toDF()
      }
      override def stop() {}
    }
  }

  override def createSink(
      spark: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {}
    }
  }
}

object BlockingSource {
  var latch: CountDownLatch = null
}
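This class is a test utility: createSource blocks on a shared CountDownLatch, which lets a test start a streaming query and hold it at a known point before releasing it. A sketch of that pattern, assuming spark is an existing SparkSession and addressing the provider by its fully qualified class name (it registers no short name):

import java.util.concurrent.CountDownLatch

// Sketch of the intended test pattern; `spark` is an existing SparkSession.
BlockingSource.latch = new CountDownLatch(1)

val query = spark.readStream
  .format("org.apache.spark.sql.streaming.util.BlockingSource")
  .load()
  .writeStream
  .format("org.apache.spark.sql.streaming.util.BlockingSource")
  .option("checkpointLocation", "/tmp/blocking-source-ckpt")
  .start()

// ... the query is now held inside createSource; assert on its state here ...

BlockingSource.latch.countDown()  // release the source and let the query run
query.stop()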
Example 4
Source File: HttpStreamSink.scala From spark-http-stream with BSD 2-Clause "Simplified" License
package org.apache.spark.sql.execution.streaming.http

import org.apache.spark.internal.Logging
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.sources.StreamSinkProvider
import org.apache.spark.sql.streaming.OutputMode

import Params.map2Params

class HttpStreamSinkProvider extends StreamSinkProvider with DataSourceRegister {
  def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new HttpStreamSink(parameters.getRequiredString("httpServletUrl"),
      parameters.getRequiredString("topic"),
      parameters.getInt("maxPacketSize", 10 * 1024 * 1024));
  }

  def shortName(): String = "httpStream"
}

class HttpStreamSink(httpPostURL: String, topic: String, maxPacketSize: Int)
    extends Sink with Logging {

  val producer = HttpStreamClient.connect(httpPostURL);
  val RETRY_TIMES = 5;
  val SLEEP_TIME = 100;

  override def addBatch(batchId: Long, data: DataFrame) {
    // send data to the HTTP server
    var success = false;
    var retried = 0;
    while (!success && retried < RETRY_TIMES) {
      try {
        retried += 1;
        producer.sendDataFrame(topic, batchId, data, maxPacketSize);
        success = true;
      } catch {
        case e: Throwable ⇒ {
          success = false;
          super.logWarning(s"failed to send", e);
          if (retried < RETRY_TIMES) {
            val sleepTime = SLEEP_TIME * retried;
            super.logWarning(s"will retry to send after ${sleepTime}ms");
            Thread.sleep(sleepTime);
          } else {
            throw e;
          }
        }
      }
    }
  }
}
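The writer-side options this provider actually reads are visible in createSink: httpServletUrl and topic are required, and maxPacketSize is optional with a 10 MB default. A usage sketch, where df is an existing streaming DataFrame and the URL and topic values are placeholders:

// Hypothetical usage sketch; `df` is an existing streaming DataFrame,
// and the URL and topic values are placeholders.
val query = df.writeStream
  .format("httpStream")                                    // DataSourceRegister short name
  .option("httpServletUrl", "http://localhost:8080/xxxx")  // required by createSink
  .option("topic", "topic-1")                              // required by createSink
  .option("maxPacketSize", (1024 * 1024).toString)         // optional, defaults to 10 MB
  .option("checkpointLocation", "/tmp/http-stream-ckpt")
  .outputMode("append")
  .start()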
Example 5
Source File: KustoSinkProvider.scala From azure-kusto-spark with Apache License 2.0
package com.microsoft.kusto.spark.datasink

import com.microsoft.kusto.spark.utils.{KeyVaultUtils, KustoDataSourceUtils}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
import org.apache.spark.sql.streaming.OutputMode

class KustoSinkProvider extends StreamSinkProvider with DataSourceRegister {

  override def shortName(): String = "KustoSink"

  override def createSink(sqlContext: SQLContext,
                          parameters: Map[String, String],
                          partitionColumns: Seq[String],
                          outputMode: OutputMode): Sink = {
    val sinkParameters = KustoDataSourceUtils.parseSinkParameters(parameters)
    new KustoSink(
      sqlContext,
      sinkParameters.sourceParametersResults.kustoCoordinates,
      if (sinkParameters.sourceParametersResults.keyVaultAuth.isDefined) {
        val paramsFromKeyVault = KeyVaultUtils.getAadAppParametersFromKeyVault(
          sinkParameters.sourceParametersResults.keyVaultAuth.get)
        KustoDataSourceUtils.mergeKeyVaultAndOptionsAuthentication(
          paramsFromKeyVault,
          Some(sinkParameters.sourceParametersResults.authenticationParameters))
      } else sinkParameters.sourceParametersResults.authenticationParameters,
      sinkParameters.writeOptions
    )
  }
}
Example 6
Source File: CustomSink.scala From spark-structured-streaming-ml with Apache License 2.0
package com.highperformancespark.examples.structuredstreaming

import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql._
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
import org.apache.spark.sql.execution.streaming.Sink

// The original excerpt is cut off mid-class: only addBatch survives. The enclosing
// BasicSinkProvider / BasicSink pair below is a minimal reconstruction, inferred from
// the fully qualified provider name used in CustomSinkDemo.
class BasicSinkProvider extends StreamSinkProvider {
  override def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = new BasicSink()
}

class BasicSink extends Sink {
  //tag::foreachDatasetSink[]
  override def addBatch(batchId: Long, data: DataFrame) = {
    val batchDistinctCount = data.rdd.distinct.count()
    println(s"Batch ${batchId}'s distinct count is ${batchDistinctCount}")
  }
  //end::basicSink[]
}

object CustomSinkDemo {
  def write(ds: Dataset[_]) = {
    //tag::customSinkDemo[]
    ds.writeStream.format(
      "com.highperformancespark.examples.structuredstreaming." +
        "BasicSinkProvider")
      .queryName("customSinkDemo")
      .start()
    //end::customSinkDemo[]
  }
}
Example 7
Source File: console.scala From sparkoscope with Apache License 2.0
The code is identical to the console.scala implementation shown in Example 1.
Example 8
Source File: BlockingSource.scala From sparkoscope with Apache License 2.0
The code is identical to the BlockingSource.scala implementation shown in Example 3.
Example 9
Source File: console.scala From multi-tenancy-spark with Apache License 2.0
The code is identical to the console.scala implementation shown in Example 1.
Example 10
Source File: BlockingSource.scala From multi-tenancy-spark with Apache License 2.0
The code is identical to the BlockingSource.scala implementation shown in Example 3.
Example 11
Source File: CustomSinkProvider.scala From spark-highcharts with Apache License 2.0
package com.knockdata.spark.highcharts

import com.knockdata.spark.highcharts.model.Highcharts
import org.apache.spark.sql._
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources.StreamSinkProvider
import org.apache.spark.sql.streaming.OutputMode

class CustomSinkProvider extends StreamSinkProvider {
  def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {

        val chartId = parameters("chartId")
        val chartParagraphId = parameters("chartParagraphId")

        println(s"batchId: $batchId, chartId: $chartId, chartParagraphId: $chartParagraphId")
        // data.show(3)

        val z = Registry.get(s"$chartId-z").asInstanceOf[ZeppelinContextHolder]
        val seriesHolder = Registry.get(s"$chartId-seriesHolder").asInstanceOf[SeriesHolder]
        val outputMode = Registry.get(s"$chartId-outputMode").asInstanceOf[CustomOutputMode]

        seriesHolder.dataFrame = data

        val result = seriesHolder.result
        val (normalSeriesList, drilldownSeriesList) = outputMode.result(result._1, result._2)

        val chart = new Highcharts(normalSeriesList, seriesHolder.chartId)
          .drilldown(drilldownSeriesList)

        val plotData = chart.plotData
        // val escaped = plotData.replace("%angular", "")
        // println(s" put $chartParagraphId $escaped")
        z.put(chartParagraphId, plotData)
        println(s"run $chartParagraphId")
        z.run(chartParagraphId)
      }
    }
  }
}
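The only options this provider reads are chartId and chartParagraphId, which it uses to look up previously registered Zeppelin objects. A usage sketch with placeholder values; df is an existing streaming DataFrame, and the provider is addressed by its fully qualified class name since it registers no short name:

// Hypothetical usage sketch; `df` is an existing streaming DataFrame and the
// chartId / chartParagraphId values are placeholders for IDs registered elsewhere.
val query = df.writeStream
  .format("com.knockdata.spark.highcharts.CustomSinkProvider")
  .option("chartId", "chart-1")                 // key read in addBatch above
  .option("chartParagraphId", "paragraph_123")  // key read in addBatch above
  .option("checkpointLocation", "/tmp/highcharts-ckpt")
  .outputMode("append")
  .start()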
Example 12
Source File: BlockingSource.scala From Spark-2.3.1 with Apache License 2.0
The code is identical to the BlockingSource.scala implementation shown in Example 3.
Example 13
Source File: KuduSinkProvider.scala From kafka-examples with Apache License 2.0
package com.cloudera.streaming.refapp.kudu

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
import org.apache.spark.sql.streaming.OutputMode

class KuduSinkProvider extends StreamSinkProvider with DataSourceRegister {

  override def createSink(sqlContext: SQLContext,
                          parameters: Map[String, String],
                          partitionColumns: Seq[String],
                          outputMode: OutputMode): Sink = {
    require(outputMode == OutputMode.Update, "only 'update' OutputMode is supported")
    KuduSink.withDefaultContext(sqlContext, parameters)
  }

  override def shortName(): String = "kudu"
}
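Because of the require check, this sink only accepts the update output mode. A usage sketch; df is an existing streaming DataFrame, and the Kudu connection options are placeholders, since the keys consumed by KuduSink.withDefaultContext are not shown here:

// Hypothetical usage sketch; `df` is an existing streaming DataFrame and the
// Kudu-specific option keys/values are placeholders.
val query = df.writeStream
  .format("kudu")                                  // DataSourceRegister short name
  .outputMode("update")                            // anything else fails the require() above
  .option("kudu.master", "kudu-master:7051")       // placeholder connection option
  .option("kudu.table", "impala::default.events")  // placeholder table option
  .option("checkpointLocation", "/tmp/kudu-sink-ckpt")
  .start()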