org.apache.spark.sql.streaming.DataStreamWriter Scala Examples
The following examples show how to use org.apache.spark.sql.streaming.DataStreamWriter.
Example 1
Source File: ForeachBatchSink.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming.sources

import org.apache.spark.api.python.PythonException
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.streaming.DataStreamWriter

class ForeachBatchSink[T](batchWriter: (Dataset[T], Long) => Unit, encoder: ExpressionEncoder[T])
  extends Sink {

  override def addBatch(batchId: Long, data: DataFrame): Unit = {
    val resolvedEncoder = encoder.resolveAndBind(
      data.logicalPlan.output,
      data.sparkSession.sessionState.analyzer)
    val rdd = data.queryExecution.toRdd.map[T](resolvedEncoder.fromRow)(encoder.clsTag)
    val ds = data.sparkSession.createDataset(rdd)(encoder)
    batchWriter(ds, batchId)
  }

  override def toString(): String = "ForeachBatchSink"
}

// Interface implemented by Python classes via Py4J so that the JVM can call the
// user-defined Python `foreachBatch` function while the query is active.
trait PythonForeachBatchFunction {
  def call(batchDF: DataFrame, batchId: Long): Unit
}

object PythonForeachBatchHelper {
  def callForeachBatch(dsw: DataStreamWriter[Row], pythonFunc: PythonForeachBatchFunction): Unit = {
    dsw.foreachBatch(pythonFunc.call _)
  }
}
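ForeachBatchSink is internal to Spark; application code reaches it through the public DataStreamWriter.foreachBatch method, which hands each micro-batch to a user function as a plain DataFrame plus a batch id. A minimal sketch of that public entry point (the rate source, output paths, and checkpoint directory are illustrative placeholders):

import org.apache.spark.sql.{DataFrame, SparkSession}

object ForeachBatchSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("foreach-batch-sketch").getOrCreate()

    // Hypothetical streaming source; any readStream format works here.
    val stream = spark.readStream.format("rate").load()

    // Each micro-batch arrives as a regular DataFrame plus its batch id, which is
    // exactly the pair that ForeachBatchSink.addBatch forwards to the user function.
    val writeBatch: (DataFrame, Long) => Unit = (batchDF, batchId) =>
      batchDF.write.mode("append").parquet(s"/tmp/batches/batch-$batchId") // placeholder path

    val query = stream.writeStream
      .foreachBatch(writeBatch)
      .option("checkpointLocation", "/tmp/foreach-batch-checkpoint") // placeholder path
      .start()

    query.awaitTermination()
  }
}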
Example 2
Source File: SparkExtension.scala From azure-kusto-spark with Apache License 2.0
package com.microsoft.kusto.spark.sql.extension

import com.microsoft.azure.kusto.data.ClientRequestProperties
import com.microsoft.kusto.spark.datasink.{KustoSinkOptions, SparkIngestionProperties}
import com.microsoft.kusto.spark.datasource.KustoSourceOptions
import org.apache.spark.sql.streaming.DataStreamWriter
import org.apache.spark.sql.{DataFrameWriter, _}

object SparkExtension {

  implicit class DataFrameReaderExtension(df: DataFrameReader) {

    def kusto(kustoCluster: String, database: String, query: String,
              conf: Map[String, String] = Map.empty[String, String],
              cpr: Option[ClientRequestProperties] = None): DataFrame = {
      if (cpr.isDefined) {
        df.option(KustoSourceOptions.KUSTO_CLIENT_REQUEST_PROPERTIES_JSON, cpr.get.toString)
      }

      df.format("com.microsoft.kusto.spark.datasource")
        .option(KustoSourceOptions.KUSTO_CLUSTER, kustoCluster)
        .option(KustoSourceOptions.KUSTO_DATABASE, database)
        .option(KustoSourceOptions.KUSTO_QUERY, query)
        .options(conf)
        .load()
    }
  }

  implicit class DataFrameWriterExtension(df: DataFrameWriter[Row]) {

    def kusto(kustoCluster: String, database: String, table: String,
              conf: Map[String, String] = Map.empty[String, String],
              sparkIngestionProperties: Option[SparkIngestionProperties] = None): Unit = {
      if (sparkIngestionProperties.isDefined) {
        df.option(KustoSinkOptions.KUSTO_SPARK_INGESTION_PROPERTIES_JSON,
          sparkIngestionProperties.get.toString)
      }

      df.format("com.microsoft.kusto.spark.datasource")
        .option(KustoSinkOptions.KUSTO_CLUSTER, kustoCluster)
        .option(KustoSinkOptions.KUSTO_DATABASE, database)
        .option(KustoSinkOptions.KUSTO_TABLE, table)
        .options(conf)
        .mode(SaveMode.Append)
        .save()
    }
  }

  implicit class DataStreamWriterExtension(df: DataStreamWriter[Row]) {

    def kusto(kustoCluster: String, database: String, table: String,
              conf: Map[String, String] = Map.empty[String, String],
              sparkIngestionProperties: Option[SparkIngestionProperties] = None): Unit = {
      if (sparkIngestionProperties.isDefined) {
        df.option(KustoSinkOptions.KUSTO_SPARK_INGESTION_PROPERTIES_JSON,
          sparkIngestionProperties.get.toString)
      }

      df.format("com.microsoft.kusto.spark.datasource")
        .option(KustoSinkOptions.KUSTO_CLUSTER, kustoCluster)
        .option(KustoSinkOptions.KUSTO_DATABASE, database)
        .option(KustoSinkOptions.KUSTO_TABLE, table)
        .options(conf)
    }
  }
}
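Once SparkExtension._ is imported, the implicit classes expose kusto(...) directly on readers and writers. A minimal sketch of the streaming path (cluster, database, table, and checkpoint values are placeholders, the authentication options the connector requires are omitted, and the query is started explicitly because the streaming extension only configures the writer in place):

import org.apache.spark.sql.SparkSession
import com.microsoft.kusto.spark.sql.extension.SparkExtension._

object KustoStreamingSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("kusto-extension-sketch").getOrCreate()

    // Hypothetical streaming source.
    val events = spark.readStream.format("rate").load()

    val writer = events.writeStream
      .option("checkpointLocation", "/tmp/kusto-checkpoint") // placeholder path

    // DataStreamWriterExtension.kusto mutates the writer: it sets the format and the
    // Kusto cluster/database/table options but does not start anything.
    writer.kusto(
      kustoCluster = "mycluster.westeurope", // placeholder cluster
      database = "mydb",                     // placeholder database
      table = "Events")                      // placeholder table

    writer.start().awaitTermination()
  }
}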
Example 3
Source File: DataStream.scala From gimel with Apache License 2.0
package com.paypal.gimel.kafka2

import scala.language.implicitConversions

import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.streaming.DataStreamWriter

import com.paypal.gimel.datastreamfactory.{GimelDataStream2, StructuredStreamingResult}
import com.paypal.gimel.kafka2.conf.KafkaClientConfiguration
import com.paypal.gimel.kafka2.reader.KafkaStreamConsumer
import com.paypal.gimel.kafka2.writer.KafkaStreamProducer
import com.paypal.gimel.logger.Logger

class DataStream(sparkSession: SparkSession) extends GimelDataStream2(sparkSession: SparkSession) {

  // GET LOGGER
  val logger = Logger()
  logger.info(s"Initiated --> ${this.getClass.getName}")

  private class DataStreamException(message: String, cause: Throwable)
    extends RuntimeException(message) {

    if (cause != null) {
      initCause(cause)
    }

    def this(message: String) = this(message, null)
  }
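The excerpt ends before the class's stream read and write methods, so only the construction side can be sketched here; everything beyond the constructor call and the public logger is a placeholder assumption:

import org.apache.spark.sql.SparkSession
import com.paypal.gimel.kafka2.DataStream

object DataStreamSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("gimel-kafka2-sketch").getOrCreate()

    // Wraps the active session exactly as the constructor in the excerpt expects;
    // the reading/writing methods of the class are not part of the excerpt above.
    val dataStream = new DataStream(spark)
    dataStream.logger.info("Gimel kafka2 DataStream initialized")
  }
}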
Example 4
Source File: KafkaStreamProducer.scala From gimel with Apache License 2.0
package com.paypal.gimel.kafka2.writer

import java.util.Properties

import scala.collection.JavaConverters._
import scala.collection.immutable.Map
import scala.language.implicitConversions

import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.streaming.DataStreamWriter

import com.paypal.gimel.common.conf.GimelConstants
import com.paypal.gimel.kafka2.conf.{KafkaClientConfiguration, KafkaConstants}
import com.paypal.gimel.kafka2.utilities.{KafkaOptionsLoaderUtils, KafkaUtilitiesException}

object KafkaStreamProducer {

  val logger = com.paypal.gimel.logger.Logger()

  def produceStreamToKafka(conf: KafkaClientConfiguration, dataFrame: DataFrame): DataStreamWriter[Row] = {
    def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName()
    logger.info(" @Begin --> " + MethodName)

    val kafkaProps: Properties = conf.kafkaProducerProps
    logger.info(s"Kafka Props for Producer -> ${kafkaProps.asScala.mkString("\n")}")
    logger.info("Begin Publishing to Kafka....")

    // Retrieve kafka options from OptionsLoader if specified
    val kafkaTopicsOptionsMap: Map[String, Map[String, String]] =
      KafkaOptionsLoaderUtils.getAllKafkaTopicsOptions(conf)
    logger.info("kafkaTopicsOptionsMap -> " + kafkaTopicsOptionsMap)

    try {
      val eachKafkaTopicToOptionsMap = KafkaOptionsLoaderUtils.getEachKafkaTopicToOptionsMap(kafkaTopicsOptionsMap)
      val kafkaTopicOptions = eachKafkaTopicToOptionsMap.get(conf.kafkaTopics)
      kafkaTopicOptions match {
        case None =>
          throw new IllegalStateException(s"Could not load options for the kafka topic -> ${conf.kafkaTopics}")
        case Some(kafkaOptions) =>
          dataFrame
            .writeStream
            .format(KafkaConstants.KAFKA_FORMAT)
            .option(KafkaConstants.KAFKA_TOPIC, conf.kafkaTopics)
            .option(GimelConstants.STREAMING_CHECKPOINT_LOCATION, conf.streamingCheckpointLocation)
            .outputMode(conf.streamingOutputMode)
            .options(kafkaOptions)
      }
    } catch {
      case ex: Throwable =>
        ex.printStackTrace()
        val msg =
          s"""
             |kafkaTopic -> ${conf.kafkaTopics}
             |kafkaParams --> ${kafkaProps.asScala.mkString("\n")}
           """.stripMargin
        throw new KafkaUtilitiesException(s"Failed While Pushing Data Into Kafka \n ${msg}")
    }
  }
}
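produceStreamToKafka only builds and configures the writer; nothing runs until the caller starts the query. A minimal sketch of that calling side, assuming a KafkaClientConfiguration and a streaming DataFrame are prepared elsewhere (how the configuration is built is project-specific and not shown):

import org.apache.spark.sql.DataFrame
import com.paypal.gimel.kafka2.conf.KafkaClientConfiguration
import com.paypal.gimel.kafka2.writer.KafkaStreamProducer

object KafkaStreamProducerSketch {

  // `conf` and `streamingDf` are assumed to be prepared by the caller.
  def publish(conf: KafkaClientConfiguration, streamingDf: DataFrame): Unit = {
    // Returns a fully configured DataStreamWriter[Row]; nothing executes until start().
    val writer = KafkaStreamProducer.produceStreamToKafka(conf, streamingDf)

    writer.start().awaitTermination()
  }
}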
Example 5
Source File: Kudu.scala From kafka-examples with Apache License 2.0
package com.cloudera.streaming.refapp

import org.apache.kudu.spark.kudu._
import org.apache.spark.sql.streaming.DataStreamWriter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

class KuduSink(master: String, database: String, checkpointLocation: String => String) {

  def writeTable(sinkName: String, triggerSeconds: Int = 10) = new Sink {

    override def createDataStreamWriter(df: DataFrame): DataStreamWriter[Row] = {
      val fullTableName = s"impala::$database.$name"
      df
        .writeStream
        .format("kudu")
        .option("kudu.master", master)
        .option("kudu.table", fullTableName)
        .option("checkpointLocation", checkpointLocation(name))
        .option("retries", "3")
        .outputMode("update")
    }

    override val name: String = sinkName
  }
}
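A minimal sketch of wiring the sink up (Sink here is the refapp's own abstraction, not a Spark class; the Kudu master address, database name, source, and checkpoint root below are placeholders):

import org.apache.spark.sql.SparkSession
import com.cloudera.streaming.refapp.KuduSink

object KuduSinkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("kudu-sink-sketch").getOrCreate()

    // Hypothetical streaming source standing in for the refapp's Kafka input.
    val transactions = spark.readStream.format("rate").load()

    // Checkpoint directories are derived per sink name; the root path is a placeholder.
    val kuduSink = new KuduSink(
      master = "kudu-master:7051",
      database = "streaming_ref",
      checkpointLocation = name => s"/tmp/checkpoints/$name")

    // Targets the Kudu table impala::streaming_ref.transactions in update mode.
    val query = kuduSink
      .writeTable("transactions")
      .createDataStreamWriter(transactions)
      .start()

    query.awaitTermination()
  }
}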
Example 6
Source File: Memory.scala From kafka-examples with Apache License 2.0
package com.cloudera.streaming.refapp

import org.apache.spark.sql.streaming.{DataStreamWriter, OutputMode}
import org.apache.spark.sql.{DataFrame, Row}

object Memory {

  def memorySink(sinkName: String) = new Sink {

    override def createDataStreamWriter(df: DataFrame): DataStreamWriter[Row] = df
      .writeStream
      .outputMode(OutputMode.Append)
      .queryName(name)
      .format("memory")

    override val name: String = sinkName
  }
}
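The memory sink is mostly useful in tests: the query's results become queryable as an in-memory table named after the sink. A minimal sketch (the sink name, source, and follow-up SQL are illustrative):

import org.apache.spark.sql.SparkSession
import com.cloudera.streaming.refapp.Memory

object MemorySinkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("memory-sink-sketch")
      .master("local[2]")
      .getOrCreate()

    val input = spark.readStream.format("rate").load()

    // queryName("events") inside the sink registers an in-memory table called "events".
    val query = Memory.memorySink("events").createDataStreamWriter(input).start()

    query.processAllAvailable()
    spark.sql("SELECT * FROM events").show()
    query.stop()
  }
}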
Example 7
Source File: implicits.scala From spark-states with Apache License 2.0
package ru.chermenin.spark.sql.execution.streaming.state

import org.apache.hadoop.fs.Path
import org.apache.spark.sql.RuntimeConfig
import org.apache.spark.sql.SparkSession.Builder
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.streaming.DataStreamWriter
import ru.chermenin.spark.sql.execution.streaming.state.RocksDbStateStoreProvider._

import scala.collection.mutable

object implicits extends Serializable {

  implicit class SessionImplicits(sparkSessionBuilder: Builder) {

    def useRocksDBStateStore(): Builder =
      sparkSessionBuilder.config(SQLConf.STATE_STORE_PROVIDER_CLASS.key,
        classOf[RocksDbStateStoreProvider].getCanonicalName)
  }

  implicit class WriterImplicits[T](dsw: DataStreamWriter[T]) {

    def stateTimeout(runtimeConfig: RuntimeConfig,
                     queryName: String = "",
                     expirySecs: Int = DEFAULT_STATE_EXPIRY_SECS.toInt,
                     checkpointLocation: String = ""): DataStreamWriter[T] = {

      val extraOptions = getExtraOptions
      val name = queryName match {
        case "" | null => extraOptions.getOrElse("queryName", UNNAMED_QUERY)
        case _ => queryName
      }

      val location = new Path(checkpointLocation match {
        case "" | null => extraOptions.getOrElse("checkpointLocation",
          runtimeConfig.getOption(SQLConf.CHECKPOINT_LOCATION.key
          ).getOrElse(throw new IllegalStateException(
            "Checkpoint Location must be specified for State Expiry either " +
              """through option("checkpointLocation", ...) or """ +
              s"""SparkSession.conf.set("${SQLConf.CHECKPOINT_LOCATION.key}", ...)"""))
        )
        case _ => checkpointLocation
      }, name)
        .toUri.toString

      runtimeConfig.set(s"$STATE_EXPIRY_SECS.$name", if (expirySecs < 0) -1 else expirySecs)

      dsw
        .queryName(name)
        .option("checkpointLocation", location)
    }

    private def getExtraOptions: mutable.HashMap[String, String] = {
      val className = classOf[DataStreamWriter[T]]
      val field = className.getDeclaredField("extraOptions")
      field.setAccessible(true)

      field.get(dsw).asInstanceOf[mutable.HashMap[String, String]]
    }
  }
}
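With both implicit classes imported, the RocksDB state store is selected when the session is built and the state expiry is attached per query. A minimal sketch, assuming a simple stateful aggregation and a placeholder checkpoint path:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.OutputMode
import ru.chermenin.spark.sql.execution.streaming.state.implicits._

object StateTimeoutSketch {
  def main(args: Array[String]): Unit = {
    // useRocksDBStateStore() comes from SessionImplicits and switches the state store provider.
    val spark = SparkSession.builder()
      .appName("rocksdb-state-sketch")
      .master("local[2]")
      .useRocksDBStateStore()
      .getOrCreate()

    import spark.implicits._

    // A simple stateful aggregation so there is state to expire.
    val counts = spark.readStream.format("rate").load()
      .groupBy($"value" % 10)
      .count()

    // stateTimeout() comes from WriterImplicits: it names the query, resolves the
    // checkpoint location, and records the expiry seconds in the runtime config.
    val query = counts.writeStream
      .outputMode(OutputMode.Update)
      .stateTimeout(spark.conf,
        queryName = "counts",
        expirySecs = 300,
        checkpointLocation = "/tmp/rocksdb-state-checkpoint") // placeholder path
      .start()

    query.awaitTermination()
  }
}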