org.apache.spark.sql.ForeachWriter Scala Examples
The following examples show how to use org.apache.spark.sql.ForeachWriter.
Example 1
Source File: DNSstat.scala From jdbcsink with Apache License 2.0
import java.sql.{Connection, DriverManager, PreparedStatement, Statement}
import java.util.Properties

import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{from_json, window}
import org.apache.spark.sql.types._

/**
 * ForeachWriter that writes each streamed row into the MySQL table `DNSStat`
 * with a `replace into` statement.
 *
 * Rows are expected in the column order (ip, domain, time, count), which is the
 * order produced by the `select` at the end of [[DNSstatJob.main]].
 */
class JDBCSink() extends ForeachWriter[Row] {
  val driver = "com.mysql.jdbc.Driver"
  var connection: Connection = _
  // Kept as a public `Statement` for compatibility; it points at `upsert` once open() has run.
  var statement: Statement = _

  // Typed handle on the same statement so process() can bind parameters.
  private var upsert: PreparedStatement = _

  /**
   * Loads the JDBC driver, opens the connection and prepares the upsert statement.
   *
   * @param partitionId partition being written (unused)
   * @param version     epoch/batch version (unused)
   * @return always `true`, i.e. every partition is processed
   */
  def open(partitionId: Long, version: Long): Boolean = {
    Class.forName(driver)
    // NOTE(review): the JDBC URL has no host/database — it looks truncated; confirm before running.
    connection = DriverManager.getConnection("jdbc:mysql://", "root", "mysqladmin")
    // Bound parameters instead of string concatenation: values containing quotes can
    // no longer break or inject into the SQL text.
    upsert = connection.prepareStatement(
      "replace into DNSStat(ip,domain,time,count) values(?,?,?,?)")
    statement = upsert
    true
  }

  /**
   * Upserts one row.
   *
   * @param value row with columns 0..3 = ip (String), domain (String), time (Timestamp), count (Long)
   */
  def process(value: Row): Unit = {
    upsert.setString(1, value.getString(0))       // ip
    upsert.setString(2, value.getString(1))       // domain
    upsert.setTimestamp(3, value.getTimestamp(2)) // time
    upsert.setLong(4, value.getLong(3))           // count
    upsert.executeUpdate()
  }

  /**
   * Releases the statement and then the connection; the connection is closed even
   * if closing the statement throws.
   *
   * @param errorOrNull error that terminated processing, or null (unused)
   */
  def close(errorOrNull: Throwable): Unit = {
    try Option(statement).foreach(_.close())
    finally Option(connection).foreach(_.close())
  }
}

/**
 * Streaming job: reads JSON records from the Kafka topic `xdr`, joins them with the
 * JDBC table `DNSTab` on domain, counts (sip, domain) per one-minute window and
 * writes the counts through [[JDBCSink]].
 */
object DNSstatJob {

  /** Schema used to parse the JSON payload of each Kafka record. */
  val schema: StructType = StructType(
    Seq(
      StructField("Vendor", StringType, true),
      StructField("Id", IntegerType, true),
      StructField("Time", LongType, true),
      StructField(
        "Conn",
        StructType(
          Seq(
            StructField("Proto", IntegerType, true),
            StructField("Sport", IntegerType, true),
            StructField("Dport", IntegerType, true),
            StructField("Sip", StringType, true),
            StructField("Dip", StringType, true)
          )),
        true
      ),
      StructField(
        "Dns",
        StructType(
          Seq(
            StructField("Domain", StringType, true),
            StructField("IpCount", IntegerType, true),
            StructField("Ip", StringType, true)
          )),
        true
      )
    ))

  /**
   * Builds and starts the streaming query, then blocks until it terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("DNSJob")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()
    import spark.implicits._

    val connectionProperties = new Properties()
    connectionProperties.put("user", "root")
    connectionProperties.put("password", "mysqladmin")

    // The receiver of `.jdbc(...)` was missing in the listing; `spark.read` is the
    // DataFrameReader that provides jdbc(url, table, properties).
    // NOTE(review): the JDBC URL looks truncated here as well — confirm.
    val bruteForceTab = spark.read.jdbc("jdbc:mysql://", "DNSTab", connectionProperties)
    // createOrReplaceTempView is the non-deprecated equivalent of registerTempTable.
    bruteForceTab.createOrReplaceTempView("DNSTab")

    val lines = spark
      .readStream
      .format("kafka")
      // NOTE(review): empty bootstrap server list — presumably stripped from the listing; confirm.
      .option("kafka.bootstrap.servers", "")
      .option("subscribe", "xdr")
      //.option("startingOffsets","earliest")
      .option("startingOffsets", "latest")
      .load()
      .select(from_json($"value".cast(StringType), schema).as("jsonData"))
    lines.createOrReplaceTempView("xdr")

    val filterDNS = spark.sql("select CAST(from_unixtime(xdr.jsonData.Time DIV 1000000) as timestamp) as time,xdr.jsonData.Conn.Sip as sip, xdr.jsonData.Dns.Domain from xdr inner join DNSTab on xdr.jsonData.Dns.domain = DNSTab.domain")

    // Output column order (sip, domain, window.start, count) is what JDBCSink.process reads by index.
    val windowedCounts = filterDNS
      .withWatermark("time", "5 minutes")
      .groupBy(window($"time", "1 minutes", "1 minutes"), $"sip", $"domain")
      .count()
      .select($"sip", $"domain", $"window.start", $"count")

    val writer = new JDBCSink()
    val query = windowedCounts
      .writeStream
      .foreach(writer)
      .outputMode("update")
      .option("checkpointLocation", "/checkpoint/")
      .start()

    query.awaitTermination()
  }
}
Example 2
Source File: JDBCSink.scala From BigData-News with Apache License 2.0
package com.vita.spark

import java.sql.{Connection, PreparedStatement, ResultSet, SQLException, Statement}

import scala.util.control.NonFatal

import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.{ForeachWriter, Row}

/**
 * Writes Structured Streaming rows into the MySQL table `webCount`.
 *
 * For each row the sink checks whether `titleName` already exists and then either
 * updates its `count` or inserts a new (titleName, count) record.
 *
 * @param url      JDBC url passed to `MySqlPool`
 * @param username database user passed to `MySqlPool`
 * @param password database password passed to `MySqlPool`
 */
class JDBCSink(url: String, username: String, password: String) extends ForeachWriter[Row] {

  var statement: Statement = _
  // Retained for interface compatibility; process() uses locally scoped result sets.
  var resultSet: ResultSet = _
  var connection: Connection = _

  /**
   * Obtains a connection from `MySqlPool` and creates a statement on it.
   *
   * @return always `true`, i.e. every partition is processed
   */
  override def open(partitionId: Long, version: Long): Boolean = {
    connection = new MySqlPool(url, username, password).getJdbcConn()
    statement = connection.createStatement()
    print("open")
    true
  }

  /**
   * Upserts one row into `webCount`.
   *
   * SQL failures are reported on stdout and swallowed (best effort, as before);
   * fatal JVM errors are no longer caught.
   *
   * @param value row exposing a String column `titleName` and a Long column `count`
   */
  override def process(value: Row): Unit = {
    println("process step one")
    // Square brackets are stripped from the title before it is used as the key.
    val titleName = value.getAs[String]("titleName").replaceAll("[\\[\\]]", "")
    val count = value.getAs[Long]("count")
    println("process step two")
    try {
      // The condition was missing in the listing; a row returned by the existence
      // query means the title is already stored, so it is updated instead of inserted.
      if (titleExists(titleName)) {
        println("updateSql")
        runUpdate("update webCount set count = ? where titleName = ?") { ps =>
          ps.setLong(1, count)
          ps.setString(2, titleName)
        }
      } else {
        println("insertSql")
        runUpdate("insert into webCount(titleName,count) values(?, ?)") { ps =>
          ps.setString(1, titleName)
          ps.setLong(2, count)
        }
      }
    } catch {
      case ex: SQLException =>
        println("SQLException")
        ex.printStackTrace()
      // Covers what the former Exception/RuntimeException cases handled
      // (RuntimeException was unreachable after Exception).
      case NonFatal(ex) =>
        println("Exception")
        ex.printStackTrace()
    }
  }

  /**
   * Closes the statement and then the connection when they were opened.
   * The original null checks were inverted (closing only when the reference was null).
   */
  override def close(errorOrNull: Throwable): Unit = {
    try {
      if (statement != null) statement.close()
    } finally {
      if (connection != null) connection.close()
    }
  }

  /** Returns true when `webCount` already holds a record for `titleName`. */
  private def titleExists(titleName: String): Boolean = {
    val ps = connection.prepareStatement("select 1 from webCount where titleName = ?")
    try {
      ps.setString(1, titleName)
      val rows = ps.executeQuery()
      try rows.next() finally rows.close()
    } finally ps.close()
  }

  /** Prepares `sql`, lets `bind` set its parameters, executes it and closes it. */
  private def runUpdate(sql: String)(bind: PreparedStatement => Unit): Unit = {
    val ps = connection.prepareStatement(sql)
    try {
      bind(ps)
      ps.executeUpdate()
    } finally ps.close()
  }
}
Example 3
Source File: RedisWriteKafkaOffset.scala From BigData-News with Apache License 2.0
package com.vita.spark.streaming.writer

import com.vita.Constants
import com.vita.redies.RedisSingle
import org.apache.spark.sql.{ForeachWriter, Row}

/**
 * ForeachWriter that stores the `offset` column of every row in Redis under
 * the key `Constants.REDIDS_KEY`.
 */
class RedisWriteKafkaOffset extends ForeachWriter[Row] {

  var redisSingle: RedisSingle = _

  /**
   * Creates the Redis helper and initialises it with `Constants.IP` / `Constants.PORT`.
   *
   * @return always `true`, i.e. every partition is processed
   */
  override def open(partitionId: Long, version: Long): Boolean = {
    redisSingle = new RedisSingle()
    redisSingle.init(Constants.IP, Constants.PORT)
    true
  }

  /**
   * Writes the row's `offset` value (read as a String) to Redis.
   *
   * @param value row exposing a String column named `offset`
   */
  override def process(value: Row): Unit =
    redisSingle.set(Constants.REDIDS_KEY, value.getAs[String]("offset"))

  /**
   * Closes the Jedis handle first, then the pool, both obtained from the helper.
   *
   * @param errorOrNull error that terminated processing, or null (unused)
   */
  override def close(errorOrNull: Throwable): Unit = {
    redisSingle.getJedis().close()
    redisSingle.getPool().close()
  }
}
Example 4
Source File: 4-jdbcsink.scala From Azure-Databricks-NYC-Taxi-Workshop with MIT License
// Databricks notebook source
// MAGIC %md
// MAGIC # JDBC Sink for Structured Streaming
// MAGIC Structured streaming does not feature a JDBC sink currently.<br>
// MAGIC The following is a custom sink we will use in the lab.

// COMMAND ----------

import java.sql._
import org.apache.spark.sql.ForeachWriter

/**
 * ForeachWriter that inserts each row into `chicago_crimes_curated_summary`.
 *
 * @param url  JDBC url of the target database
 * @param user database user
 * @param pwd  database password
 */
class JDBCSink(url: String, user: String, pwd: String)
    extends org.apache.spark.sql.ForeachWriter[org.apache.spark.sql.Row] {

  // NOTE(review): the driver class name is empty — presumably stripped from the listing; confirm.
  val driver = ""
  var connection: java.sql.Connection = _
  // Kept as a public `Statement` for compatibility; it points at `insert` once open() has run.
  var statement: java.sql.Statement = _

  // Typed handle on the same statement so process() can bind parameters.
  private var insert: java.sql.PreparedStatement = _

  /**
   * Opens the connection and prepares the insert statement.
   *
   * @return always `true`, i.e. every partition is processed
   */
  def open(partitionId: Long, version: Long): Boolean = {
    // Class.forName("") always throws ClassNotFoundException, so only load the driver
    // explicitly when a class name is configured; otherwise rely on DriverManager lookup.
    if (driver.nonEmpty) Class.forName(driver)
    connection = java.sql.DriverManager.getConnection(url, user, pwd)
    insert = connection.prepareStatement(
      "INSERT INTO chicago_crimes_curated_summary(case_id, primary_type, arrest_made,case_year, case_month, case_day_of_month) VALUES (?,?,?,?,?,?);")
    statement = insert
    true
  }

  /**
   * Inserts one row, binding its first six columns in order.
   *
   * Each value is bound as its text form, matching the quoted literals the previous
   * string-concatenated SQL produced, but without letting quotes in the data alter
   * the statement. Null columns are bound as SQL NULL instead of the text 'null'.
   * NOTE(review): assumes the target database converts string parameters to the
   * column types the same way it converted the quoted literals — confirm.
   *
   * @param value row with at least six columns
   */
  def process(value: org.apache.spark.sql.Row): Unit = {
    (0 until 6).foreach { i =>
      val field = value(i)
      if (field == null) insert.setNull(i + 1, java.sql.Types.VARCHAR)
      else insert.setString(i + 1, field.toString)
    }
    insert.executeUpdate()
  }

  /**
   * Releases the statement and then the connection; the connection is closed even
   * if closing the statement throws.
   */
  def close(errorOrNull: Throwable): Unit = {
    try Option(statement).foreach(_.close())
    finally Option(connection).foreach(_.close())
  }
}

// COMMAND ----------
Example 5
Source File: CassandraForeachWriter.scala From structured-streaming-application with Apache License 2.0
package knolx.spark

import com.datastax.driver.core.{Cluster, Session}
import knolx.Config.{cassandraHosts, keyspace}
import org.apache.spark.sql.{ForeachWriter, Row}

/** Holder for a ForeachWriter that stores (word, count) rows in Cassandra. */
object CassandraForeachWriter extends Serializable {

  /**
   * Writer that inserts each row into `<keyspace>.wordcount`.
   * Rows are read by position: column 0 is the word (String), column 1 the count (Long).
   */
  val writeToCassandra: ForeachWriter[Row] = new ForeachWriter[Row] {
    private var cluster: Cluster = _
    private var session: Session = _

    /**
     * Builds the cluster from `cassandraHosts` and opens a session on it.
     *
     * @return always `true`, i.e. every partition is processed
     */
    override def open(partitionId: Long, version: Long): Boolean = {
      cluster = Cluster.builder.addContactPoints(cassandraHosts).build
      session = cluster.newSession()
      true
    }

    /**
     * Inserts one (word, count) pair.
     *
     * @param row row whose column 0 is a String and column 1 a Long
     */
    override def process(row: Row): Unit = {
      // CQL escapes a single quote inside a string literal by doubling it; without
      // this a word containing ' breaks the statement or injects CQL.
      // s"..." keeps the previous rendering of a null word as the text "null".
      val word = s"${row.getString(0)}".replace("'", "''")
      val count = row.getLong(1)
      session.execute(s"insert into $keyspace.wordcount (word, count) values ('$word', $count);")
    }

    /**
     * Closes the session and then the cluster; the cluster is closed even if
     * closing the session throws.
     */
    override def close(errorOrNull: Throwable): Unit = {
      try Option(session).foreach(_.close())
      finally Option(cluster).foreach(_.close())
    }
  }
}
Example 6
Source File: EventHubsForeachWriter.scala From azure-event-hubs-spark with Apache License 2.0
// EventHubsForeachWriter: a ForeachWriter[String] case class configured by an EventHubsConf.
//  - open():       records the open time in writerOpenTime, creates `client` through
//                  EventHubsSourceProvider.clientFactory and returns true.
//  - process():    wraps the body's UTF-8 bytes in an EventData, sends it with `client`, and
//                  adds to totalMessageCount / totalMessageSizeInBytes.
//  - close():      when given a non-null error, logs a warning, calls closeInner(false) and
//                  rethrows it; otherwise calls closeInner(true).
//  - closeInner(): closes `client` if it is non-null (logging and rethrowing on Exception),
//                  sets it to null, then passes the counters and elapsed time to the metric
//                  plugin when ehConf.metricPlugin() yields one.
// NOTE(review): this listing appears damaged by extraction and will not compile as shown:
//  `import import` is missing a target, the calls that should precede the
//  "open is called"/"close is called" strings are gone, both `${}` interpolations are empty,
//  and `onSendMetric(..., , ...)` has an empty argument. Restore these from the upstream
//  project before using the code.
package org.apache.spark.sql.eventhubs import import org.apache.spark.eventhubs.{ EventHubsConf, EventHubsUtils } import org.apache.spark.eventhubs.client.Client import org.apache.spark.eventhubs.utils.MetricPlugin import org.apache.spark.internal.Logging import org.apache.spark.sql.ForeachWriter case class EventHubsForeachWriter(ehConf: EventHubsConf) extends ForeachWriter[String] with Logging { private lazy val metricPlugin: Option[MetricPlugin] = ehConf.metricPlugin() var client: Client = _ var totalMessageSizeInBytes = 0 var totalMessageCount = 0 var writerOpenTime = 0L def open(partitionId: Long, version: Long): Boolean = {"open is called. ${EventHubsUtils.getTaskContextSlim}") writerOpenTime = System.currentTimeMillis() client = EventHubsSourceProvider.clientFactory(ehConf.toMap)(ehConf) true } def process(body: String): Unit = { val event = EventData.create(s"$body".getBytes("UTF-8")) client.send(event) totalMessageCount += 1 totalMessageSizeInBytes += event.getBytes.length } def close(errorOrNull: Throwable): Unit = {"close is called. ${EventHubsUtils.getTaskContextSlim}") errorOrNull match { case t: Throwable => log.warn(s"an error occurred. eventhub name = ${}, error = ${t.getMessage}") closeInner(false) throw t case _ => closeInner(true) } } private def closeInner(isSuccess: Boolean): Unit = { var success = false if (client != null) { try { client.close() success = true } catch { case e: Exception => log.warn(s"an error occurred. eventhub name = ${}, error = ${e.getMessage}") throw e } client = null } metricPlugin.foreach( _.onSendMetric(EventHubsUtils.getTaskContextSlim,, totalMessageCount, totalMessageSizeInBytes, System.currentTimeMillis() - writerOpenTime, isSuccess && success)) } }
Example 7
Source File: KafkaContinuousSourceSuite.scala From Spark-2.3.1 with Apache License 2.0
// Kafka source test suites for continuous execution mode.
//  - KafkaContinuousSourceSuite: KafkaSourceSuiteBase mixed with KafkaContinuousTest.
//  - KafkaContinuousSourceTopicDeletionSuite: runs with broker property
//    auto.create.topics.enable=false, subscribes by the pattern "<topicPrefix>-.*", checks the
//    first answers, then deletes the first topic, creates a second one and waits (up to
//    streamingTimeout) until the KafkaContinuousReader found in the query's logical plan lists
//    a partition of the new topic, before adding data to it and checking the combined answer.
//  - KafkaContinuousSourceStressForDontFailOnDataLossSuite: overrides startStream to write to
//    the "memory" format under query name "memory" with Trigger.Continuous("1 second").
// NOTE(review): this listing appears damaged by extraction: the key in `.option("", "1")` is
//  empty and `val mapped = => kv._2.toInt + 1)` has lost the expression that maps over `kafka`.
//  Restore both from the upstream Spark source before compiling.
package org.apache.spark.sql.kafka010 import java.util.Properties import java.util.concurrent.atomic.AtomicInteger import org.scalatest.time.SpanSugar._ import scala.collection.mutable import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution import org.apache.spark.sql.streaming.{StreamTest, Trigger} import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} // Run tests in KafkaSourceSuiteBase in continuous execution mode. class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest { import testImplicits._ override val brokerProps = Map("auto.create.topics.enable" -> "false") test("subscribing topic by pattern with topic deletions") { val topicPrefix = newTopic() val topic = topicPrefix + "-seems" val topic2 = topicPrefix + "-bad" testUtils.createTopic(topic, partitions = 5) testUtils.sendMessages(topic, Array("-1")) require(testUtils.getLatestOffsets(Set(topic)).size === 5) val reader = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("", "1") .option("subscribePattern", s"$topicPrefix-.*") .option("failOnDataLoss", "false") val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] val mapped = => kv._2.toInt + 1) testStream(mapped)( makeSureGetOffsetCalled, AddKafkaData(Set(topic), 1, 2, 3), CheckAnswer(2, 3, 4), Execute { query => testUtils.deleteTopic(topic) testUtils.createTopic(topic2, partitions = 5) eventually(timeout(streamingTimeout)) { assert( query.lastExecution.logical.collectFirst { case DataSourceV2Relation(_, r: KafkaContinuousReader) => r }.exists { 
r => // Ensure the new topic is present and the old topic is gone. r.knownPartitions.exists(_.topic == topic2) }, s"query never reconfigured to new topic $topic2") } }, AddKafkaData(Set(topic2), 4, 5, 6), CheckAnswer(2, 3, 4, 5, 6, 7) ) } } class KafkaContinuousSourceStressForDontFailOnDataLossSuite extends KafkaSourceStressForDontFailOnDataLossSuite { override protected def startStream(ds: Dataset[Int]) = { ds.writeStream .format("memory") .queryName("memory") .trigger(Trigger.Continuous("1 second")) .start() } }
Example 8
Source File: CassandraSinkForeach.scala From Spark-Structured-Streaming-Examples with Apache License 2.0
package cassandra.foreachSink

import cassandra.CassandraDriver
import log.LazyLogger
import org.apache.spark.sql.ForeachWriter
import radio.SimpleSongAggregation

/**
 * ForeachWriter that inserts each SimpleSongAggregation into the Cassandra table
 * `<CassandraDriver.namespace>.<CassandraDriver.foreachTableSink>`.
 */
class CassandraSinkForeach() extends ForeachWriter[SimpleSongAggregation] with LazyLogger {

  /**
   * Renders a value as a CQL string literal. CQL escapes a single quote inside a
   * literal by doubling it, so values containing ' no longer break the statement.
   */
  private def cqlText(value: Any): String =
    "'" + s"$value".replace("'", "''") + "'"

  /**
   * Builds the insert statement for one record.
   *
   * The third value was an empty `${}` interpolation in the listing, which renders
   * as "()". It is restored as `record.radio` to match the `radio` column.
   * NOTE(review): assumes SimpleSongAggregation exposes a `radio` member — confirm.
   */
  private def cqlRadio(record: SimpleSongAggregation): String =
    s"""
       insert into ${CassandraDriver.namespace}.${CassandraDriver.foreachTableSink} (title, artist, radio, count)
       values(${cqlText(record.title)}, ${cqlText(record.artist)}, ${cqlText(record.radio)}, ${record.count})"""

  /**
   * No connection is opened here; writes go through CassandraDriver.connector.
   *
   * @return always `true`, i.e. every partition is processed
   */
  def open(partitionId: Long, version: Long): Boolean = {
    // open connection
    //@TODO command to check if cassandra cluster is up
    true
  }

  /**
   * Logs the record and executes its insert statement in a connector-managed session.
   *
   * @param record aggregation to store
   */
  def process(record: SimpleSongAggregation): Unit = {
    log.warn(s"Saving record: $record")
    CassandraDriver.connector.withSessionDo(session =>
      session.execute(cqlRadio(record))
    )
  }

  /** Nothing to release: this writer holds no connection of its own. */
  def close(errorOrNull: Throwable): Unit = {
    // close the connection
    //connection.keep_alive_ms --> 5000ms : Period of time to keep unused connections open
  }
}
Example 9
Source File: StructuredIdentity.scala From Swallow with Apache License 2.0
// StructuredIdentity: a StructuredBenchBase whose process() writes every row of `ds` through an
// anonymous ForeachWriter[Row] and then blocks on query.awaitTermination().
//  - open():    builds a KafkaReporter from config.reporterTopic and config.brokerList, returns true.
//  - process(): reads column 0 as a String, converts it to Long (inTime) and takes the current
//               wall-clock time in milliseconds (outTime).
//  - close():   does nothing.
// NOTE(review): this listing appears damaged by extraction and will not compile as shown: the
//  package name and the first imports are empty (`package import import import`), and the call
//  that should end in `, outTime)` has lost its beginning — presumably a call on `reporter`
//  taking inTime and outTime; confirm against the upstream project.
package import import import org.apache.spark.sql.Column import org.apache.spark.sql.DataFrame import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.Row import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions._ class StructuredIdentity() extends StructuredBenchBase { override def process(ds: DataFrame, config: SparkBenchConfig) = { // Get the singleton instance of SparkSession val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate() import spark.implicits._ val query = ds.writeStream .foreach(new ForeachWriter[Row] { var reporter: KafkaReporter = _ def open(partitionId: Long, version: Long): Boolean = { val reportTopic = config.reporterTopic val brokerList = config.brokerList reporter = new KafkaReporter(reportTopic, brokerList) true } def close(errorOrNull: Throwable): Unit = {} def process(record: Row): Unit = { val inTime = record(0).asInstanceOf[String].toLong val outTime = System.currentTimeMillis(), outTime) } }) .start() query.awaitTermination() } }
Example 10
Source File: StructuredRepartition.scala From Swallow with Apache License 2.0
// StructuredRepartition: a StructuredBenchBase whose process() repartitions `ds` into
// config.coreNumber partitions, writes every row through an anonymous ForeachWriter[Row] and
// then blocks on query.awaitTermination().
//  - open():    builds a KafkaReporter from config.reporterTopic and config.brokerList, returns true.
//  - process(): reads column 0 as a String, converts it to Long (inTime) and takes the current
//               wall-clock time in milliseconds (outTime).
//  - close():   does nothing.
// NOTE(review): this listing appears damaged by extraction and will not compile as shown: the
//  package name and the first imports are empty (`package import import import`), and the call
//  that should end in `, outTime)` has lost its beginning — presumably a call on `reporter`
//  taking inTime and outTime; confirm against the upstream project.
package import import import org.apache.spark.sql.Column import org.apache.spark.sql.DataFrame import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.Row import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions._ class StructuredRepartition() extends StructuredBenchBase { override def process(ds: DataFrame, config: SparkBenchConfig) = { // Get the singleton instance of SparkSession val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate() import spark.implicits._ val results = ds.repartition(config.coreNumber) val query = results.writeStream .foreach(new ForeachWriter[Row] { var reporter: KafkaReporter = _ def open(partitionId: Long, version: Long): Boolean = { val reportTopic = config.reporterTopic val brokerList = config.brokerList reporter = new KafkaReporter(reportTopic, brokerList) true } def close(errorOrNull: Throwable): Unit = {} def process(record: Row): Unit = { val inTime = record(0).asInstanceOf[String].toLong val outTime = System.currentTimeMillis(), outTime) } }) .start() query.awaitTermination() } }