org.apache.spark.sql.ForeachWriter Scala Examples
The following examples show how to use org.apache.spark.sql.ForeachWriter.
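All of the examples below implement the same contract: ForeachWriter[T] is the hook behind DataStreamWriter.foreach. Structured Streaming calls open once per partition and epoch (returning false skips that partition for that epoch), process once per row, and close when the partition finishes or fails. The following is a minimal, self-contained sketch of that contract; the ForeachWriterSketch object, the ConsoleSink class, and the use of the built-in rate source are illustrative assumptions, not taken from any of the projects below.

import org.apache.spark.sql.{ForeachWriter, Row, SparkSession}

object ForeachWriterSketch {

  // Minimal sink: open acquires resources, process handles one row, close releases resources.
  class ConsoleSink extends ForeachWriter[Row] {
    def open(partitionId: Long, version: Long): Boolean = true // return false to skip this partition for this epoch
    def process(value: Row): Unit = println(value)             // invoked once per row
    def close(errorOrNull: Throwable): Unit = ()               // errorOrNull is non-null if the partition failed
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("ForeachWriterSketch").master("local[*]").getOrCreate()

    // The built-in "rate" source generates (timestamp, value) rows, used here only for demonstration.
    val df = spark.readStream.format("rate").option("rowsPerSecond", "1").load()

    val query = df.writeStream
      .foreach(new ConsoleSink) // plug the custom sink into the streaming query
      .outputMode("append")
      .start()
    query.awaitTermination()
  }
}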
Example 1
Source File: DNSstat.scala From jdbcsink with Apache License 2.0
import org.apache.spark.sql.SparkSession
import java.util.Properties
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{from_json, window}
import java.sql.{Connection, Statement, DriverManager}
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row

class JDBCSink() extends ForeachWriter[Row] {
  val driver = "com.mysql.jdbc.Driver"
  var connection: Connection = _
  var statement: Statement = _

  def open(partitionId: Long, version: Long): Boolean = {
    Class.forName(driver)
    connection = DriverManager.getConnection("jdbc:mysql://10.88.1.102:3306/aptwebservice", "root", "mysqladmin")
    statement = connection.createStatement
    true
  }

  def process(value: Row): Unit = {
    statement.executeUpdate("replace into DNSStat(ip,domain,time,count) values("
      + "'" + value.getString(0) + "'" + ","    //ip
      + "'" + value.getString(1) + "'" + ","    //domain
      + "'" + value.getTimestamp(2) + "'" + "," //time
      + value.getLong(3)                        //count
      + ")")
  }

  def close(errorOrNull: Throwable): Unit = {
    connection.close
  }
}

object DNSstatJob {

  val schema: StructType = StructType(
    Seq(
      StructField("Vendor", StringType, true),
      StructField("Id", IntegerType, true),
      StructField("Time", LongType, true),
      StructField("Conn", StructType(Seq(
        StructField("Proto", IntegerType, true),
        StructField("Sport", IntegerType, true),
        StructField("Dport", IntegerType, true),
        StructField("Sip", StringType, true),
        StructField("Dip", StringType, true)
      )), true),
      StructField("Dns", StructType(Seq(
        StructField("Domain", StringType, true),
        StructField("IpCount", IntegerType, true),
        StructField("Ip", StringType, true)
      )), true)))

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("DNSJob")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()
    import spark.implicits._

    val connectionProperties = new Properties()
    connectionProperties.put("user", "root")
    connectionProperties.put("password", "mysqladmin")
    val bruteForceTab = spark.read
      .jdbc("jdbc:mysql://10.88.1.102:3306/aptwebservice", "DNSTab", connectionProperties)
    bruteForceTab.registerTempTable("DNSTab")

    val lines = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "10.94.1.110:9092")
      .option("subscribe", "xdr")
      //.option("startingOffsets","earliest")
      .option("startingOffsets", "latest")
      .load()
      .select(from_json($"value".cast(StringType), schema).as("jsonData"))
    lines.registerTempTable("xdr")

    val filterDNS = spark.sql("select CAST(from_unixtime(xdr.jsonData.Time DIV 1000000) as timestamp) as time,xdr.jsonData.Conn.Sip as sip, xdr.jsonData.Dns.Domain from xdr inner join DNSTab on xdr.jsonData.Dns.domain = DNSTab.domain")

    val windowedCounts = filterDNS
      .withWatermark("time", "5 minutes")
      .groupBy(window($"time", "1 minutes", "1 minutes"), $"sip", $"domain")
      .count()
      .select($"sip", $"domain", $"window.start", $"count")

    val writer = new JDBCSink()
    val query = windowedCounts
      .writeStream
      .foreach(writer)
      .outputMode("update")
      .option("checkpointLocation", "/checkpoint/")
      .start()
    query.awaitTermination()
  }
}
Example 2
Source File: JDBCSink.scala From BigData-News with Apache License 2.0
package com.vita.spark

import java.sql.{Connection, ResultSet, SQLException, Statement}

import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.{ForeachWriter, Row}

/**
  * Writes rows produced by Structured Streaming into MySQL.
  */
class JDBCSink(url: String, username: String, password: String) extends ForeachWriter[Row] {

  var statement: Statement = _
  var resultSet: ResultSet = _
  var connection: Connection = _

  override def open(partitionId: Long, version: Long): Boolean = {
    connection = new MySqlPool(url, username, password).getJdbcConn()
    statement = connection.createStatement()
    print("open")
    true
  }

  override def process(value: Row): Unit = {
    println("process step one")
    val titleName = value.getAs[String]("titleName").replaceAll("[\\[\\]]", "")
    val count = value.getAs[Long]("count")

    val querySql = "select 1 from webCount where titleName = '" + titleName + "'"
    val insertSql = "insert into webCount(titleName,count) values('" + titleName + "' , '" + count + "')"
    val updateSql = "update webCount set count = " + count + " where titleName = '" + titleName + "'"

    println("process step two")
    try {
      // check whether the connection works
      val resultSet = statement.executeQuery(querySql)
      if (resultSet.next()) {
        println("updateSql")
        statement.executeUpdate(updateSql)
      } else {
        println("insertSql")
        statement.execute(insertSql)
      }
    } catch {
      case ex: SQLException => println("SQLException")
      case ex: RuntimeException => println("RuntimeException")
      case ex: Exception => println("Exception")
      case ex: Throwable => println("Throwable")
    }
  }

  override def close(errorOrNull: Throwable): Unit = {
    if (statement != null) {
      statement.close()
    }
    if (connection != null) {
      connection.close()
    }
  }
}
Example 3
Source File: RedisWriteKafkaOffset.scala From BigData-News with Apache License 2.0
package com.vita.spark.streaming.writer

import com.vita.Constants
import com.vita.redies.RedisSingle
import org.apache.spark.sql.{ForeachWriter, Row}

class RedisWriteKafkaOffset extends ForeachWriter[Row] {
  var redisSingle: RedisSingle = _

  override def open(partitionId: Long, version: Long): Boolean = {
    redisSingle = new RedisSingle()
    redisSingle.init(Constants.IP, Constants.PORT)
    true
  }

  override def process(value: Row): Unit = {
    val offset = value.getAs[String]("offset")
    redisSingle.set(Constants.REDIDS_KEY, offset)
  }

  override def close(errorOrNull: Throwable): Unit = {
    redisSingle.getJedis().close()
    redisSingle.getPool().close()
  }
}
Example 4
Source File: 4-jdbcsink.scala From Azure-Databricks-NYC-Taxi-Workshop with MIT License
// Databricks notebook source
// MAGIC %md
// MAGIC # JDBC Sink for Structured Streaming
// MAGIC Structured Streaming does not feature a JDBC sink currently.<br>
// MAGIC The following is a custom sink we will use in the lab.

// COMMAND ----------

import java.sql._
import org.apache.spark.sql.ForeachWriter

class JDBCSink(url: String, user: String, pwd: String) extends org.apache.spark.sql.ForeachWriter[org.apache.spark.sql.Row] {
  val driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
  var connection: java.sql.Connection = _
  var statement: java.sql.Statement = _

  def open(partitionId: Long, version: Long): Boolean = {
    Class.forName(driver)
    connection = java.sql.DriverManager.getConnection(url, user, pwd)
    statement = connection.createStatement
    true
  }

  def process(value: org.apache.spark.sql.Row): Unit = {
    statement.executeUpdate("INSERT INTO chicago_crimes_curated_summary(case_id, primary_type, arrest_made,case_year, case_month, case_day_of_month) VALUES ("
      + "'" + value(0) + "'" + ","
      + "'" + value(1) + "'" + ","
      + "'" + value(2) + "'" + ","
      + "'" + value(3) + "'" + ","
      + "'" + value(4) + "'" + ","
      + "'" + value(5) + "'" + ");")
  }

  def close(errorOrNull: Throwable): Unit = {
    connection.close
  }
}

// COMMAND ----------
Example 5
Source File: CassandraForeachWriter.scala From structured-streaming-application with Apache License 2.0
package knolx.spark

import com.datastax.driver.core.{Cluster, Session}
import knolx.Config.{cassandraHosts, keyspace}
import org.apache.spark.sql.{ForeachWriter, Row}

object CassandraForeachWriter extends Serializable {
  val writeToCassandra = new ForeachWriter[Row] {
    private var cluster: Cluster = _
    private var session: Session = _

    override def process(row: Row): Unit = {
      val word = row.getString(0)
      val count = row.getLong(1)
      session.execute(s"insert into $keyspace.wordcount (word, count) values ('$word', $count);")
    }

    override def close(errorOrNull: Throwable): Unit = {
      session.close()
      session.getCluster.close()
    }

    override def open(partitionId: Long, version: Long): Boolean = {
      cluster = Cluster.builder.addContactPoints(cassandraHosts).build
      session = cluster.newSession()
      true
    }
  }
}
Example 6
Source File: EventHubsForeachWriter.scala From azure-event-hubs-spark with Apache License 2.0
package org.apache.spark.sql.eventhubs

import com.microsoft.azure.eventhubs.EventData
import org.apache.spark.eventhubs.{ EventHubsConf, EventHubsUtils }
import org.apache.spark.eventhubs.client.Client
import org.apache.spark.eventhubs.utils.MetricPlugin
import org.apache.spark.internal.Logging
import org.apache.spark.sql.ForeachWriter

case class EventHubsForeachWriter(ehConf: EventHubsConf) extends ForeachWriter[String] with Logging {
  private lazy val metricPlugin: Option[MetricPlugin] = ehConf.metricPlugin()
  var client: Client = _
  var totalMessageSizeInBytes = 0
  var totalMessageCount = 0
  var writerOpenTime = 0L

  def open(partitionId: Long, version: Long): Boolean = {
    log.info(s"open is called. ${EventHubsUtils.getTaskContextSlim}")
    writerOpenTime = System.currentTimeMillis()
    client = EventHubsSourceProvider.clientFactory(ehConf.toMap)(ehConf)
    true
  }

  def process(body: String): Unit = {
    val event = EventData.create(s"$body".getBytes("UTF-8"))
    client.send(event)
    totalMessageCount += 1
    totalMessageSizeInBytes += event.getBytes.length
  }

  def close(errorOrNull: Throwable): Unit = {
    log.info(s"close is called. ${EventHubsUtils.getTaskContextSlim}")
    errorOrNull match {
      case t: Throwable =>
        log.warn(s"an error occurred. eventhub name = ${ehConf.name}, error = ${t.getMessage}")
        closeInner(false)
        throw t
      case _ =>
        closeInner(true)
    }
  }

  private def closeInner(isSuccess: Boolean): Unit = {
    var success = false
    if (client != null) {
      try {
        client.close()
        success = true
      } catch {
        case e: Exception =>
          log.warn(s"an error occurred. eventhub name = ${ehConf.name}, error = ${e.getMessage}")
          throw e
      }
      client = null
    }
    metricPlugin.foreach(
      _.onSendMetric(EventHubsUtils.getTaskContextSlim,
                     ehConf.name,
                     totalMessageCount,
                     totalMessageSizeInBytes,
                     System.currentTimeMillis() - writerOpenTime,
                     isSuccess && success))
  }
}
Example 7
Source File: KafkaContinuousSourceSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.Properties
import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._
import scala.collection.mutable
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.{StreamTest, Trigger}
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}

// Run tests in KafkaSourceSuiteBase in continuous execution mode.
class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest

class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest {
  import testImplicits._

  override val brokerProps = Map("auto.create.topics.enable" -> "false")

  test("subscribing topic by pattern with topic deletions") {
    val topicPrefix = newTopic()
    val topic = topicPrefix + "-seems"
    val topic2 = topicPrefix + "-bad"
    testUtils.createTopic(topic, partitions = 5)
    testUtils.sendMessages(topic, Array("-1"))
    require(testUtils.getLatestOffsets(Set(topic)).size === 5)

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", testUtils.brokerAddress)
      .option("kafka.metadata.max.age.ms", "1")
      .option("subscribePattern", s"$topicPrefix-.*")
      .option("failOnDataLoss", "false")

    val kafka = reader.load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]
    val mapped = kafka.map(kv => kv._2.toInt + 1)

    testStream(mapped)(
      makeSureGetOffsetCalled,
      AddKafkaData(Set(topic), 1, 2, 3),
      CheckAnswer(2, 3, 4),
      Execute { query =>
        testUtils.deleteTopic(topic)
        testUtils.createTopic(topic2, partitions = 5)
        eventually(timeout(streamingTimeout)) {
          assert(
            query.lastExecution.logical.collectFirst {
              case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
            }.exists { r =>
              // Ensure the new topic is present and the old topic is gone.
              r.knownPartitions.exists(_.topic == topic2)
            },
            s"query never reconfigured to new topic $topic2")
        }
      },
      AddKafkaData(Set(topic2), 4, 5, 6),
      CheckAnswer(2, 3, 4, 5, 6, 7)
    )
  }
}

class KafkaContinuousSourceStressForDontFailOnDataLossSuite
    extends KafkaSourceStressForDontFailOnDataLossSuite {
  override protected def startStream(ds: Dataset[Int]) = {
    ds.writeStream
      .format("memory")
      .queryName("memory")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
}
Example 8
Source File: CassandraSinkForeach.scala From Spark-Structured-Streaming-Examples with Apache License 2.0
package cassandra.foreachSink

import cassandra.CassandraDriver
import log.LazyLogger
import org.apache.spark.sql.ForeachWriter
import radio.SimpleSongAggregation

class CassandraSinkForeach() extends ForeachWriter[SimpleSongAggregation] with LazyLogger {
  private def cqlRadio(record: SimpleSongAggregation): String = s"""
       insert into ${CassandraDriver.namespace}.${CassandraDriver.foreachTableSink} (title, artist, radio, count)
       values('${record.title}', '${record.artist}', '${record.radio}', ${record.count})"""

  def open(partitionId: Long, version: Long): Boolean = {
    // open connection
    //@TODO command to check if cassandra cluster is up
    true
  }

  //https://github.com/datastax/spark-cassandra-connector/blob/master/doc/1_connecting.md#connection-pooling
  def process(record: SimpleSongAggregation) = {
    log.warn(s"Saving record: $record")
    CassandraDriver.connector.withSessionDo(session =>
      session.execute(cqlRadio(record))
    )
  }

  //https://github.com/datastax/spark-cassandra-connector/blob/master/doc/reference.md#cassandra-connection-parameters
  def close(errorOrNull: Throwable): Unit = {
    // close the connection
    //connection.keep_alive_ms --> 5000ms : Period of time to keep unused connections open
  }
}
Example 9
Source File: StructuredIdentity.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.structuredstreaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

class StructuredIdentity() extends StructuredBenchBase {

  override def process(ds: DataFrame, config: SparkBenchConfig) = {
    // Get the singleton instance of SparkSession
    val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate()
    import spark.implicits._

    val query = ds.writeStream
      .foreach(new ForeachWriter[Row] {
        var reporter: KafkaReporter = _

        def open(partitionId: Long, version: Long): Boolean = {
          val reportTopic = config.reporterTopic
          val brokerList = config.brokerList
          reporter = new KafkaReporter(reportTopic, brokerList)
          true
        }

        def close(errorOrNull: Throwable): Unit = {}

        def process(record: Row): Unit = {
          val inTime = record(0).asInstanceOf[String].toLong
          val outTime = System.currentTimeMillis()
          reporter.report(inTime, outTime)
        }
      })
      .start()

    query.awaitTermination()
  }
}
Example 10
Source File: StructuredRepartition.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.structuredstreaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

class StructuredRepartition() extends StructuredBenchBase {

  override def process(ds: DataFrame, config: SparkBenchConfig) = {
    // Get the singleton instance of SparkSession
    val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate()
    import spark.implicits._

    val results = ds.repartition(config.coreNumber)
    val query = results.writeStream
      .foreach(new ForeachWriter[Row] {
        var reporter: KafkaReporter = _

        def open(partitionId: Long, version: Long): Boolean = {
          val reportTopic = config.reporterTopic
          val brokerList = config.brokerList
          reporter = new KafkaReporter(reportTopic, brokerList)
          true
        }

        def close(errorOrNull: Throwable): Unit = {}

        def process(record: Row): Unit = {
          val inTime = record(0).asInstanceOf[String].toLong
          val outTime = System.currentTimeMillis()
          reporter.report(inTime, outTime)
        }
      })
      .start()

    query.awaitTermination()
  }
}