org.apache.spark.sql.ForeachWriter Scala Examples
The following examples show how to use org.apache.spark.sql.ForeachWriter.
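All of the examples below implement the same contract: ForeachWriter[T] is the hook behind DataStreamWriter.foreach. Structured Streaming calls open once per partition and epoch (returning false skips that partition for that epoch), process once per row, and close when the partition finishes or fails. The following is a minimal, self-contained sketch of that contract; the ForeachWriterSketch object, the ConsoleSink class, and the use of the built-in rate source are illustrative assumptions, not taken from any of the projects below.

import org.apache.spark.sql.{ForeachWriter, Row, SparkSession}

object ForeachWriterSketch {

  // Minimal sink: open acquires resources, process handles one row, close releases resources.
  class ConsoleSink extends ForeachWriter[Row] {
    def open(partitionId: Long, version: Long): Boolean = true // return false to skip this partition for this epoch
    def process(value: Row): Unit = println(value)             // invoked once per row
    def close(errorOrNull: Throwable): Unit = ()               // errorOrNull is non-null if the partition failed
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("ForeachWriterSketch").master("local[*]").getOrCreate()

    // The built-in "rate" source generates (timestamp, value) rows, used here only for demonstration.
    val df = spark.readStream.format("rate").option("rowsPerSecond", "1").load()

    val query = df.writeStream
      .foreach(new ConsoleSink) // plug the custom sink into the streaming query
      .outputMode("append")
      .start()
    query.awaitTermination()
  }
}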
Example 1
Source File: DNSstat.scala From jdbcsink with Apache License 2.0
import org.apache.spark.sql.SparkSession
import java.util.Properties
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{from_json, window}
import java.sql.{Connection, Statement, DriverManager}
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row

class JDBCSink() extends ForeachWriter[Row] {
  val driver = "com.mysql.jdbc.Driver"
  var connection: Connection = _
  var statement: Statement = _

  def open(partitionId: Long, version: Long): Boolean = {
    Class.forName(driver)
    connection = DriverManager.getConnection("jdbc:mysql://10.88.1.102:3306/aptwebservice", "root", "mysqladmin")
    statement = connection.createStatement
    true
  }

  def process(value: Row): Unit = {
    statement.executeUpdate("replace into DNSStat(ip,domain,time,count) values("
      + "'" + value.getString(0) + "'" + ","    //ip
      + "'" + value.getString(1) + "'" + ","    //domain
      + "'" + value.getTimestamp(2) + "'" + "," //time
      + value.getLong(3)                        //count
      + ")")
  }

  def close(errorOrNull: Throwable): Unit = {
    connection.close
  }
}

object DNSstatJob {

  val schema: StructType = StructType(
    Seq(
      StructField("Vendor", StringType, true),
      StructField("Id", IntegerType, true),
      StructField("Time", LongType, true),
      StructField("Conn", StructType(Seq(
        StructField("Proto", IntegerType, true),
        StructField("Sport", IntegerType, true),
        StructField("Dport", IntegerType, true),
        StructField("Sip", StringType, true),
        StructField("Dip", StringType, true)
      )), true),
      StructField("Dns", StructType(Seq(
        StructField("Domain", StringType, true),
        StructField("IpCount", IntegerType, true),
        StructField("Ip", StringType, true)
      )), true)))

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("DNSJob")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()
    import spark.implicits._

    val connectionProperties = new Properties()
    connectionProperties.put("user", "root")
    connectionProperties.put("password", "mysqladmin")
    val bruteForceTab = spark.read
      .jdbc("jdbc:mysql://10.88.1.102:3306/aptwebservice", "DNSTab", connectionProperties)
    bruteForceTab.registerTempTable("DNSTab")

    val lines = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "10.94.1.110:9092")
      .option("subscribe", "xdr")
      //.option("startingOffsets","earliest")
      .option("startingOffsets", "latest")
      .load()
      .select(from_json($"value".cast(StringType), schema).as("jsonData"))
    lines.registerTempTable("xdr")

    val filterDNS = spark.sql("select CAST(from_unixtime(xdr.jsonData.Time DIV 1000000) as timestamp) as time,xdr.jsonData.Conn.Sip as sip, xdr.jsonData.Dns.Domain from xdr inner join DNSTab on xdr.jsonData.Dns.domain = DNSTab.domain")

    val windowedCounts = filterDNS
      .withWatermark("time", "5 minutes")
      .groupBy(window($"time", "1 minutes", "1 minutes"), $"sip", $"domain")
      .count()
      .select($"sip", $"domain", $"window.start", $"count")

    val writer = new JDBCSink()
    val query = windowedCounts
      .writeStream
      .foreach(writer)
      .outputMode("update")
      .option("checkpointLocation", "/checkpoint/")
      .start()
    query.awaitTermination()
  }
}
Example 2
Source File: JDBCSink.scala From BigData-News with Apache License 2.0
package com.vita.spark

import java.sql.{Connection, ResultSet, SQLException, Statement}

import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.{ForeachWriter, Row}

/**
  * Writes rows produced by Structured Streaming into MySQL.
  */
class JDBCSink(url: String, username: String, password: String) extends ForeachWriter[Row] {

  var statement: Statement = _
  var resultSet: ResultSet = _
  var connection: Connection = _

  override def open(partitionId: Long, version: Long): Boolean = {
    connection = new MySqlPool(url, username, password).getJdbcConn()
    statement = connection.createStatement()
    print("open")
    true
  }

  override def process(value: Row): Unit = {
    println("process step one")
    val titleName = value.getAs[String]("titleName").replaceAll("[\\[\\]]", "")
    val count = value.getAs[Long]("count")

    val querySql = "select 1 from webCount where titleName = '" + titleName + "'"
    val insertSql = "insert into webCount(titleName,count) values('" + titleName + "' , '" + count + "')"
    val updateSql = "update webCount set count = " + count + " where titleName = '" + titleName + "'"

    println("process step two")
    try {
      // check whether the connection works
      val resultSet = statement.executeQuery(querySql)
      if (resultSet.next()) {
        println("updateSql")
        statement.executeUpdate(updateSql)
      } else {
        println("insertSql")
        statement.execute(insertSql)
      }
    } catch {
      case ex: SQLException => println("SQLException")
      case ex: RuntimeException => println("RuntimeException")
      case ex: Exception => println("Exception")
      case ex: Throwable => println("Throwable")
    }
  }

  override def close(errorOrNull: Throwable): Unit = {
    if (statement != null) {
      statement.close()
    }
    if (connection != null) {
      connection.close()
    }
  }
}
Example 3
Source File: RedisWriteKafkaOffset.scala From BigData-News with Apache License 2.0
package com.vita.spark.streaming.writer

import com.vita.Constants
import com.vita.redies.RedisSingle
import org.apache.spark.sql.{ForeachWriter, Row}

class RedisWriteKafkaOffset extends ForeachWriter[Row] {
  var redisSingle: RedisSingle = _

  override def open(partitionId: Long, version: Long): Boolean = {
    redisSingle = new RedisSingle()
    redisSingle.init(Constants.IP, Constants.PORT)
    true
  }

  override def process(value: Row): Unit = {
    val offset = value.getAs[String]("offset")
    redisSingle.set(Constants.REDIDS_KEY, offset)
  }

  override def close(errorOrNull: Throwable): Unit = {
    redisSingle.getJedis().close()
    redisSingle.getPool().close()
  }
}
Example 4
Source File: 4-jdbcsink.scala From Azure-Databricks-NYC-Taxi-Workshop with MIT License
// Databricks notebook source
// MAGIC %md
// MAGIC # JDBC Sink for Structured Streaming
// MAGIC Structured Streaming does not feature a JDBC sink currently.<br>
// MAGIC The following is a custom sink we will use in the lab.

// COMMAND ----------

import java.sql._
import org.apache.spark.sql.ForeachWriter

class JDBCSink(url: String, user: String, pwd: String) extends org.apache.spark.sql.ForeachWriter[org.apache.spark.sql.Row] {
  val driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
  var connection: java.sql.Connection = _
  var statement: java.sql.Statement = _

  def open(partitionId: Long, version: Long): Boolean = {
    Class.forName(driver)
    connection = java.sql.DriverManager.getConnection(url, user, pwd)
    statement = connection.createStatement
    true
  }

  def process(value: org.apache.spark.sql.Row): Unit = {
    statement.executeUpdate("INSERT INTO chicago_crimes_curated_summary(case_id, primary_type, arrest_made,case_year, case_month, case_day_of_month) VALUES ("
      + "'" + value(0) + "'" + ","
      + "'" + value(1) + "'" + ","
      + "'" + value(2) + "'" + ","
      + "'" + value(3) + "'" + ","
      + "'" + value(4) + "'" + ","
      + "'" + value(5) + "'" + ");")
  }

  def close(errorOrNull: Throwable): Unit = {
    connection.close
  }
}

// COMMAND ----------
Example 5
Source File: CassandraForeachWriter.scala From structured-streaming-application with Apache License 2.0
package knolx.spark

import com.datastax.driver.core.{Cluster, Session}
import knolx.Config.{cassandraHosts, keyspace}
import org.apache.spark.sql.{ForeachWriter, Row}

object CassandraForeachWriter extends Serializable {
  val writeToCassandra = new ForeachWriter[Row] {
    private var cluster: Cluster = _
    private var session: Session = _

    override def process(row: Row): Unit = {
      val word = row.getString(0)
      val count = row.getLong(1)
      session.execute(s"insert into $keyspace.wordcount (word, count) values ('$word', $count);")
    }

    override def close(errorOrNull: Throwable): Unit = {
      session.close()
      session.getCluster.close()
    }

    override def open(partitionId: Long, version: Long): Boolean = {
      cluster = Cluster.builder.addContactPoints(cassandraHosts).build
      session = cluster.newSession()
      true
    }
  }
}
Example 6
Source File: EventHubsForeachWriter.scala From azure-event-hubs-spark with Apache License 2.0
package org.apache.spark.sql.eventhubs

import com.microsoft.azure.eventhubs.EventData
import org.apache.spark.eventhubs.{ EventHubsConf, EventHubsUtils }
import org.apache.spark.eventhubs.client.Client
import org.apache.spark.eventhubs.utils.MetricPlugin
import org.apache.spark.internal.Logging
import org.apache.spark.sql.ForeachWriter

case class EventHubsForeachWriter(ehConf: EventHubsConf) extends ForeachWriter[String] with Logging {
  private lazy val metricPlugin: Option[MetricPlugin] = ehConf.metricPlugin()
  var client: Client = _
  var totalMessageSizeInBytes = 0
  var totalMessageCount = 0
  var writerOpenTime = 0L

  def open(partitionId: Long, version: Long): Boolean = {
    log.info(s"open is called. ${EventHubsUtils.getTaskContextSlim}")
    writerOpenTime = System.currentTimeMillis()
    client = EventHubsSourceProvider.clientFactory(ehConf.toMap)(ehConf)
    true
  }

  def process(body: String): Unit = {
    val event = EventData.create(s"$body".getBytes("UTF-8"))
    client.send(event)
    totalMessageCount += 1
    totalMessageSizeInBytes += event.getBytes.length
  }

  def close(errorOrNull: Throwable): Unit = {
    log.info(s"close is called. ${EventHubsUtils.getTaskContextSlim}")
    errorOrNull match {
      case t: Throwable =>
        log.warn(s"an error occurred. eventhub name = ${ehConf.name}, error = ${t.getMessage}")
        closeInner(false)
        throw t
      case _ =>
        closeInner(true)
    }
  }

  private def closeInner(isSuccess: Boolean): Unit = {
    var success = false
    if (client != null) {
      try {
        client.close()
        success = true
      } catch {
        case e: Exception =>
          log.warn(s"an error occurred. eventhub name = ${ehConf.name}, error = ${e.getMessage}")
          throw e
      }
      client = null
    }
    metricPlugin.foreach(
      _.onSendMetric(EventHubsUtils.getTaskContextSlim,
                     ehConf.name,
                     totalMessageCount,
                     totalMessageSizeInBytes,
                     System.currentTimeMillis() - writerOpenTime,
                     isSuccess && success))
  }
}
Example 7
Source File: KafkaContinuousSourceSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.Properties
import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._
import scala.collection.mutable
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.{StreamTest, Trigger}
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}

// Run tests in KafkaSourceSuiteBase in continuous execution mode.
class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest

class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest {
  import testImplicits._

  override val brokerProps = Map("auto.create.topics.enable" -> "false")

  test("subscribing topic by pattern with topic deletions") {
    val topicPrefix = newTopic()
    val topic = topicPrefix + "-seems"
    val topic2 = topicPrefix + "-bad"
    testUtils.createTopic(topic, partitions = 5)
    testUtils.sendMessages(topic, Array("-1"))
    require(testUtils.getLatestOffsets(Set(topic)).size === 5)

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", testUtils.brokerAddress)
      .option("kafka.metadata.max.age.ms", "1")
      .option("subscribePattern", s"$topicPrefix-.*")
      .option("failOnDataLoss", "false")

    val kafka = reader.load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]
    val mapped = kafka.map(kv => kv._2.toInt + 1)

    testStream(mapped)(
      makeSureGetOffsetCalled,
      AddKafkaData(Set(topic), 1, 2, 3),
      CheckAnswer(2, 3, 4),
      Execute { query =>
        testUtils.deleteTopic(topic)
        testUtils.createTopic(topic2, partitions = 5)
        eventually(timeout(streamingTimeout)) {
          assert(
            query.lastExecution.logical.collectFirst {
              case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
            }.exists { r =>
              // Ensure the new topic is present and the old topic is gone.
              r.knownPartitions.exists(_.topic == topic2)
            },
            s"query never reconfigured to new topic $topic2")
        }
      },
      AddKafkaData(Set(topic2), 4, 5, 6),
      CheckAnswer(2, 3, 4, 5, 6, 7)
    )
  }
}

class KafkaContinuousSourceStressForDontFailOnDataLossSuite
    extends KafkaSourceStressForDontFailOnDataLossSuite {
  override protected def startStream(ds: Dataset[Int]) = {
    ds.writeStream
      .format("memory")
      .queryName("memory")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
}
Example 8
Source File: CassandraSinkForeach.scala From Spark-Structured-Streaming-Examples with Apache License 2.0
package cassandra.foreachSink

import cassandra.CassandraDriver
import log.LazyLogger
import org.apache.spark.sql.ForeachWriter
import radio.SimpleSongAggregation

class CassandraSinkForeach() extends ForeachWriter[SimpleSongAggregation] with LazyLogger {
  private def cqlRadio(record: SimpleSongAggregation): String = s"""
       insert into ${CassandraDriver.namespace}.${CassandraDriver.foreachTableSink} (title, artist, radio, count)
       values('${record.title}', '${record.artist}', '${record.radio}', ${record.count})"""

  def open(partitionId: Long, version: Long): Boolean = {
    // open connection
    //@TODO command to check if cassandra cluster is up
    true
  }

  //https://github.com/datastax/spark-cassandra-connector/blob/master/doc/1_connecting.md#connection-pooling
  def process(record: SimpleSongAggregation) = {
    log.warn(s"Saving record: $record")
    CassandraDriver.connector.withSessionDo(session =>
      session.execute(cqlRadio(record))
    )
  }

  //https://github.com/datastax/spark-cassandra-connector/blob/master/doc/reference.md#cassandra-connection-parameters
  def close(errorOrNull: Throwable): Unit = {
    // close the connection
    //connection.keep_alive_ms --> 5000ms : Period of time to keep unused connections open
  }
}
Example 9
Source File: StructuredIdentity.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.structuredstreaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

class StructuredIdentity() extends StructuredBenchBase {

  override def process(ds: DataFrame, config: SparkBenchConfig) = {
    // Get the singleton instance of SparkSession
    val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate()
    import spark.implicits._

    val query = ds.writeStream
      .foreach(new ForeachWriter[Row] {
        var reporter: KafkaReporter = _

        def open(partitionId: Long, version: Long): Boolean = {
          val reportTopic = config.reporterTopic
          val brokerList = config.brokerList
          reporter = new KafkaReporter(reportTopic, brokerList)
          true
        }

        def close(errorOrNull: Throwable): Unit = {}

        def process(record: Row): Unit = {
          val inTime = record(0).asInstanceOf[String].toLong
          val outTime = System.currentTimeMillis()
          reporter.report(inTime, outTime)
        }
      })
      .start()

    query.awaitTermination()
  }
}
Example 10
Source File: StructuredRepartition.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.structuredstreaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

class StructuredRepartition() extends StructuredBenchBase {

  override def process(ds: DataFrame, config: SparkBenchConfig) = {
    // Get the singleton instance of SparkSession
    val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate()
    import spark.implicits._

    val results = ds.repartition(config.coreNumber)
    val query = results.writeStream
      .foreach(new ForeachWriter[Row] {
        var reporter: KafkaReporter = _

        def open(partitionId: Long, version: Long): Boolean = {
          val reportTopic = config.reporterTopic
          val brokerList = config.brokerList
          reporter = new KafkaReporter(reportTopic, brokerList)
          true
        }

        def close(errorOrNull: Throwable): Unit = {}

        def process(record: Row): Unit = {
          val inTime = record(0).asInstanceOf[String].toLong
          val outTime = System.currentTimeMillis()
          reporter.report(inTime, outTime)
        }
      })
      .start()

    query.awaitTermination()
  }
}