kafka.serializer.StringDecoder Scala Examples
The following examples show how to use kafka.serializer.StringDecoder.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: L5-15KafkaDirect.scala From prosparkstreaming with Apache License 2.0 | 6 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

/**
 * Streaming job that reads comma-separated records from a Kafka direct stream,
 * counts occurrences of each (field 3, field 7) pair per 10-second batch and
 * writes the counts, sorted in descending order, as text files.
 */
object StationJourneyCountDirectApp {

  /**
   * @param args exactly seven values: appname, brokerUrl, topic, consumerGroupId,
   *             zkQuorum, checkpointDir, outputPath
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }
    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val sparkConf = new SparkConf()
    sparkConf.setAppName(appName)
    sparkConf.setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(sparkConf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val kafkaParams = Map[String, String](
      "zookeeper.connect" -> zkQuorum,
      "group.id" -> consumerGroupId,
      "bootstrap.servers" -> brokerUrl)

    // Only the message value is used; the Kafka key is dropped.
    val records = KafkaUtils
      .createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, Set(topic))
      .map { case (_, value) => value }

    val pairCounts = records
      .map { line =>
        val fields = line.split(",")
        ((fields(3), fields(7)), 1)
      }
      .reduceByKey(_ + _)

    // Single partition so each batch is sorted globally; key by count to sort on it.
    pairCounts
      .repartition(1)
      .map(_.swap)
      .transform(_.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 2
Source File: KafkaStreamSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import scala.collection.mutable
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Random

import kafka.serializer.StringDecoder
import org.scalatest.BeforeAndAfterAll
import org.scalatest.concurrent.Eventually

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

/**
 * Checks that the stream built by `KafkaUtils.createStream` eventually yields
 * exactly the per-value message counts that were sent to the test topic.
 */
class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll {
  private var ssc: StreamingContext = _
  private var kafkaTestUtils: KafkaTestUtils = _

  /** Creates and sets up the Kafka test utilities once for the whole suite. */
  override def beforeAll(): Unit = {
    kafkaTestUtils = new KafkaTestUtils
    kafkaTestUtils.setup()
  }

  /** Stops the streaming context (if one was created) and tears down the Kafka test utilities. */
  override def afterAll(): Unit = {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }

    if (kafkaTestUtils != null) {
      kafkaTestUtils.teardown()
      kafkaTestUtils = null
    }
  }

  test("Kafka input stream") {
    val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName)
    ssc = new StreamingContext(sparkConf, Milliseconds(500))
    val topic = "topic1"
    val sent = Map("a" -> 5, "b" -> 3, "c" -> 10)
    kafkaTestUtils.createTopic(topic)
    kafkaTestUtils.sendMessages(topic, sent)

    val kafkaParams = Map("zookeeper.connect" -> kafkaTestUtils.zkAddress,
      "group.id" -> s"test-consumer-${Random.nextInt(10000)}",
      "auto.offset.reset" -> "smallest")

    val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY)

    // Plain HashMap guarded by its own monitor. The deprecated SynchronizedMap
    // mixin only locked individual calls, so the read-then-write below was not
    // atomic with respect to the polling assertion.
    val result = new mutable.HashMap[String, Long]()
    stream.map(_._2).countByValue().foreachRDD { r =>
      r.collect().foreach { kv =>
        result.synchronized {
          val count = result.getOrElseUpdate(kv._1, 0) + kv._2
          result.put(kv._1, count)
        }
      }
    }

    ssc.start()

    // Poll under the same lock until the accumulated counts match what was sent.
    eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
      assert(result.synchronized { sent === result })
    }
  }
}
Example 3
Source File: DirectKafkaWordCount.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import kafka.serializer.StringDecoder

import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkConf

/**
 * Consumes messages from one or more Kafka topics through a direct stream and
 * prints a word count for every 2-second batch.
 */
object DirectKafkaWordCount {

  /**
   * @param args `<brokers> <topics>`: a Kafka broker list and a comma-separated
   *             topic list; any additional arguments are ignored
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println(s"""
        |Usage: DirectKafkaWordCount <brokers> <topics>
        |  <brokers> is a list of one or more Kafka brokers
        |  <topics> is a list of one or more kafka topics to consume from
        |
        """.stripMargin)
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // `_*` keeps the destructuring in line with the `< 2` guard above: a plain
    // two-element pattern would throw MatchError when extra arguments are passed.
    val Array(brokers, topics, _*) = args

    // Create context with 2 second batch interval
    val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    // Get the lines, split them into words, count the words and print
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 4
Source File: KafkaStreamSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import scala.collection.mutable
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Random

import kafka.serializer.StringDecoder
import org.scalatest.BeforeAndAfterAll
import org.scalatest.concurrent.Eventually

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

/**
 * Sends a known set of messages to a test topic and asserts that the stream
 * created by `KafkaUtils.createStream` eventually reports the same counts.
 */
class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll {
  private var ssc: StreamingContext = _
  private var kafkaTestUtils: KafkaTestUtils = _

  /** Suite-level setup: instantiate the Kafka test utilities and call `setup()`. */
  override def beforeAll(): Unit = {
    kafkaTestUtils = new KafkaTestUtils
    kafkaTestUtils.setup()
  }

  /** Suite-level cleanup: stop the streaming context and tear the Kafka test utilities down. */
  override def afterAll(): Unit = {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }

    if (kafkaTestUtils != null) {
      kafkaTestUtils.teardown()
      kafkaTestUtils = null
    }
  }

  test("Kafka input stream") {
    val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName)
    ssc = new StreamingContext(sparkConf, Milliseconds(500))
    val topic = "topic1"
    val sent = Map("a" -> 5, "b" -> 3, "c" -> 10)
    kafkaTestUtils.createTopic(topic)
    kafkaTestUtils.sendMessages(topic, sent)

    val kafkaParams = Map("zookeeper.connect" -> kafkaTestUtils.zkAddress,
      "group.id" -> s"test-consumer-${Random.nextInt(10000)}",
      "auto.offset.reset" -> "smallest")

    val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY)

    // The deprecated `mutable.SynchronizedMap` mixin is replaced by an ordinary
    // HashMap locked explicitly, so that the get-then-put update is one critical
    // section instead of two independently synchronized calls.
    val result = new mutable.HashMap[String, Long]()
    stream.map(_._2).countByValue().foreachRDD { r =>
      r.collect().foreach { kv =>
        result.synchronized {
          val count = result.getOrElseUpdate(kv._1, 0) + kv._2
          result.put(kv._1, count)
        }
      }
    }

    ssc.start()

    // Read `result` under the same lock that guards the updates.
    eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
      assert(result.synchronized { sent === result })
    }
  }
}
Example 5
Source File: WeatherDataStream.scala From spark-scala with Creative Commons Zero v1.0 Universal | 5 votes |
package com.supergloo import com.killrweather.data.Weather.RawWeatherData import kafka.serializer.StringDecoder import org.apache.log4j.Logger import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.dstream.{DStream, InputDStream} import org.apache.spark.streaming.kafka.KafkaUtils parsedWeatherStream.map { weather => (weather.wsid, weather.year, weather.month, weather.day, weather.oneHourPrecip) }.saveToCassandra(CassandraKeyspace, CassandraTableDailyPrecip) } def ingestStream(rawWeatherStream: InputDStream[(String, String)]): DStream[RawWeatherData] = { val parsedWeatherStream = rawWeatherStream.map(_._2.split(",")) .map(RawWeatherData(_)) parsedWeatherStream } }
Example 6
Source File: DirectKafkaWordCount.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import kafka.serializer.StringDecoder import org.apache.spark.SparkConf import org.apache.spark.streaming._ import org.apache.spark.streaming.kafka._ StreamingExamples.setStreamingLogLevels() //数组赋值 val Array(brokers, topics) = Array("localhost:9092","topic1")//args // Create context with 2 second batch interval val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount").setMaster("local") val ssc = new StreamingContext(sparkConf, Seconds(2)) // Create direct kafka stream with brokers and topics //创建具有brokers和topics主题的kafka直接流 val topicsSet = topics.split(",").toSet val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers) val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( ssc, kafkaParams, topicsSet) // Get the lines, split them into words, count the words and print //获取行,将它们分割成单词,计算单词和打印 val lines = messages.map(_._2) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _) wordCounts.print() // Start the computation ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 7
Source File: KafkaStreamSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import scala.collection.mutable
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Random

import kafka.serializer.StringDecoder
import org.scalatest.BeforeAndAfterAll
import org.scalatest.concurrent.Eventually

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

/**
 * Test suite for the stream returned by `KafkaUtils.createStream`: messages sent
 * to a topic must eventually be counted with the same multiplicities.
 */
class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll {
  private var ssc: StreamingContext = _
  private var kafkaTestUtils: KafkaTestUtils = _

  /** Runs once before all tests: builds the Kafka test utilities and sets them up. */
  override def beforeAll(): Unit = {
    kafkaTestUtils = new KafkaTestUtils
    kafkaTestUtils.setup()
  }

  /** Runs once after all tests: stops the streaming context, then tears down the Kafka test utilities. */
  override def afterAll(): Unit = {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }

    if (kafkaTestUtils != null) {
      kafkaTestUtils.teardown()
      kafkaTestUtils = null
    }
  }

  // Kafka input stream
  test("Kafka input stream") {
    val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName)
    ssc = new StreamingContext(sparkConf, Milliseconds(500))
    val topic = "topic1"
    val sent = Map("a" -> 5, "b" -> 3, "c" -> 10)
    kafkaTestUtils.createTopic(topic)
    kafkaTestUtils.sendMessages(topic, sent)

    val kafkaParams = Map("zookeeper.connect" -> kafkaTestUtils.zkAddress,
      "group.id" -> s"test-consumer-${Random.nextInt(10000)}",
      "auto.offset.reset" -> "smallest")

    val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY)

    // Explicit locking replaces the deprecated SynchronizedMap mixin: the mixin
    // synchronized each call separately, leaving the compound update racy.
    val result = new mutable.HashMap[String, Long]()
    stream.map(_._2).countByValue().foreachRDD { r =>
      r.collect().foreach { kv =>
        result.synchronized {
          val count = result.getOrElseUpdate(kv._1, 0) + kv._2
          result.put(kv._1, count)
        }
      }
    }

    ssc.start()

    // Compare under the lock so the assertion never observes a half-applied update.
    eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
      assert(result.synchronized { sent === result })
    }
  }
}
Example 8
Source File: KafkaStreamSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import scala.collection.mutable
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Random

import kafka.serializer.StringDecoder
import org.scalatest.BeforeAndAfterAll
import org.scalatest.concurrent.Eventually

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

/**
 * Exercises `KafkaUtils.createStream`: a fixed set of messages is sent to a topic
 * and the test waits until the streamed per-value counts equal what was sent.
 */
class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll {
  private var ssc: StreamingContext = _
  private var kafkaTestUtils: KafkaTestUtils = _

  /** Prepares the shared Kafka test utilities before any test runs. */
  override def beforeAll(): Unit = {
    kafkaTestUtils = new KafkaTestUtils
    kafkaTestUtils.setup()
  }

  /** Releases the streaming context and the Kafka test utilities, clearing both references. */
  override def afterAll(): Unit = {
    Option(ssc).foreach { context =>
      context.stop()
      ssc = null
    }
    Option(kafkaTestUtils).foreach { utils =>
      utils.teardown()
      kafkaTestUtils = null
    }
  }

  test("Kafka input stream") {
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName(this.getClass.getSimpleName)
    ssc = new StreamingContext(conf, Milliseconds(500))

    val topic = "topic1"
    val sent = Map("a" -> 5, "b" -> 3, "c" -> 10)
    kafkaTestUtils.createTopic(topic)
    kafkaTestUtils.sendMessages(topic, sent)

    val kafkaParams = Map(
      "zookeeper.connect" -> kafkaTestUtils.zkAddress,
      "group.id" -> s"test-consumer-${Random.nextInt(10000)}",
      "auto.offset.reset" -> "smallest")

    val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY)

    // Running totals per message value; every access goes through result's monitor.
    val result = new mutable.HashMap[String, Long]()
    stream.map(_._2).countByValue().foreachRDD { rdd =>
      rdd.collect().foreach { case (value, batchCount) =>
        result.synchronized {
          result.put(value, result.getOrElseUpdate(value, 0) + batchCount)
        }
      }
    }

    ssc.start()

    eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
      assert(result.synchronized { sent === result })
    }
    ssc.stop()
  }
}
Example 9
Source File: DirectKafkaWordCount.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import kafka.serializer.StringDecoder

import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkConf

/**
 * Word count over a Kafka direct stream: reads message values from the given
 * topics and prints per-word counts for each 2-second batch.
 */
object DirectKafkaWordCount {

  /**
   * @param args `<brokers> <topics>`; arguments beyond the first two are ignored
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println(s"""
        |Usage: DirectKafkaWordCount <brokers> <topics>
        |  <brokers> is a list of one or more Kafka brokers
        |  <topics> is a list of one or more kafka topics to consume from
        |
        """.stripMargin)
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // The guard above accepts two or more arguments, so the pattern must as well;
    // without `_*` a third argument causes a MatchError here.
    val Array(brokers, topics, _*) = args

    // Create context with 2 second batch interval
    val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    // Get the lines, split them into words, count the words and print
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 10
Source File: DirectKafkaWordCount.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import kafka.serializer.StringDecoder

import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkConf

/**
 * Example application: counts words in the values of messages read from Kafka
 * via `KafkaUtils.createDirectStream`, printing the counts of each 2-second batch.
 */
object DirectKafkaWordCount {

  /**
   * @param args broker list followed by a comma-separated topic list; further
   *             arguments are accepted and ignored
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println(s"""
        |Usage: DirectKafkaWordCount <brokers> <topics>
        |  <brokers> is a list of one or more Kafka brokers
        |  <topics> is a list of one or more kafka topics to consume from
        |
        """.stripMargin)
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // Tolerate trailing arguments: an exact two-element pattern would fail with
    // MatchError for any call that passed the `args.length < 2` check with 3+ args.
    val Array(brokers, topics, _*) = args

    // Create context with 2 second batch interval
    val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    // Get the lines, split them into words, count the words and print
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 11
Source File: SparkJob.scala From intro-to-dcos with Apache License 2.0 | 5 votes |
package de.codecentric.dcos_intro.spark

import de.codecentric.dcos_intro.{Tweet, TweetDecoder}
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import com.datastax.spark.connector.streaming._

/**
 * Streams `Tweet` values from a Kafka topic and saves them to the Cassandra
 * table `dcos.tweets`, using 1-second batches.
 */
object SparkJob {

  /**
   * @param args positional, as read below: args(0) topic to consume, args(1) and
   *             args(2) the Cassandra connection host and port, args(3) the Kafka
   *             bootstrap servers
   */
  def main(args: Array[String]): Unit = {
    val consumerTopic = args(0)

    val sparkConf = new SparkConf()
    sparkConf.setAppName(getClass.getName)
    sparkConf.set("spark.cassandra.connection.host", s"${args(1)}")
    sparkConf.set("spark.cassandra.connection.port", s"${args(2)}")

    val kafkaParams = Map(
      "bootstrap.servers" -> args(3),
      "auto.offset.reset" -> "smallest")

    val ssc = new StreamingContext(sparkConf, Seconds(1))

    val tweets = KafkaUtils
      .createDirectStream[String, Tweet, StringDecoder, TweetDecoder](ssc, kafkaParams, Set(consumerTopic))
      .map(_._2) // keep the decoded tweet, drop the message key

    tweets.saveToCassandra("dcos", "tweets")

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}
Example 12
Source File: DirectKafkaDataVec.scala From Hands-On-Deep-Learning-with-Apache-Spark with MIT License | 5 votes |
package org.googlielmo.sparkstreamingkafka

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.datavec.api.records.reader.impl.csv.CSVRecordReader
import org.datavec.api.transform.TransformProcess
import org.datavec.api.transform.schema.Schema
import org.datavec.spark.transform.SparkTransformExecutor
import org.datavec.spark.transform.misc.{StringToWritablesFunction, WritablesToStringFunction}

import scala.collection.JavaConverters._

/**
 * Reads CSV lines from Kafka through a direct stream, applies a DataVec
 * `TransformProcess` that removes the "notes" column, and prints both the
 * original and the transformed records of every non-empty 5-second batch.
 */
object DirectKafkaDataVec {

  /**
   * @param args `<spark_master> <brokers> <topics>`; arguments beyond the first
   *             three are ignored
   */
  def main(args: Array[String]): Unit = {
    // Check for the correct number of arguments
    if (args.length < 3) {
      // The usage text names this program (it previously said DirectKafkaWordCount).
      System.err.println(s"""
        |Usage: DirectKafkaDataVec <spark_master> <brokers> <topics>
        |  <spark_master> is the Spark master URL
        |  <brokers> is a list of one or more Kafka brokers
        |  <topics> is a list of one or more kafka topics to consume from
        |
        """.stripMargin)
      System.exit(1)
    }

    // `_*` matches the `< 3` guard: extra arguments no longer raise a MatchError.
    val Array(master, brokers, topics, _*) = args

    // Define the input data schema
    val inputDataSchema = new Schema.Builder()
      .addColumnsString("id", "description", "notes")
      .build
    println(inputDataSchema)

    // Define some transformation (remove some columns)
    val tp = new TransformProcess.Builder(inputDataSchema)
      .removeColumns("notes")
      .build

    // Get and then print the new schema (after the transformations)
    val outputSchema = tp.getFinalSchema
    println("\n\n\nSchema after transforming data:")
    println(outputSchema)

    // Create a streaming context with 5 seconds batch interval
    val sparkConf = new SparkConf().setAppName("DirectKafkaDataVec").setMaster(master)
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    ssc.sparkContext.setLogLevel("WARN")

    // Create a direct Kafka stream
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    // Get the lines
    val lines = messages.map(_._2)

    lines.foreachRDD { rdd =>
      val javaRdd = rdd.toJavaRDD
      val rr = new CSVRecordReader
      val parsedInputData = javaRdd.map(new StringToWritablesFunction(rr))

      // Skip batches that carried no records
      if (!parsedInputData.isEmpty) {
        val processedData = SparkTransformExecutor.execute(parsedInputData, tp)

        // Collect the data locally and print it
        val processedAsString = processedData.map(new WritablesToStringFunction(","))
        val processedCollected = processedAsString.collect
        val inputDataCollected = javaRdd.collect

        println("\n\n---- Original Data ----")
        for (s <- inputDataCollected.asScala) println(s)

        println("\n\n---- Processed Data ----")
        for (s <- processedCollected.asScala) println(s)
      }
    }

    // Start the computation and keep it alive waiting for a termination signal
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 13
Source File: DirectKafkaWordCount.scala From Hands-On-Deep-Learning-with-Apache-Spark with MIT License | 5 votes |
package org.googlielmo.sparkstreamingkafka

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._

/**
 * Word count over a Kafka direct stream, run against a caller-supplied Spark
 * master with 5-second batches; counts are printed to the console.
 */
object DirectKafkaWordCount {

  /**
   * @param args `<spark_master> <brokers> <topics>`; arguments beyond the first
   *             three are ignored
   */
  def main(args: Array[String]): Unit = {
    // Check for the correct number of arguments
    if (args.length < 3) {
      System.err.println(s"""
        |Usage: DirectKafkaWordCount <spark_master> <brokers> <topics>
        |  <spark_master> is the Spark Master URL
        |  <brokers> is a list of one or more Kafka brokers
        |  <topics> is a list of one or more kafka topics to consume from
        |
        """.stripMargin)
      System.exit(1)
    }

    // The check above lets 3+ arguments through; `_*` keeps this destructuring
    // from throwing MatchError when more than three are supplied.
    val Array(master, brokers, topics, _*) = args

    // Create a streaming context with 5 seconds batch interval
    val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount").setMaster(master)
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    ssc.sparkContext.setLogLevel("WARN")

    // Create a direct Kafka stream
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    // Get the lines, split them into words and then count the words and print to console
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation and keep it alive waiting for a termination signal
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 14
Source File: DirectKafkaWordCount.scala From awesome-recommendation-engine with Apache License 2.0 | 5 votes |
package example.spark

import kafka.serializer.StringDecoder
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkConf

/**
 * Word count over a Kafka direct stream, run on a local 4-thread Spark master
 * with 2-second batches.
 */
object DirectKafkaWordCount {

  /**
   * @param args `<brokers> <topics>`; arguments beyond the first two are ignored
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println(s"""
        |Usage: DirectKafkaWordCount <brokers> <topics>
        |  <brokers> is a list of one or more Kafka brokers
        |  <topics> is a list of one or more kafka topics to consume from
        |
        """.stripMargin)
      System.exit(1)
    }

    // Accept trailing arguments like the guard does; the exact two-element
    // pattern used before threw MatchError when a third argument was present.
    val Array(brokers, topics, _*) = args

    // Create context with 2 second batch interval
    // no need to create spark context...
    val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount").setMaster("local[4]")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages =
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    // Get the lines, split them into words, count the words and print
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 15
Source File: KafkaStreamingWC.scala From kafka-scala-api with Apache License 2.0 | 5 votes |
package com.example.kafka08

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

import com.example._

/** Word count over a Kafka direct stream with a hard-coded broker and topic. */
object KafkaStreamingWC {

  def main(args: Array[String]): Unit = {
    kafkaStreamingWC
  }

  /**
   * Hands the word-count pipeline to `launch` together with the literals
   * "Kafka08Streaming" and "checkpointing".
   * NOTE(review): `launch` comes from `com.example._`; presumably it builds and
   * starts the StreamingContext — confirm against its definition.
   */
  def kafkaStreamingWC = launch(kafka08StreamingWC, "Kafka08Streaming", "checkpointing")

  /**
   * Wires the word count onto the given context: message values are split on
   * spaces, counted per word and printed.
   *
   * @param ssc streaming context the direct stream is created on
   */
  def kafka08StreamingWC(ssc: StreamingContext) = {
    val brokers = "127.0.0.1:9092"
    val topics = "sample_topic"

    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val topicsSet = topics.split(",").toSet

    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    val wordCounts = messages
      .map(_._2)
      .flatMap(_.split(" "))
      .map(word => (word, 1L))
      .reduceByKey(_ + _)

    wordCounts.print()
  }
}
Example 16
Source File: StreamingJob.scala From confluent-platform-spark-streaming with Apache License 2.0 | 5 votes |
package example

import com.typesafe.config.ConfigFactory
import io.confluent.kafka.serializers.KafkaAvroDecoder
import kafka.serializer.StringDecoder
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkContext, SparkConf}

/**
 * Streaming job that reads Avro-decoded values from a Kafka topic, renders each
 * value with `toString`, loads the strings as JSON into a DataFrame and prints
 * its schema, row count and first row for every batch.
 *
 * NOTE(review): the body runs through the `App` trait, so the statements below
 * execute in declaration order during delayed initialization; keep that order.
 */
object StreamingJob extends App {

  // Job configuration and logging
  val config = ConfigFactory.load()
  Logger.getLogger("example").setLevel(Level.toLevel(config.getString("loglevel")))
  private val logger = Logger.getLogger(getClass)

  // Spark configuration and contexts
  val sparkMaster = config.getString("spark.master")
  val sparkConf = new SparkConf()
    .setMaster(sparkMaster)
    .setAppName("StreamingExample")
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  val sc = new SparkContext(sparkConf)

  val batchInterval = config.getInt("spark.batch.interval")
  val ssc = new StreamingContext(sc, Seconds(batchInterval))

  // Kafka direct stream: String keys, Avro-decoded values
  val groupId = config.getString("kafka.group.id")
  val topic = config.getString("topic")
  val kafkaParams = Map(
    "bootstrap.servers" -> config.getString("kafka.bootstrap.servers"),
    "schema.registry.url" -> config.getString("kafka.schema.registry.url"),
    "group.id" -> groupId
  )

  @transient val kafkaStream: DStream[(String, Object)] =
    KafkaUtils.createDirectStream[String, Object, StringDecoder, KafkaAvroDecoder](
      ssc, kafkaParams, Set(topic))

  // Turn every batch into a DataFrame by parsing the values' string form as JSON
  kafkaStream.foreachRDD { batch =>
    // Singleton SQLContext bound to this batch's SparkContext
    val sqlContext = SQLContext.getOrCreate(batch.sparkContext)
    import sqlContext.implicits._

    val jsonValues = batch.map { case (_, value) => value.toString }
    val frame = sqlContext.read.json(jsonValues)

    frame.printSchema()
    println("DataFrame count: " + frame.count())
    frame.take(1).foreach(row => println(row))
  }

  ssc.start()
  ssc.awaitTermination()
}
Example 17
Source File: SparkStreamingTaxiTripToHBase.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.streaming.ingestion.hbase

import java.io.File

import com.cloudera.sa.taxi360.model.NyTaxiYellowTripBuilder
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.solr.common.cloud.ZooKeeperException
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Reads comma-separated taxi trip records from Kafka through a direct stream,
 * builds trip objects with `NyTaxiYellowTripBuilder` and bulk-puts them into an
 * HBase table.
 */
object SparkStreamingTaxiTripToHBase {

  /**
   * @param args eight positional values: KafkaBrokerList, kafkaTopicList,
   *             numberOfSeconds, runLocal ("l" selects a local[2] context),
   *             hbaseTable, numOfSalts, checkpointDir, hbaseConfigFolder
   */
  def main(args: Array[String]): Unit = {
    println("Java Version:" + System.getProperty("java.version"))
    println("Java Home:" + System.getProperties().getProperty("java.home"))

    // NOTE(review): unused local kept from the original; presumably it exists
    // only to reference ZooKeeperException — confirm before removing.
    val v:ZooKeeperException = null

    // All of args(0)..args(7) are read below, so anything shorter than eight
    // arguments gets the usage text (the old check only caught the empty case
    // and let 1-7 arguments fail with ArrayIndexOutOfBoundsException).
    if (args.length < 8) {
      println("Args: <KafkaBrokerList> " +
        "<kafkaTopicList> " +
        "<numberOfSeconds>" +
        "<runLocal>" +
        "<hbaseTable>" +
        "<numOfSalts>" +
        "<checkpointDir>" +
        "<hbaseConfigFolder>")
    } else {
      runIngestion(args)
    }
  }

  /** Parses the validated arguments, builds the streaming pipeline and blocks until termination. */
  private def runIngestion(args: Array[String]): Unit = {
    val kafkaBrokerList = args(0)
    val kafkaTopicList = args(1)
    val numberOfSeconds = args(2).toInt
    val runLocal = args(3).equals("l")
    val tableName = args(4)
    val numOfSalts = args(5).toInt
    val checkpointFolder = args(6)
    val hbaseConfigFolder = args(7)

    println("kafkaBrokerList:" + kafkaBrokerList)
    println("kafkaTopicList:" + kafkaTopicList)
    println("numberOfSeconds:" + numberOfSeconds)
    println("runLocal:" + runLocal)
    println("tableName:" + tableName)
    println("numOfSalts:" + numOfSalts)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConf = new SparkConf().setAppName("Spark Streaming Ingestion to HBase")
      new SparkContext(sparkConf)
    }
    val ssc = new StreamingContext(sc, Seconds(numberOfSeconds))

    val topicsSet = kafkaTopicList.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList)

    val messageStream = KafkaUtils.
      createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    val conf = HBaseConfiguration.create()
    // File(parent, child) inserts the separator when the folder argument has no
    // trailing slash; plain concatenation produced ".../confhbase-site.xml".
    conf.addResource(new File(hbaseConfigFolder, "hbase-site.xml").toURI.toURL)

    val hbaseContext = new HBaseContext(sc, conf)

    // Split each value on commas and keep only records with more than 3 fields.
    val tripDStream = messageStream
      .map { case (key, value) => (key, value.split(",")) }
      .filter { case (_, fields) => fields.size > 3 }
      .map { case (key, fields) => (key, NyTaxiYellowTripBuilder.build(fields)) }

    tripDStream.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName), taxi => {
      TaxiTripHBaseHelper.generatePut(taxi._2, numOfSalts)
    })

    ssc.checkpoint(checkpointFolder)
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 18
Source File: KafkaStreamSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import scala.collection.mutable
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Random

import kafka.serializer.StringDecoder
import org.scalatest.BeforeAndAfterAll
import org.scalatest.concurrent.Eventually

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

/**
 * Sends known message counts to a topic and waits until the stream created by
 * `KafkaUtils.createStream` has reported the same counts.
 */
class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll {
  private var ssc: StreamingContext = _
  private var kafkaTestUtils: KafkaTestUtils = _

  /** One-time setup of the Kafka test utilities. */
  override def beforeAll(): Unit = {
    kafkaTestUtils = new KafkaTestUtils
    kafkaTestUtils.setup()
  }

  /** One-time cleanup: stop the context if present, then tear the utilities down. */
  override def afterAll(): Unit = {
    Option(ssc).foreach { context =>
      context.stop()
      ssc = null
    }
    Option(kafkaTestUtils).foreach { utils =>
      utils.teardown()
      kafkaTestUtils = null
    }
  }

  test("Kafka input stream") {
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName(this.getClass.getSimpleName)
    ssc = new StreamingContext(conf, Milliseconds(500))

    val topic = "topic1"
    val sent = Map("a" -> 5, "b" -> 3, "c" -> 10)
    kafkaTestUtils.createTopic(topic)
    kafkaTestUtils.sendMessages(topic, sent)

    val kafkaParams = Map(
      "zookeeper.connect" -> kafkaTestUtils.zkAddress,
      "group.id" -> s"test-consumer-${Random.nextInt(10000)}",
      "auto.offset.reset" -> "smallest")

    val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY)

    // Accumulated count per message value, guarded by result's own monitor.
    val result = new mutable.HashMap[String, Long]()
    stream.map(_._2).countByValue().foreachRDD { rdd =>
      for ((value, batchCount) <- rdd.collect()) {
        result.synchronized {
          result.put(value, result.getOrElseUpdate(value, 0) + batchCount)
        }
      }
    }

    ssc.start()

    eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
      assert(result.synchronized { sent === result })
    }
  }
}
Example 19
Source File: StreamingKafka8.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark

import kafka.serializer.StringDecoder
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Word count over the "weblogs" Kafka topic using the direct (receiver-less)
 * stream, printed every 5-second batch.
 *
 * NOTE(review): `main` is declared on a `class`, not an `object`, so it is an
 * instance method and cannot be used directly as a JVM entry point.
 */
class StreamingKafka8 {

  /** Builds the streaming pipeline, starts it, and blocks until termination. */
  def main(args: Array[String]): Unit = {
    val session = SparkSession
      .builder()
      .master("local[2]")
      .appName("streaming")
      .getOrCreate()
    val streamingContext = new StreamingContext(session.sparkContext, Seconds(5))

    // Direct Kafka stream: fixed broker and topic.
    val brokerConfig = Map[String, String]("metadata.broker.list" -> "node5:9092")
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      streamingContext, brokerConfig, Set("weblogs"))

    // Keep only the message value, split on single spaces, count per word.
    messages
      .map { case (_, payload) => payload }
      .flatMap(_.split(" "))
      .map(word => (word, 1L))
      .reduceByKey(_ + _)
      .print()

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
Example 20
Source File: SimpleConsumer.scala From embedded-kafka with Apache License 2.0 | 5 votes |
package com.tuplejump.embedded.kafka

import java.util.Properties
import java.util.concurrent.{CountDownLatch, Executors}

import scala.util.Try

import kafka.serializer.StringDecoder
import kafka.consumer.{ Consumer, ConsumerConfig }

/**
 * Minimal high-level Kafka consumer that counts `latch` down once for every
 * message it receives on `topic`.
 *
 * @param latch          counted down once per consumed message
 * @param consumerConfig consumer properties passed to [[ConsumerConfig]]
 * @param topic          topic to subscribe to
 * @param groupId        not read by this class; callers are expected to put
 *                       the group id into `consumerConfig`
 * @param partitions     number of message streams requested for `topic`
 * @param numThreads     size of the thread pool that drains the streams
 */
class SimpleConsumer(
    val latch: CountDownLatch,
    consumerConfig: Map[String, String],
    topic: String,
    groupId: String,
    partitions: Int,
    numThreads: Int) {

  val connector = Consumer.create(createConsumerConfig)

  val streams = connector
    .createMessageStreams(Map(topic -> partitions), new StringDecoder(), new StringDecoder())
    .get(topic)

  val executor = Executors.newFixedThreadPool(numThreads)

  // One task per message stream, so that a blocking stream cannot prevent the
  // remaining streams from ever being read.
  for {
    topicStreams <- streams
    stream <- topicStreams
  } {
    executor.submit(new Runnable() {
      def run(): Unit = {
        val messages = stream.iterator
        while (messages.hasNext) {
          // The message must be taken off the iterator; otherwise hasNext keeps
          // reporting the same message and the loop spins without consuming.
          messages.next()
          latch.countDown()
        }
      }
    })
  }

  /** Copies `consumerConfig` into a `Properties` and wraps it in a [[ConsumerConfig]]. */
  private def createConsumerConfig: ConsumerConfig = {
    val props = new Properties()
    consumerConfig.foreach { case (key, value) => props.setProperty(key, value) }
    new ConsumerConfig(props)
  }

  /**
   * Best-effort shutdown: failures are ignored, but the two steps are attempted
   * independently so a failing connector shutdown does not leak the thread pool.
   */
  def shutdown(): Unit = {
    Try(connector.shutdown())
    Try(executor.shutdown())
    ()
  }
}
Example 21
Source File: KafkaStreamSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import scala.collection.mutable
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Random

import kafka.serializer.StringDecoder
import org.scalatest.BeforeAndAfterAll
import org.scalatest.concurrent.Eventually

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

/**
 * Sends a known multiset of messages to a test topic, reads them back through
 * `KafkaUtils.createStream`, and checks that the per-value counts match.
 */
class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll {
  private var ssc: StreamingContext = _
  private var kafkaTestUtils: KafkaTestUtils = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    kafkaTestUtils = new KafkaTestUtils
    kafkaTestUtils.setup()
  }

  /**
   * Stops the streaming context and tears down the Kafka test utilities.
   * Each step runs in its own `finally` so that a failure while stopping the
   * context cannot leave the Kafka test utilities (or the parent suite's
   * cleanup) un-run.
   */
  override def afterAll(): Unit = {
    try {
      if (ssc != null) {
        ssc.stop()
        ssc = null
      }
    } finally {
      try {
        if (kafkaTestUtils != null) {
          kafkaTestUtils.teardown()
          kafkaTestUtils = null
        }
      } finally {
        super.afterAll()
      }
    }
  }

  test("Kafka input stream") {
    val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName)
    ssc = new StreamingContext(sparkConf, Milliseconds(500))

    val topic = "topic1"
    val sent = Map("a" -> 5, "b" -> 3, "c" -> 10)
    kafkaTestUtils.createTopic(topic)
    kafkaTestUtils.sendMessages(topic, sent)

    val kafkaParams = Map(
      "zookeeper.connect" -> kafkaTestUtils.zkAddress,
      "group.id" -> s"test-consumer-${Random.nextInt(10000)}",
      "auto.offset.reset" -> "smallest")

    val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY)

    // Running totals per message value. Every access goes through the map's
    // monitor because it is written by the output action below and read by
    // the polling assertion.
    val result = new mutable.HashMap[String, Long]()
    stream.map(_._2).countByValue().foreachRDD { r =>
      r.collect().foreach { case (value, batchCount) =>
        result.synchronized {
          result.put(value, result.getOrElse(value, 0L) + batchCount)
        }
      }
    }

    ssc.start()

    eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
      assert(result.synchronized { sent === result })
    }
  }
}
Example 22
Source File: DirectKafkaWordCount.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import kafka.serializer.StringDecoder

import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkConf

/**
 * Consumes messages from one or more Kafka topics through the direct stream
 * API and prints a per-batch word count.
 *
 * Usage: DirectKafkaWordCount <brokers> <topics>
 */
object DirectKafkaWordCount {

  /**
   * @param args `args(0)` is the broker list, `args(1)` a comma-separated topic
   *             list; any further arguments are ignored
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println(s"""
        |Usage: DirectKafkaWordCount <brokers> <topics>
        |  <brokers> is a list of one or more Kafka brokers
        |  <topics> is a list of one or more kafka topics to consume from
        |
        """.stripMargin)
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // The guard above accepts two *or more* arguments, so only the first two
    // are matched here; matching `args` directly would throw a MatchError as
    // soon as a third argument is supplied.
    val Array(brokers, topics) = args.take(2)

    // Create context with 2 second batch interval
    val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    // Get the lines, split them into words, count the words and print
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 23
Source File: L5-14KafkaCustomConf.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.apache.spark.streaming.kafka.KafkaUtils
import kafka.serializer.StringDecoder
import org.apache.spark.storage.StorageLevel

/**
 * Reads comma-separated records from Kafka through the receiver-based stream,
 * counts occurrences of each (field 3, field 7) pair per 10-second batch, and
 * writes the counts, sorted in descending order, as text files.
 */
object StationJourneyCountCustomApp {

  /**
   * @param args exactly seven values: appname, brokerUrl, topic,
   *             consumerGroupId, zkQuorum, checkpointDir, outputPath
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }

    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      //.set("spark.streaming.receiver.writeAheadLog.enable", "true")

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val topics = Map[String, Int](
      topic -> 1)
    val params = Map[String, String](
      "zookeeper.connect" -> zkQuorum,
      "group.id" -> consumerGroupId,
      "bootstrap.servers" -> brokerUrl)

    KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2)
      .map(rec => rec.split(","))
      // Records with fewer than 8 fields cannot supply rec(7); drop them
      // instead of letting one malformed message fail the whole batch.
      .filter(rec => rec.length > 7)
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .repartition(1)
      // Swap to (count, key) so the sort below orders by count.
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 24
Source File: CheckpointingKafkaExtractor.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package com.memsql.spark.examples.kafka

import com.memsql.spark.etl.api.{UserExtractConfig, PhaseConfig, ByteArrayExtractor}
import com.memsql.spark.etl.utils.PhaseLogger
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.StreamingContext
import kafka.serializer.{DefaultDecoder, StringDecoder}
import org.apache.spark.streaming.kafka.{CheckpointedDirectKafkaInputDStream, CheckpointedKafkaUtils}
import org.apache.spark.streaming.dstream.InputDStream

/**
 * Byte-array extractor that reads one Kafka topic through a checkpointed direct
 * stream and reports / restores the stream's offsets as batch checkpoint data.
 *
 * `dstream` stays `null` until [[extract]] has been called; the checkpoint
 * hooks treat that state as "nothing to do".
 */
class CheckpointingKafkaExtractor extends ByteArrayExtractor {
  var CHECKPOINT_DATA_VERSION = 1

  var dstream: CheckpointedDirectKafkaInputDStream[String, Array[Byte], StringDecoder, DefaultDecoder, Array[Byte]] = null

  var zkQuorum: String = null
  var topic: String = null

  /**
   * Reads the required "zk_quorum" and "topic" settings from `config`.
   *
   * @throws IllegalArgumentException if either setting is missing
   */
  override def initialize(ssc: StreamingContext, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long,
                          logger: PhaseLogger): Unit = {
    val kafkaConfig = config.asInstanceOf[UserExtractConfig]
    zkQuorum = kafkaConfig.getConfigString("zk_quorum").getOrElse {
      throw new IllegalArgumentException("\"zk_quorum\" must be set in the config")
    }
    topic = kafkaConfig.getConfigString("topic").getOrElse {
      throw new IllegalArgumentException("\"topic\" must be set in the config")
    }
  }

  /** Creates the checkpointed direct stream, remembers it in `dstream`, and returns it. */
  def extract(ssc: StreamingContext, extractConfig: PhaseConfig, batchDuration: Long,
              logger: PhaseLogger): InputDStream[Array[Byte]] = {
    val kafkaParams = Map[String, String](
      "memsql.zookeeper.connect" -> zkQuorum
    )
    val topics = Set(topic)

    dstream = CheckpointedKafkaUtils.createDirectStreamFromZookeeper[String, Array[Byte], StringDecoder, DefaultDecoder](
      ssc, kafkaParams, topics, batchDuration, lastCheckpoint)
    dstream
  }

  /**
   * @return `None` while no stream exists; otherwise a map holding the current
   *         offsets (one topic/partition/offset map per entry), the ZooKeeper
   *         quorum, and the checkpoint data version
   */
  override def batchCheckpoint: Option[Map[String, Any]] = {
    Option(dstream).map { stream =>
      val currentOffsets = stream.getCurrentOffsets.map { case (tp, offset) =>
        Map("topic" -> tp.topic, "partition" -> tp.partition, "offset" -> offset)
      }
      Map[String, Any](
        "offsets" -> currentOffsets,
        "zookeeper" -> zkQuorum,
        "version" -> CHECKPOINT_DATA_VERSION)
    }
  }

  /**
   * Rewinds the stream to its previous offsets, if it has any. Does nothing
   * when called before [[extract]] (`dstream` still null), instead of failing
   * with a NullPointerException.
   */
  override def batchRetry: Unit = {
    Option(dstream).foreach { stream =>
      if (stream.prevOffsets != null) {
        stream.setCurrentOffsets(stream.prevOffsets)
      }
    }
  }
}
Example 25
Source File: gihyo_6_3_KafkaStream.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06
// scalastyle:off println
import kafka.serializer.StringDecoder
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream

// NOTE(review): this listing looks damaged by extraction. The body of
// createStreamingContext refers to `values` and `running`, and `run` refers to
// `updateStateByKeyFunction`, none of which are defined in the visible text,
// and the braces do not balance. Presumably a block of the original file was
// lost - confirm against the original project before using this code.
object gihyo_6_3_KafkaStream {

  // Expects four arguments: broker list, topic to consume, checkpoint
  // directory, and output directory.
  def main(args: Array[String]) {
    if (args.length != 4) {
      // NOTE(review): the exception is constructed but never thrown; the
      // process simply exits with status 1.
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val brokerList = args(0)
    val consumeTopic = args(1)
    val checkpointDir = args(2)
    val saveDir = args(3)
    val f = createStreamingContext(brokerList, consumeTopic, checkpointDir, saveDir)
    // Obtain the StreamingContext
    val ssc = StreamingContext.getOrCreate(checkpointDir, f)
    // On JVM shutdown, stop the streaming context with both flags set to true.
    sys.ShutdownHookThread {
      System.out.println("Gracefully stopping SparkStreaming Application")
      ssc.stop(true, true)
      System.out.println("SparkStreaming Application stopped")
    }
    ssc.start
    ssc.awaitTermination
  }

  // Declared to return a factory `() => StreamingContext`.
  // NOTE(review): the body below does not build a StreamingContext; see the
  // note at the top of the file.
  def createStreamingContext(brokerList: String, consumeTopic: String, checkpointDir: String, saveDir: String): () => StreamingContext = { () => {
      System.out.println(values)
      Some(running.getOrElse(0) + values.length)
    }

  // Builds the output pipeline over `stream` and saves it under `saveDir`.
  // `windowLength` and `slideInterval` are passed to Seconds(...) for the
  // windowed part of the pipeline.
  def run(stream: InputDStream[(String, String)], saveDir: String, windowLength: Int = 30, slideInterval: Int = 5) {
    // Append ", <Long.MaxValue - current millis>" to every value, then split the
    // value on "," and each of the first three parts on ":"; the result is
    // (action part, user part, page part, appended timestamp field).
    val baseStream = stream.transform(rdd => {
      val t = (Long.MaxValue - System.currentTimeMillis)
      rdd.map(x => (x._1, x._2 + ", " + t))
    }).map(x => {
      val splitVal = x._2.split(",")
      val userVal = splitVal(0).split(":")
      val actionVal = splitVal(1).split(":")
      val pageVal = splitVal(2).split(":")
      val timestamp = splitVal(3)
      (actionVal(1), userVal(1), pageVal(1), timestamp)
    })
    baseStream.persist()

    // Per-batch count of each user value among records whose first field is "view".
    val accountStream = baseStream.filter(_._1 == "view")
      .map(x => x._2)
      .countByValue()

    // Number of keys tracked by updateStateByKey, rendered as "totalUniqueUser:<n>".
    val totalUniqueUser = accountStream
      .updateStateByKey[Int](updateStateByKeyFunction _)
      .count()
      .map(x => "totalUniqueUser:" + x)

    // "view" records inside the sliding window.
    val baseStreamPerTirty = baseStream
      .window(Seconds(windowLength), Seconds(slideInterval))
      .filter(_._1 == "view")
    baseStreamPerTirty.persist()

    // Number of windowed "view" records, rendered as "PageView:<n>".
    val pageViewPerTirty = baseStreamPerTirty
      .count()
      .map(x => "PageView:" + x)

    // Number of distinct user values in the window, rendered as "UniqueUser:<n>".
    val uniqueUserPerTirty = baseStreamPerTirty
      .map(x => x._2)
      .countByValue()
      .count()
      .map(x => "UniqueUser:" + x)

    // Number of "view" records in the current batch, rendered as "PageView:<n>".
    val pageViewStream = baseStream
      .filter(_._1 == "view")
      .map(x => x._3)
      .count()
      .map(x => "PageView:" + x)

    // Union the four summary streams, join their strings with ", ", and save as text files.
    val outputStream = totalUniqueUser
      .union(pageViewPerTirty)
      .union(uniqueUserPerTirty)
      .union(pageViewStream)
      .reduce((x, y) => x + ", " + y)
      .saveAsTextFiles(saveDir)
  }
}
// scalastyle:on println
Example 26
Source File: SparkStreamingTaxiTripToHBase.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.streaming.ingestion.hbase

import java.io.File

import com.hadooparchitecturebook.taxi360.model.NyTaxiYellowTripBuilder
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.solr.common.cloud.ZooKeeperException
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Streams comma-separated taxi trip records from Kafka and bulk-puts them into
 * an HBase table.
 */
object SparkStreamingTaxiTripToHBase {

  /**
   * @param args eight positional values: KafkaBrokerList, kafkaTopicList,
   *             numberOfSeconds, runLocal ("l" selects a local context),
   *             hbaseTable, numOfSalts, checkpointDir, hbaseConfigFolder
   */
  def main(args: Array[String]): Unit = {
    println("Java Version:" + System.getProperty("java.version"))
    println("Java Home:" + System.getProperties().getProperty("java.home"))

    // NOTE(review): unused local; presumably kept only to reference the Solr
    // class - confirm before removing.
    val v:ZooKeeperException = null

    // All eight positional arguments are read below, so anything shorter must
    // print the usage instead of failing with an index error.
    if (args.length < 8) {
      println("Args: <KafkaBrokerList> " +
        "<kafkaTopicList> " +
        "<numberOfSeconds> " +
        "<runLocal> " +
        "<hbaseTable> " +
        "<numOfSalts> " +
        "<checkpointDir> " +
        "<hbaseConfigFolder>")
      return
    }

    val kafkaBrokerList = args(0)
    val kafkaTopicList = args(1)
    val numberOfSeconds = args(2).toInt
    val runLocal = args(3).equals("l")
    val tableName = args(4)
    val numOfSalts = args(5).toInt
    val checkpointFolder = args(6)
    val hbaseConfigFolder = args(7)

    println("kafkaBrokerList:" + kafkaBrokerList)
    println("kafkaTopicList:" + kafkaTopicList)
    println("numberOfSeconds:" + numberOfSeconds)
    println("runLocal:" + runLocal)
    println("tableName:" + tableName)
    println("numOfSalts:" + numOfSalts)

    val sc:SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConf = new SparkConf().setAppName("Spark Streaming Ingestion to HBase")
      new SparkContext(sparkConf)
    }
    val ssc = new StreamingContext(sc, Seconds(numberOfSeconds))

    val topicsSet = kafkaTopicList.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList)

    val messageStream = KafkaUtils.
      createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    val conf = HBaseConfiguration.create()
    // File(parent, child) inserts the separator when the folder argument has no
    // trailing slash; plain string concatenation produced a wrong path then.
    conf.addResource(new File(hbaseConfigFolder, "hbase-site.xml").toURI.toURL)

    val hbaseContext = new HBaseContext(sc, conf)

    // Split each value on commas, keep records with more than three fields,
    // and build a trip object from the fields.
    val tripDStream = messageStream.map(r => {
      (r._1, r._2.split(","))
    }).filter(r => r._2.size > 3).map(r => {
      (r._1, NyTaxiYellowTripBuilder.build(r._2))
    })

    tripDStream.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName), taxi => {
      TaxiTripHBaseHelper.generatePut(taxi._2, numOfSalts)
    })

    ssc.checkpoint(checkpointFolder)
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 27
Source File: StreamingTSExample.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.examples.streaming

import java.util.UUID

import kafka.serializer.StringDecoder
import org.apache.spark.sql.Row
import org.apache.spark.streaming.Durations
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.{SparkConf, SparkContext}
import com.basho.riak.spark.streaming._
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.joda.time.DateTime
import org.joda.time.format.DateTimeFormat

/**
 * Reads JSON weather readings from the "ingest-ts" Kafka topic, converts each
 * one to a `Row`, and saves the rows to the "ts_weather_demo" Riak TS table.
 */
object StreamingTSExample {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf(true)
      .setAppName("Simple Spark Streaming to Riak TS Demo")

    setSparkOpt(sparkConf, "spark.master", "local")
    setSparkOpt(sparkConf, "spark.riak.connection.host", "127.0.0.1:8087")
    setSparkOpt(sparkConf, "kafka.broker", "127.0.0.1:9092")

    val sc = new SparkContext(sparkConf)
    val streamCtx = new StreamingContext(sc, Durations.seconds(15))

    val kafkaProps = Map[String, String](
      "metadata.broker.list" -> sparkConf.get("kafka.broker"),
      "client.id" -> UUID.randomUUID().toString
    )

    val rows = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      streamCtx, kafkaProps, Set[String]("ingest-ts")
    ).mapPartitions { records =>
      // Build the JSON mapper and the timestamp formatter once per partition
      // rather than once per record; neither depends on the record.
      val mapper = new ObjectMapper()
      mapper.registerModule(DefaultScalaModule)
      val timeFormat = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")

      records.map { case (_, value) =>
        val wr = mapper.readValue(value, classOf[Map[String,String]])
        Row(
          wr("weather"),
          wr("family"),
          DateTime.parse(wr("time"), timeFormat).getMillis,
          wr("temperature"),
          wr("humidity"),
          wr("pressure"))
      }
    }
    rows.saveToRiakTS("ts_weather_demo")

    streamCtx.start()
    println("Spark streaming context started. Spark UI could be found at http://SPARK_MASTER_HOST:4040")
    println("NOTE: if you're running job on the 'local' master open http://localhost:4040")
    streamCtx.awaitTermination()
  }

  /**
   * Sets `option` on `sparkConf` to its current value, or to `defaultOptVal`
   * when it is not set yet.
   *
   * @return the same `SparkConf`, as returned by `set`
   */
  private def setSparkOpt(sparkConf: SparkConf, option: String, defaultOptVal: String): SparkConf = {
    val optval = sparkConf.getOption(option).getOrElse(defaultOptVal)
    sparkConf.set(option, optval)
  }
}
Example 28
Source File: StreamingKVExample.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.examples.streaming

import java.util.UUID

import kafka.serializer.StringDecoder
import com.basho.riak.spark._
import com.basho.riak.spark.streaming._
import com.basho.riak.spark.util.RiakObjectConversionUtil
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Copies every message value from the "ingest-kv" Kafka topic into the
 * "test-data" Riak bucket as a JSON-typed Riak object.
 */
object StreamingKVExample {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf(true)
      .setAppName("Simple Spark Streaming to Riak KV Demo")

    // Fill in defaults for anything not already configured.
    setSparkOpt(sparkConf, "spark.master", "local")
    setSparkOpt(sparkConf, "spark.riak.connection.host", "127.0.0.1:8087")
    setSparkOpt(sparkConf, "kafka.broker", "127.0.0.1:9092")

    val sc = new SparkContext(sparkConf)
    val streamCtx = new StreamingContext(sc, Durations.seconds(15))

    val kafkaProps = Map[String, String](
      "metadata.broker.list" -> sparkConf.get("kafka.broker"),
      "client.id" -> UUID.randomUUID().toString
    )

    val kafkaMessages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      streamCtx, kafkaProps, Set[String]("ingest-kv"))

    // The message key is not used; only the value becomes a Riak object.
    val riakObjects = kafkaMessages.map { case (_, payload) =>
      val riakObject = RiakObjectConversionUtil.to(payload)
      riakObject.setContentType("application/json")
      riakObject
    }
    riakObjects.saveToRiak("test-data")

    streamCtx.start()
    println("Spark streaming context started. Spark UI could be found at http://SPARK_MASTER_HOST:4040")
    println("NOTE: if you're running job on the 'local' master open http://localhost:4040")
    streamCtx.awaitTermination()
  }

  /** Re-sets `option` to its existing value, falling back to `defaultOptVal` when absent. */
  private def setSparkOpt(sparkConf: SparkConf, option: String, defaultOptVal: String): SparkConf =
    sparkConf.set(option, sparkConf.getOption(option).getOrElse(defaultOptVal))
}
Example 29
Source File: GraphToETLStreaming.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.counter.loader.stream

import kafka.producer.KeyedMessage
import kafka.serializer.StringDecoder
import org.apache.s2graph.core.GraphUtil
import org.apache.s2graph.counter.config.S2CounterConfig
import org.apache.s2graph.counter.loader.config.StreamingConfig
import org.apache.s2graph.spark.config.S2ConfigFactory
import org.apache.s2graph.spark.spark.{WithKafka, SparkApp, HashMapParam}
import org.apache.spark.streaming.Durations._
import org.apache.spark.streaming.kafka.KafkaRDDFunctions.rddToKafkaRDDFunctions

import scala.collection.mutable
import scala.collection.mutable.{HashMap => MutableHashMap}
import scala.util.control.NonFatal

/**
 * Re-publishes graph log lines read from the input topics to the ETL topic,
 * grouped by a partition key derived from field 4 of each line, and commits
 * the consumed offset range after each partition has been forwarded.
 */
object GraphToETLStreaming extends SparkApp with WithKafka {
  lazy val config = S2ConfigFactory.config
  lazy val s2Config = new S2CounterConfig(config)
  lazy val className = getClass.getName.stripSuffix("$")
  lazy val producer = getProducer[String, String](StreamingConfig.KAFKA_BROKERS)

  /** Expects two arguments: the batch interval in seconds and a comma-separated topic list. */
  override def run(): Unit = {
    validateArgument("interval", "topic")
    val (intervalInSec, topic) = (seconds(args(0).toLong), args(1))

    val groupId = buildKafkaGroupId(topic, "graph_to_etl")
    val kafkaParam = Map(
      // "auto.offset.reset" -> "smallest",
      "group.id" -> groupId,
      "metadata.broker.list" -> StreamingConfig.KAFKA_BROKERS,
      "zookeeper.connect" -> StreamingConfig.KAFKA_ZOOKEEPER,
      "zookeeper.connection.timeout.ms" -> "10000"
    )

    val conf = sparkConf(s"$topic: $className")
    val ssc = streamingContext(conf, intervalInSec)
    val sc = ssc.sparkContext

    val acc = sc.accumulable(MutableHashMap.empty[String, Long], "Throughput")(HashMapParam[String, Long](_ + _))

    val stream = getStreamHelper(kafkaParam).createStream[String, String, StringDecoder, StringDecoder](ssc, topic.split(',').toSet)
    stream.foreachRDD { rdd =>
      rdd.foreachPartitionWithOffsetRange { case (osr, part) =>
        // Lines of this partition, bucketed by partition key.
        val m = MutableHashMap.empty[Int, mutable.MutableList[String]]
        for {
          (k, v) <- part
          line <- GraphUtil.parseString(v)
        } {
          try {
            val sp = GraphUtil.split(line)
            // get partition key by target vertex id
            val partKey = getPartKey(sp(4), 20)
            m.getOrElseUpdate(partKey, mutable.MutableList.empty[String]) += line
          } catch {
            // A line that cannot be parsed is logged and skipped. Only
            // non-fatal errors are handled here, so JVM-level failures such as
            // OutOfMemoryError still propagate.
            case NonFatal(ex) =>
              log.error(s"$ex: $line")
          }
        }

        // Send each bucket in chunks of at most 1000 lines per message.
        m.foreach { case (k, v) =>
          v.grouped(1000).foreach { grouped =>
            producer.send(new KeyedMessage[String, String](StreamingConfig.KAFKA_TOPIC_ETL, null, k, grouped.mkString("\n")))
          }
        }

        getStreamHelper(kafkaParam).commitConsumerOffset(osr)
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 30
Source File: ExactCounterStreaming.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.counter.loader.stream

import kafka.serializer.StringDecoder
import org.apache.s2graph.counter.config.S2CounterConfig
import org.apache.s2graph.counter.loader.config.StreamingConfig
import org.apache.s2graph.counter.loader.core.CounterFunctions
import org.apache.s2graph.spark.config.S2ConfigFactory
import org.apache.s2graph.spark.spark.{WithKafka, SparkApp, HashMapParam}
import org.apache.spark.streaming.Durations._
import org.apache.spark.streaming.kafka.KafkaRDDFunctions.rddToKafkaRDDFunctions
import org.apache.spark.streaming.kafka.{HasOffsetRanges, StreamHelper}

import scala.collection.mutable.{HashMap => MutableHashMap}
import scala.language.postfixOps

/**
 * Streaming job that feeds the counter topic into the exact counter, emits the
 * resulting transaction logs, and commits each partition's offset range
 * afterwards.
 */
object ExactCounterStreaming extends SparkApp with WithKafka {
  lazy val config = S2ConfigFactory.config
  lazy val s2Config = new S2CounterConfig(config)
  lazy val className = getClass.getName.stripSuffix("$")

  lazy val producer = getProducer[String, String](StreamingConfig.KAFKA_BROKERS)

  val inputTopics = Set(StreamingConfig.KAFKA_TOPIC_COUNTER)
  val strInputTopics = inputTopics.mkString(",")
  val groupId = buildKafkaGroupId(strInputTopics, "counter_v2")
  val kafkaParam = Map(
    // "auto.offset.reset" -> "smallest",
    "group.id" -> groupId,
    "metadata.broker.list" -> StreamingConfig.KAFKA_BROKERS,
    "zookeeper.connect" -> StreamingConfig.KAFKA_ZOOKEEPER,
    "zookeeper.connection.timeout.ms" -> "10000"
  )
  val streamHelper = StreamHelper(kafkaParam)

  /**
   * Expects two arguments: the batch interval in seconds and a boolean that,
   * when true, triggers a consumer-group cleanup before the stream is created.
   */
  override def run() = {
    validateArgument("interval", "clear")
    val intervalInSec = seconds(args(0).toLong)
    val clear = args(1).toBoolean

    if (clear) streamHelper.kafkaHelper.consumerGroupCleanup()

    val ssc = streamingContext(sparkConf(s"$strInputTopics: $className"), intervalInSec)
    val sc = ssc.sparkContext

    implicit val acc: HashMapAccumulable =
      sc.accumulable(MutableHashMap.empty[String, Long], "Throughput")(HashMapParam[String, Long](_ + _))

    // Input stream over the counter topic.
    val counterStream = streamHelper.createStream[String, String, StringDecoder, StringDecoder](ssc, inputTopics)

    counterStream.foreachRDD { (rdd, ts) =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      val exactRDD = CounterFunctions.makeExactRdd(rdd, offsetRanges.length)

      // At-least-once: an offset range is committed only after its partition
      // has been counted and its transaction log produced.
      exactRDD.foreachPartitionWithIndex { (partitionIndex, records) =>
        val trxLogs = CounterFunctions.updateExactCounter(records.toSeq, acc)
        CounterFunctions.produceTrxLog(trxLogs)
        streamHelper.commitConsumerOffset(offsetRanges(partitionIndex))
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 31
Source File: WalLogStat.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.loader.subscriber

import kafka.producer.KeyedMessage
import kafka.serializer.StringDecoder
import org.apache.s2graph.spark.spark.{WithKafka, SparkApp}
import org.apache.spark.streaming.Durations._
import org.apache.spark.streaming.kafka.HasOffsetRanges
import scala.collection.mutable.{HashMap => MutableHashMap}
import scala.language.postfixOps

/**
 * Streaming job that classifies each incoming log message into a tab-separated
 * key (service name, label, operation, log type - or an error marker), counts
 * messages per key in every batch, publishes the counts to a statistics topic,
 * and then commits the consumed offset ranges.
 */
object WalLogStat extends SparkApp with WithKafka {

  /**
   * Expects six arguments, in order: kafkaZkQuorum, brokerList, topics
   * (comma-separated), intervalInSec, dbUrl, statTopic.
   */
  override def run() = {
    validateArgument("kafkaZkQuorum", "brokerList", "topics", "intervalInSec", "dbUrl", "statTopic")
    val kafkaZkQuorum = args(0)
    val brokerList = args(1)
    val topics = args(2)
    val intervalInSec = seconds(args(3).toLong)
    val dbUrl = args(4)
    val statTopic = args(5)
    val conf = sparkConf(s"$topics: ${getClass.getSimpleName}")
    val ssc = streamingContext(conf, intervalInSec)
    val sc = ssc.sparkContext
    // Consumer group id: the topic list with commas replaced by underscores, plus "_stat".
    val groupId = topics.replaceAll(",", "_") + "_stat"
    val kafkaParams = Map(
      "zookeeper.connect" -> kafkaZkQuorum,
      "group.id" -> groupId,
      "metadata.broker.list" -> brokerList,
      "zookeeper.connection.timeout.ms" -> "10000",
      "auto.offset.reset" -> "largest")
    val stream = getStreamHelper(kafkaParams).createStream[String, String, StringDecoder, StringDecoder](ssc, topics.split(",").toSet)
    val statProducer = getProducer[String, String](brokerList)
    stream.foreachRDD { (rdd, time) =>
      val offsets = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      val ts = time.milliseconds
      // Map every message to its statistics key.
      val elements = rdd.mapPartitions { partition =>
        // set executor setting.
        val phase = System.getProperty("phase")
        GraphSubscriberHelper.apply(phase, dbUrl, "none", brokerList)
        partition.map { case (key, msg) =>
          GraphSubscriberHelper.g.elementBuilder.toGraphElement(msg) match {
            case Some(elem) =>
              val serviceName = elem.serviceName
              // Split into at most 7 tab-separated fields; fields 1, 2 and 5
              // are bound to operation, log_type and label.
              msg.split("\t", 7) match {
                case Array(_, operation, log_type, _, _, label, _*) =>
                  Seq(serviceName, label, operation, log_type).mkString("\t")
                case _ =>
                  Seq("no_service_name", "no_label", "no_operation", "parsing_error").mkString("\t")
              }
            case None =>
              Seq("no_service_name", "no_label", "no_operation", "no_element_error").mkString("\t")
          }
        }
      }
      // Per-key counts for this batch, collected to the driver.
      val countByKey = elements.map(_ -> 1L).reduceByKey(_ + _).collect()
      val totalCount = countByKey.map(_._2).sum
      // One statistics message per key: batch time, key, key count, batch total.
      val keyedMessage = countByKey.map { case (key, value) =>
        new KeyedMessage[String, String](statTopic, s"$ts\t$key\t$value\t$totalCount")
      }
      statProducer.send(keyedMessage: _*)
      // A second pass over `elements`, run only for its side effect: commit the
      // offset range that belongs to each partition index. The stage yields no
      // records, so the trailing foreach merely forces it to execute.
      elements.mapPartitionsWithIndex { (i, part) =>
        // commit offset range
        val osr = offsets(i)
        getStreamHelper(kafkaParams).commitConsumerOffset(osr)
        Iterator.empty
      }.foreach { (_: Nothing) => () }
    }
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 32
Source File: KafkaStreamSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import scala.collection.mutable
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Random

import kafka.serializer.StringDecoder
import org.scalatest.BeforeAndAfterAll
import org.scalatest.concurrent.Eventually

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

/**
 * Sends a known multiset of messages to a test topic, reads them back through
 * `KafkaUtils.createStream`, and checks that the per-value counts match.
 */
class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll {
  private var ssc: StreamingContext = _
  private var kafkaTestUtils: KafkaTestUtils = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    kafkaTestUtils = new KafkaTestUtils
    kafkaTestUtils.setup()
  }

  /**
   * Stops the streaming context and tears down the Kafka test utilities.
   * Each step runs in its own `finally` so that a failure while stopping the
   * context cannot leave the Kafka test utilities (or the parent suite's
   * cleanup) un-run.
   */
  override def afterAll(): Unit = {
    try {
      if (ssc != null) {
        ssc.stop()
        ssc = null
      }
    } finally {
      try {
        if (kafkaTestUtils != null) {
          kafkaTestUtils.teardown()
          kafkaTestUtils = null
        }
      } finally {
        super.afterAll()
      }
    }
  }

  test("Kafka input stream") {
    val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName)
    ssc = new StreamingContext(sparkConf, Milliseconds(500))

    val topic = "topic1"
    val sent = Map("a" -> 5, "b" -> 3, "c" -> 10)
    kafkaTestUtils.createTopic(topic)
    kafkaTestUtils.sendMessages(topic, sent)

    val kafkaParams = Map(
      "zookeeper.connect" -> kafkaTestUtils.zkAddress,
      "group.id" -> s"test-consumer-${Random.nextInt(10000)}",
      "auto.offset.reset" -> "smallest")

    val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY)

    // Running totals per message value. Every access goes through the map's
    // monitor because it is written by the output action below and read by
    // the polling assertion.
    val result = new mutable.HashMap[String, Long]()
    stream.map(_._2).countByValue().foreachRDD { r =>
      r.collect().foreach { case (value, batchCount) =>
        result.synchronized {
          result.put(value, result.getOrElse(value, 0L) + batchCount)
        }
      }
    }

    ssc.start()

    eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
      assert(result.synchronized { sent === result })
    }
  }
}
Example 33
Source File: DirectKafkaWordCount.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import kafka.serializer.StringDecoder

import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkConf

/**
 * Consumes messages from one or more Kafka topics through the direct stream
 * API and prints a per-batch word count.
 *
 * Usage: DirectKafkaWordCount <brokers> <topics>
 */
object DirectKafkaWordCount {

  /**
   * @param args `args(0)` is the broker list, `args(1)` a comma-separated topic
   *             list; any further arguments are ignored
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println(s"""
        |Usage: DirectKafkaWordCount <brokers> <topics>
        |  <brokers> is a list of one or more Kafka brokers
        |  <topics> is a list of one or more kafka topics to consume from
        |
        """.stripMargin)
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // The guard above accepts two *or more* arguments, so only the first two
    // are matched here; matching `args` directly would throw a MatchError as
    // soon as a third argument is supplied.
    val Array(brokers, topics) = args.take(2)

    // Create context with 2 second batch interval
    val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    // Get the lines, split them into words, count the words and print
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println