org.apache.spark.streaming.kafka.KafkaUtils Scala Examples
The following examples show how to use org.apache.spark.streaming.kafka.KafkaUtils, the entry point of Spark Streaming's Kafka 0.8 integration, as used in several open-source projects.
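All of the examples below go through one of two entry points on KafkaUtils: the receiver-based createStream, which consumes through a ZooKeeper quorum, and createDirectStream, which reads partitions straight from the Kafka brokers using decoders such as StringDecoder. As a quick orientation before the project code, here is a minimal sketch of both calls; the broker address, ZooKeeper address, group id and topic name (localhost:9092, localhost:2181, example-group, events) are placeholder assumptions, not taken from any of the projects below.

package examples

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

object KafkaUtilsSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("KafkaUtilsSketch"), Seconds(5))

    // Receiver-based stream: consumes via ZooKeeper and yields (key, message) pairs.
    val receiverStream = KafkaUtils.createStream(
      ssc, "localhost:2181", "example-group", Map("events" -> 1))

    // Direct stream: reads partitions straight from the brokers, no receiver involved.
    val directStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, Map("metadata.broker.list" -> "localhost:9092"), Set("events"))

    receiverStream.map(_._2).print()
    directStream.map(_._2).print()

    ssc.start()
    ssc.awaitTermination()
  }
}

The direct variant tracks offsets itself and needs only the broker list, while the receiver-based variant relies on ZooKeeper-managed consumer groups; the examples below use both styles.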
Example 1
Source File: L5-15KafkaDirect.scala From prosparkstreaming with Apache License 2.0

package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

object StationJourneyCountDirectApp {

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }
    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val topics = Set(topic)
    val params = Map[String, String](
      "zookeeper.connect" -> zkQuorum,
      "group.id" -> consumerGroupId,
      "bootstrap.servers" -> brokerUrl)

    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, topics).map(_._2)
      .map(rec => rec.split(","))
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }

}
Example 2
Source File: L5-13Kafka.scala From prosparkstreaming with Apache License 2.0

package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.apache.spark.streaming.kafka.KafkaUtils

object StationJourneyCountApp {

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }
    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      //.set("spark.streaming.receiver.writeAheadLog.enable", "true")

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val topics = Map[String, Int](
      topic -> 1)

    KafkaUtils.createStream(ssc, zkQuorum, consumerGroupId, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2)
      .map(rec => rec.split(","))
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }

}
Example 3
Source File: SparkStreamingTaxiTripToHBase.scala From Taxi360 with Apache License 2.0

package com.cloudera.sa.taxi360.streaming.ingestion.hbase

import java.io.File

import com.cloudera.sa.taxi360.model.NyTaxiYellowTripBuilder
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.solr.common.cloud.ZooKeeperException
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingTaxiTripToHBase {
  def main(args: Array[String]): Unit = {
    println("Java Version:" + System.getProperty("java.version"))
    println("Java Home:" + System.getProperties().getProperty("java.home"))

    // Unused; most likely kept only to confirm the Solr ZooKeeper dependency is on the classpath.
    val v: ZooKeeperException = null

    if (args.length == 0) {
      println("Args: <KafkaBrokerList> " +
        "<kafkaTopicList> " +
        "<numberOfSeconds>" +
        "<runLocal>" +
        "<hbaseTable>" +
        "<numOfSalts>" +
        "<checkpointDir>" +
        "<hbaseConfigFolder>")
      return
    }

    val kafkaBrokerList = args(0)
    val kafkaTopicList = args(1)
    val numberOfSeconds = args(2).toInt
    val runLocal = args(3).equals("l")
    val tableName = args(4)
    val numOfSalts = args(5).toInt
    val checkpointFolder = args(6)
    val hbaseConfigFolder = args(7)

    println("kafkaBrokerList:" + kafkaBrokerList)
    println("kafkaTopicList:" + kafkaTopicList)
    println("numberOfSeconds:" + numberOfSeconds)
    println("runLocal:" + runLocal)
    println("tableName:" + tableName)
    println("numOfSalts:" + numOfSalts)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConf = new SparkConf().setAppName("Spark Streaming Ingestion to HBase")
      new SparkContext(sparkConf)
    }
    val ssc = new StreamingContext(sc, Seconds(numberOfSeconds))

    val topicsSet = kafkaTopicList.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList)

    val messageStream = KafkaUtils.
      createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    val conf = HBaseConfiguration.create()
    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val hbaseContext = new HBaseContext(sc, conf)

    val tripDStream = messageStream.map(r => {
      (r._1, r._2.split(","))
    }).filter(r => r._2.size > 3).map(r => {
      (r._1, NyTaxiYellowTripBuilder.build(r._2))
    })

    tripDStream.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName), taxi => {
      TaxiTripHBaseHelper.generatePut(taxi._2, numOfSalts)
    })

    ssc.checkpoint(checkpointFolder)
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 4
Source File: StreamingJob.scala From confluent-platform-spark-streaming with Apache License 2.0

package example

import com.typesafe.config.ConfigFactory
import io.confluent.kafka.serializers.KafkaAvroDecoder
import kafka.serializer.StringDecoder
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkContext, SparkConf}

object StreamingJob extends App {

  // Get job configuration
  val config = ConfigFactory.load()

  Logger.getLogger("example").setLevel(Level.toLevel(config.getString("loglevel")))
  private val logger = Logger.getLogger(getClass)

  // Spark config and contexts
  val sparkMaster = config.getString("spark.master")
  val sparkConf = new SparkConf()
    .setMaster(sparkMaster)
    .setAppName("StreamingExample")
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val sc = new SparkContext(sparkConf)
  val batchInterval = config.getInt("spark.batch.interval")
  val ssc = new StreamingContext(sc, Seconds(batchInterval))

  // Create Kafka stream
  val groupId = config.getString("kafka.group.id")
  val topic = config.getString("topic")
  val kafkaParams = Map(
    "bootstrap.servers" -> config.getString("kafka.bootstrap.servers"),
    "schema.registry.url" -> config.getString("kafka.schema.registry.url"),
    "group.id" -> groupId
  )

  @transient val kafkaStream: DStream[(String, Object)] =
    KafkaUtils.createDirectStream[String, Object, StringDecoder, KafkaAvroDecoder](
      ssc, kafkaParams, Set(topic)
    )

  // Load JSON strings into DataFrame
  kafkaStream.foreachRDD { rdd =>
    // Get the singleton instance of SQLContext
    val sqlContext = SQLContext.getOrCreate(rdd.sparkContext)
    import sqlContext.implicits._

    val topicValueStrings = rdd.map(_._2.toString)
    val df = sqlContext.read.json(topicValueStrings)
    df.printSchema()
    println("DataFrame count: " + df.count())
    df.take(1).foreach(println)
  }

  ssc.start()
  ssc.awaitTermination()
}
Example 5
Source File: KafkaStreamingWC.scala From kafka-scala-api with Apache License 2.0

package com.example.kafka08

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

import com.example._

object KafkaStreamingWC {

  def main(args: Array[String]): Unit = {
    kafkaStreamingWC
  }

  def kafkaStreamingWC = launch(kafka08StreamingWC, "Kafka08Streaming", "checkpointing")

  def kafka08StreamingWC(ssc: StreamingContext) = {
    val brokers = "127.0.0.1:9092"
    val topics = "sample_topic"
    val topicsSet = topics.split(",").toSet

    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    val lines = messages.map { case (_, value) => value }
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()
  }
}
Example 6
Source File: SparkJob.scala From intro-to-dcos with Apache License 2.0

package de.codecentric.dcos_intro.spark

import de.codecentric.dcos_intro.{Tweet, TweetDecoder}
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import com.datastax.spark.connector.streaming._

object SparkJob {

  def main(args: Array[String]) {
    val consumerTopic = args(0)
    val sparkConf = new SparkConf()
      .setAppName(getClass.getName)
      .set("spark.cassandra.connection.host", s"${args(1)}")
      .set("spark.cassandra.connection.port", s"${args(2)}")
    val consumerProperties = Map("bootstrap.servers" -> args(3), "auto.offset.reset" -> "smallest")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    val kafkaStream = KafkaUtils.createDirectStream[String, Tweet, StringDecoder, TweetDecoder](
      ssc,
      consumerProperties,
      Set(consumerTopic)
    )

    kafkaStream.map(tuple => tuple._2).saveToCassandra("dcos", "tweets")

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}
Example 7
Source File: KappaTagging.scala From Mastering-Spark-for-Data-Science with MIT License

package io.gzet.tagging

import com.typesafe.config.ConfigFactory
import io.gzet.tagging.gdelt.GdeltTagger
import io.gzet.tagging.twitter.TwitterHIS
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import twitter4j.Status
import twitter4j.auth.OAuthAuthorization
import twitter4j.conf.ConfigurationBuilder

object KappaTagging {

  final val config = ConfigFactory.load().getConfig("io.gzet.kappa")
  final val esNodes = config.getString("esNodes")
  final val batchSize = config.getInt("batchSize")

  def main(args: Array[String]) = {
    val sparkConf = new SparkConf().setAppName("GDELT Kappa tagging")
    val ssc = new StreamingContext(sparkConf, Seconds(batchSize))
    val sc = ssc.sparkContext

    // Create a counter that can be shared across batches
    val batchId = sc.longAccumulator("GZET")

    val twitterStream = createTwitterStream(ssc, Array[String]())
    val twitterProcessor = new TwitterHIS()
    twitterProcessor.train(twitterStream, batchId)

    val gdeltStream = createGdeltStream(ssc)
    val gdeltProcessor = new GdeltTagger()
    gdeltProcessor.predict(gdeltStream, batchId)

    ssc.start()
    ssc.awaitTermination()
  }

  private def createTwitterStream(ssc: StreamingContext, filters: Array[String]): DStream[Status] = {
    TwitterUtils.createStream(
      ssc,
      getTwitterConfiguration,
      filters
    )
  }

  private def getTwitterConfiguration = {
    val builder = new ConfigurationBuilder()
    builder.setOAuthConsumerKey(config.getString("apiKey"))
    builder.setOAuthConsumerSecret(config.getString("apiSecret"))
    builder.setOAuthAccessToken(config.getString("tokenKey"))
    builder.setOAuthAccessTokenSecret(config.getString("tokenSecret"))
    val configuration = builder.build()
    Some(new OAuthAuthorization(configuration))
  }

  private def createGdeltStream(ssc: StreamingContext) = {
    val topics = Map(
      config.getString("kafkaTopic") -> config.getInt("kafkaTopicPartition")
    )
    KafkaUtils.createStream(
      ssc,
      config.getString("zkQuorum"),
      config.getString("kafkaGroupId"),
      topics
    ).map(_._2)
  }
}
Example 8
Source File: KafkaWordCount.scala From spark1.52 with Apache License 2.0

// scalastyle:off println
package org.apache.spark.examples.streaming

import java.util.HashMap

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}

import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka.KafkaUtils

// The object and main declarations are truncated in this listing; they are restored here so the snippet compiles.
object KafkaWordCount {
  def main(args: Array[String]) {
    StreamingExamples.setStreamingLogLevels()

    val Array(zkQuorum, group, topics, numThreads) =
      Array("localhost:2181", "", "topic1,topic2,topic3,topic4", "1") //args
    val sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    ssc.checkpoint("checkpoint")

    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L))
      .reduceByKeyAndWindow(_ + _, _ - _, Minutes(10), Seconds(2), 2)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}

// Produces some random words between 1 and 100.
object KafkaWordCountProducer {

  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println("Usage: KafkaWordCountProducer <metadataBrokerList> <topic> " +
        "<messagesPerSec> <wordsPerMessage>")
      System.exit(1)
    }

    val Array(brokers, topic, messagesPerSec, wordsPerMessage) = args

    // Zookeeper connection properties
    val props = new HashMap[String, Object]()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)

    // Send some messages
    while (true) {
      (1 to messagesPerSec.toInt).foreach { messageNum =>
        val str = (1 to wordsPerMessage.toInt).map(x => scala.util.Random.nextInt(10).toString)
          .mkString(" ")
        val message = new ProducerRecord[String, String](topic, null, str)
        producer.send(message)
      }
      Thread.sleep(1000)
    }
  }
}
// scalastyle:on println
Example 9
Source File: WeatherDataStream.scala From spark-scala with Creative Commons Zero v1.0 Universal

package com.supergloo

import com.killrweather.data.Weather.RawWeatherData
import kafka.serializer.StringDecoder
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
// Needed for saveToCassandra on DStreams (not shown in the truncated import list)
import com.datastax.spark.connector.streaming._

// The listing is truncated in the source: the enclosing object definition, its Cassandra
// constants (CassandraKeyspace, CassandraTableDailyPrecip) and the code that creates the
// Kafka stream are not shown.

    parsedWeatherStream.map { weather =>
      (weather.wsid, weather.year, weather.month, weather.day, weather.oneHourPrecip)
    }.saveToCassandra(CassandraKeyspace, CassandraTableDailyPrecip)
  }

  def ingestStream(rawWeatherStream: InputDStream[(String, String)]): DStream[RawWeatherData] = {
    val parsedWeatherStream = rawWeatherStream.map(_._2.split(","))
      .map(RawWeatherData(_))
    parsedWeatherStream
  }
}
Example 10
Source File: L5-14KafkaCustomConf.scala From prosparkstreaming with Apache License 2.0

package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.apache.spark.streaming.kafka.KafkaUtils
import kafka.serializer.StringDecoder
import org.apache.spark.storage.StorageLevel

object StationJourneyCountCustomApp {

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }
    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      //.set("spark.streaming.receiver.writeAheadLog.enable", "true")

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val topics = Map[String, Int](
      topic -> 1)
    val params = Map[String, String](
      "zookeeper.connect" -> zkQuorum,
      "group.id" -> consumerGroupId,
      "bootstrap.servers" -> brokerUrl)

    KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2)
      .map(rec => rec.split(","))
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }

}
Example 11
Source File: StreamingKafka8.scala From BigData-News with Apache License 2.0

package com.vita.spark

import kafka.serializer.StringDecoder
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Declared as an object (rather than a class) so that main can serve as the application entry point.
object StreamingKafka8 {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[2]")
      .appName("streaming")
      .getOrCreate()

    val sc = spark.sparkContext
    val ssc = new StreamingContext(sc, Seconds(5))

    // Create direct kafka stream with brokers and topics
    val topicsSet = Set("weblogs")
    val kafkaParams = Map[String, String]("metadata.broker.list" -> "node5:9092")
    val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    val lines = kafkaStream.map(x => x._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 12
Source File: gihyo_6_3_Union.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils

object gihyo_6_3_Union {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHosts = args(0)
    val consumerGroup = args(1)
    val targetTopics = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))

    val KafkaStreams = (1 to 5).map { i =>
      KafkaUtils.createStream(ssc, targetHosts, consumerGroup, Map(targetTopics -> 1))
    }
    run(ssc, KafkaStreams)

    ssc.start
    ssc.awaitTermination
  }

  def run(ssc: StreamingContext, streams: IndexedSeq[InputDStream[(String, String)]]) {
    val unionedStream = ssc.union(streams)
    unionedStream.print
  }
}
Example 13
Source File: gihyo_6_3_KafkaStream.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

// scalastyle:off println
import kafka.serializer.StringDecoder
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_KafkaStream {
  def main(args: Array[String]) {
    if (args.length != 4) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val brokerList = args(0)
    val consumeTopic = args(1)
    val checkpointDir = args(2)
    val saveDir = args(3)

    val f = createStreamingContext(brokerList, consumeTopic, checkpointDir, saveDir)
    // Obtain the StreamingContext (restored from the checkpoint or newly created)
    val ssc = StreamingContext.getOrCreate(checkpointDir, f)

    sys.ShutdownHookThread {
      System.out.println("Gracefully stopping SparkStreaming Application")
      ssc.stop(true, true)
      System.out.println("SparkStreaming Application stopped")
    }
    ssc.start
    ssc.awaitTermination
  }

  def createStreamingContext(brokerList: String,
      consumeTopic: String,
      checkpointDir: String,
      saveDir: String): () => StreamingContext = { () => {
    // The body of this factory is truncated in the source listing; a minimal version,
    // consistent with how it is used above, would look like the following.
    val conf = new SparkConf().setAppName("gihyo_6_3_KafkaStream")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint(checkpointDir)
    val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, Map("metadata.broker.list" -> brokerList), Set(consumeTopic))
    run(kafkaStream, saveDir)
    ssc
  }}

  // Also truncated in the source listing: the state update function referenced by
  // updateStateByKey below; only its last two statements survive in the original text.
  def updateStateByKeyFunction(values: Seq[Long], running: Option[Int]): Option[Int] = {
    System.out.println(values)
    Some(running.getOrElse(0) + values.length)
  }

  def run(stream: InputDStream[(String, String)], saveDir: String, windowLength: Int = 30, slideInterval: Int = 5) {
    val baseStream = stream.transform(rdd => {
      val t = (Long.MaxValue - System.currentTimeMillis)
      rdd.map(x => (x._1, x._2 + ", " + t))
    }).map(x => {
      val splitVal = x._2.split(",")
      val userVal = splitVal(0).split(":")
      val actionVal = splitVal(1).split(":")
      val pageVal = splitVal(2).split(":")
      val timestamp = splitVal(3)
      (actionVal(1), userVal(1), pageVal(1), timestamp)
    })
    baseStream.persist()

    val accountStream = baseStream.filter(_._1 == "view")
      .map(x => x._2)
      .countByValue()

    val totalUniqueUser = accountStream
      .updateStateByKey[Int](updateStateByKeyFunction _)
      .count()
      .map(x => "totalUniqueUser:" + x)

    val baseStreamPerTirty = baseStream
      .window(Seconds(windowLength), Seconds(slideInterval))
      .filter(_._1 == "view")
    baseStreamPerTirty.persist()

    val pageViewPerTirty = baseStreamPerTirty
      .count()
      .map(x => "PageView:" + x)

    val uniqueUserPerTirty = baseStreamPerTirty
      .map(x => x._2)
      .countByValue()
      .count()
      .map(x => "UniqueUser:" + x)

    val pageViewStream = baseStream
      .filter(_._1 == "view")
      .map(x => x._3)
      .count()
      .map(x => "PageView:" + x)

    val outputStream = totalUniqueUser
      .union(pageViewPerTirty)
      .union(uniqueUserPerTirty)
      .union(pageViewStream)
      .reduce((x, y) => x + ", " + y)
      .saveAsTextFiles(saveDir)
  }
}
// scalastyle:on println
Example 14
Source File: SparkStreamingTaxiTripToHBase.scala From Taxi360 with Apache License 2.0

package com.hadooparchitecturebook.taxi360.streaming.ingestion.hbase

import java.io.File

import com.hadooparchitecturebook.taxi360.model.NyTaxiYellowTripBuilder
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.solr.common.cloud.ZooKeeperException
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingTaxiTripToHBase {
  def main(args: Array[String]): Unit = {
    println("Java Version:" + System.getProperty("java.version"))
    println("Java Home:" + System.getProperties().getProperty("java.home"))

    // Unused; most likely kept only to confirm the Solr ZooKeeper dependency is on the classpath.
    val v: ZooKeeperException = null

    if (args.length == 0) {
      println("Args: <KafkaBrokerList> " +
        "<kafkaTopicList> " +
        "<numberOfSeconds>" +
        "<runLocal>" +
        "<hbaseTable>" +
        "<numOfSalts>" +
        "<checkpointDir>" +
        "<hbaseConfigFolder>")
      return
    }

    val kafkaBrokerList = args(0)
    val kafkaTopicList = args(1)
    val numberOfSeconds = args(2).toInt
    val runLocal = args(3).equals("l")
    val tableName = args(4)
    val numOfSalts = args(5).toInt
    val checkpointFolder = args(6)
    val hbaseConfigFolder = args(7)

    println("kafkaBrokerList:" + kafkaBrokerList)
    println("kafkaTopicList:" + kafkaTopicList)
    println("numberOfSeconds:" + numberOfSeconds)
    println("runLocal:" + runLocal)
    println("tableName:" + tableName)
    println("numOfSalts:" + numOfSalts)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConf = new SparkConf().setAppName("Spark Streaming Ingestion to HBase")
      new SparkContext(sparkConf)
    }
    val ssc = new StreamingContext(sc, Seconds(numberOfSeconds))

    val topicsSet = kafkaTopicList.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList)

    val messageStream = KafkaUtils.
      createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    val conf = HBaseConfiguration.create()
    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val hbaseContext = new HBaseContext(sc, conf)

    val tripDStream = messageStream.map(r => {
      (r._1, r._2.split(","))
    }).filter(r => r._2.size > 3).map(r => {
      (r._1, NyTaxiYellowTripBuilder.build(r._2))
    })

    tripDStream.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName), taxi => {
      TaxiTripHBaseHelper.generatePut(taxi._2, numOfSalts)
    })

    ssc.checkpoint(checkpointFolder)
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 15
Source File: StreamingTSExample.scala From spark-riak-connector with Apache License 2.0

package com.basho.riak.spark.examples.streaming

import java.util.UUID

import kafka.serializer.StringDecoder
import org.apache.spark.sql.Row
import org.apache.spark.streaming.Durations
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.{SparkConf, SparkContext}
import com.basho.riak.spark.streaming._
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.joda.time.DateTime
import org.joda.time.format.DateTimeFormat

object StreamingTSExample {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf(true)
      .setAppName("Simple Spark Streaming to Riak TS Demo")

    setSparkOpt(sparkConf, "spark.master", "local")
    setSparkOpt(sparkConf, "spark.riak.connection.host", "127.0.0.1:8087")
    setSparkOpt(sparkConf, "kafka.broker", "127.0.0.1:9092")

    val sc = new SparkContext(sparkConf)
    val streamCtx = new StreamingContext(sc, Durations.seconds(15))

    val kafkaProps = Map[String, String](
      "metadata.broker.list" -> sparkConf.get("kafka.broker"),
      "client.id" -> UUID.randomUUID().toString
    )

    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](streamCtx, kafkaProps,
      Set[String]("ingest-ts")
    ) map { case (key, value) =>
      val mapper = new ObjectMapper()
      mapper.registerModule(DefaultScalaModule)
      val wr = mapper.readValue(value, classOf[Map[String, String]])
      Row(
        wr("weather"),
        wr("family"),
        DateTime.parse(wr("time"), DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")).getMillis,
        wr("temperature"),
        wr("humidity"),
        wr("pressure"))
    } saveToRiakTS "ts_weather_demo"

    streamCtx.start()
    println("Spark streaming context started. Spark UI could be found at http://SPARK_MASTER_HOST:4040")
    println("NOTE: if you're running job on the 'local' master open http://localhost:4040")
    streamCtx.awaitTermination()
  }

  private def setSparkOpt(sparkConf: SparkConf, option: String, defaultOptVal: String): SparkConf = {
    val optval = sparkConf.getOption(option).getOrElse(defaultOptVal)
    sparkConf.set(option, optval)
  }
}
Example 16
Source File: StreamingKVExample.scala From spark-riak-connector with Apache License 2.0

package com.basho.riak.spark.examples.streaming

import java.util.UUID

import kafka.serializer.StringDecoder
import com.basho.riak.spark._
import com.basho.riak.spark.streaming._
import com.basho.riak.spark.util.RiakObjectConversionUtil
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object StreamingKVExample {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf(true)
      .setAppName("Simple Spark Streaming to Riak KV Demo")

    setSparkOpt(sparkConf, "spark.master", "local")
    setSparkOpt(sparkConf, "spark.riak.connection.host", "127.0.0.1:8087")
    setSparkOpt(sparkConf, "kafka.broker", "127.0.0.1:9092")

    val sc = new SparkContext(sparkConf)
    val streamCtx = new StreamingContext(sc, Durations.seconds(15))

    val kafkaProps = Map[String, String](
      "metadata.broker.list" -> sparkConf.get("kafka.broker"),
      "client.id" -> UUID.randomUUID().toString
    )

    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](streamCtx, kafkaProps,
      Set[String]("ingest-kv")
    ) map { case (key, value) =>
      val obj = RiakObjectConversionUtil.to(value)
      obj.setContentType("application/json")
      obj
    } saveToRiak "test-data"

    streamCtx.start()
    println("Spark streaming context started. Spark UI could be found at http://SPARK_MASTER_HOST:4040")
    println("NOTE: if you're running job on the 'local' master open http://localhost:4040")
    streamCtx.awaitTermination()
  }

  private def setSparkOpt(sparkConf: SparkConf, option: String, defaultOptVal: String): SparkConf = {
    val optval = sparkConf.getOption(option).getOrElse(defaultOptVal)
    sparkConf.set(option, optval)
  }
}
Example 17
Source File: KafkaWordCount.scala From AI with Apache License 2.0

package com.bigchange.basic

import java.util

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaWordCount {

  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println("Usage: <zkQuorum> <group> <topics> <numThreads>")
      System.exit(1)
    }

    val Array(zkQuorum, group, topics, numThreads) = args
    val sparkConf = new SparkConf().setAppName("KafkaWordCount").
      set("spark.streaming.receiver.writeAheadLog.enable", "true").
      set("spark.streaming.kafka.maxRatePerPartition", "1000")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Set a checkpoint directory; the window operation below requires checkpointing
    ssc.checkpoint("checkpoint")

    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    // createStream returns (key, value) tuples; only the value is used here.
    // Note that this is the receiver-based approach (a non-receiver, direct mode is also available).
    // With default settings it can lose data if the receiver dies, so the write-ahead log is enabled
    // above and the storage level is adjusted accordingly.
    val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap,
      StorageLevel.MEMORY_AND_DISK_SER).map(_._2)
    val words = lines.flatMap(_.split(" "))
    // Count words over a 10-second window, recomputed every 2 seconds
    val wordCounts = words.map(x => (x, 1L))
      .reduceByKeyAndWindow(_ + _, _ - _, Seconds(10), Seconds(2), 2).
      filter(x => x._2 > 0)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}

// Produces some random words between 1 and 100.
object KafkaWordCountProducer {

  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println("Usage: <metadataBrokerList> <topic> " +
        "<messagesPerSec> <wordsPerMessage>")
      System.exit(1)
    }

    // Note that this is a broker list, in host:port,host:port form
    val Array(brokers, topic, messagesPerSec, wordsPerMessage) = args

    // Zookeeper connection properties
    val props = new util.HashMap[String, Object]()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)

    // Send some messages
    while (true) {
      (1 to messagesPerSec.toInt).foreach { messageNum =>
        val str = (1 to wordsPerMessage.toInt).map(x => scala.util.Random.nextInt(100).toString)
          .mkString(" ")
        val message = new ProducerRecord[String, String](topic, null, str)
        producer.send(message)
      }
      Thread.sleep(1000)
    }
  }
}