org.apache.spark.streaming.dstream.InputDStream Scala Examples
The following examples show how to use org.apache.spark.streaming.dstream.InputDStream.
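Before the project examples, it may help to see the minimal contract an InputDStream subclass has to satisfy: start(), stop(), and compute(validTime), which returns the RDD for each batch. The sketch below is purely illustrative (the class name and the constant data are not taken from any of the projects listed here); it follows the same pattern as the SimpleJsonFileInputDStream and TestableQueueInputDStream examples further down.

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{StreamingContext, Time}
import org.apache.spark.streaming.dstream.InputDStream

// Illustrative only: an InputDStream that emits the same small RDD every batch.
class FixedValuesInputDStream(@transient ssc: StreamingContext, values: Seq[Int])
  extends InputDStream[Int](ssc) {

  override def start(): Unit = {}  // open connections or receivers here if needed
  override def stop(): Unit = {}   // release them here

  // Called once per batch interval; returning None means "no data for this batch".
  override def compute(validTime: Time): Option[RDD[Int]] = {
    Some(ssc.sparkContext.parallelize(values))
  }
}

Registered with a StreamingContext (for example, new FixedValuesInputDStream(ssc, Seq(1, 2, 3)).print()), it behaves like the socket and Kafka input streams used in the examples below.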
Example 1
Source File: gihyo_6_3_countByValue.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_countByValue {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val countValue = stream.countByValue()
    countValue.print
  }
}
Example 2
Source File: gihyo_6_3_reduceByKeyAndWindow_efficient.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByKeyAndWindow_efficient {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt
    val checkpointDir = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    ssc.checkpoint(checkpointDir)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.map(x => (x, 1))
      .reduceByKeyAndWindow(
        (a: Int, b: Int) => a + b,
        (a: Int, b: Int) => a - b,
        Seconds(windowLength),
        Seconds(slideInterval))
    userList.print
  }
}
Example 3
Source File: gihyo_6_3_Transform.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Transform {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment")))
    run(lines, blackList)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], blackList: RDD[(String, String)]) {
    val userList = stream.map(x => (x, "action:Login")).transform(rdd => {
      val tmpUserList = rdd.leftOuterJoin(blackList)
      tmpUserList.filter(user => (user._2._2 == None))
    })
    userList.print
  }
}
Example 4
Source File: gihyo_6_3_reduceByKeyAndWindow.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByKeyAndWindow {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.map(x => (x, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
Example 5
Source File: gihyo_6_3_countByValueAndWindow.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

// scalastyle:off println
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_countByValueAndWindow {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt
    val checkpointDir = args(2)

    val f = createStreamingContext(targetHost, targetHostPort, checkpointDir)
    val ssc = StreamingContext.getOrCreate(checkpointDir, f)

    sys.ShutdownHookThread {
      System.out.println("Gracefully stopping SparkStreaming Application")
      ssc.stop(true, true)
      System.out.println("SparkStreaming Application stopped")
    }

    ssc.start
    ssc.awaitTermination
  }

  def createStreamingContext(
      targetHost: String,
      targetHostPort: Int,
      checkpointDir: String): () => StreamingContext = { () => {
    val conf = new SparkConf().setAppName("gihyoSample_Application")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(checkpointDir)
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)
    ssc
  }}

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.countByValueAndWindow(Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
// scalastyle:on println
Example 6
Source File: gihyo_6_3_updateStateByKey.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_updateStateByKey {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt
    val checkpointDir = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    ssc.checkpoint(checkpointDir)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val userList = stream.map(x => (x, 1)).updateStateByKey[Int](updateStateByKeyFunction _)
    userList.print
  }

  def updateStateByKeyFunction(values: Seq[Int], running: Option[Int]): Option[Int] = {
    Some(running.getOrElse(0) + values.size)
  }
}
Example 7
Source File: gihyo_6_3_Filter.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Filter {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val overData = stream.filter(line => line.length > 5)
    overData.print
  }
}
Example 8
Source File: gihyo_6_3_countByWindow.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_countByWindow {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt
    val checkpointDir = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    ssc.checkpoint(checkpointDir)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.countByWindow(Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
Example 9
Source File: gihyo_6_3_Window.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Window {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.window(Seconds(windowLength), Seconds(slideInterval)).countByValue()
    userList.print
  }
}
Example 10
Source File: gihyo_6_3_reduceByKey.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByKey {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val countKeyValue = stream.map(x => (x, 1)).reduceByKey((x, y) => x + y)
    countKeyValue.print
  }
}
Example 11
Source File: CheckpointingKafkaExtractor.scala From streamliner-examples with Apache License 2.0

package com.memsql.spark.examples.kafka

import com.memsql.spark.etl.api.{UserExtractConfig, PhaseConfig, ByteArrayExtractor}
import com.memsql.spark.etl.utils.PhaseLogger
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.StreamingContext

import kafka.serializer.{DefaultDecoder, StringDecoder}
import org.apache.spark.streaming.kafka.{CheckpointedDirectKafkaInputDStream, CheckpointedKafkaUtils}
import org.apache.spark.streaming.dstream.InputDStream

class CheckpointingKafkaExtractor extends ByteArrayExtractor {
  var CHECKPOINT_DATA_VERSION = 1

  var dstream: CheckpointedDirectKafkaInputDStream[String, Array[Byte], StringDecoder, DefaultDecoder, Array[Byte]] = null

  var zkQuorum: String = null
  var topic: String = null

  override def initialize(ssc: StreamingContext, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Unit = {
    val kafkaConfig = config.asInstanceOf[UserExtractConfig]
    zkQuorum = kafkaConfig.getConfigString("zk_quorum").getOrElse {
      throw new IllegalArgumentException("\"zk_quorum\" must be set in the config")
    }
    topic = kafkaConfig.getConfigString("topic").getOrElse {
      throw new IllegalArgumentException("\"topic\" must be set in the config")
    }
  }

  def extract(ssc: StreamingContext, extractConfig: PhaseConfig, batchDuration: Long, logger: PhaseLogger): InputDStream[Array[Byte]] = {
    val kafkaParams = Map[String, String](
      "memsql.zookeeper.connect" -> zkQuorum
    )
    val topics = Set(topic)

    dstream = CheckpointedKafkaUtils.createDirectStreamFromZookeeper[String, Array[Byte], StringDecoder, DefaultDecoder](
      ssc, kafkaParams, topics, batchDuration, lastCheckpoint)
    dstream
  }

  override def batchCheckpoint: Option[Map[String, Any]] = {
    dstream match {
      case null => None
      case default => {
        val currentOffsets = dstream.getCurrentOffsets.map { case (tp, offset) =>
          Map("topic" -> tp.topic, "partition" -> tp.partition, "offset" -> offset)
        }
        Some(Map("offsets" -> currentOffsets, "zookeeper" -> zkQuorum, "version" -> CHECKPOINT_DATA_VERSION))
      }
    }
  }

  override def batchRetry: Unit = {
    if (dstream.prevOffsets != null) {
      dstream.setCurrentOffsets(dstream.prevOffsets)
    }
  }
}
Example 12
Source File: SimpleJsonFileInputDStream.scala From spark-cep with Apache License 2.0

package org.apache.spark.sql.streaming

import scala.io.Source

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{StreamingContext, Time}

class SimpleJsonFileInputDStream (
    sqlc: SQLContext,
    @transient ssc: StreamingContext,
    path: String) extends InputDStream[String](ssc) {

  val jsons = Source.fromFile(path).getLines().toList
  var index = 0

  override def start(): Unit = {
  }

  override def stop(): Unit = {
  }

  override def compute(validTime: Time): Option[RDD[String]] = {
    val rddOption = Option(ssc.sparkContext.parallelize(List(jsons(index % jsons.size))))
    index = index + 1
    rddOption
  }
}
Example 13
Source File: KafkaUtility.scala From real-time-stream-processing-engine with Apache License 2.0

package com.knoldus.streaming.kafka

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object KafkaUtility {

  //TODO It should read from config
  private val kafkaParams = Map(
    "bootstrap.servers" -> "localhost:9092",
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "auto.offset.reset" -> "earliest",
    "group.id" -> "tweet-consumer"
  )

  private val preferredHosts = LocationStrategies.PreferConsistent

  def createDStreamFromKafka(ssc: StreamingContext, topics: List[String]): InputDStream[ConsumerRecord[String, String]] =
    KafkaUtils.createDirectStream[String, String](
      ssc,
      preferredHosts,
      ConsumerStrategies.Subscribe[String, String](topics.distinct, kafkaParams)
    )
}
Example 14
Source File: WeatherDataStream.scala From spark-scala with Creative Commons Zero v1.0 Universal

package com.supergloo

import com.killrweather.data.Weather.RawWeatherData
import kafka.serializer.StringDecoder
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils

    parsedWeatherStream.map { weather =>
      (weather.wsid, weather.year, weather.month, weather.day, weather.oneHourPrecip)
    }.saveToCassandra(CassandraKeyspace, CassandraTableDailyPrecip)
  }

  def ingestStream(rawWeatherStream: InputDStream[(String, String)]): DStream[RawWeatherData] = {
    val parsedWeatherStream = rawWeatherStream.map(_._2.split(","))
      .map(RawWeatherData(_))
    parsedWeatherStream
  }
}
Example 15
Source File: KafkaFlowExample.scala From kafka-scala-api with Apache License 2.0

package com.example.flow

import org.apache.spark.streaming.dstream.DStream._
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.joda.time.DateTime
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods._

import scala.util.Try

case class Purchase(item_id: String, amount: BigDecimal, time: Long)
case class Key(item_id: String, time: DateTime)
case class Summary(item_id: String, time: DateTime, total: BigDecimal)

object KafkaFlowExample {
  implicit val formats = DefaultFormats

  def extract(message: String): Option[(Key, BigDecimal)] = {
    for {
      parsed <- Try(parse(message)).toOption
      purchase <- parsed.extractOpt[Purchase]
    } yield {
      val datetime = new DateTime(purchase.time)
      val roundedTime = datetime.withMinuteOfHour(0).withSecondOfMinute(0).withMillisOfSecond(0)
      Key(purchase.item_id, roundedTime) -> purchase.amount
    }
  }

  def transformStream(stream: InputDStream[String]): DStream[Summary] = {
    stream
      .flatMap(extract)
      .reduceByKey(_ + _)
      .map { case (key, amount) =>
        Summary(key.item_id, key.time, amount)
      }
  }
}
Example 16
Source File: KafkaStreamingLatestExample.scala From kafka-scala-api with Apache License 2.0

package com.example.kafka010

import java.{util => ju}

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import org.apache.spark.{SparkContext, TaskContext}

import scala.collection.JavaConversions._
import com.example._

object KafkaStreamingLatestExample {

  def main(args: Array[String]): Unit = {
    kafkaStream010Checkpointing()
  }

  def kafkaStream010Itself() =
    launchWithItself(kafkaStreaming010, appName = "Kafka010_DirectStream")

  private def kafkaStreaming010(streamingContext: StreamingContext): Unit = {
    val topics = Array("sample_topic")
    val stream = KafkaUtils.createDirectStream[String, String](
      streamingContext,
      PreferConsistent, // It will consistently distribute partitions across all executors.
      Subscribe[String, String](topics, kafkaParams)
    )

    stream.map(record => (record.key, record.value)).print()

    stream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.foreachPartition { _ =>
        val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
        println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
      }
    }

    storingOffsetsItself(stream)
  }

  private def storingOffsetsItself(stream: InputDStream[ConsumerRecord[String, String]]) = {
    stream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }
  }

  private def kafkaRdd010() = {
    val sparkContext = new SparkContext("local[*]", "kafkaRdd010")

    val offsetRanges = Array(
      // topic, partition, inclusive starting offset, exclusive ending offset
      OffsetRange("sample_topic", 0, 10, 20),
      OffsetRange("sample_topic", 1, 10, 20)
    )

    val params = new ju.HashMap[String, Object](kafkaParams)
    val kafkaRDD = KafkaUtils.createRDD[String, String](sparkContext, params, offsetRanges, PreferConsistent)
    println(kafkaRDD.map(_.value()).first())
  }
}
Example 17
Source File: Kafka2OdpsDemo.scala From MaxCompute-Spark with Apache License 2.0

package com.aliyun.odps.spark.examples.streaming.kafka

import com.aliyun.odps.spark.examples.streaming.common.SparkSessionSingleton
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object Kafka2OdpsDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    // Use OSS as the checkpoint store and change this to a valid OSS path. For the OSS access documentation, see
    // https://github.com/aliyun/MaxCompute-Spark/wiki/08.-Oss-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E
    ssc.checkpoint("oss://bucket/checkpointdir")

    // Kafka configuration parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "testGroupId",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Create the Kafka DStream
    val topics = Set("test")
    val recordDstream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )
    val dstream = recordDstream.map(f => (f.key(), f.value()))

    // Parse the Kafka records and write the words into an ODPS table
    val data: DStream[String] = dstream.map(_._2)
    val wordsDStream: DStream[String] = data.flatMap(_.split(" "))
    wordsDStream.foreachRDD(rdd => {
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._
      rdd.toDF("id").write.mode("append").saveAsTable("test_table")
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 18
Source File: KafkaStreamingDemo.scala From MaxCompute-Spark with Apache License 2.0

package com.aliyun.odps.spark.examples.streaming.kafka

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaStreamingDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("KafkaStreamingDemo")
      .getOrCreate()

    val ssc = new StreamingContext(spark.sparkContext, Seconds(5))

    // Use OSS as the checkpoint store
    ssc.checkpoint("oss://bucket/checkpointDir/")

    // Kafka configuration parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.1.1:9200,192.168.1.2:9200,192.168.1.3:9200",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "testGroupId",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val topics = Set("event_topic")
    val recordDstream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )

    val dstream = recordDstream.map(f => (f.key(), f.value()))
    val data: DStream[String] = dstream.map(_._2)
    val wordsDStream: DStream[String] = data.flatMap(_.split(" "))
    val wordAndOneDstream: DStream[(String, Int)] = wordsDStream.map((_, 1))
    val result: DStream[(String, Int)] = wordAndOneDstream.reduceByKey(_ + _)
    result.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 19
Source File: gihyo_6_3_KafkaStream.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

// scalastyle:off println
import kafka.serializer.StringDecoder

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_KafkaStream {

  def main(args: Array[String]) {
    if (args.length != 4) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val brokerList = args(0)
    val consumeTopic = args(1)
    val checkpointDir = args(2)
    val saveDir = args(3)

    val f = createStreamingContext(brokerList, consumeTopic, checkpointDir, saveDir)
    // Obtain the StreamingContext
    val ssc = StreamingContext.getOrCreate(checkpointDir, f)

    sys.ShutdownHookThread {
      System.out.println("Gracefully stopping SparkStreaming Application")
      ssc.stop(true, true)
      System.out.println("SparkStreaming Application stopped")
    }

    ssc.start
    ssc.awaitTermination
  }

  def createStreamingContext(brokerList: String,
      consumeTopic: String,
      checkpointDir: String,
      saveDir: String): () => StreamingContext = { () => {
    val conf = new SparkConf().setAppName("gihyoSample_Application")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(checkpointDir)
    val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, Map("metadata.broker.list" -> brokerList), Set(consumeTopic))
    run(kafkaStream, saveDir)
    ssc
  }}

  def updateStateByKeyFunction(values: Seq[Long], running: Option[Int]): Option[Int] = {
    System.out.println(values)
    Some(running.getOrElse(0) + values.length)
  }

  def run(stream: InputDStream[(String, String)],
      saveDir: String, windowLength: Int = 30, slideInterval: Int = 5) {
    val baseStream = stream.transform(rdd => {
      val t = (Long.MaxValue - System.currentTimeMillis)
      rdd.map(x => (x._1, x._2 + ", " + t))
    }).map(x => {
      val splitVal = x._2.split(",")
      val userVal = splitVal(0).split(":")
      val actionVal = splitVal(1).split(":")
      val pageVal = splitVal(2).split(":")
      val timestamp = splitVal(3)
      (actionVal(1), userVal(1), pageVal(1), timestamp)
    })
    baseStream.persist()

    val accountStream = baseStream.filter(_._1 == "view")
      .map(x => x._2)
      .countByValue()

    val totalUniqueUser = accountStream
      .updateStateByKey[Int](updateStateByKeyFunction _)
      .count()
      .map(x => "totalUniqueUser:" + x)

    val baseStreamPerTirty = baseStream
      .window(Seconds(windowLength), Seconds(slideInterval))
      .filter(_._1 == "view")
    baseStreamPerTirty.persist()

    val pageViewPerTirty = baseStreamPerTirty
      .count()
      .map(x => "PageView:" + x)

    val uniqueUserPerTirty = baseStreamPerTirty
      .map(x => x._2)
      .countByValue()
      .count()
      .map(x => "UniqueUser:" + x)

    val pageViewStream = baseStream
      .filter(_._1 == "view")
      .map(x => x._3)
      .count()
      .map(x => "PageView:" + x)

    val outputStream = totalUniqueUser
      .union(pageViewPerTirty)
      .union(uniqueUserPerTirty)
      .union(pageViewStream)
      .reduce((x, y) => x + ", " + y)
      .saveAsTextFiles(saveDir)
  }
}
// scalastyle:on println
Example 20
Source File: StreamHelper.scala From incubator-s2graph with Apache License 2.0

package org.apache.spark.streaming.kafka

import kafka.KafkaHelper
import kafka.common.TopicAndPartition
import kafka.consumer.PartitionTopicInfo
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.{Logging, SparkException}

import scala.reflect.ClassTag

case class StreamHelper(kafkaParams: Map[String, String]) extends Logging {
  // helper for kafka zookeeper
  lazy val kafkaHelper = KafkaHelper(kafkaParams)
  lazy val kc = new KafkaCluster(kafkaParams)

  // 1. get leader's earliest and latest offset
  // 2. get consumer offset
  // 3-1. if (2) is bounded in (1) use (2) for stream
  // 3-2. else use (1) by "auto.offset.reset"
  private def getStartOffsets(topics: Set[String]): Map[TopicAndPartition, Long] = {
    lazy val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
    lazy val consumerOffsets = kafkaHelper.getConsumerOffsets(topics.toSeq)

    {
      for {
        topicPartitions <- kc.getPartitions(topics).right
        smallOffsets <- kc.getEarliestLeaderOffsets(topicPartitions).right
        largeOffsets <- kc.getLatestLeaderOffsets(topicPartitions).right
      } yield {
        {
          for {
            tp <- topicPartitions
          } yield {
            val co = consumerOffsets.getOrElse(tp, PartitionTopicInfo.InvalidOffset)
            val so = smallOffsets.get(tp).map(_.offset).get
            val lo = largeOffsets.get(tp).map(_.offset).get
            logWarning(s"$tp: $co $so $lo")
            if (co >= so && co <= lo) {
              (tp, co)
            } else {
              (tp, reset match {
                case Some("smallest") => so
                case _ => lo
              })
            }
          }
        }.toMap
      }
    }.fold(errs => throw new SparkException(errs.mkString("\n")), ok => ok)
  }

  def createStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](ssc: StreamingContext, topics: Set[String]): InputDStream[(K, V)] = {
    type R = (K, V)
    val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key(), mmd.message())

    kafkaHelper.registerConsumerInZK(topics)

    new DirectKafkaInputDStream[K, V, KD, VD, R](ssc, kafkaParams, getStartOffsets(topics), messageHandler)
  }

  def commitConsumerOffsets(offsets: HasOffsetRanges): Unit = {
    val offsetsMap = {
      for {
        range <- offsets.offsetRanges if range.fromOffset < range.untilOffset
      } yield {
        logDebug(range.toString())
        TopicAndPartition(range.topic, range.partition) -> range.untilOffset
      }
    }.toMap

    kafkaHelper.commitConsumerOffsets(offsetsMap)
  }

  def commitConsumerOffset(range: OffsetRange): Unit = {
    if (range.fromOffset < range.untilOffset) {
      try {
        val tp = TopicAndPartition(range.topic, range.partition)
        logDebug("Committed offset " + range.untilOffset + " for topic " + tp)
        kafkaHelper.commitConsumerOffset(tp, range.untilOffset)
      } catch {
        case t: Throwable =>
          // log it and let it go
          logWarning("exception during commitOffsets", t)
          throw t
      }
    }
  }

  def commitConsumerOffsets[R](stream: InputDStream[R]): Unit = {
    stream.foreachRDD { rdd =>
      commitConsumerOffsets(rdd.asInstanceOf[HasOffsetRanges])
    }
  }
}
Example 21
Source File: SocketTextStream.scala From piflow with BSD 2-Clause "Simplified" License

package cn.piflow.bundle.streaming

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream, SocketReceiver}
import org.apache.spark.streaming.{Seconds, StreamingContext}

class SocketTextStream extends ConfigurableStreamingStop {

  override val authorEmail: String = "[email protected]"
  override val description: String = "Receive text data from socket"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)
  override var batchDuration: Int = _

  var hostname: String = _
  var port: String = _
  //var schema: String = _

  override def setProperties(map: Map[String, Any]): Unit = {
    hostname = MapUtil.get(map, key = "hostname").asInstanceOf[String]
    port = MapUtil.get(map, key = "port").asInstanceOf[String]
    //schema = MapUtil.get(map, key = "schema").asInstanceOf[String]
    val timing = MapUtil.get(map, key = "batchDuration")
    batchDuration = if (timing == None) new Integer(1) else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname to connect to for receiving data").defaultValue("").required(true)
    val port = new PropertyDescriptor().name("port").displayName("port").description("Port to connect to for receiving data").defaultValue("").required(true)
    //val schema = new PropertyDescriptor().name("schema").displayName("schema").description("data schema").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    descriptor = hostname :: descriptor
    descriptor = port :: descriptor
    //descriptor = schema :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor
  }

  //TODO: change icon
  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/SocketTextStream.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val socketDF = spark
      .readStream
      .format("socket")
      .option("host", hostname)
      .option("port", port)
      .load()

    out.write(socketDF)
  }

  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val dstream = ssc.socketTextStream(hostname, Integer.parseInt(port))
    dstream.asInstanceOf[DStream[String]]
  }
}
Example 22
Source File: TestableQueueInputDStream.scala From SparkUnitTestingExamples with Apache License 2.0

package org.apache.spark.streaming

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.dstream.InputDStream

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

class TestableQueueInputDStream[T: ClassTag](
    ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(context.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }
}
Example 23
Source File: redisStreamingFunctions.scala From spark-redis with BSD 3-Clause "New" or "Revised" License

package com.redislabs.provider.redis.streaming

import com.redislabs.provider.redis.{ReadWriteConfig, RedisConfig}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream

  def createRedisStreamWithoutListname(keys: Array[String],
      storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2)
      (implicit redisConf: RedisConfig = RedisConfig.fromSparkConf(ssc.sparkContext.getConf)): RedisInputDStream[String] = {
    new RedisInputDStream(ssc, keys, storageLevel, redisConf, classOf[String])
  }

  def createRedisXStream(consumersConfig: Seq[ConsumerConfig],
      storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2)
      (implicit redisConfig: RedisConfig = RedisConfig.fromSparkConf(ssc.sparkContext.getConf)): InputDStream[StreamItem] = {
    val readWriteConfig = ReadWriteConfig.fromSparkConf(ssc.sparkContext.getConf)
    val receiver = new RedisStreamReceiver(consumersConfig, redisConfig, readWriteConfig, storageLevel)
    ssc.receiverStream(receiver)
  }
}

trait RedisStreamingFunctions {

  implicit def toRedisStreamingContext(ssc: StreamingContext): RedisStreamingContext = new RedisStreamingContext(ssc)

}
Example 24
Source File: gihyo_6_2_1_Sample.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_2_1_Sample {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    val wordCounts = run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val words = stream.flatMap(_.split(" "))
    val pairs = words.map(word => (word, 1))
    val wordCounts = pairs.reduceByKey(_ + _)
    wordCounts.print
  }
}
Example 25
Source File: gihyo_6_3_Join.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Join {
  def main(args: Array[String]) {
    if (args.length != 4) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost1 = args(0)
    val targetHostPort1 = args(1).toInt
    val targetHost2 = args(2)
    val targetHostPort2 = args(3).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines1 = ssc.socketTextStream(targetHost1, targetHostPort1)
    val lines2 = ssc.socketTextStream(targetHost2, targetHostPort2)
    run(lines1, lines2)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], joinStream: InputDStream[String]) {
    val lines1KV = stream.map(x => (x, "attribute1"))
    val lines2KV = joinStream.map(x => (x, Array("attribute2", "attribute3", "attribute4")))
    val linesKVW = lines1KV.join(lines2KV)
    linesKVW.print
  }
}
Example 26
Source File: gihyo_6_3_Reduce.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Reduce {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val windowLineCount = stream.reduce((x, y) => x + "," + y)
    windowLineCount.print
  }
}
Example 27
Source File: gihyo_6_3_reduceByWindow.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByWindow {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.reduceByWindow((x, y) => x + y, Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
Example 28
Source File: DirectKafkaWordCount.scala From spark-secure-kafka-app with Apache License 2.0

package com.cloudera.spark.examples

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, LocationStrategies, KafkaUtils}
import org.apache.spark.streaming._

object DirectKafkaWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println(s"""
        |Usage: DirectKafkaWordCount <brokers> <topics>
        |  <brokers> is a list of one or more Kafka brokers
        |  <topics> is a list of one or more kafka topics to consume from
        |  <ssl> true if using SSL, false otherwise.
        |
        """.stripMargin)
      System.exit(1)
    }

    val Array(brokers, topics, ssl) = args

    // Create context with 2 second batch interval
    val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    val isUsingSsl = ssl.toBoolean

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val commonParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "security.protocol" -> (if (isUsingSsl) "SASL_SSL" else "SASL_PLAINTEXT"),
      "sasl.kerberos.service.name" -> "kafka",
      "auto.offset.reset" -> "earliest",
      "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "group.id" -> "default",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val additionalSslParams = if (isUsingSsl) {
      Map(
        "ssl.truststore.location" -> "/etc/cdep-ssl-conf/CA_STANDARD/truststore.jks",
        "ssl.truststore.password" -> "cloudera"
      )
    } else {
      Map.empty
    }

    val kafkaParams = commonParams ++ additionalSslParams

    val messages: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topicsSet, kafkaParams)
      )

    // Get the lines, split them into words, count the words and print
    val lines = messages.map(_.value())
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 29
Source File: gihyo_6_3_TwitterStream.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

// scalastyle:off println
import org.atilika.kuromoji.Token
import twitter4j.Status

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object gihyo_6_3_TwitterStream {

  def main(args: Array[String]) {
    if (args.length != 7) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val Array(cKey, cSecret, aToken, aSecret, cDir, tagDir, wordDir) = args

    System.setProperty("twitter4j.oauth.consumerKey", cKey)
    System.setProperty("twitter4j.oauth.consumerSecret", cSecret)
    System.setProperty("twitter4j.oauth.accessToken", aToken)
    System.setProperty("twitter4j.oauth.accessTokenSecret", aSecret)

    val f = createStreamingContext(cDir, tagDir, wordDir)
    val ssc = StreamingContext.getOrCreate(cDir, f)

    sys.ShutdownHookThread {
      System.out.println("Gracefully stopping SparkStreaming Application")
      ssc.stop(true, true)
      System.out.println("SparkStreaming Application stopped")
    }

    ssc.start
    ssc.awaitTermination
  }

  def createStreamingContext(checkpointDir: String,
      tagDir: String,
      wordDir: String): () => StreamingContext = { () => {
    val conf = new SparkConf().setAppName("gihyoSample_Application")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.registerKryoClasses(Array(classOf[UserDic]))

    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(checkpointDir)
    val twitterStream = TwitterUtils.createStream(ssc, None)
    run(sc, twitterStream, tagDir, wordDir)
    ssc
  }}

  def run(sc: SparkContext, stream: InputDStream[Status], tagDir: String, wordDir: String) {
    val tokenizer = sc.broadcast(UserDic.getInstance)
    val tweets = stream.map(tweet => tweet.getText())
    tweets.persist()

    val TweetText = tweets
      .flatMap(text => {
        val tokens = tokenizer.value.tokenize(text).toArray
        tokens.filter(t => {
          val token = t.asInstanceOf[Token]
          ((token.getPartOfSpeech.indexOf("名詞") > -1 &&
            token.getPartOfSpeech.indexOf("一般") > -1) ||
            token.getPartOfSpeech.indexOf("カスタム名詞") > -1) &&
            token.getSurfaceForm.length > 1 &&
            !(token.getSurfaceForm matches "^[a-zA-Z]+$|^[0-9]+$")
        }).map(t => t.asInstanceOf[Token].getSurfaceForm)
      })
      .countByValue()
      .map(x => (x._2, x._1))
      .transform(_.sortByKey(false))
      .map(x => (x._2, x._1))

    val TweetTags = tweets
      .flatMap(tweet => tweet.split(" ").filter(_.startsWith("#")))
      .countByValue()
      .map(x => (x._2, x._1))
      .transform(_.sortByKey(false))
      .map(x => (x._2, x._1))

    TweetText.saveAsTextFiles(wordDir)
    TweetTags.saveAsTextFiles(tagDir)
  }
}
// scalastyle:on println
Example 30
Source File: gihyo_6_3_Union.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils

object gihyo_6_3_Union {
  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHosts = args(0)
    val consumerGroup = args(1)
    val targetTopics = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))

    val KafkaStreams = (1 to 5).map { i =>
      KafkaUtils.createStream(ssc, targetHosts, consumerGroup, Map(targetTopics -> 1))
    }
    run(ssc, KafkaStreams)

    ssc.start
    ssc.awaitTermination
  }

  def run(ssc: StreamingContext, streams: IndexedSeq[InputDStream[(String, String)]]) {
    val unionedStream = ssc.union(streams)
    unionedStream.print
  }
}
Example 31
Source File: gihyo_6_3_flatMap.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_flatMap {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val words = stream.flatMap(line => line.split(" "))
    words.print
  }
}
Example 32
Source File: gihyo_6_3_Repartition.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Repartition {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val repartitionData = stream.repartition(3)
    // scalastyle:off println
    repartitionData.foreachRDD(rdd => println(s"partition size: ${rdd.partitions.size.toString}"))
    // scalastyle:on println
    repartitionData.print
  }
}
Example 33
Source File: gihyo_6_3_Count.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Count {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val lineCount = stream.window(Seconds(windowLength), Seconds(slideInterval)).count
    lineCount.print
  }
}
Example 34
Source File: gihyo_6_3_Map.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Map {
  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val lineCount = stream.map(line => (line, 1))
    lineCount.print
  }
}
Example 35
Source File: gihyo_6_3_Cogroup.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Cogroup {
  def main(args: Array[String]) {
    if (args.length != 4) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost1 = args(0)
    val targetHostPort1 = args(1).toInt
    val targetHost2 = args(2)
    val targetHostPort2 = args(3).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines1 = ssc.socketTextStream(targetHost1, targetHostPort1)
    val lines2 = ssc.socketTextStream(targetHost2, targetHostPort2)
    run(lines1, lines2)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], otherStream: InputDStream[String]) {
    val lines1KV = stream.map(x => (x, "attribute1"))
    val lines2KV = otherStream.map(x => (x, "attribute2"))
    val linesKVW = lines1KV.cogroup(lines2KV)
    linesKVW.print
  }
}