org.apache.spark.streaming.Seconds Scala Examples
The following examples show how to use org.apache.spark.streaming.Seconds.
You can go to the original project or source file by following the links above each example.
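In all of these examples, Seconds is a small helper that builds a streaming Duration, most commonly used as the batch interval of a StreamingContext or as the window and slide lengths of windowed operations. As a quick orientation before the examples, here is a minimal sketch; the application name, host, port, and interval values are illustrative placeholders, not taken from any of the projects below.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SecondsQuickStart {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("SecondsQuickStart")
    // Batch interval: one micro-batch every 5 seconds
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream("localhost", 9999)
    // Window of 30 seconds, sliding every 10 seconds -- both expressed with Seconds(...)
    lines.window(Seconds(30), Seconds(10)).count().print()
    ssc.start()
    ssc.awaitTermination()
  }
}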
Example 1
Source File: StreamingKafka10.scala From BigData-News with Apache License 2.0 | 7 votes |
package com.vita.spark

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

object StreamingKafka10 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[2]")
      .appName("streaming")
      .getOrCreate()

    val sc = spark.sparkContext
    val ssc = new StreamingContext(sc, Seconds(5))

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "node6:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "0001",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val topics = Array("weblogs")
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    val lines = stream.map(x => x.value())
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 2
Source File: SqlNetworkWordCount.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object SparkSessionSingleton {

  @transient private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}
// scalastyle:on println
Example 3
Source File: L5-15KafkaDirect.scala From prosparkstreaming with Apache License 2.0 | 6 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

object StationJourneyCountDirectApp {

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }
    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val topics = Set(topic)
    val params = Map[String, String](
      "zookeeper.connect" -> zkQuorum,
      "group.id" -> consumerGroupId,
      "bootstrap.servers" -> brokerUrl)

    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, topics).map(_._2)
      .map(rec => rec.split(","))
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 4
Source File: StreamingKafka8.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark

import kafka.serializer.StringDecoder
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Declared as an object (rather than a class) so that main can serve as an application entry point.
object StreamingKafka8 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[2]")
      .appName("streaming")
      .getOrCreate()

    val sc = spark.sparkContext
    val ssc = new StreamingContext(sc, Seconds(5))

    // Create direct kafka stream with brokers and topics
    val topicsSet = Set("weblogs")
    val kafkaParams = Map[String, String]("metadata.broker.list" -> "node5:9092")
    val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    val lines = kafkaStream.map(x => x._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 5
Source File: Test.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark.test

import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Receives messages from a network socket.
 * Arguments: Spark master address, host to connect to, port, and the batch interval,
 * e.g. local[*] localhost 8888 5
 */
object Test {

  case class Person(username: String, usercount: Int)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("hdfsTest")
      .getOrCreate()

    val ssc = new StreamingContext(spark.sparkContext, Seconds(1))
    val lines = ssc.socketTextStream("localhost", 9999)
    val words = lines.flatMap(_.split(" "))
    words.print()
    println()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 6
Source File: StreamingTestExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.mllib.stat.test.{BinarySample, StreamingTest}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.util.Utils

object StreamingTestExample {

  def main(args: Array[String]) {
    if (args.length != 3) {
      // scalastyle:off println
      System.err.println(
        "Usage: StreamingTestExample " +
          "<dataDir> <batchDuration> <numBatchesTimeout>")
      // scalastyle:on println
      System.exit(1)
    }
    val dataDir = args(0)
    val batchDuration = Seconds(args(1).toLong)
    val numBatchesTimeout = args(2).toInt

    val conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample")
    val ssc = new StreamingContext(conf, batchDuration)
    ssc.checkpoint {
      val dir = Utils.createTempDir()
      dir.toString
    }

    // $example on$
    val data = ssc.textFileStream(dataDir).map(line => line.split(",") match {
      case Array(label, value) => BinarySample(label.toBoolean, value.toDouble)
    })

    val streamingTest = new StreamingTest()
      .setPeacePeriod(0)
      .setWindowSize(0)
      .setTestMethod("welch")

    val out = streamingTest.registerStream(data)
    out.print()
    // $example off$

    // Stop processing if test becomes significant or we time out
    var timeoutCounter = numBatchesTimeout
    out.foreachRDD { rdd =>
      timeoutCounter -= 1
      val anySignificant = rdd.map(_.pValue < 0.05).fold(false)(_ || _)
      if (timeoutCounter == 0 || anySignificant) rdd.context.stop()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 7
Source File: StreamingKMeansExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}
// $example off$

object StreamingKMeansExample {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      System.exit(1)
    }

    // $example on$
    val conf = new SparkConf().setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setK(args(3).toInt)
      .setDecayFactor(1.0)
      .setRandomCenters(args(4).toInt, 0.0)

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
    // $example off$
  }
}
// scalastyle:on println
Example 8
Source File: QueueStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object QueueStream {

  def main(args: Array[String]) {
    StreamingExamples.setStreamingLogLevels()
    val sparkConf = new SparkConf().setAppName("QueueStream")
    // Create the context
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create the queue through which RDDs can be pushed to
    // a QueueInputDStream
    val rddQueue = new Queue[RDD[Int]]()

    // Create the QueueInputDStream and use it do some processing
    val inputStream = ssc.queueStream(rddQueue)
    val mappedStream = inputStream.map(x => (x % 10, 1))
    val reducedStream = mappedStream.reduceByKey(_ + _)
    reducedStream.print()
    ssc.start()

    // Create and push some RDDs into rddQueue
    for (i <- 1 to 30) {
      rddQueue.synchronized {
        rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10)
      }
      Thread.sleep(1000)
    }
    ssc.stop()
  }
}
Example 9
Source File: CustomReceiver.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      logInfo("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      logInfo("Connected to " + host + ":" + port)
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8))
      userInput = reader.readLine()
      while (!isStopped && userInput != null) {
        store(userInput)
        userInput = reader.readLine()
      }
      reader.close()
      socket.close()
      logInfo("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
// scalastyle:on println
Example 10
Source File: HdfsWordCount.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object HdfsWordCount {
  def main(args: Array[String]) {
    if (args.length < 1) {
      System.err.println("Usage: HdfsWordCount <directory>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()
    val sparkConf = new SparkConf().setAppName("HdfsWordCount")
    // Create the context
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create the FileInputDStream on the directory and use the
    // stream to count words in new files created
    val lines = ssc.textFileStream(args(0))
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 11
Source File: NetworkWordCount.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NetworkWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: NetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // Create the context with a 1 second batch size
    val sparkConf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create a socket stream on target ip:port and count the
    // words in input stream of \n delimited text (eg. generated by 'nc')
    // Note that no duplication in storage level only for running locally.
    // Replication necessary in distributed scenario for fault tolerance.
    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 12
Source File: TestStreamingSpec.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.spark

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.specs2.mutable.Specification
import org.specs2.specification.BeforeAfterAll

class TestStreamingSpec extends Specification with BeforeAfterAll {
  private val master = "local[2]"
  private val appName = "test_streaming"
  private val batchDuration = Seconds(1)

  private var sc: SparkContext = _
  private var ssc: StreamingContext = _

  override def beforeAll(): Unit = {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)

    ssc = new StreamingContext(conf, batchDuration)
    sc = ssc.sparkContext
  }

  override def afterAll(): Unit = {
    if (ssc != null) {
      ssc.stop()
    }
  }
}
Example 13
Source File: SocketTextStreamByWindow.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming

import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

class SocketTextStreamByWindow extends ConfigurableStreamingStop {
  override val authorEmail: String = "[email protected]"
  override val description: String = "Receive text data from socket by window"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)
  override var batchDuration: Int = _

  var hostname: String = _
  var port: String = _
  var windowDuration: Int = _
  var slideDuration: Int = _

  override def setProperties(map: Map[String, Any]): Unit = {
    hostname = MapUtil.get(map, key = "hostname").asInstanceOf[String]
    port = MapUtil.get(map, key = "port").asInstanceOf[String]
    windowDuration = MapUtil.get(map, key = "windowDuration").asInstanceOf[String].toInt
    slideDuration = MapUtil.get(map, key = "slideDuration").asInstanceOf[String].toInt
    val timing = MapUtil.get(map, key = "batchDuration")
    batchDuration = if (timing == None) new Integer(1) else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname to connect to for receiving data").defaultValue("").required(true)
    val port = new PropertyDescriptor().name("port").displayName("port").description("Port to connect to for receiving data").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    val windowDuration = new PropertyDescriptor().name("windowDuration").displayName("windowDuration").description("the window duration, the unit is seconds").defaultValue("").required(true)
    val slideDuration = new PropertyDescriptor().name("slideDuration").displayName("slideDuration").description("the slide duration, the unit is seconds").defaultValue("").required(true)
    descriptor = hostname :: descriptor
    descriptor = port :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor = windowDuration :: descriptor
    descriptor = slideDuration :: descriptor
    descriptor
  }

  //TODO: change icon
  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/SocketTextStreamByWindow.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {}

  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val dstream = ssc.socketTextStream(hostname, Integer.parseInt(port))
    dstream.window(Seconds(windowDuration), Seconds(slideDuration))
    //dstream.reduceByWindow(_ + _,Seconds(windowDuration),Seconds(slideDuration))
  }
}
Example 14
Source File: SocketTextStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream, SocketReceiver}
import org.apache.spark.streaming.{Seconds, StreamingContext}

class SocketTextStream extends ConfigurableStreamingStop {
  override val authorEmail: String = "[email protected]"
  override val description: String = "Receive text data from socket"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)
  override var batchDuration: Int = _

  var hostname: String = _
  var port: String = _
  //var schema: String = _

  override def setProperties(map: Map[String, Any]): Unit = {
    hostname = MapUtil.get(map, key = "hostname").asInstanceOf[String]
    port = MapUtil.get(map, key = "port").asInstanceOf[String]
    //schema = MapUtil.get(map, key = "schema").asInstanceOf[String]
    val timing = MapUtil.get(map, key = "batchDuration")
    batchDuration = if (timing == None) new Integer(1) else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname to connect to for receiving data").defaultValue("").required(true)
    val port = new PropertyDescriptor().name("port").displayName("port").description("Port to connect to for receiving data").defaultValue("").required(true)
    //val schema = new PropertyDescriptor().name("schema").displayName("schema").description("data schema").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    descriptor = hostname :: descriptor
    descriptor = port :: descriptor
    //descriptor = schema :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor
  }

  //TODO: change icon
  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/SocketTextStream.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val socketDF = spark
      .readStream
      .format("socket")
      .option("host", hostname)
      .option("port", port)
      .load()

    out.write(socketDF)
  }

  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val dstream = ssc.socketTextStream(hostname, Integer.parseInt(port))
    dstream.asInstanceOf[DStream[String]]
  }
}
Example 15
Source File: StreamingLogisticRegression.scala From AI with Apache License 2.0 | 5 votes |
// scalastyle:off println
package com.bigchange.mllib

import com.bigchange.util.{FileUtil, TimeUtil}
import org.apache.spark.SparkConf
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingLogisticRegression {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: StreamingLogisticRegression <trainingDir> <testDir> <batchDuration> <numFeatures>")
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingLogisticRegression")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingLogisticRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(args(3).toInt))

    model.trainOn(trainingData)
    // model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).map(x => x._1 + "\t" + x._2).foreachRDD(rdd => {
      val value = rdd.collect()
      FileUtil.normalFileWriter("F:\\datatest\\ai\\StreamingLogisticRegression\\" + TimeUtil.getCurrentHour, value)
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 16
Source File: MonitorHDFSDirFiles.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.basic

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object MonitorHDFSDirFiles {

  def main(args: Array[String]) {
    if (args.length < 1) {
      System.err.println("Usage: <directory>")
      System.exit(1)
    }

    val sparkConf = new SparkConf().setAppName("MonitorHDFSDirFiles")
    // Create the context
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create the FileInputDStream on the directory and use the
    // stream to count words in new files created
    val lines = ssc.textFileStream(args(0))
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _).foreachRDD(rdd => {
      val arr = rdd.collect()
      arr.foreach(println)
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 17
Source File: KafkaWordCount.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.basic

import java.util

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaWordCount {

  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println("Usage: <zkQuorum> <group> <topics> <numThreads>")
      System.exit(1)
    }

    val Array(zkQuorum, group, topics, numThreads) = args
    val sparkConf = new SparkConf().setAppName("KafkaWordCount").
      set("spark.streaming.receiver.writeAheadLog.enable", "true").
      set("spark.streaming.kafka.maxRatePerPartition", "1000")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Set a checkpoint directory: window operations generally require checkpointing.
    ssc.checkpoint("checkpoint")

    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap

    // createStream returns a Tuple2 of (key, value); only the value is of interest here.
    // Note that this is the receiver-based approach (a non-receiver mode is also available).
    // With default settings it can lose data if the receiver dies, so the write-ahead log is
    // enabled above, and the storage level can be adjusted accordingly.
    val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap, StorageLevel.MEMORY_AND_DISK_SER).map(_._2)
    val words = lines.flatMap(_.split(" "))

    // Count words over a 10-second window, sliding every 2 seconds.
    val wordCounts = words.map(x => (x, 1L))
      .reduceByKeyAndWindow(_ + _, _ - _, Seconds(10), Seconds(2), 2)
      .filter(x => x._2 > 0)

    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}

// Produces some random words between 1 and 100.
object KafkaWordCountProducer {

  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println("Usage: <metadataBrokerList> <topic> " +
        "<messagesPerSec> <wordsPerMessage>")
      System.exit(1)
    }

    // Note: this is the broker list, in host:port,host:port form.
    val Array(brokers, topic, messagesPerSec, wordsPerMessage) = args

    // Kafka producer connection properties
    val props = new util.HashMap[String, Object]()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)

    // Send some messages
    while (true) {
      (1 to messagesPerSec.toInt).foreach { messageNum =>
        val str = (1 to wordsPerMessage.toInt).map(x => scala.util.Random.nextInt(100).toString)
          .mkString(" ")
        val message = new ProducerRecord[String, String](topic, null, str)
        producer.send(message)
      }
      Thread.sleep(1000)
    }
  }
}
Example 18
Source File: StreamingSimpleModel.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.streaming

import breeze.linalg.DenseVector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingSimpleModel {

  def main(args: Array[String]) {
    val ssc = new StreamingContext("local", "test", Seconds(10))
    val stream = ssc.socketTextStream("localhost", 9999)

    val numberFeatures = 100
    val zeroVector = DenseVector.zeros[Double](numberFeatures)
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.dense(zeroVector.data))
      .setNumIterations(1)
      .setStepSize(0.01)

    val labeledStream = stream.map { event =>
      val split = event.split("\t")
      val y = split(0).toDouble
      val features = split(1).split(",").map(_.toDouble)
      LabeledPoint(label = y, features = Vectors.dense(features))
    }

    model.trainOn(labeledStream)

    // Use a DStream transformation to pair each prediction with its true label
    val predictAndTrue = labeledStream.transform { rdd =>
      val latestModel = model.latestModel()
      rdd.map { point =>
        val predict = latestModel.predict(point.features)
        predict - point.label
      }
    }

    // Compute MSE and RMSE for each batch
    predictAndTrue.foreachRDD { rdd =>
      val mse = rdd.map(x => x * x).mean()
      val rmse = math.sqrt(mse)
      println(s"current batch, MSE: $mse, RMSE:$rmse")
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 19
Source File: SparkStreamingFixture.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.streaming

import org.apache.spark.{Logging, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.junit.{After, Before}

trait SparkStreamingFixture extends Logging {

  protected var sc: SparkContext

  protected var ssc: StreamingContext = _

  protected val batchDuration = Seconds(1)

  @Before
  def startStreamingContext(): Unit = {
    ssc = new StreamingContext(sc, batchDuration)
    logInfo("Streaming context created")
  }

  @After
  def stopStreamingContext(): Unit = {
    Option(ssc).foreach(_.stop())
    logInfo("Streaming context stopped")
  }
}
Example 20
Source File: SparkStreaming_6_KafkaDirectStream.scala From HadoopLearning with MIT License | 5 votes |
package com.c503.streaming

import com.utils.{ConfManager, SparkConf}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}

    // process the received data
    dataStream.foreachRDD(rdd => {
      rdd.foreach(partition => {
        var msg = "topic=" + partition.topic() + "\n"
        msg += "partition=" + partition.partition() + "\n"
        msg += "offset=" + partition.offset() + "\n"
        msg += "timestamp=" + partition.timestamp() + "\n"
        msg += "checksum=" + partition.checksum() + "\n"
        msg += "key=" + partition.key() + "\n"
        msg += "value=" + partition.value() + "\n"
        println(msg)
      })
      // manually manage the Kafka offsets
      dataStream.asInstanceOf[CanCommitOffsets].commitAsync(rdd.asInstanceOf[HasOffsetRanges].offsetRanges)
    })

    context.start()
    context.awaitTermination()
  }
}
Example 21
Source File: SparkStreaming_1_1_local_TextFile.scala From HadoopLearning with MIT License | 5 votes |
package com.c503.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkStreaming_1_1_local_TextFile {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkStreaming_1_1_local_TextFile").setMaster("local[2]")
    val sc = new StreamingContext(sparkConf, Seconds(1))

    val lines = sc.textFileStream("/Users/liuxm/A_study/idea_ws/mapreduce/")
    println(lines)
    val words = lines.flatMap(_.split(" "))
    val pairs = words.map((_, 1))
    val wordCounts = pairs.reduceByKey(_ + _)

    wordCounts.foreachRDD(rdd => {
      println("*" * 30)
      rdd.sortBy(x => x._2, false).foreach(e => {
        println(e)
      })
    })

    sc.start()
    sc.awaitTermination()
  }
}
Example 22
Source File: Streaming.scala From scala-spark-cab-rides-predictions with MIT License | 5 votes |
import com.amazonaws.services.dynamodbv2.document.internal.InternalUtils
import com.amazonaws.services.dynamodbv2.streamsadapter.model.RecordAdapter
import com.amazonaws.services.kinesis.model.Record
import com.google.gson.Gson
import org.apache.spark.sql._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kinesis.dynamostream.KinesisInitialPositions.Latest
import org.apache.spark.streaming.kinesis.dynamostream.KinesisInputDStream
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}

object Trials extends App {

  import org.apache.log4j.{Level, Logger}
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)

  // session setup
  System.setProperty("hadoop.home.dir", "C:\\winutils")
  val sparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("test")
    .getOrCreate()
  val sc = sparkSession.sparkContext
  val ssc = new StreamingContext(sc, Seconds(10))
  val sqlContext = sparkSession.sqlContext

  // creates an array of strings from raw byte array
  def kinesisRecordHandler: Record => Array[String] =
    (record: Record) => new String(record.getData.array()).split(",")

  // converts records to map of key value pair and then json
  def recordHandler = (record: Record) => {
    val gson = new Gson
    val sRecord = record.asInstanceOf[RecordAdapter].getInternalObject
    val map = InternalUtils.toSimpleMapValue(sRecord.getDynamodb.getNewImage)
    gson.toJson(map)
  }

  case class CabPrice(cab_type: String, product_id: String, name: String, price: String,
                      distance: String, surge_multiplier: String, time_stamp: String,
                      source: String, destination: String, id: String)

  val stream_cab = KinesisInputDStream.builder
    .streamingContext(ssc)
    .streamName("cab_rides")
    .regionName("us-east-1")
    .initialPosition(new Latest())
    .checkpointAppName("cab_rides-app")
    .checkpointInterval(Milliseconds(1000))
    .storageLevel(StorageLevel.MEMORY_AND_DISK_2)
    .buildWithMessageHandler(recordHandler)

  val stream_weather = KinesisInputDStream.builder
    .streamingContext(ssc)
    .streamName("weather")
    .regionName("us-east-1")
    .initialPosition(new Latest())
    .checkpointAppName("cab_rides-app")
    .checkpointInterval(Milliseconds(1000))
    .storageLevel(StorageLevel.MEMORY_AND_DISK_2)
    .buildWithMessageHandler(recordHandler)

  // creating dataframe, can be stored as temp view
  val cabSchema = Encoders.product[CabPrice].schema

  stream_cab.foreachRDD(rdd => {
    import sqlContext.implicits._
    //val xx: Dataset[String] = rdd.toDS()
    val df: DataFrame = sqlContext.read.schema(cabSchema).json(rdd.toDS())
    df.show()
  })

  ssc.start()
  ssc.awaitTermination()
}
Example 23
Source File: SparkStreamingTaxiTripToHBase.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.streaming.ingestion.hbase

import java.io.File

import com.hadooparchitecturebook.taxi360.model.NyTaxiYellowTripBuilder
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.solr.common.cloud.ZooKeeperException
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkStreamingTaxiTripToHBase {
  def main(args: Array[String]): Unit = {
    println("Java Version:" + System.getProperty("java.version"))
    println("Java Home:" + System.getProperties().getProperty("java.home"))

    val v: ZooKeeperException = null

    if (args.length == 0) {
      println("Args: <KafkaBrokerList> " +
        "<kafkaTopicList> " +
        "<numberOfSeconds>" +
        "<runLocal>" +
        "<hbaseTable>" +
        "<numOfSalts>" +
        "<checkpointDir>" +
        "<hbaseConfigFolder>")
      return
    }

    val kafkaBrokerList = args(0)
    val kafkaTopicList = args(1)
    val numberOfSeconds = args(2).toInt
    val runLocal = args(3).equals("l")
    val tableName = args(4)
    val numOfSalts = args(5).toInt
    val checkpointFolder = args(6)
    val hbaseConfigFolder = args(7)

    println("kafkaBrokerList:" + kafkaBrokerList)
    println("kafkaTopicList:" + kafkaTopicList)
    println("numberOfSeconds:" + numberOfSeconds)
    println("runLocal:" + runLocal)
    println("tableName:" + tableName)
    println("numOfSalts:" + numOfSalts)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConf = new SparkConf().setAppName("Spark Streaming Ingestion to HBase")
      new SparkContext(sparkConf)
    }
    val ssc = new StreamingContext(sc, Seconds(numberOfSeconds))

    val topicsSet = kafkaTopicList.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList)

    val messageStream = KafkaUtils.
      createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)

    val conf = HBaseConfiguration.create()
    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val hbaseContext = new HBaseContext(sc, conf)

    val tripDStream = messageStream.map(r => {
      (r._1, r._2.split(","))
    }).filter(r => r._2.size > 3).map(r => {
      (r._1, NyTaxiYellowTripBuilder.build(r._2))
    })

    tripDStream.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName), taxi => {
      TaxiTripHBaseHelper.generatePut(taxi._2, numOfSalts)
    })

    ssc.checkpoint(checkpointFolder)
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 24
Source File: SKRSpec.scala From spark-kafka-writer with Apache License 2.0 | 5 votes |
package com.github.benfradet.spark.kafka.writer

import java.util.concurrent.atomic.AtomicInteger

import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.scalatest.concurrent.Eventually
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach}

import scala.collection.mutable.ArrayBuffer
import scala.util.Random

import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

case class Foo(a: Int, b: String)

trait SKRSpec
  extends AnyWordSpec
  with Matchers
  with BeforeAndAfterEach
  with BeforeAndAfterAll
  with Eventually {

  val sparkConf = new SparkConf()
    .setMaster("local[1]")
    .setAppName(getClass.getSimpleName)

  var ktu: KafkaTestUtils = _
  override def beforeAll(): Unit = {
    ktu = new KafkaTestUtils
    ktu.setup()
  }
  override def afterAll(): Unit = {
    SKRSpec.callbackTriggerCount.set(0)
    if (ktu != null) {
      ktu.tearDown()
      ktu = null
    }
  }

  var topic: String = _
  var ssc: StreamingContext = _
  var spark: SparkSession = _
  override def afterEach(): Unit = {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }
    if (spark != null) {
      spark.stop()
      spark = null
    }
  }
  override def beforeEach(): Unit = {
    ssc = new StreamingContext(sparkConf, Seconds(1))
    spark = SparkSession.builder
      .config(sparkConf)
      .getOrCreate()
    topic = s"topic-${Random.nextInt()}"
    ktu.createTopics(topic)
  }

  def collect(ssc: StreamingContext, topic: String): ArrayBuffer[String] = {
    val kafkaParams = Map(
      "bootstrap.servers" -> ktu.brokerAddress,
      "auto.offset.reset" -> "earliest",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "test-collect"
    )
    val results = new ArrayBuffer[String]
    KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Set(topic), kafkaParams)
    ).map(_.value())
      .foreachRDD { rdd =>
        results ++= rdd.collect()
        ()
      }
    results
  }

  val producerConfig = Map(
    "bootstrap.servers" -> "127.0.0.1:9092",
    "key.serializer" -> classOf[StringSerializer].getName,
    "value.serializer" -> classOf[StringSerializer].getName
  )
}

object SKRSpec {
  val callbackTriggerCount = new AtomicInteger()
}
Example 25
Source File: StreamingExample.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.spark.examples

import com.pygmalios.reactiveinflux._
import com.pygmalios.reactiveinflux.spark._
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.joda.time.DateTime

import scala.concurrent.duration._

object StreamingExample extends App {
  val conf = new SparkConf()
    .setMaster("local[*]")
    .setAppName("Example")
  val ssc = new StreamingContext(conf, Seconds(1))

  val point1 = Point(
    time = DateTime.now(),
    measurement = "measurement1",
    tags = Map(
      "tagKey1" -> "tagValue1",
      "tagKey2" -> "tagValue2"),
    fields = Map(
      "fieldKey1" -> "fieldValue1",
      "fieldKey2" -> 10.7)
  )

  // Provide settings for reactiveinflux
  implicit val params = ReactiveInfluxDbName("example")
  implicit val awaitAtMost = 1.second

  // Create DStream of Influx points
  val queue = new scala.collection.mutable.Queue[RDD[Point]]
  val queueStream: DStream[Point] = ssc.queueStream(queue)

  // Add single RDD with a single Influx point to the DStream
  queue.enqueue(ssc.sparkContext.parallelize(Seq(point1)))

  // Save DStream to Influx
  queueStream.saveToInflux()

  // Start Spark streaming
  ssc.start()
  ssc.awaitTermination()
}
Example 26
Source File: Predict.scala From spark-twitter-sentiment with Apache License 2.0 | 5 votes |
package com.dhruv

import org.apache.spark.SparkConf
import org.apache.spark.mllib.classification.NaiveBayesModel
import org.apache.spark.streaming.twitter._
import org.apache.spark.streaming.{Seconds, StreamingContext}

object Predict {
  def main(args: Array[String]) {
    if (args.length < 1) {
      System.err.println("Usage: " + this.getClass.getSimpleName + " <modelDirectory> ")
      System.exit(1)
    }

    val Array(modelFile) = Utils.parseCommandLineWithTwitterCredentials(args)

    println("Initializing Streaming Spark Context...")
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
    val ssc = new StreamingContext(conf, Seconds(5))

    println("Initializing Twitter stream...")
    val tweets = TwitterUtils.createStream(ssc, Utils.getAuth)
    val statuses = tweets.filter(_.getLang == "en").map(_.getText)

    println("Initializing the Naive Bayes model...")
    val model = NaiveBayesModel.load(ssc.sparkContext, modelFile.toString)

    val labeled_statuses = statuses
      .map(t => (t, model.predict(Utils.featurize(t))))

    labeled_statuses.print()

    // Start the streaming computation
    println("Initialization complete.")
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 27
Source File: MSNBCStreamingExample.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming

import org.apache.spark.mllib.fpm.PrefixSpan
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object MSNBCStreamingExample extends App {

  val conf = new SparkConf()
    .setAppName("MSNBC data initial streaming example")
    .setMaster("local[4]")
  val sc = new SparkContext(conf)
  val ssc = new StreamingContext(sc, batchDuration = Seconds(10))

  val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line =>
    line.split(" ").map(_.toInt)
  }
  val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache()
  val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15)
  val psModel = prefixSpan.run(trainSequences)
  val freqSequences = psModel.freqSequences.map(_.sequence).collect()

  val rawSequences: DStream[String] = ssc.socketTextStream("localhost", 9999)

  val sequences: DStream[Array[Array[Int]]] = rawSequences
    .map(line => line.split(" ").map(_.toInt))
    .map(_.map(Array(_)))

  print(">>> Analysing new batch of data")
  sequences.foreachRDD(
    rdd => rdd.foreach(
      array => {
        println(">>> Sequence: ")
        println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"))
        freqSequences.count(_.deep == array.deep) match {
          case count if count > 0 => println("is frequent!")
          case _ => println("is not frequent.")
        }
      }
    )
  )
  print(">>> done")

  ssc.start()
  ssc.awaitTermination()
}
Example 28
Source File: MSNBCStreamingAdvanced.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming

import org.apache.spark.mllib.fpm.PrefixSpan
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object MSNBCStreamingAdvanced extends App {

  val conf = new SparkConf()
    .setAppName("MSNBC data initial streaming example")
    .setMaster("local[4]")
  val sc = new SparkContext(conf)
  val ssc = new StreamingContext(sc, batchDuration = Seconds(10))

  val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line =>
    line.split(" ").map(_.toInt)
  }
  val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache()
  val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15)
  val psModel = prefixSpan.run(trainSequences)
  val freqSequences = psModel.freqSequences.map(_.sequence).collect()

  val rawEvents: DStream[String] = ssc.socketTextStream("localhost", 9999)

  val events: DStream[(Int, String)] = rawEvents.map(line => line.split(": "))
    .map(kv => (kv(0).toInt, kv(1)))

  val countIds = events.map(e => (e._1, 1))
  val counts: DStream[(Int, Int)] = countIds.reduceByKey(_ + _)

  def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
    Some(runningCount.getOrElse(0) + newValues.sum)
  }
  val runningCounts = countIds.updateStateByKey[Int](updateFunction _)

  val duration = Seconds(20)
  val slide = Seconds(10)

  val rawSequences: DStream[(Int, String)] = events
    .reduceByKeyAndWindow((v1: String, v2: String) => v1 + " " + v2, duration, slide)

  val sequences: DStream[Array[Array[Int]]] = rawSequences.map(_._2)
    .map(line => line.split(" ").map(_.toInt))
    .map(_.map(Array(_)))

  print(">>> Analysing new batch of data")
  sequences.foreachRDD(
    rdd => rdd.foreach(
      array => {
        println(">>> Sequence: ")
        println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"))
        freqSequences.count(_.deep == array.deep) match {
          case count if count > 0 => println("is frequent!")
          case _ => println("is not frequent.")
        }
      }
    )
  )
  print(">>> done")

  ssc.start()
  ssc.awaitTermination()
}
Example 29
Source File: StreamingDemo.scala From spark-streaming-demo with Apache License 2.0 | 5 votes |
package com.datastax.examples.meetup

import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

  def createSchema(): Unit = {
    CassandraConnector(conf).withSessionDo { session =>
      session.execute(s"DROP KEYSPACE IF EXISTS $CassandraKeyspace")
      session.execute(s"CREATE KEYSPACE IF NOT EXISTS $CassandraKeyspace WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }")
      session.execute(s"""
        CREATE TABLE IF NOT EXISTS $CassandraKeyspace.$CassandraTable (
          event text,
          interval text,
          dimension text,
          subtotal counter,
          PRIMARY KEY((event, interval), dimension)
        ) WITH CLUSTERING ORDER BY (dimension ASC)
      """)
    }
  }
}
Example 30
Source File: PersistStreamByInterval.scala From spark-streaming-demo with Apache License 2.0 | 5 votes |
package com.datastax.examples.meetup

import com.datastax.examples.meetup.model.MeetupRsvp
import com.datastax.examples.meetup.model.EventInterval
import com.datastax.examples.meetup.websocket._
import com.datastax.spark.connector._
import com.datastax.spark.connector.streaming._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, Minutes, StreamingContext}
import org.apache.spark.streaming.StreamingContext._

class PersistStreamByInterval extends Serializable {

  val tableColumns = SomeColumns("event", "interval", "dimension", "subtotal")

  def start(ssc: StreamingContext, websocket: String, keyspace: String, table: String) {
    val stream = ssc.receiverStream[MeetupRsvp](new WebSocketReceiver(websocket, StorageLevel.MEMORY_ONLY_SER))
    //stream.checkpoint(Seconds(60))
    //stream.repartition(2)

    // Filter Accepted RSVP
    val rsvpAccepted = stream.filter(_.response == "yes")

    // Number of attendees by Country
    val rsvpByCountry = rsvpAccepted
      .map(rsvp => (rsvp.group.group_country, rsvp.guests + 1))
      .reduceByKey(_ + _)
      .map { case (country, attendees) => ("attending", EventInterval.All, country, attendees) }

    rsvpByCountry.saveToCassandra(keyspace, table, tableColumns)

    // Trending Topics
    val trendingTopics = rsvpAccepted
      .flatMap(rsvp => rsvp.group.group_topics)
      .map(topic => (topic.topic_name, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Minutes(5), Seconds(10))
      .filter(t => t._2 > 5) // min threshold = 5
      .transform((rdd, time) => rdd.map { case (topic, count) => ("trending", EventInterval.Seconds(time), topic, count) })

    trendingTopics.saveToCassandra(keyspace, table, tableColumns)

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 31
Source File: TestAdditionInWindow.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.streaming

import org.apache.spark.streaming.{ StreamingContext, Seconds }
import org.apache.spark.SparkConf

object TestAdditionInWindow {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("TestAdditionJob"), Seconds(1))

    val msg = ssc.socketTextStream("localhost", 9999)

    msg
      .map(data => ("sum", data.toInt))
      .reduceByKey(_ + _)
      .window(Seconds(3), Seconds(2))
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 32
Source File: TestStreamingListener.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.streaming

import org.apache.spark.streaming.{ StreamingContext, Seconds }
import org.apache.spark.streaming.scheduler.{ StreamingListener, StreamingListenerBatchStarted, StreamingListenerBatchCompleted }
import org.apache.spark.SparkConf

object TestStreamingListener {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("TestStreamingListenerJob"), Seconds(5))

    ssc.addStreamingListener(new MyStreamingListener())

    ssc
      .socketTextStream("localhost", 9999)
      .flatMap(_.split(" "))
      .count()
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}

class MyStreamingListener extends StreamingListener {

  override def onBatchStarted(batchStarted: StreamingListenerBatchStarted): Unit = {
    println(">>> Batch started...records in batch = " + batchStarted.batchInfo.numRecords)
  }

  override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
    println(">>> Batch completed...time taken (ms) = " + batchCompleted.batchInfo.totalDelay)
  }
}
Example 33
Source File: TestMapWithState.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.streaming

import org.apache.spark.streaming.StreamingContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, State, StateSpec }

  def mappingFunc(key: String, value: Option[Int], state: State[Int]): Option[(String, Int)] = {
    val sum = value.getOrElse(0) + state.getOption().getOrElse(0)

    // updating the state of non-idle keys...
    // To call State.update(...) we need to check State.isTimingOut() == false,
    // else there will be NoSuchElementException("Cannot update the state that is timing out")
    if (state.isTimingOut())
      println(key + " key is timing out...will be removed.")
    else
      state.update(sum)

    Some((key, sum))
  }
}
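The excerpt above shows only the state-mapping function. For reference, here is a minimal sketch of how such a function is typically wired up with StateSpec and mapWithState; the socket source, the word-to-pair mapping, the checkpoint path, and the 30-second idle timeout are illustrative assumptions, not taken from the original file.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, State, StateSpec, StreamingContext}

object MapWithStateSketch {
  // Same shape as the mappingFunc in the excerpt: accumulate a running sum per key.
  def mappingFunc(key: String, value: Option[Int], state: State[Int]): Option[(String, Int)] = {
    val sum = value.getOrElse(0) + state.getOption().getOrElse(0)
    if (!state.isTimingOut()) state.update(sum)
    Some((key, sum))
  }

  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("TestMapWithState").setMaster("local[2]"), Seconds(5))
    ssc.checkpoint("/tmp/mapWithState-checkpoint") // mapWithState requires checkpointing

    // Hypothetical input: words from a socket, turned into (word, 1) pairs
    val pairs = ssc.socketTextStream("localhost", 9999).map(word => (word, 1))

    // Wire the mapping function in via StateSpec, with an idle-key timeout
    val stateStream = pairs.mapWithState(StateSpec.function(mappingFunc _).timeout(Seconds(30)))
    stateStream.print()

    ssc.start()
    ssc.awaitTermination()
  }
}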
Example 34
Source File: SparkStreamingRedisSuite.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.redislabs.provider.redis

import com.redislabs.provider.redis.env.Env
import com.redislabs.provider.redis.util.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.scalatest.{BeforeAndAfterEach, FunSuite}

trait SparkStreamingRedisSuite extends FunSuite with Env with BeforeAndAfterEach with Logging {

  override protected def beforeEach(): Unit = {
    super.beforeEach()
    spark = SparkSession.builder().config(conf).getOrCreate()
    sc = spark.sparkContext
    ssc = new StreamingContext(sc, Seconds(1))
  }

  override protected def afterEach(): Unit = {
    ssc.stop()
    spark.stop
    System.clearProperty("spark.driver.port")
    super.afterEach()
  }
}
Example 35
Source File: CustomReceiver.scala From Learning-Spark-SQL with MIT License | 5 votes |
import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      println("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      println("Connected to " + host + ":" + port)
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8))
      userInput = reader.readLine()
      while (!isStopped && userInput != null) {
        store(userInput)
        userInput = reader.readLine()
      }
      reader.close()
      socket.close()
      println("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
Example 36
Source File: TFLCustomReceiver.scala From Learning-Spark-SQL with MIT License | 5 votes |
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object TFLCustomReceiver {
  private val url = "https://api.tfl.gov.uk/Line/circle/Arrivals?stopPointId=940GZZLUERC&app_id=a73727f3&app_key=dc8150560a2422afae2b70cf291c4327"

  def main(args: Array[String]) {
    // Create the context with a 300 second batch interval
    val sparkConf = new SparkConf().setAppName("TFLCustomReceiver")
    val ssc = new StreamingContext(sparkConf, Seconds(300))

    val lines = ssc.receiverStream(new TFLCustomReceiver(url))
    lines.print()
    ssc.start()
    ssc.awaitTermination()
  }
}

class TFLCustomReceiver(url: String) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {

  def onStart() {
    // Start the thread that receives data over a connection
    new Thread("Http Receiver") {
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // There is nothing much to do as the thread calling receive()
    // is designed to stop by itself if isStopped() returns false
  }

  private def receive() {
    var userInput: String = null
    var httpClient: DefaultHttpClient = null
    var getRequest: HttpGet = null

    try {
      // Connect to host:port
      httpClient = new DefaultHttpClient();
      getRequest = new HttpGet(url);
      getRequest.addHeader("accept", "application/json");

      while (!isStopped) {
        val response = httpClient.execute(getRequest);
        if (response.getStatusLine().getStatusCode() != 200) {
          throw new RuntimeException("Failed : HTTP error code : " + response.getStatusLine().getStatusCode());
        }
        val reader = new BufferedReader(new InputStreamReader((response.getEntity().getContent())));
        userInput = reader.readLine()
        while (userInput != null) {
          store(userInput)
          //println(userInput)
          userInput = reader.readLine()
        }
        reader.close()
        Thread.sleep(60 * 1000)
      }
      httpClient.close()
      // Restart in an attempt to connect again when server is active again
      //restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        // restart if could not connect to server
        restart("Error connecting to " + url, e)
      case t: Throwable =>
        // restart if there is any other error
        restart("Error receiving data", t)
    }
  }
}
Example 37
Source File: TFLStreamingApp.scala From Learning-Spark-SQL with MIT License | 5 votes |
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object TFLStreamingApp {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("TFLStreaming")
    val ssc = new StreamingContext(conf, Seconds(300))
    val stream = ssc.receiverStream(new TFLArrivalPredictionsByLine())

    println("Before")
    stream.print()
    println("After")

    if (args.length > 2) {
      stream.saveAsTextFiles(args(2))
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 38
Source File: gihyo_6_2_1_Sample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_2_1_Sample {

  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    val wordCounts = run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val words = stream.flatMap(_.split(" "))
    val pairs = words.map(word => (word, 1))
    val wordCounts = pairs.reduceByKey(_ + _)
    wordCounts.print
  }
}
Example 39
Source File: gihyo_6_3_Join.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Join {

  def main(args: Array[String]) {
    if (args.length != 4) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost1 = args(0)
    val targetHostPort1 = args(1).toInt
    val targetHost2 = args(2)
    val targetHostPort2 = args(3).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines1 = ssc.socketTextStream(targetHost1, targetHostPort1)
    val lines2 = ssc.socketTextStream(targetHost2, targetHostPort2)
    run(lines1, lines2)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], joinStream: InputDStream[String]) {
    val lines1KV = stream.map(x => (x, "attribute1"))
    val lines2KV = joinStream.map(x => (x, Array("attribute2", "attribute3", "attribute4")))
    val linesKVW = lines1KV.join(lines2KV)
    linesKVW.print
  }
}
Example 40
Source File: gihyo_6_3_Reduce.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Reduce {

  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String]) {
    val windowLineCount = stream.reduce((x, y) => x + "," + y)
    windowLineCount.print
  }
}
Example 41
Source File: gihyo_6_3_reduceByWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_reduceByWindow {

  def main(args: Array[String]) {
    if (args.length != 2) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.reduceByWindow((x, y) => x + y,
      Seconds(windowLength), Seconds(slideInterval))
    userList.print
  }
}
Example 42
Source File: gihyo_6_3_KafkaStream.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 // scalastyle:off println import kafka.serializer.StringDecoder import org.apache.spark.{SparkContext, SparkConf} import org.apache.spark.streaming.kafka.KafkaUtils import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_KafkaStream { def main(args: Array[String]) { if (args.length != 4) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val brokerList = args(0) val consumeTopic = args(1) val checkpointDir = args(2) val saveDir = args(3) val f = createStreamingContext(brokerList, consumeTopic, checkpointDir, saveDir) // StreamingContextの取得 val ssc = StreamingContext.getOrCreate(checkpointDir, f) sys.ShutdownHookThread { System.out.println("Gracefully stopping SparkStreaming Application") ssc.stop(true, true) System.out.println("SparkStreaming Application stopped") } ssc.start ssc.awaitTermination } def createStreamingContext(brokerList: String, consumeTopic: String, checkpointDir: String, saveDir: String): () => StreamingContext = { () => { System.out.println(values) Some(running.getOrElse(0) + values.length) } def run(stream: InputDStream[(String, String)], saveDir: String, windowLength: Int = 30, slideInterval: Int = 5) { val baseStream = stream.transform(rdd => { val t = (Long.MaxValue - System.currentTimeMillis) rdd.map(x => (x._1, x._2 + ", " + t)) }).map(x => { val splitVal = x._2.split(",") val userVal = splitVal(0).split(":") val actionVal = splitVal(1).split(":") val pageVal = splitVal(2).split(":") val timestamp = splitVal(3) (actionVal(1), userVal(1), pageVal(1), timestamp) }) baseStream.persist() val accountStream = baseStream.filter(_._1 == "view") .map(x => x._2) .countByValue() val totalUniqueUser = accountStream .updateStateByKey[Int](updateStateByKeyFunction _) .count() .map(x => "totalUniqueUser:" + x) val baseStreamPerTirty = baseStream .window(Seconds(windowLength), Seconds(slideInterval)) .filter(_._1 == "view") baseStreamPerTirty.persist() val pageViewPerTirty = baseStreamPerTirty .count() .map(x => "PageView:" + x) val uniqueUserPerTirty = baseStreamPerTirty .map(x => x._2) .countByValue() .count() .map(x => "UniqueUser:" + x) val pageViewStream = baseStream .filter(_._1 == "view") .map(x => x._3) .count() .map(x => "PageView:" + x) val outputStream = totalUniqueUser .union(pageViewPerTirty) .union(uniqueUserPerTirty) .union(pageViewStream) .reduce((x, y) => x + ", " + y) .saveAsTextFiles(saveDir) } } // scalastyle:on println
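This listing appears to be incomplete: the body shown for createStreamingContext is actually a fragment of a state-update function, and updateStateByKeyFunction, which run refers to, is missing. A reconstruction sketch following the pattern of the other gihyo examples; the Kafka connection details below (direct stream, metadata.broker.list) are assumptions, not taken from the original source:

// Reconstruction sketch, using the imports already present in the listing.
def createStreamingContext(brokerList: String, consumeTopic: String,
    checkpointDir: String, saveDir: String): () => StreamingContext = { () =>
  val conf = new SparkConf().setAppName("gihyoSample_Application")
  val sc = new SparkContext(conf)
  val ssc = new StreamingContext(sc, Seconds(5))
  ssc.checkpoint(checkpointDir)
  // Assumed direct stream; the original may create the stream differently.
  val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
    ssc, Map("metadata.broker.list" -> brokerList), Set(consumeTopic))
  run(kafkaStream, saveDir)
  ssc
}

// The stray fragment above most likely belongs to this function:
def updateStateByKeyFunction(values: Seq[Long], running: Option[Int]): Option[Int] = {
  System.out.println(values)
  Some(running.getOrElse(0) + values.length)
}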
Example 43
Source File: gihyo_6_3_TwitterStream.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 // scalastyle:off println import org.atilika.kuromoji.Token import twitter4j.Status import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.dstream.InputDStream import org.apache.spark.streaming.twitter.TwitterUtils import org.apache.spark.streaming.{Seconds, StreamingContext} object gihyo_6_3_TwitterStream { def main(args: Array[String]) { if (args.length != 7) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val Array(cKey, cSecret, aToken, aSecret, cDir, tagDir, wordDir) = args System.setProperty("twitter4j.oauth.consumerKey", cKey) System.setProperty("twitter4j.oauth.consumerSecret", cSecret) System.setProperty("twitter4j.oauth.accessToken", aToken) System.setProperty("twitter4j.oauth.accessTokenSecret", aSecret) val f = createStreamingContext(cDir, tagDir, wordDir) val ssc = StreamingContext.getOrCreate(cDir, f) sys.ShutdownHookThread { System.out.println("Gracefully stopping SparkStreaming Application") ssc.stop(true, true) System.out.println("SparkStreaming Application stopped") } ssc.start ssc.awaitTermination } def createStreamingContext(checkpointDir: String, tagDir: String, wordDir: String): () => StreamingContext = { () => { val conf = new SparkConf().setAppName("gihyoSample_Application") .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.registerKryoClasses(Array(classOf[UserDic])) val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) ssc.checkpoint(checkpointDir) val twitterStream = TwitterUtils.createStream(ssc, None) run(sc, twitterStream, tagDir, wordDir) ssc } } def run(sc: SparkContext, stream: InputDStream[Status], tagDir: String, wordDir: String) { val tokenizer = sc.broadcast(UserDic.getInstance) val tweets = stream.map(tweet => tweet.getText()) tweets.persist() val TweetText = tweets .flatMap(text => { val tokens = tokenizer.value.tokenize(text).toArray tokens.filter(t => { val token = t.asInstanceOf[Token] ((token.getPartOfSpeech.indexOf("名詞") > -1 && token.getPartOfSpeech.indexOf("一般") > -1) || token.getPartOfSpeech.indexOf("カスタム名詞") > -1) && token.getSurfaceForm.length > 1 && !(token.getSurfaceForm matches "^[a-zA-Z]+$|^[0-9]+$") }).map(t => t.asInstanceOf[Token].getSurfaceForm) }) .countByValue() .map(x => (x._2, x._1)) .transform(_.sortByKey(false)) .map(x => (x._2, x._1)) val TweetTags = tweets .flatMap(tweet => tweet.split(" ").filter(_.startsWith("#"))) .countByValue() .map(x => (x._2, x._1)) .transform(_.sortByKey(false)) .map(x => (x._2, x._1)) TweetText.saveAsTextFiles(wordDir) TweetTags.saveAsTextFiles(tagDir) } } // scalastyle:on println
Example 44
Source File: gihyo_6_3_Union.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.dstream.InputDStream import org.apache.spark.streaming.kafka.KafkaUtils object gihyo_6_3_Union { def main(args: Array[String]) { if (args.length != 3) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHosts = args(0) val consumerGroup = args(1) val targetTopics = args(2) val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val KafkaStreams = (1 to 5).map { i => KafkaUtils.createStream(ssc, targetHosts, consumerGroup, Map(targetTopics -> 1)) } run(ssc, KafkaStreams) ssc.start ssc.awaitTermination } def run(ssc: StreamingContext, streams: IndexedSeq[InputDStream[(String, String)]]) { val unionedStream = ssc.union(streams) unionedStream.print } }
Example 45
Source File: gihyo_6_3_flatMap.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_flatMap { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val words = stream.flatMap(line => line.split(" ")) words.print } }
Example 46
Source File: gihyo_6_3_Repartition.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Repartition { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val repartitionData = stream.repartition(3) // scalastyle:off println repartitionData.foreachRDD(rdd => println(s"partition size: ${rdd.partitions.size.toString}")) // scalastyle:on println repartitionData.print } }
Example 47
Source File: gihyo_6_3_Count.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Count { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val lineCount = stream.window(Seconds(windowLength), Seconds(slideInterval)).count lineCount.print } }
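window(...).count recomputes the count of the full window on every slide. countByWindow produces the same numbers incrementally (add the entering slide, subtract the leaving one), at the cost of requiring a checkpoint directory:

// Incremental alternative (requires ssc.checkpoint(dir) before ssc.start)
val lineCountIncremental = stream.countByWindow(Seconds(windowLength), Seconds(slideInterval))
lineCountIncremental.print()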
Example 48
Source File: gihyo_6_3_Map.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Map { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val lineCount = stream.map(line => (line, 1)) lineCount.print } }
Example 49
Source File: gihyo_6_3_Cogroup.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Cogroup { def main(args: Array[String]) { if (args.length != 4) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost1 = args(0) val targetHostPort1 = args(1).toInt val targetHost2 = args(2) val targetHostPort2 = args(3).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines1 = ssc.socketTextStream(targetHost1, targetHostPort1) val lines2 = ssc.socketTextStream(targetHost2, targetHostPort2) run(lines1, lines2) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], otherStream: InputDStream[String]) { val lines1KV = stream.map(x => (x, "attribute1")) val lines2KV = otherStream.map(x => (x, "attribute2")) val linesKVW = lines1KV.cogroup(lines2KV) linesKVW.print } }
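Unlike join, cogroup keeps keys that appear in only one of the streams and groups the values into two Iterables, giving (key, (Iterable[V], Iterable[W])) pairs. A small sketch of unpacking that result:

// Sketch: show which side(s) each key appeared on in the current batch
linesKVW.map { case (key, (left, right)) =>
  s"$key -> left=${left.mkString(",")} right=${right.mkString(",")}"
}.print()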
Example 50
Source File: gihyo_6_3_reduceByKey.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_reduceByKey { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val countKeyValue = stream.map(x => (x, 1)).reduceByKey((x, y) => x + y) countKeyValue.print } }
Example 51
Source File: gihyo_6_3_reduceByKeyAndWindow_efficient.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_reduceByKeyAndWindow_efficient { def main(args: Array[String]) { if (args.length != 3) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val checkpointDir = args(2) val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) ssc.checkpoint(checkpointDir) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val userList = stream.map(x => (x, 1)) .reduceByKeyAndWindow( (a: Int, b: Int) => a + b, (a: Int, b: Int) => a - b, Seconds(windowLength), Seconds(slideInterval)) userList.print } }
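With the inverse ("subtract") function, keys whose windowed count drops to zero stay in the state and keep being emitted with a count of 0. The overload that also takes a filter function prunes them; a sketch, passing the parallelism default explicitly because the filter argument comes last:

// Sketch: drop keys whose windowed count has fallen to zero
val userListFiltered = stream.map(x => (x, 1))
  .reduceByKeyAndWindow(
    (a: Int, b: Int) => a + b,
    (a: Int, b: Int) => a - b,
    Seconds(windowLength),
    Seconds(slideInterval),
    stream.context.sparkContext.defaultParallelism,
    (kv: (String, Int)) => kv._2 > 0)
userListFiltered.print()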
Example 52
Source File: gihyo_6_3_Transform.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Transform { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment"))) run(lines, blackList) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], blackList: RDD[(String, String)]) { val userList = stream.map(x => (x, "action:Login")).transform(rdd => { val tmpUserList = rdd.leftOuterJoin(blackList) tmpUserList.filter(user => (user._2._2 == None)) }) userList.print } }
Example 53
Source File: gihyo_6_3_reduceByKeyAndWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_reduceByKeyAndWindow { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val userList = stream.map(x => (x, 1)) .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(windowLength), Seconds(slideInterval)) userList.print } }
Example 54
Source File: gihyo_6_3_countByValueAndWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 // scalastyle:off println import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_countByValueAndWindow { def main(args: Array[String]) { if (args.length != 3) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val checkpointDir = args(2) val f = createStreamingContext(targetHost, targetHostPort, checkpointDir) val ssc = StreamingContext.getOrCreate(checkpointDir, f) sys.ShutdownHookThread { System.out.println("Gracefully stopping SparkStreaming Application") ssc.stop(true, true) System.out.println("SparkStreaming Application stopped") } ssc.start ssc.awaitTermination } def createStreamingContext( targetHost: String, targetHostPort: Int, checkpointDir: String): () => StreamingContext = { () => { val conf = new SparkConf().setAppName("gihyoSample_Application") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) ssc.checkpoint(checkpointDir) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc } } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val userList = stream.countByValueAndWindow(Seconds(windowLength), Seconds(slideInterval)) userList.print } } // scalastyle:on println
Example 55
Source File: gihyo_6_3_updateStateByKey.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_updateStateByKey { def main(args: Array[String]) { if (args.length != 3) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val checkpointDir = args(2) val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) ssc.checkpoint(checkpointDir) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val userList = stream.map(x => (x, 1)).updateStateByKey[Int](updateStateByKeyFunction _) userList.print } def updateStateByKeyFunction(values: Seq[Int], running: Option[Int]): Option[Int] = { Some(running.getOrElse(0) + values.size) } }
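updateStateByKey touches every key held in the state on every batch. From Spark 1.6 on, mapWithState only processes keys seen in the current batch (plus timed-out ones) and supports per-key timeouts; a sketch assuming Spark 1.6 or later:

// Sketch (Spark 1.6+): incremental running count with mapWithState
import org.apache.spark.streaming.{State, StateSpec}

def countWords(word: String, one: Option[Int], state: State[Int]): (String, Int) = {
  val sum = one.getOrElse(0) + state.getOption.getOrElse(0)
  state.update(sum)
  (word, sum)
}

val userListIncremental = stream.map(x => (x, 1))
  .mapWithState(StateSpec.function(countWords _))
userListIncremental.print()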
Example 56
Source File: gihyo_6_3_Filter.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Filter { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val overData = stream.filter(line => line.length > 5) overData.print } }
Example 57
Source File: gihyo_6_3_countByWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_countByWindow { def main(args: Array[String]) { if (args.length != 3) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val checkpointDir = args(2) val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) ssc.checkpoint(checkpointDir) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val userList = stream.countByWindow(Seconds(windowLength), Seconds(slideInterval)) userList.print } }
Example 58
Source File: gihyo_6_3_Window.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Window { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val userList = stream.window(Seconds(windowLength), Seconds(slideInterval)).countByValue() userList.print } }
Example 59
Source File: gihyo_6_3_countByValue.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_countByValue { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val countValue = stream.countByValue() countValue.print } }
Example 60
Source File: TestStreamingContext.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark import org.scalatest.{BeforeAndAfterEach, Suite} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import jp.gihyo.spark.ch06.UserDic private[spark] trait TestStreamingContext extends BeforeAndAfterEach { self: Suite => @transient var ssc: StreamingContext = _ @transient var sc: SparkContext = _ val master = "local[2]" val appN = "StreamingUnitTest" val bd = Seconds(1) override def beforeEach() { super.beforeEach() val conf = new SparkConf().setMaster(master) .setAppName(appN) .set("spark.streaming.clock", "org.apache.spark.util.ManualClock") .registerKryoClasses(Array(classOf[UserDic])) ssc = new StreamingContext(conf, bd) sc = ssc.sparkContext } override def afterEach() { try { if (ssc != null) { // stop with sc ssc.stop(true) } ssc = null; } finally { super.afterEach() } } }
Example 61
Source File: AvroTransformerSpec.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package test import com.memsql.spark.connector.MemSQLContext import com.memsql.spark.etl.api.{UserTransformConfig, UserExtractConfig} import com.memsql.spark.examples.avro.{AvroTransformer, AvroRandomExtractor} import org.apache.spark.streaming.{StreamingContext, Seconds} import test.util.{Fixtures, UnitSpec, LocalSparkContext} import spray.json._ class AvroTransformerSpec extends UnitSpec with LocalSparkContext { var ssc: StreamingContext = _ var msc: MemSQLContext = _ override def beforeEach(): Unit = { super.beforeEach() ssc = new StreamingContext(sc, Seconds(1)) msc = new MemSQLContext(sc) } val avroConfig = Fixtures.avroConfig.parseJson val extractConfig = UserExtractConfig(class_name = "Test", value = avroConfig) val transformConfig = UserTransformConfig(class_name = "Test", value = avroConfig) "AvroRandomTransformer" should "emit a dataframe of properly deserialized data" in { val extractor = new AvroRandomExtractor val transformer = new AvroTransformer extractor.initialize(null, null, extractConfig, 0, null) transformer.initialize(null, transformConfig, null) val maybeDf = extractor.next(null, 0, msc, null, 0, null) assert(maybeDf.isDefined) val extractedDf = maybeDf.get val transformedDf = transformer.transform(msc, extractedDf, null, null) val rows = transformedDf.collect() for (row <- rows) { assert(row(0).isInstanceOf[Boolean]) assert(row(1).isInstanceOf[Double]) assert(row(2).isInstanceOf[Float]) assert(row(3).isInstanceOf[Int]) assert(row(4).isInstanceOf[Long]) assert(row(5) === null) assert(row(6).isInstanceOf[String]) assert(row(7).isInstanceOf[String]) } } }
Example 62
Source File: FlumeWordCount.scala From Mastering-Scala-Machine-Learning with MIT License | 5 votes |
package org.akozlov.chapter03 import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.flume._ object FlumeWordCount { def main(args: Array[String]) { // Create the context with a 2 second batch size val sparkConf = new SparkConf().setMaster("local[2]").setAppName("FlumeWordCount") val ssc = new StreamingContext(sparkConf, Seconds(2)) ssc.checkpoint("/tmp/flume_check") val hostPort=args(0).split(":") System.out.println("Opening a sink at host: [" + hostPort(0) + "] port: [" + hostPort(1).toInt + "]") val lines = FlumeUtils.createPollingStream(ssc, hostPort(0), hostPort(1).toInt, StorageLevel.MEMORY_ONLY) val words = lines .map(e => new String(e.event.getBody.array)).map(_.toLowerCase).flatMap(_.split("\\W+")) .map(word => (word, 1L)) .reduceByKeyAndWindow(_+_, _-_, Seconds(6), Seconds(2)).print ssc.start() ssc.awaitTermination() } }
Example 63
Source File: KafkaWordCount.scala From Mastering-Scala-Machine-Learning with MIT License | 5 votes |
package org.akozlov.chapter03 import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.kafka._ object KafkaWordCount { def main(args: Array[String]) { // Create the context with a 2 second batch size val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KafkaWordCount") val ssc = new StreamingContext(sparkConf, Seconds(2)) ssc.checkpoint("/tmp/kafka_check") System.out.println("Opening a Kafka consumer at zk: [" + args(0) + "] for group group-1 and topic example") val lines = KafkaUtils.createStream(ssc, args(0), "group-1", Map("example" -> 1), StorageLevel.MEMORY_ONLY) val words = lines .flatMap(_._2.toLowerCase.split("\\W+")) .map(word => (word, 1L)) .reduceByKeyAndWindow(_+_, _-_, Seconds(6), Seconds(2)).print ssc.start() ssc.awaitTermination() } }
Example 64
Source File: L10-9Graph.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.Graph.graphToGraphOps import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object UserRankApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: UserRankApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .foreachRDD(rdd => { val edges = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]]) }) .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0))) val vertices = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String]) }) .map(r => (r.hashCode.toLong, r)) val tolerance = 0.0001 val graph = Graph(vertices, edges, "defaultUser") .subgraph(vpred = (id, idStr) => idStr != "defaultUser") val pr = graph.pageRank(tolerance).cache graph.outerJoinVertices(pr.vertices) { (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs) }.vertices.top(10) { Ordering.by(_._2._1) }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1))) }) ssc.start() ssc.awaitTermination() } }
Example 65
Source File: L10-2DataProc.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.HashPartitioner import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.JsonAST.JNothing import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object DataProcApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: DataProcApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .filter(jvalue => { jvalue \ "attributes" \ "Wi-Fi" != JNothing }) .map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "attributes" \ "Wi-Fi").extract[String], (jvalue \ "stars").extract[Int]) }) .combineByKey( (v) => (v, 1), (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), new HashPartitioner(ssc.sparkContext.defaultParallelism)) .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) .print() ssc.start() ssc.awaitTermination() } }
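The combineByKey call accumulates a (sum, count) pair of star ratings per Wi-Fi attribute value, and the final map turns it into an average. The same aggregation written with mapValues/reduceByKey may be easier to read; pairStream below is a hypothetical name standing for the (Wi-Fi, stars) stream built by the preceding map:

// Sketch: average stars per Wi-Fi attribute with mapValues/reduceByKey
// pairStream: DStream[(String, Int)] of (Wi-Fi attribute, stars), hypothetical name
pairStream.mapValues(stars => (stars, 1))
  .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
  .mapValues { case (sum, cnt) => sum / cnt.toFloat }
  .print()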
Example 66
Source File: L5-7MultipleSocketStreams.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Seconds, StreamingContext } import org.apache.spark.streaming.dstream.PairDStreamFunctions import java.util.Calendar object TripByYearMultiApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: TripByYearMultiApp <appname> <hostname> <base_port> <num_of_sockets>") System.exit(1) } val Seq(appName, hostname, basePort, nSockets) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) val streams = (0 to nSockets.toInt - 1).map(i => ssc.socketTextStream(hostname, basePort.toInt + i)) val uniStream = ssc.union(streams) uniStream .map(rec => rec.split(",")) .map(rec => (rec(13), rec(0).toInt)) .reduceByKey(_ + _) .map(pair => (pair._2, normalizeYear(pair._1))) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles("TripByYear") ssc.start() ssc.awaitTermination() } def normalizeYear(s: String): String = { try { (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString } catch { case e: Exception => s } } }
Example 67
Source File: L5-9Mqtt.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.mqtt.MQTTUtils object YearlyDistributionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: YearlyDistributionApp <appname> <brokerUrl> <topic> <checkpointDir>") System.exit(1) } val Seq(appName, brokerUrl, topic, checkpointDir) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => rec.split(",")) .map(rec => (rec(1).split(" ")(0), 1)) .updateStateByKey(statefulCount) .map(pair => (pair._2, pair._1)) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles("YearlyDistribution") ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 68
Source File: L5-11FlumePull.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.flume.FlumeUtils object DailyUserTypeDistributionApp2 { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: DailyUserTypeDistributionApp <appname> <hostname> <port> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) FlumeUtils.createPollingStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => new String(rec.event.getBody().array()).split(",")) .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) .updateStateByKey(statefulCount) .repartition(1) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 69
Source File: L5-6SocketStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Seconds, StreamingContext } import org.apache.spark.streaming.dstream.PairDStreamFunctions import java.util.Calendar object TripByYearApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: TripByYearApp <appname> <hostname> <port>") System.exit(1) } val Seq(appName, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split(",")) .map(rec => (rec(13), rec(0).toInt)) .reduceByKey(_ + _) .map(pair => (pair._2, normalizeYear(pair._1))) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles("TripByYear") ssc.start() ssc.awaitTermination() } def normalizeYear(s: String): String = { try { (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString } catch { case e: Exception => s } } }
Example 70
Source File: L5-16Twitter.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.twitter.TwitterUtils import org.apache.spark.storage.StorageLevel import twitter4j.conf.ConfigurationBuilder import twitter4j.TwitterFactory object TwitterApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: TwitterApp <appname> <outputPath>") System.exit(1) } val Seq(appName, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) val cb = new ConfigurationBuilder() cb.setOAuthConsumerKey("") cb.setOAuthConsumerSecret("") cb.setOAuthAccessToken("") cb.setOAuthAccessTokenSecret("") val twitterAuth = new TwitterFactory(cb.build()).getInstance().getAuthorization() val tweetStream = TwitterUtils.createStream(ssc, Some(twitterAuth), Array("nyc citi bike", "nyc bike share")) tweetStream.count().print() tweetStream.saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 71
Source File: L5-11FlumePush.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.flume.FlumeUtils object DailyUserTypeDistributionApp { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: DailyUserTypeDistributionApp <appname> <hostname> <port> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) FlumeUtils.createStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => new String(rec.event.getBody().array()).split(",")) .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) .updateStateByKey(statefulCount) .repartition(1) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 72
Source File: L5-13Kafka.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.kafka.KafkaUtils object StationJourneyCountApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.streaming.receiver.writeAheadLog.enable", "true") val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val topics = Map[String, Int]( topic -> 1) KafkaUtils.createStream(ssc, zkQuorum, consumerGroupId, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) .map(rec => rec.split(",")) .map(rec => ((rec(3), rec(7)), 1)) .reduceByKey(_ + _) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 73
Source File: L5-18Http.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object HttpApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: HttpApp <appname> <outputPath>") System.exit(1) } val Seq(appName, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://www.citibikenyc.com/stations/json", interval = batchInterval) .flatMap(rec => (parse(rec) \ "stationBeanList").children) .filter(rec => { implicit val formats = DefaultFormats (rec \ "statusKey").extract[Integer] != 1 }) .map(rec => rec.filterField { case JField("id", _) => true case JField("stationName", _) => true case JField("statusValue", _) => true case _ => false }) .map(rec => { implicit val formats = DefaultFormats (rec(0)._2.extract[Integer], rec(1)._2.extract[String], rec(2)._2.extract[String]) }) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
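HttpUtils.createStream is not a Spark API; it is the book's own helper, defined elsewhere in the prosparkstreaming sources, that polls the given URL once per interval. For orientation, the general shape of such a polling receiver; PollingHttpReceiver is a hypothetical name, not the book's implementation:

// Sketch of a polling HTTP receiver (hypothetical, not the book's HttpUtils)
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

class PollingHttpReceiver(url: String, intervalSecs: Int)
  extends Receiver[String](StorageLevel.MEMORY_ONLY) {

  def onStart(): Unit = {
    new Thread("polling-http-receiver") {
      override def run(): Unit = {
        while (!isStopped()) {
          store(scala.io.Source.fromURL(url).mkString) // one record per poll
          Thread.sleep(intervalSecs * 1000L)
        }
      }
    }.start()
  }

  def onStop(): Unit = {} // the polling loop exits once isStopped() turns true
}

// val stream = ssc.receiverStream(new PollingHttpReceiver(url, batchInterval))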
Example 74
Source File: L5-14KafkaCustomConf.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.kafka.KafkaUtils import kafka.serializer.StringDecoder import org.apache.spark.storage.StorageLevel object StationJourneyCountCustomApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.streaming.receiver.writeAheadLog.enable", "true") val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val topics = Map[String, Int]( topic -> 1) val params = Map[String, String]( "zookeeper.connect" -> zkQuorum, "group.id" -> consumerGroupId, "bootstrap.servers" -> brokerUrl) KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) .map(rec => rec.split(",")) .map(rec => ((rec(3), rec(7)), 1)) .reduceByKey(_ + _) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 75
Source File: L7-2-3Tachyon.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object ReferrerApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: ReferrerApp <appname> <hostname> <port> <tachyonUrl> <checkpointDir> <outputPathTop> <outputPathSpark>") System.exit(1) } val Seq(appName, hostname, port, tachyonUrl, checkpointDir, outputPathTop, outputPathSpark) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.externalBlockStore.url", tachyonUrl) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val clickstream = ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split("\\t")) .persist(StorageLevel.OFF_HEAP) val topRefStream = clickstream .map(rec => { var prev_title = rec(3) if (!prev_title.startsWith("other")) { prev_title = "wikipedia" } (prev_title, 1) }) val topSparkStream = clickstream .filter(rec => rec(4).equals("Apache_Spark")) .map(rec => (rec(3), 1)) saveTopKeys(topRefStream, outputPathTop) saveTopKeys(topSparkStream, outputPathSpark) ssc.start() ssc.awaitTermination() } def saveTopKeys(clickstream: DStream[(String, Int)], outputPath: String) { clickstream.updateStateByKey((values, state: Option[Int]) => Some(values.sum + state.getOrElse(0))) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) } }
Example 76
Source File: L7-4UI.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.concurrent.atomic.AtomicLong import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object SocialSearchApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: SocialSearchApp <appname> <hostname> <port>") System.exit(1) } val Seq(appName, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.eventLog.enabled", "true") //.set("spark.eventLog.dir", "/tmp/historical") val countSearch = new AtomicLong(0) val countSocial = new AtomicLong(0) val ssc = new StreamingContext(conf, Seconds(1)) val titleStream = ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split("\\t")) .filter(_(3) match { case "other-google" | "other-bing" | "other-yahoo" | "other-facebook" | "other-twitter" => true case _ => false }) .map(rec => (rec(3), rec(4))) .cache() val searchStream = titleStream.filter(_._1 match { case "other-google" | "other-bing" | "other-yahoo" => true case _ => false }) .map(rec => rec._2) val socialStream = titleStream.filter(_._1 match { case "other-facebook" | "other-twitter" => true case _ => false }) .map(rec => rec._2) val exclusiveSearch = searchStream.transformWith(socialStream, (searchRDD: RDD[String], socialRDD: RDD[String]) => searchRDD.subtract(socialRDD)) .foreachRDD(rdd => { countSearch.addAndGet(rdd.count()) println("Exclusive count search engines: " + countSearch) }) val exclusiveSocial = socialStream.transformWith(searchStream, (socialRDD: RDD[String], searchRDD: RDD[String]) => socialRDD.subtract(searchRDD)) .foreachRDD(rdd => { countSocial.addAndGet(rdd.count()) println("Exclusive count social media: " + countSocial) }) ssc.start() ssc.awaitTermination() } }
Example 77
Source File: L4-1Voyager.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.fs.Path import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object VoyagerApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: VoyagerApp <appname> <inputPath> <outputPath>") System.exit(1) } val Seq(appName, inputPath, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.executor.extraJavaOptions", "-XX:+UseConcMarkSweepGC") val ssc = new StreamingContext(conf, Seconds(10)) val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) voyager1.map(rec => { val attrs = rec.split("\\s+") ((attrs(0).toInt), attrs.slice(18, 28).map(_.toDouble)) }).filter(pflux => pflux._2.exists(_ > 1.0)).map(rec => (rec._1, 1)) .reduceByKey(_ + _) .transform(rec => rec.sortByKey(ascending = false, numPartitions = 1)).saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
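With newFilesOnly = false the fileStream also picks up files already sitting in inputPath when the job starts. When only newly arriving files matter, textFileStream is the usual shorthand for the same LongWritable/Text/TextInputFormat combination:

// Equivalent for new files only
val voyagerNewOnly = ssc.textFileStream(inputPath)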
Example 78
Source File: L4-4Kryo.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.fs.Path import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object VoyagerAppKryo { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: VoyagerAppKryo <appname> <inputPath> <outputPath>") System.exit(1) } val Seq(appName, inputPath, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .registerKryoClasses(Array(classOf[ProtonFlux])) val ssc = new StreamingContext(conf, Seconds(10)) val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val projected = voyager1.map(rec => { val attrs = rec.split("\\s+") new ProtonFlux(attrs(0), attrs(18), attrs(19), attrs(20), attrs(21), attrs(22), attrs(23), attrs(24), attrs(25), attrs(26), attrs(27), attrs(28)) }) val filtered = projected.filter(pflux => pflux.isSolarStorm) val yearlyBreakdown = filtered.map(rec => (rec.year, 1)) .reduceByKey(_ + _) .transform(rec => rec.sortByKey(ascending = false)) yearlyBreakdown.saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 79
Source File: L8-1DataFrameAPI.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 80
Source File: L8-3-6-7DataFrameCreation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.native.Serialization.write import org.json4s.DefaultFormats object DataframeCreationApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { //val cdrs = sqlC.createDataFrame(seqToCdr(rdd)) //val cdrs = sqlC.createDataFrame(seqToCdr(rdd).collect()) //val cdrs = seqToCdr(rdd).toDF() val cdrsJson = seqToCdr(rdd).map(r => { implicit val formats = DefaultFormats write(r) }) val cdrs = sqlC.read.json(cdrsJson) cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 81
Source File: L8-29DataFrameExamplesJoin.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JDouble import org.json4s.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.compact import org.json4s.native.JsonMethods.parse import org.json4s.native.JsonMethods.render import org.json4s.string2JsonInput object CdrDataframeExamples3App { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: CdrDataframeExamples3App <appname> <batchInterval> <hostname> <port> <gridJsonPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, gridJsonPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ implicit val formats = DefaultFormats val gridFile = scala.io.Source.fromFile(gridJsonPath).mkString val gridGeo = (parse(gridFile) \ "features") val gridStr = gridGeo.children.map(r => { val c = (r \ "geometry" \ "coordinates").extract[List[List[List[Float]]]].flatten.flatten.map(r => JDouble(r)) val l = List(("id", r \ "id"), ("x1", c(0)), ("y1", c(1)), ("x2", c(2)), ("y2", c(3)), ("x3", c(4)), ("y3", c(5)), ("x4", c(6)), ("y4", c(7))) compact(render(JObject(l))) }) val gridDF = sqlC.read.json(ssc.sparkContext.makeRDD(gridStr)) val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.join(gridDF, $"squareId" === $"id").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 82
Source File: L8-38SparkR.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import java.nio.file.Paths import org.apache.spark.SparkFiles object CdrStreamingSparkRApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: CdrStreamingSparkRApp <appname> <batchInterval> <hostname> <port> <tableName> <RScriptPath> <RScriptLogsPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ ssc.sparkContext.addFile(rScriptPath) val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString) val master = hiveC.sparkContext.getConf.get("spark.master") val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD((rdd, time) => { val iTableName = tableName + time.milliseconds seqToCdr(rdd).toDF().write.saveAsTable(iTableName) hiveC.sparkContext.parallelize(Array(iTableName)).pipe("%s %s".format(rScriptName, master)).saveAsTextFile(Paths.get(logsPath, iTableName).toString) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 83
Source File: T8-5-L8-30-34DataFrameExamplesActions.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apress.prospark.CdrDataframeExamplesActionsApp.Cdr
import org.json4s.DefaultFormats

object CdrDataframeExamplesActionsApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: CdrDataframeExamplesActionsApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val cl = Thread.currentThread().getContextClassLoader()
    val hiveC = new HiveContext(ssc.sparkContext)
    Thread.currentThread().setContextClassLoader(cl)
    import hiveC.implicits._
    implicit val formats = DefaultFormats

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = seqToCdr(rdd).toDF()
        val counts = cdrs.groupBy("countryCode").count().orderBy(desc("count"))

        // DataFrame actions: materialize results on the driver.
        counts.show(5)
        counts.show()
        println("head(5): " + counts.head(5))
        println("take(5): " + counts.take(5))
        println("head(): " + counts.head())
        println("first(): " + counts.first())
        println("count(): " + counts.count())
        println("collect(): " + counts.collect())
        println("collectAsList(): " + counts.collectAsList())
        println("describe():")
        cdrs.describe("smsInActivity", "smsOutActivity", "callInActivity",
          "callOutActivity", "internetTrafficActivity").show()

        // DataFrame writers: persist the same results in various formats and sinks.
        counts.write.format("parquet").save("/tmp/parquent" + rdd.id)
        counts.write.format("json").save("/tmp/json" + rdd.id)
        counts.write.parquet("/tmp/parquent2" + rdd.id)
        counts.write.json("/tmp/json2" + rdd.id)
        counts.write.saveAsTable("count_table")
        cdrs.groupBy("countryCode").count().orderBy(desc("count")).write
          .mode(SaveMode.Append).save("/tmp/counts")
        val prop: java.util.Properties = new java.util.Properties()
        counts.write.jdbc("jdbc:mysql://hostname:port/cdrsdb", "count_table", prop)
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 84
Source File: L8-10-11UDF.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.io.Source import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.jackson.JsonMethods.parse import org.json4s.jvalue2extractable import org.json4s.string2JsonInput object CdrUDFApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrUDFApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ def getCountryCodeMapping() = { implicit val formats = org.json4s.DefaultFormats parse(Source.fromURL("http://country.io/phone.json").mkString).extract[Map[String, String]].map(_.swap) } def getCountryNameMapping() = { implicit val formats = org.json4s.DefaultFormats parse(Source.fromURL("http://country.io/names.json").mkString).extract[Map[String, String]] } def getCountryName(mappingPhone: Map[String, String], mappingName: Map[String, String], code: Int) = { mappingName.getOrElse(mappingPhone.getOrElse(code.toString, "NotFound"), "NotFound") } val getCountryNamePartial = getCountryName(getCountryCodeMapping(), getCountryNameMapping(), _: Int) sqlC.udf.register("getCountryNamePartial", getCountryNamePartial) val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.registerTempTable("cdrs") sqlC.sql("SELECT getCountryNamePartial(countryCode) AS countryName, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 85
Source File: L8-4DataFrameCreationSchema.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object DataframeCreationApp2 {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: DataframeCreationApp2 <appname> <batchInterval> <hostname> <port> <schemaPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)

    // Load an explicit schema from a JSON file instead of inferring it from a case class.
    val schemaJson = scala.io.Source.fromFile(schemaFile).mkString
    val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType]

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = sqlC.createDataFrame(rdd.map(c => Row(c: _*)), schema)
        cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5)
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
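The <schemaPath> argument above points at a JSON-serialized StructType. A minimal sketch of how such a file could be generated; the object name and output path are assumptions, and the field names simply mirror the Cdr case class used throughout these examples:

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

import org.apache.spark.sql.types._

object CdrSchemaFileSketch {
  def main(args: Array[String]): Unit = {
    // All fields are declared as strings because DataframeCreationApp2 wraps the raw,
    // uncast tab-separated tokens in Row(c: _*); numeric types would require casting first.
    val schema = StructType(Seq(
      "squareId", "timeInterval", "countryCode", "smsInActivity", "smsOutActivity",
      "callInActivity", "callOutActivity", "internetTrafficActivity")
      .map(name => StructField(name, StringType)))

    // DataType.fromJson(...) in the example reverses this serialization.
    Files.write(Paths.get("/tmp/cdr-schema.json"),
      schema.prettyJson.getBytes(StandardCharsets.UTF_8))
  }
}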
Example 86
Source File: L8-14-27DataFrameExamples.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions._ import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeExamplesApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamplesApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.select("squareId", "timeInterval", "countryCode").show() cdrs.select($"squareId", $"timeInterval", $"countryCode").show() cdrs.filter("squareId = 5").show() cdrs.drop("countryCode").show() cdrs.select($"squareId", $"timeInterval", $"countryCode").where($"squareId" === 5).show() cdrs.limit(5).show() cdrs.groupBy("squareId").count().show() cdrs.groupBy("countryCode").avg("internetTrafficActivity").show() cdrs.groupBy("countryCode").max("callOutActivity").show() cdrs.groupBy("countryCode").min("callOutActivity").show() cdrs.groupBy("squareId").sum("internetTrafficActivity").show() cdrs.groupBy("squareId").agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() cdrs.groupBy("countryCode").sum("internetTrafficActivity").orderBy(desc("SUM(internetTrafficActivity)")).show() cdrs.agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() cdrs.rollup("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/rollup" + rdd.hashCode()) cdrs.cube("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/cube" + rdd.hashCode()) cdrs.dropDuplicates(Array("callOutActivity", "callInActivity")).show() cdrs.select("squareId", "countryCode", "internetTrafficActivity").distinct.show() cdrs.withColumn("endTime", cdrs("timeInterval") + 600000).show() cdrs.sample(true, 0.01).show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 87
Source File: L8-28DataFrameExamplesOps.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeExamples2App { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamples2App <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ var previousCdrs: Option[DataFrame] = None val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF().select("squareId", "countryCode").dropDuplicates() previousCdrs match { case Some(prevCdrs) => cdrs.unionAll(prevCdrs).show() //case Some(prevCdrs) => cdrs.intersect(prevCdrs).show() //case Some(prevCdrs) => cdrs.except(prevCdrs).show() case None => Unit } previousCdrs = Some(cdrs) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 88
Source File: T8-3DataFrameExamplesNA.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JDouble import org.json4s.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.compact import org.json4s.native.JsonMethods.parse import org.json4s.native.JsonMethods.render import org.json4s.string2JsonInput object CdrDataframeExamplesNAApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamplesNAApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ implicit val formats = DefaultFormats val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.na.drop("any").show() cdrs.na.fill(0, Array("squareId")).show() cdrs.na.replace("squareId", Map(0 -> 1)).show() println("Correlation: " + cdrs.stat.corr("smsOutActivity", "callOutActivity")) println("Covariance: " + cdrs.stat.cov("smsInActivity", "callInActivity")) cdrs.stat.crosstab("squareId", "countryCode").show() cdrs.stat.freqItems(Array("squareId", "countryCode"), 0.1).show() cdrs.stat.crosstab("callOutActivity", "callInActivity").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 89
Source File: L8-8Sql.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrSqlApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrSqlApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.registerTempTable("cdrs") sqlC.sql("SELECT countryCode, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() sqlC.dropTempTable("cdrs") }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
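Example 89 registers each batch as a temporary table with registerTempTable, which is the Spark 1.x API used throughout these listings. On Spark 2.x the non-deprecated equivalents are createOrReplaceTempView and catalog.dropTempView; the following is only a sketch of that variant, with an assumed object name, local master, and inline stand-in data:

import org.apache.spark.sql.SparkSession

object CdrSql2xSketch {
  def main(args: Array[String]): Unit = {
    // Assumed local session for illustration only.
    val spark = SparkSession.builder().master("local[2]").appName("CdrSql2xSketch").getOrCreate()
    import spark.implicits._

    // Tiny stand-in for the per-batch cdrs DataFrame built in the example.
    val cdrs = Seq((1, 39), (2, 39), (3, 44)).toDF("squareId", "countryCode")

    // Spark 2.x replacements for registerTempTable/dropTempTable.
    cdrs.createOrReplaceTempView("cdrs")
    spark.sql("SELECT countryCode, COUNT(countryCode) AS cCount FROM cdrs " +
      "GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show()
    spark.catalog.dropTempView("cdrs")

    spark.stop()
  }
}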
Example 90
Source File: L8-35DataFrameExamplesRDD.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.json4s.DefaultFormats

object CdrDataframeExamplesRDDApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: CdrDataframeExamplesRDDApp <appname> <batchInterval> <hostname> <port> <schemaPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._
    implicit val formats = DefaultFormats

    val schemaJson = scala.io.Source.fromFile(schemaFile).mkString
    val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType]

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = seqToCdr(rdd).toDF()

        // Drop to the RDD API for a row-level filter, then rebuild a DataFrame
        // with the explicit schema and compare the two sets.
        val highInternet = sqlC.createDataFrame(
          cdrs.rdd.filter(r => r.getFloat(3) + r.getFloat(4) >= r.getFloat(5) + r.getFloat(6)), schema)
        val highOther = cdrs.except(highInternet)
        val highInternetGrid = highInternet.select("squareId", "countryCode").dropDuplicates()
        val highOtherGrid = highOther.select("squareId", "countryCode").dropDuplicates()
        highOtherGrid.except(highInternetGrid).show()
        highInternetGrid.except(highOtherGrid).show()
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 91
Source File: L8-13HiveQL.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrHiveqlApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrHiveqlApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { seqToCdr(rdd).toDF().registerTempTable("cdrs") hiveC.sql("SET DATE_FMT='yy-MM-dd|HH'") hiveC.sql("SELECT from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) AS TS, SUM(smsInActivity + smsOutActivity + callInActivity + callOutActivity + internetTrafficActivity) AS Activity FROM cdrs GROUP BY from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) ORDER BY Activity DESC").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 92
Source File: L6-6PerRecord.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object MqttSinkAppB { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => rdd.foreach { rec => { val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8))) client.disconnect() client.close() } } } ssc.start() ssc.awaitTermination() } }
Example 93
Source File: L6-12StaticPool.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object MqttSinkAppF { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) val mqttSink = ssc.sparkContext.broadcast(MqttSinkLazy(outputBrokerUrl)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => rdd.foreachPartition { par => par.foreach(message => mqttSink.value.client.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) } } ssc.start() ssc.awaitTermination() } } class MqttSinkLazy(brokerUrl: String) extends Serializable { lazy val client = { val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() sys.addShutdownHook { client.disconnect() client.close() } client } } object MqttSinkLazy { val brokerUrl = "tcp://localhost:1883" val client = new MqttSinkLazy(brokerUrl) def apply(brokerUrl: String): MqttSinkLazy = { client } }
Example 94
Source File: L6-8Static.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object MqttSinkAppD { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => rdd.foreachPartition { par => par.foreach(message => MqttSink().publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) } } ssc.start() ssc.awaitTermination() } } object MqttSink { val brokerUrl = "tcp://localhost:1883" val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() sys.addShutdownHook { client.disconnect() client.close() } def apply(): MqttClient = { client } }
Example 95
Source File: L6-18Cassandra.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.Text import java.nio.ByteBuffer import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat import org.apache.cassandra.hadoop.ConfigHelper import org.apache.cassandra.thrift.ColumnOrSuperColumn import org.apache.cassandra.thrift.Column import org.apache.cassandra.utils.ByteBufferUtil import org.apache.cassandra.thrift.Mutation import java.util.Arrays object CassandraSinkApp { def main(args: Array[String]) { if (args.length != 6) { System.err.println( "Usage: CassandraSinkApp <appname> <cassandraHost> <cassandraPort> <keyspace> <columnFamilyName> <columnName>") System.exit(1) } val Seq(appName, cassandraHost, cassandraPort, keyspace, columnFamilyName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .foreachRDD(rdd => { val jobConf = new Configuration() ConfigHelper.setOutputRpcPort(jobConf, cassandraPort) ConfigHelper.setOutputInitialAddress(jobConf, cassandraHost) ConfigHelper.setOutputColumnFamily(jobConf, keyspace, columnFamilyName) ConfigHelper.setOutputPartitioner(jobConf, "Murmur3Partitioner") rdd.map(rec => { val c = new Column() c.setName(ByteBufferUtil.bytes(columnName)) c.setValue(ByteBufferUtil.bytes(rec._2 / (windowSize / batchInterval))) c.setTimestamp(System.currentTimeMillis) val m = new Mutation() m.setColumn_or_supercolumn(new ColumnOrSuperColumn()) m.column_or_supercolumn.setColumn(c) (ByteBufferUtil.bytes(rec._1), Arrays.asList(m)) }).saveAsNewAPIHadoopFile(keyspace, classOf[ByteBuffer], classOf[List[Mutation]], classOf[ColumnFamilyOutputFormat], jobConf) }) ssc.start() ssc.awaitTermination() } }
Example 96
Source File: L6-20CassandraConnector.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput import com.datastax.spark.connector.SomeColumns import com.datastax.spark.connector.cql.CassandraConnector import com.datastax.spark.connector.streaming.toDStreamFunctions import com.datastax.spark.connector.toNamedColumnRef object CassandraConnectorSinkApp { def main(args: Array[String]) { if (args.length != 6) { System.err.println( "Usage: CassandraConnectorSinkApp <appname> <cassandraHost> <cassandraPort> <keyspace> <tableName> <columnName>") System.exit(1) } val Seq(appName, cassandraHost, cassandraPort, keyspace, tableName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.cassandra.connection.host", cassandraHost) .set("spark.cassandra.connection.port", cassandraPort) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) CassandraConnector(conf).withSessionDo { session => session.execute(s"CREATE KEYSPACE IF NOT EXISTS %s WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }".format(keyspace)) session.execute(s"CREATE TABLE IF NOT EXISTS %s.%s (key TEXT PRIMARY KEY, %s FLOAT)".format(keyspace, tableName, columnName)) } HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .map(stock => (stock._1, stock._2 / (windowSize / batchInterval))) .saveToCassandra(keyspace, tableName) ssc.start() ssc.awaitTermination() } }
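saveToCassandra above persists the windowed averages. One way to spot-check what actually landed in the table is to read it back through the same connector; this is a hedged sketch assuming the DataStax spark-cassandra-connector read API (sc.cassandraTable) and the keyspace, table, and column names passed on the command line:

import com.datastax.spark.connector._
import org.apache.spark.{SparkConf, SparkContext}

object CassandraReadBackSketch {
  def main(args: Array[String]): Unit = {
    val Seq(cassandraHost, cassandraPort, keyspace, tableName, columnName) = args.toSeq

    val conf = new SparkConf()
      .setAppName("CassandraReadBackSketch")
      .set("spark.cassandra.connection.host", cassandraHost)
      .set("spark.cassandra.connection.port", cassandraPort)
    val sc = new SparkContext(conf)

    // Read the rows written by CassandraConnectorSinkApp and print them on the driver.
    sc.cassandraTable(keyspace, tableName)
      .map(row => (row.getString("key"), row.getFloat(columnName)))
      .collect()
      .foreach { case (key, value) => println(s"$key -> $value") }

    sc.stop()
  }
}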
Example 97
Source File: L6-5Exception.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object MqttSinkAppA { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() rdd.foreach(rec => client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8)))) client.disconnect() client.close() } ssc.start() ssc.awaitTermination() } }
Example 98
Source File: L6-10LazyStatic.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput import org.apache.commons.pool2.PooledObject import org.apache.commons.pool2.BasePooledObjectFactory import org.apache.commons.pool2.impl.DefaultPooledObject import org.apache.commons.pool2.impl.GenericObjectPool import org.apache.commons.pool2.ObjectPool object MqttSinkAppE { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => rdd.foreachPartition { par => val mqttSink = MqttSinkPool().borrowObject() par.foreach(message => mqttSink.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) MqttSinkPool().returnObject(mqttSink) } } ssc.start() ssc.awaitTermination() } } object MqttSinkPool { val poolSize = 8 val brokerUrl = "tcp://localhost:1883" val mqttPool = new GenericObjectPool[MqttClient](new MqttClientFactory(brokerUrl)) mqttPool.setMaxTotal(poolSize) sys.addShutdownHook { mqttPool.close() } def apply(): GenericObjectPool[MqttClient] = { mqttPool } } class MqttClientFactory(brokerUrl: String) extends BasePooledObjectFactory[MqttClient] { override def create() = { val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() client } override def wrap(client: MqttClient) = new DefaultPooledObject[MqttClient](client) override def validateObject(pObj: PooledObject[MqttClient]) = pObj.getObject.isConnected() override def destroyObject(pObj: PooledObject[MqttClient]) = { pObj.getObject.disconnect() pObj.getObject.close() } override def passivateObject(pObj: PooledObject[MqttClient]) = {} }
Example 99
Source File: L6-16SparkHBase.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.TableName import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object SparkHBaseBulkPutApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: SparkHBaseBulkPutApp <appname> <tableName> <columnFamilyName> <columnName>") System.exit(1) } val Seq(appName, tableName, columnFamilyName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) val hbaseConf = HBaseConfiguration.create() val hContext = new HBaseContext(ssc.sparkContext, hbaseConf) val windowed = HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) hContext.streamBulkPut[(String, Float)](windowed, TableName.valueOf(tableName), rec => { val put = new Put(rec._1.getBytes) put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) put }) ssc.start() ssc.awaitTermination() } }
Example 100
Source File: L6-22Counters.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.concurrent.atomic.AtomicLong import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object StatefulCountersApp { def main(args: Array[String]) { if (args.length != 1) { System.err.println( "Usage: StatefulCountersApp <appname>") System.exit(1) } val Seq(appName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) var globalMax: AtomicLong = new AtomicLong(Long.MinValue) var globalMin: AtomicLong = new AtomicLong(Long.MaxValue) var globalCounter500: AtomicLong = new AtomicLong(0) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong)) }) .foreachRDD(rdd => { val stocks = rdd.take(10) stocks.foreach(stock => { val price = stock._2 val volume = stock._3 if (volume > globalMax.get()) { globalMax.set(volume) } if (volume < globalMin.get()) { globalMin.set(volume) } if (price > 500) { globalCounter500.incrementAndGet() } }) if (globalCounter500.get() > 1000L) { println("Global counter has reached 1000") println("Max ----> " + globalMax.get) println("Min ----> " + globalMin.get) globalCounter500.set(0) } }) ssc.start() ssc.awaitTermination() } }
Example 101
Source File: L6-24Accumulators.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.collection.mutable

import org.apache.spark.AccumulableParam
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object StatefulAccumulatorsApp {

  // Custom accumulable: per symbol, track (min volume, max volume, count of prices > 500).
  object StockAccum extends AccumulableParam[mutable.HashMap[String, (Long, Long, Long)], (String, (Float, Long))] {
    def zero(t: mutable.HashMap[String, (Long, Long, Long)]): mutable.HashMap[String, (Long, Long, Long)] = {
      new mutable.HashMap[String, (Long, Long, Long)]()
    }
    def addInPlace(t1: mutable.HashMap[String, (Long, Long, Long)],
      t2: mutable.HashMap[String, (Long, Long, Long)]): mutable.HashMap[String, (Long, Long, Long)] = {
      t1 ++ t2.map {
        case (k, v2) => (k -> {
          val v1 = t1.getOrElse(k, (Long.MaxValue, Long.MinValue, 0L))
          val newMin = if (v2._1 < v1._1) v2._1 else v1._1
          val newMax = if (v2._2 > v1._2) v2._2 else v1._2
          (newMin, newMax, v1._3 + v2._3)
        })
      }
    }
    def addAccumulator(t1: mutable.HashMap[String, (Long, Long, Long)],
      t2: (String, (Float, Long))): mutable.HashMap[String, (Long, Long, Long)] = {
      val prevStats = t1.getOrElse(t2._1, (Long.MaxValue, Long.MinValue, 0L))
      val newVals = t2._2
      var newCount = prevStats._3
      if (newVals._1 > 500.0) {
        newCount += 1
      }
      val newMin = if (newVals._2 < prevStats._1) newVals._2 else prevStats._1
      val newMax = if (newVals._2 > prevStats._2) newVals._2 else prevStats._2
      t1 += t2._1 -> (newMin, newMax, newCount)
    }
  }

  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: StatefulAccumulatorsApp <appname> <checkpointDir>")
      System.exit(1)
    }
    val Seq(appName, checkpointDir) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val batchInterval = 10

    val ssc = new StreamingContext(conf, Seconds(batchInterval))

    val stateAccum = ssc.sparkContext.accumulable(new mutable.HashMap[String, (Long, Long, Long)]())(StockAccum)

    HttpUtils.createStream(ssc,
      url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env",
      interval = batchInterval)
      .flatMap(rec => {
        implicit val formats = DefaultFormats
        val query = parse(rec) \ "query"
        ((query \ "results" \ "quote").children)
          .map(rec => ((rec \ "symbol").extract[String],
            ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong)))
      })
      .foreachRDD(rdd => {
        // Accumulate on the executors, then read the merged state back on the driver.
        rdd.foreach({ stock =>
          stateAccum += (stock._1, (stock._2._1, stock._2._2))
        })
        for ((sym, stats) <- stateAccum.value) printf("Symbol: %s, Stats: %s\n", sym, stats)
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 102
Source File: L6-7PerPartition.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object MqttSinkAppC { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => rdd.foreachPartition { par => val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() par.foreach(rec => client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8)))) client.disconnect() client.close() } } ssc.start() ssc.awaitTermination() } }
Example 103
Source File: L6-14HBase.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.mapreduce.TableOutputFormat import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.io.Text import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object HBaseSinkApp { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: HBaseSinkApp <appname> <hbaseMaster> <tableName> <columnFamilyName> <columnName>") System.exit(1) } val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .foreachRDD(rdd => { val hbaseConf = HBaseConfiguration.create() hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName) hbaseConf.set("hbase.master", hbaseMaster) val jobConf = new Configuration(hbaseConf) jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName) rdd.map(rec => { val put = new Put(rec._1.getBytes) put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) (rec._1, put) }).saveAsNewAPIHadoopDataset(jobConf) }) ssc.start() ssc.awaitTermination() } }
Example 104
Source File: L6-23UpdateState.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object StatefulUpdateStateApp {

  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: StatefulUpdateStateApp <appname> <checkpointDir>")
      System.exit(1)
    }
    val Seq(appName, checkpointDir) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val batchInterval = 10

    val ssc = new StreamingContext(conf, Seconds(batchInterval))
    ssc.checkpoint(checkpointDir)

    HttpUtils.createStream(ssc,
      url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env",
      interval = batchInterval)
      .flatMap(rec => {
        implicit val formats = DefaultFormats
        val query = parse(rec) \ "query"
        ((query \ "results" \ "quote").children)
          .map(rec => ((rec \ "symbol").extract[String],
            ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong)))
      })
      .updateStateByKey(updateState)
      .print()

    // Per key: carry (min volume, max volume, count of quotes with price > 500) across batches.
    def updateState(values: Seq[(Float, Long)], state: Option[(Long, Long, Long)]): Option[(Long, Long, Long)] = {
      val volumes = values.map(s => s._2)
      val localMin = volumes.min
      val localMax = volumes.max
      val localCount500 = values.map(s => s._1).count(price => price > 500)
      val globalValues = state.getOrElse((Long.MaxValue, Long.MinValue, 0L)).asInstanceOf[(Long, Long, Long)]
      val newMin = if (localMin < globalValues._1) localMin else globalValues._1
      val newMax = if (localMax > globalValues._2) localMax else globalValues._2
      val newCount500 = globalValues._3 + localCount500
      return Some(newMin, newMax, newCount500)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
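updateStateByKey recomputes state for every key on every batch. Since Spark 1.6, mapWithState only processes keys that receive new data (plus timed-out ones), which is usually a better fit for this kind of running min/max/count. Below is only a sketch of the same state transition expressed with StateSpec; the object and function names are illustrative, and it assumes the same (symbol, (price, volume)) pair DStream and tuple layout as updateState above:

import org.apache.spark.streaming.{State, StateSpec}

object StatefulMapWithStateSketch {

  // Same state layout as updateState: (min volume, max volume, count of prices > 500),
  // but updated one record at a time rather than from a Seq per batch.
  def trackStock(symbol: String, value: Option[(Float, Long)],
      state: State[(Long, Long, Long)]): (String, (Long, Long, Long)) = {
    val (price, volume) = value.getOrElse((0f, 0L))
    val (curMin, curMax, count500) = state.getOption().getOrElse((Long.MaxValue, Long.MinValue, 0L))
    val updated = (math.min(volume, curMin), math.max(volume, curMax),
      count500 + (if (price > 500) 1L else 0L))
    state.update(updated)
    (symbol, updated)
  }

  // Applied to the (symbol, (price, volume)) pair DStream built in the example:
  //   stream.mapWithState(StateSpec.function(trackStock _)).print()
  // Checkpointing must still be enabled, exactly as in StatefulUpdateStateApp.
}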
Example 105
Source File: L6-26Redis.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.collection.JavaConversions.asScalaBuffer import scala.collection.JavaConversions.mutableMapAsJavaMap import scala.collection.mutable import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput import redis.clients.jedis.Jedis object StatefulRedisApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: StatefulRedisApp <appname> <checkpointDir> <hostname>") System.exit(1) } val Seq(appName, checkpointDir, hostname) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) }) .foreachRDD(rdd => { rdd.foreachPartition({ part => val jedis = new Jedis(hostname) part.foreach(f => { val prev = jedis.hmget(f._1, "min", "max", "count") if (prev(0) == null) { jedis.hmset(f._1, mutable.HashMap("min" -> Long.MaxValue.toString, "max" -> Long.MinValue.toString, "count" -> 0.toString)) } else { val prevLong = prev.toList.map(v => v.toLong) var newCount = prevLong(2) val newPrice = f._2._1 val newVolume = f._2._2 if (newPrice > 500.0) { newCount += 1 } val newMin = if (newVolume < prevLong(0)) newVolume else prevLong(0) val newMax = if (newVolume > prevLong(1)) newVolume else prevLong(1) jedis.hmset(f._1, mutable.HashMap("min" -> newMin.toString, "max" -> newMax.toString, "count" -> newCount.toString)) } }) jedis.close() }) val jedis = new Jedis(hostname) jedis.scan(0).getResult.foreach(sym => println("Symbol: %s, Stats: %s".format(sym, jedis.hmget(sym, "min", "max", "count").toString))) jedis.close() }) ssc.start() ssc.awaitTermination() } }
Example 106
Source File: L3-DStreamMapping.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date object RedditMappingApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditMappingApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val sdf = new SimpleDateFormat("yyyy-MM-dd") val tsKey = "created_utc" val secs = 1000L val keyedByDay = comments.map(rec => { val ts = (parse(rec) \ tsKey).values (sdf.format(new Date(ts.toString.toLong * secs)), rec) }) val keyedByDayPart = comments.mapPartitions(iter => { var ret = List[(String, String)]() while (iter.hasNext) { val rec = iter.next val ts = (parse(rec) \ tsKey).values ret.::=(sdf.format(new Date(ts.toString.toLong * secs)), rec) } ret.iterator }) val wordTokens = comments.map(rec => { ((parse(rec) \ "body")).values.toString.split(" ") }) val wordTokensFlat = comments.flatMap(rec => { ((parse(rec) \ "body")).values.toString.split(" ") }) val filterSubreddit = comments.filter(rec => (parse(rec) \ "subreddit").values.toString.equals("AskReddit")) val sortedByAuthor = comments.transform(rdd => (rdd.sortBy(rec => (parse(rec) \ "author").values.toString))) ssc.start() ssc.awaitTermination() } }
Example 107
Source File: L3-DStreamKeyValue.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date import org.apache.spark.HashPartitioner object RedditKeyValueApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: RedditKeyValueApp <appname> <input_path> <input_path_popular>") System.exit(1) } val Seq(appName, inputPath, inputPathPopular) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val popular = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPathPopular, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val topAuthors = comments.map(rec => ((parse(rec) \ "author").values.toString, 1)) .groupByKey() .map(r => (r._2.sum, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val topAuthors2 = comments.map(rec => ((parse(rec) \ "author").values.toString, 1)) .reduceByKey(_ + _) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val topAuthorsByAvgContent = comments.map(rec => ((parse(rec) \ "author").values.toString, (parse(rec) \ "body").values.toString.split(" ").length)) .combineByKey( (v) => (v, 1), (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), new HashPartitioner(ssc.sparkContext.defaultParallelism)) .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val keyedBySubreddit = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec)) val keyedBySubreddit2 = popular.map(rec => ({ val t = rec.split(",") (t(1).split("/")(4), t(0)) })) val commentsWithIndustry = keyedBySubreddit.join(keyedBySubreddit2) val keyedBySubredditCo = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec)) val keyedBySubredditCo2 = popular.map(rec => ({ val t = rec.split(",") (t(1).split("/")(4), t(0)) })) val commentsWithIndustryCo = keyedBySubreddit.cogroup(keyedBySubreddit2) val checkpointPath = "/tmp" ssc.checkpoint(checkpointPath) val updateFunc = (values: Seq[Int], state: Option[Int]) => { val currentCount = values.sum val previousCount = state.getOrElse(0) Some(currentCount + previousCount) } val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) ssc.start() ssc.awaitTermination() } }
Example 108
Source File: L3-DStreamVariation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date object RedditVariationApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditVariationApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val merged = comments.union(comments) val repartitionedComments = comments.repartition(4) val rddMin = comments.glom().map(arr => arr.minBy(rec => ((parse(rec) \ "created_utc").values.toString.toInt))) ssc.start() ssc.awaitTermination() } }
Example 109
Source File: L3-1DStreams.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.io.Source import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.hadoop.io.LongWritable import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.hadoop.io.Text object StreamingTranslateApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: StreamingTranslateApp <appname> <book_path> <output_path> <language>") System.exit(1) } val Seq(appName, bookPath, outputPath, lang) = args.toSeq val dict = getDictionary(lang) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) val book = ssc.textFileStream(bookPath) val translated = book.map(line => line.split("\\s+").map(word => dict.getOrElse(word, word)).mkString(" ")) translated.saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } def getDictionary(lang: String): Map[String, String] = { if (!Set("German", "French", "Italian", "Spanish").contains(lang)) { System.err.println( "Unsupported language: %s".format(lang)) System.exit(1) } val url = "http://www.june29.com/IDP/files/%s.txt".format(lang) println("Grabbing dictionary from: %s".format(url)) Source.fromURL(url, "ISO-8859-1").mkString .split("\\r?\\n") .filter(line => !line.startsWith("#")) .map(line => line.split("\\t")) .map(tkns => (tkns(0).trim, tkns(1).trim)).toMap } }
Example 110
Source File: L3-DStreamWindowAndAction.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date import org.apache.spark.HashPartitioner object RedditWindowAndActionApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditWindowAndActionApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val checkpointPath = "/tmp" ssc.checkpoint(checkpointPath) val updateFunc = (values: Seq[Int], state: Option[Int]) => { val currentCount = values.sum val previousCount = state.getOrElse(0) Some(currentCount + previousCount) } val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString) val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5)) val windowedCounts = windowedRecs.countByValue() windowedCounts.print(10) windowedCounts.saveAsObjectFiles("subreddit", "obj") windowedCounts.saveAsTextFiles("subreddit", "txt") globalCount.saveAsHadoopFiles("subreddit", "hadoop", classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]]) globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop", classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]]) comments.foreachRDD(rdd => { LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count())) }) ssc.start() ssc.awaitTermination() } }
Example 111
Source File: L3-DStreamAggregation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date object RedditAggregationApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditAggregationApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val recCount = comments.count() val recCountValue = comments.countByValue() val totalWords = comments.map(rec => ((parse(rec) \ "body").values.toString)) .flatMap(body => body.split(" ")) .map(word => 1) .reduce(_ + _) ssc.start() ssc.awaitTermination() } }
Example 112
Source File: L9-3Statistics.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.Statistics import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object StatisticsApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: StatisticsApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) substream.map(f => Vectors.dense(f.slice(1, 5))).foreachRDD(rdd => { val stats = Statistics.colStats(rdd) println("Count: " + stats.count) println("Max: " + stats.max.toArray.mkString(" ")) println("Min: " + stats.min.toArray.mkString(" ")) println("Mean: " + stats.mean.toArray.mkString(" ")) println("L1-Norm: " + stats.normL1.toArray.mkString(" ")) println("L2-Norm: " + stats.normL2.toArray.mkString(" ")) println("Number of non-zeros: " + stats.numNonzeros.toArray.mkString(" ")) println("Variance: " + stats.variance.toArray.mkString(" ")) }) ssc.start() ssc.awaitTermination() } }
Example 113
Source File: L9-7FeatureExtraction.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.feature.ChiSqSelector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object FeatureExtractionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: FeatureExtractionApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length).map(f => f / 2048)))) datastream.foreachRDD(rdd => { val selector = new ChiSqSelector(5) val model = selector.fit(rdd) val filtered = rdd.map(p => LabeledPoint(p.label, model.transform(p.features))) filtered.take(20).foreach(println) }) ssc.start() ssc.awaitTermination() } }
Example 114
Source File: L9-14FPMining.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.fpm.FPGrowth import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object FPMiningApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: FPMiningApp <appname> <batchInterval> <iPath>") System.exit(1) } val Seq(appName, batchInterval, iPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val minSupport = 0.4 ssc.textFileStream(iPath) .map(r => r.split(" ")) .foreachRDD(transactionRDD => { val fpg = new FPGrowth() .setMinSupport(minSupport) val model = fpg.run(transactionRDD) model.freqItemsets .collect() .foreach(itemset => println("Items: %s, Frequency: %s".format(itemset.items.mkString(" "), itemset.freq))) }) ssc.start() ssc.awaitTermination() } }
Example 115
Source File: L9-9LogisticRegression.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD object LogisticRegressionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: LogisticRegressionApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) val test = walkingOrRunning.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) val train = walkingOrRunning.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() val model = new StreamingLogisticRegressionWithSGD() .setInitialWeights(Vectors.zeros(4)) .setStepSize(0.0001) .setNumIterations(1) model.trainOn(train) model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd .map(v => math.pow((v._1 - v._2), 2)).mean()))) ssc.start() ssc.awaitTermination() } }
Example 116
Source File: L9-1LinearRegression.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object LinearRegressionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: LinearRegressionApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(2).toDouble, f(3).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) val test = datastream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) val train = datastream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() val model = new StreamingLinearRegressionWithSGD() .setInitialWeights(Vectors.zeros(4)) .setStepSize(0.0001) .setNumIterations(1) model.trainOn(train) model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd .map(v => math.pow((v._1 - v._2), 2)).mean()))) ssc.start() ssc.awaitTermination() } }
Example 117
Source File: T9-4DataTypes.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Matrices import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix import org.apache.spark.mllib.linalg.distributed.IndexedRow import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix import org.apache.spark.mllib.linalg.distributed.MatrixEntry import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object DataTypesApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: DataTypesApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) val denseV = substream.map(f => Vectors.dense(f.slice(1, 5))) denseV.print() val sparseV = substream.map(f => f.slice(1, 5).toList).map(f => f.zipWithIndex.map { case (s, i) => (i, s) }) .map(f => f.filter(v => v._2 != 0)).map(l => Vectors.sparse(l.size, l)) sparseV.print() val labeledP = substream.map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) labeledP.print() val denseM = substream.map(f => Matrices.dense(3, 16, f.slice(3, 19) ++ f.slice(20, 36) ++ f.slice(37, 53))) denseM.print() denseV.foreachRDD(rdd => { val rowM = new RowMatrix(rdd) println(rowM) }) denseV.foreachRDD(rdd => { val iRdd = rdd.zipWithIndex.map(v => new IndexedRow(v._2, v._1)) val iRowM = new IndexedRowMatrix(iRdd) println(iRowM) }) substream.foreachRDD(rdd => { val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) val cRowM = new CoordinateMatrix(entries) println(cRowM) }) substream.foreachRDD(rdd => { val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) val blockM = new CoordinateMatrix(entries).toBlockMatrix println(blockM) }) ssc.start() ssc.awaitTermination() } }
Example 118
Source File: L9-5ChiSq.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object ChiSqApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: ChiSqApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) .filter(f => f(0) == 4.0 || f(0) == 5.0) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) .foreachRDD(rdd => { Statistics.chiSqTest(rdd).zipWithIndex.foreach(v => println("%s, column no. %d".format(v._1, v._2))) }) ssc.start() ssc.awaitTermination() } }
Example 119
Source File: L9-17MLCrossValidation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.feature.Normalizer import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.regression.RandomForestRegressor import org.apache.spark.ml.tuning.CrossValidator import org.apache.spark.ml.tuning.ParamGridBuilder import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object MLCrossValidationApp { case class Activity(label: Double, accelXHand: Double, accelYHand: Double, accelZHand: Double, accelXChest: Double, accelYChest: Double, accelZChest: Double, accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: MLCrossValidationApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) == "4" || f(1) == "5") .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .foreachRDD(rdd => { if (!rdd.isEmpty) { val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() val split = accelerometer.randomSplit(Array(0.3, 0.7)) val test = split(0) val train = split(1) val assembler = new VectorAssembler() .setInputCols(Array( "accelXHand", "accelYHand", "accelZHand", "accelXChest", "accelYChest", "accelZChest", "accelXAnkle", "accelYAnkle", "accelZAnkle")) .setOutputCol("vectors") val normalizer = new Normalizer() .setInputCol(assembler.getOutputCol) .setOutputCol("features") val regressor = new RandomForestRegressor() val pipeline = new Pipeline() .setStages(Array(assembler, normalizer, regressor)) val validator = new CrossValidator() .setEstimator(pipeline) .setEvaluator(new RegressionEvaluator) val pGrid = new ParamGridBuilder() .addGrid(normalizer.p, Array(1.0, 5.0, 10.0)) .addGrid(regressor.numTrees, Array(10, 50, 100)) .build() validator.setEstimatorParamMaps(pGrid) validator.setNumFolds(5) val bestModel = validator.fit(train) val prediction = bestModel.transform(test) prediction.show() } }) ssc.start() ssc.awaitTermination() } }
Example 120
Source File: L9-4Correlation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CorrelationApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CorrelationApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) walkingOrRunning.map(f => f.features).foreachRDD(rdd => { val corrSpearman = Statistics.corr(rdd, "spearman") val corrPearson = Statistics.corr(rdd, "pearson") println("Correlation Spearman: \n" + corrSpearman) println("Correlation Pearson: \n" + corrPearson) }) ssc.start() ssc.awaitTermination() } }
Example 121
Source File: L9-12CollabFiltering.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.recommendation.ALS import org.apache.spark.mllib.recommendation.Rating import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CollabFilteringApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: CollabFilteringApp <appname> <batchInterval> <iPath>") System.exit(1) } val Seq(appName, batchInterval, iPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val ratingStream = ssc.textFileStream(iPath).map(_.split(" ") match { case Array(subject, activity, freq) => Rating(subject.toInt, activity.toInt, freq.toDouble) }) val rank = 10 val numIterations = 10 val lambda = 0.01 ratingStream.foreachRDD(ratingRDD => { val testTrain = ratingRDD.randomSplit(Array(0.3, 0.7)) val model = ALS.train(testTrain(1), rank, numIterations, lambda) val test = testTrain(0).map { case Rating(subject, activity, freq) => (subject, activity) } val prediction = model.predict(test) prediction.take(5).map(println) }) ssc.start() ssc.awaitTermination() } }
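A sketch of scoring the ALS model from the listing above on the held-out split, by joining its predictions back to the observed ratings and computing the mean squared error (variable names follow the listing; the join-based evaluation itself is an addition, not part of the original example):

// Sketch: evaluate the model trained above against the held-out testTrain(0) split.
val observed = testTrain(0).map { case Rating(user, product, rating) => ((user, product), rating) }
val predicted = prediction.map { case Rating(user, product, rating) => ((user, product), rating) }
val mse = observed.join(predicted)
  .map { case (_, (obs, pred)) => math.pow(obs - pred, 2) }
  .mean()
println("MSE: %f".format(mse))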
Example 122
Source File: L9-6Preprocessing.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.feature.StandardScaler import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object PreprocessingApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: PreprocessingApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") substream.map(f => Array(f(2), f(4), f(5), f(6))) .map(f => f.map(v => v.toDouble)) .map(f => Vectors.dense(f)) .foreachRDD(rdd => { val scalerModel = new StandardScaler().fit(rdd) val scaledRDD = scalerModel.transform(rdd) }) ssc.start() ssc.awaitTermination() } }
Example 123
Source File: L9-8PCA.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object PCAApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: PCAApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) datastream.foreachRDD(rdd => { val pca = new PCA(rdd.first().features.size / 2) .fit(rdd.map(_.features)) val testTrain = rdd.randomSplit(Array(0.3, 0.7)) val test = testTrain(0).map(lp => lp.copy(features = pca.transform(lp.features))) val train = testTrain(1).map(lp => lp.copy(features = pca.transform(lp.features))) train.take(20).foreach(println) }) ssc.start() ssc.awaitTermination() } }
Example 124
Source File: L9-10KMeans.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object KMeansClusteringApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: KMeansClusteringApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val orientationStream = substream .map(f => Seq(1, 4, 5, 6, 10, 11, 12, 20, 21, 22, 26, 27, 28, 36, 37, 38, 42, 43, 44).map(i => f(i)).toArray) .map(arr => arr.map(_.toDouble)) .filter(f => f(0) == 1.0 || f(0) == 2.0 || f(0) == 3.0) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) val test = orientationStream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) val train = orientationStream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() val model = new StreamingKMeans() .setK(3) .setDecayFactor(0) .setRandomCenters(18, 0.0) model.trainOn(train.map(v => v.features)) val prediction = model.predictOnValues(test.map(v => (v.label, v.features))) ssc.start() ssc.awaitTermination() } }
Example 125
Source File: L9-15MLPipeline.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.Normalizer import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.regression.RandomForestRegressor import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.ml.param.ParamMap object MLPipelineApp { case class Activity(label: Double, accelXHand: Double, accelYHand: Double, accelZHand: Double, accelXChest: Double, accelYChest: Double, accelZChest: Double, accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: MLPipelineApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) == "4" || f(1) == "5") .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .foreachRDD(rdd => { if (!rdd.isEmpty) { val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() val split = accelerometer.randomSplit(Array(0.3, 0.7)) val test = split(0) val train = split(1) val assembler = new VectorAssembler() .setInputCols(Array( "accelXHand", "accelYHand", "accelZHand", "accelXChest", "accelYChest", "accelZChest", "accelXAnkle", "accelYAnkle", "accelZAnkle")) .setOutputCol("vectors") val normalizer = new Normalizer() .setInputCol(assembler.getOutputCol) .setOutputCol("features") val regressor = new RandomForestRegressor() val pipeline = new Pipeline() .setStages(Array(assembler, normalizer, regressor)) val pMap = ParamMap(normalizer.p -> 1.0) val model = pipeline.fit(train, pMap) val prediction = model.transform(test) prediction.show() } }) ssc.start() ssc.awaitTermination() } }
Example 126
Source File: StreamingLifeCycle.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.spark.streaming import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{Duration, Seconds, StreamingContext} private[spark] trait StreamingLifeCycle { val timeOut: Long protected[this] val batchDuration: Duration = Seconds(1) val streamingContext = new StreamingContext( new SparkConf() .setMaster("local[4]") .setAppName("StreamingStats") .set("spark.default.parallelism", "4") .set("spark.rdd.compress", "true") .set("spark.executor.memory", "8g") .set("spark.shuffle.spill", "true") .set("spark.shuffle.spill.compress", "true") .set("spark.io.compression.codec", "lzf"), Seconds(2) ) def sparkContext: SparkContext = streamingContext.sparkContext def start: Unit = streamingContext.start def terminate: Unit = { streamingContext.stop(true, true) streamingContext.awaitTerminationOrTimeout(timeOut) } } // ---------------------------------------------- EOF --------------------------------
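A sketch of how a concrete job might mix in the StreamingLifeCycle trait above (the object, socket source, and timeout value are illustrative; because the trait is private[spark], the consumer has to live under the org.scalaml.spark package):

package org.scalaml.spark.streaming

// Illustrative consumer of StreamingLifeCycle; host, port and timeout are example values.
object WordCountJob extends StreamingLifeCycle {
  val timeOut: Long = 30000L

  def main(args: Array[String]): Unit = {
    val counts = streamingContext
      .socketTextStream("localhost", 9999)
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
    counts.print()
    start                                                // start the streaming computation
    streamingContext.awaitTerminationOrTimeout(timeOut)  // run for up to timeOut ms
    terminate                                            // stop the streaming and Spark contexts
  }
}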
Example 127
Source File: RabbitIntegrationSpec.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.input.rabbitmq import akka.actor.ActorSystem import akka.event.slf4j.SLF4JLogging import akka.util.Timeout import com.typesafe.config.ConfigFactory import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.concurrent.TimeLimitedTests import org.scalatest.time.{Minute, Span} import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, Matchers, WordSpec} import scala.concurrent.duration._ import scala.language.postfixOps import scala.util.Try abstract class RabbitIntegrationSpec extends WordSpec with Matchers with SLF4JLogging with TimeLimitedTests with BeforeAndAfter with BeforeAndAfterAll { private lazy val config = ConfigFactory.load() implicit val system = ActorSystem("ActorRabbitMQSystem") implicit val timeout = Timeout(10 seconds) val timeLimit = Span(1, Minute) val RabbitTimeOut = 3 second val configQueueName = Try(config.getString("rabbitmq.queueName")).getOrElse("rabbitmq-queue") val configExchangeName = Try(config.getString("rabbitmq.exchangeName")).getOrElse("rabbitmq-exchange") val exchangeType = Try(config.getString("rabbitmq.exchangeType")).getOrElse("topic") val routingKey = Try(config.getString("rabbitmq.routingKey")).getOrElse("") val vHost = Try(config.getString("rabbitmq.vHost")).getOrElse("/") val hosts = Try(config.getString("rabbitmq.hosts")).getOrElse("127.0.0.1") val userName = Try(config.getString("rabbitmq.userName")).getOrElse("guest") val password = Try(config.getString("rabbitmq.password")).getOrElse("guest") val RabbitConnectionURI = s"amqp://$userName:$password@$hosts/%2F" var sc: Option[SparkContext] = None var ssc: Option[StreamingContext] = None def initSpark(): Unit = { sc = Some(new SparkContext(conf)) ssc = Some(new StreamingContext(sc.get, Seconds(1))) } def stopSpark(): Unit = { ssc.foreach(_.stop()) sc.foreach(_.stop()) System.gc() } def initRabbitMQ(): Unit def closeRabbitMQ(): Unit before { log.info("Init spark") initSpark() log.info("Sending messages to queue..") initRabbitMQ() log.info("Messages in queue.") } after { log.info("Stop spark") stopSpark() log.info("Clean rabbitmq") closeRabbitMQ() } }
Example 128
Source File: TemporalSparkContext.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin import org.apache.spark._ import org.apache.spark.streaming.{Seconds, StreamingContext} import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FlatSpec} private[plugin] trait TemporalSparkContext extends FlatSpec with BeforeAndAfterAll with BeforeAndAfter { val conf = new SparkConf() .setAppName("simulator-test") .setIfMissing("spark.master", "local[*]") @transient private var _sc: SparkContext = _ @transient private var _ssc: StreamingContext = _ def sc: SparkContext = _sc def ssc: StreamingContext = _ssc override def beforeAll() { _sc = new SparkContext(conf) _ssc = new StreamingContext(sc, Seconds(2)) } override def afterAll() : Unit = { if(ssc != null){ ssc.stop(stopSparkContext = false, stopGracefully = false) _ssc = null } if (sc != null){ sc.stop() _sc = null } System.gc() } }
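A sketch of a test that mixes in the TemporalSparkContext trait above (the spec name, queue-backed stream, and assertion are illustrative, and the timing is only indicative; since the trait is private[plugin], the spec has to sit in the com.stratio.sparta.plugin package):

package com.stratio.sparta.plugin

import scala.collection.mutable

// Illustrative FlatSpec using the shared sc/ssc provided by TemporalSparkContext.
class QueueStreamSpec extends TemporalSparkContext {

  "a queue-backed DStream" should "see the records pushed into it" in {
    val rdd = sc.parallelize(1 to 100)
    val stream = ssc.queueStream(mutable.Queue(rdd), oneAtATime = true)
    var total = 0L
    stream.foreachRDD(batch => total += batch.count())
    ssc.start()
    ssc.awaitTerminationOrTimeout(5000L)
    assert(total == 100L)
  }
}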
Example 129
Source File: StreamingTestExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.mllib.stat.test.{BinarySample, StreamingTest} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.util.Utils object StreamingTestExample { def main(args: Array[String]) { if (args.length != 3) { // scalastyle:off println System.err.println( "Usage: StreamingTestExample " + "<dataDir> <batchDuration> <numBatchesTimeout>") // scalastyle:on println System.exit(1) } val dataDir = args(0) val batchDuration = Seconds(args(1).toLong) val numBatchesTimeout = args(2).toInt val conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample") val ssc = new StreamingContext(conf, batchDuration) ssc.checkpoint { val dir = Utils.createTempDir() dir.toString } // $example on$ val data = ssc.textFileStream(dataDir).map(line => line.split(",") match { case Array(label, value) => BinarySample(label.toBoolean, value.toDouble) }) val streamingTest = new StreamingTest() .setPeacePeriod(0) .setWindowSize(0) .setTestMethod("welch") val out = streamingTest.registerStream(data) out.print() // $example off$ // Stop processing if test becomes significant or we time out var timeoutCounter = numBatchesTimeout out.foreachRDD { rdd => timeoutCounter -= 1 val anySignificant = rdd.map(_.pValue < 0.05).fold(false)(_ || _) if (timeoutCounter == 0 || anySignificant) rdd.context.stop() } ssc.start() ssc.awaitTermination() } }
Example 130
Source File: StreamingKMeansExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} // $example off$ object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } // $example on$ val conf = new SparkConf().setAppName("StreamingKMeansExample") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(args(3).toInt) .setDecayFactor(1.0) .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ } } // scalastyle:on println
Example 131
Source File: QueueStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import scala.collection.mutable.Queue import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Seconds, StreamingContext} object QueueStream { def main(args: Array[String]) { StreamingExamples.setStreamingLogLevels() val sparkConf = new SparkConf().setAppName("QueueStream") // Create the context val ssc = new StreamingContext(sparkConf, Seconds(1)) // Create the queue through which RDDs can be pushed to // a QueueInputDStream val rddQueue = new Queue[RDD[Int]]() // Create the QueueInputDStream and use it do some processing val inputStream = ssc.queueStream(rddQueue) val mappedStream = inputStream.map(x => (x % 10, 1)) val reducedStream = mappedStream.reduceByKey(_ + _) reducedStream.print() ssc.start() // Create and push some RDDs into rddQueue for (i <- 1 to 30) { rddQueue.synchronized { rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10) } Thread.sleep(1000) } ssc.stop() } }
Example 132
Source File: CustomReceiver.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import java.io.{BufferedReader, InputStreamReader} import java.net.Socket import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.receiver.Receiver private def receive() { var socket: Socket = null var userInput: String = null try { logInfo("Connecting to " + host + ":" + port) socket = new Socket(host, port) logInfo("Connected to " + host + ":" + port) val reader = new BufferedReader( new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)) userInput = reader.readLine() while(!isStopped && userInput != null) { store(userInput) userInput = reader.readLine() } reader.close() socket.close() logInfo("Stopped receiving") restart("Trying to connect again") } catch { case e: java.net.ConnectException => restart("Error connecting to " + host + ":" + port, e) case t: Throwable => restart("Error receiving data", t) } } } // scalastyle:on println
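The listing above shows only the receive() helper; in the full Spark example it sits inside a CustomReceiver(host, port) class extending Receiver[String]. A sketch of how such a receiver is wired into a streaming job (the object name and word count are illustrative):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Sketch: plugging the receiver above into a word count. Assumes receive() is
// wrapped in class CustomReceiver(host: String, port: Int) extends Receiver[String],
// as in the full Spark example.
object CustomReceiverUsage {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("CustomReceiverUsage")
    val ssc = new StreamingContext(sparkConf, Seconds(1))
    val lines = ssc.receiverStream(new CustomReceiver("localhost", 9999))
    val wordCounts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}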
Example 133
Source File: SqlNetworkWordCount.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext, Time} object SparkSessionSingleton { @transient private var instance: SparkSession = _ def getInstance(sparkConf: SparkConf): SparkSession = { if (instance == null) { instance = SparkSession .builder .config(sparkConf) .getOrCreate() } instance } } // scalastyle:on println
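Only the SparkSessionSingleton helper of SqlNetworkWordCount is shown above. It is normally used inside foreachRDD to turn each micro-batch into a DataFrame; a sketch of that usage, where words stands in for any DStream[String] and the table name is illustrative:

// Sketch: running SQL over each micro-batch via the lazily created SparkSession.
words.foreachRDD { (rdd: RDD[String], time: Time) =>
  val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
  import spark.implicits._

  val wordsDF = rdd.toDF("word")
  wordsDF.createOrReplaceTempView("words")
  val counts = spark.sql("select word, count(*) as total from words group by word")
  println(s"========= $time =========")
  counts.show()
}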
Example 134
Source File: HdfsWordCount.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} object HdfsWordCount { def main(args: Array[String]) { if (args.length < 1) { System.err.println("Usage: HdfsWordCount <directory>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val sparkConf = new SparkConf().setAppName("HdfsWordCount") // Create the context val ssc = new StreamingContext(sparkConf, Seconds(2)) // Create the FileInputDStream on the directory and use the // stream to count words in new files created val lines = ssc.textFileStream(args(0)) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 135
Source File: NetworkWordCount.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} object NetworkWordCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: NetworkWordCount <hostname> <port>") System.exit(1) } StreamingExamples.setStreamingLogLevels() // Create the context with a 1 second batch size val sparkConf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(sparkConf, Seconds(1)) // Create a socket stream on target ip:port and count the // words in input stream of \n delimited text (eg. generated by 'nc') // Note that no duplication in storage level only for running locally. // Replication necessary in distributed scenario for fault tolerance. val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 136
Source File: CountingInAStreamExpUpdateStateByKey.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.dstream import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} object CountingInAStreamExpUpdateStateByKey { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(1)) ssc.checkpoint(checkpointFolder) val lines = ssc.socketTextStream(host, port.toInt) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(word => (word, 1)) .updateStateByKey((values: Seq[(Int)], state: Option[(Int)]) => { var value = state.getOrElse(0) values.foreach(i => { value += i }) Some(value) }) wordCounts.foreachRDD(rdd => { println("{") val localCollection = rdd.collect() println(" size:" + localCollection.length) localCollection.foreach(r => println(" " + r)) println("}") }) ssc.start() ssc.awaitTermination() } }
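A sketch of keeping the same running word count with mapWithState instead of updateStateByKey, which only touches keys present in the current batch; lines refers to the socket DStream in the listing above, and the function name is illustrative:

import org.apache.spark.streaming.{State, StateSpec}

// Sketch: incremental word totals with mapWithState (requires ssc.checkpoint, already set above).
val updateTotals = (word: String, one: Option[Int], state: State[Int]) => {
  val newTotal = one.getOrElse(0) + state.getOption.getOrElse(0)
  state.update(newTotal)
  (word, newTotal)
}

val wordCounts = lines
  .flatMap(_.split(" "))
  .map(word => (word, 1))
  .mapWithState(StateSpec.function(updateTotals))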
Example 137
Source File: CountingInAStreamExpBatchCounting.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.dstream import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} object CountingInAStreamExpBatchCounting { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .master("local[3]") .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(2)) ssc.checkpoint(checkpointFolder) val lines = ssc.socketTextStream(host, port.toInt) val words = lines.flatMap(line => line.toLowerCase.split(" ")) val wordCounts = words.map(word => (word, 1)) .reduceByKey((a,b) => a + b) wordCounts.foreachRDD(rdd => { println("{") val localCollection = rdd.collect() println(" size:" + localCollection.length) localCollection.foreach(r => println(" " + r)) println("}") }) ssc.start() ssc.awaitTermination() } }
Example 138
Source File: EnrichmentInAStream.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.dstream import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} object EnrichmentInAStream { def main(args:Array[String]): Unit = { val host = args(0) val port = args(1) val checkpointFolder = args(2) val isLocal = true val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(1)) ssc.checkpoint(checkpointFolder) val lines = ssc.socketTextStream(host, port.toInt) val words = lines.flatMap(_.split(" ")) words.foreachRDD(rdd => rdd.foreachPartition(wordIt => { //make connection to storage layer // May use static connection wordIt.foreach(word => { word.toUpperCase //write to storage location }) })) ssc.start() ssc.awaitTermination() } }
Example 139
Source File: AMQPServerStreamSuite.scala From streaming-amqp with Apache License 2.0 | 5 votes |
package io.radanalytics.streaming.amqp import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.amqp.AMQPUtils import org.apache.spark.streaming.{Duration, Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkFunSuite} import org.scalatest.BeforeAndAfter import org.scalatest.concurrent.Eventually import scala.concurrent.duration._ class AMQPServerStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfter { private val batchDuration: Duration = Seconds(1) private val master: String = "local[2]" private val appName: String = this.getClass().getSimpleName() private val address: String = "my_address" private val checkpointDir: String = "/tmp/spark-streaming-amqp-tests" private var conf: SparkConf = _ private var ssc: StreamingContext = _ private var amqpTestUtils: AMQPTestUtils = _ before { conf = new SparkConf().setMaster(master).setAppName(appName) conf.set("spark.streaming.receiver.writeAheadLog.enable", "true") ssc = new StreamingContext(conf, batchDuration) ssc.checkpoint(checkpointDir) amqpTestUtils = new AMQPTestUtils() amqpTestUtils.setup() } after { if (ssc != null) { ssc.stop() } if (amqpTestUtils != null) { amqpTestUtils.teardown() } } test("AMQP receive server") { val sendMessage = "Spark Streaming & AMQP" val max = 10 val delay = 100l amqpTestUtils.startAMQPServer(sendMessage, max, delay) val converter = new AMQPBodyFunction[String] val receiveStream = AMQPUtils.createStream(ssc, amqpTestUtils.host, amqpTestUtils.port, amqpTestUtils.username, amqpTestUtils.password, address, converter, StorageLevel.MEMORY_ONLY) var receivedMessage: List[String] = List() receiveStream.foreachRDD(rdd => { if (!rdd.isEmpty()) { receivedMessage = receivedMessage ::: rdd.collect().toList } }) ssc.start() eventually(timeout(10000 milliseconds), interval(1000 milliseconds)) { assert(receivedMessage.length == max) } ssc.stop() amqpTestUtils.stopAMQPServer() } }
Example 140
Source File: ReceiverWithoutOffsetIT.scala From datasource-receiver with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.datasource import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.datasource.models.{InputSentences, StopConditions} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class ReceiverWithoutOffsetIT extends TemporalDataSuite { test("DataSource Receiver should read all the records on each batch without offset conditions") { sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) val rdd = sc.parallelize(registers) sqlContext.createDataFrame(rdd, schema).registerTempTable(tableName) ssc = new StreamingContext(sc, Seconds(1)) val totalEvents = ssc.sparkContext.accumulator(0L, "Number of events received") val inputSentences = InputSentences( s"select * from $tableName", StopConditions(stopWhenEmpty = true, finishContextWhenEmpty = true), initialStatements = Seq.empty[String] ) val distributedStream = DatasourceUtils.createStream(ssc, inputSentences, datasourceParams) distributedStream.start() distributedStream.foreachRDD(rdd => { val streamingEvents = rdd.count() log.info(s" EVENTS COUNT : \t $streamingEvents") totalEvents += streamingEvents log.info(s" TOTAL EVENTS : \t $totalEvents") if (!rdd.isEmpty()) assert(streamingEvents === totalRegisters.toLong) }) ssc.start() ssc.awaitTerminationOrTimeout(10000L) assert(totalEvents.value === totalRegisters.toLong * 10) } }
Example 141
Source File: ReceiverNotStopContextIT.scala From datasource-receiver with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.datasource import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetConditions, OffsetField} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class ReceiverNotStopContextIT extends TemporalDataSuite { test("DataSource Receiver should read all the records in one streaming batch") { sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) val rdd = sc.parallelize(registers) sqlContext.createDataFrame(rdd, schema).registerTempTable(tableName) ssc = new StreamingContext(sc, Seconds(1)) val totalEvents = ssc.sparkContext.accumulator(0L, "Number of events received") val inputSentences = InputSentences( s"select * from $tableName", OffsetConditions(OffsetField("idInt")), initialStatements = Seq.empty[String] ) val distributedStream = DatasourceUtils.createStream(ssc, inputSentences, datasourceParams) distributedStream.start() distributedStream.foreachRDD(rdd => { totalEvents += rdd.count() }) ssc.start() ssc.awaitTerminationOrTimeout(15000L) assert(totalEvents.value === totalRegisters.toLong) } }
Example 142
Source File: ReceiverLimitedIT.scala From datasource-receiver with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.datasource import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetConditions, OffsetField, StopConditions} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class ReceiverLimitedIT extends TemporalDataSuite { test("DataSource Receiver should read the records limited on each batch") { sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) val rdd = sc.parallelize(registers) sqlContext.createDataFrame(rdd, schema).registerTempTable(tableName) ssc = new StreamingContext(sc, Seconds(1)) val totalEvents = ssc.sparkContext.accumulator(0L, "Number of events received") val inputSentences = InputSentences( s"select * from $tableName", OffsetConditions(OffsetField("idInt"), limitRecords = 1000), StopConditions(stopWhenEmpty = true, finishContextWhenEmpty = true), initialStatements = Seq.empty[String] ) val distributedStream = DatasourceUtils.createStream(ssc, inputSentences, datasourceParams) // Start up the receiver. distributedStream.start() // Fires each time the configured window has passed. distributedStream.foreachRDD(rdd => { totalEvents += rdd.count() }) ssc.start() // Start the computation ssc.awaitTerminationOrTimeout(15000L) // Wait for the computation to terminate assert(totalEvents.value === totalRegisters.toLong) } }
Example 143
Source File: ReceiverBasicIT.scala From datasource-receiver with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.datasource import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetConditions, OffsetField, StopConditions} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class ReceiverBasicIT extends TemporalDataSuite { test ("DataSource Receiver should read all the records in one streaming batch") { sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) val rdd = sc.parallelize(registers) sqlContext.createDataFrame(rdd, schema).registerTempTable(tableName) ssc = new StreamingContext(sc, Seconds(1)) val totalEvents = ssc.sparkContext.accumulator(0L, "Number of events received") val inputSentences = InputSentences( s"select * from $tableName", OffsetConditions(OffsetField("idInt")), StopConditions(stopWhenEmpty = true, finishContextWhenEmpty = true), initialStatements = Seq.empty[String] ) val distributedStream = DatasourceUtils.createStream(ssc, inputSentences, datasourceParams) distributedStream.start() distributedStream.foreachRDD(rdd => { val streamingEvents = rdd.count() log.info(s" EVENTS COUNT : \t $streamingEvents") totalEvents += streamingEvents log.info(s" TOTAL EVENTS : \t $totalEvents") val streamingRegisters = rdd.collect() if (!rdd.isEmpty()) assert(streamingRegisters === registers.reverse) }) ssc.start() ssc.awaitTerminationOrTimeout(15000L) assert(totalEvents.value === totalRegisters.toLong) } }
Example 144
Source File: StreamHQL.scala From spark-cep with Apache License 2.0 | 5 votes |
import java.util.Properties import kafka.consumer.ConsumerConfig import org.I0Itec.zkclient.ZkClient import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.streaming.StreamSQLContext import org.apache.spark.sql.streaming.sources.MessageDelimiter import org.apache.spark.streaming.dstream.ConstantInputDStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} import redis.RedisManager import scala.util.parsing.json.JSON class TabDelimiter extends MessageDelimiter { override val delimiter = "\t" } object StreamDDL { def main(args: Array[String]): Unit = { Logger.getRootLogger.setLevel(Level.WARN) val query = args(0) val sc = new SparkContext(new SparkConf()) val ssc = new StreamingContext(sc, Seconds(1)) val streamSqlContext = new StreamSQLContext(ssc, new HiveContext(sc)) streamSqlContext.command(query) new ConstantInputDStream[Int](ssc, sc.parallelize(Seq(1))).print ssc.start() ssc.awaitTerminationOrTimeout(100) ssc.stop() } } object StreamHQL { object Redis { var initialized = false var manager: RedisManager = _ def init(confMap: Map[String, String]) { if (initialized == false) { manager = new RedisManager( confMap("redis.shards"), confMap("redis.sentinels"), confMap("redis.database").toInt) manager.init initialized = true } } } def removeConsumerGroup(zkQuorum: String, groupId: String) { val properties = new Properties() properties.put("zookeeper.connect", zkQuorum) properties.put("group.id", groupId) val conf = new ConsumerConfig(properties) val zkClient = new ZkClient(conf.zkConnect) zkClient.deleteRecursive(s"/consumers/${conf.groupId}") zkClient.close() } def main(args: Array[String]): Unit = { Logger.getRootLogger.setLevel(Level.WARN) val confMap = JSON.parseFull(args(0)).get.asInstanceOf[Map[String, String]] val qid = args(1) val query = args(2) val sc = new SparkContext(new SparkConf()) val ssc = new StreamingContext(sc, Seconds(1)) val hc = new HiveContext(sc) val streamSqlContext = new StreamSQLContext(ssc, hc) val redisExpireSec = confMap("redis.expire.sec").toInt ssc.checkpoint(s"checkpoint/$qid") hc.setConf("spark.streaming.query.id", qid) hc.setConf("spark.sql.shuffle.partitions", confMap("spark.sql.shuffle.partitions")) removeConsumerGroup(confMap("kafka.zookeeper.quorum"), qid) val result = streamSqlContext.sql(query) val schema = result.schema result.foreachRDD((rdd, time) => { rdd.foreachPartition(partition => { Redis.init(confMap) val jedis = Redis.manager.getResource val pipe = jedis.pipelined partition.foreach(record => { val seq = record.toSeq(schema) val ts = time.milliseconds / 1000 val hkey = seq.take(seq.size - 1).mkString(".") pipe.hset(qid + "." + ts, hkey, seq(seq.size - 1).toString) pipe.expire(qid + "." + ts, redisExpireSec) }) pipe.sync Redis.manager.returnResource(jedis) }) }) ssc.start() ssc.awaitTermination() ssc.stop() } }
Example 145
Source File: AkkaUtilsSuite.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.akka import scala.concurrent.duration._ import akka.actor.{Props, SupervisorStrategy} import org.apache.spark.SparkFunSuite import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.dstream.ReceiverInputDStream class AkkaUtilsSuite extends SparkFunSuite { test("createStream") { val ssc: StreamingContext = new StreamingContext("local[2]", "test", Seconds(1000)) try { // tests the API, does not actually test data receiving val test1: ReceiverInputDStream[String] = AkkaUtils.createStream( ssc, Props[TestActor](), "test") val test2: ReceiverInputDStream[String] = AkkaUtils.createStream( ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2) val test3: ReceiverInputDStream[String] = AkkaUtils.createStream( ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, supervisorStrategy = SupervisorStrategy.defaultStrategy) val test4: ReceiverInputDStream[String] = AkkaUtils.createStream( ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null) val test5: ReceiverInputDStream[String] = AkkaUtils.createStream( ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null) val test6: ReceiverInputDStream[String] = AkkaUtils.createStream( ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null, SupervisorStrategy.defaultStrategy) } finally { ssc.stop() } } } class TestActor extends ActorReceiver { override def receive: Receive = { case m: String => store(m) case m => store(m, 10.seconds) } }
Example 146
Source File: CloudantStreaming.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.sql.cloudant import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext, Time} import org.apache.bahir.cloudant.CloudantReceiver object CloudantStreaming { def main(args: Array[String]) { val spark = SparkSession.builder() .appName("Cloudant Spark SQL External Datasource in Scala") .master("local[*]") .getOrCreate() // Create the context with a 10 seconds batch size val ssc = new StreamingContext(spark.sparkContext, Seconds(10)) import spark.implicits._ val changes = ssc.receiverStream(new CloudantReceiver(spark.sparkContext.getConf, Map( "cloudant.host" -> "examples.cloudant.com", "database" -> "sales"))) changes.foreachRDD((rdd: RDD[String], time: Time) => { // Get the singleton instance of SparkSession println(s"========= $time =========")// scalastyle:ignore // Convert RDD[String] to Dataset[String] val changesDataFrame = spark.read.json(rdd.toDS()) if (changesDataFrame.schema.nonEmpty) { changesDataFrame.printSchema() var hasDelRecord = false var hasMonth = false for (field <- changesDataFrame.schema.fieldNames) { if ("_deleted".equals(field)) { hasDelRecord = true } if ("month".equals(field)) { hasMonth = true } } if (hasDelRecord) { changesDataFrame.filter(changesDataFrame("_deleted")).select("*").show() } if (hasMonth) { changesDataFrame.filter(changesDataFrame("month") === "May").select("*").show(5) changesDataFrame.createOrReplaceTempView("sales") val salesInMayCountsDataFrame = spark.sql( s""" |select rep, amount |from sales |where month = "May" """.stripMargin) salesInMayCountsDataFrame.show(5) } } }) ssc.start() // run streaming for 60 secs Thread.sleep(60000L) ssc.stop(true) } }
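CloudantStreaming runs for a fixed 60 seconds and then calls ssc.stop(true), which also shuts down the underlying SparkContext. When the SparkSession should outlive the stream, the two-argument stop lets you keep it and drain in-flight batches first. A minimal sketch of that shutdown pattern (the socket source is only a placeholder):

import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

object GracefulStopSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("GracefulStopSketch")
      .getOrCreate()
    val ssc = new StreamingContext(spark.sparkContext, Seconds(10))
    ssc.socketTextStream("localhost", 9999).print()
    ssc.start()
    ssc.awaitTerminationOrTimeout(60000L)
    // Keep the SparkSession alive and let queued batches finish before stopping
    ssc.stop(stopSparkContext = false, stopGracefully = true)
    spark.stop()
  }
}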
Example 147
Source File: CloudantStreamingSelector.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.sql.cloudant import java.util.concurrent.atomic.AtomicLong import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{ Seconds, StreamingContext, Time } import org.apache.bahir.cloudant.CloudantReceiver object CloudantStreamingSelector { def main(args: Array[String]) { val spark = SparkSession.builder() .appName("Cloudant Spark SQL External Datasource in Scala") .master("local[*]") .getOrCreate() import spark.implicits._ // Create the context with a 10 seconds batch size val ssc = new StreamingContext(spark.sparkContext, Seconds(10)) val curTotalAmount = new AtomicLong(0) val curSalesCount = new AtomicLong(0) var batchAmount = 0L val changes = ssc.receiverStream(new CloudantReceiver(spark.sparkContext.getConf, Map( "cloudant.host" -> "examples.cloudant.com", "database" -> "sales", "selector" -> "{\"month\":\"May\", \"rep\":\"John\"}"))) changes.foreachRDD((rdd: RDD[String], time: Time) => { // Get the singleton instance of SQLContext println(s"========= $time =========") // scalastyle:ignore val changesDataFrame = spark.read.json(rdd.toDS()) if (changesDataFrame.schema.nonEmpty) { changesDataFrame.select("*").show() batchAmount = changesDataFrame.groupBy().sum("amount").collect()(0).getLong(0) curSalesCount.getAndAdd(changesDataFrame.count()) curTotalAmount.getAndAdd(batchAmount) println("Current sales count:" + curSalesCount)// scalastyle:ignore println("Current total amount:" + curTotalAmount)// scalastyle:ignore } else { ssc.stop() } }) ssc.start() ssc.awaitTermination() } }
Example 148
Source File: ZeroMQWordCount.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming.zeromq import scala.language.implicitConversions import scala.util.Random import org.apache.log4j.{Level, Logger} import org.zeromq.ZContext import org.zeromq.ZMQ import org.zeromq.ZMQException import org.zeromq.ZMsg import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.zeromq.ZeroMQUtils object ZeroMQWordCount { def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println System.err.println("Usage: ZeroMQWordCount <zeroMqUrl> <topic>") // scalastyle:on println System.exit(1) } // Set logging level if log4j not configured (override by adding log4j.properties to classpath). Logger.getRootLogger.setLevel(Level.WARN) val Seq(url, topic) = args.toSeq val sparkConf = new SparkConf().setAppName("ZeroMQWordCount") // Check Spark configuration for master URL, set it to local if not present. if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } // Create the context and set the batch size. val ssc = new StreamingContext(sparkConf, Seconds(2)) val lines = ZeroMQUtils.createTextStream( ssc, url, true, Seq(topic.getBytes) ) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } }
Example 149
Source File: TwitterLocations.scala From bahir with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.twitter import org.apache.log4j.{Level, Logger} import twitter4j.FilterQuery import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter._ object TwitterLocations { def main(args: Array[String]) { if (args.length < 4 || args.length % 4 != 0) { System.err.println("Usage: TwitterLocations <consumer key> <consumer secret> " + "<access token> <access token secret> " + "[<latitude-south-west> <longitude-south-west>" + " <latitude-north-east> <longitude-north-east> ...]") System.exit(1) } // Set logging level if log4j not configured (override by adding log4j.properties to classpath) if (!Logger.getRootLogger.getAllAppenders.hasMoreElements) { Logger.getRootLogger.setLevel(Level.WARN) } // Set the system properties so that Twitter4j library used by twitter stream // can use them to generate OAuth credentials val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4) System.setProperty("twitter4j.oauth.consumerKey", consumerKey) System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret) System.setProperty("twitter4j.oauth.accessToken", accessToken) System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret) // Get bounding boxes of locations for which to retrieve Tweets from command line val locationArgs = args.takeRight(args.length - 4) val boundingBoxes = if (locationArgs.length == 0) { System.out.println("No location bounding boxes specified, using defaults for New York City") val nycSouthWest = Array(-74.0, 40.0) val nycNorthEast = Array(-73.0, 41.0) Array(nycSouthWest, nycNorthEast) } else { locationArgs.map(_.toDouble).sliding(2, 2).toArray } val sparkConf = new SparkConf().setAppName("TwitterLocations") // check Spark configuration for master URL, set it to local if not configured if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } val ssc = new StreamingContext(sparkConf, Seconds(2)) val locationsQuery = new FilterQuery().locations(boundingBoxes : _*) // Print Tweets from the specified coordinates // This includes Tweets geo-tagged in the bounding box defined by the coordinates // As well as Tweets tagged in places inside of the bounding box TwitterUtils.createFilteredStream(ssc, None, Some(locationsQuery)) .map(tweet => { val latitude = Option(tweet.getGeoLocation).map(l => s"${l.getLatitude},${l.getLongitude}") val place = Option(tweet.getPlace).map(_.getName) val location = latitude.getOrElse(place.getOrElse("(no location)")) val text = tweet.getText.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ') s"$location\t$text" }) .print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 150
Source File: TwitterAlgebirdHLL.scala From bahir with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.twitter import com.twitter.algebird.HyperLogLog._ import com.twitter.algebird.HyperLogLogMonoid import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter._ // scalastyle:off val BIT_SIZE = 12 val filters = args val sparkConf = new SparkConf().setAppName("TwitterAlgebirdHLL") // check Spark configuration for master URL, set it to local if not configured if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } val ssc = new StreamingContext(sparkConf, Seconds(5)) val stream = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_ONLY_SER) val users = stream.map(status => status.getUser.getId) val hll = new HyperLogLogMonoid(BIT_SIZE) var globalHll = hll.zero var userSet: Set[Long] = Set() val approxUsers = users.mapPartitions(ids => { ids.map(id => hll.create(id)) }).reduce(_ + _) val exactUsers = users.map(id => Set(id)).reduce(_ ++ _) approxUsers.foreachRDD(rdd => { if (rdd.count() != 0) { val partial = rdd.first() globalHll += partial println("Approx distinct users this batch: %d".format(partial.estimatedSize.toInt)) println("Approx distinct users overall: %d".format(globalHll.estimatedSize.toInt)) } }) exactUsers.foreachRDD(rdd => { if (rdd.count() != 0) { val partial = rdd.first() userSet ++= partial println("Exact distinct users this batch: %d".format(partial.size)) println("Exact distinct users overall: %d".format(userSet.size)) println("Error rate: %2.5f%%".format(((globalHll.estimatedSize / userSet.size.toDouble) - 1 ) * 100)) } }) ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 151
Source File: TwitterPopularTags.scala From bahir with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.twitter import org.apache.log4j.{Level, Logger} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter._ import org.apache.spark.SparkConf object TwitterPopularTags { def main(args: Array[String]) { if (args.length < 4) { System.err.println("Usage: TwitterPopularTags <consumer key> <consumer secret> " + "<access token> <access token secret> [<filters>]") System.exit(1) } // Set logging level if log4j not configured (override by adding log4j.properties to classpath) if (!Logger.getRootLogger.getAllAppenders.hasMoreElements) { Logger.getRootLogger.setLevel(Level.WARN) } val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4) val filters = args.takeRight(args.length - 4) // Set the system properties so that Twitter4j library used by twitter stream // can use them to generate OAuth credentials System.setProperty("twitter4j.oauth.consumerKey", consumerKey) System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret) System.setProperty("twitter4j.oauth.accessToken", accessToken) System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret) val sparkConf = new SparkConf().setAppName("TwitterPopularTags") // check Spark configuration for master URL, set it to local if not configured if (!sparkConf.contains("spark.master")) { sparkConf.setMaster("local[2]") } val ssc = new StreamingContext(sparkConf, Seconds(2)) val stream = TwitterUtils.createStream(ssc, None, filters) val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#"))) val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) .map{case (topic, count) => (count, topic)} .transform(_.sortByKey(false)) val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) .map{case (topic, count) => (count, topic)} .transform(_.sortByKey(false)) // Print popular hashtags topCounts60.foreachRDD(rdd => { val topList = rdd.take(10) println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} }) topCounts10.foreachRDD(rdd => { val topList = rdd.take(10) println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} }) ssc.start() ssc.awaitTermination() } } // scalastyle:on println
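TwitterPopularTags layers Seconds(60) and Seconds(10) windows on top of a Seconds(2) batch interval; Spark Streaming requires window and slide durations to be whole multiples of the batch interval, otherwise the job fails during DStream validation. A small sketch of a windowed count with the slide duration written out explicitly (the socket source is just a stand-in for the Twitter stream):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object WindowedCountSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("WindowedCountSketch")
    val ssc = new StreamingContext(conf, Seconds(2))     // batch interval
    val words = ssc.socketTextStream("localhost", 9999).flatMap(_.split(" "))
    // window = 60 s, slide = 10 s: both are multiples of the 2 s batch interval
    val counts = words.map((_, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(60), Seconds(10))
    counts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}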
Example 152
Source File: TwitterStreamSuite.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.twitter import java.util.UUID import scala.collection.mutable import org.scalatest.BeforeAndAfter import org.scalatest.concurrent.Eventually import org.scalatest.time import org.scalatest.time.Span import twitter4j.{FilterQuery, Status, TwitterFactory} import twitter4j.auth.{Authorization, NullAuthorization} import org.apache.spark.ConditionalSparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.dstream.ReceiverInputDStream class TwitterStreamSuite extends ConditionalSparkFunSuite with Eventually with BeforeAndAfter with Logging { def shouldRunTest(): Boolean = sys.env.get("ENABLE_TWITTER_TESTS").contains("1") var ssc: StreamingContext = _ before { ssc = new StreamingContext("local[2]", this.getClass.getSimpleName, Seconds(1)) } after { if (ssc != null) { ssc.stop() } } test("twitter input stream") { val filters = Seq("filter1", "filter2") val query = new FilterQuery().language("fr,es") val authorization: Authorization = NullAuthorization.getInstance() // tests the API, does not actually test data receiving val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None) val test2: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters) val test3: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2) val test4: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization)) val test5: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization), filters) val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream( ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2) val test7: ReceiverInputDStream[Status] = TwitterUtils.createFilteredStream( ssc, Some(authorization), Some(query), StorageLevel.MEMORY_AND_DISK_SER_2) } testIf("messages received", () => TwitterStreamSuite.this.shouldRunTest()) { val userId = TwitterFactory.getSingleton.updateStatus( UUID.randomUUID().toString ).getUser.getId val receiveStream = TwitterUtils.createFilteredStream( ssc, None, Some(new FilterQuery().follow(userId)) ) @volatile var receivedMessages: mutable.Set[Status] = mutable.Set() receiveStream.foreachRDD { rdd => for (element <- rdd.collect()) { receivedMessages += element } receivedMessages } ssc.start() val nbOfMsg = 2 var publishedMessages: List[String] = List() (1 to nbOfMsg).foreach( _ => { publishedMessages = UUID.randomUUID().toString :: publishedMessages } ) eventually(timeout(Span(15, time.Seconds)), interval(Span(1000, time.Millis))) { publishedMessages.foreach( m => if (!receivedMessages.map(m => m.getText).contains(m.toString)) { TwitterFactory.getSingleton.updateStatus(m) } ) assert( publishedMessages.map(m => m.toString).toSet .subsetOf(receivedMessages.map(m => m.getText)) ) } } }
Example 153
Source File: FlumeWordCount.scala From ml-in-scala with The Unlicense | 5 votes |
package org.akozlov.chapter03 import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.flume._ object FlumeWordCount { def main(args: Array[String]) { // Create the context with a 2 second batch size val sparkConf = new SparkConf().setMaster("local[2]").setAppName("FlumeWordCount") val ssc = new StreamingContext(sparkConf, Seconds(2)) ssc.checkpoint("/tmp/flume_check") val hostPort=args(0).split(":") System.out.println("Opening a sink at host: [" + hostPort(0) + "] port: [" + hostPort(1).toInt + "]") val lines = FlumeUtils.createPollingStream(ssc, hostPort(0), hostPort(1).toInt, StorageLevel.MEMORY_ONLY) val words = lines .map(e => new String(e.event.getBody.array)).map(_.toLowerCase).flatMap(_.split("\\W+")) .map(word => (word, 1L)) .reduceByKeyAndWindow(_+_, _-_, Seconds(6), Seconds(2)).print ssc.start() ssc.awaitTermination() } }
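FlumeWordCount uses the incremental form of reduceByKeyAndWindow, passing an inverse function (_ - _) so counts leaving the window are subtracted instead of recomputed; that form requires checkpointing, which is why the example calls ssc.checkpoint. A minimal sketch of the same pattern in isolation (checkpoint path and socket source are placeholders):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object IncrementalWindowSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("IncrementalWindowSketch")
    val ssc = new StreamingContext(conf, Seconds(2))
    ssc.checkpoint("/tmp/incremental_window_check")      // required by the inverse-reduce form
    val words = ssc.socketTextStream("localhost", 9999).flatMap(_.split("\\W+"))
    val counts = words.map(word => (word, 1L))
      .reduceByKeyAndWindow(_ + _, _ - _, Seconds(6), Seconds(2))
    counts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}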
Example 154
Source File: KafkaWordCount.scala From ml-in-scala with The Unlicense | 5 votes |
package org.akozlov.chapter03 import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.kafka._ object KafkaWordCount { def main(args: Array[String]) { // Create the context with a 2 second batch size val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KafkaWordCount") val ssc = new StreamingContext(sparkConf, Seconds(2)) ssc.checkpoint("/tmp/kafka_check") System.out.println("Opening a Kafka consumer at zk: [" + args(0) + "] for group group-1 and topic example") val lines = KafkaUtils.createStream(ssc, args(0), "group-1", Map("example" -> 1), StorageLevel.MEMORY_ONLY) val words = lines .flatMap(_._2.toLowerCase.split("\\W+")) .map(word => (word, 1L)) .reduceByKeyAndWindow(_+_, _-_, Seconds(6), Seconds(2)).print ssc.start() ssc.awaitTermination() } }
Example 155
Source File: SparkCommon.scala From spark-tutorial with Apache License 2.0 | 5 votes |
package com.tutorial.utils import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object SparkCommon { lazy val conf = { new SparkConf(false) .setMaster("local[*]") .setAppName("Spark Tutorial") } lazy val sparkContext = new SparkContext(conf) lazy val sparkSQLContext = SQLContext.getOrCreate(sparkContext) lazy val streamingContext = StreamingContext.getActive() .getOrElse(new StreamingContext(sparkContext, Seconds(2))) }
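SparkCommon assembles its streaming context with StreamingContext.getActive().getOrElse(...). Spark also provides getActiveOrCreate, which does the same lookup-or-create in one call; a minimal sketch reusing the same local settings (the queue source only exists so there is an output operation to run):

import scala.collection.mutable.Queue
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object ActiveContextSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("ActiveContextSketch")
    // Returns the active StreamingContext if one exists, otherwise creates one
    val ssc = StreamingContext.getActiveOrCreate(() => new StreamingContext(conf, Seconds(2)))
    val rddQueue = new Queue[RDD[Int]]()
    ssc.queueStream(rddQueue).count().print()
    ssc.start()
    rddQueue.synchronized { rddQueue += ssc.sparkContext.makeRDD(1 to 100) }
    ssc.awaitTerminationOrTimeout(5000L)
    ssc.stop()
  }
}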
Example 156
Source File: StreamingTestExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.mllib.stat.test.{BinarySample, StreamingTest} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.util.Utils object StreamingTestExample { def main(args: Array[String]) { if (args.length != 3) { // scalastyle:off println System.err.println( "Usage: StreamingTestExample " + "<dataDir> <batchDuration> <numBatchesTimeout>") // scalastyle:on println System.exit(1) } val dataDir = args(0) val batchDuration = Seconds(args(1).toLong) val numBatchesTimeout = args(2).toInt val conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample") val ssc = new StreamingContext(conf, batchDuration) ssc.checkpoint { val dir = Utils.createTempDir() dir.toString } // $example on$ val data = ssc.textFileStream(dataDir).map(line => line.split(",") match { case Array(label, value) => BinarySample(label.toBoolean, value.toDouble) }) val streamingTest = new StreamingTest() .setPeacePeriod(0) .setWindowSize(0) .setTestMethod("welch") val out = streamingTest.registerStream(data) out.print() // $example off$ // Stop processing if test becomes significant or we time out var timeoutCounter = numBatchesTimeout out.foreachRDD { rdd => timeoutCounter -= 1 val anySignificant = rdd.map(_.pValue < 0.05).fold(false)(_ || _) if (timeoutCounter == 0 || anySignificant) rdd.context.stop() } ssc.start() ssc.awaitTermination() } }
Example 157
Source File: StreamingKMeansExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} // $example off$ object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } // $example on$ val conf = new SparkConf().setAppName("StreamingKMeansExample") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(args(3).toInt) .setDecayFactor(1.0) .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ } } // scalastyle:on println
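The StreamingKMeans examples read their input with Vectors.parse and LabeledPoint.parse, so the files dropped into <trainingDir> and <testDir> have to use MLlib's text formats: a dense vector is written as [1.0,2.0] and a labeled point as (1.0,[1.0,2.0]). A small sketch that writes a few such lines locally (paths are illustrative only):

import java.io.{File, PrintWriter}

object KMeansInputSketch {
  def main(args: Array[String]): Unit = {
    // Training file: one vector per line, in the format Vectors.parse expects
    new File("/tmp/kmeans-train").mkdirs()
    val train = new PrintWriter(new File("/tmp/kmeans-train/part-0"))
    Seq("[0.0,0.0]", "[1.0,1.0]", "[9.0,9.0]").foreach(train.println)
    train.close()
    // Test file: one labeled point per line, in the format LabeledPoint.parse expects
    new File("/tmp/kmeans-test").mkdirs()
    val test = new PrintWriter(new File("/tmp/kmeans-test/part-0"))
    Seq("(0.0,[0.1,0.1])", "(1.0,[8.9,9.1])").foreach(test.println)
    test.close()
  }
}

Keep in mind that textFileStream only picks up files that appear in the directory after the streaming context has started, so new files should be written or moved in while the job is running.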
Example 158
Source File: QueueStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import scala.collection.mutable.Queue import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Seconds, StreamingContext} object QueueStream { def main(args: Array[String]) { StreamingExamples.setStreamingLogLevels() val sparkConf = new SparkConf().setAppName("QueueStream") // Create the context val ssc = new StreamingContext(sparkConf, Seconds(1)) // Create the queue through which RDDs can be pushed to // a QueueInputDStream val rddQueue = new Queue[RDD[Int]]() // Create the QueueInputDStream and use it do some processing val inputStream = ssc.queueStream(rddQueue) val mappedStream = inputStream.map(x => (x % 10, 1)) val reducedStream = mappedStream.reduceByKey(_ + _) reducedStream.print() ssc.start() // Create and push some RDDs into rddQueue for (i <- 1 to 30) { rddQueue.synchronized { rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10) } Thread.sleep(1000) } ssc.stop() } }
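Because queueStream turns hand-built RDDs into batches, the pattern in QueueStream is also a convenient way to exercise a DStream transformation in a test without any external source. A minimal sketch, assuming the logic under test is a word count (oneAtATime = false folds everything queued into the next batch):

import scala.collection.mutable.Queue
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object QueueStreamTestSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("QueueStreamTestSketch")
    val ssc = new StreamingContext(conf, Seconds(1))
    val rddQueue = new Queue[RDD[String]]()
    rddQueue += ssc.sparkContext.makeRDD(Seq("a b", "b c"))
    // oneAtATime = false drains every queued RDD into a single batch
    val counts = ssc.queueStream(rddQueue, oneAtATime = false)
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
    counts.foreachRDD(rdd => rdd.collect().foreach(println))   // expect (a,1), (b,2), (c,1)
    ssc.start()
    ssc.awaitTerminationOrTimeout(3000L)
    ssc.stop()
  }
}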
Example 159
Source File: CustomReceiver.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import java.io.{BufferedReader, InputStreamReader} import java.net.Socket import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.receiver.Receiver private def receive() { var socket: Socket = null var userInput: String = null try { logInfo("Connecting to " + host + ":" + port) socket = new Socket(host, port) logInfo("Connected to " + host + ":" + port) val reader = new BufferedReader( new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)) userInput = reader.readLine() while(!isStopped && userInput != null) { store(userInput) userInput = reader.readLine() } reader.close() socket.close() logInfo("Stopped receiving") restart("Trying to connect again") } catch { case e: java.net.ConnectException => restart("Error connecting to " + host + ":" + port, e) case t: Throwable => restart("Error receiving data", t) } } } // scalastyle:on println
Example 160
Source File: SqlNetworkWordCount.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext, Time} object SparkSessionSingleton { @transient private var instance: SparkSession = _ def getInstance(sparkConf: SparkConf): SparkSession = { if (instance == null) { instance = SparkSession .builder .config(sparkConf) .getOrCreate() } instance } } // scalastyle:on println
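The listing above only shows the SparkSessionSingleton helper; in the full SqlNetworkWordCount example it is used inside foreachRDD to fetch one shared session per JVM and run SQL over each batch of words. The following is a sketch of that usage rather than the verbatim file, and it assumes the SparkSessionSingleton object from the listing is in scope:

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

case class Record(word: String)

object SqlNetworkWordCountSketch {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("SqlNetworkWordCountSketch")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    val words = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
      .flatMap(_.split(" "))
    words.foreachRDD { (rdd, time: Time) =>
      // Reuse a single SparkSession instead of building a new one for every batch
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._
      val wordsDataFrame = rdd.map(w => Record(w)).toDF()
      wordsDataFrame.createOrReplaceTempView("words")
      val counts = spark.sql("select word, count(*) as total from words group by word")
      println(s"========= $time =========")
      counts.show()
    }
    ssc.start()
    ssc.awaitTermination()
  }
}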
Example 161
Source File: HdfsWordCount.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} object HdfsWordCount { def main(args: Array[String]) { if (args.length < 1) { System.err.println("Usage: HdfsWordCount <directory>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val sparkConf = new SparkConf().setAppName("HdfsWordCount") // Create the context val ssc = new StreamingContext(sparkConf, Seconds(2)) // Create the FileInputDStream on the directory and use the // stream to count words in new files created val lines = ssc.textFileStream(args(0)) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 162
Source File: NetworkWordCount.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} object NetworkWordCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: NetworkWordCount <hostname> <port>") System.exit(1) } StreamingExamples.setStreamingLogLevels() // Create the context with a 1 second batch size val sparkConf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(sparkConf, Seconds(1)) // Create a socket stream on target ip:port and count the // words in input stream of \n delimited text (eg. generated by 'nc') // Note that no duplication in storage level only for running locally. // Replication necessary in distributed scenario for fault tolerance. val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 163
Source File: LinearRegression.scala From twitter-stream-ml with GNU General Public License v3.0 | 5 votes |
package com.giorgioinf.twtml.spark import org.apache.spark.{Logging, SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter.TwitterUtils object LinearRegression extends Logging { def main(args: Array[String]) { log.info("Parsing applications arguments") val conf = new ConfArguments() .setAppName("twitter-stream-ml-linear-regression") .parse(args.toList) log.info("Initializing session stats...") val session = new SessionStats(conf).open log.info("Initializing Spark Machine Learning Model...") MllibHelper.reset(conf) val model = new StreamingLinearRegressionWithSGD() .setNumIterations(conf.numIterations) .setStepSize(conf.stepSize) .setMiniBatchFraction(conf.miniBatchFraction) .setInitialWeights(Vectors.zeros(MllibHelper.numFeatures)) log.info("Initializing Spark Context...") val sc = new SparkContext(conf.sparkConf) log.info("Initializing Streaming Spark Context... {} sec/batch", conf.seconds) val ssc = new StreamingContext(sc, Seconds(conf.seconds)) log.info("Initializing Twitter stream...") val stream = TwitterUtils.createStream(ssc, None) .filter(MllibHelper.filtrate) .map(MllibHelper.featurize) .cache() log.info("Initializing prediction model...") val count = sc.accumulator(0L, "count") stream.foreachRDD({ rdd => if (rdd.isEmpty) log.debug("batch: 0") else { val realPred = rdd.map{ lb => (lb.label, Utils.round(model.latestModel.predict(lb.features))) } val batch = rdd.count count += batch val real = realPred.map(_._1) val pred = realPred.map(_._2) val realStdev = Utils.round(real.stdev) val predStdev = Utils.round(pred.stdev) val mse = Utils.round(realPred.map{case(v, p) => math.pow((v - p), 2)}.mean()) if (log.isDebugEnabled) { log.debug("count: {}", count) // batch, mse (training mean squared error) log.debug("batch: {}, mse: {}", batch, mse) log.debug("stdev (real, pred): ({}, {})", realStdev.toLong, predStdev.toLong) log.debug("value (real, pred): {} ...", realPred.take(10).toArray) } session.update(count.value, batch, mse, realStdev, predStdev, real.toArray, pred.toArray); } }) log.info("Initializing training model...") // training after prediction model.trainOn(stream) // Start the streaming computation ssc.start() log.info("Initialization complete.") ssc.awaitTermination() } }
Example 164
Source File: StreamingKMeansExample.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } val conf = new SparkConf().setMaster("local").setAppName("StreamingKMeansExample") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(args(3).toInt) .setDecayFactor(1.0) .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() } }
Example 165
Source File: QueueStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import scala.collection.mutable.SynchronizedQueue import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Seconds, StreamingContext} object QueueStream { def main(args: Array[String]) { StreamingExamples.setStreamingLogLevels() val sparkConf = new SparkConf().setAppName("QueueStream") // Create the context val ssc = new StreamingContext(sparkConf, Seconds(1)) // Create the queue through which RDDs can be pushed to // a QueueInputDStream val rddQueue = new SynchronizedQueue[RDD[Int]]() // Create the QueueInputDStream and use it do some processing val inputStream = ssc.queueStream(rddQueue) val mappedStream = inputStream.map(x => (x % 10, 1)) val reducedStream = mappedStream.reduceByKey(_ + _) reducedStream.print() ssc.start() // Create and push some RDDs into for (i <- 1 to 30) { rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10) Thread.sleep(1000) } ssc.stop() } }
Example 166
Source File: CustomReceiver.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import java.io.{InputStreamReader, BufferedReader, InputStream} import java.net.Socket import org.apache.spark.{SparkConf, Logging} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.receiver.Receiver private def receive() { var socket: Socket = null var userInput: String = null try { logInfo("Connecting to " + host + ":" + port) socket = new Socket(host, port) logInfo("Connected to " + host + ":" + port) val reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), "UTF-8")) userInput = reader.readLine() while(!isStopped && userInput != null) { store(userInput) userInput = reader.readLine() } reader.close() socket.close() logInfo("Stopped receiving") restart("Trying to connect again") } catch { case e: java.net.ConnectException => restart("Error connecting to " + host + ":" + port, e) case t: Throwable => restart("Error receiving data", t) } } }
Example 167
Source File: TwitterAlgebirdHLL.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import com.twitter.algebird.HyperLogLogMonoid import com.twitter.algebird.HyperLogLog._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.twitter._ import org.apache.spark.SparkConf // scalastyle:off val BIT_SIZE = 12 val filters = args val sparkConf = new SparkConf().setAppName("TwitterAlgebirdHLL") val ssc = new StreamingContext(sparkConf, Seconds(5)) val stream = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_ONLY_SER) val users = stream.map(status => status.getUser.getId) val hll = new HyperLogLogMonoid(BIT_SIZE) var globalHll = hll.zero var userSet: Set[Long] = Set() val approxUsers = users.mapPartitions(ids => { ids.map(id => hll(id)) }).reduce(_ + _) val exactUsers = users.map(id => Set(id)).reduce(_ ++ _) approxUsers.foreachRDD(rdd => { if (rdd.count() != 0) { val partial = rdd.first() globalHll += partial println("Approx distinct users this batch: %d".format(partial.estimatedSize.toInt)) println("Approx distinct users overall: %d".format(globalHll.estimatedSize.toInt)) } }) exactUsers.foreachRDD(rdd => { if (rdd.count() != 0) { val partial = rdd.first() userSet ++= partial println("Exact distinct users this batch: %d".format(partial.size)) println("Exact distinct users overall: %d".format(userSet.size)) println("Error rate: %2.5f%%".format(((globalHll.estimatedSize / userSet.size.toDouble) - 1 ) * 100)) } }) ssc.start() ssc.awaitTermination() } }
Example 168
Source File: ZeroMQWordCount.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import akka.actor.ActorSystem import akka.actor.actorRef2Scala import akka.zeromq._ import akka.zeromq.Subscribe import akka.util.ByteString import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.zeromq._ import scala.language.implicitConversions import org.apache.spark.SparkConf // scalastyle:on object ZeroMQWordCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: ZeroMQWordCount <zeroMQurl> <topic>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val Seq(url, topic) = args.toSeq val sparkConf = new SparkConf().setAppName("ZeroMQWordCount") // Create the context and set the batch size val ssc = new StreamingContext(sparkConf, Seconds(2)) def bytesToStringIterator(x: Seq[ByteString]): Iterator[String] = x.map(_.utf8String).iterator // For this stream, a zeroMQ publisher should be running. val lines = ZeroMQUtils.createStream(ssc, url, Subscribe(topic), bytesToStringIterator _) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } }
Example 169
Source File: HdfsWordCount.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} object HdfsWordCount { def main(args: Array[String]) { if (args.length < 1) { System.err.println("Usage: HdfsWordCount <directory>") System.exit(1) } StreamingExamples.setStreamingLogLevels() val sparkConf = new SparkConf().setAppName("HdfsWordCount") // Create the context val ssc = new StreamingContext(sparkConf, Seconds(2)) // Create the FileInputDStream on the directory and use the // stream to count words in new files created val lines = ssc.textFileStream(args(0)) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } }
Example 170
Source File: NetworkWordCount.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.storage.StorageLevel object NetworkWordCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: NetworkWordCount <hostname> <port>") System.exit(1) } StreamingExamples.setStreamingLogLevels() // Create the context with a 1 second batch size val sparkConf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(sparkConf, Seconds(1)) // Create a socket stream on target ip:port and count the // words in input stream of \n delimited text (eg. generated by 'nc') // Note that no duplication in storage level only for running locally. // Replication necessary in distributed scenario for fault tolerance. val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER) val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } }
Example 171
Source File: TwitterPopularTags.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.SparkContext._ import org.apache.spark.streaming.twitter._ import org.apache.spark.SparkConf object TwitterPopularTags { def main(args: Array[String]) { if (args.length < 4) { System.err.println("Usage: TwitterPopularTags <consumer key> <consumer secret> " + "<access token> <access token secret> [<filters>]") System.exit(1) } StreamingExamples.setStreamingLogLevels() val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4) val filters = args.takeRight(args.length - 4) // Set the system properties so that Twitter4j library used by twitter stream // can use them to generate OAuth credentials System.setProperty("twitter4j.oauth.consumerKey", consumerKey) System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret) System.setProperty("twitter4j.oauth.accessToken", accessToken) System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret) val sparkConf = new SparkConf().setAppName("TwitterPopularTags") val ssc = new StreamingContext(sparkConf, Seconds(2)) val stream = TwitterUtils.createStream(ssc, None, filters) val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#"))) val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) .map{case (topic, count) => (count, topic)} .transform(_.sortByKey(false)) val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) .map{case (topic, count) => (count, topic)} .transform(_.sortByKey(false)) // Print popular hashtags topCounts60.foreachRDD(rdd => { val topList = rdd.take(10) println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} }) topCounts10.foreachRDD(rdd => { val topList = rdd.take(10) println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))} }) ssc.start() ssc.awaitTermination() } }
Example 172
Source File: MQTTWordCount.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming import org.eclipse.paho.client.mqttv3._ import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.mqtt._ import org.apache.spark.SparkConf object MQTTWordCount { def main(args: Array[String]) { if (args.length < 2) { System.err.println( "Usage: MQTTWordCount <MqttbrokerUrl> <topic>") System.exit(1) } val Seq(brokerUrl, topic) = args.toSeq val sparkConf = new SparkConf().setAppName("MQTTWordCount") val ssc = new StreamingContext(sparkConf, Seconds(2)) val lines = MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2) val words = lines.flatMap(x => x.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() ssc.awaitTermination() } }
Example 173
Source File: TwitterStreamSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.twitter import org.scalatest.BeforeAndAfter import twitter4j.Status import twitter4j.auth.{NullAuthorization, Authorization} import org.apache.spark.{Logging, SparkFunSuite} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.ReceiverInputDStream class TwitterStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging { val batchDuration = Seconds(1) private val master: String = "local[2]" private val framework: String = this.getClass.getSimpleName test("twitter input stream") { val ssc = new StreamingContext(master, framework, batchDuration) val filters = Seq("filter1", "filter2") val authorization: Authorization = NullAuthorization.getInstance() // tests the API, does not actually test data receiving val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None) val test2: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters) val test3: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2) val test4: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization)) val test5: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization), filters) val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream( ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2) // Note that actually testing the data receiving is hard as authentication keys are // necessary for accessing Twitter live stream ssc.stop() } }
Example 174
Source File: ZeroMQStreamSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.zeromq import akka.actor.SupervisorStrategy import akka.util.ByteString import akka.zeromq.Subscribe import org.apache.spark.SparkFunSuite import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.dstream.ReceiverInputDStream class ZeroMQStreamSuite extends SparkFunSuite { val batchDuration = Seconds(1) private val master: String = "local[2]" private val framework: String = this.getClass.getSimpleName test("zeromq input stream") { val ssc = new StreamingContext(master, framework, batchDuration) val publishUrl = "abc" val subscribe = new Subscribe(null.asInstanceOf[ByteString]) val bytesToObjects = (bytes: Seq[ByteString]) => null.asInstanceOf[Iterator[String]] // tests the API, does not actually test data receiving val test1: ReceiverInputDStream[String] = ZeroMQUtils.createStream(ssc, publishUrl, subscribe, bytesToObjects) val test2: ReceiverInputDStream[String] = ZeroMQUtils.createStream( ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2) val test3: ReceiverInputDStream[String] = ZeroMQUtils.createStream( ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2, SupervisorStrategy.defaultStrategy) // TODO: Actually test data receiving ssc.stop() } }
Example 175
Source File: WeatherDataStream.scala From spark-scala with Creative Commons Zero v1.0 Universal | 5 votes |
package com.supergloo import com.killrweather.data.Weather.RawWeatherData import kafka.serializer.StringDecoder import org.apache.log4j.Logger import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.dstream.{DStream, InputDStream} import org.apache.spark.streaming.kafka.KafkaUtils parsedWeatherStream.map { weather => (weather.wsid, weather.year, weather.month, weather.day, weather.oneHourPrecip) }.saveToCassandra(CassandraKeyspace, CassandraTableDailyPrecip) } def ingestStream(rawWeatherStream: InputDStream[(String, String)]): DStream[RawWeatherData] = { val parsedWeatherStream = rawWeatherStream.map(_._2.split(",")) .map(RawWeatherData(_)) parsedWeatherStream } }
Example 176
Source File: TestSparkContext.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
// scalastyle:off header.matches trait TestSparkStreamingContext extends TestSparkContext { self: Suite => implicit lazy val streaming: StreamingContext = StreamingContext.getActiveOrCreate(() => new StreamingContext(sc, Seconds(1)) ) override def afterAll: Unit = { streaming.stop(stopSparkContext = false) super[TestSparkContext].afterAll } }
Example 177
Source File: StreamingLinearRegression.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD} import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} object StreamingLinearRegression { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: StreamingLinearRegression <trainingDir> <testDir> <batchDuration> <numFeatures>") System.exit(1) } val conf = new SparkConf().setMaster("local").setAppName("StreamingLinearRegression") // batch interval val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) // a LabeledPoint pairs a local vector (dense or sparse) with a label val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingLinearRegressionWithSGD() // (SGD: stochastic gradient descent) // initial weights default to the zero vector .setInitialWeights(Vectors.zeros(args(3).toInt)) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 178
Source File: StreamingLogisticRegression.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} object StreamingLogisticRegression { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: StreamingLogisticRegression <trainingDir> <testDir> <batchDuration> <numFeatures>") System.exit(1) } val conf = new SparkConf().setMaster("local").setAppName("StreamingLogisticRegression") // batch interval val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse) // a LabeledPoint pairs a local vector (dense or sparse) with a label val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) // SGD-based; supports binary classification only val model = new StreamingLogisticRegressionWithSGD() // initial weights default to the zero vector .setInitialWeights(Vectors.zeros(args(3).toInt)) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 179
Source File: StreamingKMeansExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } val conf = new SparkConf().setMaster("local").setAppName("StreamingKMeansExample") // batch interval val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) // file stream over the training directory, parsed as vectors val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) // test directory val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() // number of clusters .setK(args(3).toInt) // decay factor .setDecayFactor(1.0) // random initial centers with the given dimensionality .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) // train the clustering model on the stream // predict the cluster assignment for new data points model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 180
Source File: QueueStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import scala.collection.mutable.SynchronizedQueue

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Queue stream example
object QueueStream {

  def main(args: Array[String]) {

    StreamingExamples.setStreamingLogLevels()
    val sparkConf = new SparkConf().setAppName("QueueStream").setMaster("local[2]")
    // Create the context with the batch interval
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create the queue through which RDDs can be pushed to
    // a QueueInputDStream
    val rddQueue = new SynchronizedQueue[RDD[Int]]()

    // Create the QueueInputDStream and use it do some processing
    val inputStream = ssc.queueStream(rddQueue)
    val mappedStream = inputStream.map(x => (x % 10, 1))
    val reducedStream = mappedStream.reduceByKey(_ + _)
    reducedStream.print()
    ssc.start()

    // Create and push some RDDs into the queue
    for (i <- 1 to 9) {
      rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10)
      Thread.sleep(1000)
    }
    ssc.stop()
  }
}
Example 181
Source File: CustomReceiver.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import java.io.{InputStreamReader, BufferedReader, InputStream}
import java.net.Socket

import org.apache.spark.{SparkConf, Logging}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

import java.io.File
import java.io.FileInputStream

  // Only the receive() method of the CustomReceiver class is shown in this listing.
  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      logInfo("Connecting to " + host + ":" + port)
      // Connect to the host
      socket = new Socket(host, port)
      logInfo("Connected to " + host + ":" + port)
      // Get the socket's input stream
      println("isConnected:" + socket.isConnected())
      val socketInput = socket.getInputStream()
      // val inputFile = new File("../data/mllib/als/testCustomReceiver.data")
      // val in = new FileInputStream(inputFile)
      // val in = new FileInputStream(socketInput)
      val reader = new BufferedReader(new InputStreamReader(socketInput, "UTF-8"))
      userInput = reader.readLine()
      while (!isStopped && userInput != null) {
        store(userInput)              // store the data in Spark
        userInput = reader.readLine() // read the next line
        println("userInput:" + userInput)
      }
      reader.close() // close the reader
      socket.close() // close the connection
      logInfo("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
// scalastyle:on println
Example 182
Source File: SqlNetworkWordCount.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Time, Seconds, StreamingContext}
import org.apache.spark.util.IntParam
import org.apache.spark.sql.SQLContext
import org.apache.spark.storage.StorageLevel

object SQLContextSingleton {

  @transient private var instance: SQLContext = _

  def getInstance(sparkContext: SparkContext): SQLContext = {
    if (instance == null) {
      instance = new SQLContext(sparkContext)
    }
    instance
  }
}
// scalastyle:on println
Example 183
Source File: TwitterStreamSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.twitter

import org.scalatest.BeforeAndAfter
import twitter4j.Status
import twitter4j.auth.{NullAuthorization, Authorization}

import org.apache.spark.{Logging, SparkFunSuite}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class TwitterStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"

  private val framework: String = this.getClass.getSimpleName

  test("twitter input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val filters = Seq("filter1", "filter2")
    val authorization: Authorization = NullAuthorization.getInstance()

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None)
    val test2: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters)
    val test3: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test4: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization))
    val test5: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization), filters)
    val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream(
      ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2)

    // Note that actually testing the data receiving is hard as authentication keys are
    // necessary for accessing Twitter live stream
    ssc.stop()
  }
}
Example 184
Source File: FlumePollingStreamSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume

import java.net.InetSocketAddress

import scala.collection.JavaConversions._
import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer}
import scala.concurrent.duration._
import scala.language.postfixOps

import com.google.common.base.Charsets.UTF_8
import org.scalatest.BeforeAndAfter
import org.scalatest.concurrent.Eventually._

import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.{Seconds, TestOutputStream, StreamingContext}
import org.apache.spark.util.{ManualClock, Utils}

  // Only part of the suite is shown in this listing; the class declaration and setup are omitted.
  private def testMultipleTimes(test: () => Unit): Unit = {
    var testPassed = false
    var attempt = 0
    while (!testPassed && attempt < maxAttempts) {
      try {
        test()
        testPassed = true
      } catch {
        case e: Exception if Utils.isBindCollision(e) =>
          logWarning("Exception when running flume polling test: " + e)
          attempt += 1
      }
    }
    assert(testPassed, s"Test failed after $attempt attempts!")
  }

  private def testFlumePolling(): Unit = {
    try {
      val port = utils.startSingleSink()
      writeAndVerify(Seq(port))
      utils.assertChannelsAreEmpty()
    } finally {
      utils.close()
    }
  }

  private def testFlumePollingMultipleHost(): Unit = {
    try {
      val ports = utils.startMultipleSinks()
      writeAndVerify(ports)
      utils.assertChannelsAreEmpty()
    } finally {
      utils.close()
    }
  }

  def writeAndVerify(sinkPorts: Seq[Int]): Unit = {
    // Set up the streaming context and input streams
    val ssc = new StreamingContext(conf, batchDuration)
    val addresses = sinkPorts.map(port => new InetSocketAddress("localhost", port))
    val flumeStream: ReceiverInputDStream[SparkFlumeEvent] =
      FlumeUtils.createPollingStream(ssc, addresses, StorageLevel.MEMORY_AND_DISK,
        utils.eventsPerBatch, 5)
    val outputBuffer = new ArrayBuffer[Seq[SparkFlumeEvent]]
      with SynchronizedBuffer[Seq[SparkFlumeEvent]]
    val outputStream = new TestOutputStream(flumeStream, outputBuffer)
    outputStream.register()

    ssc.start()
    try {
      utils.sendDatAndEnsureAllDataHasBeenReceived()
      val clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
      clock.advance(batchDuration.milliseconds)

      // The eventually is required to ensure that all data in the batch has been processed.
      eventually(timeout(10 seconds), interval(100 milliseconds)) {
        val flattenOutputBuffer = outputBuffer.flatten
        val headers = flattenOutputBuffer.map(_.event.getHeaders.map {
          case kv => (kv._1.toString, kv._2.toString)
        }).map(mapAsJavaMap)
        val bodies = flattenOutputBuffer.map(e => new String(e.event.getBody.array(), UTF_8))
        utils.assertOutput(headers, bodies)
      }
    } finally {
      ssc.stop()
    }
  }
}
Example 185
Source File: ZeroMQStreamSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.zeromq

import akka.actor.SupervisorStrategy
import akka.util.ByteString
import akka.zeromq.Subscribe

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class ZeroMQStreamSuite extends SparkFunSuite {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"

  private val framework: String = this.getClass.getSimpleName

  test("zeromq input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val publishUrl = "abc"
    val subscribe = new Subscribe(null.asInstanceOf[ByteString])
    val bytesToObjects = (bytes: Seq[ByteString]) => null.asInstanceOf[Iterator[String]]

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[String] =
      ZeroMQUtils.createStream(ssc, publishUrl, subscribe, bytesToObjects)
    val test2: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test3: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects,
      StorageLevel.MEMORY_AND_DISK_SER_2, SupervisorStrategy.defaultStrategy)

    // TODO: Actually test data receiving
    ssc.stop()
  }
}
Example 186
Source File: Parsing.scala From meetup-stream with Apache License 2.0 | 5 votes |
package util

import core._

import org.joda.time.DateTime
import org.json4s.DefaultFormats
import org.json4s._
import org.json4s.native.JsonMethods._

import org.apache.spark.Partitioner
import org.apache.spark.streaming.Seconds

import scala.util.Try

object Parsing {

  @transient implicit val formats = DefaultFormats

  // Parse a meetup event JSON payload into an Event, or None if parsing fails
  def parseEvent(eventJson: String): Option[Event] = {
    Try({
      val json = parse(eventJson).camelizeKeys
      val event = json.extract[Event]
      event
    }).toOption
  }

  // Parse an RSVP JSON payload into (member, event, response), or None if parsing fails
  def parseRsvp(rsvpJson: String) = {
    Try({
      val json = parse(rsvpJson).camelizeKeys
      val member = (json \ "member").extract[Member]
      val event = (json \ "event").extract[MemberEvent]
      val response = (json \ "response").extract[String]
      (member, event, response)
    }).toOption
  }
}
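The camelizeKeys + extract pattern above maps snake_case JSON keys onto the camelCase fields of a case class. A minimal, self-contained sketch of that pattern; the Attendee case class and the JSON literal are made up for illustration, since the real Event/Member types live in the project's core package:

import org.json4s._
import org.json4s.native.JsonMethods._

case class Attendee(memberId: Long, memberName: String)

implicit val formats: Formats = DefaultFormats

// "member_id"/"member_name" become "memberId"/"memberName" after camelizeKeys
val json = parse("""{"member_id": 42, "member_name": "Ada"}""").camelizeKeys
val attendee = json.extract[Attendee]   // Attendee(42, "Ada")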
Example 187
Source File: StreamingTestExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.mllib.stat.test.{BinarySample, StreamingTest}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.util.Utils

object StreamingTestExample {

  def main(args: Array[String]) {
    if (args.length != 3) {
      // scalastyle:off println
      System.err.println(
        "Usage: StreamingTestExample " +
          "<dataDir> <batchDuration> <numBatchesTimeout>")
      // scalastyle:on println
      System.exit(1)
    }
    val dataDir = args(0)
    val batchDuration = Seconds(args(1).toLong)
    val numBatchesTimeout = args(2).toInt

    val conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample")
    val ssc = new StreamingContext(conf, batchDuration)
    ssc.checkpoint {
      val dir = Utils.createTempDir()
      dir.toString
    }

    // $example on$
    val data = ssc.textFileStream(dataDir).map(line => line.split(",") match {
      case Array(label, value) => BinarySample(label.toBoolean, value.toDouble)
    })

    val streamingTest = new StreamingTest()
      .setPeacePeriod(0)
      .setWindowSize(0)
      .setTestMethod("welch")

    val out = streamingTest.registerStream(data)
    out.print()
    // $example off$

    // Stop processing if test becomes significant or we time out
    var timeoutCounter = numBatchesTimeout
    out.foreachRDD { rdd =>
      timeoutCounter -= 1
      val anySignificant = rdd.map(_.pValue < 0.05).fold(false)(_ || _)
      if (timeoutCounter == 0 || anySignificant) rdd.context.stop()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 188
Source File: StreamingKMeansExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}
// $example off$

object StreamingKMeansExample {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      System.exit(1)
    }

    // $example on$
    val conf = new SparkConf().setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setK(args(3).toInt)
      .setDecayFactor(1.0)
      .setRandomCenters(args(4).toInt, 0.0)

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
    // $example off$
  }
}
// scalastyle:on println
Example 189
Source File: QueueStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object QueueStream {

  def main(args: Array[String]) {

    StreamingExamples.setStreamingLogLevels()
    val sparkConf = new SparkConf().setAppName("QueueStream")
    // Create the context
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create the queue through which RDDs can be pushed to
    // a QueueInputDStream
    val rddQueue = new Queue[RDD[Int]]()

    // Create the QueueInputDStream and use it do some processing
    val inputStream = ssc.queueStream(rddQueue)
    val mappedStream = inputStream.map(x => (x % 10, 1))
    val reducedStream = mappedStream.reduceByKey(_ + _)
    reducedStream.print()
    ssc.start()

    // Create and push some RDDs into rddQueue
    for (i <- 1 to 30) {
      rddQueue.synchronized {
        rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10)
      }
      Thread.sleep(1000)
    }
    ssc.stop()
  }
}
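By default queueStream dequeues one RDD from the queue per batch. StreamingContext also has an overload with an oneAtATime flag; a short hedged sketch (the object name and the data pushed are made up) of using it to drain everything queued each batch:

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object QueueStreamAllAtOnce {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("QueueStreamAllAtOnce").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(1))
    val rddQueue = new Queue[RDD[Int]]()

    // oneAtATime = false: each batch takes all RDDs currently in the queue (unioned),
    // instead of dequeuing a single RDD per batch (the default behaviour).
    val inputStream = ssc.queueStream(rddQueue, oneAtATime = false)
    inputStream.count().print()

    ssc.start()
    for (_ <- 1 to 5) {
      rddQueue.synchronized { rddQueue += ssc.sparkContext.makeRDD(1 to 100, 4) }
      Thread.sleep(300)
    }
    ssc.stop()
  }
}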
Example 190
Source File: CustomReceiver.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      logInfo(s"Connecting to $host : $port")
      socket = new Socket(host, port)
      logInfo(s"Connected to $host : $port")
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8))
      userInput = reader.readLine()
      while (!isStopped && userInput != null) {
        store(userInput)
        userInput = reader.readLine()
      }
      reader.close()
      socket.close()
      logInfo("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart(s"Error connecting to $host : $port", e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
// scalastyle:on println
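The listing above shows only the receive() helper. A hedged sketch of how such a receiver is typically declared and plugged into a streaming job, modeled on Spark's CustomReceiver example; the exact class body here is a sketch, not the full original file:

class CustomReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging {

  // Spark calls onStart() when the receiver is launched; receiving runs on its own
  // thread so that onStart() can return immediately.
  def onStart() {
    new Thread("Socket Receiver") {
      override def run() { receive() }
    }.start()
  }

  // Nothing to clean up here: receive() closes the socket and isStopped() ends the loop.
  def onStop() {}

  private def receive() { /* as shown above */ }
}

// Wiring the receiver into a job:
// val ssc = new StreamingContext(sparkConf, Seconds(1))
// val lines = ssc.receiverStream(new CustomReceiver(host, port))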
Example 191
Source File: SqlNetworkWordCount.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object SparkSessionSingleton {

  @transient private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}
// scalastyle:on println
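The singleton exists so each micro-batch can reuse one SparkSession when turning RDDs into DataFrames inside foreachRDD. A hedged sketch of that use, modeled on Spark's SqlNetworkWordCount example; the words DStream and the Record case class are assumed here, not part of the snippet above:

case class Record(word: String)

// words: DStream[String], e.g. from ssc.socketTextStream(...).flatMap(_.split(" "))
words.foreachRDD { (rdd: RDD[String], time: Time) =>
  // Reuse a single SparkSession across batches
  val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
  import spark.implicits._

  // Convert the batch RDD to a DataFrame and register it as a temp view
  val wordsDataFrame = rdd.map(w => Record(w)).toDF()
  wordsDataFrame.createOrReplaceTempView("words")

  // Count words in the current batch with SQL
  val wordCounts =
    spark.sql("select word, count(*) as total from words group by word")
  println(s"========= $time =========")
  wordCounts.show()
}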
Example 192
Source File: HdfsWordCount.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object HdfsWordCount {
  def main(args: Array[String]) {
    if (args.length < 1) {
      System.err.println("Usage: HdfsWordCount <directory>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()
    val sparkConf = new SparkConf().setAppName("HdfsWordCount")
    // Create the context
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create the FileInputDStream on the directory and use the
    // stream to count words in new files created
    val lines = ssc.textFileStream(args(0))
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
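textFileStream only picks up files that appear in the monitored directory after the context starts, so input is usually written elsewhere and then moved in so the stream never sees a half-written file. A small hedged sketch of feeding the job above with a local directory; the paths and file contents are made up:

import java.nio.file.{Files, Paths, StandardCopyOption}

// Write the file outside the watched directory, then move it in atomically.
val staged = Paths.get("/tmp/staging/words-0001.txt")
Files.write(staged, "hello streaming hello spark\n".getBytes("UTF-8"))
Files.move(staged, Paths.get("/data/hdfs-wordcount/words-0001.txt"),
  StandardCopyOption.ATOMIC_MOVE)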
Example 193
Source File: NetworkWordCount.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NetworkWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: NetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // Create the context with a 1 second batch size
    val sparkConf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create a socket stream on target ip:port and count the
    // words in input stream of \n delimited text (eg. generated by 'nc')
    // Note that no duplication in storage level only for running locally.
    // Replication necessary in distributed scenario for fault tolerance.
    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
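A common next step is counting over a sliding window instead of a single batch. A hedged sketch that extends the words stream above; the window and slide durations are arbitrary, and the inverse-reduce form of reduceByKeyAndWindow requires checkpointing to be enabled:

// Checkpointing is required by the inverse-reduce form of reduceByKeyAndWindow
ssc.checkpoint("/tmp/network-wordcount-checkpoint")

// Count words over the last 30 seconds, recomputed every 10 seconds
val windowedWordCounts = words.map(x => (x, 1))
  .reduceByKeyAndWindow(_ + _, _ - _, Seconds(30), Seconds(10))
windowedWordCounts.print()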
Example 194
Source File: StreamingTestExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.mllib.stat.test.{BinarySample, StreamingTest}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.util.Utils

object StreamingTestExample {

  def main(args: Array[String]) {
    if (args.length != 3) {
      // scalastyle:off println
      System.err.println(
        "Usage: StreamingTestExample " +
          "<dataDir> <batchDuration> <numBatchesTimeout>")
      // scalastyle:on println
      System.exit(1)
    }
    val dataDir = args(0)
    val batchDuration = Seconds(args(1).toLong)
    val numBatchesTimeout = args(2).toInt

    val conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample")
    val ssc = new StreamingContext(conf, batchDuration)
    ssc.checkpoint({
      val dir = Utils.createTempDir()
      dir.toString
    })

    // $example on$
    val data = ssc.textFileStream(dataDir).map(line => line.split(",") match {
      case Array(label, value) => BinarySample(label.toBoolean, value.toDouble)
    })

    val streamingTest = new StreamingTest()
      .setPeacePeriod(0)
      .setWindowSize(0)
      .setTestMethod("welch")

    val out = streamingTest.registerStream(data)
    out.print()
    // $example off$

    // Stop processing if test becomes significant or we time out
    var timeoutCounter = numBatchesTimeout
    out.foreachRDD { rdd =>
      timeoutCounter -= 1
      val anySignificant = rdd.map(_.pValue < 0.05).fold(false)(_ || _)
      if (timeoutCounter == 0 || anySignificant) rdd.context.stop()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 195
Source File: StreamingKMeansExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingKMeansExample {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setK(args(3).toInt)
      .setDecayFactor(1.0)
      .setRandomCenters(args(4).toInt, 0.0)

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 196
Source File: QueueStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import scala.collection.mutable.SynchronizedQueue

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object QueueStream {

  def main(args: Array[String]) {

    StreamingExamples.setStreamingLogLevels()
    val sparkConf = new SparkConf().setAppName("QueueStream")
    // Create the context
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create the queue through which RDDs can be pushed to
    // a QueueInputDStream
    val rddQueue = new SynchronizedQueue[RDD[Int]]()

    // Create the QueueInputDStream and use it do some processing
    val inputStream = ssc.queueStream(rddQueue)
    val mappedStream = inputStream.map(x => (x % 10, 1))
    val reducedStream = mappedStream.reduceByKey(_ + _)
    reducedStream.print()
    ssc.start()

    // Create and push some RDDs into rddQueue
    for (i <- 1 to 30) {
      rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10)
      Thread.sleep(1000)
    }
    ssc.stop()
  }
}
Example 197
Source File: CustomReceiver.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import java.io.{InputStreamReader, BufferedReader, InputStream}
import java.net.Socket

import org.apache.spark.{SparkConf, Logging}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      logInfo("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      logInfo("Connected to " + host + ":" + port)
      val reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), "UTF-8"))
      userInput = reader.readLine()
      while (!isStopped && userInput != null) {
        store(userInput)
        userInput = reader.readLine()
      }
      reader.close()
      socket.close()
      logInfo("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
// scalastyle:on println
Example 198
Source File: TwitterAlgebirdHLL.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import com.twitter.algebird.HyperLogLogMonoid
import com.twitter.algebird.HyperLogLog._

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter._
import org.apache.spark.SparkConf
// scalastyle:off

    // Only the body of main() is shown in this listing; the object declaration is omitted.
    val BIT_SIZE = 12
    val filters = args
    val sparkConf = new SparkConf().setAppName("TwitterAlgebirdHLL")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val stream = TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_ONLY_SER)

    val users = stream.map(status => status.getUser.getId)

    // HyperLogLog monoid for approximate distinct counting
    val hll = new HyperLogLogMonoid(BIT_SIZE)
    var globalHll = hll.zero
    var userSet: Set[Long] = Set()

    val approxUsers = users.mapPartitions(ids => {
      ids.map(id => hll(id))
    }).reduce(_ + _)

    val exactUsers = users.map(id => Set(id)).reduce(_ ++ _)

    approxUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        globalHll += partial
        println("Approx distinct users this batch: %d".format(partial.estimatedSize.toInt))
        println("Approx distinct users overall: %d".format(globalHll.estimatedSize.toInt))
      }
    })

    exactUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        userSet ++= partial
        println("Exact distinct users this batch: %d".format(partial.size))
        println("Exact distinct users overall: %d".format(userSet.size))
        println("Error rate: %2.5f%%".format(
          ((globalHll.estimatedSize / userSet.size.toDouble) - 1) * 100))
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 199
Source File: ZeroMQWordCount.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import akka.actor.ActorSystem
import akka.actor.actorRef2Scala
import akka.zeromq._
import akka.zeromq.Subscribe
import akka.util.ByteString

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.zeromq._

import scala.language.implicitConversions
import org.apache.spark.SparkConf
// scalastyle:on

object ZeroMQWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: ZeroMQWordCount <zeroMQurl> <topic>")
      System.exit(1)
    }
    StreamingExamples.setStreamingLogLevels()
    val Seq(url, topic) = args.toSeq
    val sparkConf = new SparkConf().setAppName("ZeroMQWordCount")
    // Create the context and set the batch size
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    def bytesToStringIterator(x: Seq[ByteString]): Iterator[String] = x.map(_.utf8String).iterator

    // For this stream, a zeroMQ publisher should be running.
    val lines = ZeroMQUtils.createStream(ssc, url, Subscribe(topic), bytesToStringIterator _)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 200
Source File: SqlNetworkWordCount.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Time, Seconds, StreamingContext}
import org.apache.spark.util.IntParam
import org.apache.spark.sql.SQLContext
import org.apache.spark.storage.StorageLevel

object SQLContextSingleton {

  @transient private var instance: SQLContext = _

  def getInstance(sparkContext: SparkContext): SQLContext = {
    if (instance == null) {
      instance = new SQLContext(sparkContext)
    }
    instance
  }
}
// scalastyle:on println