org.apache.spark.streaming.StreamingContext Scala Examples
The following examples show how to use org.apache.spark.streaming.StreamingContext.
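All of the examples follow the same basic lifecycle: build a StreamingContext from a SparkConf (or an existing SparkContext) with a batch interval, define one or more input DStreams and their transformations, then call start() and awaitTermination(). A minimal sketch of that pattern is shown below; the master, application name, host, and port are placeholders rather than values taken from any of the examples.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object MinimalStreamingApp {
  def main(args: Array[String]): Unit = {
    // 5-second batch interval; master and app name are placeholders
    val conf = new SparkConf().setMaster("local[2]").setAppName("MinimalStreamingApp")
    val ssc = new StreamingContext(conf, Seconds(5))

    // Any input DStream works here; a socket source keeps the sketch small
    val lines = ssc.socketTextStream("localhost", 9999)
    lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()

    ssc.start()             // start receiving and processing
    ssc.awaitTermination()  // block until the context is stopped
  }
}

The examples below differ mainly in which input DStream they create (Kafka, Flume, sockets, files, queues, Kinesis) and in what they do with the resulting stream.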
Example 1
Source File: StreamingKafka10.scala From BigData-News with Apache License 2.0 | 7 votes |
package com.vita.spark

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

object StreamingKafka10 {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[2]")
      .appName("streaming")
      .getOrCreate()

    val sc = spark.sparkContext
    val ssc = new StreamingContext(sc, Seconds(5))

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "node6:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "0001",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val topics = Array("weblogs")
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    val lines = stream.map(x => x.value())
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 2
Source File: SqlNetworkWordCount.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object SparkSessionSingleton {

  @transient private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}
// scalastyle:on println
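The listing above captures only the lazily initialized SparkSession singleton from SqlNetworkWordCount; the streaming half of the file is not shown. The following is a hedged sketch of how such a singleton is typically used inside foreachRDD to run SQL over each micro-batch. The socket source, the Record case class, the batch interval, and the query are illustrative assumptions, not the omitted original code.

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object SqlNetworkWordCountSketch {
  case class Record(word: String)

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SqlNetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    val words = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
      .flatMap(_.split(" "))

    words.foreachRDD { (rdd: RDD[String], time: Time) =>
      // Create the SparkSession once and reuse it across batches
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._

      val wordsDataFrame = rdd.map(w => Record(w)).toDF()
      wordsDataFrame.createOrReplaceTempView("words")
      spark.sql("select word, count(*) as total from words group by word").show()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}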
Example 3
Source File: L5-15KafkaDirect.scala From prosparkstreaming with Apache License 2.0 | 6 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

object StationJourneyCountDirectApp {

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }

    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val topics = Set(topic)
    val params = Map[String, String](
      "zookeeper.connect" -> zkQuorum,
      "group.id" -> consumerGroupId,
      "bootstrap.servers" -> brokerUrl)

    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, topics).map(_._2)
      .map(rec => rec.split(","))
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 4
Source File: StreamingKafka8.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark

import kafka.serializer.StringDecoder
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

class StreamingKafka8 {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[2]")
      .appName("streaming")
      .getOrCreate()

    val sc = spark.sparkContext
    val ssc = new StreamingContext(sc, Seconds(5))

    // Create direct kafka stream with brokers and topics
    val topicsSet = Set("weblogs")
    val kafkaParams = Map[String, String]("metadata.broker.list" -> "node5:9092")
    val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    val lines = kafkaStream.map(x => x._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 5
Source File: Test.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark.test

import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Receives messages from a network port.
 * Arguments: the Spark master address, the host to connect to, the port to connect to,
 * and the batch interval used as one unit of work, e.g.
 * local[*] localhost 8888 5
 */
object Test {

  case class Person(username: String, usercount: Int)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("hdfsTest")
      .getOrCreate()

    val ssc = new StreamingContext(spark.sparkContext, Seconds(1))
    val lines = ssc.socketTextStream("localhost", 9999)
    val words = lines.flatMap(_.split(" "))
    words.print()
    println()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 6
Source File: StreamingTestExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.mllib.stat.test.{BinarySample, StreamingTest}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.util.Utils

object StreamingTestExample {

  def main(args: Array[String]) {
    if (args.length != 3) {
      // scalastyle:off println
      System.err.println(
        "Usage: StreamingTestExample " +
          "<dataDir> <batchDuration> <numBatchesTimeout>")
      // scalastyle:on println
      System.exit(1)
    }
    val dataDir = args(0)
    val batchDuration = Seconds(args(1).toLong)
    val numBatchesTimeout = args(2).toInt

    val conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample")
    val ssc = new StreamingContext(conf, batchDuration)
    ssc.checkpoint {
      val dir = Utils.createTempDir()
      dir.toString
    }

    // $example on$
    val data = ssc.textFileStream(dataDir).map(line => line.split(",") match {
      case Array(label, value) => BinarySample(label.toBoolean, value.toDouble)
    })

    val streamingTest = new StreamingTest()
      .setPeacePeriod(0)
      .setWindowSize(0)
      .setTestMethod("welch")

    val out = streamingTest.registerStream(data)
    out.print()
    // $example off$

    // Stop processing if test becomes significant or we time out
    var timeoutCounter = numBatchesTimeout
    out.foreachRDD { rdd =>
      timeoutCounter -= 1
      val anySignificant = rdd.map(_.pValue < 0.05).fold(false)(_ || _)
      if (timeoutCounter == 0 || anySignificant) rdd.context.stop()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 7
Source File: StreamingKMeansExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}
// $example off$

object StreamingKMeansExample {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      System.exit(1)
    }

    // $example on$
    val conf = new SparkConf().setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setK(args(3).toInt)
      .setDecayFactor(1.0)
      .setRandomCenters(args(4).toInt, 0.0)

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
    // $example off$
  }
}
// scalastyle:on println
Example 8
Source File: QueueStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object QueueStream {

  def main(args: Array[String]) {

    StreamingExamples.setStreamingLogLevels()
    val sparkConf = new SparkConf().setAppName("QueueStream")
    // Create the context
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create the queue through which RDDs can be pushed to
    // a QueueInputDStream
    val rddQueue = new Queue[RDD[Int]]()

    // Create the QueueInputDStream and use it do some processing
    val inputStream = ssc.queueStream(rddQueue)
    val mappedStream = inputStream.map(x => (x % 10, 1))
    val reducedStream = mappedStream.reduceByKey(_ + _)
    reducedStream.print()
    ssc.start()

    // Create and push some RDDs into rddQueue
    for (i <- 1 to 30) {
      rddQueue.synchronized {
        rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10)
      }
      Thread.sleep(1000)
    }
    ssc.stop()
  }
}
Example 9
Source File: CustomReceiver.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      logInfo("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      logInfo("Connected to " + host + ":" + port)
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8))
      userInput = reader.readLine()
      while(!isStopped && userInput != null) {
        store(userInput)
        userInput = reader.readLine()
      }
      reader.close()
      socket.close()
      logInfo("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
// scalastyle:on println
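The listing above shows only the receive() helper; the enclosing receiver class, which supplies host, port, store(), isStopped, and restart(), and the driver program are not part of the captured snippet. A minimal wrapper, assuming the shape of Spark's standard custom-receiver example (the storage level and threading choices are that example's, not confirmed from this listing), would look roughly like this, with the receive() method above slotted in:

class CustomReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging {

  def onStart() {
    // Receive on a separate thread so onStart() returns immediately
    new Thread("Socket Receiver") {
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // Nothing to clean up: receive() exits on its own once isStopped returns true
  }

  // private def receive() { ... }   <- the method shown in the listing above
}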
Example 10
Source File: HdfsWordCount.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object HdfsWordCount {
  def main(args: Array[String]) {
    if (args.length < 1) {
      System.err.println("Usage: HdfsWordCount <directory>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()
    val sparkConf = new SparkConf().setAppName("HdfsWordCount")
    // Create the context
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create the FileInputDStream on the directory and use the
    // stream to count words in new files created
    val lines = ssc.textFileStream(args(0))
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 11
Source File: NetworkWordCount.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NetworkWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: NetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // Create the context with a 1 second batch size
    val sparkConf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create a socket stream on target ip:port and count the
    // words in input stream of \n delimited text (eg. generated by 'nc')
    // Note that no duplication in storage level only for running locally.
    // Replication necessary in distributed scenario for fault tolerance.
    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 12
Source File: KinesisInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo

private[kinesis] class KinesisInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
}
Example 13
Source File: KafkaStreamSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import scala.collection.mutable
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Random

import kafka.serializer.StringDecoder
import org.scalatest.BeforeAndAfterAll
import org.scalatest.concurrent.Eventually

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll {
  private var ssc: StreamingContext = _
  private var kafkaTestUtils: KafkaTestUtils = _

  override def beforeAll(): Unit = {
    kafkaTestUtils = new KafkaTestUtils
    kafkaTestUtils.setup()
  }

  override def afterAll(): Unit = {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }

    if (kafkaTestUtils != null) {
      kafkaTestUtils.teardown()
      kafkaTestUtils = null
    }
  }

  test("Kafka input stream") {
    val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName)
    ssc = new StreamingContext(sparkConf, Milliseconds(500))
    val topic = "topic1"
    val sent = Map("a" -> 5, "b" -> 3, "c" -> 10)
    kafkaTestUtils.createTopic(topic)
    kafkaTestUtils.sendMessages(topic, sent)

    val kafkaParams = Map("zookeeper.connect" -> kafkaTestUtils.zkAddress,
      "group.id" -> s"test-consumer-${Random.nextInt(10000)}",
      "auto.offset.reset" -> "smallest")

    val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY)
    val result = new mutable.HashMap[String, Long]()
    stream.map(_._2).countByValue().foreachRDD { r =>
      r.collect().foreach { kv =>
        result.synchronized {
          val count = result.getOrElseUpdate(kv._1, 0) + kv._2
          result.put(kv._1, count)
        }
      }
    }

    ssc.start()

    eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
      assert(result.synchronized { sent === result })
    }
  }
}
Example 14
Source File: FlumeInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume

import java.io.{Externalizable, ObjectInput, ObjectOutput}
import java.net.InetSocketAddress
import java.nio.ByteBuffer
import java.util.concurrent.Executors

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.avro.ipc.NettyServer
import org.apache.avro.ipc.specific.SpecificResponder
import org.apache.flume.source.avro.{AvroFlumeEvent, AvroSourceProtocol, Status}
import org.jboss.netty.channel.{ChannelPipeline, ChannelPipelineFactory, Channels}
import org.jboss.netty.channel.socket.nio.NioServerSocketChannelFactory
import org.jboss.netty.handler.codec.compression._

import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream._
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.util.Utils

private[streaming]
class FlumeInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    host: String,
    port: Int,
    storageLevel: StorageLevel,
    enableDecompression: Boolean
  ) extends ReceiverInputDStream[SparkFlumeEvent](_ssc) {

  override def getReceiver(): Receiver[SparkFlumeEvent] = {
    new FlumeReceiver(host, port, storageLevel, enableDecompression)
  }
}

private[streaming]
class CompressionChannelPipelineFactory extends ChannelPipelineFactory {
  def getPipeline(): ChannelPipeline = {
    val pipeline = Channels.pipeline()
    val encoder = new ZlibEncoder(6)
    pipeline.addFirst("deflater", encoder)
    pipeline.addFirst("inflater", new ZlibDecoder())
    pipeline
  }
}
Example 15
Source File: FlumeStreamSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume

import java.util.concurrent.ConcurrentLinkedQueue

import scala.collection.JavaConverters._
import scala.concurrent.duration._
import scala.language.postfixOps

import org.jboss.netty.channel.ChannelPipeline
import org.jboss.netty.channel.socket.SocketChannel
import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory
import org.jboss.netty.handler.codec.compression._
import org.scalatest.{BeforeAndAfter, Matchers}
import org.scalatest.concurrent.Eventually._

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.internal.Logging
import org.apache.spark.network.util.JavaUtils
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream}

class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging {
  val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeStreamSuite")
  var ssc: StreamingContext = null

  test("flume input stream") {
    testFlumeStream(testCompression = false)
  }

  test("flume input compressed stream") {
    testFlumeStream(testCompression = true)
  }

  private class CompressionChannelFactory(compressionLevel: Int)
    extends NioClientSocketChannelFactory {

    override def newChannel(pipeline: ChannelPipeline): SocketChannel = {
      val encoder = new ZlibEncoder(compressionLevel)
      pipeline.addFirst("deflater", encoder)
      pipeline.addFirst("inflater", new ZlibDecoder())
      super.newChannel(pipeline)
    }
  }
}
Example 16
Source File: QueueInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{StreamingContext, Time}

private[streaming]
class QueueInputDStream[T: ClassTag](
    ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(context.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }
}
Example 17
Source File: SocketInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import java.io._
import java.net.{ConnectException, Socket}
import java.nio.charset.StandardCharsets

import scala.reflect.ClassTag
import scala.util.control.NonFatal

import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.util.NextIterator

private[streaming]
class SocketInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[T](_ssc) {

  def getReceiver(): Receiver[T] = {
    new SocketReceiver(host, port, bytesToObjects, storageLevel)
  }
}

private[streaming]
class SocketReceiver[T: ClassTag](
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends Receiver[T](storageLevel) with Logging {

  private var socket: Socket = _

  def onStart() {
    logInfo(s"Connecting to $host:$port")
    try {
      socket = new Socket(host, port)
    } catch {
      case e: ConnectException =>
        restart(s"Error connecting to $host:$port", e)
        return
    }
    logInfo(s"Connected to $host:$port")

    // Start the thread that receives data over a connection
    new Thread("Socket Receiver") {
      setDaemon(true)
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // in case restart thread close it twice
    synchronized {
      if (socket != null) {
        socket.close()
        socket = null
        logInfo(s"Closed socket to $host:$port")
      }
    }
  }

  def bytesToLines(inputStream: InputStream): Iterator[String] = {
    val dataInputStream = new BufferedReader(
      new InputStreamReader(inputStream, StandardCharsets.UTF_8))
    new NextIterator[String] {
      protected override def getNext() = {
        val nextValue = dataInputStream.readLine()
        if (nextValue == null) {
          finished = true
        }
        nextValue
      }

      protected override def close() {
        dataInputStream.close()
      }
    }
  }
}
Example 18
Source File: StreamingTab.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui

import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.ui.{SparkUI, SparkUITab}

private[spark] class StreamingTab(val ssc: StreamingContext)
  extends SparkUITab(StreamingTab.getSparkUI(ssc), "streaming") with Logging {

  import StreamingTab._

  private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static"

  val parent = getSparkUI(ssc)
  val listener = ssc.progressListener

  ssc.addStreamingListener(listener)
  ssc.sc.addSparkListener(listener)
  attachPage(new StreamingPage(this))
  attachPage(new BatchPage(this))

  def attach() {
    getSparkUI(ssc).attachTab(this)
    getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming")
  }

  def detach() {
    getSparkUI(ssc).detachTab(this)
    getSparkUI(ssc).removeStaticHandler("/static/streaming")
  }
}

private object StreamingTab {
  def getSparkUI(ssc: StreamingContext): SparkUI = {
    ssc.sc.ui.getOrElse {
      throw new SparkException("Parent SparkUI to attach this tab to not found!")
    }
  }
}
Example 19
Source File: InputInfoTrackerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import org.scalatest.BeforeAndAfter

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}

class InputInfoTrackerSuite extends SparkFunSuite with BeforeAndAfter {

  private var ssc: StreamingContext = _

  before {
    val conf = new SparkConf().setMaster("local[2]").setAppName("DirectStreamTacker")
    if (ssc == null) {
      ssc = new StreamingContext(conf, Duration(1000))
    }
  }

  after {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }
  }

  test("test report and get InputInfo from InputInfoTracker") {
    val inputInfoTracker = new InputInfoTracker(ssc)

    val streamId1 = 0
    val streamId2 = 1
    val time = Time(0L)
    val inputInfo1 = StreamInputInfo(streamId1, 100L)
    val inputInfo2 = StreamInputInfo(streamId2, 300L)
    inputInfoTracker.reportInfo(time, inputInfo1)
    inputInfoTracker.reportInfo(time, inputInfo2)

    val batchTimeToInputInfos = inputInfoTracker.getInfo(time)
    assert(batchTimeToInputInfos.size == 2)
    assert(batchTimeToInputInfos.keys === Set(streamId1, streamId2))
    assert(batchTimeToInputInfos(streamId1) === inputInfo1)
    assert(batchTimeToInputInfos(streamId2) === inputInfo2)
    assert(inputInfoTracker.getInfo(time)(streamId1) === inputInfo1)
  }

  test("test cleanup InputInfo from InputInfoTracker") {
    val inputInfoTracker = new InputInfoTracker(ssc)

    val streamId1 = 0
    val inputInfo1 = StreamInputInfo(streamId1, 100L)
    val inputInfo2 = StreamInputInfo(streamId1, 300L)
    inputInfoTracker.reportInfo(Time(0), inputInfo1)
    inputInfoTracker.reportInfo(Time(1), inputInfo2)

    inputInfoTracker.cleanup(Time(0))
    assert(inputInfoTracker.getInfo(Time(0))(streamId1) === inputInfo1)
    assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2)

    inputInfoTracker.cleanup(Time(1))
    assert(inputInfoTracker.getInfo(Time(0)).get(streamId1) === None)
    assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2)
  }
}
Example 20
Source File: SubscriberListener.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.spark.spark

import org.apache.spark.Logging
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerReceiverError, StreamingListenerReceiverStarted, StreamingListenerReceiverStopped}

class SubscriberListener(ssc: StreamingContext) extends StreamingListener with Logging {
  override def onReceiverError(receiverError: StreamingListenerReceiverError): Unit = {
    logInfo("onReceiverError")
  }

  override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = {
    logInfo("onReceiverStarted")
  }

  override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped): Unit = {
    logInfo("onReceiverStopped")
    ssc.stop()
  }
}
Example 21
Source File: StreamHelper.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import kafka.KafkaHelper
import kafka.common.TopicAndPartition
import kafka.consumer.PartitionTopicInfo
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.{Logging, SparkException}

import scala.reflect.ClassTag

case class StreamHelper(kafkaParams: Map[String, String]) extends Logging {
  // helper for kafka zookeeper
  lazy val kafkaHelper = KafkaHelper(kafkaParams)
  lazy val kc = new KafkaCluster(kafkaParams)

  // 1. get leader's earliest and latest offset
  // 2. get consumer offset
  // 3-1. if (2) is bounded in (1) use (2) for stream
  // 3-2. else use (1) by "auto.offset.reset"
  private def getStartOffsets(topics: Set[String]): Map[TopicAndPartition, Long] = {
    lazy val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
    lazy val consumerOffsets = kafkaHelper.getConsumerOffsets(topics.toSeq)

    {
      for {
        topicPartitions <- kc.getPartitions(topics).right
        smallOffsets <- kc.getEarliestLeaderOffsets(topicPartitions).right
        largeOffsets <- kc.getLatestLeaderOffsets(topicPartitions).right
      } yield {
        {
          for {
            tp <- topicPartitions
          } yield {
            val co = consumerOffsets.getOrElse(tp, PartitionTopicInfo.InvalidOffset)
            val so = smallOffsets.get(tp).map(_.offset).get
            val lo = largeOffsets.get(tp).map(_.offset).get

            logWarning(s"$tp: $co $so $lo")

            if (co >= so && co <= lo) {
              (tp, co)
            } else {
              (tp, reset match {
                case Some("smallest") => so
                case _ => lo
              })
            }
          }
        }.toMap
      }
    }.fold(errs => throw new SparkException(errs.mkString("\n")), ok => ok)
  }

  def createStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](
      ssc: StreamingContext, topics: Set[String]): InputDStream[(K, V)] = {
    type R = (K, V)
    val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key(), mmd.message())

    kafkaHelper.registerConsumerInZK(topics)

    new DirectKafkaInputDStream[K, V, KD, VD, R](ssc, kafkaParams, getStartOffsets(topics), messageHandler)
  }

  def commitConsumerOffsets(offsets: HasOffsetRanges): Unit = {
    val offsetsMap = {
      for {
        range <- offsets.offsetRanges if range.fromOffset < range.untilOffset
      } yield {
        logDebug(range.toString())
        TopicAndPartition(range.topic, range.partition) -> range.untilOffset
      }
    }.toMap

    kafkaHelper.commitConsumerOffsets(offsetsMap)
  }

  def commitConsumerOffset(range: OffsetRange): Unit = {
    if (range.fromOffset < range.untilOffset) {
      try {
        val tp = TopicAndPartition(range.topic, range.partition)
        logDebug("Committed offset " + range.untilOffset + " for topic " + tp)
        kafkaHelper.commitConsumerOffset(tp, range.untilOffset)
      } catch {
        case t: Throwable =>
          // log it and let it go
          logWarning("exception during commitOffsets", t)
          throw t
      }
    }
  }

  def commitConsumerOffsets[R](stream: InputDStream[R]): Unit = {
    stream.foreachRDD { rdd =>
      commitConsumerOffsets(rdd.asInstanceOf[HasOffsetRanges])
    }
  }
}
Example 22
Source File: TestStreamingSpec.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.spark

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.specs2.mutable.Specification
import org.specs2.specification.BeforeAfterAll

class TestStreamingSpec extends Specification with BeforeAfterAll {
  private val master = "local[2]"
  private val appName = "test_streaming"
  private val batchDuration = Seconds(1)

  private var sc: SparkContext = _
  private var ssc: StreamingContext = _

  override def beforeAll(): Unit = {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)

    ssc = new StreamingContext(conf, batchDuration)
    sc = ssc.sparkContext
  }

  override def afterAll(): Unit = {
    if (ssc != null) {
      ssc.stop()
    }
  }
}
Example 23
Source File: FlumeStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStreamingStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.flume._

class FlumeStream extends ConfigurableStreamingStop {
  override var batchDuration: Int = _
  override val authorEmail: String = "[email protected]"
  override val description: String = "Get data from flume"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  var hostname: String = _
  var port: Int = _

  override def setProperties(map: Map[String, Any]): Unit = {
    hostname = MapUtil.get(map, key = "hostname").asInstanceOf[String]
    port = MapUtil.get(map, key = "port").asInstanceOf[String].toInt
    val timing = MapUtil.get(map, key = "batchDuration")
    batchDuration = if (timing == None) new Integer(1) else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("hostname of the slave machine to which the flume data will be sent, the hostName must be one of the cluster worker node").defaultValue("").required(true)
    val port = new PropertyDescriptor().name("port").displayName("port").description("Port of the slave machine to which the flume data will be sent, the port should be greater than 10000").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    descriptor = hostname :: descriptor
    descriptor = port :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/FlumeStream.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val flumeStream = FlumeUtils.createStream(ssc, hostname, port)
    flumeStream.map(e => new String(e.event.getBody.array(), "UTF-8"))
  }

  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {}
}
Example 24
Source File: SocketTextStreamByWindow.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming

import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

class SocketTextStreamByWindow extends ConfigurableStreamingStop {
  override val authorEmail: String = "[email protected]"
  override val description: String = "Receive text data from socket by window"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)
  override var batchDuration: Int = _

  var hostname: String = _
  var port: String = _
  var windowDuration: Int = _
  var slideDuration: Int = _

  override def setProperties(map: Map[String, Any]): Unit = {
    hostname = MapUtil.get(map, key = "hostname").asInstanceOf[String]
    port = MapUtil.get(map, key = "port").asInstanceOf[String]
    windowDuration = MapUtil.get(map, key = "windowDuration").asInstanceOf[String].toInt
    slideDuration = MapUtil.get(map, key = "slideDuration").asInstanceOf[String].toInt
    val timing = MapUtil.get(map, key = "batchDuration")
    batchDuration = if (timing == None) new Integer(1) else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname to connect to for receiving data").defaultValue("").required(true)
    val port = new PropertyDescriptor().name("port").displayName("port").description("Port to connect to for receiving data").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    val windowDuration = new PropertyDescriptor().name("windowDuration").displayName("windowDuration").description("the window duration, the unit is seconds").defaultValue("").required(true)
    val slideDuration = new PropertyDescriptor().name("slideDuration").displayName("slideDuration").description("the slide duration, the unit is seconds").defaultValue("").required(true)
    descriptor = hostname :: descriptor
    descriptor = port :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor = windowDuration :: descriptor
    descriptor = slideDuration :: descriptor
    descriptor
  }

  //TODO: change icon
  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/SocketTextStreamByWindow.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
  }

  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val dstream = ssc.socketTextStream(hostname, Integer.parseInt(port))
    dstream.window(Seconds(windowDuration), Seconds(slideDuration))
    //dstream.reduceByWindow(_ + _, Seconds(windowDuration), Seconds(slideDuration))
  }
}
Example 25
Source File: TextFileStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStreamingStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream

class TextFileStream extends ConfigurableStreamingStop {
  override var batchDuration: Int = _
  override val authorEmail: String = "[email protected]"
  override val description: String = "Get text file streaming data"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  var directory: String = _

  override def setProperties(map: Map[String, Any]): Unit = {
    directory = MapUtil.get(map, key = "directory").asInstanceOf[String]
    val timing = MapUtil.get(map, key = "batchDuration")
    batchDuration = if (timing == None) new Integer(1) else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val directory = new PropertyDescriptor().name("directory").displayName("directory").description("HDFS directory to monitor for new file. Files must be written to the monitored directory by \"moving\" them from another location within the same file system").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    descriptor = directory :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor
  }

  //TODO: change icon
  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/TextFileStream.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val dstream = ssc.textFileStream(directory)
    dstream
  }

  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {}
}
Example 26
Source File: KafkaStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStreamingStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

class KafkaStream extends ConfigurableStreamingStop {
  override var batchDuration: Int = _
  override val authorEmail: String = "[email protected]"
  override val description: String = "Read data from kafka"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  var brokers: String = _
  var groupId: String = _
  var topics: Array[String] = _

  override def setProperties(map: Map[String, Any]): Unit = {
    brokers = MapUtil.get(map, key = "brokers").asInstanceOf[String]
    groupId = MapUtil.get(map, key = "groupId").asInstanceOf[String]
    topics = MapUtil.get(map, key = "topics").asInstanceOf[String].split(",").map(x => x.trim)
    val timing = MapUtil.get(map, key = "batchDuration")
    batchDuration = if (timing == None) new Integer(1) else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val brokers = new PropertyDescriptor().name("brokers").displayName("brokers").description("kafka brokers, seperated by ','").defaultValue("").required(true)
    val groupId = new PropertyDescriptor().name("groupId").displayName("groupId").description("kafka consumer group").defaultValue("group").required(true)
    val topics = new PropertyDescriptor().name("topics").displayName("topics").description("kafka topics").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    descriptor = brokers :: descriptor
    descriptor = groupId :: descriptor
    descriptor = topics :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/KafkaStream.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    stream.map(record => record.key() + "," + record.value())
    //stream.asInstanceOf[DStream[ConsumerRecord]]
  }

  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {}
}
Example 27
Source File: SocketTextStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream, SocketReceiver}
import org.apache.spark.streaming.{Seconds, StreamingContext}

class SocketTextStream extends ConfigurableStreamingStop {
  override val authorEmail: String = "[email protected]"
  override val description: String = "Receive text data from socket"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)
  override var batchDuration: Int = _

  var hostname: String = _
  var port: String = _
  //var schema: String = _

  override def setProperties(map: Map[String, Any]): Unit = {
    hostname = MapUtil.get(map, key = "hostname").asInstanceOf[String]
    port = MapUtil.get(map, key = "port").asInstanceOf[String]
    //schema = MapUtil.get(map, key = "schema").asInstanceOf[String]
    val timing = MapUtil.get(map, key = "batchDuration")
    batchDuration = if (timing == None) new Integer(1) else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname to connect to for receiving data").defaultValue("").required(true)
    val port = new PropertyDescriptor().name("port").displayName("port").description("Port to connect to for receiving data").defaultValue("").required(true)
    //val schema = new PropertyDescriptor().name("schema").displayName("schema").description("data schema").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true)
    descriptor = hostname :: descriptor
    descriptor = port :: descriptor
    //descriptor = schema :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor
  }

  //TODO: change icon
  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/SocketTextStream.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()

    val socketDF = spark
      .readStream
      .format("socket")
      .option("host", hostname)
      .option("port", port)
      .load()

    out.write(socketDF)
  }

  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val dstream = ssc.socketTextStream(hostname, Integer.parseInt(port))
    dstream.asInstanceOf[DStream[String]]
  }
}
Example 28
Source File: StreamingLogisticRegression.scala From AI with Apache License 2.0 | 5 votes |
// scalastyle:off println
package com.bigchange.mllib

import com.bigchange.util.{FileUtil, TimeUtil}
import org.apache.spark.SparkConf
import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingLogisticRegression {

  def main(args: Array[String]) {

    if (args.length != 4) {
      System.err.println(
        "Usage: StreamingLogisticRegression <trainingDir> <testDir> <batchDuration> <numFeatures>")
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingLogisticRegression")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingLogisticRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(args(3).toInt))

    model.trainOn(trainingData)
    // model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()
    model.predictOnValues(testData.map(lp => (lp.label, lp.features)))
      .map(x => x._1 + "\t" + x._2)
      .foreachRDD(rdd => {
        val value = rdd.collect()
        FileUtil.normalFileWriter("F:\\datatest\\ai\\StreamingLogisticRegression\\" + TimeUtil.getCurrentHour, value)
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 29
Source File: MonitorHDFSDirFiles.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.basic

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object MonitorHDFSDirFiles {

  def main(args: Array[String]) {
    if (args.length < 1) {
      System.err.println("Usage: <directory>")
      System.exit(1)
    }

    val sparkConf = new SparkConf().setAppName("MonitorHDFSDirFiles")
    // Create the context
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create the FileInputDStream on the directory and use the
    // stream to count words in new files created
    val lines = ssc.textFileStream(args(0))
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _).foreachRDD(rdd => {
      val arr = rdd.collect()
      arr.foreach(println)
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 30
Source File: KafkaWordCount.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.basic

import java.util

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaWordCount {

  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println("Usage: <zkQuorum> <group> <topics> <numThreads>")
      System.exit(1)
    }

    val Array(zkQuorum, group, topics, numThreads) = args
    val sparkConf = new SparkConf().setAppName("KafkaWordCount").
      set("spark.streaming.receiver.writeAheadLog.enable", "true").
      set("spark.streaming.kafka.maxRatePerPartition", "1000")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Set a checkpoint directory; the window operation below requires checkpointing
    ssc.checkpoint("checkpoint")

    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap

    // createStream returns a Tuple2 of (key, value); only the value is used here.
    // Note that this is the receiver-based approach (a non-receiver, direct mode is also available).
    // With the default configuration it can lose data if the receiver dies, so the write-ahead log
    // is enabled above; the storage level can be adjusted accordingly.
    val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap,
      StorageLevel.MEMORY_AND_DISK_SER).map(_._2)
    val words = lines.flatMap(_.split(" "))

    // Count words over a 10-second window, sliding every 2 seconds
    val wordCounts = words.map(x => (x, 1L))
      .reduceByKeyAndWindow(_ + _, _ - _, Seconds(10), Seconds(2), 2)
      .filter(x => x._2 > 0)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}

// Produces some random words between 1 and 100.
object KafkaWordCountProducer {

  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println("Usage: <metadataBrokerList> <topic> " +
        "<messagesPerSec> <wordsPerMessage>")
      System.exit(1)
    }

    // Note that this is the broker list, in host:port,host:port form
    val Array(brokers, topic, messagesPerSec, wordsPerMessage) = args

    // Kafka producer connection properties
    val props = new util.HashMap[String, Object]()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)

    // Send some messages
    while (true) {
      (1 to messagesPerSec.toInt).foreach { messageNum =>
        val str = (1 to wordsPerMessage.toInt).map(x => scala.util.Random.nextInt(100).toString)
          .mkString(" ")

        val message = new ProducerRecord[String, String](topic, null, str)
        producer.send(message)
      }

      Thread.sleep(1000)
    }
  }
}
Example 31
Source File: StreamingSimpleModel.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.streaming

import breeze.linalg.DenseVector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingSimpleModel {

  def main(args: Array[String]) {

    val ssc = new StreamingContext("local", "test", Seconds(10))
    val stream = ssc.socketTextStream("localhost", 9999)

    val numberFeatures = 100
    val zeroVector = DenseVector.zeros[Double](numberFeatures)
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.dense(zeroVector.data))
      .setNumIterations(1)
      .setStepSize(0.01)

    val labeledStream = stream.map { event =>
      val split = event.split("\t")
      val y = split(0).toDouble
      val features = split(1).split(",").map(_.toDouble)
      LabeledPoint(label = y, features = Vectors.dense(features))
    }

    model.trainOn(labeledStream)

    // Use a DStream transform to score each batch with the latest model
    val predictAndTrue = labeledStream.transform { rdd =>
      val latestModel = model.latestModel()
      rdd.map { point =>
        val predict = latestModel.predict(point.features)
        predict - point.label
      }
    }

    // Compute the MSE
    predictAndTrue.foreachRDD { rdd =>
      val mse = rdd.map(x => x * x).mean()
      val rmse = math.sqrt(mse)
      println(s"current batch, MSE: $mse, RMSE: $rmse")
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 32
Source File: StreamingKVExample.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.examples.streaming

import java.util.UUID

import kafka.serializer.StringDecoder
import com.basho.riak.spark._
import com.basho.riak.spark.streaming._
import com.basho.riak.spark.util.RiakObjectConversionUtil
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object StreamingKVExample {

  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf(true)
      .setAppName("Simple Spark Streaming to Riak KV Demo")

    setSparkOpt(sparkConf, "spark.master", "local")
    setSparkOpt(sparkConf, "spark.riak.connection.host", "127.0.0.1:8087")
    setSparkOpt(sparkConf, "kafka.broker", "127.0.0.1:9092")

    val sc = new SparkContext(sparkConf)
    val streamCtx = new StreamingContext(sc, Durations.seconds(15))

    val kafkaProps = Map[String, String](
      "metadata.broker.list" -> sparkConf.get("kafka.broker"),
      "client.id" -> UUID.randomUUID().toString
    )

    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](streamCtx, kafkaProps,
      Set[String]("ingest-kv")
    ) map { case (key, value) =>
      val obj = RiakObjectConversionUtil.to(value)
      obj.setContentType("application/json")
      obj
    } saveToRiak "test-data"

    streamCtx.start()
    println("Spark streaming context started. Spark UI could be found at http://SPARK_MASTER_HOST:4040")
    println("NOTE: if you're running job on the 'local' master open http://localhost:4040")
    streamCtx.awaitTermination()
  }

  private def setSparkOpt(sparkConf: SparkConf, option: String, defaultOptVal: String): SparkConf = {
    val optval = sparkConf.getOption(option).getOrElse(defaultOptVal)
    sparkConf.set(option, optval)
  }
}
Example 33
Source File: StreamingTSExample.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.examples.streaming

import java.util.UUID

import kafka.serializer.StringDecoder
import org.apache.spark.sql.Row
import org.apache.spark.streaming.Durations
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.{SparkConf, SparkContext}
import com.basho.riak.spark.streaming._
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.joda.time.DateTime
import org.joda.time.format.DateTimeFormat

object StreamingTSExample {

  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf(true)
      .setAppName("Simple Spark Streaming to Riak TS Demo")

    setSparkOpt(sparkConf, "spark.master", "local")
    setSparkOpt(sparkConf, "spark.riak.connection.host", "127.0.0.1:8087")
    setSparkOpt(sparkConf, "kafka.broker", "127.0.0.1:9092")

    val sc = new SparkContext(sparkConf)
    val streamCtx = new StreamingContext(sc, Durations.seconds(15))

    val kafkaProps = Map[String, String](
      "metadata.broker.list" -> sparkConf.get("kafka.broker"),
      "client.id" -> UUID.randomUUID().toString
    )

    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](streamCtx, kafkaProps,
      Set[String]("ingest-ts")
    ) map { case (key, value) =>
      val mapper = new ObjectMapper()
      mapper.registerModule(DefaultScalaModule)
      val wr = mapper.readValue(value, classOf[Map[String, String]])
      Row(
        wr("weather"),
        wr("family"),
        DateTime.parse(wr("time"), DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")).getMillis,
        wr("temperature"),
        wr("humidity"),
        wr("pressure"))
    } saveToRiakTS "ts_weather_demo"

    streamCtx.start()
    println("Spark streaming context started. Spark UI could be found at http://SPARK_MASTER_HOST:4040")
    println("NOTE: if you're running job on the 'local' master open http://localhost:4040")
    streamCtx.awaitTermination()
  }

  private def setSparkOpt(sparkConf: SparkConf, option: String, defaultOptVal: String): SparkConf = {
    val optval = sparkConf.getOption(option).getOrElse(defaultOptVal)
    sparkConf.set(option, optval)
  }
}
Example 34
Source File: RiakTSStreamingRDD.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.streaming

import com.basho.riak.spark.rdd.connector.RiakConnector
import com.basho.riak.spark.rdd.{ReadConf, RiakTSRDD}
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.StreamingContext

import scala.reflect.ClassTag

class RiakTSStreamingRDD[R] private[spark](
    ssc: StreamingContext,
    connector: RiakConnector,
    bucketName: String,
    schema: Option[StructType] = None,
    columnNames: Option[Seq[String]] = None,
    whereConstraints: Option[(String, Seq[Any])] = None,
    filters: Array[Filter] = Array(),
    tsRangeFieldName: Option[String] = None,
    quantum: Option[Long] = None,
    query: Option[String] = None,
    readConf: ReadConf = ReadConf())(
    implicit ct: ClassTag[R])
  extends RiakTSRDD[R](
    sc = ssc.sparkContext,
    connector = connector,
    bucketName = bucketName,
    schema = schema,
    columnNames = columnNames,
    whereConstraints = whereConstraints,
    filters = filters,
    tsRangeFieldName = tsRangeFieldName,
    quantum = quantum,
    query = query,
    readConf = readConf)
Example 35
Source File: RiakStreamingRDD.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.streaming import com.basho.riak.spark.query.QueryData import com.basho.riak.spark.rdd.{ReadConf, RiakRDD} import com.basho.riak.spark.rdd.connector.RiakConnector import com.basho.riak.spark.rdd.mapper.ReadDataMapperFactory import org.apache.spark.streaming.StreamingContext import scala.reflect.ClassTag class RiakStreamingRDD[R] private[spark]( ssc: StreamingContext, connector: RiakConnector, bucketType: String, bucketName: String, queryData: Option[QueryData[_]] = None, readConf: ReadConf = ReadConf())( implicit ct: ClassTag[R], @transient rdmf: ReadDataMapperFactory[R]) extends RiakRDD[R]( sc = ssc.sparkContext, connector = connector, bucketType = bucketType, bucketName = bucketName, queryData = queryData, readConf = readConf)
Example 36
Source File: StreamingContextFunctions.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.streaming import com.basho.riak.client.core.query.Namespace import com.basho.riak.spark.SparkContextFunctions import com.basho.riak.spark.rdd.{ReadConf, RiakRDD, RiakTSRDD} import com.basho.riak.spark.rdd.connector.RiakConnector import com.basho.riak.spark.rdd.mapper.ReadDataMapperFactory import org.apache.spark.sql.types.StructType import org.apache.spark.streaming.StreamingContext import scala.reflect.ClassTag class StreamingContextFunctions(ssc: StreamingContext) extends SparkContextFunctions(ssc.sparkContext) { override def riakTSTable[T](bucketName: String, readConf: ReadConf, schema: Option[StructType] )(implicit ct: ClassTag[T], connector: RiakConnector ): RiakTSRDD[T] = new RiakTSStreamingRDD[T](ssc, connector, bucketName, schema) override def riakBucket[T](bucketName: String, bucketType: String )(implicit connector: RiakConnector, ct: ClassTag[T], rdmf: ReadDataMapperFactory[T] ): RiakRDD[T] = new RiakStreamingRDD[T](ssc, connector, bucketType, bucketName, readConf = ReadConf(ssc.sparkContext.getConf)) override def riakBucket[T](ns: Namespace )(implicit ct: ClassTag[T], rdmf: ReadDataMapperFactory[T] ): RiakRDD[T] = riakBucket(ns.getBucketNameAsString, ns.getBucketTypeAsString) }
Example 37
Source File: SparkStreamingFixture.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.streaming import org.apache.spark.{Logging, SparkContext} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.junit.{After, Before} trait SparkStreamingFixture extends Logging { protected var sc: SparkContext protected var ssc: StreamingContext = _ protected val batchDuration = Seconds(1) @Before def startStreamingContext(): Unit = { ssc = new StreamingContext(sc, batchDuration) logInfo("Streaming context created") } @After def stopStreamingContext(): Unit = { Option(ssc).foreach(_.stop()) logInfo("Streaming context stopped") } }
Example 38
Source File: SparkStreaming_6_KafkaDirectStream.scala From HadoopLearning with MIT License | 5 votes |
package com.c503.streaming import com.utils.{ConfManager, SparkConf} import org.apache.spark.streaming.kafka010._ import org.apache.spark.streaming.{Seconds, StreamingContext} // process the records dataStream.foreachRDD(rdd => { rdd.foreach(partition => { var msg = "topic=" + partition.topic() + "\n" msg += "partition=" + partition.partition() + "\n" msg += "offset=" + partition.offset() + "\n" msg += "timestamp=" + partition.timestamp() + "\n" msg += "checksum=" + partition.checksum() + "\n" msg += "key=" + partition.key() + "\n" msg += "value=" + partition.value() + "\n" println(msg) }) // manually manage the Kafka offsets dataStream.asInstanceOf[CanCommitOffsets].commitAsync(rdd.asInstanceOf[HasOffsetRanges].offsetRanges) }) context.start() context.awaitTermination() } }
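The listing above appears truncated: the enclosing object, the StreamingContext creation, and the definition of dataStream are missing, and the com.utils.ConfManager / SparkConf helpers it imports are project-specific and not shown. A minimal sketch of the missing setup, using the stock org.apache.spark.SparkConf and placeholder broker, topic, and group id values (the kafka010 imports in the listing suggest the 0.10 direct stream API):

// Assumed reconstruction of the truncated setup; broker, topic and group id are placeholders,
// and the stock SparkConf is used instead of the project-specific com.utils helpers.
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkStreaming_6_KafkaDirectStream {
  def main(args: Array[String]): Unit = {
    val conf = new org.apache.spark.SparkConf()
      .setAppName("SparkStreaming_6_KafkaDirectStream")
      .setMaster("local[2]")
    val context = new StreamingContext(conf, Seconds(5))

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",            // placeholder broker
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "demo-group",                         // placeholder group id
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)  // offsets are committed manually via commitAsync
    )

    // dataStream is the DStream that the foreachRDD / commitAsync block in the listing operates on
    val dataStream = KafkaUtils.createDirectStream[String, String](
      context,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Seq("demo-topic"), kafkaParams))

    // ... foreachRDD and commitAsync logic from the listing above ...
  }
}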
Example 39
Source File: SparkStreaming_1_1_local_TextFile.scala From HadoopLearning with MIT License | 5 votes |
package com.c503.streaming import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} object SparkStreaming_1_1_local_TextFile { def main(args: Array[String]): Unit = { val sparkConf = new SparkConf().setAppName("SparkStreaming_1_1_local_TextFile").setMaster("local[2]") val sc = new StreamingContext(sparkConf, Seconds(1)) val lines = sc.textFileStream("/Users/liuxm/A_study/idea_ws/mapreduce/") println(lines) val words = lines.flatMap(_.split(" ")) val pairs = words.map((_, 1)) val wordCounts = pairs.reduceByKey(_ + _) wordCounts.foreachRDD(rdd => { println("*" * 30) rdd.sortBy(x => x._2, false).foreach(e => { println(e) }) }) sc.start() sc.awaitTermination() } }
Example 40
Source File: Streaming.scala From scala-spark-cab-rides-predictions with MIT License | 5 votes |
import com.amazonaws.services.dynamodbv2.document.internal.InternalUtils import com.amazonaws.services.dynamodbv2.streamsadapter.model.RecordAdapter import com.amazonaws.services.kinesis.model.Record import com.google.gson.Gson import org.apache.spark.sql._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.kinesis.dynamostream.KinesisInitialPositions.Latest import org.apache.spark.streaming.kinesis.dynamostream.KinesisInputDStream import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext} object Trials extends App { import org.apache.log4j.{Level, Logger} Logger.getLogger("org").setLevel(Level.ERROR) Logger.getLogger("akka").setLevel(Level.ERROR) //session setup System.setProperty("hadoop.home.dir", "C:\\winutils") val sparkSession = SparkSession.builder() .master("local[*]") .appName("test") .getOrCreate() val sc = sparkSession.sparkContext val ssc = new StreamingContext(sc, Seconds(10)) val sqlContext = sparkSession.sqlContext //creates an array of strings from raw byte array def kinesisRecordHandler: Record => Array[String] = (record: Record) => new String(record.getData.array()).split(",") //converts records to map of key value pair and then json def recordHandler = (record: Record) => { val gson = new Gson val sRecord = record.asInstanceOf[RecordAdapter].getInternalObject val map = InternalUtils.toSimpleMapValue(sRecord.getDynamodb.getNewImage) gson.toJson(map) } case class CabPrice(cab_type: String, product_id: String, name: String, price: String, distance: String, surge_multiplier: String, time_stamp: String, source: String, destination: String, id: String) val stream_cab = KinesisInputDStream.builder .streamingContext(ssc) .streamName("cab_rides") .regionName("us-east-1") .initialPosition(new Latest()) .checkpointAppName("cab_rides-app") .checkpointInterval(Milliseconds(1000)) .storageLevel(StorageLevel.MEMORY_AND_DISK_2) .buildWithMessageHandler(recordHandler) val stream_weather = KinesisInputDStream.builder .streamingContext(ssc) .streamName("weather") .regionName("us-east-1") .initialPosition(new Latest()) .checkpointAppName("cab_rides-app") .checkpointInterval(Milliseconds(1000)) .storageLevel(StorageLevel.MEMORY_AND_DISK_2) .buildWithMessageHandler(recordHandler) //creating dataframe, can be stored as temp view val cabSchema = Encoders.product[CabPrice].schema stream_cab.foreachRDD(rdd => { import sqlContext.implicits._ //val xx: Dataset[String] = rdd.toDS() val df: DataFrame = sqlContext.read.schema(cabSchema).json(rdd.toDS()) df.show() }) ssc.start() ssc.awaitTermination() }
Example 41
Source File: SparkStreamingTaxiTripToHBase.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.streaming.ingestion.hbase import java.io.File import com.hadooparchitecturebook.taxi360.model.NyTaxiYellowTripBuilder import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._ import kafka.serializer.StringDecoder import org.apache.hadoop.hbase.{HBaseConfiguration, TableName} import org.apache.solr.common.cloud.ZooKeeperException import org.apache.spark.streaming.kafka.KafkaUtils import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object SparkStreamingTaxiTripToHBase { def main(args: Array[String]): Unit = { println("Java Version:" + System.getProperty("java.version")) println("Java Home:" + System.getProperties().getProperty("java.home")) val v:ZooKeeperException = null if (args.length == 0) { println("Args: <KafkaBrokerList> " + "<kafkaTopicList> " + "<numberOfSeconds>" + "<runLocal>" + "<hbaseTable>" + "<numOfSalts>" + "<checkpointDir>" + "<hbaseConfigFolder>") return } val kafkaBrokerList = args(0) val kafkaTopicList = args(1) val numberOfSeconds = args(2).toInt val runLocal = args(3).equals("l") val tableName = args(4) val numOfSalts = args(5).toInt val checkpointFolder = args(6) val hbaseConfigFolder = args(7) println("kafkaBrokerList:" + kafkaBrokerList) println("kafkaTopicList:" + kafkaTopicList) println("numberOfSeconds:" + numberOfSeconds) println("runLocal:" + runLocal) println("tableName:" + tableName) println("numOfSalts:" + numOfSalts) val sc:SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConf = new SparkConf().setAppName("Spark Streaming Ingestion to HBase") new SparkContext(sparkConf) } val ssc = new StreamingContext(sc, Seconds(numberOfSeconds)) val topicsSet = kafkaTopicList.split(",").toSet val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList) val messageStream = KafkaUtils. createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet) val conf = HBaseConfiguration.create() conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL) val hbaseContext = new HBaseContext(sc, conf) val tripDStream = messageStream.map(r => { (r._1, r._2.split(",")) }).filter(r => r._2.size > 3).map(r => { (r._1, NyTaxiYellowTripBuilder.build(r._2)) }) tripDStream.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName), taxi => { TaxiTripHBaseHelper.generatePut(taxi._2, numOfSalts) }) ssc.checkpoint(checkpointFolder) ssc.start() ssc.awaitTermination() } }
Example 42
Source File: SKRSpec.scala From spark-kafka-writer with Apache License 2.0 | 5 votes |
package com.github.benfradet.spark.kafka.writer import java.util.concurrent.atomic.AtomicInteger import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer} import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.scalatest.concurrent.Eventually import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} import scala.collection.mutable.ArrayBuffer import scala.util.Random import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec case class Foo(a: Int, b: String) trait SKRSpec extends AnyWordSpec with Matchers with BeforeAndAfterEach with BeforeAndAfterAll with Eventually { val sparkConf = new SparkConf() .setMaster("local[1]") .setAppName(getClass.getSimpleName) var ktu: KafkaTestUtils = _ override def beforeAll(): Unit = { ktu = new KafkaTestUtils ktu.setup() } override def afterAll(): Unit = { SKRSpec.callbackTriggerCount.set(0) if (ktu != null) { ktu.tearDown() ktu = null } } var topic: String = _ var ssc: StreamingContext = _ var spark: SparkSession = _ override def afterEach(): Unit = { if (ssc != null) { ssc.stop() ssc = null } if (spark != null) { spark.stop() spark = null } } override def beforeEach(): Unit = { ssc = new StreamingContext(sparkConf, Seconds(1)) spark = SparkSession.builder .config(sparkConf) .getOrCreate() topic = s"topic-${Random.nextInt()}" ktu.createTopics(topic) } def collect(ssc: StreamingContext, topic: String): ArrayBuffer[String] = { val kafkaParams = Map( "bootstrap.servers" -> ktu.brokerAddress, "auto.offset.reset" -> "earliest", "key.deserializer" -> classOf[StringDeserializer], "value.deserializer" -> classOf[StringDeserializer], "group.id" -> "test-collect" ) val results = new ArrayBuffer[String] KafkaUtils.createDirectStream[String, String]( ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](Set(topic), kafkaParams) ).map(_.value()) .foreachRDD { rdd => results ++= rdd.collect() () } results } val producerConfig = Map( "bootstrap.servers" -> "127.0.0.1:9092", "key.serializer" -> classOf[StringSerializer].getName, "value.serializer" -> classOf[StringSerializer].getName ) } object SKRSpec { val callbackTriggerCount = new AtomicInteger() }
Example 43
Source File: StreamingExample.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.spark.examples import com.pygmalios.reactiveinflux._ import com.pygmalios.reactiveinflux.spark._ import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.joda.time.DateTime import scala.concurrent.duration._ object StreamingExample extends App { val conf = new SparkConf() .setMaster("local[*]") .setAppName("Example") val ssc = new StreamingContext(conf, Seconds(1)) val point1 = Point( time = DateTime.now(), measurement = "measurement1", tags = Map( "tagKey1" -> "tagValue1", "tagKey2" -> "tagValue2"), fields = Map( "fieldKey1" -> "fieldValue1", "fieldKey2" -> 10.7) ) // Provide settings for reactiveinflux implicit val params = ReactiveInfluxDbName("example") implicit val awaitAtMost = 1.second // Create DStream of Influx points val queue = new scala.collection.mutable.Queue[RDD[Point]] val queueStream: DStream[Point] = ssc.queueStream(queue) // Add single RDD with a single Influx point to the DStream queue.enqueue(ssc.sparkContext.parallelize(Seq(point1))) // Save DStream to Influx queueStream.saveToInflux() // Start Spark streaming ssc.start() ssc.awaitTermination() }
Example 44
Source File: Predict.scala From spark-twitter-sentiment with Apache License 2.0 | 5 votes |
package com.dhruv import org.apache.spark.SparkConf import org.apache.spark.mllib.classification.NaiveBayesModel import org.apache.spark.streaming.twitter._ import org.apache.spark.streaming.{Seconds, StreamingContext} object Predict { def main(args: Array[String]) { if (args.length < 1) { System.err.println("Usage: " + this.getClass.getSimpleName + " <modelDirectory> ") System.exit(1) } val Array(modelFile) = Utils.parseCommandLineWithTwitterCredentials(args) println("Initializing Streaming Spark Context...") val conf = new SparkConf().setAppName(this.getClass.getSimpleName) val ssc = new StreamingContext(conf, Seconds(5)) println("Initializing Twitter stream...") val tweets = TwitterUtils.createStream(ssc, Utils.getAuth) val statuses = tweets.filter(_.getLang == "en").map(_.getText) println("Initializing the Naive Bayes model...") val model = NaiveBayesModel.load(ssc.sparkContext, modelFile.toString) val labeled_statuses = statuses .map(t => (t, model.predict(Utils.featurize(t)))) labeled_statuses.print() // Start the streaming computation println("Initialization complete.") ssc.start() ssc.awaitTermination() } }
Example 45
Source File: MSNBCStreamingExample.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming import org.apache.spark.mllib.fpm.PrefixSpan import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object MSNBCStreamingExample extends App { val conf = new SparkConf() .setAppName("MSNBC data initial streaming example") .setMaster("local[4]") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, batchDuration = Seconds(10)) val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line => line.split(" ").map(_.toInt) } val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache() val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15) val psModel = prefixSpan.run(trainSequences) val freqSequences = psModel.freqSequences.map(_.sequence).collect() val rawSequences: DStream[String] = ssc.socketTextStream("localhost", 9999) val sequences: DStream[Array[Array[Int]]] = rawSequences .map(line => line.split(" ").map(_.toInt)) .map(_.map(Array(_))) print(">>> Analysing new batch of data") sequences.foreachRDD( rdd => rdd.foreach( array => { println(">>> Sequence: ") println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")) freqSequences.count(_.deep == array.deep) match { case count if count > 0 => println("is frequent!") case _ => println("is not frequent.") } } ) ) print(">>> done") ssc.start() ssc.awaitTermination() }
Example 46
Source File: MSNBCStreamingAdvanced.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming import org.apache.spark.mllib.fpm.PrefixSpan import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object MSNBCStreamingAdvanced extends App { val conf = new SparkConf() .setAppName("MSNBC data initial streaming example") .setMaster("local[4]") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, batchDuration = Seconds(10)) val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line => line.split(" ").map(_.toInt) } val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache() val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15) val psModel = prefixSpan.run(trainSequences) val freqSequences = psModel.freqSequences.map(_.sequence).collect() val rawEvents: DStream[String] = ssc.socketTextStream("localhost", 9999) val events: DStream[(Int, String)] = rawEvents.map(line => line.split(": ")) .map(kv => (kv(0).toInt, kv(1))) val countIds = events.map(e => (e._1, 1)) val counts: DStream[(Int, Int)] = countIds.reduceByKey(_ + _) def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = { Some(runningCount.getOrElse(0) + newValues.sum) } val runningCounts = countIds.updateStateByKey[Int](updateFunction _) val duration = Seconds(20) val slide = Seconds(10) val rawSequences: DStream[(Int, String)] = events .reduceByKeyAndWindow((v1: String, v2: String) => v1 + " " + v2, duration, slide) val sequences: DStream[Array[Array[Int]]] = rawSequences.map(_._2) .map(line => line.split(" ").map(_.toInt)) .map(_.map(Array(_))) print(">>> Analysing new batch of data") sequences.foreachRDD( rdd => rdd.foreach( array => { println(">>> Sequence: ") println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")) freqSequences.count(_.deep == array.deep) match { case count if count > 0 => println("is frequent!") case _ => println("is not frequent.") } } ) ) print(">>> done") ssc.start() ssc.awaitTermination() }
Example 47
Source File: StreamingDemo.scala From spark-streaming-demo with Apache License 2.0 | 5 votes |
package com.datastax.examples.meetup import com.datastax.spark.connector.cql.CassandraConnector import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} def createSchema(): Unit = { CassandraConnector(conf).withSessionDo { session => session.execute(s"DROP KEYSPACE IF EXISTS $CassandraKeyspace") session.execute(s"CREATE KEYSPACE IF NOT EXISTS $CassandraKeyspace WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }") session.execute(s""" CREATE TABLE IF NOT EXISTS $CassandraKeyspace.$CassandraTable ( event text, interval text, dimension text, subtotal counter, PRIMARY KEY((event, interval), dimension) ) WITH CLUSTERING ORDER BY (dimension ASC) """) } } }
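This listing is cut off: only the imports and the createSchema() method survive, while the enclosing object and the conf, CassandraKeyspace, and CassandraTable members it references are missing. A minimal sketch of how those references could be supplied, with every value a placeholder (the original ones are not shown):

// Hypothetical enclosing object; keyspace, table and Cassandra host are placeholders.
import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.spark.SparkConf

object StreamingDemo {
  val CassandraKeyspace = "demo_keyspace"   // placeholder, original value not shown
  val CassandraTable = "event_counters"     // placeholder, original value not shown

  val conf = new SparkConf()
    .setAppName("StreamingDemo")
    .set("spark.cassandra.connection.host", "127.0.0.1")  // standard spark-cassandra-connector setting

  // createSchema() from the listing above belongs here; it uses `conf` to build the
  // CassandraConnector and the two constants to name the keyspace and table it creates.
}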
Example 48
Source File: PersistStreamByInterval.scala From spark-streaming-demo with Apache License 2.0 | 5 votes |
package com.datastax.examples.meetup import com.datastax.examples.meetup.model.MeetupRsvp import com.datastax.examples.meetup.model.EventInterval import com.datastax.examples.meetup.websocket._ import com.datastax.spark.connector._ import com.datastax.spark.connector.streaming._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, Minutes, StreamingContext} import org.apache.spark.streaming.StreamingContext._ class PersistStreamByInterval extends Serializable { val tableColumns = SomeColumns("event", "interval", "dimension", "subtotal") def start(ssc: StreamingContext, websocket: String, keyspace: String, table: String) { val stream = ssc.receiverStream[MeetupRsvp](new WebSocketReceiver(websocket, StorageLevel.MEMORY_ONLY_SER)) //stream.checkpoint(Seconds(60)) //stream.repartition(2) // Filter Accepted RSVP val rsvpAccepted = stream.filter(_.response == "yes") // Number of attendees by Country val rsvpByCountry = rsvpAccepted .map( rsvp => (rsvp.group.group_country, rsvp.guests + 1) ) .reduceByKey(_ + _) .map{ case (country, attendees) => ("attending", EventInterval.All, country, attendees) } rsvpByCountry.saveToCassandra(keyspace, table, tableColumns) // Trending Topics val trendingTopics = rsvpAccepted .flatMap( rsvp => rsvp.group.group_topics ) .map( topic => (topic.topic_name, 1) ) .reduceByKeyAndWindow((a:Int,b:Int) => a+b, Minutes(5), Seconds(10)) .filter( t => t._2 > 5 ) // min threshold = 5 .transform( (rdd, time) => rdd.map { case (topic, count) => ("trending", EventInterval.Seconds(time), topic, count)} ) trendingTopics.saveToCassandra(keyspace, table, tableColumns) ssc.start() ssc.awaitTermination() } }
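A short driver sketch for the class above, assuming a local master and placeholder websocket, keyspace, and table values; note that start() blocks because it calls ssc.start() and ssc.awaitTermination() itself:

// Hypothetical driver; master, websocket URL, keyspace and table are placeholders.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object PersistStreamDriver {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("MeetupStreamingDemo")
      .set("spark.cassandra.connection.host", "127.0.0.1")
    val ssc = new StreamingContext(conf, Seconds(5))

    new PersistStreamByInterval().start(
      ssc,
      "ws://stream.meetup.com/2/rsvps",   // placeholder websocket endpoint
      "demo_keyspace",                    // placeholder keyspace
      "event_counters")                   // placeholder table
  }
}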
Example 49
Source File: TestAdditionInWindow.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.streaming import org.apache.spark.streaming.{ StreamingContext, Seconds } import org.apache.spark.SparkConf object TestAdditionInWindow { def main(args: Array[String]): Unit = { val ssc = new StreamingContext(new SparkConf().setAppName("TestAdditionJob"), Seconds(1)) val msg = ssc.socketTextStream("localhost", 9999) msg .map(data => ("sum", data.toInt)) .reduceByKey(_ + _) .window(Seconds(3), Seconds(2)) .print() ssc.start() ssc.awaitTermination() } }
Example 50
Source File: TestUpdateStateByKey.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.streaming import org.apache.spark.streaming.{StreamingContext, Duration} import org.apache.spark.SparkConf object TestUpdateStateByKey { val checkpointDir: String = "hdfs://localhost:9000/user/hduser/spark-chkpt" def main(args: Array[String]): Unit = { val ssc = StreamingContext.getOrCreate(checkpointDir, createFunc _) ssc.start() ssc.awaitTermination() } def updateFunc(values: Seq[Int], state: Option[Int]): Option[Int] = { Some(values.size + state.getOrElse(0)) } def createFunc(): StreamingContext = { val ssc = new StreamingContext(new SparkConf().setAppName("TestUpdateStateByKeyJob"), Duration(2000)) ssc.checkpoint(checkpointDir) ssc.socketTextStream("localhost", 9999) .flatMap(_.split(" ")) .map((_, 1)) .updateStateByKey(updateFunc _) .checkpoint(Duration(10000)) .print() ssc } }
Example 51
Source File: TestStreamingListener.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.streaming import org.apache.spark.streaming.{ StreamingContext, Seconds } import org.apache.spark.streaming.scheduler.{ StreamingListener, StreamingListenerBatchStarted, StreamingListenerBatchCompleted } import org.apache.spark.SparkConf object TestStreamingListener { def main(args: Array[String]): Unit = { val ssc = new StreamingContext(new SparkConf().setAppName("TestStreamingListenerJob"), Seconds(5)) ssc.addStreamingListener(new MyStreamingListener()) ssc .socketTextStream("localhost", 9999) .flatMap(_.split(" ")) .count() .print() ssc.start() ssc.awaitTermination() } } class MyStreamingListener extends StreamingListener { override def onBatchStarted(batchStarted: StreamingListenerBatchStarted): Unit = { println(">>> Batch started...records in batch = " + batchStarted.batchInfo.numRecords) } override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = { println(">>> Batch completed...time taken (ms) = " + batchCompleted.batchInfo.totalDelay) } }
Example 52
Source File: TestMapWithState.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples.streaming import org.apache.spark.streaming.StreamingContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Seconds, State, StateSpec } def mappingFunc(key: String, value: Option[Int], state: State[Int]): Option[(String, Int)] = { val sum = value.getOrElse(0) + state.getOption().getOrElse(0) // updating the state of non-idle keys... // To call State.update(...) we need to check State.isTimingOut() == false, // else there will be NoSuchElementException("Cannot update the state that is timing out") if (state.isTimingOut()) println(key + " key is timing out...will be removed.") else state.update(sum) Some((key, sum)) } }
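Only the state-mapping function of this listing survives; the object declaration, the StreamingContext setup, and the mapWithState call are missing. A minimal sketch of the usual wiring, assuming a socket source on localhost:9999, a local checkpoint path, and a 30-second state timeout (all assumptions):

// Sketch of how mappingFunc is typically wired; host, port, checkpoint dir and timeout are assumptions.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, State, StateSpec, StreamingContext}

object TestMapWithState {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("TestMapWithStateJob"), Seconds(1))
    ssc.checkpoint("/tmp/spark-mapwithstate-chkpt")  // mapWithState requires a checkpoint directory

    // timeout() is what makes state.isTimingOut() in mappingFunc return true for idle keys
    val stateSpec = StateSpec.function(mappingFunc _).timeout(Seconds(30))

    ssc.socketTextStream("localhost", 9999)
      .flatMap(_.split(" "))
      .map((_, 1))
      .mapWithState(stateSpec)  // emits the Option[(String, Int)] values returned by mappingFunc
      .print()

    ssc.start()
    ssc.awaitTermination()
  }

  // def mappingFunc(key: String, value: Option[Int], state: State[Int]): Option[(String, Int)]
  // as defined in the listing above goes here.
}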
Example 53
Source File: RedisInputDStream.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.redislabs.provider.redis.streaming import com.redislabs.provider.redis.RedisConfig import org.apache.curator.utils.ThreadUtils import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.dstream.ReceiverInputDStream import redis.clients.jedis._ import scala.reflect.{ClassTag, classTag} import scala.util.control.NonFatal keys.foreach{ key => executorPool.submit(new MessageHandler(redisConfig.connectionForKey(key), key)) } } finally { executorPool.shutdown() } } def onStop() { } private class MessageHandler(conn: Jedis, key: String) extends Runnable { def run() { try { while(!isStopped) { val response = conn.blpop(2, key) if (response == null || response.isEmpty) { // no-op } else if (classTag[T] == classTag[String]) { store(response.get(1).asInstanceOf[T]) } else if (classTag[T] == classTag[(String, String)]) { store((response.get(0), response.get(1)).asInstanceOf[T]) } else { throw new scala.Exception("Unknown Redis Streaming type") } } } catch { case NonFatal(e) => restart("Error receiving data", e) } finally { onStop() } } } }
Example 54
Source File: redisStreamingFunctions.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.redislabs.provider.redis.streaming import com.redislabs.provider.redis.{ReadWriteConfig, RedisConfig} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.InputDStream def createRedisStreamWithoutListname(keys: Array[String], storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2) (implicit redisConf: RedisConfig = RedisConfig.fromSparkConf(ssc.sparkContext.getConf)): RedisInputDStream[String] = { new RedisInputDStream(ssc, keys, storageLevel, redisConf, classOf[String]) } def createRedisXStream(consumersConfig: Seq[ConsumerConfig], storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2) (implicit redisConfig: RedisConfig = RedisConfig.fromSparkConf(ssc.sparkContext.getConf)): InputDStream[StreamItem] = { val readWriteConfig = ReadWriteConfig.fromSparkConf(ssc.sparkContext.getConf) val receiver = new RedisStreamReceiver(consumersConfig, redisConfig, readWriteConfig, storageLevel) ssc.receiverStream(receiver) } } trait RedisStreamingFunctions { implicit def toRedisStreamingContext(ssc: StreamingContext): RedisStreamingContext = new RedisStreamingContext(ssc) }
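The enclosing class of these methods (the RedisStreamingContext wrapper produced by the implicit conversion at the bottom of the listing) is cut off at the top. A short usage sketch, assuming the spark-redis streaming package object exposes the RedisStreamingFunctions implicit shown above, and with the Redis connection settings and list keys as placeholders:

// Usage sketch; Redis host/port and the list keys "foo" / "bar" are placeholders.
import com.redislabs.provider.redis.streaming._
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object RedisListStreamUsage {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("RedisListStreamUsage")
      .set("spark.redis.host", "127.0.0.1")
      .set("spark.redis.port", "6379")
    val ssc = new StreamingContext(conf, Seconds(1))

    // ssc is implicitly wrapped, so createRedisStreamWithoutListname from the listing resolves;
    // it pops values from the given Redis lists and emits them as a DStream[String].
    val values = ssc.createRedisStreamWithoutListname(Array("foo", "bar"),
      storageLevel = StorageLevel.MEMORY_AND_DISK_2)
    values.print()

    ssc.start()
    ssc.awaitTermination()
  }
}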
Example 55
Source File: SparkStreamingRedisSuite.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.redislabs.provider.redis import com.redislabs.provider.redis.env.Env import com.redislabs.provider.redis.util.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} import org.scalatest.{BeforeAndAfterEach, FunSuite} trait SparkStreamingRedisSuite extends FunSuite with Env with BeforeAndAfterEach with Logging { override protected def beforeEach(): Unit = { super.beforeEach() spark = SparkSession.builder().config(conf).getOrCreate() sc = spark.sparkContext ssc = new StreamingContext(sc, Seconds(1)) } override protected def afterEach(): Unit = { ssc.stop() spark.stop System.clearProperty("spark.driver.port") super.afterEach() } }
Example 56
Source File: Env.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.redislabs.provider.redis.env import com.redislabs.provider.redis.RedisConfig import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.StreamingContext import org.apache.spark.{SparkConf, SparkContext} trait Env { val conf: SparkConf var spark: SparkSession = _ var sc: SparkContext = _ var ssc: StreamingContext = _ val redisHost = "127.0.0.1" val redisPort = 6379 val redisAuth = "passwd" val redisConfig: RedisConfig }
Example 57
Source File: CustomReceiver.scala From Learning-Spark-SQL with MIT License | 5 votes |
import java.io.{BufferedReader, InputStreamReader} import java.net.Socket import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.receiver.Receiver private def receive() { var socket: Socket = null var userInput: String = null try { println("Connecting to " + host + ":" + port) socket = new Socket(host, port) println("Connected to " + host + ":" + port) val reader = new BufferedReader( new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)) userInput = reader.readLine() while(!isStopped && userInput != null) { store(userInput) userInput = reader.readLine() } reader.close() socket.close() println("Stopped receiving") restart("Trying to connect again") } catch { case e: java.net.ConnectException => restart("Error connecting to " + host + ":" + port, e) case t: Throwable => restart("Error receiving data", t) } } }
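Only the receive() method of this custom receiver survives; the class declaration, onStart(), and onStop() are missing. A sketch of the missing shell, following the standard Spark custom-receiver pattern (an assumed reconstruction, not the original listing):

// Assumed receiver shell; host and port come from the constructor, receive() is the method above.
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

class CustomReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {

  def onStart() {
    // Run receive() on its own thread so onStart() returns immediately.
    new Thread("Socket Receiver") {
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // Nothing to do: receive() exits on its own once isStopped becomes true.
  }

  // private def receive() from the listing above goes here.
}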
Example 58
Source File: TFLCustomReceiver.scala From Learning-Spark-SQL with MIT License | 5 votes |
import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import org.apache.http.HttpResponse; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} object TFLCustomReceiver { private val url = "https://api.tfl.gov.uk/Line/circle/Arrivals?stopPointId=940GZZLUERC&app_id=a73727f3&app_key=dc8150560a2422afae2b70cf291c4327" def main(args: Array[String]) { // Create the context with a 1 second batch size val sparkConf = new SparkConf().setAppName("TFLCustomReceiver") val ssc = new StreamingContext(sparkConf, Seconds(300)) val lines = ssc.receiverStream(new TFLCustomReceiver(url)) lines.print() ssc.start() ssc.awaitTermination() } } class TFLCustomReceiver(url: String) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) { def onStart() { // Start the thread that receives data over a connection new Thread("Http Receiver") { override def run() { receive() } }.start() } def onStop() { // There is nothing much to do as the thread calling receive() // is designed to stop by itself if isStopped() returns false } private def receive() { var userInput: String = null var httpClient: DefaultHttpClient = null var getRequest: HttpGet = null try { // Connect to host:port httpClient = new DefaultHttpClient(); getRequest = new HttpGet(url); getRequest.addHeader("accept", "application/json"); while(!isStopped) { val response = httpClient.execute(getRequest); if (response.getStatusLine().getStatusCode() != 200) { throw new RuntimeException("Failed : HTTP error code : "+ response.getStatusLine().getStatusCode()); } val reader = new BufferedReader(new InputStreamReader((response.getEntity().getContent()))); userInput = reader.readLine() while(userInput != null) { store(userInput) //println(userInput) userInput = reader.readLine() } reader.close() Thread.sleep(60*1000) } httpClient.close() // Restart in an attempt to connect again when server is active again //restart("Trying to connect again") } catch { case e: java.net.ConnectException => // restart if could not connect to server restart("Error connecting to " + url, e) case t: Throwable => // restart if there is any other error restart("Error receiving data", t) } } }
Example 59
Source File: TFLStreamingApp.scala From Learning-Spark-SQL with MIT License | 5 votes |
import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} object TFLStreamingApp { def main(args: Array[String]) { val conf = new SparkConf().setAppName("TFLStreaming") val ssc = new StreamingContext(conf, Seconds(300)) val stream = ssc.receiverStream(new TFLArrivalPredictionsByLine()) println("Before") stream.print() println("After") if (args.length > 2) { stream.saveAsTextFiles(args(2)) } ssc.start() ssc.awaitTermination() } }
Example 60
Source File: gihyo_6_2_1_Sample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_2_1_Sample { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) val wordCounts = run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val words = stream.flatMap(_.split(" ")) val pairs = words.map(word => (word, 1)) val wordCounts = pairs.reduceByKey(_ + _) wordCounts.print } }
Example 61
Source File: gihyo_6_3_Join.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Join { def main(args: Array[String]) { if (args.length != 4) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost1 = args(0) val targetHostPort1 = args(1).toInt val targetHost2 = args(2) val targetHostPort2 = args(3).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines1 = ssc.socketTextStream(targetHost1, targetHostPort1) val lines2 = ssc.socketTextStream(targetHost2, targetHostPort2) run(lines1, lines2) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], joinStream: InputDStream[String]) { val lines1KV = stream.map(x => (x, "attribute1")) val lines2KV = joinStream.map(x => (x, Array("attribute2", "attribute3", "attribute4"))) val linesKVW = lines1KV.join(lines2KV) linesKVW.print } }
Example 62
Source File: gihyo_6_3_Reduce.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Reduce { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val windowLineCount = stream.reduce((x, y) => x + "," + y) windowLineCount.print } }
Example 63
Source File: gihyo_6_3_reduceByWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_reduceByWindow { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val userList = stream.reduceByWindow((x, y) => x + y, Seconds(windowLength), Seconds(slideInterval)) userList.print } }
Example 64
Source File: gihyo_6_3_KafkaStream.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 // scalastyle:off println import kafka.serializer.StringDecoder import org.apache.spark.{SparkContext, SparkConf} import org.apache.spark.streaming.kafka.KafkaUtils import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_KafkaStream { def main(args: Array[String]) { if (args.length != 4) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val brokerList = args(0) val consumeTopic = args(1) val checkpointDir = args(2) val saveDir = args(3) val f = createStreamingContext(brokerList, consumeTopic, checkpointDir, saveDir) // Obtain the StreamingContext val ssc = StreamingContext.getOrCreate(checkpointDir, f) sys.ShutdownHookThread { System.out.println("Gracefully stopping SparkStreaming Application") ssc.stop(true, true) System.out.println("SparkStreaming Application stopped") } ssc.start ssc.awaitTermination } def createStreamingContext(brokerList: String, consumeTopic: String, checkpointDir: String, saveDir: String): () => StreamingContext = { () => { System.out.println(values) Some(running.getOrElse(0) + values.length) } def run(stream: InputDStream[(String, String)], saveDir: String, windowLength: Int = 30, slideInterval: Int = 5) { val baseStream = stream.transform(rdd => { val t = (Long.MaxValue - System.currentTimeMillis) rdd.map(x => (x._1, x._2 + ", " + t)) }).map(x => { val splitVal = x._2.split(",") val userVal = splitVal(0).split(":") val actionVal = splitVal(1).split(":") val pageVal = splitVal(2).split(":") val timestamp = splitVal(3) (actionVal(1), userVal(1), pageVal(1), timestamp) }) baseStream.persist() val accountStream = baseStream.filter(_._1 == "view") .map(x => x._2) .countByValue() val totalUniqueUser = accountStream .updateStateByKey[Int](updateStateByKeyFunction _) .count() .map(x => "totalUniqueUser:" + x) val baseStreamPerTirty = baseStream .window(Seconds(windowLength), Seconds(slideInterval)) .filter(_._1 == "view") baseStreamPerTirty.persist() val pageViewPerTirty = baseStreamPerTirty .count() .map(x => "PageView:" + x) val uniqueUserPerTirty = baseStreamPerTirty .map(x => x._2) .countByValue() .count() .map(x => "UniqueUser:" + x) val pageViewStream = baseStream .filter(_._1 == "view") .map(x => x._3) .count() .map(x => "PageView:" + x) val outputStream = totalUniqueUser .union(pageViewPerTirty) .union(uniqueUserPerTirty) .union(pageViewStream) .reduce((x, y) => x + ", " + y) .saveAsTextFiles(saveDir) } } // scalastyle:on println
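The createStreamingContext factory in this listing is garbled: its body has been fused with what looks like the body of an updateStateByKey helper. Below is a hedged reconstruction, modelled on the factory pattern used by the neighbouring gihyo listings (TwitterStream, countByValueAndWindow) and on the run() signature above; the Kafka parameters and the helper's exact signature are assumptions, and the imports are those already shown in the listing.

// Assumed reconstruction; the original factory body is not shown, so details may differ.
def createStreamingContext(brokerList: String,
    consumeTopic: String,
    checkpointDir: String,
    saveDir: String): () => StreamingContext = {
  () => {
    val conf = new SparkConf().setAppName("gihyoSample_Application")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(checkpointDir)
    // Direct stream matching run()'s InputDStream[(String, String)] parameter
    val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, Map("metadata.broker.list" -> brokerList), Set(consumeTopic))
    run(kafkaStream, saveDir)
    ssc
  }
}

// The stray statements in the listing (System.out.println(values); Some(running.getOrElse(0) + values.length))
// read like the body of the updateStateByKeyFunction that run() applies to its countByValue() stream:
def updateStateByKeyFunction(values: Seq[Long], running: Option[Int]): Option[Int] = {
  System.out.println(values)
  Some(running.getOrElse(0) + values.length)
}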
Example 65
Source File: gihyo_6_3_TwitterStream.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 // scalastyle:off println import org.atilika.kuromoji.Token import twitter4j.Status import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.dstream.InputDStream import org.apache.spark.streaming.twitter.TwitterUtils import org.apache.spark.streaming.{Seconds, StreamingContext} object gihyo_6_3_TwitterStream { def main(args: Array[String]) { if (args.length != 7) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val Array(cKey, cSecret, aToken, aSecret, cDir, tagDir, wordDir) = args System.setProperty("twitter4j.oauth.consumerKey", cKey) System.setProperty("twitter4j.oauth.consumerSecret", cSecret) System.setProperty("twitter4j.oauth.accessToken", aToken) System.setProperty("twitter4j.oauth.accessTokenSecret", aSecret) val f = createStreamingContext(cDir, tagDir, wordDir) val ssc = StreamingContext.getOrCreate(cDir, f) sys.ShutdownHookThread { System.out.println("Gracefully stopping SparkStreaming Application") ssc.stop(true, true) System.out.println("SparkStreaming Application stopped") } ssc.start ssc.awaitTermination } def createStreamingContext(checkpointDir: String, tagDir: String, wordDir: String): () => StreamingContext = { () => { val conf = new SparkConf().setAppName("gihyoSample_Application") .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.registerKryoClasses(Array(classOf[UserDic])) val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) ssc.checkpoint(checkpointDir) val twitterStream = TwitterUtils.createStream(ssc, None) run(sc, twitterStream, tagDir, wordDir) ssc } } def run(sc: SparkContext, stream: InputDStream[Status], tagDir: String, wordDir: String) { val tokenizer = sc.broadcast(UserDic.getInstance) val tweets = stream.map(tweet => tweet.getText()) tweets.persist() val TweetText = tweets .flatMap(text => { val tokens = tokenizer.value.tokenize(text).toArray tokens.filter(t => { val token = t.asInstanceOf[Token] ((token.getPartOfSpeech.indexOf("名詞") > -1 && token.getPartOfSpeech.indexOf("一般") > -1) || token.getPartOfSpeech.indexOf("カスタム名詞") > -1) && token.getSurfaceForm.length > 1 && !(token.getSurfaceForm matches "^[a-zA-Z]+$|^[0-9]+$") }).map(t => t.asInstanceOf[Token].getSurfaceForm) }) .countByValue() .map(x => (x._2, x._1)) .transform(_.sortByKey(false)) .map(x => (x._2, x._1)) val TweetTags = tweets .flatMap(tweet => tweet.split(" ").filter(_.startsWith("#"))) .countByValue() .map(x => (x._2, x._1)) .transform(_.sortByKey(false)) .map(x => (x._2, x._1)) TweetText.saveAsTextFiles(wordDir) TweetTags.saveAsTextFiles(tagDir) } } // scalastyle:on println
Example 66
Source File: gihyo_6_3_Union.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.dstream.InputDStream import org.apache.spark.streaming.kafka.KafkaUtils object gihyo_6_3_Union { def main(args: Array[String]) { if (args.length != 3) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHosts = args(0) val consumerGroup = args(1) val targetTopics = args(2) val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val KafkaStreams = (1 to 5).map { i => KafkaUtils.createStream(ssc, targetHosts, consumerGroup, Map(targetTopics -> 1)) } run(ssc, KafkaStreams) ssc.start ssc.awaitTermination } def run(ssc: StreamingContext, streams: IndexedSeq[InputDStream[(String, String)]]) { val unionedStream = ssc.union(streams) unionedStream.print } }
Example 67
Source File: gihyo_6_3_flatMap.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_flatMap { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val words = stream.flatMap(line => line.split(" ")) words.print } }
Example 68
Source File: gihyo_6_3_Repartition.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Repartition { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val repartitionData = stream.repartition(3) // scalastyle:off println repartitionData.foreachRDD(rdd => println(s"partition size: ${rdd.partitions.size.toString}")) // scalastyle:on println repartitionData.print } }
Example 69
Source File: gihyo_6_3_Count.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Count { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val lineCount = stream.window(Seconds(windowLength), Seconds(slideInterval)).count lineCount.print } }
Example 70
Source File: gihyo_6_3_Map.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Map { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val lineCount = stream.map(line => (line, 1)) lineCount.print } }
Example 71
Source File: gihyo_6_3_Cogroup.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Cogroup { def main(args: Array[String]) { if (args.length != 4) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost1 = args(0) val targetHostPort1 = args(1).toInt val targetHost2 = args(2) val targetHostPort2 = args(3).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines1 = ssc.socketTextStream(targetHost1, targetHostPort1) val lines2 = ssc.socketTextStream(targetHost2, targetHostPort2) run(lines1, lines2) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], otherStream: InputDStream[String]) { val lines1KV = stream.map(x => (x, "attribute1")) val lines2KV = otherStream.map(x => (x, "attribute2")) val linesKVW = lines1KV.cogroup(lines2KV) linesKVW.print } }
Example 72
Source File: gihyo_6_3_reduceByKey.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_reduceByKey { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val countKeyValue = stream.map(x => (x, 1)).reduceByKey((x, y) => x + y) countKeyValue.print } }
Example 73
Source File: gihyo_6_3_reduceByKeyAndWindow_efficient.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_reduceByKeyAndWindow_efficient { def main(args: Array[String]) { if (args.length != 3) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val checkpointDir = args(2) val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) ssc.checkpoint(checkpointDir) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val userList = stream.map(x => (x, 1)) .reduceByKeyAndWindow( (a: Int, b: Int) => a + b, (a: Int, b: Int) => a - b, Seconds(windowLength), Seconds(slideInterval)) userList.print } }
Example 74
Source File: gihyo_6_3_Transform.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Transform { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment"))) run(lines, blackList) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], blackList: RDD[(String, String)]) { val userList = stream.map(x => (x, "action:Login")).transform(rdd => { val tmpUserList = rdd.leftOuterJoin(blackList) tmpUserList.filter(user => (user._2._2 == None)) }) userList.print } }
Example 75
Source File: gihyo_6_3_reduceByKeyAndWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_reduceByKeyAndWindow { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val userList = stream.map(x => (x, 1)) .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(windowLength), Seconds(slideInterval)) userList.print } }
Example 76
Source File: gihyo_6_3_countByValueAndWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 // scalastyle:off println import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_countByValueAndWindow { def main(args: Array[String]) { if (args.length != 3) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val checkpointDir = args(2) val f = createStreamingContext(targetHost, targetHostPort, checkpointDir) val ssc = StreamingContext.getOrCreate(checkpointDir, f) sys.ShutdownHookThread { System.out.println("Gracefully stopping SparkStreaming Application") ssc.stop(true, true) System.out.println("SparkStreaming Application stopped") } ssc.start ssc.awaitTermination } def createStreamingContext( targetHost: String, targetHostPort: Int, checkpointDir: String): () => StreamingContext = { () => { val conf = new SparkConf().setAppName("gihyoSample_Application") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) ssc.checkpoint(checkpointDir) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc } } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val userList = stream.countByValueAndWindow(Seconds(windowLength), Seconds(slideInterval)) userList.print } } // scalastyle:on println
Example 77
Source File: gihyo_6_3_updateStateByKey.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_updateStateByKey { def main(args: Array[String]) { if (args.length != 3) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val checkpointDir = args(2) val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) ssc.checkpoint(checkpointDir) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val userList = stream.map(x => (x, 1)).updateStateByKey[Int](updateStateByKeyFunction _) userList.print } def updateStateByKeyFunction(values: Seq[Int], running: Option[Int]): Option[Int] = { Some(running.getOrElse(0) + values.size) } }
Example 78
Source File: gihyo_6_3_Filter.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.SparkConf import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_Filter { def main(args: Array[String]) { if (args.length != 2) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val conf = new SparkConf().setAppName("NetworkWordCount") val ssc = new StreamingContext(conf, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String]) { val overData = stream.filter(line => line.length > 5) overData.print } }
Example 79
Source File: gihyo_6_3_countByWindow.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06 import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.streaming.{StreamingContext, Seconds} import org.apache.spark.streaming.dstream.InputDStream object gihyo_6_3_countByWindow { def main(args: Array[String]) { if (args.length != 3) { new IllegalArgumentException("Invalid arguments") System.exit(1) } val targetHost = args(0) val targetHostPort = args(1).toInt val checkpointDir = args(2) val conf = new SparkConf().setAppName("NetworkWordCount") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Seconds(5)) val lines = ssc.socketTextStream(targetHost, targetHostPort) ssc.checkpoint(checkpointDir) run(lines) ssc.start ssc.awaitTermination } def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) { val userList = stream.countByWindow(Seconds(windowLength), Seconds(slideInterval)) userList.print } }
Example 80
Source File: gihyo_6_3_Window.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Window {

  def main(args: Array[String]) {
    if (args.length != 2) {
      throw new IllegalArgumentException("Invalid arguments")
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)
    ssc.start()
    ssc.awaitTermination()
  }

  def run(stream: InputDStream[String], windowLength: Int = 10, slideInterval: Int = 5) {
    val userList = stream.window(Seconds(windowLength), Seconds(slideInterval)).countByValue()
    userList.print()
  }
}
Example 81
Source File: gihyo_6_3_countByValue.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_countByValue {

  def main(args: Array[String]) {
    if (args.length != 2) {
      throw new IllegalArgumentException("Invalid arguments")
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    run(lines)
    ssc.start()
    ssc.awaitTermination()
  }

  def run(stream: InputDStream[String]) {
    val countValue = stream.countByValue()
    countValue.print()
  }
}
Example 82
Source File: TestStreamingContext.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark

import org.scalatest.{BeforeAndAfterEach, Suite}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import jp.gihyo.spark.ch06.UserDic

private[spark] trait TestStreamingContext extends BeforeAndAfterEach { self: Suite =>
  @transient var ssc: StreamingContext = _
  @transient var sc: SparkContext = _
  val master = "local[2]"
  val appN = "StreamingUnitTest"
  val bd = Seconds(1)

  override def beforeEach() {
    super.beforeEach()
    val conf = new SparkConf().setMaster(master)
      .setAppName(appN)
      .set("spark.streaming.clock", "org.apache.spark.util.ManualClock")
      .registerKryoClasses(Array(classOf[UserDic]))
    ssc = new StreamingContext(conf, bd)
    sc = ssc.sparkContext
  }

  override def afterEach() {
    try {
      if (ssc != null) {
        // stop with sc
        ssc.stop(true)
      }
      ssc = null
    } finally {
      super.afterEach()
    }
  }
}
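A minimal sketch of how a suite might mix in this trait, assuming ScalaTest's FunSuite is on the test classpath; the class name StreamingSetupSpec is hypothetical and not part of the project.

package jp.gihyo.spark

import org.scalatest.FunSuite

class StreamingSetupSpec extends FunSuite with TestStreamingContext {
  test("each test sees a fresh StreamingContext with the expected settings") {
    // beforeEach in TestStreamingContext has already created ssc and sc.
    assert(ssc != null)
    assert(sc.master === master)
    assert(ssc.sparkContext eq sc)
  }
}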
Example 83
Source File: AvroRandomExtractor.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package com.memsql.spark.examples.avro import com.memsql.spark.etl.api._ import com.memsql.spark.etl.utils.PhaseLogger import org.apache.spark.streaming.StreamingContext import org.apache.spark.sql.{SQLContext, DataFrame, Row} import org.apache.spark.sql.types._ import org.apache.avro.Schema import org.apache.avro.generic.GenericData import org.apache.avro.io.{DatumWriter, EncoderFactory} import org.apache.avro.specific.SpecificDatumWriter import java.io.ByteArrayOutputStream // Generates an RDD of byte arrays, where each is a serialized Avro record. class AvroRandomExtractor extends Extractor { var count: Int = 1 var generator: AvroRandomGenerator = null var writer: DatumWriter[GenericData.Record] = null var avroSchema: Schema = null def schema: StructType = StructType(StructField("bytes", BinaryType, false) :: Nil) val parser: Schema.Parser = new Schema.Parser() override def initialize(ssc: StreamingContext, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Unit = { val userConfig = config.asInstanceOf[UserExtractConfig] val avroSchemaJson = userConfig.getConfigJsValue("avroSchema") match { case Some(s) => s case None => throw new IllegalArgumentException("avroSchema must be set in the config") } count = userConfig.getConfigInt("count").getOrElse(1) avroSchema = parser.parse(avroSchemaJson.toString) writer = new SpecificDatumWriter(avroSchema) generator = new AvroRandomGenerator(avroSchema) } override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Option[DataFrame] = { val rdd = sqlContext.sparkContext.parallelize((1 to count).map(_ => Row({ val out = new ByteArrayOutputStream val encoder = EncoderFactory.get().binaryEncoder(out, null) val avroRecord: GenericData.Record = generator.next().asInstanceOf[GenericData.Record] writer.write(avroRecord, encoder) encoder.flush out.close out.toByteArray }))) Some(sqlContext.createDataFrame(rdd, schema)) } }
Example 84
Source File: AvroTransformerSpec.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package test import com.memsql.spark.connector.MemSQLContext import com.memsql.spark.etl.api.{UserTransformConfig, UserExtractConfig} import com.memsql.spark.examples.avro.{AvroTransformer, AvroRandomExtractor} import org.apache.spark.streaming.{StreamingContext, Seconds} import test.util.{Fixtures, UnitSpec, LocalSparkContext} import spray.json._ class AvroTransformerSpec extends UnitSpec with LocalSparkContext { var ssc: StreamingContext = _ var msc: MemSQLContext = _ override def beforeEach(): Unit = { super.beforeEach() ssc = new StreamingContext(sc, Seconds(1)) msc = new MemSQLContext(sc) } val avroConfig = Fixtures.avroConfig.parseJson val extractConfig = UserExtractConfig(class_name = "Test", value = avroConfig) val transformConfig = UserTransformConfig(class_name = "Test", value = avroConfig) "AvroRandomTransformer" should "emit a dataframe of properly deserialized data" in { val extractor = new AvroRandomExtractor val transformer = new AvroTransformer extractor.initialize(null, null, extractConfig, 0, null) transformer.initialize(null, transformConfig, null) val maybeDf = extractor.next(null, 0, msc, null, 0, null) assert(maybeDf.isDefined) val extractedDf = maybeDf.get val transformedDf = transformer.transform(msc, extractedDf, null, null) val rows = transformedDf.collect() for (row <- rows) { assert(row(0).isInstanceOf[Boolean]) assert(row(1).isInstanceOf[Double]) assert(row(2).isInstanceOf[Float]) assert(row(3).isInstanceOf[Int]) assert(row(4).isInstanceOf[Long]) assert(row(5) === null) assert(row(6).isInstanceOf[String]) assert(row(7).isInstanceOf[String]) } } }
Example 85
Source File: ThriftRandomExtractor.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package com.memsql.spark.examples.thrift import com.memsql.spark.etl.api._ import com.memsql.spark.etl.utils.PhaseLogger import org.apache.spark.SparkContext import org.apache.spark.sql.{SQLContext, DataFrame, Row} import org.apache.spark.sql.types._ import org.apache.spark.streaming.StreamingContext import org.apache.thrift.protocol.TBinaryProtocol import org.apache.thrift.{TBase, TFieldIdEnum, TSerializer} class ThriftRandomExtractor extends Extractor { var count: Int = 1 var thriftType: Class[_] = null var serializer: TSerializer = null def schema: StructType = StructType(StructField("bytes", BinaryType, false) :: Nil) override def initialize(ssc: StreamingContext, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Unit = { val userConfig = config.asInstanceOf[UserExtractConfig] val className = userConfig.getConfigString("className") match { case Some(s) => s case None => throw new IllegalArgumentException("className must be set in the config") } thriftType = Class.forName(className) serializer = new TSerializer(new TBinaryProtocol.Factory()) count = userConfig.getConfigInt("count").getOrElse(1) } override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Option[DataFrame] = { val rdd = sqlContext.sparkContext.parallelize((1 to count).map(_ => Row({ val thriftObject = ThriftRandomGenerator.next(thriftType).asInstanceOf[TBase[_ <: TBase[_, _], _ <: TFieldIdEnum]] serializer.serialize(thriftObject) }))) Some(sqlContext.createDataFrame(rdd, schema)) } }
Example 86
Source File: CheckpointingKafkaExtractor.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package com.memsql.spark.examples.kafka import com.memsql.spark.etl.api.{UserExtractConfig, PhaseConfig, ByteArrayExtractor} import com.memsql.spark.etl.utils.PhaseLogger import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.StreamingContext import kafka.serializer.{DefaultDecoder, StringDecoder} import org.apache.spark.streaming.kafka.{CheckpointedDirectKafkaInputDStream, CheckpointedKafkaUtils} import org.apache.spark.streaming.dstream.InputDStream class CheckpointingKafkaExtractor extends ByteArrayExtractor { var CHECKPOINT_DATA_VERSION = 1 var dstream: CheckpointedDirectKafkaInputDStream[String, Array[Byte], StringDecoder, DefaultDecoder, Array[Byte]] = null var zkQuorum: String = null var topic: String = null override def initialize(ssc: StreamingContext, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Unit = { val kafkaConfig = config.asInstanceOf[UserExtractConfig] zkQuorum = kafkaConfig.getConfigString("zk_quorum").getOrElse { throw new IllegalArgumentException("\"zk_quorum\" must be set in the config") } topic = kafkaConfig.getConfigString("topic").getOrElse { throw new IllegalArgumentException("\"topic\" must be set in the config") } } def extract(ssc: StreamingContext, extractConfig: PhaseConfig, batchDuration: Long, logger: PhaseLogger): InputDStream[Array[Byte]] = { val kafkaParams = Map[String, String]( "memsql.zookeeper.connect" -> zkQuorum ) val topics = Set(topic) dstream = CheckpointedKafkaUtils.createDirectStreamFromZookeeper[String, Array[Byte], StringDecoder, DefaultDecoder]( ssc, kafkaParams, topics, batchDuration, lastCheckpoint) dstream } override def batchCheckpoint: Option[Map[String, Any]] = { dstream match { case null => None case default => { val currentOffsets = dstream.getCurrentOffsets.map { case (tp, offset) => Map("topic" -> tp.topic, "partition" -> tp.partition, "offset" -> offset) } Some(Map("offsets" -> currentOffsets, "zookeeper" -> zkQuorum, "version" -> CHECKPOINT_DATA_VERSION)) } } } override def batchRetry: Unit = { if (dstream.prevOffsets != null) { dstream.setCurrentOffsets(dstream.prevOffsets) } } }
Example 87
Source File: CheckpointedDirectKafkaInputDStream.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

// NOTE: this listing is an excerpt; the enclosing class declaration and the beginning of
// the method that produces `untilOffsets` and `rdd` are truncated in the source.

    prevOffsets = currentOffsets
    currentOffsets = untilOffsets.map(kv => kv._1 -> kv._2.offset)

    prevOffsets == currentOffsets match {
      case false => Some(rdd)
      case true => None
    }
  }

  def getCurrentOffsets(): Map[TopicAndPartition, Long] = currentOffsets

  def setCurrentOffsets(offsets: Map[TopicAndPartition, Long]): Unit = {
    currentOffsets = offsets
  }
}
Example 88
Source File: FlumeWordCount.scala From Mastering-Scala-Machine-Learning with MIT License | 5 votes |
package org.akozlov.chapter03

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.flume._

object FlumeWordCount {
  def main(args: Array[String]) {
    // Create the context with a 2 second batch size
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("FlumeWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    ssc.checkpoint("/tmp/flume_check")

    val hostPort = args(0).split(":")
    System.out.println("Opening a sink at host: [" + hostPort(0) + "] port: [" + hostPort(1).toInt + "]")
    val lines = FlumeUtils.createPollingStream(ssc, hostPort(0), hostPort(1).toInt, StorageLevel.MEMORY_ONLY)

    val words = lines
      .map(e => new String(e.event.getBody.array)).map(_.toLowerCase).flatMap(_.split("\\W+"))
      .map(word => (word, 1L))
      .reduceByKeyAndWindow(_ + _, _ - _, Seconds(6), Seconds(2)).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 89
Source File: KafkaWordCount.scala From Mastering-Scala-Machine-Learning with MIT License | 5 votes |
package org.akozlov.chapter03

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka._

object KafkaWordCount {
  def main(args: Array[String]) {
    // Create the context with a 2 second batch size
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    ssc.checkpoint("/tmp/kafka_check")

    System.out.println("Opening a Kafka consumer at zk: [" + args(0) + "] for group group-1 and topic example")
    val lines = KafkaUtils.createStream(ssc, args(0), "group-1", Map("example" -> 1), StorageLevel.MEMORY_ONLY)

    val words = lines
      .flatMap(_._2.toLowerCase.split("\\W+"))
      .map(word => (word, 1L))
      .reduceByKeyAndWindow(_ + _, _ - _, Seconds(6), Seconds(2)).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
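The example above uses the inverse-function form of reduceByKeyAndWindow, which is why checkpointing is enabled. As a sketch of the trade-off (my addition, not from the book; the object name WindowedCountSketch is illustrative), the same window can be computed without an inverse function, recomputing each window from scratch and thereby avoiding the checkpointing requirement at the cost of more work per batch.

import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.dstream.DStream

object WindowedCountSketch {
  // `words` stands for the (word, 1L) pair DStream built in KafkaWordCount above.
  def windowedCount(words: DStream[(String, Long)]): DStream[(String, Long)] =
    words.reduceByKeyAndWindow((a: Long, b: Long) => a + b, Seconds(6), Seconds(2))
}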
Example 90
Source File: StreamingTask.scala From spark-cassandra-stress with Apache License 2.0 | 5 votes |
package com.datastax.sparkstress import java.util.concurrent.TimeUnit import com.datastax.spark.connector.cql.CassandraConnector import com.datastax.spark.connector.streaming._ import com.datastax.sparkstress.RowGenerator.PerfRowGenerator import com.datastax.sparkstress.RowTypes._ import com.datastax.sparkstress.SparkStressImplicits._ import com.datastax.sparkstress.StressTask._ import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{StreamingContext, _} import scala.reflect.ClassTag abstract class StreamingTask[rowType]( val config: Config, val ss: SparkSession) (implicit ct:ClassTag[rowType]) extends StressTask { val ssc = new StreamingContext(ss.sparkContext, Seconds(config.streamingBatchIntervalSeconds)) val opsPerBatch = (config.numReceivers * config.receiverThroughputPerBatch) val estimatedReqRuntime: Long = ((config.totalOps / opsPerBatch) * config.streamingBatchIntervalSeconds) + 10 val terminationTime: Long = { if (config.terminationTimeMinutes == 0) { estimatedReqRuntime } else { val newTerminationTime: Long = TimeUnit.MINUTES.toSeconds(config.terminationTimeMinutes) if (estimatedReqRuntime <= newTerminationTime) { println(s"Using the estimated runtime (${estimatedReqRuntime} secs}) required to stream ${config.totalOps} since it is <= the requested runtime (${newTerminationTime} secs).") estimatedReqRuntime } else { println(s"Converting requested runtime of ${config.terminationTimeMinutes} min to ${newTerminationTime} secs.") newTerminationTime } } } def setupCQL() = { val cc = CassandraConnector(ss.sparkContext.getConf) cc.withSessionDo { session => if (config.deleteKeyspace) { println(s"Destroying Keyspace") session.execute(s"DROP KEYSPACE IF EXISTS ${config.keyspace}") } val kscql = getKeyspaceCql(config.keyspace, getLocalDC(cc), config.replicationFactor) val tbcql = getTableCql(config.table) println( s"""Running the following create statements\n$kscql\n${tbcql.mkString("\n")}""") session.execute(kscql) session.execute(s"USE ${config.keyspace}") for (cql <- tbcql) session.execute(cql) } printf("Done Setting up CQL Keyspace/Table\n") } def getTableCql(tbName: String): Seq[String] override def getGenerator: RowGenerator[PerfRowClass] = generator override def dstreamOps(dstream: DStream[PerfRowClass]): Unit = dstream.saveToCassandra(config.keyspace, config.table) }
Example 91
Source File: L10-9Graph.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.graphx.Edge import org.apache.spark.graphx.Graph import org.apache.spark.graphx.Graph.graphToGraphOps import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object UserRankApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: UserRankApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .foreachRDD(rdd => { val edges = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]]) }) .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0))) val vertices = rdd.map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "user_id").extract[String]) }) .map(r => (r.hashCode.toLong, r)) val tolerance = 0.0001 val graph = Graph(vertices, edges, "defaultUser") .subgraph(vpred = (id, idStr) => idStr != "defaultUser") val pr = graph.pageRank(tolerance).cache graph.outerJoinVertices(pr.vertices) { (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs) }.vertices.top(10) { Ordering.by(_._2._1) }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1))) }) ssc.start() ssc.awaitTermination() } }
Example 92
Source File: L10-2DataProc.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.HashPartitioner import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.JsonAST.JNothing import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object DataProcApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: DataProcApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) ssc.socketTextStream(hostname, port.toInt) .map(r => { implicit val formats = DefaultFormats parse(r) }) .filter(jvalue => { jvalue \ "attributes" \ "Wi-Fi" != JNothing }) .map(jvalue => { implicit val formats = DefaultFormats ((jvalue \ "attributes" \ "Wi-Fi").extract[String], (jvalue \ "stars").extract[Int]) }) .combineByKey( (v) => (v, 1), (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), new HashPartitioner(ssc.sparkContext.defaultParallelism)) .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) .print() ssc.start() ssc.awaitTermination() } }
Example 93
Source File: L5-7MultipleSocketStreams.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import java.util.Calendar

object TripByYearMultiApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: TripByYearMultiApp <appname> <hostname> <base_port> <num_of_sockets>")
      System.exit(1)
    }
    val Seq(appName, hostname, basePort, nSockets) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(10))

    val streams = (0 to nSockets.toInt - 1).map(i => ssc.socketTextStream(hostname, basePort.toInt + i))
    val uniStream = ssc.union(streams)

    uniStream
      .map(rec => rec.split(","))
      .map(rec => (rec(13), rec(0).toInt))
      .reduceByKey(_ + _)
      .map(pair => (pair._2, normalizeYear(pair._1)))
      .transform(rec => rec.sortByKey(ascending = false))
      .saveAsTextFiles("TripByYear")

    ssc.start()
    ssc.awaitTermination()
  }

  def normalizeYear(s: String): String = {
    try {
      (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString
    } catch {
      case e: Exception => s
    }
  }
}
Example 94
Source File: L5-9Mqtt.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.apache.spark.streaming.mqtt.MQTTUtils

object YearlyDistributionApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: YearlyDistributionApp <appname> <brokerUrl> <topic> <checkpointDir>")
      System.exit(1)
    }
    val Seq(appName, brokerUrl, topic, checkpointDir) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2)
      .map(rec => rec.split(","))
      .map(rec => (rec(1).split(" ")(0), 1))
      .updateStateByKey(statefulCount)
      .map(pair => (pair._2, pair._1))
      .transform(rec => rec.sortByKey(ascending = false))
      .saveAsTextFiles("YearlyDistribution")

    ssc.start()
    ssc.awaitTermination()
  }

  val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0))
}
Example 95
Source File: L5-11FlumePull.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.flume.FlumeUtils object DailyUserTypeDistributionApp2 { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: DailyUserTypeDistributionApp <appname> <hostname> <port> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) FlumeUtils.createPollingStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => new String(rec.event.getBody().array()).split(",")) .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) .updateStateByKey(statefulCount) .repartition(1) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 96
Source File: L5-6SocketStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import java.util.Calendar

object TripByYearApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: TripByYearApp <appname> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(10))

    ssc.socketTextStream(hostname, port.toInt)
      .map(rec => rec.split(","))
      .map(rec => (rec(13), rec(0).toInt))
      .reduceByKey(_ + _)
      .map(pair => (pair._2, normalizeYear(pair._1)))
      .transform(rec => rec.sortByKey(ascending = false))
      .saveAsTextFiles("TripByYear")

    ssc.start()
    ssc.awaitTermination()
  }

  def normalizeYear(s: String): String = {
    try {
      (Calendar.getInstance().get(Calendar.YEAR) - s.toInt).toString
    } catch {
      case e: Exception => s
    }
  }
}
Example 97
Source File: L5-16Twitter.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.twitter.TwitterUtils import org.apache.spark.storage.StorageLevel import twitter4j.conf.ConfigurationBuilder import twitter4j.TwitterFactory object TwitterApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: TwitterApp <appname> <outputPath>") System.exit(1) } val Seq(appName, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) val cb = new ConfigurationBuilder() cb.setOAuthConsumerKey("") cb.setOAuthConsumerSecret("") cb.setOAuthAccessToken("") cb.setOAuthAccessTokenSecret("") val twitterAuth = new TwitterFactory(cb.build()).getInstance().getAuthorization() val tweetStream = TwitterUtils.createStream(ssc, Some(twitterAuth), Array("nyc citi bike", "nyc bike share")) tweetStream.count().print() tweetStream.saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 98
Source File: HttpInputDStreamAsync.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.ClassTag import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.streaming.api.java.JavaDStream.fromDStream import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import com.ning.http.client.AsyncCompletionHandler import com.ning.http.client.AsyncHttpClient import com.ning.http.client.Response class HttpInputDStreamAsync( @transient ssc_ : StreamingContext, storageLevel: StorageLevel, url: String) extends ReceiverInputDStream[String](ssc_) with Logging { def getReceiver(): Receiver[String] = { new HttpReceiverAsync(storageLevel, url) } } class HttpReceiverAsync( storageLevel: StorageLevel, url: String) extends Receiver[String](storageLevel) with Logging { var asyncHttpClient: AsyncHttpClient = _ def onStop() { asyncHttpClient.close() logInfo("Disconnected from Http Server") } def onStart() { asyncHttpClient = new AsyncHttpClient() asyncHttpClient.prepareGet(url).execute(new AsyncCompletionHandler[Response]() { override def onCompleted(response: Response): Response = { store(response.getResponseBody) return response } override def onThrowable(t: Throwable) { restart("Error! Problems while connecting", t) } }); logInfo("Http Connection initiated") } } object HttpUtilsAsync { def createStream( ssc: StreamingContext, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, url: String): DStream[String] = { new HttpInputDStreamAsync(ssc, storageLevel, url) } def createStream( jssc: JavaStreamingContext, storageLevel: StorageLevel, url: String): JavaDStream[String] = { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] createStream(jssc.ssc, storageLevel, url) } }
Example 99
Source File: L5-11FlumePush.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.flume.FlumeUtils object DailyUserTypeDistributionApp { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: DailyUserTypeDistributionApp <appname> <hostname> <port> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, hostname, port, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) FlumeUtils.createStream(ssc, hostname, port.toInt, StorageLevel.MEMORY_ONLY_SER_2) .map(rec => new String(rec.event.getBody().array()).split(",")) .map(rec => ((rec(1).split(" ")(0), rec(12)), 1)) .updateStateByKey(statefulCount) .repartition(1) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } val statefulCount = (values: Seq[Int], state: Option[Int]) => Some(values.sum + state.getOrElse(0)) }
Example 100
Source File: L5-13Kafka.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.kafka.KafkaUtils object StationJourneyCountApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.streaming.receiver.writeAheadLog.enable", "true") val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val topics = Map[String, Int]( topic -> 1) KafkaUtils.createStream(ssc, zkQuorum, consumerGroupId, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) .map(rec => rec.split(",")) .map(rec => ((rec(3), rec(7)), 1)) .reduceByKey(_ + _) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 101
Source File: L5-18Http.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object HttpApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: HttpApp <appname> <outputPath>") System.exit(1) } val Seq(appName, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://www.citibikenyc.com/stations/json", interval = batchInterval) .flatMap(rec => (parse(rec) \ "stationBeanList").children) .filter(rec => { implicit val formats = DefaultFormats (rec \ "statusKey").extract[Integer] != 1 }) .map(rec => rec.filterField { case JField("id", _) => true case JField("stationName", _) => true case JField("statusValue", _) => true case _ => false }) .map(rec => { implicit val formats = DefaultFormats (rec(0)._2.extract[Integer], rec(1)._2.extract[String], rec(2)._2.extract[String]) }) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 102
Source File: L5-14KafkaCustomConf.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.apache.spark.streaming.kafka.KafkaUtils import kafka.serializer.StringDecoder import org.apache.spark.storage.StorageLevel object StationJourneyCountCustomApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>") System.exit(1) } val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.streaming.receiver.writeAheadLog.enable", "true") val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val topics = Map[String, Int]( topic -> 1) val params = Map[String, String]( "zookeeper.connect" -> zkQuorum, "group.id" -> consumerGroupId, "bootstrap.servers" -> brokerUrl) KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2) .map(rec => rec.split(",")) .map(rec => ((rec(3), rec(7)), 1)) .reduceByKey(_ + _) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rdd => rdd.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 103
Source File: HttpInputDStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.Timer import java.util.TimerTask import scala.reflect.ClassTag import org.apache.http.client.methods.HttpGet import org.apache.http.impl.client.CloseableHttpClient import org.apache.http.impl.client.HttpClients import org.apache.http.util.EntityUtils import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.streaming.api.java.JavaDStream.fromDStream import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver class HttpInputDStream( @transient ssc_ : StreamingContext, storageLevel: StorageLevel, url: String, interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { def getReceiver(): Receiver[String] = { new HttpReceiver(storageLevel, url, interval) } } class HttpReceiver( storageLevel: StorageLevel, url: String, interval: Long) extends Receiver[String](storageLevel) with Logging { var httpClient: CloseableHttpClient = _ var trigger: Timer = _ def onStop() { httpClient.close() logInfo("Disconnected from Http Server") } def onStart() { httpClient = HttpClients.createDefault() trigger = new Timer() trigger.scheduleAtFixedRate(new TimerTask { def run() = doGet() }, 0, interval * 1000) logInfo("Http Receiver initiated") } def doGet() { logInfo("Fetching data from Http source") val response = httpClient.execute(new HttpGet(url)) try { val content = EntityUtils.toString(response.getEntity()) store(content) } catch { case e: Exception => restart("Error! Problems while connecting", e) } finally { response.close() } } } object HttpUtils { def createStream( ssc: StreamingContext, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, url: String, interval: Long): DStream[String] = { new HttpInputDStream(ssc, storageLevel, url, interval) } def createStream( jssc: JavaStreamingContext, storageLevel: StorageLevel, url: String, interval: Long): JavaDStream[String] = { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] createStream(jssc.ssc, storageLevel, url, interval) } }
Example 104
Source File: L7-2-3Tachyon.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object ReferrerApp { def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: ReferrerApp <appname> <hostname> <port> <tachyonUrl> <checkpointDir> <outputPathTop> <outputPathSpark>") System.exit(1) } val Seq(appName, hostname, port, tachyonUrl, checkpointDir, outputPathTop, outputPathSpark) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.externalBlockStore.url", tachyonUrl) val ssc = new StreamingContext(conf, Seconds(10)) ssc.checkpoint(checkpointDir) val clickstream = ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split("\\t")) .persist(StorageLevel.OFF_HEAP) val topRefStream = clickstream .map(rec => { var prev_title = rec(3) if (!prev_title.startsWith("other")) { prev_title = "wikipedia" } (prev_title, 1) }) val topSparkStream = clickstream .filter(rec => rec(4).equals("Apache_Spark")) .map(rec => (rec(3), 1)) saveTopKeys(topRefStream, outputPathTop) saveTopKeys(topSparkStream, outputPathSpark) ssc.start() ssc.awaitTermination() } def saveTopKeys(clickstream: DStream[(String, Int)], outputPath: String) { clickstream.updateStateByKey((values, state: Option[Int]) => Some(values.sum + state.getOrElse(0))) .repartition(1) .map(rec => (rec._2, rec._1)) .transform(rec => rec.sortByKey(ascending = false)) .saveAsTextFiles(outputPath) } }
Example 105
Source File: L7-4UI.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.concurrent.atomic.AtomicLong import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object SocialSearchApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: SocialSearchApp <appname> <hostname> <port>") System.exit(1) } val Seq(appName, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) //.set("spark.eventLog.enabled", "true") //.set("spark.eventLog.dir", "/tmp/historical") val countSearch = new AtomicLong(0) val countSocial = new AtomicLong(0) val ssc = new StreamingContext(conf, Seconds(1)) val titleStream = ssc.socketTextStream(hostname, port.toInt) .map(rec => rec.split("\\t")) .filter(_(3) match { case "other-google" | "other-bing" | "other-yahoo" | "other-facebook" | "other-twitter" => true case _ => false }) .map(rec => (rec(3), rec(4))) .cache() val searchStream = titleStream.filter(_._1 match { case "other-google" | "other-bing" | "other-yahoo" => true case _ => false }) .map(rec => rec._2) val socialStream = titleStream.filter(_._1 match { case "other-facebook" | "other-twitter" => true case _ => false }) .map(rec => rec._2) val exclusiveSearch = searchStream.transformWith(socialStream, (searchRDD: RDD[String], socialRDD: RDD[String]) => searchRDD.subtract(socialRDD)) .foreachRDD(rdd => { countSearch.addAndGet(rdd.count()) println("Exclusive count search engines: " + countSearch) }) val exclusiveSocial = socialStream.transformWith(searchStream, (socialRDD: RDD[String], searchRDD: RDD[String]) => socialRDD.subtract(searchRDD)) .foreachRDD(rdd => { countSocial.addAndGet(rdd.count()) println("Exclusive count social media: " + countSocial) }) ssc.start() ssc.awaitTermination() } }
Example 106
Source File: L4-1Voyager.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.fs.Path import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object VoyagerApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: VoyagerApp <appname> <inputPath> <outputPath>") System.exit(1) } val Seq(appName, inputPath, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.executor.extraJavaOptions", "-XX:+UseConcMarkSweepGC") val ssc = new StreamingContext(conf, Seconds(10)) val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) voyager1.map(rec => { val attrs = rec.split("\\s+") ((attrs(0).toInt), attrs.slice(18, 28).map(_.toDouble)) }).filter(pflux => pflux._2.exists(_ > 1.0)).map(rec => (rec._1, 1)) .reduceByKey(_ + _) .transform(rec => rec.sortByKey(ascending = false, numPartitions = 1)).saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 107
Source File: L4-4Kryo.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.fs.Path import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object VoyagerAppKryo { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: VoyagerAppKryo <appname> <inputPath> <outputPath>") System.exit(1) } val Seq(appName, inputPath, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .registerKryoClasses(Array(classOf[ProtonFlux])) val ssc = new StreamingContext(conf, Seconds(10)) val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val projected = voyager1.map(rec => { val attrs = rec.split("\\s+") new ProtonFlux(attrs(0), attrs(18), attrs(19), attrs(20), attrs(21), attrs(22), attrs(23), attrs(24), attrs(25), attrs(26), attrs(27), attrs(28)) }) val filtered = projected.filter(pflux => pflux.isSolarStorm) val yearlyBreakdown = filtered.map(rec => (rec.year, 1)) .reduceByKey(_ + _) .transform(rec => rec.sortByKey(ascending = false)) yearlyBreakdown.saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 108
Source File: L8-1DataFrameAPI.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 109
Source File: L8-3-6-7DataFrameCreation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.native.Serialization.write import org.json4s.DefaultFormats object DataframeCreationApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { //val cdrs = sqlC.createDataFrame(seqToCdr(rdd)) //val cdrs = sqlC.createDataFrame(seqToCdr(rdd).collect()) //val cdrs = seqToCdr(rdd).toDF() val cdrsJson = seqToCdr(rdd).map(r => { implicit val formats = DefaultFormats write(r) }) val cdrs = sqlC.read.json(cdrsJson) cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 110
Source File: L8-29DataFrameExamplesJoin.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JDouble import org.json4s.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.compact import org.json4s.native.JsonMethods.parse import org.json4s.native.JsonMethods.render import org.json4s.string2JsonInput object CdrDataframeExamples3App { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: CdrDataframeExamples3App <appname> <batchInterval> <hostname> <port> <gridJsonPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, gridJsonPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ implicit val formats = DefaultFormats val gridFile = scala.io.Source.fromFile(gridJsonPath).mkString val gridGeo = (parse(gridFile) \ "features") val gridStr = gridGeo.children.map(r => { val c = (r \ "geometry" \ "coordinates").extract[List[List[List[Float]]]].flatten.flatten.map(r => JDouble(r)) val l = List(("id", r \ "id"), ("x1", c(0)), ("y1", c(1)), ("x2", c(2)), ("y2", c(3)), ("x3", c(4)), ("y3", c(5)), ("x4", c(6)), ("y4", c(7))) compact(render(JObject(l))) }) val gridDF = sqlC.read.json(ssc.sparkContext.makeRDD(gridStr)) val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.join(gridDF, $"squareId" === $"id").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 111
Source File: L8-38SparkR.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import java.nio.file.Paths import org.apache.spark.SparkFiles object CdrStreamingSparkRApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: CdrStreamingSparkRApp <appname> <batchInterval> <hostname> <port> <tableName> <RScriptPath> <RScriptLogsPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ ssc.sparkContext.addFile(rScriptPath) val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString) val master = hiveC.sparkContext.getConf.get("spark.master") val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD((rdd, time) => { val iTableName = tableName + time.milliseconds seqToCdr(rdd).toDF().write.saveAsTable(iTableName) hiveC.sparkContext.parallelize(Array(iTableName)).pipe("%s %s".format(rScriptName, master)).saveAsTextFile(Paths.get(logsPath, iTableName).toString) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 112
Source File: T8-5-L8-30-34DataFrameExamplesActions.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SaveMode import org.apache.spark.sql.functions.desc import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apress.prospark.CdrDataframeExamplesActionsApp.Cdr import org.json4s.DefaultFormats object CdrDataframeExamplesActionsApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamplesActionsApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ implicit val formats = DefaultFormats val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() val counts = cdrs.groupBy("countryCode").count().orderBy(desc("count")) counts.show(5) counts.show() println("head(5): " + counts.head(5)) println("take(5): " + counts.take(5)) println("head(): " + counts.head()) println("first(5): " + counts.first()) println("count(): " + counts.count()) println("collect(): " + counts.collect()) println("collectAsList(): " + counts.collectAsList()) println("describe(): " + cdrs.describe("smsInActivity", "smsOutActivity", "callInActivity", "callOutActivity", "internetTrafficActivity").show()) counts.write.format("parquet").save("/tmp/parquent" + rdd.id) counts.write.format("json").save("/tmp/json" + rdd.id) counts.write.parquet("/tmp/parquent2" + rdd.id) counts.write.json("/tmp/json2" + rdd.id) counts.write.saveAsTable("count_table") cdrs.groupBy("countryCode").count().orderBy(desc("count")).write.mode(SaveMode.Append).save("/tmp/counts") val prop: java.util.Properties = new java.util.Properties() counts.write.jdbc("jdbc:mysql://hostname:port/cdrsdb", "count_table", prop) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 113
Source File: L8-10-11UDF.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.io.Source import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.jackson.JsonMethods.parse import org.json4s.jvalue2extractable import org.json4s.string2JsonInput object CdrUDFApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrUDFApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ def getCountryCodeMapping() = { implicit val formats = org.json4s.DefaultFormats parse(Source.fromURL("http://country.io/phone.json").mkString).extract[Map[String, String]].map(_.swap) } def getCountryNameMapping() = { implicit val formats = org.json4s.DefaultFormats parse(Source.fromURL("http://country.io/names.json").mkString).extract[Map[String, String]] } def getCountryName(mappingPhone: Map[String, String], mappingName: Map[String, String], code: Int) = { mappingName.getOrElse(mappingPhone.getOrElse(code.toString, "NotFound"), "NotFound") } val getCountryNamePartial = getCountryName(getCountryCodeMapping(), getCountryNameMapping(), _: Int) sqlC.udf.register("getCountryNamePartial", getCountryNamePartial) val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.registerTempTable("cdrs") sqlC.sql("SELECT getCountryNamePartial(countryCode) AS countryName, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 114
Source File: L8-4DataFrameCreationSchema.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.Row import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.sql.types.DataType import org.apache.spark.sql.types.StructType import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object DataframeCreationApp2 { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: CdrDataframeApp2 <appname> <batchInterval> <hostname> <port> <schemaPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) val schemaJson = scala.io.Source.fromFile(schemaFile).mkString val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType] val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = sqlC.createDataFrame(rdd.map(c => Row(c: _*)), schema) cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } }
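When no schema file is at hand, the same StructType can be built programmatically. The sketch below is not part of the original listing; the field names simply mirror the Cdr case class used throughout these examples, and the final line shows how to produce the JSON form that DataType.fromJson parses back.

import org.apache.spark.sql.types._

// Programmatic equivalent of the schema loaded from <schemaPath> (illustrative only).
val cdrSchema = StructType(Seq(
  StructField("squareId", IntegerType),
  StructField("timeInterval", LongType),
  StructField("countryCode", IntegerType),
  StructField("smsInActivity", FloatType),
  StructField("smsOutActivity", FloatType),
  StructField("callInActivity", FloatType),
  StructField("callOutActivity", FloatType),
  StructField("internetTrafficActivity", FloatType)))

// cdrSchema.json yields a JSON string that DataType.fromJson can turn back into the schema.
println(cdrSchema.json)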
Example 115
Source File: L8-14-27DataFrameExamples.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions._ import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeExamplesApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamplesApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.select("squareId", "timeInterval", "countryCode").show() cdrs.select($"squareId", $"timeInterval", $"countryCode").show() cdrs.filter("squareId = 5").show() cdrs.drop("countryCode").show() cdrs.select($"squareId", $"timeInterval", $"countryCode").where($"squareId" === 5).show() cdrs.limit(5).show() cdrs.groupBy("squareId").count().show() cdrs.groupBy("countryCode").avg("internetTrafficActivity").show() cdrs.groupBy("countryCode").max("callOutActivity").show() cdrs.groupBy("countryCode").min("callOutActivity").show() cdrs.groupBy("squareId").sum("internetTrafficActivity").show() cdrs.groupBy("squareId").agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() cdrs.groupBy("countryCode").sum("internetTrafficActivity").orderBy(desc("SUM(internetTrafficActivity)")).show() cdrs.agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() cdrs.rollup("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/rollup" + rdd.hashCode()) cdrs.cube("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/cube" + rdd.hashCode()) cdrs.dropDuplicates(Array("callOutActivity", "callInActivity")).show() cdrs.select("squareId", "countryCode", "internetTrafficActivity").distinct.show() cdrs.withColumn("endTime", cdrs("timeInterval") + 600000).show() cdrs.sample(true, 0.01).show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 116
Source File: L8-28DataFrameExamplesOps.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeExamples2App { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamples2App <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ var previousCdrs: Option[DataFrame] = None val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF().select("squareId", "countryCode").dropDuplicates() previousCdrs match { case Some(prevCdrs) => cdrs.unionAll(prevCdrs).show() //case Some(prevCdrs) => cdrs.intersect(prevCdrs).show() //case Some(prevCdrs) => cdrs.except(prevCdrs).show() case None => Unit } previousCdrs = Some(cdrs) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 117
Source File: T8-3DataFrameExamplesNA.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JDouble import org.json4s.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.compact import org.json4s.native.JsonMethods.parse import org.json4s.native.JsonMethods.render import org.json4s.string2JsonInput object CdrDataframeExamplesNAApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamplesNAApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ implicit val formats = DefaultFormats val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.na.drop("any").show() cdrs.na.fill(0, Array("squareId")).show() cdrs.na.replace("squareId", Map(0 -> 1)).show() println("Correlation: " + cdrs.stat.corr("smsOutActivity", "callOutActivity")) println("Covariance: " + cdrs.stat.cov("smsInActivity", "callInActivity")) cdrs.stat.crosstab("squareId", "countryCode").show() cdrs.stat.freqItems(Array("squareId", "countryCode"), 0.1).show() cdrs.stat.crosstab("callOutActivity", "callInActivity").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 118
Source File: L8-8Sql.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrSqlApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrSqlApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.registerTempTable("cdrs") sqlC.sql("SELECT countryCode, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() sqlC.dropTempTable("cdrs") }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
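registerTempTable and SQLContext were deprecated in Spark 2.0. Assuming a Spark 2.x build rather than the 1.x APIs used here, the same top-five aggregation could be written as in the sketch below (illustrative, not the author's code); topCountries is a hypothetical helper name.

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.{count, desc}

// Sketch: Spark 2.x equivalent of the SQL query in CdrSqlApp.
def topCountries(spark: SparkSession, cdrs: DataFrame): Unit = {
  cdrs.createOrReplaceTempView("cdrs")
  spark.sql("SELECT countryCode, COUNT(countryCode) AS cCount FROM cdrs " +
    "GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show()
  // The same aggregation without SQL, using the DataFrame API directly:
  cdrs.groupBy("countryCode").agg(count("countryCode").as("cCount"))
    .orderBy(desc("cCount")).limit(5).show()
}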
Example 119
Source File: L8-35DataFrameExamplesRDD.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.types.DataType import org.apache.spark.sql.types.StructType import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats object CdrDataframeExamplesRDDApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: CdrDataframeExamplesRDDApp <appname> <batchInterval> <hostname> <port> <schemaPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ implicit val formats = DefaultFormats val schemaJson = scala.io.Source.fromFile(schemaFile).mkString val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType] val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() val highInternet = sqlC.createDataFrame(cdrs.rdd.filter(r => r.getFloat(3) + r.getFloat(4) >= r.getFloat(5) + r.getFloat(6)), schema) val highOther = cdrs.except(highInternet) val highInternetGrid = highInternet.select("squareId", "countryCode").dropDuplicates() val highOtherGrid = highOther.select("squareId", "countryCode").dropDuplicates() highOtherGrid.except(highInternetGrid).show() highInternetGrid.except(highOtherGrid).show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 120
Source File: L8-13HiveQL.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrHiveqlApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrHiveqlApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { seqToCdr(rdd).toDF().registerTempTable("cdrs") hiveC.sql("SET DATE_FMT='yy-MM-dd|HH'") hiveC.sql("SELECT from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) AS TS, SUM(smsInActivity + smsOutActivity + callInActivity + callOutActivity + internetTrafficActivity) AS Activity FROM cdrs GROUP BY from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) ORDER BY Activity DESC").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 121
Source File: L6-6PerRecord.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object MqttSinkAppB { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => rdd.foreach { rec => { val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8))) client.disconnect() client.close() } } } ssc.start() ssc.awaitTermination() } }
Example 122
Source File: L6-12StaticPool.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object MqttSinkAppF { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) val mqttSink = ssc.sparkContext.broadcast(MqttSinkLazy(outputBrokerUrl)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => rdd.foreachPartition { par => par.foreach(message => mqttSink.value.client.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) } } ssc.start() ssc.awaitTermination() } } class MqttSinkLazy(brokerUrl: String) extends Serializable { lazy val client = { val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() sys.addShutdownHook { client.disconnect() client.close() } client } } object MqttSinkLazy { val brokerUrl = "tcp://localhost:1883" val client = new MqttSinkLazy(brokerUrl) def apply(brokerUrl: String): MqttSinkLazy = { client } }
Example 123
Source File: L6-8Static.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object MqttSinkAppD { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => rdd.foreachPartition { par => par.foreach(message => MqttSink().publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) } } ssc.start() ssc.awaitTermination() } } object MqttSink { val brokerUrl = "tcp://localhost:1883" val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() sys.addShutdownHook { client.disconnect() client.close() } def apply(): MqttClient = { client } }
Example 124
Source File: L6-18Cassandra.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.Text import java.nio.ByteBuffer import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat import org.apache.cassandra.hadoop.ConfigHelper import org.apache.cassandra.thrift.ColumnOrSuperColumn import org.apache.cassandra.thrift.Column import org.apache.cassandra.utils.ByteBufferUtil import org.apache.cassandra.thrift.Mutation import java.util.Arrays object CassandraSinkApp { def main(args: Array[String]) { if (args.length != 6) { System.err.println( "Usage: CassandraSinkApp <appname> <cassandraHost> <cassandraPort> <keyspace> <columnFamilyName> <columnName>") System.exit(1) } val Seq(appName, cassandraHost, cassandraPort, keyspace, columnFamilyName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .foreachRDD(rdd => { val jobConf = new Configuration() ConfigHelper.setOutputRpcPort(jobConf, cassandraPort) ConfigHelper.setOutputInitialAddress(jobConf, cassandraHost) ConfigHelper.setOutputColumnFamily(jobConf, keyspace, columnFamilyName) ConfigHelper.setOutputPartitioner(jobConf, "Murmur3Partitioner") rdd.map(rec => { val c = new Column() c.setName(ByteBufferUtil.bytes(columnName)) c.setValue(ByteBufferUtil.bytes(rec._2 / (windowSize / batchInterval))) c.setTimestamp(System.currentTimeMillis) val m = new Mutation() m.setColumn_or_supercolumn(new ColumnOrSuperColumn()) m.column_or_supercolumn.setColumn(c) (ByteBufferUtil.bytes(rec._1), Arrays.asList(m)) }).saveAsNewAPIHadoopFile(keyspace, classOf[ByteBuffer], classOf[List[Mutation]], classOf[ColumnFamilyOutputFormat], jobConf) }) ssc.start() ssc.awaitTermination() } }
Example 125
Source File: L6-20CassandraConnector.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput import com.datastax.spark.connector.SomeColumns import com.datastax.spark.connector.cql.CassandraConnector import com.datastax.spark.connector.streaming.toDStreamFunctions import com.datastax.spark.connector.toNamedColumnRef object CassandraConnectorSinkApp { def main(args: Array[String]) { if (args.length != 6) { System.err.println( "Usage: CassandraConnectorSinkApp <appname> <cassandraHost> <cassandraPort> <keyspace> <tableName> <columnName>") System.exit(1) } val Seq(appName, cassandraHost, cassandraPort, keyspace, tableName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.cassandra.connection.host", cassandraHost) .set("spark.cassandra.connection.port", cassandraPort) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) CassandraConnector(conf).withSessionDo { session => session.execute(s"CREATE KEYSPACE IF NOT EXISTS %s WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }".format(keyspace)) session.execute(s"CREATE TABLE IF NOT EXISTS %s.%s (key TEXT PRIMARY KEY, %s FLOAT)".format(keyspace, tableName, columnName)) } HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .map(stock => (stock._1, stock._2 / (windowSize / batchInterval))) .saveToCassandra(keyspace, tableName) ssc.start() ssc.awaitTermination() } }
Example 126
Source File: L6-5Exception.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object MqttSinkAppA { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() rdd.foreach(rec => client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8)))) client.disconnect() client.close() } ssc.start() ssc.awaitTermination() } }
Example 127
Source File: L6-10LazyStatic.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput import org.apache.commons.pool2.PooledObject import org.apache.commons.pool2.BasePooledObjectFactory import org.apache.commons.pool2.impl.DefaultPooledObject import org.apache.commons.pool2.impl.GenericObjectPool import org.apache.commons.pool2.ObjectPool object MqttSinkAppE { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => rdd.foreachPartition { par => val mqttSink = MqttSinkPool().borrowObject() par.foreach(message => mqttSink.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8)))) MqttSinkPool().returnObject(mqttSink) } } ssc.start() ssc.awaitTermination() } } object MqttSinkPool { val poolSize = 8 val brokerUrl = "tcp://localhost:1883" val mqttPool = new GenericObjectPool[MqttClient](new MqttClientFactory(brokerUrl)) mqttPool.setMaxTotal(poolSize) sys.addShutdownHook { mqttPool.close() } def apply(): GenericObjectPool[MqttClient] = { mqttPool } } class MqttClientFactory(brokerUrl: String) extends BasePooledObjectFactory[MqttClient] { override def create() = { val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() client } override def wrap(client: MqttClient) = new DefaultPooledObject[MqttClient](client) override def validateObject(pObj: PooledObject[MqttClient]) = pObj.getObject.isConnected() override def destroyObject(pObj: PooledObject[MqttClient]) = { pObj.getObject.disconnect() pObj.getObject.close() } override def passivateObject(pObj: PooledObject[MqttClient]) = {} }
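One refinement to the pooled-sink pattern in MqttSinkAppE, sketched below as an assumption of good practice rather than something in the listing, is to return the borrowed client in a finally block so that a failed publish does not leak a pooled connection.

// Sketch: defensive variant of the foreachPartition body above (same rdd, topic and pool).
rdd.foreachPartition { par =>
  val mqttSink = MqttSinkPool().borrowObject()
  try {
    par.foreach(message =>
      mqttSink.publish(topic, new MqttMessage(message.getBytes(StandardCharsets.UTF_8))))
  } finally {
    MqttSinkPool().returnObject(mqttSink)
  }
}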
Example 128
Source File: L6-16SparkHBase.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.TableName import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object SparkHBaseBulkPutApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: SparkHBaseBulkPutApp <appname> <tableName> <columnFamilyName> <columnName>") System.exit(1) } val Seq(appName, tableName, columnFamilyName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) val hbaseConf = HBaseConfiguration.create() val hContext = new HBaseContext(ssc.sparkContext, hbaseConf) val windowed = HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) hContext.streamBulkPut[(String, Float)](windowed, TableName.valueOf(tableName), rec => { val put = new Put(rec._1.getBytes) put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) put }) ssc.start() ssc.awaitTermination() } }
Example 129
Source File: L6-22Counters.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.concurrent.atomic.AtomicLong import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object StatefulCountersApp { def main(args: Array[String]) { if (args.length != 1) { System.err.println( "Usage: StatefulCountersApp <appname>") System.exit(1) } val Seq(appName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) var globalMax: AtomicLong = new AtomicLong(Long.MinValue) var globalMin: AtomicLong = new AtomicLong(Long.MaxValue) var globalCounter500: AtomicLong = new AtomicLong(0) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong)) }) .foreachRDD(rdd => { val stocks = rdd.take(10) stocks.foreach(stock => { val price = stock._2 val volume = stock._3 if (volume > globalMax.get()) { globalMax.set(volume) } if (volume < globalMin.get()) { globalMin.set(volume) } if (price > 500) { globalCounter500.incrementAndGet() } }) if (globalCounter500.get() > 1000L) { println("Global counter has reached 1000") println("Max ----> " + globalMax.get) println("Min ----> " + globalMin.get) globalCounter500.set(0) } }) ssc.start() ssc.awaitTermination() } }
Example 130
Source File: L6-24Accumulators.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.collection.mutable import org.apache.spark.AccumulableParam import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object StatefulAccumulatorsApp { object StockAccum extends AccumulableParam[mutable.HashMap[String, (Long, Long, Long)], (String, (Float, Long))] { def zero(t: mutable.HashMap[String, (Long, Long, Long)]): mutable.HashMap[String, (Long, Long, Long)] = { new mutable.HashMap[String, (Long, Long, Long)]() } def addInPlace(t1: mutable.HashMap[String, (Long, Long, Long)], t2: mutable.HashMap[String, (Long, Long, Long)]): mutable.HashMap[String, (Long, Long, Long)] = { t1 ++ t2.map { case (k, v2) => (k -> { val v1 = t1.getOrElse(k, (Long.MaxValue, Long.MinValue, 0L)) val newMin = if (v2._1 < v1._1) v2._1 else v1._1 val newMax = if (v2._2 > v1._2) v2._2 else v1._2 (newMin, newMax, v1._3 + v2._3) }) } } def addAccumulator(t1: mutable.HashMap[String, (Long, Long, Long)], t2: (String, (Float, Long))): mutable.HashMap[String, (Long, Long, Long)] = { val prevStats = t1.getOrElse(t2._1, (Long.MaxValue, Long.MinValue, 0L)) val newVals = t2._2 var newCount = prevStats._3 if (newVals._1 > 500.0) { newCount += 1 } val newMin = if (newVals._2 < prevStats._1) newVals._2 else prevStats._1 val newMax = if (newVals._2 > prevStats._2) newVals._2 else prevStats._2 t1 += t2._1 -> (newMin, newMax, newCount) } } def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: StatefulAccumulatorsApp <appname> <checkpointDir>") System.exit(1) } val Seq(appName, checkpointDir) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) val stateAccum = ssc.sparkContext.accumulable(new mutable.HashMap[String, (Long, Long, Long)]())(StockAccum) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) }) .foreachRDD(rdd => { rdd.foreach({ stock => stateAccum += (stock._1, (stock._2._1, stock._2._2)) }) for ((sym, stats) <- stateAccum.value.to) printf("Symbol: %s, Stats: %s\n", sym, stats) }) ssc.start() ssc.awaitTermination() } }
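AccumulableParam was removed in Spark 2.x. Assuming a Spark 2.x build, roughly the same per-symbol statistics can be kept with an AccumulatorV2 subclass; the sketch below uses illustrative names (StockStatsAccumulator) and is not the book's code.

import scala.collection.mutable
import org.apache.spark.util.AccumulatorV2

// Sketch: per-symbol state kept as (minVolume, maxVolume, countOfPricesOver500).
class StockStatsAccumulator
    extends AccumulatorV2[(String, (Float, Long)), mutable.HashMap[String, (Long, Long, Long)]] {

  private val stats = new mutable.HashMap[String, (Long, Long, Long)]()

  override def isZero: Boolean = stats.isEmpty

  override def copy(): StockStatsAccumulator = {
    val acc = new StockStatsAccumulator
    acc.stats ++= stats
    acc
  }

  override def reset(): Unit = stats.clear()

  override def add(v: (String, (Float, Long))): Unit = {
    val (sym, (price, volume)) = v
    val (curMin, curMax, count) = stats.getOrElse(sym, (Long.MaxValue, Long.MinValue, 0L))
    stats(sym) = (math.min(curMin, volume), math.max(curMax, volume),
      count + (if (price > 500) 1L else 0L))
  }

  override def merge(other: AccumulatorV2[(String, (Float, Long)), mutable.HashMap[String, (Long, Long, Long)]]): Unit =
    other.value.foreach { case (sym, (mn, mx, cnt)) =>
      val (curMin, curMax, count) = stats.getOrElse(sym, (Long.MaxValue, Long.MinValue, 0L))
      stats(sym) = (math.min(curMin, mn), math.max(curMax, mx), count + cnt)
    }

  override def value: mutable.HashMap[String, (Long, Long, Long)] = stats
}

// Register on the driver before use, then call add() inside the RDD action:
//   val stockStats = new StockStatsAccumulator
//   ssc.sparkContext.register(stockStats, "stockStats")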
Example 131
Source File: L6-7PerPartition.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.json4s.DefaultFormats import org.json4s.JField import org.json4s.JsonAST.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object MqttSinkAppC { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: MqttSinkApp <appname> <outputBrokerUrl> <topic>") System.exit(1) } val Seq(appName, outputBrokerUrl, topic) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { val query = parse(rec) \ "query" ((query \ "results" \ "quote").children).map(rec => JObject(JField("Timestamp", query \ "created")).merge(rec)) }) .map(rec => { implicit val formats = DefaultFormats rec.children.map(f => f.extract[String]) mkString "," }) .foreachRDD { rdd => rdd.foreachPartition { par => val client = new MqttClient(outputBrokerUrl, MqttClient.generateClientId(), new MemoryPersistence()) client.connect() par.foreach(rec => client.publish(topic, new MqttMessage(rec.getBytes(StandardCharsets.UTF_8)))) client.disconnect() client.close() } } ssc.start() ssc.awaitTermination() } }
Example 132
Source File: L6-14HBase.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.mapreduce.TableOutputFormat import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.io.Text import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object HBaseSinkApp { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: HBaseSinkApp <appname> <hbaseMaster> <tableName> <columnFamilyName> <columnName>") System.exit(1) } val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val windowSize = 20 val slideInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat)) }) .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval)) .foreachRDD(rdd => { val hbaseConf = HBaseConfiguration.create() hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName) hbaseConf.set("hbase.master", hbaseMaster) val jobConf = new Configuration(hbaseConf) jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName) rdd.map(rec => { val put = new Put(rec._1.getBytes) put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval))) (rec._1, put) }).saveAsNewAPIHadoopDataset(jobConf) }) ssc.start() ssc.awaitTermination() } }
Example 133
Source File: L6-23UpdateState.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions import org.json4s.DefaultFormats import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.parse import org.json4s.string2JsonInput object StatefulUpdateStateApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: StatefulUpdateStateApp <appname> <checkpointDir>") System.exit(1) } val Seq(appName, checkpointDir) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val batchInterval = 10 val ssc = new StreamingContext(conf, Seconds(batchInterval)) ssc.checkpoint(checkpointDir) HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env", interval = batchInterval) .flatMap(rec => { implicit val formats = DefaultFormats val query = parse(rec) \ "query" ((query \ "results" \ "quote").children) .map(rec => ((rec \ "symbol").extract[String], ((rec \ "LastTradePriceOnly").extract[String].toFloat, (rec \ "Volume").extract[String].toLong))) }) .updateStateByKey(updateState) .print() def updateState(values: Seq[(Float, Long)], state: Option[(Long, Long, Long)]): Option[(Long, Long, Long)] = { val volumes = values.map(s => s._2) val localMin = volumes.min val localMax = volumes.max val localCount500 = values.map(s => s._1).count(price => price > 500) val globalValues = state.getOrElse((Long.MaxValue, Long.MinValue, 0L)).asInstanceOf[(Long, Long, Long)] val newMin = if (localMin < globalValues._1) localMin else globalValues._1 val newMax = if (localMax > globalValues._2) localMax else globalValues._2 val newCount500 = globalValues._3 + localCount500 return Some(newMin, newMax, newCount500) } ssc.start() ssc.awaitTermination() } }
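Since Spark 1.6, mapWithState offers a more efficient alternative to updateStateByKey for this kind of per-key state. The sketch below mirrors the min/max/count-over-500 logic of updateState above; it is illustrative only, and trackStock is a hypothetical name.

import org.apache.spark.streaming.{State, StateSpec}

// Sketch: per-symbol state kept as (minVolume, maxVolume, countOfPricesOver500).
def trackStock(symbol: String, value: Option[(Float, Long)],
    state: State[(Long, Long, Long)]): (String, (Long, Long, Long)) = {
  val (curMin, curMax, curCount) = state.getOption().getOrElse((Long.MaxValue, Long.MinValue, 0L))
  val (price, volume) = value.getOrElse((0f, 0L))
  val updated = (math.min(curMin, volume), math.max(curMax, volume),
    curCount + (if (price > 500) 1L else 0L))
  state.update(updated)
  (symbol, updated)
}

// Applied to the same (symbol, (price, volume)) stream built above:
//   stream.mapWithState(StateSpec.function(trackStock _)).print()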
Example 134
Source File: HttpInputDStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.Timer import java.util.TimerTask import scala.reflect.ClassTag import org.apache.http.client.methods.HttpGet import org.apache.http.impl.client.CloseableHttpClient import org.apache.http.impl.client.HttpClients import org.apache.http.util.EntityUtils import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.streaming.api.java.JavaDStream.fromDStream import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver class HttpInputDStream( @transient ssc_ : StreamingContext, storageLevel: StorageLevel, url: String, interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { def getReceiver(): Receiver[String] = { new HttpReceiver(storageLevel, url, interval) } } class HttpReceiver( storageLevel: StorageLevel, url: String, interval: Long) extends Receiver[String](storageLevel) with Logging { var httpClient: CloseableHttpClient = _ var trigger: Timer = _ def onStop() { httpClient.close() logInfo("Disconnected from Http Server") } def onStart() { httpClient = HttpClients.createDefault() trigger = new Timer() trigger.scheduleAtFixedRate(new TimerTask { def run() = doGet() }, 0, interval * 1000) logInfo("Http Receiver initiated") } def doGet() { logInfo("Fetching data from Http source") val response = httpClient.execute(new HttpGet(url)) try { val content = EntityUtils.toString(response.getEntity()) store(content) } catch { case e: Exception => restart("Error! Problems while connecting", e) } finally { response.close() } } } object HttpUtils { def createStream( ssc: StreamingContext, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, url: String, interval: Long): DStream[String] = { new HttpInputDStream(ssc, storageLevel, url, interval) } def createStream( jssc: JavaStreamingContext, storageLevel: StorageLevel, url: String, interval: Long): JavaDStream[String] = { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] createStream(jssc.ssc, storageLevel, url, interval) } }
Example 135
Source File: AzureStreamingExample.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.examples import com.cloudera.spark.cloud.ObjectStoreExample import org.apache.hadoop.fs.Path import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} class AzureStreamingExample extends ObjectStoreExample { /* class header restored; "extends ObjectStoreExample" inferred from the import and helper calls */ override def action( sparkConf: SparkConf, args: Array[String]): Int = { if (args.length != 3) { return usage() } sparkConf.setAppName("CloudStreaming") applyObjectStoreConfigurationOptions(sparkConf, false) val dest = args(0) val delay = Integer.valueOf(args(1)) val interval = Integer.valueOf(args(2)) // Create the context val streaming = new StreamingContext(sparkConf, Seconds(10)) try { // Create the FileInputDStream on the directory regexp and use the // stream to look for a new file renamed into it val destPath = new Path(dest) val sc = streaming.sparkContext val hc = sc.hadoopConfiguration val fs = destPath.getFileSystem(hc) rm(fs, destPath) fs.mkdirs(destPath) val sightings = sc.longAccumulator("sightings") print("===================================") print(s"Looking for text files under ${destPath}") print("===================================") val lines = streaming.textFileStream(dest) val matches = lines.map(line => { sightings.add(1) print(s"[${sightings.value}]: $line") line }) // materialize the operation matches.print() // start the streaming streaming.start() // sleep a bit to get streaming up and running Thread.sleep(delay * 1000) print("===================================") print(s"Seen ${sightings.value} lines") 0 } finally { streaming.stop(true) } } } object AzureStreamingExample { def main(args: Array[String]) { new AzureStreamingExample().run(args) } }
Example 136
Source File: SparkJob.scala From intro-to-dcos with Apache License 2.0 | 5 votes |
package de.codecentric.dcos_intro.spark import de.codecentric.dcos_intro.{Tweet, TweetDecoder} import kafka.serializer.StringDecoder import org.apache.spark.SparkConf import org.apache.spark.streaming.kafka.KafkaUtils import org.apache.spark.streaming.{Seconds, StreamingContext} import com.datastax.spark.connector.streaming._ object SparkJob { def main(args: Array[String]) { val consumerTopic = args(0) val sparkConf = new SparkConf() .setAppName(getClass.getName) .set("spark.cassandra.connection.host", s"${args(1)}") .set("spark.cassandra.connection.port", s"${args(2)}") val consumerProperties = Map("bootstrap.servers" -> args(3), "auto.offset.reset" -> "smallest") val ssc = new StreamingContext(sparkConf, Seconds(1)) val kafkaStream = KafkaUtils.createDirectStream[String, Tweet, StringDecoder, TweetDecoder]( ssc, consumerProperties, Set(consumerTopic) ) kafkaStream.map(tuple => tuple._2).saveToCassandra("dcos", "tweets") ssc.start() ssc.awaitTermination() ssc.stop() } }
Example 137
Source File: TwitterStream.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.timeseries import com.google.gson.GsonBuilder import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.twitter.TwitterUtils import org.apache.spark.streaming.{Minutes, StreamingContext} import org.apache.spark.{Logging, SparkConf, SparkContext} import twitter4j.auth.OAuthAuthorization import twitter4j.conf.ConfigurationBuilder import scala.util.Try object TwitterStream extends SimpleConfig with Logging { def getTwitterStream(ssc: StreamingContext, filters: Seq[String] = Nil) = { val builder = new ConfigurationBuilder() builder.setOAuthConsumerKey(twitterApiKey) builder.setOAuthConsumerSecret(twitterApiSecret) builder.setOAuthAccessToken(twitterTokenKey) builder.setOAuthAccessTokenSecret(twitterTokenSecret) val configuration = builder.build() TwitterUtils.createStream( ssc, Some(new OAuthAuthorization(configuration)), filters, StorageLevel.MEMORY_ONLY ) } def main(args: Array[String]) = { val sparkConf = new SparkConf().setAppName("Twitter Extractor") val sc = new SparkContext(sparkConf) val ssc = new StreamingContext(sc, Minutes(5)) val twitterStream = getTwitterStream(ssc, args).mapPartitions({ it => val gson = new GsonBuilder().create() it map { s => Try(gson.toJson(s)) } }) twitterStream .filter(_.isSuccess) .map(_.get) .saveAsTextFiles("twitter") // Start streaming context ssc.start() ssc.awaitTermination() } }
Example 138
Source File: KappaTagging.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.tagging import com.typesafe.config.ConfigFactory import io.gzet.tagging.gdelt.GdeltTagger import io.gzet.tagging.twitter.TwitterHIS import org.apache.spark.SparkConf import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.kafka.KafkaUtils import org.apache.spark.streaming.twitter.TwitterUtils import org.apache.spark.streaming.{Seconds, StreamingContext} import twitter4j.Status import twitter4j.auth.OAuthAuthorization import twitter4j.conf.ConfigurationBuilder object KappaTagging { final val config = ConfigFactory.load().getConfig("io.gzet.kappa") final val esNodes = config.getString("esNodes") final val batchSize = config.getInt("batchSize") def main(args: Array[String]) = { val sparkConf = new SparkConf().setAppName("GDELT Kappa tagging") val ssc = new StreamingContext(sparkConf, Seconds(batchSize)) val sc = ssc.sparkContext // Create a counter that can be shared accross batches val batchId = sc.longAccumulator("GZET") val twitterStream = createTwitterStream(ssc, Array[String]()) val twitterProcessor = new TwitterHIS() twitterProcessor.train(twitterStream, batchId) val gdeltStream = createGdeltStream(ssc) val gdeltProcessor = new GdeltTagger() gdeltProcessor.predict(gdeltStream, batchId) ssc.start() ssc.awaitTermination() } private def createTwitterStream(ssc: StreamingContext, filters: Array[String]): DStream[Status] = { TwitterUtils.createStream( ssc, getTwitterConfiguration, filters ) } private def getTwitterConfiguration = { val builder = new ConfigurationBuilder() builder.setOAuthConsumerKey(config.getString("apiKey")) builder.setOAuthConsumerSecret(config.getString("apiSecret")) builder.setOAuthAccessToken(config.getString("tokenKey")) builder.setOAuthAccessTokenSecret(config.getString("tokenSecret")) val configuration = builder.build() Some(new OAuthAuthorization(configuration)) } private def createGdeltStream(ssc: StreamingContext) = { val topics = Map( config.getString("kafkaTopic") -> config.getInt("kafkaTopicPartition") ) KafkaUtils.createStream( ssc, config.getString("zkQuorum"), config.getString("kafkaGroupId"), topics ).map(_._2) } }
Example 139
Source File: HashTagsStreamingSpec.scala From dataproc-pubsub-spark-streaming with Apache License 2.0 | 5 votes |
package demo import demo.HashTagsStreaming._ import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} import org.scalatest._ class HashTagsStreamingSpec extends WordSpec with MustMatchers with BeforeAndAfter { private var sc: SparkContext = _ private var ssc: StreamingContext = _ before { val conf = new SparkConf().setAppName("unit-testing").setMaster("local") ssc = new StreamingContext(conf, Seconds(1)) sc = ssc.sparkContext } after { if (ssc != null) { ssc.stop() } } def getPopularTagsTestHelper(input: List[String], expected: List[Popularity]) = { val inputRDD: RDD[String] = sc.parallelize(input) val res: Array[Popularity] = extractTrendingTags(inputRDD).collect() res must have size expected.size res.map(_.tag).toList must contain theSameElementsInOrderAs expected.map(_.tag).toList res.map(_.amount).toList must contain theSameElementsInOrderAs expected.map(_.amount).toList } "getPopularTags op" should { "extract and sorts tags for single rdd" in { getPopularTagsTestHelper(List("#t1 #t2 #t3", "#t1 #t2", "#t1 #t3", "#t1 #t3 #t4"), List(("t1", 4), ("t3", 3), ("t2", 2), ("t4", 1)).map(r => Popularity(r._1, r._2))) } "sort lexicographically in case of equal occurrences" in { getPopularTagsTestHelper(List("#t1 #t2", "#t2 #t1", "#t1", "#t2"), List(("t1", 3), ("t2", 3)).map(r => Popularity(r._1, r._2))) } "bring to lowercase" in { getPopularTagsTestHelper(List("#tag1 #tag2", "#Tag1", "#tag1", "#tAG2"), List(("tag1", 3), ("tag2", 2)).map(r => Popularity(r._1, r._2))) } "remove # only from the beginning of the hashtag" in { getPopularTagsTestHelper(List("#t1 #t2", "#t#1", "#t2#"), List(("t#1", 1), ("t1", 1), ("t2", 1), ("t2#", 1)).map(r => Popularity(r._1, r._2))) } "remove empty hashtags and punctuations" in { getPopularTagsTestHelper(List("#t1 #t2, # #!?", "#t1? ##t2!"), List(("t1", 2), ("#t2", 1), ("t2", 1)).map(r => Popularity(r._1, r._2))) } "ignores non-tags" in { getPopularTagsTestHelper(List("#t1 #t2, #t3 t3", "#t3"), List(("t3", 2), ("t1", 1), ("t2", 1)).map(r => Popularity(r._1, r._2))) } } }
Example 140
Source File: TrendingHashtags.scala From dataproc-pubsub-spark-streaming with Apache License 2.0 | 5 votes |
package demo import java.nio.charset.StandardCharsets import com.google.cloud.datastore._ import demo.DataStoreConverter.saveRDDtoDataStore import demo.HashTagsStreaming.processTrendingHashTags import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.pubsub.{PubsubUtils, SparkGCPCredentials} import org.apache.spark.streaming.{Seconds, StreamingContext} object TrendingHashtags { def createContext(projectID: String, windowLength: String, slidingInterval: String, checkpointDirectory: String) : StreamingContext = { // [START stream_setup] val sparkConf = new SparkConf().setAppName("TrendingHashtags") val ssc = new StreamingContext(sparkConf, Seconds(slidingInterval.toInt)) // Set the checkpoint directory val yarnTags = sparkConf.get("spark.yarn.tags") val jobId = yarnTags.split(",").filter(_.startsWith("dataproc_job")).head ssc.checkpoint(checkpointDirectory + '/' + jobId) // Create stream val messagesStream: DStream[String] = PubsubUtils .createStream( ssc, projectID, None, "tweets-subscription", // Cloud Pub/Sub subscription for incoming tweets SparkGCPCredentials.builder.build(), StorageLevel.MEMORY_AND_DISK_SER_2) .map(message => new String(message.getData(), StandardCharsets.UTF_8)) // [END stream_setup] //process the stream processTrendingHashTags(messagesStream, windowLength.toInt, slidingInterval.toInt, 10, //decoupled handler that saves each separate result for processed to datastore saveRDDtoDataStore(_, windowLength.toInt) ) ssc } def main(args: Array[String]): Unit = { if (args.length != 5) { System.err.println( """ | Usage: TrendingHashtags <projectID> <windowLength> <slidingInterval> <totalRunningTime> | | <projectID>: ID of Google Cloud project | <windowLength>: The duration of the window, in seconds | <slidingInterval>: The interval at which the window calculation is performed, in seconds | <totalRunningTime>: Total running time for the application, in minutes. If 0, runs indefinitely until termination. | <checkpointDirectory>: Directory used to store RDD checkpoint data | """.stripMargin) System.exit(1) } val Seq(projectID, windowLength, slidingInterval, totalRunningTime, checkpointDirectory) = args.toSeq // Create Spark context val ssc = StreamingContext.getOrCreate(checkpointDirectory, () => createContext(projectID, windowLength, slidingInterval, checkpointDirectory)) // Start streaming until we receive an explicit termination ssc.start() if (totalRunningTime.toInt == 0) { ssc.awaitTermination() } else { ssc.awaitTerminationOrTimeout(1000 * 60 * totalRunningTime.toInt) } } }
Example 141
Source File: CountIntByStreaming.scala From wow-spark with MIT License | 5 votes |
package com.sev7e0.wow.kafka

import com.sev7e0.wow.spark_streaming.StreamingLogger
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object CountIntByStreaming {

  val brokerList = "localhost:9092"
  val topic = "randomCount"
  val groupId = "group"
  val path = "temp/checkpoint/CountIntBySS"
  val master = "local"

  def main(args: Array[String]): Unit = {
    val prop = initProperties()
    val topics = Array(topic)

    // Set the log level
    StreamingLogger.setLoggerLevel()

    val sparkConf = new SparkConf()
      .setAppName(CountIntByStreaming.getClass.getName)
      .setMaster(master)

    // Instantiate the StreamingContext with a two-second batch interval
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    // Set the checkpoint path
    ssc.checkpoint(path)

    // Obtain a DStream through KafkaUtils
    val kafkaDS = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, prop))

    kafkaDS.map(record => {
      val value = record.value().toLong
      value
    }).reduce(_ + _).print()

    ssc.start()
    ssc.awaitTermination()
  }

  def initProperties(): Map[String, Object] = Map[String, Object](
    ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
    ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
    ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokerList,
    ConsumerConfig.GROUP_ID_CONFIG -> groupId,
    ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest",
    ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
  )
}
Example 142
Source File: 7_RecoverableNetworkWordCount.scala From wow-spark with MIT License | 5 votes |
package com.sev7e0.wow.spark_streaming

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}
import org.apache.spark.util.LongAccumulator
import org.apache.spark.{SparkConf, SparkContext}

object RecoverableNetworkWordCount {

  def main(args: Array[String]): Unit = {
    StreamingLogger.setLoggerLevel()

    val conf = new SparkConf().setMaster("local").setAppName(RecoverableNetworkWordCount.getClass.getName)
    val context = new StreamingContext(conf, Seconds(1))

    val linesDS = context.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_2)
    val wordsCounts = linesDS.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)

    wordsCounts.foreachRDD((rdd: RDD[(String, Int)], time: Time) => {
      val blackList = WordBlackList.getInstance(context.sparkContext)
      val accumulator = DropWordCounter.getInstance(context.sparkContext)
      val str = rdd.filter { case (word, count) =>
        if (blackList.value.contains(word)) {
          accumulator.add(count)
          false
        } else {
          true
        }
      }.collect().mkString("[", ", ", "]")
      println(s"str = $str")
    })

    // Start the streaming computation; without this the job never runs
    context.start()
    context.awaitTermination()
  }
}

object WordBlackList {

  @volatile private var instance: Broadcast[Seq[String]] = _

  def getInstance(context: SparkContext): Broadcast[Seq[String]] = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          val blackList = Seq("a", "b", "c")
          instance = context.broadcast(blackList)
        }
      }
    }
    instance
  }
}

object DropWordCounter {

  @volatile private var instance: LongAccumulator = _

  def getInstance(context: SparkContext): LongAccumulator = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = context.longAccumulator("WordCount")
        }
      }
    }
    instance
  }
}
Example 143
Source File: StreamingDistinct.scala From wow-spark with MIT License | 5 votes |
package com.sev7e0.wow.spark_streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingDistinct {

  def main(args: Array[String]): Unit = {
    val nameConf = new SparkConf().setMaster("local[*]").setAppName(StreamingDistinct.getClass.getName)
    val context = new StreamingContext(nameConf, Seconds(2))
    val l = System.currentTimeMillis()
    context.checkpoint("target/checkpoint/" + l + "/")

    val socketDS = context.socketTextStream("localhost", 9999, storageLevel = StorageLevel.MEMORY_ONLY)
    val wordsDS = socketDS.flatMap(line => line.split(" "))
    val mapDS = wordsDS.map((_, 1))

    // Deduplication requires keeping state across batches
    val value1 = mapDS.updateStateByKey((value: Seq[Int], state: Option[Int]) => {
      var s = state.getOrElse(0)
      for (_ <- value) {
        if (s == 0) {
          s += 1
        }
      }
      Option(s)
    }).map(key => key._1)

    value1.print()
    // Print the number of distinct words per batch
    value1.count().print()

    context.start()
    context.awaitTermination()
  }
}
Example 144
Source File: 5_DataFrameAndSql.scala From wow-spark with MIT License | 5 votes |
package com.sev7e0.wow.spark_streaming

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkSessionSingleton {

  @transient private var instance: SparkSession = _

  // Lazily create (and cache) a single SparkSession for reuse inside foreachRDD
  def getInstance(conf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder()
        .config(conf)
        .getOrCreate()
    }
    instance
  }
}
Example 145
Source File: FileStreamReader.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.piglet.backends.spark import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.receiver.Receiver import scala.io.Source import java.io.{ FileNotFoundException, IOException } import org.apache.spark.streaming.scheduler._ import org.apache.spark.streaming.StreamingContext class FileStreamReader(file: String, @transient val ssc: StreamingContext) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) { def onStart() { // Start the thread that reads data from a file new Thread("FileStreamReader") { override def run() { receive() } }.start() } def onStop() { // There is nothing to do here } private def receive() { try { for (line <- Source.fromFile(file).getLines()) { store(line) //Thread sleep 1000 // for testing } //stop("stopped ...") // stop receiver //ssc.stop() //SparkStream.ssc.stop(true, true) // stop streaming context gracefully } catch { case ex: FileNotFoundException => println(s"Could not find $file file.") case ex: IOException => println(s"Had an IOException during reading $file file") } finally { stop("Stopped Receiver") ssc.stop(true, true) SparkStream.ssc.stop(true, true) //sys.exit() } } } class FileReader(ssc: StreamingContext) { def readFile(file: String) = ssc.receiverStream(new FileStreamReader(file, ssc)) } object FileStreamReader { implicit def customFileStreamReader(ssc: StreamingContext) = new FileReader(ssc) }
Example 146
Source File: SparkStream.scala From infinispan-spark with Apache License 2.0 | 5 votes |
package org.infinispan.spark.test import java.lang.Thread._ import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerReceiverStarted} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterEach, Suite} trait SparkStream extends BeforeAndAfterEach { this: Suite with RemoteTest => protected var sc: SparkContext = _ protected var ssc: StreamingContext = _ private lazy val config: SparkConf = new SparkConf().setMaster("local[8]").setAppName(this.getClass.getName).set("spark.driver.host","127.0.0.1") override protected def beforeEach(): Unit = { sc = new SparkContext(config) ssc = new StreamingContext(sc, Seconds(1)) super.beforeEach() } override protected def afterEach(): Unit = { ssc.stop(stopSparkContext = true) sc.stop() super.afterEach() } protected def executeAfterReceiverStarted(block: => Unit) = { ssc.addStreamingListener(new StreamingListener { override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = { sleep(1000) block } }) } }
Example 147
Source File: StreamingUtils.scala From infinispan-spark with Apache License 2.0 | 5 votes |
package org.infinispan.spark.test

import java.time.{Duration => JDuration}
import java.util.concurrent.TimeUnit
import java.util.{List => JList}

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

import scala.annotation.meta.param
import scala.collection.JavaConverters._
import scala.concurrent.duration.Duration
import scala.reflect.ClassTag

object StreamingUtils {

  class TestReceiver[T](of: Seq[T], streamItemEvery: Duration) extends Receiver[T](StorageLevel.MEMORY_ONLY) {
    override def onStart(): Unit = {
      of.foreach { item =>
        Thread.sleep(streamItemEvery.toMillis)
        store(item)
      }
    }

    override def onStop(): Unit = {}
  }

  class TestInputDStream[T: ClassTag](@(transient@param) ssc_ : StreamingContext, of: Seq[T], streamItemEvery: Duration)
    extends ReceiverInputDStream[T](ssc_) {
    override def getReceiver(): Receiver[T] = new TestReceiver[T](of, streamItemEvery)
  }

  def createJavaReceiverDInputStream[T](jssc: JavaStreamingContext, of: JList[T], streamItemEvery: JDuration): JavaReceiverInputDStream[T] = {
    implicit val cmt: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]]
    JavaReceiverInputDStream.fromReceiverInputDStream(new TestInputDStream[T](jssc.ssc, of.asScala, Duration(streamItemEvery.getNano, TimeUnit.NANOSECONDS)))
  }
}
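A minimal usage sketch for the test helper above, assuming an already-created StreamingContext named ssc (the values and interval below are illustrative): the TestInputDStream can stand in for a real receiver-backed stream in a unit test.

import scala.concurrent.duration._
import org.infinispan.spark.test.StreamingUtils.TestInputDStream

// ssc: StreamingContext is assumed to exist in the surrounding test
val fakeStream = new TestInputDStream(ssc, of = Seq("a", "b", "c"), streamItemEvery = 100.millis)
fakeStream.map(_.toUpperCase).print()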
Example 148
Source File: InfinispanInputDStream.scala From infinispan-spark with Apache License 2.0 | 5 votes |
package org.infinispan.spark.stream import java.nio._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import org.infinispan.client.hotrod.annotation._ import org.infinispan.client.hotrod.event.{ClientCacheEntryCustomEvent, ClientEvent} import org.infinispan.client.hotrod.{DataFormat, RemoteCache, RemoteCacheManager} import org.infinispan.commons.configuration.ClassWhiteList import org.infinispan.commons.io.UnsignedNumeric import org.infinispan.spark._ import org.infinispan.spark.config.ConnectorConfiguration import org.infinispan.spark.rdd.RemoteCacheManagerBuilder class InfinispanInputDStream[K, V](@transient val ssc_ : StreamingContext, storage: StorageLevel, configuration: ConnectorConfiguration, includeState: Boolean = false) extends ReceiverInputDStream[(K, V, ClientEvent.Type)](ssc_) { override def getReceiver(): Receiver[(K, V, ClientEvent.Type)] = new EventsReceiver(storage, configuration, includeState) } private class EventsReceiver[K, V](storageLevel: StorageLevel, configuration: ConnectorConfiguration, includeState: Boolean) extends Receiver[(K, V, ClientEvent.Type)](storageLevel) { @transient private lazy val listener = if (includeState) new EventListenerWithState(remoteCache.getDataFormat) else new EventListenerWithoutState(remoteCache.getDataFormat) @transient private var cacheManager: RemoteCacheManager = _ @transient private var remoteCache: RemoteCache[K, V] = _ override def onStart(): Unit = { cacheManager = RemoteCacheManagerBuilder.create(configuration) remoteCache = getCache[K, V](configuration, cacheManager) remoteCache.addClientListener(listener) } override def onStop(): Unit = { if (cacheManager != null) { cacheManager.stop() cacheManager = null } } private sealed trait EventListener { var dataFormat: DataFormat @ClientCacheEntryRemoved @ClientCacheEntryExpired def onRemove(event: ClientCacheEntryCustomEvent[Array[Byte]]) { emitEvent(event, ignoreValue = true) } @ClientCacheEntryCreated @ClientCacheEntryModified def onAddModify(event: ClientCacheEntryCustomEvent[Array[Byte]]) { emitEvent(event, ignoreValue = false) } private def emitEvent(event: ClientCacheEntryCustomEvent[Array[Byte]], ignoreValue: Boolean) = { val eventData = event.getEventData val rawData = ByteBuffer.wrap(eventData) val rawKey = readElement(rawData) val classWhiteList = new ClassWhiteList() val key: K = dataFormat.keyToObj[K](rawKey, new ClassWhiteList()) val value = if (!ignoreValue) { val rawValue = readElement(rawData) dataFormat.valueToObj[V](rawValue, classWhiteList) } else null.asInstanceOf[V] store((key, value, event.getType)) } private def readElement(in: ByteBuffer): Array[Byte] = { val length = UnsignedNumeric.readUnsignedInt(in) val element = new Array[Byte](length) in.get(element) element } } @ClientListener(converterFactoryName = "___eager-key-value-version-converter", useRawData = true, includeCurrentState = true) private class EventListenerWithState(var dataFormat: DataFormat) extends EventListener @ClientListener(converterFactoryName = "___eager-key-value-version-converter", useRawData = true, includeCurrentState = false) private class EventListenerWithoutState(var dataFormat: DataFormat) extends EventListener }
Example 149
Source File: StreamConsumerScala.scala From infinispan-spark with Apache License 2.0 | 5 votes |
package org.infinispan.spark.examples.twitter import java.util.concurrent.{Executors, TimeUnit} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.streaming.{Seconds, StreamingContext} import org.infinispan.client.hotrod.RemoteCacheManager import org.infinispan.client.hotrod.configuration.ConfigurationBuilder import org.infinispan.spark.examples.twitter.Sample.{getSparkConf, runAndExit, usageStream} import org.infinispan.spark.examples.util.TwitterDStream import org.infinispan.spark.stream._ import scala.collection.JavaConverters._ import scala.concurrent.duration._ import scala.language.postfixOps object StreamConsumerScala { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) if (args.length < 2) { usageStream("StreamConsumerScala") } val infinispanHost = args(0) val duration = args(1).toLong * 1000 val conf = getSparkConf("spark-infinispan-stream-consumer-scala") val sparkContext = new SparkContext(conf) val streamingContext = new StreamingContext(sparkContext, Seconds(1)) val config = Sample.getConnectorConf(infinispanHost) val remoteCacheManager = new RemoteCacheManager(new ConfigurationBuilder().withProperties(config.getHotRodClientProperties).build()) val cache = remoteCacheManager.getCache[Long, Tweet]("default") val twitterDStream = TwitterDStream.create(streamingContext) val keyValueTweetStream = twitterDStream.map(s => (s.getId, s)) keyValueTweetStream.writeToInfinispan(config) Repeat.every(5 seconds, { val keySet = cache.keySet() val maxKey = keySet.asScala.max println(s"${keySet.size} tweets inserted in the cache") println(s"Last tweet:${Option(cache.get(maxKey)).map(_.getText).getOrElse("<no tweets received so far>")}") println() }) runAndExit(streamingContext, duration) } object Repeat { def every(d: Duration, code: => Unit) = Executors.newSingleThreadScheduledExecutor.scheduleWithFixedDelay(new Runnable { override def run(): Unit = code }, 10, d.toSeconds, TimeUnit.SECONDS) } }
Example 150
Source File: Sample.scala From infinispan-spark with Apache License 2.0 | 5 votes |
package org.infinispan.spark.examples.twitter import org.apache.spark.SparkConf import org.apache.spark.streaming.StreamingContext import org.infinispan.spark.config.ConnectorConfiguration object Sample { def getSparkConf(appName: String): SparkConf = new SparkConf().setAppName(appName) .set("spark.io.compression.codec", "lz4") .set("spark.sql.warehouse.dir", "/usr/local/code") def runAndExit(context: StreamingContext, durationSeconds: Long): Unit = { context.start() context.awaitTerminationOrTimeout(durationSeconds) context.stop(stopSparkContext = false, stopGracefully = true) System.exit(0) } def getConnectorConf(host: String): ConnectorConfiguration = { val configuration = new ConnectorConfiguration().setServerList(host).setCacheName("default") configuration } def usage(className: String): Unit = usage(className, twitter = false) def usageStream(className: String): Unit = usage(className, twitter = true) private def usage(className: String, twitter: Boolean): Unit = { println(s"Usage: $className infinispan_host timeoutSeconds") if (twitter) { println("Twitter OAuth credentials should be set via system properties: ") println("-Dtwitter4j.oauth.consumerKey=... -Dtwitter4j.oauth.consumerSecret=... -Dtwitter4j.oauth.accessToken=... -Dtwitter4j.oauth.accessTokenSecret=...") System.exit(1) } } }
Example 151
Source File: HogzillaStream.scala From hogzilla with GNU General Public License v2.0 | 5 votes |
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.hogzilla.hbase.HogHBaseRDD
import org.hogzilla.initiate.HogInitiate
import org.hogzilla.prepare.HogPrepare
import org.hogzilla.sflow._
import org.hogzilla.http.HogHTTP
import org.hogzilla.auth.HogAuth
import org.hogzilla.dns.HogDNS
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.storage.StorageLevel

object HogzillaStream {

  def main(args: Array[String]) {

    val sparkConf = new SparkConf()
      .setAppName("HogzillaStream")
      .setMaster("local[2]")
      .set("spark.executor.memory", "512m")
      .set("spark.default.parallelism", "16") // 160

    val ssc = new StreamingContext(sparkConf, Seconds(1))
    // Reuse the SparkContext created by the StreamingContext instead of constructing a second one
    val spark = ssc.sparkContext

    // Get the HBase RDD
    val HogRDD = HogHBaseRDD.connect(spark)

    val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)

    val HogRDDAuth = HogHBaseRDD.connectAuth(spark)
    HogAuth.run(HogRDDAuth, spark)

    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()

    // Stop Spark
    spark.stop()

    // Close the HBase Connection
    HogHBaseRDD.close()
  }
}
Example 152
Source File: InputInfoTrackerSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import org.scalatest.BeforeAndAfter import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.streaming.{Time, Duration, StreamingContext} class InputInfoTrackerSuite extends SparkFunSuite with BeforeAndAfter { private var ssc: StreamingContext = _ before { val conf = new SparkConf().setMaster("local[2]").setAppName("DirectStreamTacker") if (ssc == null) { ssc = new StreamingContext(conf, Duration(1000)) } } after { if (ssc != null) { ssc.stop() ssc = null } } test("test report and get InputInfo from InputInfoTracker") { val inputInfoTracker = new InputInfoTracker(ssc) val streamId1 = 0 val streamId2 = 1 val time = Time(0L) val inputInfo1 = StreamInputInfo(streamId1, 100L) val inputInfo2 = StreamInputInfo(streamId2, 300L) inputInfoTracker.reportInfo(time, inputInfo1) inputInfoTracker.reportInfo(time, inputInfo2) val batchTimeToInputInfos = inputInfoTracker.getInfo(time) assert(batchTimeToInputInfos.size == 2) assert(batchTimeToInputInfos.keys === Set(streamId1, streamId2)) assert(batchTimeToInputInfos(streamId1) === inputInfo1) assert(batchTimeToInputInfos(streamId2) === inputInfo2) assert(inputInfoTracker.getInfo(time)(streamId1) === inputInfo1) } test("test cleanup InputInfo from InputInfoTracker") { val inputInfoTracker = new InputInfoTracker(ssc) val streamId1 = 0 val inputInfo1 = StreamInputInfo(streamId1, 100L) val inputInfo2 = StreamInputInfo(streamId1, 300L) inputInfoTracker.reportInfo(Time(0), inputInfo1) inputInfoTracker.reportInfo(Time(1), inputInfo2) inputInfoTracker.cleanup(Time(0)) assert(inputInfoTracker.getInfo(Time(0))(streamId1) === inputInfo1) assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2) inputInfoTracker.cleanup(Time(1)) assert(inputInfoTracker.getInfo(Time(0)).get(streamId1) === None) assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2) } }
Example 153
Source File: StreamingTab.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.{Logging, SparkException} import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} import StreamingTab._ private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(getSparkUI(ssc), "streaming") with Logging { private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 154
Source File: SocketInputDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.util.control.NonFatal
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.NextIterator
import scala.reflect.ClassTag
import java.io._
import java.net.{UnknownHostException, Socket}
import org.apache.spark.Logging
import org.apache.spark.streaming.receiver.Receiver

private[streaming]
class SocketInputDStream[T: ClassTag](
    ssc_ : StreamingContext,
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[T](ssc_) {

  def getReceiver(): Receiver[T] = {
    new SocketReceiver(host, port, bytesToObjects, storageLevel)
  }
}

private[streaming]
class SocketReceiver[T: ClassTag](
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends Receiver[T](storageLevel) with Logging {

  def onStart() {
    // Start the thread that receives data over a connection
    new Thread("Socket Receiver") {
      setDaemon(true)
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // There is nothing much to do as the thread calling receive()
    // is designed to stop by itself if isStopped() returns false
  }

  def bytesToLines(inputStream: InputStream): Iterator[String] = {
    val dataInputStream = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))
    new NextIterator[String] {
      protected override def getNext() = {
        val nextValue = dataInputStream.readLine()
        if (nextValue == null) {
          finished = true
        }
        nextValue
      }

      protected override def close() {
        dataInputStream.close()
      }
    }
  }
}
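For reference, application code does not normally instantiate this private[streaming] class directly; the same receiver is wired up through StreamingContext.socketTextStream. A minimal sketch, assuming an existing StreamingContext named ssc and a text server on localhost:9999:

import org.apache.spark.storage.StorageLevel

// socketTextStream creates a SocketInputDStream/SocketReceiver pair internally
val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER_2)
lines.flatMap(_.split(" ")).countByValue().print()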
Example 155
Source File: DataHub2OdpsDemo.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.streaming.datahub

import com.aliyun.datahub.model.RecordEntry
import com.aliyun.odps.spark.examples.streaming.common.SparkSessionSingleton
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.aliyun.datahub.DatahubUtils

object DataHub2OdpsDemo {

  def transferFunc(record: RecordEntry): String = {
    // This transfer function currently only converts a DataHub Record into a String.
    // If multiple fields are needed, the concatenation logic has to be handled here.
    record.getString(1)
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("DataHubStreamingDemo")
      .config("spark.hadoop.fs.oss.credentials.provider",
        "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider")
      .config("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole")
      .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com")
      .getOrCreate()

    // Set the batch interval
    val ssc = new StreamingContext(spark.sparkContext, Seconds(10))

    // checkpoint dir to oss
    ssc.checkpoint("oss://bucket/inputdata/")

    val dataStream = DatahubUtils.createStream(
      ssc,
      "projectName",
      "topic",
      "subId",
      "accessId",
      "accessKey",
      "endPoint",
      transferFunc(_),
      StorageLevel.MEMORY_AND_DISK
    )

    dataStream.map(x => new String(x)).foreachRDD(rdd => {
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._
      rdd.toDF("id").write.mode("append").saveAsTable("test_table")
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 156
Source File: KMeansStreaming.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter13 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} import scala.collection.mutable.Queue object KMeansStreaming { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.ERROR) val spark = SparkSession .builder .master("local[*]") .appName("KMean Streaming App") .config("spark.sql.warehouse.dir", ".") .config("spark.executor.memory", "2g") .getOrCreate() val ssc = new StreamingContext(spark.sparkContext, Seconds(1)) Logger.getRootLogger.setLevel(Level.WARN) val irisData = IrisData.readFromFile(spark.sparkContext) val lookup = IrisData.buildLabelLookup(irisData) val trainQueue = new Queue[RDD[LabeledPoint]]() val testQueue = new Queue[RDD[LabeledPoint]]() val trainingStream = ssc.queueStream(trainQueue) val testStream = ssc.queueStream(testQueue) val model = new StreamingKMeans().setK(3) .setDecayFactor(1.0) .setRandomCenters(4, 0.0) model.trainOn(trainingStream.map(lp => lp.features)) val values = model.predictOnValues(testStream.map(lp => (lp.label, lp.features))) values.foreachRDD(n => n.foreach(v => { println(v._2, v._1, lookup(v._1.toLong)) })) ssc.start() val irisLabelPoints = irisData.map(record => IrisData.toLabelPoints(record)) val Array(trainData, test) = irisLabelPoints.randomSplit(Array(.80, .20)) trainQueue += irisLabelPoints Thread.sleep(2000) val testGroups = test.randomSplit(Array(.25, .25, .25, .25)) testGroups.foreach(group => { testQueue += group println("-" * 25) Thread.sleep(1000) }) ssc.stop() } }
Example 157
Source File: LogisticStreaming.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter13 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.streaming.{Seconds, StreamingContext} import scala.collection.mutable.Queue object LogisticStreaming { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.ERROR) Logger.getRootLogger.setLevel(Level.WARN) val spark = SparkSession .builder .master("local[*]") .appName("Logistic Streaming App") .config("spark.sql.warehouse.dir", ".") .getOrCreate() import spark.implicits._ val ssc = new StreamingContext(spark.sparkContext, Seconds(2)) val rawDF = spark.read .text("../data/sparkml2/chapter13/pima-indians-diabetes.data").as[String] val buf = rawDF.rdd.map(value => { val data = value.split(",") (data.init.toSeq, data.last) }) val lps = buf.map{ case (feature: Seq[String], label: String) => val featureVector = feature.map(_.toDouble).toArray[Double] LabeledPoint(label.toDouble, Vectors.dense(featureVector)) } val trainQueue = new Queue[RDD[LabeledPoint]]() val testQueue = new Queue[RDD[LabeledPoint]]() val trainingStream = ssc.queueStream(trainQueue) val testStream = ssc.queueStream(testQueue) val numFeatures = 8 val model = new StreamingLogisticRegressionWithSGD() .setInitialWeights(Vectors.zeros(numFeatures)) .setNumIterations(15) .setStepSize(0.5) .setMiniBatchFraction(0.25) model.trainOn(trainingStream) val result = model.predictOnValues(testStream.map(lp => (lp.label, lp.features))) result.map{ case (label: Double, prediction: Double) => (label, prediction) }.print() ssc.start() val Array(trainData, test) = lps.randomSplit(Array(.80, .20)) trainQueue += trainData Thread.sleep(4000) val testGroups = test.randomSplit(Array(.50, .50)) testGroups.foreach(group => { testQueue += group Thread.sleep(2000) }) ssc.stop() } }
Example 158
Source File: MyStreamingKMeans.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter8 import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.{Seconds, StreamingContext} object MyStreamingKMeans { def main(args: Array[String]) { val trainingDir = "../data/sparkml2/chapter8/trainingDir" val testDir = "../data/sparkml2/chapter8/testDir" val batchDuration = 10 val numClusters = 2 val numDimensions = 3 Logger.getLogger("org").setLevel(Level.ERROR) // setup SparkSession to use for interactions with Spark val spark = SparkSession .builder .master("local[*]") .appName("myStreamingKMeans") .config("spark.sql.warehouse.dir", ".") .getOrCreate() val ssc = new StreamingContext(spark.sparkContext, Seconds(batchDuration.toLong)) val trainingData = ssc.textFileStream(trainingDir).map(Vectors.parse) val testData = ssc.textFileStream(testDir).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(numClusters) .setDecayFactor(1.0) .setRandomCenters(numDimensions, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() } } // scalastyle:on println
Example 159
Source File: SparkStreamingTaxiTripToHBase.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.streaming.ingestion.hbase import java.io.File import com.cloudera.sa.taxi360.model.NyTaxiYellowTripBuilder import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._ import kafka.serializer.StringDecoder import org.apache.hadoop.hbase.{HBaseConfiguration, TableName} import org.apache.solr.common.cloud.ZooKeeperException import org.apache.spark.streaming.kafka.KafkaUtils import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object SparkStreamingTaxiTripToHBase { def main(args: Array[String]): Unit = { println("Java Version:" + System.getProperty("java.version")) println("Java Home:" + System.getProperties().getProperty("java.home")) val v:ZooKeeperException = null if (args.length == 0) { println("Args: <KafkaBrokerList> " + "<kafkaTopicList> " + "<numberOfSeconds>" + "<runLocal>" + "<hbaseTable>" + "<numOfSalts>" + "<checkpointDir>" + "<hbaseConfigFolder>") return } val kafkaBrokerList = args(0) val kafkaTopicList = args(1) val numberOfSeconds = args(2).toInt val runLocal = args(3).equals("l") val tableName = args(4) val numOfSalts = args(5).toInt val checkpointFolder = args(6) val hbaseConfigFolder = args(7) println("kafkaBrokerList:" + kafkaBrokerList) println("kafkaTopicList:" + kafkaTopicList) println("numberOfSeconds:" + numberOfSeconds) println("runLocal:" + runLocal) println("tableName:" + tableName) println("numOfSalts:" + numOfSalts) val sc:SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConf = new SparkConf().setAppName("Spark Streaming Ingestion to HBase") new SparkContext(sparkConf) } val ssc = new StreamingContext(sc, Seconds(numberOfSeconds)) val topicsSet = kafkaTopicList.split(",").toSet val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList) val messageStream = KafkaUtils. createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet) val conf = HBaseConfiguration.create() conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL) val hbaseContext = new HBaseContext(sc, conf) val tripDStream = messageStream.map(r => { (r._1, r._2.split(",")) }).filter(r => r._2.size > 3).map(r => { (r._1, NyTaxiYellowTripBuilder.build(r._2)) }) tripDStream.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName), taxi => { TaxiTripHBaseHelper.generatePut(taxi._2, numOfSalts) }) ssc.checkpoint(checkpointFolder) ssc.start() ssc.awaitTermination() } }
Example 160
Source File: KafkaStreamingDemo.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.streaming.kafka

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaStreamingDemo {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("KafkaStreamingDemo")
      .getOrCreate()

    val ssc = new StreamingContext(spark.sparkContext, Seconds(5))

    // Use OSS as the checkpoint storage
    ssc.checkpoint("oss://bucket/checkpointDir/")

    // Kafka configuration parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.1.1:9200,192.168.1.2:9200,192.168.1.3:9200",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "testGroupId",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val topics = Set("event_topic")
    val recordDstream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )

    val dstream = recordDstream.map(f => (f.key(), f.value()))
    val data: DStream[String] = dstream.map(_._2)
    val wordsDStream: DStream[String] = data.flatMap(_.split(" "))
    val wordAndOneDstream: DStream[(String, Int)] = wordsDStream.map((_, 1))
    val result: DStream[(String, Int)] = wordAndOneDstream.reduceByKey(_ + _)
    result.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 161
Source File: Kafka2OdpsDemo.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.streaming.kafka

import com.aliyun.odps.spark.examples.streaming.common.SparkSessionSingleton
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object Kafka2OdpsDemo {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    // Use OSS as the checkpoint storage and replace this with a valid OSS path.
    // For OSS access, see https://github.com/aliyun/MaxCompute-Spark/wiki/08.-Oss-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E
    ssc.checkpoint("oss://bucket/checkpointdir")

    // Kafka configuration parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "testGroupId",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Create the Kafka DStream
    val topics = Set("test")
    val recordDstream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )
    val dstream = recordDstream.map(f => (f.key(), f.value()))

    // Parse the Kafka records and write them to ODPS
    val data: DStream[String] = dstream.map(_._2)
    val wordsDStream: DStream[String] = data.flatMap(_.split(" "))
    wordsDStream.foreachRDD(rdd => {
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._
      rdd.toDF("id").write.mode("append").saveAsTable("test_table")
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 162
Source File: LogHub2OdpsDemo.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.streaming.loghub import com.aliyun.odps.spark.examples.streaming.common.SparkSessionSingleton import com.aliyun.openservices.loghub.client.config.LogHubCursorPosition import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Durations, StreamingContext} import org.apache.spark.streaming.loghub.{LoghubUtils, StreamingParam} object LogHub2OdpsDemo { def buildParam(conf: SparkConf): StreamingParam = { val sp = new StreamingParam() sp.setId(conf.get("spark.logservice.accessKeyId")) sp.setSecret(conf.get("spark.logservice.accessKeySecret")) sp.setEndpoint(conf.get("spark.logservice.endpoint")) sp.setProject(conf.get("spark.logservice.project")) sp.setLogstore(conf.get("spark.logservice.logstore")) sp.setCursor(LogHubCursorPosition.END_CURSOR) sp.setGroup("test") sp.setLevel(StorageLevel.MEMORY_AND_DISK) sp } def main(args: Array[String]) { val conf = new SparkConf(true).setAppName("LogHubStreamingDemo") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Durations.seconds(5)) val lines = LoghubUtils.createStream(ssc, buildParam(conf), 1).map(line => { val str = new String(line) str }) val words = lines.flatMap(_.split(" ")) words.foreachRDD(rdd => { val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf) import spark.implicits._ rdd.toDF("id").write.mode("append").saveAsTable("test_table") }) ssc.start() // Start the computation ssc.awaitTermination() // Wait for the computation to terminate } }
Example 163
Source File: LogHubStreamingDemo.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.streaming.loghub import com.aliyun.openservices.loghub.client.config.LogHubCursorPosition import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.loghub.{LoghubUtils, StreamingParam} import org.apache.spark.streaming.{Durations, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object LogHubStreamingDemo { def buildParam(conf: SparkConf): StreamingParam = { val sp = new StreamingParam() sp.setId(conf.get("spark.logservice.accessKeyId")) sp.setSecret(conf.get("spark.logservice.accessKeySecret")) sp.setEndpoint(conf.get("spark.logservice.endpoint")) sp.setProject(conf.get("spark.logservice.project")) sp.setLogstore(conf.get("spark.logservice.logstore")) sp.setCursor(LogHubCursorPosition.END_CURSOR) sp.setGroup("test") sp.setLevel(StorageLevel.MEMORY_AND_DISK) sp } def main(args: Array[String]) { val conf = new SparkConf(true).setAppName("LogHubStreamingDemo") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, Durations.seconds(5)) val lines = LoghubUtils.createStream(ssc, buildParam(conf), 1).map(line => { val str = new String(line) str }) val words = lines.flatMap(_.split(" ")) val pairs = words.map(word => (word, 1)) val wordCounts = pairs.reduceByKey(_ + _) // Print the first ten elements of each RDD generated in this DStream to the console wordCounts.print() ssc.start() // Start the computation ssc.awaitTermination() // Wait for the computation to terminate } }
Example 164
Source File: DataHubStreamingDemo.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.streaming.datahub

import com.aliyun.datahub.model.RecordEntry
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.aliyun.datahub.DatahubUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DataHubStreamingDemo {

  def transferFunc(record: RecordEntry): String = {
    // This transfer function currently only converts a DataHub Record into a String.
    // If multiple fields are needed, the concatenation logic has to be handled here.
    record.getString(1)
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("DataHubStreamingDemo")
      .config("spark.hadoop.fs.oss.credentials.provider",
        "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider")
      .config("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole")
      .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com")
      .getOrCreate()

    // Set the batch interval
    val ssc = new StreamingContext(spark.sparkContext, Seconds(5))

    // checkpoint dir to oss
    ssc.checkpoint("oss://bucket/inputdata/")

    val dataStream = DatahubUtils.createStream(
      ssc,
      "projectName",
      "topic",
      "subId",
      "accessId",
      "accessKey",
      "endPoint",
      transferFunc(_),
      StorageLevel.MEMORY_AND_DISK
    )

    dataStream.count().print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 165
Source File: QueueInputDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming.{Time, StreamingContext} private[streaming] class QueueInputDStream[T: ClassTag]( ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { override def start() { } override def stop() { } private def readObject(in: ObjectInputStream): Unit = { throw new NotSerializableException("queueStream doesn't support checkpointing. " + "Please don't use queueStream when checkpointing is enabled.") } private def writeObject(oos: ObjectOutputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { val buffer = new ArrayBuffer[RDD[T]]() if (oneAtATime && queue.size > 0) { buffer += queue.dequeue() } else { buffer ++= queue.dequeueAll(_ => true) } if (buffer.size > 0) { if (oneAtATime) { Some(buffer.head) } else { Some(new UnionRDD(context.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) } else { Some(ssc.sparkContext.emptyRDD) } } }
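A minimal usage sketch, assuming an already-created StreamingContext named ssc: queue-backed input streams like the one above are obtained through StreamingContext.queueStream, which is convenient for feeding hand-built RDDs into a job (the KMeans and logistic-regression examples earlier on this page use the same pattern).

import scala.collection.mutable.Queue
import org.apache.spark.rdd.RDD

val rddQueue = new Queue[RDD[Int]]()          // queue that backs the QueueInputDStream
val queued = ssc.queueStream(rddQueue)        // ssc: StreamingContext is assumed to exist
queued.map(_ * 2).print()

rddQueue += ssc.sparkContext.makeRDD(1 to 100, numSlices = 2)   // push one batch of data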
Example 166
Source File: Extractors.scala From streamliner-starter with Apache License 2.0 | 5 votes |
package com.memsql.streamliner.starter

import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.types._
import org.apache.spark.streaming.StreamingContext
import com.memsql.spark.etl.api.{Extractor, PhaseConfig}
import com.memsql.spark.etl.utils.PhaseLogger

// This extract just returns a static range of 5 integers each batch interval
class BasicExtractor extends Extractor {
  override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig,
                    batchInterval: Long, logger: PhaseLogger): Option[DataFrame] = {
    logger.info("extracting a constant sequence DataFrame")

    val schema = StructType(StructField("number", IntegerType, false) :: Nil)

    val sampleData = List(1, 2, 3, 4, 5)
    val rowRDD = sqlContext.sparkContext.parallelize(sampleData).map(Row(_))

    val df = sqlContext.createDataFrame(rowRDD, schema)
    Some(df)
  }
}
Example 167
Source File: StreamingJob.scala From confluent-platform-spark-streaming with Apache License 2.0 | 5 votes |
package example import com.typesafe.config.ConfigFactory import io.confluent.kafka.serializers.KafkaAvroDecoder import kafka.serializer.StringDecoder import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.kafka.KafkaUtils import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkContext, SparkConf} object StreamingJob extends App { // Get job configuration val config = ConfigFactory.load() Logger.getLogger("example").setLevel(Level.toLevel(config.getString("loglevel"))) private val logger = Logger.getLogger(getClass) // Spark config and contexts val sparkMaster = config.getString("spark.master") val sparkConf = new SparkConf() .setMaster(sparkMaster) .setAppName("StreamingExample") .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") val sc = new SparkContext(sparkConf) val batchInterval = config.getInt("spark.batch.interval") val ssc = new StreamingContext(sc, Seconds(batchInterval)) // Create Kafka stream val groupId = config.getString("kafka.group.id") val topic = config.getString("topic") val kafkaParams = Map( "bootstrap.servers" -> config.getString("kafka.bootstrap.servers"), "schema.registry.url" -> config.getString("kafka.schema.registry.url"), "group.id" -> groupId ) @transient val kafkaStream: DStream[(String, Object)] = KafkaUtils.createDirectStream[String, Object, StringDecoder, KafkaAvroDecoder]( ssc, kafkaParams, Set(topic) ) // Load JSON strings into DataFrame kafkaStream.foreachRDD { rdd => // Get the singleton instance of SQLContext val sqlContext = SQLContext.getOrCreate(rdd.sparkContext) import sqlContext.implicits._ val topicValueStrings = rdd.map(_._2.toString) val df = sqlContext.read.json(topicValueStrings) df.printSchema() println("DataFrame count: " + df.count()) df.take(1).foreach(println) } ssc.start() ssc.awaitTermination() }
Example 168
Source File: package.scala From kafka-scala-api with Apache License 2.0 | 5 votes |
package com

import org.apache.spark.streaming.{Seconds, StreamingContext}

package object example {

  def setupLogging(): Unit = {
    import org.apache.log4j.{Level, Logger}
    val rootLogger = Logger.getRootLogger
    rootLogger.setLevel(Level.ERROR)
  }

  def launch(logic: StreamingContext => Unit, appName: String, checkpointPath: String): Unit = {
    val streamingContext = new StreamingContext("local[*]", appName, Seconds(2))
    setupLogging()
    logic.apply(streamingContext)
    streamingContext.checkpoint(checkpointPath)
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
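A minimal sketch of how the launch helper above might be invoked; the socket source, app name and checkpoint path are illustrative:

import org.apache.spark.streaming.StreamingContext

object SocketWordCountApp {
  def wordCount(ssc: StreamingContext): Unit = {
    val lines = ssc.socketTextStream("localhost", 9999)
    lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
  }

  def main(args: Array[String]): Unit =
    com.example.launch(wordCount, appName = "SocketWordCount", checkpointPath = "checkpoint")
}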
Example 169
Source File: KafkaStreamingWC.scala From kafka-scala-api with Apache License 2.0 | 5 votes |
package com.example.kafka08

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import com.example._

object KafkaStreamingWC {

  def main(args: Array[String]): Unit = {
    kafkaStreamingWC
  }

  def kafkaStreamingWC = launch(kafka08StreamingWC, "Kafka08Streaming", "checkpointing")

  def kafka08StreamingWC(ssc: StreamingContext) = {
    val brokers = "127.0.0.1:9092"
    val topics = "sample_topic"
    val topicsSet = topics.split(",").toSet

    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    val lines = messages.map { case (_, value) => value }
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()
  }
}
Example 170
Source File: package.scala From kafka-scala-api with Apache License 2.0 | 5 votes |
package com import org.apache.kafka.common.serialization.StringDeserializer import org.apache.spark.streaming.{Seconds, StreamingContext} package object example { def setupLogging(): Unit = { import org.apache.log4j.{Level, Logger} val rootLogger = Logger.getRootLogger rootLogger.setLevel(Level.ERROR) } def kafkaParams = Map[String, Object]( "bootstrap.servers" -> "127.0.0.1:9092", "key.deserializer" -> classOf[StringDeserializer], "value.deserializer" -> classOf[StringDeserializer], "group.id" -> "mygroup1", "auto.offset.reset" -> "latest", "enable.auto.commit" -> (false: java.lang.Boolean) ) def launchWithCheckpointing(logic: StreamingContext => Unit, appName:String, checkpointPath:String): Unit = { val streamingContext = new StreamingContext("local[*]", appName, Seconds(2)) setupLogging() logic.apply(streamingContext) streamingContext.checkpoint(checkpointPath) streamingContext.start() streamingContext.awaitTermination() } def launchWithItself(logic: StreamingContext => Unit, appName:String): Unit = { val streamingContext = new StreamingContext("local[*]", appName, Seconds(2)) setupLogging() logic.apply(streamingContext) streamingContext.start() streamingContext.awaitTermination() } }
Example 171
Source File: KafkaStreamingLatestExample.scala From kafka-scala-api with Apache License 2.0 | 5 votes |
package com.example.kafka010 import java.{util => ju} import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.InputDStream import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent import org.apache.spark.streaming.kafka010._ import org.apache.spark.{SparkContext, TaskContext} import scala.collection.JavaConversions._ import com.example._ object KafkaStreamingLatestExample { def main(args: Array[String]): Unit = { kafkaStream010Checkpointing() } def kafkaStream010Itself() = launchWithItself(kafkaStreaming010, appName = "Kafka010_DirectStream") private def kafkaStreaming010(streamingContext: StreamingContext): Unit = { val topics = Array("sample_topic") val stream = KafkaUtils.createDirectStream[String, String]( streamingContext, PreferConsistent, //It will consistently distribute partitions across all executors. Subscribe[String, String](topics, kafkaParams) ) stream.map(record => (record.key, record.value)).print() stream.foreachRDD { rdd => val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges rdd.foreachPartition { _ => val o: OffsetRange = offsetRanges(TaskContext.get.partitionId) println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}") } } storingOffsetsItself(stream) } private def storingOffsetsItself(stream: InputDStream[ConsumerRecord[String, String]]) = { stream.foreachRDD { rdd => val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges) } } private def kafkaRdd010() = { val sparkContext = new SparkContext("local[*]", "kafkaRdd010") val offsetRanges = Array( // topic, partition, inclusive starting offset, exclusive ending offset OffsetRange("sample_topic", 0, 10, 20), OffsetRange("sample_topic", 1, 10, 20) ) val params = new ju.HashMap[String, Object](kafkaParams) val kafkaRDD = KafkaUtils.createRDD[String, String](sparkContext, params , offsetRanges, PreferConsistent) println(kafkaRDD.map(_.value()).first()) } }
Example 172
Source File: StreamingSample.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.samples

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import com.couchbase.spark.streaming._
import org.apache.spark.storage.StorageLevel

object StreamingSample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("StreamingSample")
      .set("com.couchbase.username", "Administrator")
      .set("com.couchbase.password", "password")
      .set("com.couchbase.bucket.beer-sample", "")

    val ssc = new StreamingContext(conf, Seconds(5))

    ssc
      .couchbaseStream(from = FromBeginning, to = ToNow, storageLevel = StorageLevel.MEMORY_ONLY)
      .map(_.getClass)
      .countByValue()
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 173
Source File: NetworkWordCount.scala From Hands-On-Deep-Learning-with-Apache-Spark with MIT License | 5 votes |
package org.googlielmo.sparkstreamingkafka

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NetworkWordCount {

  def main(args: Array[String]) {
    if (args.length < 3) {
      System.err.println("Usage: NetworkWordCount <spark_master> <hostname> <port>")
      System.exit(1)
    }

    // Create the context with a 10 seconds batch size
    val sparkConf = new SparkConf().setAppName("NetworkWordCount").setMaster(args(0))
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    // Create a socket stream on target ip:port and count the words in input stream of \n delimited text
    val lines = ssc.socketTextStream(args(1), args(2).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 174
Source File: HBaseStreamingBulkPutExample.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.TableName import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.yetus.audience.InterfaceAudience @InterfaceAudience.Private object HBaseStreamingBulkPutExample { def main(args: Array[String]) { if (args.length < 4) { println("HBaseStreamingBulkPutExample " + "{host} {port} {tableName} {columnFamily} are missing an argument") return } val host = args(0) val port = args(1) val tableName = args(2) val columnFamily = args(3) val sparkConf = new SparkConf().setAppName("HBaseStreamingBulkPutExample " + tableName + " " + columnFamily) val sc = new SparkContext(sparkConf) try { val ssc = new StreamingContext(sc, Seconds(1)) val lines = ssc.socketTextStream(host, port.toInt) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) hbaseContext.streamBulkPut[String](lines, TableName.valueOf(tableName), (putRecord) => { if (putRecord.length() > 0) { val put = new Put(Bytes.toBytes(putRecord)) put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("foo"), Bytes.toBytes("bar")) put } else { null } }) ssc.start() ssc.awaitTerminationOrTimeout(60000) } finally { sc.stop() } } }
Example 175
Source File: CustomReceiver.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import java.io.{BufferedReader, InputStreamReader} import java.net.Socket import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.receiver.Receiver private def receive() { var socket: Socket = null var userInput: String = null try { logInfo(s"Connecting to $host : $port") socket = new Socket(host, port) logInfo(s"Connected to $host : $port") val reader = new BufferedReader( new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)) userInput = reader.readLine() while(!isStopped && userInput != null) { store(userInput) userInput = reader.readLine() } reader.close() socket.close() logInfo("Stopped receiving") restart("Trying to connect again") } catch { case e: java.net.ConnectException => restart(s"Error connecting to $host : $port", e) case t: Throwable => restart("Error receiving data", t) } } } // scalastyle:on println
Example 176
Source File: InputInfoTrackerSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import org.scalatest.BeforeAndAfter import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.streaming.{Duration, StreamingContext, Time} class InputInfoTrackerSuite extends SparkFunSuite with BeforeAndAfter { private var ssc: StreamingContext = _ before { val conf = new SparkConf().setMaster("local[2]").setAppName("DirectStreamTacker") if (ssc == null) { ssc = new StreamingContext(conf, Duration(1000)) } } after { if (ssc != null) { ssc.stop() ssc = null } } test("test report and get InputInfo from InputInfoTracker") { val inputInfoTracker = new InputInfoTracker(ssc) val streamId1 = 0 val streamId2 = 1 val time = Time(0L) val inputInfo1 = StreamInputInfo(streamId1, 100L) val inputInfo2 = StreamInputInfo(streamId2, 300L) inputInfoTracker.reportInfo(time, inputInfo1) inputInfoTracker.reportInfo(time, inputInfo2) val batchTimeToInputInfos = inputInfoTracker.getInfo(time) assert(batchTimeToInputInfos.size == 2) assert(batchTimeToInputInfos.keys === Set(streamId1, streamId2)) assert(batchTimeToInputInfos(streamId1) === inputInfo1) assert(batchTimeToInputInfos(streamId2) === inputInfo2) assert(inputInfoTracker.getInfo(time)(streamId1) === inputInfo1) } test("test cleanup InputInfo from InputInfoTracker") { val inputInfoTracker = new InputInfoTracker(ssc) val streamId1 = 0 val inputInfo1 = StreamInputInfo(streamId1, 100L) val inputInfo2 = StreamInputInfo(streamId1, 300L) inputInfoTracker.reportInfo(Time(0), inputInfo1) inputInfoTracker.reportInfo(Time(1), inputInfo2) inputInfoTracker.cleanup(Time(0)) assert(inputInfoTracker.getInfo(Time(0))(streamId1) === inputInfo1) assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2) inputInfoTracker.cleanup(Time(1)) assert(inputInfoTracker.getInfo(Time(0)).get(streamId1) === None) assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2) } }
Example 177
Source File: StreamingTab.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(StreamingTab.getSparkUI(ssc), "streaming") with Logging { import StreamingTab._ private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) parent.setStreamingJobProgressListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 178
Source File: SocketInputDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import java.io._
import java.net.{ConnectException, Socket}
import java.nio.charset.StandardCharsets

import scala.reflect.ClassTag
import scala.util.control.NonFatal

import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.util.NextIterator

private[streaming]
class SocketInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[T](_ssc) {

  def getReceiver(): Receiver[T] = {
    new SocketReceiver(host, port, bytesToObjects, storageLevel)
  }
}

private[streaming]
class SocketReceiver[T: ClassTag](
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends Receiver[T](storageLevel) with Logging {

  private var socket: Socket = _

  def onStart() {
    logInfo(s"Connecting to $host:$port")
    try {
      socket = new Socket(host, port)
    } catch {
      case e: ConnectException =>
        restart(s"Error connecting to $host:$port", e)
        return
    }
    logInfo(s"Connected to $host:$port")

    // Start the thread that receives data over a connection
    new Thread("Socket Receiver") {
      setDaemon(true)
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // in case restart thread close it twice
    synchronized {
      if (socket != null) {
        socket.close()
        socket = null
        logInfo(s"Closed socket to $host:$port")
      }
    }
  }

  def bytesToLines(inputStream: InputStream): Iterator[String] = {
    val dataInputStream = new BufferedReader(
      new InputStreamReader(inputStream, StandardCharsets.UTF_8))
    new NextIterator[String] {
      protected override def getNext() = {
        val nextValue = dataInputStream.readLine()
        if (nextValue == null) {
          finished = true
        }
        nextValue
      }

      protected override def close() {
        dataInputStream.close()
      }
    }
  }
}
Example 179
Source File: QueueInputDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming.{StreamingContext, Time} private[streaming] class QueueInputDStream[T: ClassTag]( ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { override def start() { } override def stop() { } private def readObject(in: ObjectInputStream): Unit = { throw new NotSerializableException("queueStream doesn't support checkpointing. " + "Please don't use queueStream when checkpointing is enabled.") } private def writeObject(oos: ObjectOutputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { val buffer = new ArrayBuffer[RDD[T]]() queue.synchronized { if (oneAtATime && queue.nonEmpty) { buffer += queue.dequeue() } else { buffer ++= queue queue.clear() } } if (buffer.nonEmpty) { if (oneAtATime) { Some(buffer.head) } else { Some(new UnionRDD(context.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) } else { Some(ssc.sparkContext.emptyRDD) } } }
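QueueInputDStream is not constructed directly; it is what StreamingContext.queueStream returns. A minimal driver-side sketch, assuming a local master and illustrative queue contents, showing how the oneAtATime flag handled in compute above changes what each batch sees:

// Minimal sketch, assuming a local master; the queued RDDs are illustrative.
import scala.collection.mutable.Queue
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object QueueStreamFlagSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext("local[2]", "QueueStreamFlagSketch", Seconds(1))
    val queue = Queue[RDD[Int]](
      ssc.sparkContext.makeRDD(1 to 100, 4),
      ssc.sparkContext.makeRDD(101 to 200, 4))

    // oneAtATime = true  -> each batch dequeues a single RDD (100 elements here);
    // oneAtATime = false -> the first batch unions everything queued so far (200 elements).
    val stream = ssc.queueStream(queue, oneAtATime = true)
    stream.count().print()

    ssc.start()
    ssc.awaitTerminationOrTimeout(5000)
    ssc.stop()
  }
}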
Example 180
Source File: FlumeStreamSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import java.util.concurrent.ConcurrentLinkedQueue import scala.collection.JavaConverters._ import scala.concurrent.duration._ import scala.language.postfixOps import org.jboss.netty.channel.ChannelPipeline import org.jboss.netty.channel.socket.SocketChannel import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory import org.jboss.netty.handler.codec.compression._ import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.concurrent.Eventually._ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.internal.Logging import org.apache.spark.network.util.JavaUtils import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream} class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging { val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeStreamSuite") var ssc: StreamingContext = null test("flume input stream") { testFlumeStream(testCompression = false) } test("flume input compressed stream") { testFlumeStream(testCompression = true) } private class CompressionChannelFactory(compressionLevel: Int) extends NioClientSocketChannelFactory { override def newChannel(pipeline: ChannelPipeline): SocketChannel = { val encoder = new ZlibEncoder(compressionLevel) pipeline.addFirst("deflater", encoder) pipeline.addFirst("inflater", new ZlibDecoder()) super.newChannel(pipeline) } } }
Example 181
Source File: KafkaStreamSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import scala.collection.mutable import scala.concurrent.duration._ import scala.language.postfixOps import scala.util.Random import kafka.serializer.StringDecoder import org.scalatest.BeforeAndAfterAll import org.scalatest.concurrent.Eventually import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext} class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll { private var ssc: StreamingContext = _ private var kafkaTestUtils: KafkaTestUtils = _ override def beforeAll(): Unit = { kafkaTestUtils = new KafkaTestUtils kafkaTestUtils.setup() } override def afterAll(): Unit = { if (ssc != null) { ssc.stop() ssc = null } if (kafkaTestUtils != null) { kafkaTestUtils.teardown() kafkaTestUtils = null } } test("Kafka input stream") { val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName) ssc = new StreamingContext(sparkConf, Milliseconds(500)) val topic = "topic1" val sent = Map("a" -> 5, "b" -> 3, "c" -> 10) kafkaTestUtils.createTopic(topic) kafkaTestUtils.sendMessages(topic, sent) val kafkaParams = Map("zookeeper.connect" -> kafkaTestUtils.zkAddress, "group.id" -> s"test-consumer-${Random.nextInt(10000)}", "auto.offset.reset" -> "smallest") val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder]( ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY) val result = new mutable.HashMap[String, Long]() stream.map(_._2).countByValue().foreachRDD { r => r.collect().foreach { kv => result.synchronized { val count = result.getOrElseUpdate(kv._1, 0) + kv._2 result.put(kv._1, count) } } } ssc.start() eventually(timeout(10000 milliseconds), interval(100 milliseconds)) { assert(result.synchronized { sent === result }) } ssc.stop() } }
Example 182
Source File: NetworkWordCount.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NetworkWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: NetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    // Create the context with a 1 second batch size
    val sparkConf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create a socket stream on target ip:port and count the
    // words in input stream of \n delimited text (eg. generated by 'nc')
    // Note that no duplication in storage level only for running locally.
    // Replication necessary in distributed scenario for fault tolerance.
    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
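The per-batch counts above cover only one second of input at a time. For a sliding aggregate, the same pipeline can use reduceByKeyAndWindow; a minimal sketch assuming the words DStream from the example above, with an illustrative 30-second window sliding every 10 seconds:

// Minimal sketch, assuming the `words` DStream from NetworkWordCount above;
// window and slide durations are illustrative (the slide must be a multiple of the batch interval).
val windowedCounts = words
  .map(word => (word, 1))
  .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(30), Seconds(10))
windowedCounts.print()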
Example 183
Source File: HdfsWordCount.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object HdfsWordCount {
  def main(args: Array[String]) {
    if (args.length < 1) {
      System.err.println("Usage: HdfsWordCount <directory>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()
    val sparkConf = new SparkConf().setAppName("HdfsWordCount")
    // Create the context
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create the FileInputDStream on the directory and use the
    // stream to count words in new files created
    val lines = ssc.textFileStream(args(0))
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 184
Source File: SqlNetworkWordCount.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext, Time} object SparkSessionSingleton { @transient private var instance: SparkSession = _ def getInstance(sparkConf: SparkConf): SparkSession = { if (instance == null) { instance = SparkSession .builder .config(sparkConf) .getOrCreate() } instance } } // scalastyle:on println
Example 185
Source File: StreamingApp.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package main.scala import scala.collection.mutable.{ListBuffer, Queue} import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming._ object SparkStreamingExample { def main(args: Array[String]) { val conf = sys.env.get("SPARK_AUDIT_MASTER") match { case Some(master) => new SparkConf().setAppName("Simple Streaming App").setMaster(master) case None => new SparkConf().setAppName("Simple Streaming App") } val ssc = new StreamingContext(conf, Seconds(1)) val seen = ListBuffer[RDD[Int]]() val rdd1 = ssc.sparkContext.makeRDD(1 to 100, 10) val rdd2 = ssc.sparkContext.makeRDD(1 to 1000, 10) val rdd3 = ssc.sparkContext.makeRDD(1 to 10000, 10) val queue = Queue(rdd1, rdd2, rdd3) val stream = ssc.queueStream(queue) stream.foreachRDD(rdd => seen += rdd) ssc.start() Thread.sleep(5000) def test(f: => Boolean, failureMsg: String) = { if (!f) { println(failureMsg) System.exit(-1) } } val rddCounts = seen.map(rdd => rdd.count()).filter(_ > 0) test(rddCounts.length == 3, "Did not collect three RDD's from stream") test(rddCounts.toSet == Set(100, 1000, 10000), "Did not find expected streams") println("Test succeeded") ssc.stop() } } // scalastyle:on println
Example 186
Source File: QueueStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object QueueStream {
  def main(args: Array[String]) {
    StreamingExamples.setStreamingLogLevels()
    val sparkConf = new SparkConf().setAppName("QueueStream")
    // Create the context
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create the queue through which RDDs can be pushed to
    // a QueueInputDStream
    val rddQueue = new Queue[RDD[Int]]()

    // Create the QueueInputDStream and use it to do some processing
    val inputStream = ssc.queueStream(rddQueue)
    val mappedStream = inputStream.map(x => (x % 10, 1))
    val reducedStream = mappedStream.reduceByKey(_ + _)
    reducedStream.print()
    ssc.start()

    // Create and push some RDDs into rddQueue
    for (i <- 1 to 30) {
      rddQueue.synchronized {
        rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10)
      }
      Thread.sleep(1000)
    }
    ssc.stop()
  }
}
Example 187
Source File: StreamingKMeansExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
// $example on$
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}
// $example off$

object StreamingKMeansExample {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      System.exit(1)
    }

    // $example on$
    val conf = new SparkConf().setAppName("StreamingKMeansExample")
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setK(args(3).toInt)
      .setDecayFactor(1.0)
      .setRandomCenters(args(4).toInt, 0.0)

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
    // $example off$
  }
}
// scalastyle:on println
Example 188
Source File: StreamingTestExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.mllib.stat.test.{BinarySample, StreamingTest}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.util.Utils

object StreamingTestExample {

  def main(args: Array[String]) {
    if (args.length != 3) {
      // scalastyle:off println
      System.err.println(
        "Usage: StreamingTestExample " +
          "<dataDir> <batchDuration> <numBatchesTimeout>")
      // scalastyle:on println
      System.exit(1)
    }
    val dataDir = args(0)
    val batchDuration = Seconds(args(1).toLong)
    val numBatchesTimeout = args(2).toInt

    val conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample")
    val ssc = new StreamingContext(conf, batchDuration)
    ssc.checkpoint {
      val dir = Utils.createTempDir()
      dir.toString
    }

    // $example on$
    val data = ssc.textFileStream(dataDir).map(line => line.split(",") match {
      case Array(label, value) => BinarySample(label.toBoolean, value.toDouble)
    })

    val streamingTest = new StreamingTest()
      .setPeacePeriod(0)
      .setWindowSize(0)
      .setTestMethod("welch")

    val out = streamingTest.registerStream(data)
    out.print()
    // $example off$

    // Stop processing if test becomes significant or we time out
    var timeoutCounter = numBatchesTimeout
    out.foreachRDD { rdd =>
      timeoutCounter -= 1
      val anySignificant = rdd.map(_.pValue < 0.05).fold(false)(_ || _)
      if (timeoutCounter == 0 || anySignificant) rdd.context.stop()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 189
Source File: SystemArg.scala From mist with Apache License 2.0 | 5 votes |
package mist.api

import mist.api.data.JsMap
import org.apache.spark.{SparkContext, SparkSessionUtils}
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaStreamingContext

trait SystemArg[A] extends ArgDef[A] {
  final def validate(params: JsMap): Extraction[Unit] = Extracted(())
}

object SystemArg {

  def apply[A](tags: Seq[String], f: => Extraction[A]): ArgDef[A] = new SystemArg[A] {
    override def extract(ctx: FnContext): Extraction[A] = f
    override def describe() = Seq(InternalArgument(tags))
  }

  def apply[A](tags: Seq[String], f: FullFnContext => Extraction[A]): ArgDef[A] = new SystemArg[A] {
    override def extract(ctx: FnContext): Extraction[A] = ctx match {
      case c: FullFnContext => f(c)
      case _ =>
        val desc = s"Unknown type of job context ${ctx.getClass.getSimpleName} " +
          s"expected ${FullFnContext.getClass.getSimpleName}"
        Failed.InternalError(desc)
    }
    override def describe() = Seq(InternalArgument(tags))
  }
}

trait SparkArgs {

  val sparkContextArg: ArgDef[SparkContext] = SystemArg(
    Seq.empty,
    c => Extracted(c.sc)
  )

  val streamingContextArg: ArgDef[StreamingContext] = SystemArg(Seq(ArgInfo.StreamingContextTag),
    ctx => {
      val ssc = StreamingContext.getActiveOrCreate(() => new StreamingContext(ctx.sc, ctx.streamingDuration))
      Extracted(ssc)
    }
  )

  val sqlContextArg: ArgDef[SQLContext] = SystemArg(Seq(ArgInfo.SqlContextTag),
    ctx => sparkContextArg.map(SQLContext.getOrCreate).extract(ctx)
  )

  // HiveContext should be cached per jvm
  // see #325
  val hiveContextArg: ArgDef[HiveContext] = new SystemArg[HiveContext] {

    var cache: HiveContext = _

    override def extract(ctx: FnContext): Extraction[HiveContext] = synchronized {
      ctx match {
        case c: FullFnContext =>
          if (cache == null)
            cache = new HiveContext(c.sc)
          Extracted(cache)
        case _ =>
          Failed.InternalError(s"Unknown type of job context ${ctx.getClass.getSimpleName} expected ${FullFnContext.getClass.getSimpleName}")
      }
    }

    override def describe(): Seq[ArgInfo] = Seq(InternalArgument(
      Seq(ArgInfo.HiveContextTag, ArgInfo.SqlContextTag)))
  }

  val javaSparkContextArg: ArgDef[JavaSparkContext] = sparkContextArg.map(sc => new JavaSparkContext(sc))
  val javaStreamingContextArg: ArgDef[JavaStreamingContext] = SystemArg(Seq(ArgInfo.StreamingContextTag),
    ctx => streamingContextArg.map(scc => new JavaStreamingContext(scc)).extract(ctx))

  val sparkSessionArg: ArgDef[SparkSession] = SystemArg(Seq(ArgInfo.SqlContextTag),
    ctx => sparkContextArg.map(sc => SparkSessionUtils.getOrCreate(sc, false)).extract(ctx)
  )

  val sparkSessionWithHiveArg: ArgDef[SparkSession] = SystemArg(
    Seq(ArgInfo.SqlContextTag, ArgInfo.HiveContextTag),
    ctx => sparkContextArg.map(sc => SparkSessionUtils.getOrCreate(sc, true)).extract(ctx))
}

object SparkArgs extends SparkArgs
Example 190
Source File: StreamingExample.scala From mist with Apache License 2.0 | 5 votes |
import mist.api._ import mist.api.dsl._ import mist.api.encoding.defaults._ import org.apache.spark.rdd.RDD import org.apache.spark.streaming.StreamingContext import scala.collection.mutable object StreamingExample extends MistFn with Logging { override def handle: Handle = { val raw = onStreamingContext((ssc: StreamingContext) => { val rddQueue = new mutable.Queue[RDD[Int]]() ssc.queueStream(rddQueue) .map(x => (x % 10, 1)) .reduceByKey(_ + _) .foreachRDD((rdd, time) => { val values = rdd.collect().toList val msg = s"time: $time, length: ${values.length}, collection: $values" logger.info(msg) }) ssc.start() (1 to 50).foreach(_ => { rddQueue.synchronized { rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10) } Thread.sleep(1000) }) ssc.stop() }) raw.asHandle } }
Example 191
Source File: StreamingTab.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.{Logging, SparkException} import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} import StreamingTab._ private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(getSparkUI(ssc), "streaming") with Logging { private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 192
Source File: SocketInputDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.util.control.NonFatal
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.NextIterator
import scala.reflect.ClassTag

import java.io._
import java.net.{UnknownHostException, Socket}

import org.apache.spark.Logging
import org.apache.spark.streaming.receiver.Receiver

private[streaming]
class SocketInputDStream[T: ClassTag](
    @transient ssc_ : StreamingContext,
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[T](ssc_) {

  def getReceiver(): Receiver[T] = {
    new SocketReceiver(host, port, bytesToObjects, storageLevel)
  }
}

private[streaming]
class SocketReceiver[T: ClassTag](
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends Receiver[T](storageLevel) with Logging {

  def onStart() {
    // Start the thread that receives data over a connection
    new Thread("Socket Receiver") {
      setDaemon(true)
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // There is nothing much to do, as the thread calling receive()
    // is designed to stop by itself once isStopped() returns true
  }

  def bytesToLines(inputStream: InputStream): Iterator[String] = {
    val dataInputStream = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))
    new NextIterator[String] {
      protected override def getNext() = {
        val nextValue = dataInputStream.readLine()
        if (nextValue == null) {
          finished = true
        }
        nextValue
      }

      protected override def close() {
        dataInputStream.close()
      }
    }
  }
}
Example 193
Source File: QueueInputDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming.{Time, StreamingContext} private[streaming] class QueueInputDStream[T: ClassTag]( @transient ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { override def start() { } override def stop() { } private def readObject(in: ObjectInputStream): Unit = { throw new NotSerializableException("queueStream doesn't support checkpointing. " + "Please don't use queueStream when checkpointing is enabled.") } private def writeObject(oos: ObjectOutputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { val buffer = new ArrayBuffer[RDD[T]]() if (oneAtATime && queue.size > 0) { buffer += queue.dequeue() } else { buffer ++= queue.dequeueAll(_ => true) } if (buffer.size > 0) { if (oneAtATime) { Some(buffer.head) } else { Some(new UnionRDD(ssc.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) } else { None } } }
Example 194
Source File: KinesisInputDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo
import org.apache.spark.streaming.{Duration, StreamingContext, Time}

private[kinesis] class KinesisInputDStream(
    @transient _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[Array[Byte]](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[Array[Byte]] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[Array[Byte]] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, awsCredentialsOption)
  }
}
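Applications normally do not construct this class directly; in this Spark 1.x line it is usually reached through KinesisUtils.createStream. Below is a minimal usage sketch, assuming illustrative stream and application names, a us-east-1 endpoint, and the default AWS credential provider chain:

// Minimal sketch, assuming illustrative names and the default credential provider chain.
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kinesis.KinesisUtils

def kinesisLines(ssc: StreamingContext): Unit = {
  val byteStream = KinesisUtils.createStream(
    ssc,
    "sampleKinesisApp",                          // KCL application name (illustrative)
    "sampleStream",                              // Kinesis stream name (illustrative)
    "https://kinesis.us-east-1.amazonaws.com",   // endpoint URL
    "us-east-1",                                 // region name
    InitialPositionInStream.LATEST,
    Seconds(10),                                 // Kinesis checkpoint interval
    StorageLevel.MEMORY_AND_DISK_2)

  byteStream.map(bytes => new String(bytes, "UTF-8")).print()
}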
Example 195
Source File: TwitterPopularTags.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.SparkContext._
import org.apache.spark.streaming.twitter._
import org.apache.spark.SparkConf

object TwitterPopularTags {
  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println("Usage: TwitterPopularTags <consumer key> <consumer secret> " +
        "<access token> <access token secret> [<filters>]")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
    val filters = args.takeRight(args.length - 4)

    // Set the system properties so that Twitter4j library used by twitter stream
    // can use them to generate OAuth credentials
    System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
    System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
    System.setProperty("twitter4j.oauth.accessToken", accessToken)
    System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)

    val sparkConf = new SparkConf().setAppName("TwitterPopularTags")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    val stream = TwitterUtils.createStream(ssc, None, filters)

    val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))

    val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
      .map{case (topic, count) => (count, topic)}
      .transform(_.sortByKey(false))

    val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
      .map{case (topic, count) => (count, topic)}
      .transform(_.sortByKey(false))

    // Print popular hashtags
    topCounts60.foreachRDD(rdd => {
      val topList = rdd.take(10)
      println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
      topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))}
    })

    topCounts10.foreachRDD(rdd => {
      val topList = rdd.take(10)
      println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
      topList.foreach{case (count, tag) => println("%s (%s tweets)".format(tag, count))}
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
Example 196
Source File: KinesisInputDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo
import org.apache.spark.streaming.{Duration, StreamingContext, Time}

private[kinesis] class KinesisInputDStream[T: ClassTag](
    @transient _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
}
Example 197
Source File: KafkaStreamSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import scala.collection.mutable import scala.concurrent.duration._ import scala.language.postfixOps import scala.util.Random import kafka.serializer.StringDecoder import org.scalatest.BeforeAndAfterAll import org.scalatest.concurrent.Eventually import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext} class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll { private var ssc: StreamingContext = _ private var kafkaTestUtils: KafkaTestUtils = _ override def beforeAll(): Unit = { kafkaTestUtils = new KafkaTestUtils kafkaTestUtils.setup() } override def afterAll(): Unit = { if (ssc != null) { ssc.stop() ssc = null } if (kafkaTestUtils != null) { kafkaTestUtils.teardown() kafkaTestUtils = null } } test("Kafka input stream") { val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName) ssc = new StreamingContext(sparkConf, Milliseconds(500)) val topic = "topic1" val sent = Map("a" -> 5, "b" -> 3, "c" -> 10) kafkaTestUtils.createTopic(topic) kafkaTestUtils.sendMessages(topic, sent) val kafkaParams = Map("zookeeper.connect" -> kafkaTestUtils.zkAddress, "group.id" -> s"test-consumer-${Random.nextInt(10000)}", "auto.offset.reset" -> "smallest") val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder]( ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY) val result = new mutable.HashMap[String, Long]() with mutable.SynchronizedMap[String, Long] stream.map(_._2).countByValue().foreachRDD { r => val ret = r.collect() ret.toMap.foreach { kv => val count = result.getOrElseUpdate(kv._1, 0) + kv._2 result.put(kv._1, count) } } ssc.start() eventually(timeout(10000 milliseconds), interval(100 milliseconds)) { assert(sent === result) } } }
Example 198
Source File: MQTTStreamSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.mqtt

import scala.concurrent.duration._
import scala.language.postfixOps

import org.scalatest.BeforeAndAfter
import org.scalatest.concurrent.Eventually

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

class MQTTStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfter {

  private val batchDuration = Milliseconds(500)
  private val master = "local[2]"
  private val framework = this.getClass.getSimpleName
  private val topic = "def"

  private var ssc: StreamingContext = _
  private var mqttTestUtils: MQTTTestUtils = _

  before {
    ssc = new StreamingContext(master, framework, batchDuration)
    mqttTestUtils = new MQTTTestUtils
    mqttTestUtils.setup()
  }

  after {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }
    if (mqttTestUtils != null) {
      mqttTestUtils.teardown()
      mqttTestUtils = null
    }
  }

  test("mqtt input stream") {
    val sendMessage = "MQTT demo for spark streaming"
    val receiveStream = MQTTUtils.createStream(ssc, "tcp://" + mqttTestUtils.brokerUri, topic,
      StorageLevel.MEMORY_ONLY)

    @volatile var receiveMessage: List[String] = List()
    receiveStream.foreachRDD { rdd =>
      if (rdd.collect.length > 0) {
        receiveMessage = receiveMessage ::: List(rdd.first)
        receiveMessage
      }
    }

    ssc.start()

    // Retry it because we don't know when the receiver will start.
    eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
      mqttTestUtils.publishData(topic, sendMessage)
      assert(sendMessage.equals(receiveMessage(0)))
    }
    ssc.stop()
  }
}
Example 199
Source File: MQTTUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.mqtt import scala.reflect.ClassTag import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.{JavaDStream, JavaReceiverInputDStream, JavaStreamingContext} import org.apache.spark.streaming.dstream.ReceiverInputDStream object MQTTUtils { private[mqtt] class MQTTUtilsPythonHelper { def createStream( jssc: JavaStreamingContext, brokerUrl: String, topic: String, storageLevel: StorageLevel ): JavaDStream[String] = { MQTTUtils.createStream(jssc, brokerUrl, topic, storageLevel) } }
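The Scala createStream overloads of MQTTUtils are elided in the excerpt above. A minimal usage sketch from the Scala side, assuming an MQTT broker at an illustrative tcp:// URL and topic:

// Minimal sketch, assuming an MQTT broker at an illustrative URL and topic.
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.mqtt.MQTTUtils

def mqttWordCount(ssc: StreamingContext): Unit = {
  val lines = MQTTUtils.createStream(ssc, "tcp://localhost:1883", "sample/topic",
    StorageLevel.MEMORY_AND_DISK_SER_2)
  lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
}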
Example 200
Source File: MQTTInputDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.mqtt

import org.eclipse.paho.client.mqttv3.IMqttDeliveryToken
import org.eclipse.paho.client.mqttv3.MqttCallback
import org.eclipse.paho.client.mqttv3.MqttClient
import org.eclipse.paho.client.mqttv3.MqttMessage
import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream._
import org.apache.spark.streaming.receiver.Receiver

private[streaming]
class MQTTInputDStream(
    ssc_ : StreamingContext,
    brokerUrl: String,
    topic: String,
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[String](ssc_) {

  private[streaming] override def name: String = s"MQTT stream [$id]"

  def getReceiver(): Receiver[String] = {
    new MQTTReceiver(brokerUrl, topic, storageLevel)
  }
}

private[streaming]
class MQTTReceiver(
    brokerUrl: String,
    topic: String,
    storageLevel: StorageLevel
  ) extends Receiver[String](storageLevel) {

  def onStop() {
  }

  def onStart() {

    // Set up persistence for messages
    val persistence = new MemoryPersistence()

    // Initializing Mqtt Client specifying brokerUrl, clientID and MqttClientPersistance
    val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), persistence)

    // Callback automatically triggers as and when new message arrives on specified topic
    val callback = new MqttCallback() {

      // Handles Mqtt message
      override def messageArrived(topic: String, message: MqttMessage) {
        store(new String(message.getPayload(), "utf-8"))
      }

      override def deliveryComplete(token: IMqttDeliveryToken) {
      }

      override def connectionLost(cause: Throwable) {
        restart("Connection lost ", cause)
      }
    }

    // Set up callback for MqttClient. This needs to happen before
    // connecting or subscribing, otherwise messages may be lost
    client.setCallback(callback)

    // Connect to MqttBroker
    client.connect()

    // Subscribe to Mqtt topic
    client.subscribe(topic)
  }
}