org.apache.spark.streaming.dstream.ReceiverInputDStream Scala Examples
The following examples show how to use org.apache.spark.streaming.dstream.ReceiverInputDStream.
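Most of the examples below follow the same two-part pattern: a custom Receiver pushes records into Spark with store(), and a ReceiverInputDStream subclass hands that receiver back from getReceiver(). The following minimal sketch illustrates the pattern; the class names and the fixed-items data source are illustrative only and do not come from any of the projects below.

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

// Hypothetical receiver that emits a fixed sequence of strings.
class SampleReceiver(items: Seq[String])
  extends Receiver[String](StorageLevel.MEMORY_ONLY) {

  override def onStart(): Unit = {
    // Receivers typically push data from a background thread via store().
    new Thread("sample-receiver") {
      override def run(): Unit = items.foreach(store)
    }.start()
  }

  override def onStop(): Unit = {}
}

// Hypothetical input DStream that wires the receiver into a StreamingContext.
class SampleInputDStream(ssc: StreamingContext, items: Seq[String])
  extends ReceiverInputDStream[String](ssc) {

  override def getReceiver(): Receiver[String] = new SampleReceiver(items)
}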
Example 1
Source File: ZeroMQStreamSuite.scala From iolap with Apache License 2.0

package org.apache.spark.streaming.zeromq

import akka.actor.SupervisorStrategy
import akka.util.ByteString
import akka.zeromq.Subscribe

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class ZeroMQStreamSuite extends SparkFunSuite {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"
  private val framework: String = this.getClass.getSimpleName

  test("zeromq input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val publishUrl = "abc"
    val subscribe = new Subscribe(null.asInstanceOf[ByteString])
    val bytesToObjects = (bytes: Seq[ByteString]) => null.asInstanceOf[Iterator[String]]

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[String] =
      ZeroMQUtils.createStream(ssc, publishUrl, subscribe, bytesToObjects)
    val test2: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test3: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2,
      SupervisorStrategy.defaultStrategy)

    // TODO: Actually test data receiving
    ssc.stop()
  }
}
Example 2
Source File: TwitterPopularTagsTest.scala From apache-spark-test with Apache License 2.0

package com.github.dnvriend.spark.streaming.twitter

import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark.Tweet
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.dstream.{ DStream, ReceiverInputDStream }
import org.apache.spark.streaming.twitter.TwitterUtils
import org.scalatest.Ignore
import pprint.Config.Colors.PPrintConfig
import pprint._
import twitter4j.Status

// see: https://dev.twitter.com/streaming/overview
// see: https://dev.twitter.com/streaming/public
// see: https://support.twitter.com/articles/20174643
// see: https://github.com/apache/bahir/blob/master/streaming-twitter/examples/src/main/scala/org/apache/spark/examples/streaming/twitter/TwitterPopularTags.scala
// see: http://blog.originate.com/blog/2014/06/15/idiomatic-scala-your-options-do-not-match/

@Ignore
class TwitterPopularTagsTest extends TestSpec {

  it should "find popular tags" in withStreamingContext(2, await = true) { spark => ssc =>
    // val filters = Array("#scala", "#akka", "#spark", "@scala", "@akka", "@spark")
    val filters = Array("#summercamp", "#akka", "#scala", "#fastdata", "#spark", "#hadoop")
    val stream: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters)

    val msgs: DStream[Tweet] = stream.map(Tweet(_))

    msgs.foreachRDD { rdd =>
      rdd.take(10).foreach(pprint.pprintln)
    }

    val hashTags: DStream[String] = stream
      .filter(_.getLang == "en")
      .flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))

    val topCounts60 = hashTags
      .map((_, 1))
      .reduceByKeyAndWindow(_ + _, Seconds(60))
      .map { case (topic, count) => (count, topic) }
      .transform(_.sortByKey(ascending = false))

    val topCounts10 = hashTags
      .map((_, 1))
      .reduceByKeyAndWindow(_ + _, Seconds(10))
      .map { case (topic, count) => (count, topic) }
      .transform(_.sortByKey(false))

    topCounts60.foreachRDD(rdd => {
      val topList = rdd.take(10)
      pprint.pprintln("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
      topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
    })

    topCounts10.foreachRDD(rdd => {
      val topList = rdd.take(10)
      pprint.pprintln("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
      topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
    })

    ssc.start()
  }
}
Example 3
Source File: StreamingUtils.scala From infinispan-spark with Apache License 2.0

package org.infinispan.spark.test

import java.time.{Duration => JDuration}
import java.util.concurrent.TimeUnit
import java.util.{List => JList}

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

import scala.annotation.meta.param
import scala.collection.JavaConverters._
import scala.concurrent.duration.Duration
import scala.reflect.ClassTag

object StreamingUtils {

  class TestReceiver[T](of: Seq[T], streamItemEvery: Duration)
    extends Receiver[T](StorageLevel.MEMORY_ONLY) {

    override def onStart(): Unit = {
      of.foreach { item =>
        Thread.sleep(streamItemEvery.toMillis)
        store(item)
      }
    }

    override def onStop(): Unit = {}
  }

  class TestInputDStream[T: ClassTag](@(transient@param) ssc_ : StreamingContext,
                                      of: Seq[T],
                                      streamItemEvery: Duration)
    extends ReceiverInputDStream[T](ssc_) {

    override def getReceiver(): Receiver[T] = new TestReceiver[T](of, streamItemEvery)
  }

  def createJavaReceiverDInputStream[T](jssc: JavaStreamingContext,
                                        of: JList[T],
                                        streamItemEvery: JDuration): JavaReceiverInputDStream[T] = {
    implicit val cmt: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]]
    JavaReceiverInputDStream.fromReceiverInputDStream(
      new TestInputDStream[T](jssc.ssc, of.asScala,
        Duration(streamItemEvery.getNano, TimeUnit.NANOSECONDS)))
  }
}
Example 4
Source File: InfinispanInputDStream.scala From infinispan-spark with Apache License 2.0

package org.infinispan.spark.stream

import java.nio._

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.infinispan.client.hotrod.annotation._
import org.infinispan.client.hotrod.event.{ClientCacheEntryCustomEvent, ClientEvent}
import org.infinispan.client.hotrod.{DataFormat, RemoteCache, RemoteCacheManager}
import org.infinispan.commons.configuration.ClassWhiteList
import org.infinispan.commons.io.UnsignedNumeric
import org.infinispan.spark._
import org.infinispan.spark.config.ConnectorConfiguration
import org.infinispan.spark.rdd.RemoteCacheManagerBuilder

class InfinispanInputDStream[K, V](@transient val ssc_ : StreamingContext,
                                   storage: StorageLevel,
                                   configuration: ConnectorConfiguration,
                                   includeState: Boolean = false)
  extends ReceiverInputDStream[(K, V, ClientEvent.Type)](ssc_) {

  override def getReceiver(): Receiver[(K, V, ClientEvent.Type)] =
    new EventsReceiver(storage, configuration, includeState)
}

private class EventsReceiver[K, V](storageLevel: StorageLevel,
                                   configuration: ConnectorConfiguration,
                                   includeState: Boolean)
  extends Receiver[(K, V, ClientEvent.Type)](storageLevel) {

  @transient private lazy val listener =
    if (includeState) new EventListenerWithState(remoteCache.getDataFormat)
    else new EventListenerWithoutState(remoteCache.getDataFormat)

  @transient private var cacheManager: RemoteCacheManager = _
  @transient private var remoteCache: RemoteCache[K, V] = _

  override def onStart(): Unit = {
    cacheManager = RemoteCacheManagerBuilder.create(configuration)
    remoteCache = getCache[K, V](configuration, cacheManager)
    remoteCache.addClientListener(listener)
  }

  override def onStop(): Unit = {
    if (cacheManager != null) {
      cacheManager.stop()
      cacheManager = null
    }
  }

  private sealed trait EventListener {

    var dataFormat: DataFormat

    @ClientCacheEntryRemoved
    @ClientCacheEntryExpired
    def onRemove(event: ClientCacheEntryCustomEvent[Array[Byte]]) {
      emitEvent(event, ignoreValue = true)
    }

    @ClientCacheEntryCreated
    @ClientCacheEntryModified
    def onAddModify(event: ClientCacheEntryCustomEvent[Array[Byte]]) {
      emitEvent(event, ignoreValue = false)
    }

    private def emitEvent(event: ClientCacheEntryCustomEvent[Array[Byte]], ignoreValue: Boolean) = {
      val eventData = event.getEventData
      val rawData = ByteBuffer.wrap(eventData)
      val rawKey = readElement(rawData)
      val classWhiteList = new ClassWhiteList()
      val key: K = dataFormat.keyToObj[K](rawKey, new ClassWhiteList())
      val value = if (!ignoreValue) {
        val rawValue = readElement(rawData)
        dataFormat.valueToObj[V](rawValue, classWhiteList)
      } else null.asInstanceOf[V]
      store((key, value, event.getType))
    }

    private def readElement(in: ByteBuffer): Array[Byte] = {
      val length = UnsignedNumeric.readUnsignedInt(in)
      val element = new Array[Byte](length)
      in.get(element)
      element
    }
  }

  @ClientListener(converterFactoryName = "___eager-key-value-version-converter",
    useRawData = true, includeCurrentState = true)
  private class EventListenerWithState(var dataFormat: DataFormat) extends EventListener

  @ClientListener(converterFactoryName = "___eager-key-value-version-converter",
    useRawData = true, includeCurrentState = false)
  private class EventListenerWithoutState(var dataFormat: DataFormat) extends EventListener
}
Example 5
Source File: KinesisInputDStream.scala From BigDatalog with Apache License 2.0

package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo
import org.apache.spark.streaming.{Duration, StreamingContext, Time}

private[kinesis] class KinesisInputDStream[T: ClassTag](
    @transient _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
}
Example 6
Source File: MQTTUtils.scala From BigDatalog with Apache License 2.0

package org.apache.spark.streaming.mqtt

import scala.reflect.ClassTag

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaDStream, JavaReceiverInputDStream, JavaStreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

object MQTTUtils {

  private[mqtt] class MQTTUtilsPythonHelper {

    def createStream(
        jssc: JavaStreamingContext,
        brokerUrl: String,
        topic: String,
        storageLevel: StorageLevel
      ): JavaDStream[String] = {
      MQTTUtils.createStream(jssc, brokerUrl, topic, storageLevel)
    }
  }
}
Example 7
Source File: ZeroMQStreamSuite.scala From BigDatalog with Apache License 2.0

package org.apache.spark.streaming.zeromq

import akka.actor.SupervisorStrategy
import akka.util.ByteString
import akka.zeromq.Subscribe

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class ZeroMQStreamSuite extends SparkFunSuite {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"
  private val framework: String = this.getClass.getSimpleName

  test("zeromq input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val publishUrl = "abc"
    val subscribe = new Subscribe(null.asInstanceOf[ByteString])
    val bytesToObjects = (bytes: Seq[ByteString]) => null.asInstanceOf[Iterator[String]]

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[String] =
      ZeroMQUtils.createStream(ssc, publishUrl, subscribe, bytesToObjects)
    val test2: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test3: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2,
      SupervisorStrategy.defaultStrategy)

    // TODO: Actually test data receiving
    ssc.stop()
  }
}
Example 8
Source File: TwitterStreamSuite.scala From BigDatalog with Apache License 2.0

package org.apache.spark.streaming.twitter

import org.scalatest.BeforeAndAfter
import twitter4j.Status
import twitter4j.auth.{NullAuthorization, Authorization}

import org.apache.spark.{Logging, SparkFunSuite}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class TwitterStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"
  private val framework: String = this.getClass.getSimpleName

  test("twitter input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val filters = Seq("filter1", "filter2")
    val authorization: Authorization = NullAuthorization.getInstance()

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None)
    val test2: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters)
    val test3: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test4: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization))
    val test5: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization), filters)
    val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream(
      ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2)

    // Note that actually testing the data receiving is hard as authentication keys are
    // necessary for accessing Twitter live stream
    ssc.stop()
  }
}
Example 9
Source File: KinesisInputDStream.scala From spark1.52 with Apache License 2.0

package org.apache.spark.streaming.kinesis

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo
import org.apache.spark.streaming.{Duration, StreamingContext, Time}

private[kinesis] class KinesisInputDStream(
    @transient _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[Array[Byte]](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[Array[Byte]] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[Array[Byte]] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, awsCredentialsOption)
  }
}
Example 10
Source File: MQTTUtils.scala From spark1.52 with Apache License 2.0

package org.apache.spark.streaming.mqtt

import scala.reflect.ClassTag

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaDStream, JavaReceiverInputDStream, JavaStreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

object MQTTUtils {

  private[mqtt] class MQTTUtilsPythonHelper {

    def createStream(
        jssc: JavaStreamingContext,
        brokerUrl: String,
        topic: String,
        storageLevel: StorageLevel
      ): JavaDStream[String] = {
      MQTTUtils.createStream(jssc, brokerUrl, topic, storageLevel)
    }
  }
}
Example 11
Source File: ZeroMQStreamSuite.scala From spark1.52 with Apache License 2.0

package org.apache.spark.streaming.zeromq

import akka.actor.SupervisorStrategy
import akka.util.ByteString
import akka.zeromq.Subscribe

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class ZeroMQStreamSuite extends SparkFunSuite {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"
  private val framework: String = this.getClass.getSimpleName

  test("zeromq input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val publishUrl = "abc"
    val subscribe = new Subscribe(null.asInstanceOf[ByteString])
    val bytesToObjects = (bytes: Seq[ByteString]) => null.asInstanceOf[Iterator[String]]

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[String] =
      ZeroMQUtils.createStream(ssc, publishUrl, subscribe, bytesToObjects)
    val test2: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test3: ReceiverInputDStream[String] = ZeroMQUtils.createStream(
      ssc, publishUrl, subscribe, bytesToObjects, StorageLevel.MEMORY_AND_DISK_SER_2,
      SupervisorStrategy.defaultStrategy)

    // TODO: Actually test data receiving
    ssc.stop()
  }
}
Example 12
Source File: FlumePollingStreamSuite.scala From spark1.52 with Apache License 2.0

package org.apache.spark.streaming.flume

import java.net.InetSocketAddress

import scala.collection.JavaConversions._
import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer}
import scala.concurrent.duration._
import scala.language.postfixOps

import com.google.common.base.Charsets.UTF_8
import org.scalatest.BeforeAndAfter
import org.scalatest.concurrent.Eventually._

import org.apache.spark.{Logging, SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.{Seconds, TestOutputStream, StreamingContext}
import org.apache.spark.util.{ManualClock, Utils}

  private def testMultipleTimes(test: () => Unit): Unit = {
    var testPassed = false
    var attempt = 0
    while (!testPassed && attempt < maxAttempts) {
      try {
        test()
        testPassed = true
      } catch {
        case e: Exception if Utils.isBindCollision(e) =>
          logWarning("Exception when running flume polling test: " + e)
          attempt += 1
      }
    }
    assert(testPassed, s"Test failed after $attempt attempts!")
  }

  private def testFlumePolling(): Unit = {
    try {
      val port = utils.startSingleSink()
      writeAndVerify(Seq(port))
      utils.assertChannelsAreEmpty()
    } finally {
      utils.close()
    }
  }

  private def testFlumePollingMultipleHost(): Unit = {
    try {
      val ports = utils.startMultipleSinks()
      writeAndVerify(ports)
      utils.assertChannelsAreEmpty()
    } finally {
      utils.close()
    }
  }

  def writeAndVerify(sinkPorts: Seq[Int]): Unit = {
    // Set up the streaming context and input streams
    val ssc = new StreamingContext(conf, batchDuration)
    val addresses = sinkPorts.map(port => new InetSocketAddress("localhost", port))
    val flumeStream: ReceiverInputDStream[SparkFlumeEvent] =
      FlumeUtils.createPollingStream(ssc, addresses, StorageLevel.MEMORY_AND_DISK,
        utils.eventsPerBatch, 5)
    val outputBuffer = new ArrayBuffer[Seq[SparkFlumeEvent]]
      with SynchronizedBuffer[Seq[SparkFlumeEvent]]
    val outputStream = new TestOutputStream(flumeStream, outputBuffer)
    outputStream.register()

    ssc.start()
    try {
      utils.sendDatAndEnsureAllDataHasBeenReceived()
      val clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
      clock.advance(batchDuration.milliseconds)

      // The eventually is required to ensure that all data in the batch has been processed.
      eventually(timeout(10 seconds), interval(100 milliseconds)) {
        val flattenOutputBuffer = outputBuffer.flatten
        val headers = flattenOutputBuffer.map(_.event.getHeaders.map {
          case kv => (kv._1.toString, kv._2.toString)
        }).map(mapAsJavaMap)
        val bodies = flattenOutputBuffer.map(e => new String(e.event.getBody.array(), UTF_8))
        utils.assertOutput(headers, bodies)
      }
    } finally {
      ssc.stop()
    }
  }
}
Example 13
Source File: TwitterStreamSuite.scala From spark1.52 with Apache License 2.0

package org.apache.spark.streaming.twitter

import org.scalatest.BeforeAndAfter
import twitter4j.Status
import twitter4j.auth.{NullAuthorization, Authorization}

import org.apache.spark.{Logging, SparkFunSuite}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class TwitterStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"
  private val framework: String = this.getClass.getSimpleName

  test("twitter input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val filters = Seq("filter1", "filter2")
    val authorization: Authorization = NullAuthorization.getInstance()

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None)
    val test2: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters)
    val test3: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test4: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization))
    val test5: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization), filters)
    val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream(
      ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2)

    // Note that actually testing the data receiving is hard as authentication keys are
    // necessary for accessing Twitter live stream
    ssc.stop()
  }
}
Example 14
Source File: ImageInputDStream.scala From spark1.52 with Apache License 2.0

package org.apache.spark.examples.streaming

import java.io.InputStream
import java.net.Socket

import org.apache.hadoop.io.BytesWritable
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Logging

class ImageInputDStream(@transient ssc_ : StreamingContext,
                        host: String,
                        port: Int,
                        storageLevel: StorageLevel)
  extends ReceiverInputDStream[BytesWritable](ssc_) with Logging {

  override def getReceiver(): Receiver[BytesWritable] = {
    new ImageRecevier(host, port, storageLevel)
  }
}

class ImageRecevier(host: String, port: Int, storageLevel: StorageLevel)
  extends Receiver[BytesWritable](storageLevel) with Logging {

  override def onStart(): Unit = {
    new Thread("Image Socket") {
      setDaemon(true)

      override def run(): Unit = {
        receive()
      }
    }.start()
  }

  def receive(): Unit = {
    var socket: Socket = null
    var in: InputStream = null
    try {
      log.info("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      log.info("Connected to " + host + ":" + port)
      in = socket.getInputStream
      val buf = new ArrayBuffer[Byte]()
      var bytes = new Array[Byte](1024)
      var len = 0
      while (-1 < len) {
        len = in.read(bytes)
        if (len > 0) {
          buf ++= bytes
        }
      }
      val bw = new BytesWritable(buf.toArray)
      log.error("byte:::::" + bw.getLength)
      store(bw)
      log.info("Stopped receiving")
      restart("Retrying connecting to " + host + ":" + port)
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    } finally {
      if (in != null) {
        in.close()
      }
      if (socket != null) {
        socket.close()
        log.info("Closed socket to " + host + ":" + port)
      }
    }
  }

  override def onStop(): Unit = {
  }
}
Example 15
Source File: MQTTUtils.scala From iolap with Apache License 2.0

package org.apache.spark.streaming.mqtt

import scala.reflect.ClassTag

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext, JavaDStream}
import org.apache.spark.streaming.dstream.{ReceiverInputDStream, DStream}

object MQTTUtils {

  def createStream(
      jssc: JavaStreamingContext,
      brokerUrl: String,
      topic: String,
      storageLevel: StorageLevel
    ): JavaReceiverInputDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, brokerUrl, topic, storageLevel)
  }
}
Example 16
Source File: KinesisInputDStream.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo

private[kinesis] class KinesisInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
}
Example 17
Source File: TwitterStreamSuite.scala From iolap with Apache License 2.0

package org.apache.spark.streaming.twitter

import org.scalatest.BeforeAndAfter
import twitter4j.Status
import twitter4j.auth.{NullAuthorization, Authorization}

import org.apache.spark.{Logging, SparkFunSuite}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class TwitterStreamSuite extends SparkFunSuite with BeforeAndAfter with Logging {

  val batchDuration = Seconds(1)

  private val master: String = "local[2]"
  private val framework: String = this.getClass.getSimpleName

  test("twitter input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    val filters = Seq("filter1", "filter2")
    val authorization: Authorization = NullAuthorization.getInstance()

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None)
    val test2: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters)
    val test3: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test4: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization))
    val test5: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization), filters)
    val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream(
      ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2)

    // Note that actually testing the data receiving is hard as authentication keys are
    // necessary for accessing Twitter live stream
    ssc.stop()
  }
}
Example 18
Source File: KinesisInputDStream.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo

private[kinesis] class KinesisInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
}
Example 19
Source File: TwitterStreamSuite.scala From bahir with Apache License 2.0

package org.apache.spark.streaming.twitter

import java.util.UUID

import scala.collection.mutable

import org.scalatest.BeforeAndAfter
import org.scalatest.concurrent.Eventually
import org.scalatest.time
import org.scalatest.time.Span
import twitter4j.{FilterQuery, Status, TwitterFactory}
import twitter4j.auth.{Authorization, NullAuthorization}

import org.apache.spark.ConditionalSparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class TwitterStreamSuite extends ConditionalSparkFunSuite
    with Eventually with BeforeAndAfter with Logging {

  def shouldRunTest(): Boolean = sys.env.get("ENABLE_TWITTER_TESTS").contains("1")

  var ssc: StreamingContext = _

  before {
    ssc = new StreamingContext("local[2]", this.getClass.getSimpleName, Seconds(1))
  }

  after {
    if (ssc != null) {
      ssc.stop()
    }
  }

  test("twitter input stream") {
    val filters = Seq("filter1", "filter2")
    val query = new FilterQuery().language("fr,es")
    val authorization: Authorization = NullAuthorization.getInstance()

    // tests the API, does not actually test data receiving
    val test1: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None)
    val test2: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters)
    val test3: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, None, filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test4: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, Some(authorization))
    val test5: ReceiverInputDStream[Status] =
      TwitterUtils.createStream(ssc, Some(authorization), filters)
    val test6: ReceiverInputDStream[Status] = TwitterUtils.createStream(
      ssc, Some(authorization), filters, StorageLevel.MEMORY_AND_DISK_SER_2)
    val test7: ReceiverInputDStream[Status] = TwitterUtils.createFilteredStream(
      ssc, Some(authorization), Some(query), StorageLevel.MEMORY_AND_DISK_SER_2)
  }

  testIf("messages received", () => TwitterStreamSuite.this.shouldRunTest()) {
    val userId = TwitterFactory.getSingleton.updateStatus(
      UUID.randomUUID().toString
    ).getUser.getId

    val receiveStream = TwitterUtils.createFilteredStream(
      ssc, None, Some(new FilterQuery().follow(userId))
    )
    @volatile var receivedMessages: mutable.Set[Status] = mutable.Set()
    receiveStream.foreachRDD { rdd =>
      for (element <- rdd.collect()) {
        receivedMessages += element
      }
      receivedMessages
    }
    ssc.start()

    val nbOfMsg = 2
    var publishedMessages: List[String] = List()

    (1 to nbOfMsg).foreach(
      _ => {
        publishedMessages = UUID.randomUUID().toString :: publishedMessages
      }
    )

    eventually(timeout(Span(15, time.Seconds)), interval(Span(1000, time.Millis))) {
      publishedMessages.foreach(
        m => if (!receivedMessages.map(m => m.getText).contains(m.toString)) {
          TwitterFactory.getSingleton.updateStatus(m)
        }
      )
      assert(
        publishedMessages.map(m => m.toString).toSet
          .subsetOf(receivedMessages.map(m => m.getText))
      )
    }
  }
}
Example 20
Source File: PubNubUtils.scala From bahir with Apache License 2.0

package org.apache.spark.streaming.pubnub

import java.util.{Set => JSet}

import collection.JavaConverters._
import com.pubnub.api.PNConfiguration

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream

object PubNubUtils {

  def createStream(
      jssc: JavaStreamingContext,
      configuration: PNConfiguration,
      channels: JSet[String],
      channelGroups: JSet[String],
      timeToken: Option[Long],
      storageLevel: StorageLevel): JavaReceiverInputDStream[SparkPubNubMessage] = {
    createStream(
      jssc.ssc, configuration,
      Seq.empty ++ channels.asScala, Seq.empty ++ channelGroups.asScala,
      timeToken, storageLevel
    )
  }
}
Example 21
Source File: PubNubWordCount.scala From bahir with Apache License 2.0

package org.apache.spark.examples.streaming.pubnub

import com.google.gson.JsonParser
import com.pubnub.api.PNConfiguration
import com.pubnub.api.enums.PNReconnectionPolicy

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Milliseconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.pubnub.{PubNubUtils, SparkPubNubMessage}

object PubNubWordCount {

  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      // scalastyle:off println
      System.err.println(
        """
          |Usage: PubNubWordCount <subscribeKey> <channel>
          |
          |  <subscribeKey>          subscribe key
          |  <channel>               channel
          |  <aggregationPeriodMS>   aggregation period in milliseconds
          |
        """.stripMargin
      )
      // scalastyle:on
      System.exit(1)
    }

    val Seq(subscribeKey, channel, aggregationPeriod) = args.toSeq

    val sparkConf = new SparkConf().setAppName("PubNubWordCount").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Milliseconds(aggregationPeriod.toLong))

    val config = new PNConfiguration
    config.setSubscribeKey(subscribeKey)
    config.setSecure(true)
    config.setReconnectionPolicy(PNReconnectionPolicy.LINEAR)

    val pubNubStream: ReceiverInputDStream[SparkPubNubMessage] = PubNubUtils.createStream(
      ssc, config, Seq(channel), Seq(), None, StorageLevel.MEMORY_AND_DISK_SER_2)

    val wordCounts = pubNubStream
      .flatMap(
        message => new JsonParser().parse(message.getPayload)
          .getAsJsonObject.get("text").getAsString.split("\\s")
      )
      .map(word => (word, 1))
      .reduceByKey(_ + _)

    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 22
Source File: AkkaUtilsSuite.scala From bahir with Apache License 2.0

package org.apache.spark.streaming.akka

import scala.concurrent.duration._

import akka.actor.{Props, SupervisorStrategy}

import org.apache.spark.SparkFunSuite
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

class AkkaUtilsSuite extends SparkFunSuite {

  test("createStream") {
    val ssc: StreamingContext = new StreamingContext("local[2]", "test", Seconds(1000))
    try {
      // tests the API, does not actually test data receiving
      val test1: ReceiverInputDStream[String] = AkkaUtils.createStream(
        ssc, Props[TestActor](), "test")
      val test2: ReceiverInputDStream[String] = AkkaUtils.createStream(
        ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2)
      val test3: ReceiverInputDStream[String] = AkkaUtils.createStream(
        ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2,
        supervisorStrategy = SupervisorStrategy.defaultStrategy)
      val test4: ReceiverInputDStream[String] = AkkaUtils.createStream(
        ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null)
      val test5: ReceiverInputDStream[String] = AkkaUtils.createStream(
        ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null)
      val test6: ReceiverInputDStream[String] = AkkaUtils.createStream(
        ssc, Props[TestActor](), "test", StorageLevel.MEMORY_AND_DISK_SER_2, () => null,
        SupervisorStrategy.defaultStrategy)
    } finally {
      ssc.stop()
    }
  }
}

class TestActor extends ActorReceiver {
  override def receive: Receive = {
    case m: String => store(m)
    case m => store(m, 10.seconds)
  }
}
Example 23
Source File: AMQPInputDStream.scala From streaming-amqp with Apache License 2.0

package io.radanalytics.streaming.amqp

import org.apache.qpid.proton.message.Message
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.amqp.ReliableAMQPReceiver
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

import scala.reflect.ClassTag

class AMQPInputDStream[T: ClassTag](
    ssc: StreamingContext,
    host: String,
    port: Int,
    username: Option[String],
    password: Option[String],
    address: String,
    messageConverter: Message => Option[T],
    useReliableReceiver: Boolean,
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[T](ssc) {

  def getReceiver(): Receiver[T] = {
    if (!useReliableReceiver) {
      new AMQPReceiver(host, port, username, password, address, messageConverter, storageLevel)
    } else {
      new ReliableAMQPReceiver(host, port, username, password, address, messageConverter, storageLevel)
    }
  }
}
Example 24
Source File: KinesisInputDStream.scala From sparkoscope with Apache License 2.0

package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo

private[kinesis] class KinesisInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
}
Example 25
Source File: HttpInputDStream.scala From prosparkstreaming with Apache License 2.0

package org.apress.prospark

import java.util.Timer
import java.util.TimerTask

import scala.reflect.ClassTag

import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.CloseableHttpClient
import org.apache.http.impl.client.HttpClients
import org.apache.http.util.EntityUtils
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaDStream
import org.apache.spark.streaming.api.java.JavaDStream.fromDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

class HttpInputDStream(
    @transient ssc_ : StreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging {

  def getReceiver(): Receiver[String] = {
    new HttpReceiver(storageLevel, url, interval)
  }
}

class HttpReceiver(
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends Receiver[String](storageLevel) with Logging {

  var httpClient: CloseableHttpClient = _
  var trigger: Timer = _

  def onStop() {
    httpClient.close()
    logInfo("Disconnected from Http Server")
  }

  def onStart() {
    httpClient = HttpClients.createDefault()
    trigger = new Timer()
    trigger.scheduleAtFixedRate(new TimerTask {
      def run() = doGet()
    }, 0, interval * 1000)
    logInfo("Http Receiver initiated")
  }

  def doGet() {
    logInfo("Fetching data from Http source")
    val response = httpClient.execute(new HttpGet(url))
    try {
      val content = EntityUtils.toString(response.getEntity())
      store(content)
    } catch {
      case e: Exception => restart("Error! Problems while connecting", e)
    } finally {
      response.close()
    }
  }
}

object HttpUtils {

  def createStream(
      ssc: StreamingContext,
      storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2,
      url: String,
      interval: Long): DStream[String] = {
    new HttpInputDStream(ssc, storageLevel, url, interval)
  }

  def createStream(
      jssc: JavaStreamingContext,
      storageLevel: StorageLevel,
      url: String,
      interval: Long): JavaDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, storageLevel, url, interval)
  }
}
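Because the Scala overload of HttpUtils.createStream above places the defaulted storageLevel parameter before url and interval, callers either pass every argument positionally or use named arguments. The following driver sketch is not part of the project; the object name, URL, and polling interval are placeholders chosen for illustration.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Hypothetical driver wiring for the HttpInputDStream defined above.
object HttpStreamExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("HttpStreamExample").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(10))

    // Named arguments let the defaulted `storageLevel` keep its default value.
    val pages = HttpUtils.createStream(ssc, url = "http://example.com/feed.json", interval = 30L)
    pages.map(_.length).print()

    ssc.start()
    ssc.awaitTermination()
  }
}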
Example 26
Source File: HttpInputDStream.scala From prosparkstreaming with Apache License 2.0

package org.apress.prospark

import java.util.Timer
import java.util.TimerTask

import scala.reflect.ClassTag

import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.CloseableHttpClient
import org.apache.http.impl.client.HttpClients
import org.apache.http.util.EntityUtils
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaDStream
import org.apache.spark.streaming.api.java.JavaDStream.fromDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

class HttpInputDStream(
    @transient ssc_ : StreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging {

  def getReceiver(): Receiver[String] = {
    new HttpReceiver(storageLevel, url, interval)
  }
}

class HttpReceiver(
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends Receiver[String](storageLevel) with Logging {

  var httpClient: CloseableHttpClient = _
  var trigger: Timer = _

  def onStop() {
    httpClient.close()
    logInfo("Disconnected from Http Server")
  }

  def onStart() {
    httpClient = HttpClients.createDefault()
    trigger = new Timer()
    trigger.scheduleAtFixedRate(new TimerTask {
      def run() = doGet()
    }, 0, interval * 1000)
    logInfo("Http Receiver initiated")
  }

  def doGet() {
    logInfo("Fetching data from Http source")
    val response = httpClient.execute(new HttpGet(url))
    try {
      val content = EntityUtils.toString(response.getEntity())
      store(content)
    } catch {
      case e: Exception => restart("Error! Problems while connecting", e)
    } finally {
      response.close()
    }
  }
}

object HttpUtils {

  def createStream(
      ssc: StreamingContext,
      storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2,
      url: String,
      interval: Long): DStream[String] = {
    new HttpInputDStream(ssc, storageLevel, url, interval)
  }

  def createStream(
      jssc: JavaStreamingContext,
      storageLevel: StorageLevel,
      url: String,
      interval: Long): JavaDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, storageLevel, url, interval)
  }
}
Example 27
Source File: HttpInputDStreamAsync.scala From prosparkstreaming with Apache License 2.0

package org.apress.prospark

import scala.reflect.ClassTag

import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaDStream
import org.apache.spark.streaming.api.java.JavaDStream.fromDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

import com.ning.http.client.AsyncCompletionHandler
import com.ning.http.client.AsyncHttpClient
import com.ning.http.client.Response

class HttpInputDStreamAsync(
    @transient ssc_ : StreamingContext,
    storageLevel: StorageLevel,
    url: String) extends ReceiverInputDStream[String](ssc_) with Logging {

  def getReceiver(): Receiver[String] = {
    new HttpReceiverAsync(storageLevel, url)
  }
}

class HttpReceiverAsync(
    storageLevel: StorageLevel,
    url: String) extends Receiver[String](storageLevel) with Logging {

  var asyncHttpClient: AsyncHttpClient = _

  def onStop() {
    asyncHttpClient.close()
    logInfo("Disconnected from Http Server")
  }

  def onStart() {
    asyncHttpClient = new AsyncHttpClient()
    asyncHttpClient.prepareGet(url).execute(new AsyncCompletionHandler[Response]() {

      override def onCompleted(response: Response): Response = {
        store(response.getResponseBody)
        return response
      }

      override def onThrowable(t: Throwable) {
        restart("Error! Problems while connecting", t)
      }
    });
    logInfo("Http Connection initiated")
  }
}

object HttpUtilsAsync {

  def createStream(
      ssc: StreamingContext,
      storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2,
      url: String): DStream[String] = {
    new HttpInputDStreamAsync(ssc, storageLevel, url)
  }

  def createStream(
      jssc: JavaStreamingContext,
      storageLevel: StorageLevel,
      url: String): JavaDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, storageLevel, url)
  }
}
Example 28
Source File: gihyo_6_3_Union.scala From gihyo-spark-book-example with Apache License 2.0

package jp.gihyo.spark.ch06

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils

object gihyo_6_3_Union {

  def main(args: Array[String]) {
    if (args.length != 3) {
      new IllegalArgumentException("Invalid arguments")
      System.exit(1)
    }
    val targetHosts = args(0)
    val consumerGroup = args(1)
    val targetTopics = args(2)

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))

    val KafkaStreams = (1 to 5).map { i =>
      KafkaUtils.createStream(ssc, targetHosts, consumerGroup, Map(targetTopics -> 1))
    }
    run(ssc, KafkaStreams)

    ssc.start
    ssc.awaitTermination
  }

  def run(ssc: StreamingContext, streams: IndexedSeq[InputDStream[(String, String)]]) {
    val unionedStream = ssc.union(streams)
    unionedStream.print
  }
}
Example 29
Source File: RedisInputDStream.scala From spark-redis with BSD 3-Clause "New" or "Revised" License

package com.redislabs.provider.redis.streaming

import com.redislabs.provider.redis.RedisConfig
import org.apache.curator.utils.ThreadUtils
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import redis.clients.jedis._

import scala.reflect.{ClassTag, classTag}
import scala.util.control.NonFatal

      keys.foreach { key =>
        executorPool.submit(new MessageHandler(redisConfig.connectionForKey(key), key))
      }
    } finally {
      executorPool.shutdown()
    }
  }

  def onStop() {
  }

  private class MessageHandler(conn: Jedis, key: String) extends Runnable {
    def run() {
      try {
        while (!isStopped) {
          val response = conn.blpop(2, key)
          if (response == null || response.isEmpty) {
            // no-op
          } else if (classTag[T] == classTag[String]) {
            store(response.get(1).asInstanceOf[T])
          } else if (classTag[T] == classTag[(String, String)]) {
            store((response.get(0), response.get(1)).asInstanceOf[T])
          } else {
            throw new scala.Exception("Unknown Redis Streaming type")
          }
        }
      } catch {
        case NonFatal(e) => restart("Error receiving data", e)
      } finally {
        onStop()
      }
    }
  }
}
Example 30
Source File: SocketTextStream.scala From piflow with BSD 2-Clause "Simplified" License

package cn.piflow.bundle.streaming

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream, SocketReceiver}
import org.apache.spark.streaming.{Seconds, StreamingContext}

class SocketTextStream extends ConfigurableStreamingStop {

  override val authorEmail: String = "[email protected]"
  override val description: String = "Receive text data from socket"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)
  override var batchDuration: Int = _

  var hostname: String = _
  var port: String = _
  //var schema:String=_

  override def setProperties(map: Map[String, Any]): Unit = {
    hostname = MapUtil.get(map, key = "hostname").asInstanceOf[String]
    port = MapUtil.get(map, key = "port").asInstanceOf[String]
    //schema=MapUtil.get(map,key="schema").asInstanceOf[String]
    val timing = MapUtil.get(map, key = "batchDuration")
    batchDuration = if (timing == None) new Integer(1) else timing.asInstanceOf[String].toInt
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val hostname = new PropertyDescriptor().name("hostname").displayName("hostname")
      .description("Hostname to connect to for receiving data").defaultValue("").required(true)
    val port = new PropertyDescriptor().name("port").displayName("port")
      .description("Port to connect to for receiving data").defaultValue("").required(true)
    //val schema = new PropertyDescriptor().name("schema").displayName("schema").description("data schema").defaultValue("").required(true)
    val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration")
      .description("the streaming batch duration").defaultValue("1").required(true)
    descriptor = hostname :: descriptor
    descriptor = port :: descriptor
    //descriptor = schema :: descriptor
    descriptor = batchDuration :: descriptor
    descriptor
  }

  //TODO: change icon
  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/streaming/SocketTextStream.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.StreamingGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()

    val socketDF = spark
      .readStream
      .format("socket")
      .option("host", hostname)
      .option("port", port)
      .load()

    out.write(socketDF)
  }

  override def getDStream(ssc: StreamingContext): DStream[String] = {
    val dstream = ssc.socketTextStream(hostname, Integer.parseInt(port))
    dstream.asInstanceOf[DStream[String]]
  }
}