Python pyspark.streaming.kafka.KafkaUtils.createDirectStream() Examples
The following are 7 code examples of pyspark.streaming.kafka.KafkaUtils.createDirectStream().
Each example is taken from an open-source project; the source file and license are noted above it.
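Note that the pyspark.streaming.kafka module belongs to the Kafka 0.8 connector, which was deprecated in Spark 2.3 and removed in Spark 3.0, so these examples target Spark 2.x. Before the project-specific examples below, here is a minimal, self-contained sketch of the API (the topic name and broker address are hypothetical; assumes the spark-streaming-kafka-0-8 package is on the classpath):

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="DirectStreamSketch")
ssc = StreamingContext(sc, 2)  # 2-second batches

# createDirectStream(ssc, topics, kafkaParams) returns a DStream of
# (key, value) message tuples, decoded as UTF-8 strings by default.
stream = KafkaUtils.createDirectStream(
    ssc, ["my-topic"],  # hypothetical topic
    {"metadata.broker.list": "localhost:9092"})

stream.map(lambda kv: kv[1]).count().pprint()

ssc.start()
ssc.awaitTermination()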
Example #1
Source File: tests.py From LearningApacheSpark with MIT License
def test_kafka_direct_stream_from_offset(self):
    """Test the Python direct Kafka stream API with start offset specified."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    fromOffsets = {TopicAndPartition(topic, 0): long(0)}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams, fromOffsets)
    self._validateStreamResult(sendData, stream)
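Outside of a test harness, the fromOffsets argument is typically rebuilt from offsets the application persisted itself, so a restarted job resumes where it left off. A minimal sketch, reusing the ssc from the introductory sketch above (load_offsets and its return layout are hypothetical):

from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition

def load_offsets(topic):
    # Hypothetical helper: returns {partition: next_offset_to_read}
    # from wherever the job persisted them (a database, ZooKeeper, ...).
    return {0: 42}

topic = "my-topic"  # hypothetical topic
fromOffsets = {TopicAndPartition(topic, p): long(o)  # long() as in the Python 2 tests
               for p, o in load_offsets(topic).items()}
stream = KafkaUtils.createDirectStream(
    ssc, [topic],
    {"metadata.broker.list": "localhost:9092"},
    fromOffsets)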
Example #2
Source File: tests.py From LearningApacheSpark with MIT License
def test_kafka_direct_stream_foreach_get_offsetRanges(self):
    """Test the Python direct Kafka stream foreachRDD get offsetRanges."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                   "auto.offset.reset": "smallest"}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)

    offsetRanges = []

    def getOffsetRanges(_, rdd):
        for o in rdd.offsetRanges():
            offsetRanges.append(o)

    stream.foreachRDD(getOffsetRanges)
    self.ssc.start()
    self.wait_for(offsetRanges, 1)
    self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])
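The same foreachRDD pattern is what applications use to persist offsets after each batch, the write-side counterpart of the fromOffsets sketch above. A sketch with a hypothetical save_offset helper, assuming a stream created as in the example:

def save_offset(topic, partition, until_offset):
    # Hypothetical: record the partition's high-water mark in your own store.
    pass

def commit_offsets(_, rdd):
    for o in rdd.offsetRanges():
        save_offset(o.topic, o.partition, o.untilOffset)

stream.foreachRDD(commit_offsets)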
Example #3
Source File: tests.py From LearningApacheSpark with MIT License
def test_kafka_direct_stream(self):
    """Test the Python direct Kafka stream API."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                   "auto.offset.reset": "smallest"}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)
    self._validateStreamResult(sendData, stream)
Example #4
Source File: tests.py From LearningApacheSpark with MIT License
def test_kafka_direct_stream_transform_get_offsetRanges(self):
    """Test the Python direct Kafka stream transform get offsetRanges."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                   "auto.offset.reset": "smallest"}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)

    offsetRanges = []

    def transformWithOffsetRanges(rdd):
        for o in rdd.offsetRanges():
            offsetRanges.append(o)
        return rdd

    # Test that mixing KafkaTransformedDStream and TransformedDStream works;
    # only the TransformedDStreams can be folded together.
    stream.transform(transformWithOffsetRanges).map(lambda kv: kv[1]).count().pprint()
    self.ssc.start()
    self.wait_for(offsetRanges, 1)
    self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])
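The caveat in that comment matters in practice: offsetRanges() only exists on the KafkaRDD produced directly by the stream, so a transform that reads offsets must come before any other transformation. A sketch of the two orderings (reusing a hypothetical direct stream):

def with_offsets(rdd):
    for o in rdd.offsetRanges():
        print(o.topic, o.partition, o.fromOffset, o.untilOffset)
    return rdd

# Works: offsets are read before any other transformation.
stream.transform(with_offsets).map(lambda kv: kv[1]).pprint()

# Would fail: after map() the RDD no longer has offsetRanges().
# stream.map(lambda kv: kv[1]).transform(with_offsets).pprint()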
Example #5
Source File: ozy_streaming.py From ozymandias with MIT License
def main():
    """Run Spark Streaming"""
    conf = SparkConf()
    sc = SparkContext(appName='Ozymandias', conf=conf)
    sc.setLogLevel('WARN')

    with open(ROOT + 'channels.json', 'r') as f:
        channels = json.load(f)
    topics = [t['topic'] for t in channels['channels']]

    n_secs = 0.5
    ssc = StreamingContext(sc, n_secs)

    stream = KafkaUtils.createDirectStream(ssc, topics, {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'ozy-group',
        'fetch.message.max.bytes': '15728640',
        'auto.offset.reset': 'largest'})

    # deserializer, image_detector, and message_sender are helpers
    # defined elsewhere in the project.
    stream.map(
        deserializer
    ).map(
        image_detector
    ).foreachRDD(
        message_sender)

    ssc.start()
    ssc.awaitTermination()
Example #6
Source File: process.py From kafka-compose with MIT License
def create_context():
    spark = get_session(SPARK_CONF)
    ssc = StreamingContext(spark.sparkContext, BATCH_DURATION)
    ssc.checkpoint(CHECKPOINT)

    # Start offsets from the beginning of each topic.
    # This won't take effect if we are restoring from a checkpoint.
    offsets = {TopicAndPartition(topic, 0): 0 for topic in TOPICS}
    stream = KafkaUtils.createDirectStream(ssc, TOPICS, KAFKA_PARAMS, offsets)

    main(stream)
    return ssc
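A create_context function like this is meant to be passed to StreamingContext.getOrCreate, which only calls the factory on a cold start; after a failure it restores the context, including Kafka offsets, from the checkpoint (which is why the explicit offsets dict above is ignored on recovery). A sketch of the matching driver entry point, reusing the example's CHECKPOINT constant:

if __name__ == '__main__':
    # Cold start: builds a fresh context via create_context().
    # Restart: recovers the context from the checkpoint directory.
    ssc = StreamingContext.getOrCreate(CHECKPOINT, create_context)
    ssc.start()
    ssc.awaitTermination()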
Example #7
Source File: tests.py From LearningApacheSpark with MIT License
def test_kafka_direct_stream_transform_with_checkpoint(self):
    """Test the Python direct Kafka stream transform with checkpoint correctly recovered."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                   "auto.offset.reset": "smallest"}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    offsetRanges = []

    def transformWithOffsetRanges(rdd):
        for o in rdd.offsetRanges():
            offsetRanges.append(o)
        return rdd

    self.ssc.stop(False)
    self.ssc = None
    tmpdir = "checkpoint-test-%d" % random.randint(0, 10000)

    def setup():
        ssc = StreamingContext(self.sc, 0.5)
        ssc.checkpoint(tmpdir)
        stream = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams)
        stream.transform(transformWithOffsetRanges).count().pprint()
        return ssc

    try:
        ssc1 = StreamingContext.getOrCreate(tmpdir, setup)
        ssc1.start()
        self.wait_for(offsetRanges, 1)
        self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])

        # To make sure some checkpoint is written
        time.sleep(3)
        ssc1.stop(False)
        ssc1 = None

        # Restart again to make sure the checkpoint is recovered correctly
        ssc2 = StreamingContext.getOrCreate(tmpdir, setup)
        ssc2.start()
        ssc2.awaitTermination(3)
        ssc2.stop(stopSparkContext=False, stopGraceFully=True)
        ssc2 = None
    finally:
        shutil.rmtree(tmpdir)