Python pyspark.streaming.kafka.KafkaUtils.createStream() Examples
The following are 8 code examples of pyspark.streaming.kafka.KafkaUtils.createStream(), drawn from open-source projects; the project and source file for each example are noted above it. You may also want to check out the other functions and classes available in the pyspark.streaming.kafka module.
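For orientation, here is a minimal, self-contained sketch of how createStream() is typically called. It is not taken from any of the projects below; the ZooKeeper address localhost:2181, the consumer group name, and the topic name "test" are placeholder assumptions, and running it also requires the matching spark-streaming-kafka-0-8 package on the classpath.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# Placeholder application name and a 5-second batch interval.
sc = SparkContext(appName="KafkaCreateStreamSketch")
ssc = StreamingContext(sc, 5)

# createStream(ssc, zkQuorum, groupId, topics) returns a DStream of
# (key, value) pairs; the topics dict maps each topic name to the number
# of consumer threads to use for it.
stream = KafkaUtils.createStream(ssc, "localhost:2181",
                                 "example-consumer-group", {"test": 1})
stream.map(lambda kv: kv[1]).pprint()

ssc.start()
ssc.awaitTermination()

The examples below show the same call with custom key/value decoders, explicit Kafka parameters, and downstream processing.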
Example #1
Source File: bluecoat.py From incubator-spot with Apache License 2.0
def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    """
    Parse and save bluecoat logs.

    :param zk: Apache ZooKeeper quorum
    :param topic: Apache Kafka topic (application name)
    :param db: Apache Hive database to save into
    :param db_table: table of `db` to save into
    :param num_of_workers: number of Apache Kafka workers
    :param batch_size: batch size for Apache Spark streaming context
    """
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks},
                                        keyDecoder=spot_decoder,
                                        valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1]) \
        .flatMap(lambda row: row.split("\n")) \
        .filter(lambda row: rex_date.match(row)) \
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " ")) \
        .map(lambda row: split_log_entry(row)) \
        .map(lambda row: proxy_parser(row))

    saved_data = proxy_data.foreachRDD(lambda row: save_data(row, sqc, db, db_table, topic))

    ssc.start()
    ssc.awaitTermination()
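The spot_decoder function passed as keyDecoder and valueDecoder above is defined elsewhere in incubator-spot and is not shown on this page. As a rough illustration only (an assumption, not the project's actual implementation), a decoder is simply a callable that createStream applies to each raw Kafka key or value:

def example_decoder(raw_bytes):
    # Hypothetical decoder: tolerate missing keys/values and fall back to
    # the raw input if UTF-8 decoding fails.
    if raw_bytes is None:
        return None
    try:
        return raw_bytes.decode("utf-8")
    except (AttributeError, UnicodeDecodeError):
        return raw_bytes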
Example #2
Source File: streaming.py From incubator-spot with Apache License 2.0
def __init__(self, ssc, zkQuorum, groupId, topics):
    from common.serializer import deserialize
    from pyspark.streaming.kafka import KafkaUtils

    self.__dstream = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics,
                                             keyDecoder=lambda x: x,
                                             valueDecoder=deserialize)
Example #3
Source File: streaming.py From incubator-spot with Apache License 2.0
def __init__(self, ssc, zkQuorum, groupId, topics):
    from common.serializer import deserialize
    from pyspark.streaming.kafka import KafkaUtils

    self.__dstream = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics,
                                             keyDecoder=lambda x: x,
                                             valueDecoder=deserialize)
Example #4
Source File: streaming.py From incubator-spot with Apache License 2.0
def __init__(self, ssc, zkQuorum, groupId, topics):
    from common.serializer import deserialize
    from pyspark.streaming.kafka import KafkaUtils

    self.__dstream = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics,
                                             keyDecoder=lambda x: x,
                                             valueDecoder=deserialize)
Example #5
Source File: tests.py From LearningApacheSpark with MIT License
def test_kafka_stream(self):
    """Test the Python Kafka stream API."""
    topic = self._randomTopic()
    sendData = {"a": 3, "b": 5, "c": 10}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(),
                                     "test-streaming-consumer", {topic: 1},
                                     {"auto.offset.reset": "smallest"})
    self._validateStreamResult(sendData, stream)
Example #6
Source File: tests.py From LearningApacheSpark with MIT License
def _startContext(self, n, compressed):
    # Start the StreamingContext and also collect the result
    dstream = FlumeUtils.createStream(self.ssc, "localhost", self._utils.getTestPort(),
                                      enableDecompression=compressed)
    result = []

    def get_output(_, rdd):
        for event in rdd.collect():
            if len(result) < n:
                result.append(event)

    dstream.foreachRDD(get_output)
    self.ssc.start()
    return result
Example #7
Source File: tests.py From LearningApacheSpark with MIT License
def test_kinesis_stream_api(self):
    # Don't start the StreamingContext because we cannot test it in Jenkins
    kinesisStream1 = KinesisUtils.createStream(
        self.ssc, "myAppNam", "mySparkStream",
        "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
        InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2)
    kinesisStream2 = KinesisUtils.createStream(
        self.ssc, "myAppNam", "mySparkStream",
        "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
        InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2,
        "awsAccessKey", "awsSecretKey")
Example #8
Source File: tests.py From LearningApacheSpark with MIT License
def test_kinesis_stream(self):
    if not are_kinesis_tests_enabled:
        sys.stderr.write(
            "Skipped test_kinesis_stream (enable by setting environment variable %s=1)"
            % kinesis_test_environ_var)
        return

    import random
    kinesisAppName = ("KinesisStreamTests-%d" % abs(random.randint(0, 10000000)))
    kinesisTestUtils = self.ssc._jvm.org.apache.spark.streaming.kinesis.KinesisTestUtils(2)
    try:
        kinesisTestUtils.createStream()
        aWSCredentials = kinesisTestUtils.getAWSCredentials()
        stream = KinesisUtils.createStream(
            self.ssc, kinesisAppName, kinesisTestUtils.streamName(),
            kinesisTestUtils.endpointUrl(), kinesisTestUtils.regionName(),
            InitialPositionInStream.LATEST, 10, StorageLevel.MEMORY_ONLY,
            aWSCredentials.getAWSAccessKeyId(), aWSCredentials.getAWSSecretKey())

        outputBuffer = []

        def get_output(_, rdd):
            for e in rdd.collect():
                outputBuffer.append(e)

        stream.foreachRDD(get_output)
        self.ssc.start()

        testData = [i for i in range(1, 11)]
        expectedOutput = set([str(i) for i in testData])
        start_time = time.time()
        while time.time() - start_time < 120:
            kinesisTestUtils.pushData(testData)
            if expectedOutput == set(outputBuffer):
                break
            time.sleep(10)
        self.assertEqual(expectedOutput, set(outputBuffer))
    except:
        import traceback
        traceback.print_exc()
        raise
    finally:
        self.ssc.stop(False)
        kinesisTestUtils.deleteStream()
        kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)

# Search jar in the project dir using the jar name_prefix for both sbt build and maven build
# because the artifact jars are in different directories.