Python pyspark.streaming.StreamingContext() Examples
The following are 8 code examples of pyspark.streaming.StreamingContext(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module pyspark.streaming.
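All of the examples below follow the same basic lifecycle: build a SparkContext, wrap it in a StreamingContext with a batch interval, declare DStream transformations, then start the context and wait for termination (or stop it). The minimal sketch below illustrates that pattern; the socket source on localhost:9999 and the word-count logic are illustrative assumptions, not taken from any of the projects that follow.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Build a SparkContext and a StreamingContext with a 1-second batch interval.
sc = SparkContext("local[2]", "StreamingContextSketch")
ssc = StreamingContext(sc, 1)

# Hypothetical input source: a TCP socket on localhost:9999.
lines = ssc.socketTextStream("localhost", 9999)
counts = lines.flatMap(lambda line: line.split(" ")) \
              .map(lambda word: (word, 1)) \
              .reduceByKey(lambda a, b: a + b)
counts.pprint()

ssc.start()               # Start receiving and processing data.
ssc.awaitTermination()    # Block until stop() is called or an error occurs.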
Example #1
Source File: app.py From integrations-core with BSD 3-Clause "New" or "Revised" License | 8 votes |
def main():
    # Adapted from https://github.com/apache/spark/tree/master/examples/src/main/python/streaming
    sc = SparkContext(appName='PythonStreamingQueue')
    ssc = StreamingContext(sc, 1)

    # Create the queue through which RDDs can be pushed to
    # a QueueInputDStream
    rddQueue = []
    for _ in range(5):
        rddQueue += [ssc.sparkContext.parallelize([j for j in range(1, 1001)], 10)]

    # Create the QueueInputDStream and use it do some processing
    inputStream = ssc.queueStream(rddQueue)
    mappedStream = inputStream.map(lambda x: (x % 10, 1))
    reducedStream = mappedStream.reduceByKey(lambda a, b: a + b)
    reducedStream.pprint()

    ssc.start()
    time.sleep(6)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
Example #2
Source File: bluecoat.py From incubator-spot with Apache License 2.0 | 6 votes |
def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    """
    Parse and save bluecoat logs.

    :param zk: Apache ZooKeeper quorum
    :param topic: Apache Kafka topic (application name)
    :param db: Apache Hive database to save into
    :param db_table: table of `db` to save into
    :param num_of_workers: number of Apache Kafka workers
    :param batch_size: batch size for Apache Spark streaming context
    """
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks},
                                        keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1])\
        .flatMap(lambda row: row.split("\n"))\
        .filter(lambda row: rex_date.match(row))\
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " "))\
        .map(lambda row: split_log_entry(row))\
        .map(lambda row: proxy_parser(row))

    saved_data = proxy_data.foreachRDD(lambda row: save_data(row, sqc, db, db_table, topic))

    ssc.start()
    ssc.awaitTermination()
Example #3
Source File: streaming_context.py From monasca-analytics with Apache License 2.0 | 6 votes |
def create_streaming_context(spark_context, config):
    """
    Create a streaming context with a custom Streaming Listener
    that will log every event.

    :param spark_context: Spark context
    :type spark_context: pyspark.SparkContext
    :param config: dict
    :return: Returns a new streaming context from the given context.
    :rtype: pyspark.streaming.StreamingContext
    """
    ssc = streaming.StreamingContext(spark_context, config[
        "spark_config"]["streaming"]["batch_interval"])
    ssc.addStreamingListener(DriverStreamingListener)
    directory = os_path.expanduser("~/checkpointing")
    logger.info("Checkpointing to `{}`".format(directory))
    # Commented out to fix a crash occurring when
    # phase 1 is used. The reason of the crash is still unclear
    # but Spark complains about the SSC being transferred
    # to workers.
    # ssc.checkpoint(directory)
    return ssc
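When checkpointing is enabled rather than commented out as above, the usual PySpark pattern is StreamingContext.getOrCreate, which rebuilds the context from the checkpoint directory on restart. The sketch below is a generic illustration of that API, not monasca-analytics code; the checkpoint path, application name, and 30-second batch interval are placeholder assumptions.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

CHECKPOINT_DIR = "/tmp/checkpointing"   # hypothetical checkpoint path


def create_context():
    # Factory used only when no checkpoint exists yet.
    sc = SparkContext(appName="CheckpointedApp")
    ssc = StreamingContext(sc, 30)
    ssc.checkpoint(CHECKPOINT_DIR)
    # ... define DStream transformations here ...
    return ssc


# Rebuild the context from the checkpoint if one exists, otherwise create it.
ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, create_context)
ssc.start()
ssc.awaitTermination()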
Example #4
Source File: listener.py From incubator-spot with Apache License 2.0 | 5 votes |
def streaming_listener(**kwargs):
    '''
        Initialize the Spark job.
    '''
    Util.get_logger('SPOT.INGEST', kwargs.pop('log_level'))

    logger = logging.getLogger('SPOT.INGEST.COMMON.LISTENER')
    logger.info('Initializing Spark Streaming Listener...')

    dbtable = '{0}.{1}'.format(kwargs.pop('database'), kwargs['type'])
    topic = kwargs.pop('topic')

    sc = SparkContext(appName=kwargs['app_name'] or topic)
    logger.info('Connect to Spark Cluster as job "{0}" and broadcast variables on it.'
        .format(kwargs.pop('app_name') or topic))

    ssc = StreamingContext(sc, batchDuration=kwargs['batch_duration'])
    logger.info('Streaming data will be divided into batches of {0} seconds.'
        .format(kwargs.pop('batch_duration')))

    hsc = HiveContext(sc)
    logger.info('Read Hive\'s configuration to integrate with data stored in it.')

    import pipelines
    module = getattr(pipelines, kwargs.pop('type'))

    stream = module.StreamPipeline(ssc, kwargs.pop('zkquorum'),
        kwargs.pop('group_id') or topic, {topic: int(kwargs.pop('partitions'))})

    schema = stream.schema
    segtype = stream.segtype

    stream.dstream\
        .map(lambda x: module.StreamPipeline.parse(x))\
        .filter(lambda x: bool(x))\
        .foreachRDD(lambda x: store(x, hsc, dbtable, topic, schema, segtype))

    ssc.start()
    logger.info('Start the execution of the streams.')
    ssc.awaitTermination()
Example #5
Source File: ozy_streaming.py From ozymandias with MIT License | 5 votes |
def main():
    """Run Spark Streaming"""
    conf = SparkConf()
    sc = SparkContext(appName='Ozymandias', conf=conf)
    sc.setLogLevel('WARN')

    with open(ROOT + 'channels.json', 'r') as f:
        channels = json.load(f)
    topics = [t['topic'] for t in channels['channels']]

    n_secs = 0.5
    ssc = StreamingContext(sc, n_secs)

    stream = KafkaUtils.createDirectStream(ssc, topics, {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'ozy-group',
        'fetch.message.max.bytes': '15728640',
        'auto.offset.reset': 'largest'})

    stream.map(
        deserializer
    ).map(
        image_detector
    ).foreachRDD(
        message_sender)

    ssc.start()
    ssc.awaitTermination()
Example #6
Source File: tests.py From LearningApacheSpark with MIT License | 4 votes |
def setUp(self):
    self.sc = SparkContext('local[4]', "MLlib tests")
    self.ssc = StreamingContext(self.sc, 1.0)
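The matching tearDown in such a test class typically stops the streaming context and then the SparkContext. The sketch below shows that common pattern; it is an assumption, not the actual tearDown from LearningApacheSpark.

def tearDown(self):
    # False means "do not stop the underlying SparkContext here";
    # it is stopped explicitly on the next line.
    self.ssc.stop(False)
    self.sc.stop()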
Example #7
Source File: signals.py From pyspark-examples with GNU General Public License v3.0 | 4 votes |
def isException(machine, signal):
    # assumption: should be parameterized or read dynamically from a source
    exceptions = [(11, 19)]
    return (int(machine), signal) in exceptions

# Create a local StreamingContext with two working threads and a batch interval of 1 second
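The excerpt ends at that comment, so the code it describes is not included. Based on the standard Spark Streaming quick-start pattern the comment refers to, the continuation would look roughly like the sketch below; the application name is a placeholder assumption.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# "local[2]" gives two worker threads; 1 is the batch interval in seconds.
sc = SparkContext("local[2]", "SignalsExample")  # app name is a placeholder
ssc = StreamingContext(sc, 1)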
Example #8
Source File: conftest.py From maggy with Apache License 2.0 | 4 votes |
def streaming_context(sc):
    return StreamingContext(sc, 1)
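In a pytest conftest.py, a function like this is normally registered as a fixture and receives an existing SparkContext fixture. The sketch below shows how such a fixture might be declared and consumed; the decorator, the `sc` fixture, and the test are assumptions for illustration, not code from the maggy repository.

import pytest
from pyspark.streaming import StreamingContext

@pytest.fixture
def streaming_context(sc):
    # `sc` is assumed to be a SparkContext fixture defined elsewhere in conftest.py.
    return StreamingContext(sc, 1)

def test_streaming_context_fixture(streaming_context):
    # The fixture hands the test a ready-to-use StreamingContext
    # with a 1-second batch interval.
    assert isinstance(streaming_context, StreamingContext)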