Python pyspark.streaming.StreamingContext() Examples
The following are 8 code examples of pyspark.streaming.StreamingContext(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module pyspark.streaming.
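All of the examples below follow the same basic lifecycle: build a SparkContext, wrap it in a StreamingContext with a batch interval, declare DStream transformations, then start the context and wait for termination (or stop it). The minimal sketch below illustrates that pattern; the socket source on localhost:9999 and the word-count logic are illustrative assumptions, not taken from any of the projects that follow.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Build a SparkContext and a StreamingContext with a 1-second batch interval.
sc = SparkContext("local[2]", "StreamingContextSketch")
ssc = StreamingContext(sc, 1)

# Hypothetical input source: a TCP socket on localhost:9999.
lines = ssc.socketTextStream("localhost", 9999)
counts = lines.flatMap(lambda line: line.split(" ")) \
              .map(lambda word: (word, 1)) \
              .reduceByKey(lambda a, b: a + b)
counts.pprint()

ssc.start()               # Start receiving and processing data.
ssc.awaitTermination()    # Block until stop() is called or an error occurs.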
Example #1
Source File: app.py From integrations-core with BSD 3-Clause "New" or "Revised" License | 8 votes |
def main():
    # Adapted from https://github.com/apache/spark/tree/master/examples/src/main/python/streaming
    sc = SparkContext(appName='PythonStreamingQueue')
    ssc = StreamingContext(sc, 1)

    # Create the queue through which RDDs can be pushed to
    # a QueueInputDStream
    rddQueue = []
    for _ in range(5):
        rddQueue += [ssc.sparkContext.parallelize([j for j in range(1, 1001)], 10)]

    # Create the QueueInputDStream and use it do some processing
    inputStream = ssc.queueStream(rddQueue)
    mappedStream = inputStream.map(lambda x: (x % 10, 1))
    reducedStream = mappedStream.reduceByKey(lambda a, b: a + b)
    reducedStream.pprint()

    ssc.start()
    time.sleep(6)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
Example #2
Source File: bluecoat.py From incubator-spot with Apache License 2.0 | 6 votes |
def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    """
    Parse and save bluecoat logs.

    :param zk: Apache ZooKeeper quorum
    :param topic: Apache Kafka topic (application name)
    :param db: Apache Hive database to save into
    :param db_table: table of `db` to save into
    :param num_of_workers: number of Apache Kafka workers
    :param batch_size: batch size for Apache Spark streaming context
    """
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks},
                                        keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1])\
        .flatMap(lambda row: row.split("\n"))\
        .filter(lambda row: rex_date.match(row))\
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " "))\
        .map(lambda row: split_log_entry(row))\
        .map(lambda row: proxy_parser(row))

    saved_data = proxy_data.foreachRDD(lambda row: save_data(row, sqc, db, db_table, topic))

    ssc.start()
    ssc.awaitTermination()
Example #3
Source File: streaming_context.py From monasca-analytics with Apache License 2.0 | 6 votes |
def create_streaming_context(spark_context, config):
    """
    Create a streaming context with a custom Streaming Listener
    that will log every event.

    :param spark_context: Spark context
    :type spark_context: pyspark.SparkContext
    :param config: dict
    :return: Returns a new streaming context from the given context.
    :rtype: pyspark.streaming.StreamingContext
    """
    ssc = streaming.StreamingContext(spark_context, config[
        "spark_config"]["streaming"]["batch_interval"])
    ssc.addStreamingListener(DriverStreamingListener)
    directory = os_path.expanduser("~/checkpointing")
    logger.info("Checkpointing to `{}`".format(directory))
    # Commented out to fix a crash occurring when
    # phase 1 is used. The reason of the crash is still unclear
    # but Spark complains about the SSC being transferred
    # to workers.
    # ssc.checkpoint(directory)
    return ssc
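When checkpointing is enabled rather than commented out as above, the usual PySpark pattern is StreamingContext.getOrCreate, which rebuilds the context from the checkpoint directory on restart. The sketch below is a generic illustration of that API, not monasca-analytics code; the checkpoint path, application name, and 30-second batch interval are placeholder assumptions.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

CHECKPOINT_DIR = "/tmp/checkpointing"   # hypothetical checkpoint path


def create_context():
    # Factory used only when no checkpoint exists yet.
    sc = SparkContext(appName="CheckpointedApp")
    ssc = StreamingContext(sc, 30)
    ssc.checkpoint(CHECKPOINT_DIR)
    # ... define DStream transformations here ...
    return ssc


# Rebuild the context from the checkpoint if one exists, otherwise create it.
ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, create_context)
ssc.start()
ssc.awaitTermination()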
Example #4
Source File: listener.py From incubator-spot with Apache License 2.0 | 5 votes |
def streaming_listener(**kwargs):
    '''
        Initialize the Spark job.
    '''
    Util.get_logger('SPOT.INGEST', kwargs.pop('log_level'))

    logger = logging.getLogger('SPOT.INGEST.COMMON.LISTENER')
    logger.info('Initializing Spark Streaming Listener...')

    dbtable = '{0}.{1}'.format(kwargs.pop('database'), kwargs['type'])
    topic = kwargs.pop('topic')

    sc = SparkContext(appName=kwargs['app_name'] or topic)
    logger.info('Connect to Spark Cluster as job "{0}" and broadcast variables on it.'
        .format(kwargs.pop('app_name') or topic))

    ssc = StreamingContext(sc, batchDuration=kwargs['batch_duration'])
    logger.info('Streaming data will be divided into batches of {0} seconds.'
        .format(kwargs.pop('batch_duration')))

    hsc = HiveContext(sc)
    logger.info('Read Hive\'s configuration to integrate with data stored in it.')

    import pipelines
    module = getattr(pipelines, kwargs.pop('type'))

    stream = module.StreamPipeline(ssc, kwargs.pop('zkquorum'),
        kwargs.pop('group_id') or topic, {topic: int(kwargs.pop('partitions'))})

    schema = stream.schema
    segtype = stream.segtype

    stream.dstream\
        .map(lambda x: module.StreamPipeline.parse(x))\
        .filter(lambda x: bool(x))\
        .foreachRDD(lambda x: store(x, hsc, dbtable, topic, schema, segtype))

    ssc.start()
    logger.info('Start the execution of the streams.')
    ssc.awaitTermination()
Example #5
Source File: ozy_streaming.py From ozymandias with MIT License | 5 votes |
def main():
    """Run Spark Streaming"""
    conf = SparkConf()
    sc = SparkContext(appName='Ozymandias', conf=conf)
    sc.setLogLevel('WARN')

    with open(ROOT + 'channels.json', 'r') as f:
        channels = json.load(f)
    topics = [t['topic'] for t in channels['channels']]

    n_secs = 0.5
    ssc = StreamingContext(sc, n_secs)

    stream = KafkaUtils.createDirectStream(ssc, topics, {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'ozy-group',
        'fetch.message.max.bytes': '15728640',
        'auto.offset.reset': 'largest'})

    stream.map(
        deserializer
    ).map(
        image_detector
    ).foreachRDD(
        message_sender)

    ssc.start()
    ssc.awaitTermination()
Example #6
Source File: tests.py From LearningApacheSpark with MIT License | 4 votes |
def setUp(self):
    self.sc = SparkContext('local[4]', "MLlib tests")
    self.ssc = StreamingContext(self.sc, 1.0)
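The matching tearDown in such a test class typically stops the streaming context and then the SparkContext. The sketch below shows that common pattern; it is an assumption, not the actual tearDown from LearningApacheSpark.

def tearDown(self):
    # False means "do not stop the underlying SparkContext here";
    # it is stopped explicitly on the next line.
    self.ssc.stop(False)
    self.sc.stop()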
Example #7
Source File: signals.py From pyspark-examples with GNU General Public License v3.0 | 4 votes |
def isException(machine, signal):
    # assumption: should be parameterized or read dynamically from a source
    exceptions = [(11, 19)]
    return (int(machine), signal) in exceptions

# Create a local StreamingContext with two working threads and a batch interval of 1 second
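The excerpt ends at that comment, so the code it describes is not included. Based on the standard Spark Streaming quick-start pattern the comment refers to, the continuation would look roughly like the sketch below; the application name is a placeholder assumption.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# "local[2]" gives two worker threads; 1 is the batch interval in seconds.
sc = SparkContext("local[2]", "SignalsExample")  # app name is a placeholder
ssc = StreamingContext(sc, 1)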
Example #8
Source File: conftest.py From maggy with Apache License 2.0 | 4 votes |
def streaming_context(sc):
    return StreamingContext(sc, 1)
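In a pytest conftest.py, a function like this is normally registered as a fixture and receives an existing SparkContext fixture. The sketch below shows how such a fixture might be declared and consumed; the decorator, the `sc` fixture, and the test are assumptions for illustration, not code from the maggy repository.

import pytest
from pyspark.streaming import StreamingContext

@pytest.fixture
def streaming_context(sc):
    # `sc` is assumed to be a SparkContext fixture defined elsewhere in conftest.py.
    return StreamingContext(sc, 1)

def test_streaming_context_fixture(streaming_context):
    # The fixture hands the test a ready-to-use StreamingContext
    # with a 1-second batch interval.
    assert isinstance(streaming_context, StreamingContext)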