Python pyspark.SparkContext() Examples
The following are 30 code examples of pyspark.SparkContext(), collected from open-source projects. Each example notes its source file, originating project, and license, so you can follow it back to the original code. You may also want to check out all available functions and classes of the pyspark module.
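Before working through the project examples below, it may help to see the basic lifecycle in isolation. The following is a minimal sketch (not taken from any of the projects below) that builds a SparkConf, creates a SparkContext, runs a trivial RDD job, and stops the context:

from pyspark import SparkConf, SparkContext

# Configure and create a local SparkContext (two worker threads).
conf = SparkConf().setAppName("minimal-example").setMaster("local[2]")
sc = SparkContext(conf=conf)

# Run a trivial job: parallelize a range into 4 partitions and sum it.
rdd = sc.parallelize(range(100), numSlices=4)
print(rdd.sum())  # 4950

# Always stop the context so the JVM backend shuts down cleanly.
sc.stop()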
Example #1
Source File: app.py From integrations-core with BSD 3-Clause "New" or "Revised" License | 8 votes |
def main():
    # Adapted from https://github.com/apache/spark/tree/master/examples/src/main/python/streaming
    sc = SparkContext(appName='PythonStreamingQueue')
    ssc = StreamingContext(sc, 1)

    # Create the queue through which RDDs can be pushed to
    # a QueueInputDStream
    rddQueue = []
    for _ in range(5):
        rddQueue += [ssc.sparkContext.parallelize([j for j in range(1, 1001)], 10)]

    # Create the QueueInputDStream and use it to do some processing
    inputStream = ssc.queueStream(rddQueue)
    mappedStream = inputStream.map(lambda x: (x % 10, 1))
    reducedStream = mappedStream.reduceByKey(lambda a, b: a + b)
    reducedStream.pprint()

    ssc.start()
    time.sleep(6)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
Example #2
Source File: spark_process.py From dispel4py with Apache License 2.0 | 7 votes |
def run():
    from pyspark import SparkContext, SparkConf
    conf = SparkConf()
    conf.setAppName('dispel4py')
    conf.set("spark.storage.memoryFraction", "0.5")
    sc = SparkContext(conf=conf)

    from dispel4py.new import processor
    from dispel4py.utils import load_graph

    args = parse_args()

    graph = load_graph(args.module, args.attr)
    if graph is None:
        return
    graph.flatten()

    inputs = processor.create_inputs(args, graph)

    process(sc, graph, inputs=inputs, args=args)
Example #3
Source File: bluecoat.py From incubator-spot with Apache License 2.0 | 6 votes |
def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    """
    Parse and save bluecoat logs.

    :param zk: Apache ZooKeeper quorum
    :param topic: Apache Kafka topic (application name)
    :param db: Apache Hive database to save into
    :param db_table: table of `db` to save into
    :param num_of_workers: number of Apache Kafka workers
    :param batch_size: batch size for Apache Spark streaming context
    """
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks},
                                        keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1])\
        .flatMap(lambda row: row.split("\n"))\
        .filter(lambda row: rex_date.match(row))\
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " "))\
        .map(lambda row: split_log_entry(row))\
        .map(lambda row: proxy_parser(row))

    saved_data = proxy_data.foreachRDD(lambda row: save_data(row, sqc, db, db_table, topic))
    ssc.start()
    ssc.awaitTermination()
Example #4
Source File: common.py From LearningApacheSpark with MIT License | 6 votes |
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(data)
    return obj
Example #5
Source File: cassandra_example.py From pyspark-cassandra with Apache License 2.0 | 6 votes |
def main():
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: example <keyspace_name> <column_family_name>"
        sys.exit(-1)

    keyspace_name = sys.argv[1]
    column_family_name = sys.argv[2]

    # Valid config options here https://github.com/datastax/spark-cassandra-connector/blob/master/doc/1_connecting.md
    conf = SparkConf().set("spark.cassandra.connection.host", "127.0.0.1")

    sc = SparkContext(appName="Spark + Cassandra Example", conf=conf)

    # import time; time.sleep(30)
    java_import(sc._gateway.jvm, "com.datastax.spark.connector.CassandraJavaUtil")
    print sc._jvm.CassandraJavaUtil

    users = (
        ["Mike", "Sukmanowsky"],
        ["Andrew", "Montalenti"],
        ["Keith", "Bourgoin"],
    )
    rdd = sc.parallelize(users)
    print rdd.collect()
Example #6
Source File: launcher.py From spylon with BSD 3-Clause "New" or "Revised" License | 6 votes |
def sql_context(self, application_name):
    """Create a spark context given the parameters configured in this class.

    The caller is responsible for calling ``.close`` on the resulting spark context

    Parameters
    ----------
    application_name : string

    Returns
    -------
    sc : SparkContext
    """
    sc = self.spark_context(application_name)
    import pyspark
    sqlContext = pyspark.SQLContext(sc)
    return (sc, sqlContext)
Example #7
Source File: conftest.py From maggy with Apache License 2.0 | 6 votes |
def sc(request):
    """
    fixture for creating a spark context
    Args:
        request: pytest.FixtureRequest object
    """
    assert (
        request.config.getoption("--spark-master") is not None
    ), 'No Spark Master Address provided, use --spark-master: "spark://host:port" '

    conf = (
        SparkConf()
        .setMaster(request.config.getoption("--spark-master"))
        .setAppName("pytest-pyspark-local-testing")
        .set("spark.dynamicAllocation.maxExecutors", 2)
        .set("spark.executor.instances", 2)
    )
    scont = SparkContext(conf=conf)
    request.addfinalizer(lambda: scont.stop())

    quiet_py4j()
    return scont
Example #8
Source File: sparkline.py From iLID with MIT License | 6 votes |
def main(args):
    window_size = 600

    files = filecollector.collect(args.input_path)

    sc = SparkContext("local", "sparkline")
    pipeline = (
        sc.parallelize(files, 4)
        .map(lambda f: read_wav(f))
        .flatMap(lambda (f, signal, samplerate): sliding_audio(f, signal, samplerate))
        .map(lambda (f, signal, samplerate): downsample(f, signal, samplerate))
        .map(lambda (f, signal, samplerate): apply_melfilter(f, signal, samplerate))
        .map(lambda (f, image): (f, graphic.colormapping.to_grayscale(image, bytes=True)))
        .map(lambda (f, image): (f, graphic.histeq.histeq(image)))
        .map(lambda (f, image): (f, graphic.histeq.clamp_and_equalize(image)))
        .map(lambda (f, image): (f, graphic.windowing.cut_or_pad_window(image, window_size)))
        .map(lambda (f, image): output.image.save(f, image, args.output_path))
    )
    pipeline.collect()
    # .map(lambda (f, signal, samplerate): generate_spectrograms(f, signal, samplerate))
Example #9
Source File: drybell_spark.py From snorkel-tutorials with Apache License 2.0 | 6 votes |
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
Example #10
Source File: finance_similarity.py From Spark-in-Finance-Quantitative-Investing with Apache License 2.0 | 6 votes |
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print sc_conf.getAll()

    sc = None
    try:
        sc.stop()
        sc = SparkContext(conf=sc_conf)
    except:
        sc = SparkContext(conf=sc_conf)

    return sc
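The try/except block above guards against a context that may already be running. As a side note (not part of the original project), newer PySpark versions expose SparkContext.getOrCreate, which covers the same case more directly; a minimal sketch, assuming a reasonably recent PySpark:

from pyspark import SparkConf, SparkContext

def create_sc_alternative():
    # Same intent as create_sc() above: reuse an existing context if one is
    # already running, otherwise build a new one from this configuration.
    sc_conf = SparkConf().setAppName("finance-similarity-app")
    return SparkContext.getOrCreate(conf=sc_conf)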
Example #11
Source File: testconfig.py From SMV with Apache License 2.0 | 6 votes |
def sparkSession(cls):
    if not hasattr(cls, "spark"):
        # We can't use the SparkSession Builder here, since we need to call
        # Scala side's SmvTestHive.createContext to create the HiveTestContext's
        # SparkSession.
        # So we need to
        #   * Create a java_gateway
        #   * Create a SparkConf using the jgw (since without it SparkContext will ignore the given conf)
        #   * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
        #   * Create Scala side HiveTestContext SparkSession
        #   * Create python SparkSession
        jgw = launch_gateway(None)
        jvm = jgw.jvm
        import tempfile
        import getpass
        hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())
        sConf = SparkConf(False, _jvm=jvm).set("spark.sql.test", "")\
            .set("spark.sql.hive.metastore.barrierPrefixes",
                 "org.apache.spark.sql.hive.execution.PairSerDe")\
            .set("spark.sql.warehouse.dir", hivedir)\
            .set("spark.ui.enabled", "false")
        sc = SparkContext(master="local[1]", appName="SMV Python Test",
                          conf=sConf, gateway=jgw).getOrCreate()
        jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
        cls.spark = SparkSession(sc, jss.sparkSession())
    return cls.spark
Example #12
Source File: sparkcc.py From cc-pyspark with MIT License | 6 votes |
def run(self):
    self.args = self.parse_arguments()

    conf = SparkConf()

    if self.args.spark_profiler:
        conf = conf.set("spark.python.profile", "true")

    sc = SparkContext(
        appName=self.name,
        conf=conf)
    sqlc = SQLContext(sparkContext=sc)

    self.init_accumulators(sc)

    self.run_job(sc, sqlc)

    if self.args.spark_profiler:
        sc.show_profiles()

    sc.stop()
Example #13
Source File: fixtures.py From pytest-spark with MIT License | 6 votes |
def _spark_session():
    """Internal fixture for SparkSession instance.

    Yields SparkSession instance if it is supported by the pyspark
    version, otherwise yields None.

    Required to correctly initialize `spark_context` fixture after
    `spark_session` fixture.

    ..note::
        It is not possible to create SparkSession from the existing
        SparkContext.
    """
    try:
        from pyspark.sql import SparkSession
    except ImportError:
        yield
    else:
        session = SparkSession.builder \
            .config(conf=SparkConfigBuilder().get()) \
            .getOrCreate()

        yield session
        session.stop()
Example #14
Source File: construct.py From bolt with Apache License 2.0 | 6 votes |
def _argcheck(*args, **kwargs):
    """
    Check that arguments are consistent with spark array construction.

    Conditions are:
    (1) a positional argument is a SparkContext
    (2) keyword arg 'context' is a SparkContext
    (3) an argument is a BoltArraySpark, or
    (4) an argument is a nested list containing a BoltArraySpark
    """
    try:
        from pyspark import SparkContext
    except ImportError:
        return False

    cond1 = any([isinstance(arg, SparkContext) for arg in args])
    cond2 = isinstance(kwargs.get('context', None), SparkContext)
    cond3 = any([isinstance(arg, BoltArraySpark) for arg in args])
    cond4 = any([any([isinstance(sub, BoltArraySpark) for sub in arg])
                 if isinstance(arg, (tuple, list)) else False for arg in args])
    return cond1 or cond2 or cond3 or cond4
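To make the four conditions concrete, a hypothetical set of calls (assuming a live SparkContext bound to sc) would behave as described in the docstring:

# Illustrative only; these calls are not part of the bolt source.
_argcheck(sc, (2, 3, 4))           # True: positional SparkContext (condition 1)
_argcheck((2, 3, 4), context=sc)   # True: 'context' keyword is a SparkContext (condition 2)
_argcheck([1, 2, 3])               # False: nothing Spark-related in the arguments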
Example #15
Source File: fixtures.py From pytest-spark with MIT License | 6 votes |
def spark_context(_spark_session):
    """Return a SparkContext instance with reduced logging
    (session scope).
    """
    if _spark_session is None:
        from pyspark import SparkContext

        # pyspark 1.x: create SparkContext instance
        sc = SparkContext(conf=SparkConfigBuilder().get())
    else:
        # pyspark 2.x: get SparkContext from SparkSession fixture
        sc = _spark_session.sparkContext

    reduce_logging(sc)
    yield sc

    if _spark_session is None:
        sc.stop()  # pyspark 1.x: stop SparkContext instance
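For context, a test consumes this fixture simply by declaring it as an argument. The hypothetical test below is not part of pytest-spark itself, just an illustration of how the injected context would be used:

def test_word_count(spark_context):
    # spark_context is injected by pytest from the fixture above.
    lines = spark_context.parallelize(["a b", "b c c"])
    counts = (lines.flatMap(lambda line: line.split())
                   .map(lambda word: (word, 1))
                   .reduceByKey(lambda a, b: a + b)
                   .collectAsMap())
    assert counts == {"a": 1, "b": 2, "c": 2}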
Example #16
Source File: spark_conf.py From airflow-pipeline with Apache License 2.0 | 6 votes |
def get_spark_context(conf):
    """Get the spark context for submitting pyspark applications"""
    spark_context = None
    try:
        spark_context = SparkContext(conf=conf)

        from fncore.utils.zip_py_module import zip_py

        import fncore
        spark_context.addPyFile(zip_py(os.path.dirname(fncore.__file__)))
        import py2neo
        spark_context.addPyFile(zip_py(os.path.dirname(py2neo.__file__)))

        yield spark_context
    except:
        raise
    finally:
        if spark_context:
            spark_context.stop()
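Because the function yields, it is presumably consumed as a context manager in the calling code. A hypothetical usage, assuming it is wrapped with contextlib.contextmanager (the real project may already apply such a decorator elsewhere):

from contextlib import contextmanager
from pyspark import SparkConf

# Hypothetical wrapper for illustration only.
managed_spark_context = contextmanager(get_spark_context)

conf = SparkConf().setAppName("fncore-job").setMaster("local[2]")
with managed_spark_context(conf) as sc:
    # The zipped fncore and py2neo packages have already been shipped to
    # executors via addPyFile inside get_spark_context.
    print(sc.parallelize(range(10)).count())
# On exit, the finally block stops the context.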
Example #17
Source File: build.py From sift with MIT License | 6 votes |
def __call__(self):
    c = SparkConf().setAppName('Build %s' % self.model_name)

    log.info('Using spark master: %s', c.get('spark.master'))
    sc = SparkContext(conf=c)

    kwargs = self.model.prepare(sc)
    m = self.model.build(**kwargs)
    m = self.model.format_items(m)
    m = self.formatter(m)

    if self.output_path:
        log.info("Saving to: %s", self.output_path)
        if os.path.isdir(self.output_path):
            log.warn('Writing over output path: %s', self.output_path)
            shutil.rmtree(self.output_path)
        m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')
    elif self.sample > 0:
        print '\n'.join(str(i) for i in m.take(self.sample))
    log.info('Done.')
Example #18
Source File: test.py From TensorFlowOnSpark with Apache License 2.0 | 5 votes |
def setUpClass(cls):
    master = os.getenv('MASTER')
    assert master is not None, "Please start a Spark standalone cluster and export MASTER to your env."

    num_workers = os.getenv('SPARK_WORKER_INSTANCES')
    assert num_workers is not None, "Please export SPARK_WORKER_INSTANCES to your env."
    cls.num_workers = int(num_workers)

    spark_jars = os.getenv('SPARK_CLASSPATH')
    assert spark_jars, "Please add path to tensorflow/ecosystem/hadoop jar to SPARK_CLASSPATH."

    cls.conf = SparkConf().set('spark.jars', spark_jars)
    cls.sc = SparkContext(master, cls.__name__, conf=cls.conf)
    cls.spark = SparkSession.builder.getOrCreate()
Example #19
Source File: testing.py From sparkit-learn with Apache License 2.0 | 5 votes |
def setUp(self):
    class_name = self.__class__.__name__
    self.sc = SparkContext('local[2]', class_name)
    self.sc._jvm.System.setProperty("spark.ui.showConsoleProgress", "false")
    log4j = self.sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.FATAL)
Example #20
Source File: conftest.py From bolt with Apache License 2.0 | 5 votes |
def sc():
    from pyspark import SparkContext
    sc = SparkContext(appName="bolt-tests", master="local[2]")
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)
    return sc
Example #21
Source File: classification.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def _test():
    import doctest
    globs = globals().copy()
    globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #22
Source File: _common.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def _test():
    import doctest
    globs = globals().copy()
    globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
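Both _test helpers above follow the standard PySpark doctest pattern: a SparkContext is placed in the doctest globals so that docstring examples can refer to sc. A minimal, self-contained sketch of the same pattern (not taken from either project) looks like this:

from pyspark import SparkContext

def double_all(sc, values):
    """Double every value using Spark.

    >>> double_all(sc, [1, 2, 3])
    [2, 4, 6]
    """
    return sc.parallelize(values).map(lambda x: x * 2).collect()

def _test():
    import doctest
    globs = globals().copy()
    # Inject a SparkContext under the name 'sc' used by the docstring example.
    globs['sc'] = SparkContext('local[2]', 'DoctestExample')
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)

if __name__ == "__main__":
    _test()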
Example #23
Source File: blizzard2012.py From tacotron2 with BSD 3-Clause "New" or "Revised" License | 5 votes |
def text_and_path_rdd(self, sc: SparkContext):
    return sc.parallelize(
        self._extract_all_text_and_path())
Example #24
Source File: conftest.py From example_dataproc_twitter with MIT License | 5 votes |
def spark_context():
    py_files = ['dataproc/jobs/base.py',
                'dataproc/jobs/dimsum.py']
    sc = pyspark.SparkContext(pyFiles=py_files)
    yield sc
    sc.stop()
Example #25
Source File: run_jobs.py From example_dataproc_twitter with MIT License | 5 votes |
def main():
    alg = get_alg(sys.argv[1:]).algorithm
    if alg:
        job = JobsFactory._factor_alg(alg)()
        args = job.process_base_sysargs(
            [e for e in sys.argv[1:] if 'algorithm' not in e])
        with pyspark.SparkContext() as sc:
            job.run(sc, args)
Example #26
Source File: run_jobs.py From example_dataproc_twitter with MIT License | 5 votes |
def main():
    alg = get_alg(sys.argv[1:]).algorithm
    if alg:
        job = JobsFactory._factor_alg(alg)()
        args = job.process_base_sysargs(
            [e for e in sys.argv[1:] if 'algorithm' not in e])
        with pyspark.SparkContext() as sc:
            job.run(sc, args)
Example #27
Source File: listener.py From incubator-spot with Apache License 2.0 | 5 votes |
def streaming_listener(**kwargs):
    '''
        Initialize the Spark job.
    '''
    Util.get_logger('SPOT.INGEST', kwargs.pop('log_level'))

    logger = logging.getLogger('SPOT.INGEST.COMMON.LISTENER')
    logger.info('Initializing Spark Streaming Listener...')

    dbtable = '{0}.{1}'.format(kwargs.pop('database'), kwargs['type'])
    topic = kwargs.pop('topic')

    sc = SparkContext(appName=kwargs['app_name'] or topic)
    logger.info('Connect to Spark Cluster as job "{0}" and broadcast variables on it.'
                .format(kwargs.pop('app_name') or topic))

    ssc = StreamingContext(sc, batchDuration=kwargs['batch_duration'])
    logger.info('Streaming data will be divided into batches of {0} seconds.'
                .format(kwargs.pop('batch_duration')))

    hsc = HiveContext(sc)
    logger.info('Read Hive\'s configuration to integrate with data stored in it.')

    import pipelines
    module = getattr(pipelines, kwargs.pop('type'))

    stream = module.StreamPipeline(ssc, kwargs.pop('zkquorum'),
                                   kwargs.pop('group_id') or topic,
                                   {topic: int(kwargs.pop('partitions'))})

    schema = stream.schema
    segtype = stream.segtype

    stream.dstream\
        .map(lambda x: module.StreamPipeline.parse(x))\
        .filter(lambda x: bool(x))\
        .foreachRDD(lambda x: store(x, hsc, dbtable, topic, schema, segtype))

    ssc.start()
    logger.info('Start the execution of the streams.')
    ssc.awaitTermination()
Example #28
Source File: ljspeech.py From tacotron2 with BSD 3-Clause "New" or "Revised" License | 5 votes |
def text_and_path_rdd(self, sc: SparkContext):
    return sc.parallelize(
        self._extract_all_text_and_path())
Example #29
Source File: ClimatologySpark2.py From incubator-sdap-nexus with Apache License 2.0 | 5 votes |
def configureSpark(sparkConfig, appName, memoryPerExecutor='4G', coresPerExecutor=1):
    mode, numExecutors, numPartitions = sparkConfig.split(',')
    numExecutors = int(numExecutors)
    print >> sys.stderr, 'numExecutors = ', numExecutors
    numPartitions = int(numPartitions)
    print >> sys.stderr, 'numPartitions = ', numPartitions

    if mode == 'multicore':
        print >> sys.stderr, 'Using pysparkling'
        import pysparkling
        sc = pysparkling.Context()
    else:
        print >> sys.stderr, 'Using PySpark'
        sparkMaster = mode
        spConf = SparkConf()
        spConf.setAppName(appName)
        spConf.set("spark.executorEnv.HOME",
                   os.path.join(os.getenv('HOME'), 'spark_exec_home'))
        spConf.set("spark.executorEnv.PYTHONPATH", os.getcwd())
        spConf.set("spark.executor.memory", memoryPerExecutor)
        print >> sys.stderr, 'memoryPerExecutor = ', memoryPerExecutor
        try:
            sparkMaster = SparkMasterOverride
        except:
            pass
        if sparkMaster[:5] == "mesos":
            spConf.set("spark.cores.max", numExecutors)
        else:
            # Spark master is YARN or local[N]
            spConf.set("spark.executor.instances", numExecutors)
            spConf.set("spark.executor.cores", coresPerExecutor)
        spConf.setMaster(sparkMaster)
        sc = SparkContext(conf=spConf)

    return sc, numExecutors, numPartitions
Example #30
Source File: pixelStats.py From incubator-sdap-nexus with Apache License 2.0 | 5 votes |
def pixelStats(urls, variable, nPartitions, timeFromFilename=TimeFromFilenameDOY,
               groupByKeys=GroupByKeys, accumulators=Accumulators,
               cachePath=CachePath, mode='dpark', modes=Modes):
    '''Compute a global (or regional) pixel mean field in parallel, given a list of URL's pointing to netCDF files.'''
    baseKey = groupByKeys[0]
    if baseKey == 'month':
        urlsByKey = splitByMonth(urls, timeFromFilename)
    else:
        print >>sys.stderr, 'pixelStats: Unrecognized groupByKey "%s". Must be in %s' % (baseKey, str(groupByKeys))
        sys.exit(1)

    if mode == 'sequential':
        accum = [accumulate(u, variable, accumulators) for u in urlsByKey]
        merged = reduce(combine, accum)
        stats = statsFromAccumulators(merged)

    elif mode == 'dpark':
        import dpark
        urls = dpark.parallelize(urlsByKey, nPartitions)                         # returns RDD of URL lists
        accum = urls.map(lambda urls: accumulate(urls, variable, accumulators))  # returns RDD of stats accumulators
        merged = accum.reduce(combine)                                           # merged accumulators on head node
        stats = statsFromAccumulators(merged)                                    # compute final stats from accumulators

    elif mode == 'spark':
        from pyspark import SparkContext
        sc = SparkContext(appName="PixelStats")
        urls = sc.parallelize(urlsByKey, nPartitions)                            # returns RDD of URL lists
        accum = urls.map(lambda urls: accumulate(urls, variable, accumulators))  # returns RDD of stats accumulators
        merged = accum.reduce(combine)                                           # merged accumulators on head node
        stats = statsFromAccumulators(merged)                                    # compute final stats from accumulators

    else:
        stats = None

    if mode not in modes:
        print >>sys.stderr, 'pixelStats: Unrecognized mode "%s". Must be in %s' % (mode, str(modes))
        sys.exit(1)

    return stats