Python pyspark.SparkContext() Examples
The following are 30 code examples of pyspark.SparkContext(), collected from open-source projects. Each example notes its source file, originating project, and license, so you can follow it back to the original code. You may also want to check out all available functions and classes of the pyspark module.
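Before working through the project examples below, it may help to see the basic lifecycle in isolation. The following is a minimal sketch (not taken from any of the projects below) that builds a SparkConf, creates a SparkContext, runs a trivial RDD job, and stops the context:

from pyspark import SparkConf, SparkContext

# Configure and create a local SparkContext (two worker threads).
conf = SparkConf().setAppName("minimal-example").setMaster("local[2]")
sc = SparkContext(conf=conf)

# Run a trivial job: parallelize a range into 4 partitions and sum it.
rdd = sc.parallelize(range(100), numSlices=4)
print(rdd.sum())  # 4950

# Always stop the context so the JVM backend shuts down cleanly.
sc.stop()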
Example #1
Source File: app.py From integrations-core with BSD 3-Clause "New" or "Revised" License | 8 votes |
def main():
    # Adapted from https://github.com/apache/spark/tree/master/examples/src/main/python/streaming
    sc = SparkContext(appName='PythonStreamingQueue')
    ssc = StreamingContext(sc, 1)

    # Create the queue through which RDDs can be pushed to
    # a QueueInputDStream
    rddQueue = []
    for _ in range(5):
        rddQueue += [ssc.sparkContext.parallelize([j for j in range(1, 1001)], 10)]

    # Create the QueueInputDStream and use it to do some processing
    inputStream = ssc.queueStream(rddQueue)
    mappedStream = inputStream.map(lambda x: (x % 10, 1))
    reducedStream = mappedStream.reduceByKey(lambda a, b: a + b)
    reducedStream.pprint()

    ssc.start()
    time.sleep(6)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
Example #2
Source File: spark_process.py From dispel4py with Apache License 2.0 | 7 votes |
def run():
    from pyspark import SparkContext, SparkConf
    conf = SparkConf()
    conf.setAppName('dispel4py')
    conf.set("spark.storage.memoryFraction", "0.5")
    sc = SparkContext(conf=conf)

    from dispel4py.new import processor
    from dispel4py.utils import load_graph

    args = parse_args()

    graph = load_graph(args.module, args.attr)
    if graph is None:
        return
    graph.flatten()

    inputs = processor.create_inputs(args, graph)

    process(sc, graph, inputs=inputs, args=args)
Example #3
Source File: bluecoat.py From incubator-spot with Apache License 2.0 | 6 votes |
def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    """
    Parse and save bluecoat logs.

    :param zk: Apache ZooKeeper quorum
    :param topic: Apache Kafka topic (application name)
    :param db: Apache Hive database to save into
    :param db_table: table of `db` to save into
    :param num_of_workers: number of Apache Kafka workers
    :param batch_size: batch size for Apache Spark streaming context
    """
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks},
                                        keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1])\
        .flatMap(lambda row: row.split("\n"))\
        .filter(lambda row: rex_date.match(row))\
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " "))\
        .map(lambda row: split_log_entry(row))\
        .map(lambda row: proxy_parser(row))

    saved_data = proxy_data.foreachRDD(lambda row: save_data(row, sqc, db, db_table, topic))
    ssc.start()
    ssc.awaitTermination()
Example #4
Source File: common.py From LearningApacheSpark with MIT License | 6 votes |
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(data)
    return obj
Example #5
Source File: cassandra_example.py From pyspark-cassandra with Apache License 2.0 | 6 votes |
def main():
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: example <keyspace_name> <column_family_name>"
        sys.exit(-1)

    keyspace_name = sys.argv[1]
    column_family_name = sys.argv[2]

    # Valid config options here https://github.com/datastax/spark-cassandra-connector/blob/master/doc/1_connecting.md
    conf = SparkConf().set("spark.cassandra.connection.host", "127.0.0.1")

    sc = SparkContext(appName="Spark + Cassandra Example", conf=conf)

    # import time; time.sleep(30)
    java_import(sc._gateway.jvm, "com.datastax.spark.connector.CassandraJavaUtil")
    print sc._jvm.CassandraJavaUtil

    users = (
        ["Mike", "Sukmanowsky"],
        ["Andrew", "Montalenti"],
        ["Keith", "Bourgoin"],
    )
    rdd = sc.parallelize(users)
    print rdd.collect()
Example #6
Source File: launcher.py From spylon with BSD 3-Clause "New" or "Revised" License | 6 votes |
def sql_context(self, application_name):
    """Create a spark context given the parameters configured in this class.

    The caller is responsible for calling ``.close`` on the resulting spark context

    Parameters
    ----------
    application_name : string

    Returns
    -------
    sc : SparkContext
    """
    sc = self.spark_context(application_name)
    import pyspark
    sqlContext = pyspark.SQLContext(sc)
    return (sc, sqlContext)
Example #7
Source File: conftest.py From maggy with Apache License 2.0 | 6 votes |
def sc(request):
    """
    fixture for creating a spark context
    Args:
        request: pytest.FixtureRequest object
    """
    assert (
        request.config.getoption("--spark-master") is not None
    ), 'No Spark Master Address provided, use --spark-master: "spark://host:port" '

    conf = (
        SparkConf()
        .setMaster(request.config.getoption("--spark-master"))
        .setAppName("pytest-pyspark-local-testing")
        .set("spark.dynamicAllocation.maxExecutors", 2)
        .set("spark.executor.instances", 2)
    )
    scont = SparkContext(conf=conf)
    request.addfinalizer(lambda: scont.stop())

    quiet_py4j()
    return scont
Example #8
Source File: sparkline.py From iLID with MIT License | 6 votes |
def main(args):
    window_size = 600

    files = filecollector.collect(args.input_path)

    sc = SparkContext("local", "sparkline")
    pipeline = (
        sc.parallelize(files, 4)
        .map(lambda f: read_wav(f))
        .flatMap(lambda (f, signal, samplerate): sliding_audio(f, signal, samplerate))
        .map(lambda (f, signal, samplerate): downsample(f, signal, samplerate))
        .map(lambda (f, signal, samplerate): apply_melfilter(f, signal, samplerate))
        .map(lambda (f, image): (f, graphic.colormapping.to_grayscale(image, bytes=True)))
        .map(lambda (f, image): (f, graphic.histeq.histeq(image)))
        .map(lambda (f, image): (f, graphic.histeq.clamp_and_equalize(image)))
        .map(lambda (f, image): (f, graphic.windowing.cut_or_pad_window(image, window_size)))
        .map(lambda (f, image): output.image.save(f, image, args.output_path))
    )
    pipeline.collect()
    # .map(lambda (f, signal, samplerate): generate_spectrograms(f, signal, samplerate))
Example #9
Source File: drybell_spark.py From snorkel-tutorials with Apache License 2.0 | 6 votes |
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
Example #10
Source File: finance_similarity.py From Spark-in-Finance-Quantitative-Investing with Apache License 2.0 | 6 votes |
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print sc_conf.getAll()

    sc = None
    try:
        sc.stop()
        sc = SparkContext(conf=sc_conf)
    except:
        sc = SparkContext(conf=sc_conf)

    return sc
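The try/except block above guards against a context that may already be running. As a side note (not part of the original project), newer PySpark versions expose SparkContext.getOrCreate, which covers the same case more directly; a minimal sketch, assuming a reasonably recent PySpark:

from pyspark import SparkConf, SparkContext

def create_sc_alternative():
    # Same intent as create_sc() above: reuse an existing context if one is
    # already running, otherwise build a new one from this configuration.
    sc_conf = SparkConf().setAppName("finance-similarity-app")
    return SparkContext.getOrCreate(conf=sc_conf)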
Example #11
Source File: testconfig.py From SMV with Apache License 2.0 | 6 votes |
def sparkSession(cls):
    if not hasattr(cls, "spark"):
        # We can't use the SparkSession Builder here, since we need to call
        # Scala side's SmvTestHive.createContext to create the HiveTestContext's
        # SparkSession.
        # So we need to
        #   * Create a java_gateway
        #   * Create a SparkConf using the jgw (since without it SparkContext will ignore the given conf)
        #   * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
        #   * Create Scala side HiveTestContext SparkSession
        #   * Create python SparkSession
        jgw = launch_gateway(None)
        jvm = jgw.jvm
        import tempfile
        import getpass
        hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())
        sConf = SparkConf(False, _jvm=jvm).set("spark.sql.test", "")\
            .set("spark.sql.hive.metastore.barrierPrefixes",
                 "org.apache.spark.sql.hive.execution.PairSerDe")\
            .set("spark.sql.warehouse.dir", hivedir)\
            .set("spark.ui.enabled", "false")
        sc = SparkContext(master="local[1]", appName="SMV Python Test",
                          conf=sConf, gateway=jgw).getOrCreate()
        jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
        cls.spark = SparkSession(sc, jss.sparkSession())
    return cls.spark
Example #12
Source File: sparkcc.py From cc-pyspark with MIT License | 6 votes |
def run(self):
    self.args = self.parse_arguments()

    conf = SparkConf()

    if self.args.spark_profiler:
        conf = conf.set("spark.python.profile", "true")

    sc = SparkContext(
        appName=self.name,
        conf=conf)
    sqlc = SQLContext(sparkContext=sc)

    self.init_accumulators(sc)

    self.run_job(sc, sqlc)

    if self.args.spark_profiler:
        sc.show_profiles()

    sc.stop()
Example #13
Source File: fixtures.py From pytest-spark with MIT License | 6 votes |
def _spark_session():
    """Internal fixture for SparkSession instance.

    Yields SparkSession instance if it is supported by the pyspark
    version, otherwise yields None.

    Required to correctly initialize `spark_context` fixture after
    `spark_session` fixture.

    ..note::
        It is not possible to create SparkSession from the existing
        SparkContext.
    """
    try:
        from pyspark.sql import SparkSession
    except ImportError:
        yield
    else:
        session = SparkSession.builder \
            .config(conf=SparkConfigBuilder().get()) \
            .getOrCreate()

        yield session
        session.stop()
Example #14
Source File: construct.py From bolt with Apache License 2.0 | 6 votes |
def _argcheck(*args, **kwargs):
    """
    Check that arguments are consistent with spark array construction.

    Conditions are:
    (1) a positional argument is a SparkContext
    (2) keyword arg 'context' is a SparkContext
    (3) an argument is a BoltArraySpark, or
    (4) an argument is a nested list containing a BoltArraySpark
    """
    try:
        from pyspark import SparkContext
    except ImportError:
        return False

    cond1 = any([isinstance(arg, SparkContext) for arg in args])
    cond2 = isinstance(kwargs.get('context', None), SparkContext)
    cond3 = any([isinstance(arg, BoltArraySpark) for arg in args])
    cond4 = any([any([isinstance(sub, BoltArraySpark) for sub in arg])
                 if isinstance(arg, (tuple, list)) else False for arg in args])
    return cond1 or cond2 or cond3 or cond4
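To make the four conditions concrete, a hypothetical set of calls (assuming a live SparkContext bound to sc) would behave as described in the docstring:

# Illustrative only; these calls are not part of the bolt source.
_argcheck(sc, (2, 3, 4))           # True: positional SparkContext (condition 1)
_argcheck((2, 3, 4), context=sc)   # True: 'context' keyword is a SparkContext (condition 2)
_argcheck([1, 2, 3])               # False: nothing Spark-related in the arguments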
Example #15
Source File: fixtures.py From pytest-spark with MIT License | 6 votes |
def spark_context(_spark_session):
    """Return a SparkContext instance with reduced logging
    (session scope).
    """
    if _spark_session is None:
        from pyspark import SparkContext

        # pyspark 1.x: create SparkContext instance
        sc = SparkContext(conf=SparkConfigBuilder().get())
    else:
        # pyspark 2.x: get SparkContext from SparkSession fixture
        sc = _spark_session.sparkContext

    reduce_logging(sc)
    yield sc

    if _spark_session is None:
        sc.stop()  # pyspark 1.x: stop SparkContext instance
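For context, a test consumes this fixture simply by declaring it as an argument. The hypothetical test below is not part of pytest-spark itself, just an illustration of how the injected context would be used:

def test_word_count(spark_context):
    # spark_context is injected by pytest from the fixture above.
    lines = spark_context.parallelize(["a b", "b c c"])
    counts = (lines.flatMap(lambda line: line.split())
                   .map(lambda word: (word, 1))
                   .reduceByKey(lambda a, b: a + b)
                   .collectAsMap())
    assert counts == {"a": 1, "b": 2, "c": 2}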
Example #16
Source File: spark_conf.py From airflow-pipeline with Apache License 2.0 | 6 votes |
def get_spark_context(conf):
    """Get the spark context for submitting pyspark applications"""
    spark_context = None
    try:
        spark_context = SparkContext(conf=conf)

        from fncore.utils.zip_py_module import zip_py

        import fncore
        spark_context.addPyFile(zip_py(os.path.dirname(fncore.__file__)))
        import py2neo
        spark_context.addPyFile(zip_py(os.path.dirname(py2neo.__file__)))

        yield spark_context
    except:
        raise
    finally:
        if spark_context:
            spark_context.stop()
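Because the function yields, it is presumably consumed as a context manager in the calling code. A hypothetical usage, assuming it is wrapped with contextlib.contextmanager (the real project may already apply such a decorator elsewhere):

from contextlib import contextmanager
from pyspark import SparkConf

# Hypothetical wrapper for illustration only.
managed_spark_context = contextmanager(get_spark_context)

conf = SparkConf().setAppName("fncore-job").setMaster("local[2]")
with managed_spark_context(conf) as sc:
    # The zipped fncore and py2neo packages have already been shipped to
    # executors via addPyFile inside get_spark_context.
    print(sc.parallelize(range(10)).count())
# On exit, the finally block stops the context.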
Example #17
Source File: build.py From sift with MIT License | 6 votes |
def __call__(self):
    c = SparkConf().setAppName('Build %s' % self.model_name)

    log.info('Using spark master: %s', c.get('spark.master'))
    sc = SparkContext(conf=c)

    kwargs = self.model.prepare(sc)
    m = self.model.build(**kwargs)
    m = self.model.format_items(m)
    m = self.formatter(m)

    if self.output_path:
        log.info("Saving to: %s", self.output_path)
        if os.path.isdir(self.output_path):
            log.warn('Writing over output path: %s', self.output_path)
            shutil.rmtree(self.output_path)
        m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')
    elif self.sample > 0:
        print '\n'.join(str(i) for i in m.take(self.sample))
    log.info('Done.')
Example #18
Source File: test.py From TensorFlowOnSpark with Apache License 2.0 | 5 votes |
def setUpClass(cls):
    master = os.getenv('MASTER')
    assert master is not None, "Please start a Spark standalone cluster and export MASTER to your env."

    num_workers = os.getenv('SPARK_WORKER_INSTANCES')
    assert num_workers is not None, "Please export SPARK_WORKER_INSTANCES to your env."
    cls.num_workers = int(num_workers)

    spark_jars = os.getenv('SPARK_CLASSPATH')
    assert spark_jars, "Please add path to tensorflow/ecosystem/hadoop jar to SPARK_CLASSPATH."

    cls.conf = SparkConf().set('spark.jars', spark_jars)
    cls.sc = SparkContext(master, cls.__name__, conf=cls.conf)
    cls.spark = SparkSession.builder.getOrCreate()
Example #19
Source File: testing.py From sparkit-learn with Apache License 2.0 | 5 votes |
def setUp(self):
    class_name = self.__class__.__name__
    self.sc = SparkContext('local[2]', class_name)
    self.sc._jvm.System.setProperty("spark.ui.showConsoleProgress", "false")
    log4j = self.sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.FATAL)
Example #20
Source File: conftest.py From bolt with Apache License 2.0 | 5 votes |
def sc():
    from pyspark import SparkContext
    sc = SparkContext(appName="bolt-tests", master="local[2]")
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)
    return sc
Example #21
Source File: classification.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def _test():
    import doctest
    globs = globals().copy()
    globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #22
Source File: _common.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def _test():
    import doctest
    globs = globals().copy()
    globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
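Both _test helpers above follow the standard PySpark doctest pattern: a SparkContext is placed in the doctest globals so that docstring examples can refer to sc. A minimal, self-contained sketch of the same pattern (not taken from either project) looks like this:

from pyspark import SparkContext

def double_all(sc, values):
    """Double every value using Spark.

    >>> double_all(sc, [1, 2, 3])
    [2, 4, 6]
    """
    return sc.parallelize(values).map(lambda x: x * 2).collect()

def _test():
    import doctest
    globs = globals().copy()
    # Inject a SparkContext under the name 'sc' used by the docstring example.
    globs['sc'] = SparkContext('local[2]', 'DoctestExample')
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)

if __name__ == "__main__":
    _test()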
Example #23
Source File: blizzard2012.py From tacotron2 with BSD 3-Clause "New" or "Revised" License | 5 votes |
def text_and_path_rdd(self, sc: SparkContext):
    return sc.parallelize(
        self._extract_all_text_and_path())
Example #24
Source File: conftest.py From example_dataproc_twitter with MIT License | 5 votes |
def spark_context():
    py_files = ['dataproc/jobs/base.py',
                'dataproc/jobs/dimsum.py']
    sc = pyspark.SparkContext(pyFiles=py_files)
    yield sc
    sc.stop()
Example #25
Source File: run_jobs.py From example_dataproc_twitter with MIT License | 5 votes |
def main():
    alg = get_alg(sys.argv[1:]).algorithm
    if alg:
        job = JobsFactory._factor_alg(alg)()
        args = job.process_base_sysargs(
            [e for e in sys.argv[1:] if 'algorithm' not in e])
        with pyspark.SparkContext() as sc:
            job.run(sc, args)
Example #26
Source File: run_jobs.py From example_dataproc_twitter with MIT License | 5 votes |
def main():
    alg = get_alg(sys.argv[1:]).algorithm
    if alg:
        job = JobsFactory._factor_alg(alg)()
        args = job.process_base_sysargs(
            [e for e in sys.argv[1:] if 'algorithm' not in e])
        with pyspark.SparkContext() as sc:
            job.run(sc, args)
Example #27
Source File: listener.py From incubator-spot with Apache License 2.0 | 5 votes |
def streaming_listener(**kwargs):
    '''
        Initialize the Spark job.
    '''
    Util.get_logger('SPOT.INGEST', kwargs.pop('log_level'))

    logger = logging.getLogger('SPOT.INGEST.COMMON.LISTENER')
    logger.info('Initializing Spark Streaming Listener...')

    dbtable = '{0}.{1}'.format(kwargs.pop('database'), kwargs['type'])
    topic = kwargs.pop('topic')

    sc = SparkContext(appName=kwargs['app_name'] or topic)
    logger.info('Connect to Spark Cluster as job "{0}" and broadcast variables on it.'
                .format(kwargs.pop('app_name') or topic))

    ssc = StreamingContext(sc, batchDuration=kwargs['batch_duration'])
    logger.info('Streaming data will be divided into batches of {0} seconds.'
                .format(kwargs.pop('batch_duration')))

    hsc = HiveContext(sc)
    logger.info('Read Hive\'s configuration to integrate with data stored in it.')

    import pipelines
    module = getattr(pipelines, kwargs.pop('type'))

    stream = module.StreamPipeline(ssc, kwargs.pop('zkquorum'),
                                   kwargs.pop('group_id') or topic,
                                   {topic: int(kwargs.pop('partitions'))})

    schema = stream.schema
    segtype = stream.segtype

    stream.dstream\
        .map(lambda x: module.StreamPipeline.parse(x))\
        .filter(lambda x: bool(x))\
        .foreachRDD(lambda x: store(x, hsc, dbtable, topic, schema, segtype))

    ssc.start()
    logger.info('Start the execution of the streams.')
    ssc.awaitTermination()
Example #28
Source File: ljspeech.py From tacotron2 with BSD 3-Clause "New" or "Revised" License | 5 votes |
def text_and_path_rdd(self, sc: SparkContext):
    return sc.parallelize(
        self._extract_all_text_and_path())
Example #29
Source File: ClimatologySpark2.py From incubator-sdap-nexus with Apache License 2.0 | 5 votes |
def configureSpark(sparkConfig, appName, memoryPerExecutor='4G', coresPerExecutor=1):
    mode, numExecutors, numPartitions = sparkConfig.split(',')
    numExecutors = int(numExecutors)
    print >> sys.stderr, 'numExecutors = ', numExecutors
    numPartitions = int(numPartitions)
    print >> sys.stderr, 'numPartitions = ', numPartitions

    if mode == 'multicore':
        print >> sys.stderr, 'Using pysparkling'
        import pysparkling
        sc = pysparkling.Context()
    else:
        print >> sys.stderr, 'Using PySpark'
        sparkMaster = mode
        spConf = SparkConf()
        spConf.setAppName(appName)
        spConf.set("spark.executorEnv.HOME",
                   os.path.join(os.getenv('HOME'), 'spark_exec_home'))
        spConf.set("spark.executorEnv.PYTHONPATH", os.getcwd())
        spConf.set("spark.executor.memory", memoryPerExecutor)
        print >> sys.stderr, 'memoryPerExecutor = ', memoryPerExecutor
        try:
            sparkMaster = SparkMasterOverride
        except:
            pass
        if sparkMaster[:5] == "mesos":
            spConf.set("spark.cores.max", numExecutors)
        else:
            # Spark master is YARN or local[N]
            spConf.set("spark.executor.instances", numExecutors)
            spConf.set("spark.executor.cores", coresPerExecutor)
        spConf.setMaster(sparkMaster)
        sc = SparkContext(conf=spConf)

    return sc, numExecutors, numPartitions
Example #30
Source File: pixelStats.py From incubator-sdap-nexus with Apache License 2.0 | 5 votes |
def pixelStats(urls, variable, nPartitions, timeFromFilename=TimeFromFilenameDOY,
               groupByKeys=GroupByKeys, accumulators=Accumulators,
               cachePath=CachePath, mode='dpark', modes=Modes):
    '''Compute a global (or regional) pixel mean field in parallel, given a list of URL's pointing to netCDF files.'''
    baseKey = groupByKeys[0]
    if baseKey == 'month':
        urlsByKey = splitByMonth(urls, timeFromFilename)
    else:
        print >>sys.stderr, 'pixelStats: Unrecognized groupByKey "%s". Must be in %s' % (baseKey, str(groupByKeys))
        sys.exit(1)

    if mode == 'sequential':
        accum = [accumulate(u, variable, accumulators) for u in urlsByKey]
        merged = reduce(combine, accum)
        stats = statsFromAccumulators(merged)

    elif mode == 'dpark':
        import dpark
        urls = dpark.parallelize(urlsByKey, nPartitions)                         # returns RDD of URL lists
        accum = urls.map(lambda urls: accumulate(urls, variable, accumulators))  # returns RDD of stats accumulators
        merged = accum.reduce(combine)                                           # merged accumulators on head node
        stats = statsFromAccumulators(merged)                                    # compute final stats from accumulators

    elif mode == 'spark':
        from pyspark import SparkContext
        sc = SparkContext(appName="PixelStats")
        urls = sc.parallelize(urlsByKey, nPartitions)                            # returns RDD of URL lists
        accum = urls.map(lambda urls: accumulate(urls, variable, accumulators))  # returns RDD of stats accumulators
        merged = accum.reduce(combine)                                           # merged accumulators on head node
        stats = statsFromAccumulators(merged)                                    # compute final stats from accumulators

    else:
        stats = None

    if mode not in modes:
        print >>sys.stderr, 'pixelStats: Unrecognized mode "%s". Must be in %s' % (mode, str(modes))
        sys.exit(1)

    return stats