Python pyspark.SparkConf() Examples
The following are 30 code examples of pyspark.SparkConf(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to the original project or source file. You may also want to check out all available functions and classes of the pyspark module, or try the search function.
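Before the project examples, here is a minimal, self-contained sketch of the typical SparkConf workflow; the application name, master URL, and memory value are illustrative placeholders rather than settings taken from any example below.

from pyspark import SparkConf, SparkContext

# Build a configuration object; keys mirror the spark-defaults.conf property names.
conf = SparkConf() \
    .setAppName("sparkconf-demo") \
    .setMaster("local[2]") \
    .set("spark.executor.memory", "1g")

# Hand the configuration to a SparkContext (or to SparkSession.builder.config(conf=conf)).
sc = SparkContext(conf=conf)
print(sc.getConf().get("spark.executor.memory"))  # -> "1g"
sc.stop()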
Example #1
Source File: spark_process.py From dispel4py with Apache License 2.0 | 7 votes |
def run():
    from pyspark import SparkContext, SparkConf
    conf = SparkConf()
    conf.setAppName('dispel4py')
    conf.set("spark.storage.memoryFraction", "0.5")
    sc = SparkContext(conf=conf)

    from dispel4py.new import processor
    from dispel4py.utils import load_graph

    args = parse_args()

    graph = load_graph(args.module, args.attr)
    if graph is None:
        return
    graph.flatten()

    inputs = processor.create_inputs(args, graph)

    process(sc, graph, inputs=inputs, args=args)
Example #2
Source File: test_spark_model_export.py From mlflow with Apache License 2.0 | 6 votes |
def spark_context():
    conf = pyspark.SparkConf()
    conf.set(key="spark.jars.packages",
             value='ml.combust.mleap:mleap-spark-base_2.11:0.12.0,'
                   'ml.combust.mleap:mleap-spark_2.11:0.12.0')
    max_tries = 3
    for num_tries in range(max_tries):
        try:
            spark = get_spark_session(conf)
            return spark.sparkContext
        except Exception as e:
            if num_tries >= max_tries - 1:
                raise
            _logger.exception(e, "Attempt %s to create a SparkSession failed, retrying..." % num_tries)
Example #3
Source File: build.py From sift with MIT License | 6 votes |
def __call__(self):
    c = SparkConf().setAppName('Build %s' % self.model_name)

    log.info('Using spark master: %s', c.get('spark.master'))
    sc = SparkContext(conf=c)

    kwargs = self.model.prepare(sc)
    m = self.model.build(**kwargs)
    m = self.model.format_items(m)
    m = self.formatter(m)

    if self.output_path:
        log.info("Saving to: %s", self.output_path)
        if os.path.isdir(self.output_path):
            log.warn('Writing over output path: %s', self.output_path)
            shutil.rmtree(self.output_path)
        m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')
    elif self.sample > 0:
        print '\n'.join(str(i) for i in m.take(self.sample))

    log.info('Done.')
Example #4
Source File: taar_dynamo.py From telemetry-airflow with Mozilla Public License 2.0 | 6 votes |
def main(date, aws_access_key_id, aws_secret_access_key, region, table, sample_rate):
    # Clobber the AWS access credentials
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key

    APP_NAME = "TaarDynamo"
    conf = SparkConf().setAppName(APP_NAME)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    date_obj = datetime.strptime(date, "%Y%m%d") - PATCH_DAYS

    reduction_output = run_etljob(
        spark,
        date_obj,
        region,
        table,
        sample_rate,
        aws_access_key_id,
        aws_secret_access_key,
    )
    pprint(reduction_output)
Example #5
Source File: spark.py From pyFTS with GNU General Public License v3.0 | 6 votes |
def create_spark_conf(**kwargs):
    """
    Configure the Spark master node

    :param kwargs:
    :return:
    """
    spark_executor_memory = kwargs.get("spark_executor_memory", "2g")
    spark_driver_memory = kwargs.get("spark_driver_memory", "2g")
    url = kwargs.get("url", SPARK_ADDR)
    app = kwargs.get("app", 'pyFTS')

    conf = SparkConf()
    conf.setMaster(url)
    conf.setAppName(app)
    conf.set("spark.executor.memory", spark_executor_memory)
    conf.set("spark.driver.memory", spark_driver_memory)
    conf.set("spark.memory.offHeap.enabled", True)
    conf.set("spark.memory.offHeap.size", "16g")

    return conf
Example #6
Source File: config.py From pytest-spark with MIT License | 6 votes |
def initialize(cls, options_from_ini=None):
    if cls._instance:
        return cls._instance

    from pyspark import SparkConf

    cls._instance = SparkConf()
    cls.options = dict(cls.DEFAULTS)

    if options_from_ini:
        cls.options.update(cls._parse_config(options_from_ini))

    for k, v in cls.options.items():
        cls._instance.set(k, v)

    return cls._instance
Example #7
Source File: testconfig.py From SMV with Apache License 2.0 | 6 votes |
def sparkSession(cls):
    if not hasattr(cls, "spark"):
        # We can't use the SparkSession Builder here, since we need to call
        # Scala side's SmvTestHive.createContext to create the HiveTestContext's
        # SparkSession.
        # So we need to
        #   * Create a java_gateway
        #   * Create a SparkConf using the jgw (since without it SparkContext will ignore the given conf)
        #   * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
        #   * Create Scala side HiveTestContext SparkSession
        #   * Create python SparkSession
        jgw = launch_gateway(None)
        jvm = jgw.jvm

        import tempfile
        import getpass
        hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())

        sConf = SparkConf(False, _jvm=jvm).set("spark.sql.test", "")\
            .set("spark.sql.hive.metastore.barrierPrefixes",
                 "org.apache.spark.sql.hive.execution.PairSerDe")\
            .set("spark.sql.warehouse.dir", hivedir)\
            .set("spark.ui.enabled", "false")
        sc = SparkContext(master="local[1]", appName="SMV Python Test", conf=sConf, gateway=jgw).getOrCreate()
        jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
        cls.spark = SparkSession(sc, jss.sparkSession())
    return cls.spark
Example #8
Source File: finance_similarity.py From Spark-in-Finance-Quantitative-Investing with Apache License 2.0 | 6 votes |
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print sc_conf.getAll()

    sc = None
    try:
        sc.stop()
        sc = SparkContext(conf=sc_conf)
    except:
        sc = SparkContext(conf=sc_conf)

    return sc
Example #9
Source File: test_ExtractCCLinks.py From cccatalog with MIT License | 6 votes |
def setUpClass(cls):
    # load sample warc files
    fh = open('tests/sample_wat.paths')
    cls.watPaths = fh.readlines()

    # initialize class
    cls.cclinks = CCLinks('CC-MAIN-2018-13', 5)
    cls.cclinks.output = 'tests/output/{}/parquet'.format(cls.cclinks.crawlIndex)

    # remove output directory
    if os.path.exists(cls.cclinks.output):
        shutil.rmtree('tests/output')

    # init pyspark
    conf = pyspark.SparkConf().setMaster('local[*]').setAppName('Test_ExtractCCLinks')
    cls.sc = pyspark.SparkContext.getOrCreate(conf=conf)
Example #10
Source File: sparkcc.py From cc-pyspark with MIT License | 6 votes |
def run(self):
    self.args = self.parse_arguments()

    conf = SparkConf()

    if self.args.spark_profiler:
        conf = conf.set("spark.python.profile", "true")

    sc = SparkContext(
        appName=self.name,
        conf=conf)
    sqlc = SQLContext(sparkContext=sc)

    self.init_accumulators(sc)

    self.run_job(sc, sqlc)

    if self.args.spark_profiler:
        sc.show_profiles()

    sc.stop()
Example #11
Source File: spark_conf.py From airflow-pipeline with Apache License 2.0 | 6 votes |
def set_spark_defaults(conf, name='spark-job'):
    """
    Update the configuration dictionary for setting up spark,
    creating the dictionary if it does not exist yet
    """
    if not conf:
        conf = dict()

    home = os.path.join('/tmp', str(uuid.uuid4()))

    conf['SparkConfiguration'] = SparkConf()\
        .setMaster('yarn-client')\
        .setAppName(name)\
        .set("spark.sql.shuffle.partitions", "1000")\
        .set("spark.scheduler.revive.interval", "3")\
        .set("spark.task.maxFailures", "0")\
        .set("spark.executorEnv.HOME", home)

    return conf
Example #12
Source File: spark.py From qb with MIT License | 6 votes |
def create_spark_context(app_name="Quiz Bowl", configs=None) -> SparkContext:
    if QB_SPARK_MASTER != "":
        log.info("Spark master is %s" % QB_SPARK_MASTER)
        spark_conf = SparkConf()\
            .set('spark.rpc.message.maxSize', 300)\
            .setAppName(app_name)\
            .setMaster(QB_SPARK_MASTER)
    else:
        spark_conf = SparkConf()\
            .set('spark.rpc.message.maxSize', 300)\
            .setAppName(app_name)

    if configs is not None:
        for key, value in configs:
            if key in ('spark.executor.cores', 'spark.max.cores'):
                if value > QB_MAX_CORES:
                    log.info('Requested {r_cores} cores when the machine only has {n_cores} cores, '
                             'reducing number of cores to {n_cores}'.format(
                                 r_cores=value, n_cores=QB_MAX_CORES))
                    value = QB_MAX_CORES
            spark_conf = spark_conf.set(key, value)

    return SparkContext.getOrCreate(spark_conf)
Example #13
Source File: test_spark.py From mlflow with Apache License 2.0 | 5 votes |
def spark():
    conf = pyspark.SparkConf()
    return get_spark_session(conf)
Example #14
Source File: test_spark.py From mlflow with Apache License 2.0 | 5 votes |
def score_model_as_udf(model_uri, pandas_df, result_type="double"):
    spark = get_spark_session(pyspark.SparkConf())
    spark_df = spark.createDataFrame(pandas_df)
    pyfunc_udf = spark_udf(spark=spark, model_uri=model_uri, result_type=result_type)
    new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
    return [x['prediction'] for x in new_df.collect()]
Example #15
Source File: ozy_streaming.py From ozymandias with MIT License | 5 votes |
def main(): """Run Spark Streaming""" conf = SparkConf() sc = SparkContext(appName='Ozymandias', conf=conf) sc.setLogLevel('WARN') with open(ROOT + 'channels.json', 'r') as f: channels = json.load(f) topics = [t['topic'] for t in channels['channels']] n_secs = 0.5 ssc = StreamingContext(sc, n_secs) stream = KafkaUtils.createDirectStream(ssc, topics, { 'bootstrap.servers':'localhost:9092', 'group.id':'ozy-group', 'fetch.message.max.bytes':'15728640', 'auto.offset.reset':'largest'}) stream.map( deserializer ).map( image_detector ).foreachRDD( message_sender) ssc.start() ssc.awaitTermination()
Example #16
Source File: taar_dynamo.py From python_mozetl with MIT License | 5 votes |
def main(date, region, table, prod_iam_role, sample_rate):
    APP_NAME = "HBaseAddonRecommenderView"
    conf = SparkConf().setAppName(APP_NAME)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    date_obj = datetime.strptime(date, "%Y%m%d")

    if prod_iam_role.strip() == "":
        prod_iam_role = None

    reduction_output = run_etljob(
        spark, date_obj, region, table, prod_iam_role, sample_rate
    )
    pprint(reduction_output)
Example #17
Source File: holoclean.py From HoloClean-Legacy-deprecated with Apache License 2.0 | 5 votes |
def _init_spark(self):
    """
    Set spark configuration

    :return: Spark session
    :return: Spark context
    """
    conf = SparkConf()

    # Link PG driver to Spark
    conf.set("spark.executor.extraClassPath",
             self.holoclean_path + "/" + self.pg_driver)
    conf.set("spark.driver.extraClassPath",
             self.holoclean_path + "/" + self.pg_driver)
    conf.set('spark.driver.memory', '20g')
    conf.set('spark.executor.memory', '20g')
    conf.set("spark.network.timeout", "6000")
    conf.set("spark.rpc.askTimeout", "99999")
    conf.set("spark.worker.timeout", "60000")
    conf.set("spark.driver.maxResultSize", '70g')
    conf.set("spark.ui.showConsoleProgress", "false")

    if self.spark_cluster:
        conf.set("spark.master", self.spark_cluster)

    # Gets Spark context
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")
    sql_ctxt = SQLContext(sc)
    return sql_ctxt.sparkSession, sql_ctxt
Example #18
Source File: hyperparameters_tuning.py From intro_ds with Apache License 2.0 | 5 votes |
def startSpark():
    """
    Create the SparkContext, which is the entry point of a Spark program
    """
    conf = SparkConf().setAppName("grid search example")
    sc = SparkContext(conf=conf)
    return sc
Example #19
Source File: reagent_sql_test_base.py From ReAgent with BSD 3-Clause "New" or "Revised" License | 5 votes |
def getConf(self):
    conf = SparkConf()
    for k, v in DEFAULT_SPARK_CONFIG.items():
        conf.set(k, v)
    return conf
Example #20
Source File: ClimatologySpark2.py From incubator-sdap-nexus with Apache License 2.0 | 5 votes |
def configureSpark(sparkConfig, appName, memoryPerExecutor='4G', coresPerExecutor=1):
    mode, numExecutors, numPartitions = sparkConfig.split(',')
    numExecutors = int(numExecutors)
    print >> sys.stderr, 'numExecutors = ', numExecutors
    numPartitions = int(numPartitions)
    print >> sys.stderr, 'numPartitions = ', numPartitions

    if mode == 'multicore':
        print >> sys.stderr, 'Using pysparkling'
        import pysparkling
        sc = pysparkling.Context()
    else:
        print >> sys.stderr, 'Using PySpark'
        sparkMaster = mode
        spConf = SparkConf()
        spConf.setAppName(appName)
        spConf.set("spark.executorEnv.HOME",
                   os.path.join(os.getenv('HOME'), 'spark_exec_home'))
        spConf.set("spark.executorEnv.PYTHONPATH", os.getcwd())
        spConf.set("spark.executor.memory", memoryPerExecutor)
        print >> sys.stderr, 'memoryPerExecutor = ', memoryPerExecutor
        try:
            sparkMaster = SparkMasterOverride
        except:
            pass
        if sparkMaster[:5] == "mesos":
            spConf.set("spark.cores.max", numExecutors)
        else:
            # Spark master is YARN or local[N]
            spConf.set("spark.executor.instances", numExecutors)
            spConf.set("spark.executor.cores", coresPerExecutor)
        spConf.setMaster(sparkMaster)
        sc = SparkContext(conf=spConf)

    return sc, numExecutors, numPartitions
Example #21
Source File: test.py From TensorFlowOnSpark with Apache License 2.0 | 5 votes |
def setUpClass(cls):
    master = os.getenv('MASTER')
    assert master is not None, "Please start a Spark standalone cluster and export MASTER to your env."

    num_workers = os.getenv('SPARK_WORKER_INSTANCES')
    assert num_workers is not None, "Please export SPARK_WORKER_INSTANCES to your env."
    cls.num_workers = int(num_workers)

    spark_jars = os.getenv('SPARK_CLASSPATH')
    assert spark_jars, "Please add path to tensorflow/ecosystem/hadoop jar to SPARK_CLASSPATH."

    cls.conf = SparkConf().set('spark.jars', spark_jars)
    cls.sc = SparkContext(master, cls.__name__, conf=cls.conf)
    cls.spark = SparkSession.builder.getOrCreate()
Example #22
Source File: taar_ensemble.py From telemetry-airflow with Mozilla Public License 2.0 | 5 votes |
def main(
    date,
    aws_access_key_id,
    aws_secret_access_key,
    bucket,
    prefix,
    elastic_net_param,
    reg_param,
    min_installed_addons,
    client_sample_date_from,
    sample_rate,
):
    print("Sampling clients since {}".format(client_sample_date_from))

    # Clobber the AWS access credentials
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key

    ctx = default_context()

    APP_NAME = "TaarEnsemble"
    conf = SparkConf().setAppName(APP_NAME)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    taar_training = extract(
        spark, client_sample_date_from, min_installed_addons, sample_rate
    )
    coefs = transform(ctx, spark, taar_training, reg_param, elastic_net_param)
    load(coefs, date, prefix, bucket)
Example #23
Source File: tests.py From pyspark-cassandra with Apache License 2.0 | 5 votes |
def setUpClass(cls):
    # connect to cassandra and create a keyspace for testing
    cls.session = Cluster().connect()
    cls.session.execute('''
        CREATE KEYSPACE IF NOT EXISTS %s
        WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
    ''' % (cls.keyspace,))
    cls.session.set_keyspace(CassandraTestCase.keyspace)

    # create a cassandra spark context
    cls.sc = CassandraSparkContext(
        conf=SparkConf().setAppName("PySpark Cassandra Test"))
Example #24
Source File: tasks.py From flask-spark-docker with MIT License | 5 votes |
def create_task(words):
    conf = SparkConf().setAppName('letter count')
    sc = SparkContext(conf=conf)
    seq = words.split()
    data = sc.parallelize(seq)
    counts = data.map(lambda word: (word, 1)).reduceByKey(add).collect()
    sc.stop()
    return dict(counts)
Example #25
Source File: context.py From LearningApacheSpark with MIT License | 5 votes |
def getOrCreate(cls, checkpointPath, setupFunc):
    """
    Either recreate a StreamingContext from checkpoint data or create a new StreamingContext.
    If checkpoint data exists in the provided `checkpointPath`, then StreamingContext will be
    recreated from the checkpoint data. If the data does not exist, then the provided setupFunc
    will be used to create a new context.

    @param checkpointPath: Checkpoint directory used in an earlier streaming program
    @param setupFunc: Function to create a new context and setup DStreams
    """
    cls._ensure_initialized()
    gw = SparkContext._gateway

    # Check whether valid checkpoint information exists in the given path
    ssc_option = gw.jvm.StreamingContextPythonHelper().tryRecoverFromCheckpoint(checkpointPath)
    if ssc_option.isEmpty():
        ssc = setupFunc()
        ssc.checkpoint(checkpointPath)
        return ssc

    jssc = gw.jvm.JavaStreamingContext(ssc_option.get())

    # If there is already an active instance of Python SparkContext use it, or create a new one
    if not SparkContext._active_spark_context:
        jsc = jssc.sparkContext()
        conf = SparkConf(_jconf=jsc.getConf())
        SparkContext(conf=conf, gateway=gw, jsc=jsc)

    sc = SparkContext._active_spark_context

    # update ctx in serializer
    cls._transformerSerializer.ctx = sc
    return StreamingContext(sc, None, jssc)
Example #26
Source File: launcher.py From spylon with BSD 3-Clause "New" or "Revised" License | 5 votes |
def spark_context(self, application_name):
    """Create a spark context given the parameters configured in this class.

    The caller is responsible for calling ``.close`` on the resulting spark context

    Parameters
    ----------
    application_name : string

    Returns
    -------
    sc : SparkContext
    """
    # initialize the spark configuration
    self._init_spark()
    import pyspark
    import pyspark.sql

    # initialize conf
    spark_conf = pyspark.SparkConf()
    for k, v in self._spark_conf_helper._conf_dict.items():
        spark_conf.set(k, v)

    log.info("Starting SparkContext")
    return pyspark.SparkContext(appName=application_name, conf=spark_conf)
Example #27
Source File: conftest.py From elephas with MIT License | 5 votes |
def sql_context(request):
    """ fixture for creating a Spark SQLContext
    Args:
        request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local[2]").setAppName(
        "pytest-pyspark-local-testing"))
    sc = SparkContext(conf=conf)
    sql_context = SQLContext(sc)
    request.addfinalizer(lambda: sc.stop())

    quiet_py4j()
    return sql_context
Example #28
Source File: conftest.py From elephas with MIT License | 5 votes |
def spark_context(request):
    """ fixture for creating a SparkContext
    Args:
        request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local[2]").setAppName(
        "pytest-pyspark-local-testing"))
    sc = SparkContext(conf=conf)
    request.addfinalizer(lambda: sc.stop())

    quiet_py4j()
    return sc
Example #29
Source File: test_deeds.py From cccatalog with MIT License | 5 votes |
def spark_context(request):
    conf = (SparkConf()
            .setMaster("spark://ec2-54-167-211-230.compute-1.amazonaws.com:7077")
            .setAppName("commonsmapper-pyspark-local-testing")
            .set("spark.jars", "../jars/hadoop-aws-2.8.1.jar,../jars/hadoop-auth-2.8.1.jar,../jars/aws-java-sdk-1.11.212.jar,../jars/postgresql-42.1.4.jar")
            .set("spark.driver.extraClassPath", "../jars/")
            )
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId",
                                      os.environ['OPEN_LEDGER_ACCESS_KEY_ID'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey",
                                      os.environ['OPEN_LEDGER_SECRET_ACCESS_KEY'])
    request.addfinalizer(lambda: sc.stop())

    return sc
Example #30
Source File: benchmark_spark.py From implicit with MIT License | 5 votes |
def benchmark_spark(ratings, factors, iterations=5):
    conf = (SparkConf()
            .setAppName("implicit_benchmark")
            .setMaster('local[*]')
            .set('spark.driver.memory', '16G')
            )
    context = SparkContext(conf=conf)
    spark = SparkSession(context)

    times = {}
    try:
        ratings = convert_sparse_to_dataframe(spark, context, ratings)

        for rank in factors:
            als = ALS(rank=rank, maxIter=iterations,
                      alpha=1, implicitPrefs=True,
                      userCol="row", itemCol="col", ratingCol="data")
            start = time.time()
            als.fit(ratings)
            elapsed = time.time() - start
            times[rank] = elapsed / iterations
            print("spark. factors=%i took %.3f" % (rank, elapsed / iterations))
    finally:
        spark.stop()

    return times