Python pyspark.conf.SparkConf() Examples
The following are 22 code examples of pyspark.conf.SparkConf(), collected from open-source projects. Each example is preceded by a note of the project and source file it was taken from. You may also want to check out the other available functions and classes of the pyspark.conf module.
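Before the examples, here is a minimal, self-contained sketch of the SparkConf API itself; the master URL, application name, and property values below are arbitrary placeholders, not taken from any of the examples.

from pyspark import SparkConf, SparkContext

# Build a configuration object; setters return the conf, so calls can be chained.
conf = (SparkConf()
        .setMaster("local[2]")           # placeholder master URL
        .setAppName("sparkconf-demo")    # placeholder application name
        .set("spark.executor.memory", "1g"))

# Inspect the configuration before handing it to a SparkContext.
print(conf.get("spark.executor.memory"))   # '1g'
print(conf.toDebugString())                # all explicitly set properties

sc = SparkContext(conf=conf)
print(sc.getConf().get("spark.app.name"))  # 'sparkconf-demo'
sc.stop()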
Example #1
Source File: cassandra_example.py From pyspark-cassandra with Apache License 2.0 | 6 votes |
def main():
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: example <keyspace_name> <column_family_name>"
        sys.exit(-1)

    keyspace_name = sys.argv[1]
    column_family_name = sys.argv[2]

    # Valid config options here https://github.com/datastax/spark-cassandra-connector/blob/master/doc/1_connecting.md
    conf = SparkConf().set("spark.cassandra.connection.host", "127.0.0.1")

    sc = SparkContext(appName="Spark + Cassandra Example", conf=conf)

    # import time; time.sleep(30)
    java_import(sc._gateway.jvm, "com.datastax.spark.connector.CassandraJavaUtil")
    print sc._jvm.CassandraJavaUtil

    users = (
        ["Mike", "Sukmanowsky"],
        ["Andrew", "Montalenti"],
        ["Keith", "Bourgoin"],
    )
    rdd = sc.parallelize(users)
    print rdd.collect()
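The example above uses Python 2 print statements. A hedged Python 3 sketch of the same SparkConf setup, keeping only the standard PySpark and py4j calls (the Cassandra connector jar must still be on the classpath for the java_import to be useful), might look like:

import sys
from py4j.java_gateway import java_import
from pyspark import SparkConf, SparkContext

def main():
    # Same connection property as above; 127.0.0.1 assumes a local Cassandra node.
    conf = SparkConf().set("spark.cassandra.connection.host", "127.0.0.1")
    sc = SparkContext(appName="Spark + Cassandra Example", conf=conf)

    # Expose the connector's Java helper class to the JVM gateway, as in the original.
    java_import(sc._gateway.jvm, "com.datastax.spark.connector.CassandraJavaUtil")

    users = [["Mike", "Sukmanowsky"], ["Andrew", "Montalenti"], ["Keith", "Bourgoin"]]
    rdd = sc.parallelize(users)
    print(rdd.collect())
    sc.stop()

if __name__ == "__main__":
    main()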
Example #2
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 6 votes |
def _create_shell_session():
    """
    Initialize a SparkSession for a pyspark shell session. This is called from
    shell.py to make error handling simpler without needing to declare local
    variables in that script, which would expose those to users.
    """
    import py4j
    from pyspark.conf import SparkConf
    from pyspark.context import SparkContext
    try:
        # Try to access HiveConf, it will raise exception if Hive is not added
        conf = SparkConf()
        if conf.get('spark.sql.catalogImplementation', 'hive').lower() == 'hive':
            SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
            return SparkSession.builder\
                .enableHiveSupport()\
                .getOrCreate()
        else:
            return SparkSession.builder.getOrCreate()
    except (py4j.protocol.Py4JError, TypeError):
        if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
            warnings.warn("Fall back to non-hive support because failing to access HiveConf, "
                          "please make sure you build spark with hive")
        return SparkSession.builder.getOrCreate()
Example #3
Source File: session.py From LearningApacheSpark with MIT License | 6 votes |
def _create_shell_session():
    """
    Initialize a SparkSession for a pyspark shell session. This is called from
    shell.py to make error handling simpler without needing to declare local
    variables in that script, which would expose those to users.
    """
    import py4j
    from pyspark.conf import SparkConf
    from pyspark.context import SparkContext
    try:
        # Try to access HiveConf, it will raise exception if Hive is not added
        conf = SparkConf()
        if conf.get('spark.sql.catalogImplementation', 'hive').lower() == 'hive':
            SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
            return SparkSession.builder\
                .enableHiveSupport()\
                .getOrCreate()
        else:
            return SparkSession.builder.getOrCreate()
    except (py4j.protocol.Py4JError, TypeError):
        if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
            warnings.warn("Fall back to non-hive support because failing to access HiveConf, "
                          "please make sure you build spark with hive")
        return SparkSession.builder.getOrCreate()
Example #4
Source File: tests.py From LearningApacheSpark with MIT License | 6 votes |
def test_user_configuration(self):
    """Make sure user configuration is respected (SPARK-19307)"""
    script = self.createTempFile("test.py", """
        |from pyspark import SparkConf, SparkContext
        |
        |conf = SparkConf().set("spark.test_config", "1")
        |sc = SparkContext(conf = conf)
        |try:
        |    if sc._conf.get("spark.test_config") != "1":
        |        raise Exception("Cannot find spark.test_config in SparkContext's conf.")
        |finally:
        |    sc.stop()
        """)
    proc = subprocess.Popen(
        self.sparkSubmit + ["--master", "local", script],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    out, err = proc.communicate()
    self.assertEqual(0, proc.returncode, msg="Process failed with error:\n {0}".format(out))
Example #5
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def test_profiler_disabled(self):
    sc = SparkContext(conf=SparkConf().set("spark.python.profile", "false"))
    try:
        self.assertRaisesRegexp(
            RuntimeError,
            "'spark.python.profile' configuration must be set",
            lambda: sc.show_profiles())
        self.assertRaisesRegexp(
            RuntimeError,
            "'spark.python.profile' configuration must be set",
            lambda: sc.dump_profiles("/tmp/abc"))
    finally:
        sc.stop()
Example #6
Source File: startup.py From sparklingml with Apache License 2.0 | 5 votes |
def registerFunction(self, ssc, jsession, function_name, params):
    jvm = self.gateway.jvm
    # If we don't have a reference to a running SparkContext,
    # get the SparkContext from the provided SparkSession.
    if not self._sc:
        master = ssc.master()
        jsc = jvm.org.apache.spark.api.java.JavaSparkContext(ssc)
        jsparkConf = ssc.conf()
        sparkConf = SparkConf(_jconf=jsparkConf)
        self._sc = SparkContext(
            master=master,
            conf=sparkConf,
            gateway=self.gateway,
            jsc=jsc)
        self._session = SparkSession.builder.getOrCreate()
    if function_name in functions_info:
        function_info = functions_info[function_name]
        if params:
            evaledParams = ast.literal_eval(params)
        else:
            evaledParams = []
        func = function_info.func(*evaledParams)
        ret_type = function_info.returnType()
        self._count = self._count + 1
        registration_name = function_name + str(self._count)
        udf = UserDefinedFunction(func, ret_type, registration_name)
        # Used to allow non-default (e.g. Arrow) UDFS
        udf.evalType = function_info.evalType()
        judf = udf._judf
        return judf
    else:
        print("Could not find function")
        # We do this rather than raising an exception since Py4J debugging
        # is rough and we can check it.
        return None
Example #7
Source File: test_broadcast.py From LearningApacheSpark with MIT License | 5 votes |
def setUpClass(cls):
    gateway = launch_gateway(SparkConf())
    cls._jvm = gateway.jvm
    cls.longMessage = True
    random.seed(42)
Example #8
Source File: test_broadcast.py From LearningApacheSpark with MIT License | 5 votes |
def _test_multiple_broadcasts(self, *extra_confs):
    """
    Test broadcast variables make it OK to the executors.
    Tests multiple broadcast variables, and also multiple jobs.
    """
    conf = SparkConf()
    for key, value in extra_confs:
        conf.set(key, value)
    conf.setMaster("local-cluster[2,1,1024]")
    self.sc = SparkContext(conf=conf)
    self._test_encryption_helper([5])
    self._test_encryption_helper([5, 10, 20])
Example #9
Source File: context.py From LearningApacheSpark with MIT License | 5 votes |
def getConf(self):
    conf = SparkConf()
    conf.setAll(self._conf.getAll())
    return conf
Example #10
Source File: context.py From LearningApacheSpark with MIT License | 5 votes |
def getOrCreate(cls, conf=None):
    """
    Get or instantiate a SparkContext and register it as a singleton object.

    :param conf: SparkConf (optional)
    """
    with SparkContext._lock:
        if SparkContext._active_spark_context is None:
            SparkContext(conf=conf or SparkConf())
        return SparkContext._active_spark_context
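A short usage sketch of this classmethod follows; the master URL and app name are placeholders. Because the SparkContext is registered as a singleton, a second call returns the same instance and any conf passed to it is ignored.

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[2]").setAppName("getorcreate-demo")  # placeholders
sc1 = SparkContext.getOrCreate(conf)
sc2 = SparkContext.getOrCreate()   # returns the already-registered singleton
assert sc1 is sc2
sc1.stop()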
Example #11
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 5 votes |
def config(self, key=None, value=None, conf=None):
    """Sets a config option. Options set using this method are automatically propagated to
    both :class:`SparkConf` and :class:`SparkSession`'s own configuration.

    For an existing SparkConf, use `conf` parameter.

    >>> from pyspark.conf import SparkConf
    >>> SparkSession.builder.config(conf=SparkConf())
    <pyspark.sql.session...

    For a (key, value) pair, you can omit parameter names.

    >>> SparkSession.builder.config("spark.some.config.option", "some-value")
    <pyspark.sql.session...

    :param key: a key name string for configuration property
    :param value: a value for configuration property
    :param conf: an instance of :class:`SparkConf`
    """
    with self._lock:
        if conf is None:
            self._options[key] = str(value)
        else:
            for (k, v) in conf.getAll():
                self._options[k] = v
        return self
Example #12
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def setUp(self):
    self._old_sys_path = list(sys.path)
    class_name = self.__class__.__name__
    conf = SparkConf().set("spark.python.profile", "true")
    self.sc = SparkContext('local[4]', class_name, conf=conf)
Example #13
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def conf(cls):
    """
    Override this in subclasses to supply a more specific conf
    """
    return SparkConf()
Example #14
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def test_external_sort_in_rdd(self):
    conf = SparkConf().set("spark.python.worker.memory", "1m")
    sc = SparkContext(conf=conf)
    l = list(range(10240))
    random.shuffle(l)
    rdd = sc.parallelize(l, 4)
    self.assertEqual(sorted(l), rdd.sortBy(lambda x: x).collect())
    sc.stop()
Example #15
Source File: session.py From eva with Apache License 2.0 | 5 votes |
def init_spark_session(self, application_name, spark_master=None):
    """Setup a spark session.

    :param spark_master: A master parameter used by spark session builder.
        Use default value (None) to use system environment configured spark cluster.
        Use 'local[*]' to run on a local box.

    :return: spark_session: A spark session
    """
    eva_spark_conf = SparkConf()
    eva_spark_conf.set('spark.logConf', 'true')

    session_builder = SparkSession \
        .builder \
        .appName(application_name) \
        .config(conf=eva_spark_conf)

    if spark_master:
        session_builder.master(spark_master)

    # Gets an existing SparkSession or,
    # if there is no existing one, creates a new one based
    # on the options set in this builder.
    self._session = session_builder.getOrCreate()

    # Configure logging
    log4j_level = LoggingManager().getLog4JLevel()
    spark_context = self._session.sparkContext
    spark_context.setLogLevel(log4j_level)
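The same setup can be expressed directly against the standard builder API. A minimal sketch follows, assuming a local run; the app name, master URL, and log level are placeholders rather than values taken from the project above.

from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf().set('spark.logConf', 'true')   # log the effective conf at startup

spark = (SparkSession.builder
         .appName("eva-demo")      # placeholder application name
         .master("local[*]")       # placeholder master; omit to use the environment's cluster
         .config(conf=conf)
         .getOrCreate())

spark.sparkContext.setLogLevel("WARN")   # placeholder log level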
Example #16
Source File: startup.py From sparklingml with Apache License 2.0 | 5 votes |
def spark_jvm_imports(jvm):
    # Import the classes used by PySpark
    java_import(jvm, "org.apache.spark.SparkConf")
    java_import(jvm, "org.apache.spark.api.java.*")
    java_import(jvm, "org.apache.spark.api.python.*")
    java_import(jvm, "org.apache.spark.ml.python.*")
    java_import(jvm, "org.apache.spark.mllib.api.python.*")
    # TODO(davies): move into sql
    java_import(jvm, "org.apache.spark.sql.*")
    java_import(jvm, "org.apache.spark.sql.hive.*")
    java_import(jvm, "scala.Tuple2")
Example #17
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 5 votes |
def config(self, key=None, value=None, conf=None):
    """Sets a config option. Options set using this method are automatically propagated to
    both :class:`SparkConf` and :class:`SparkSession`'s own configuration.

    For an existing SparkConf, use `conf` parameter.

    >>> from pyspark.conf import SparkConf
    >>> SparkSession.builder.config(conf=SparkConf())
    <pyspark.sql.session...

    For a (key, value) pair, you can omit parameter names.

    >>> SparkSession.builder.config("spark.some.config.option", "some-value")
    <pyspark.sql.session...

    :param key: a key name string for configuration property
    :param value: a value for configuration property
    :param conf: an instance of :class:`SparkConf`
    """
    with self._lock:
        if conf is None:
            self._options[key] = str(value)
        else:
            for (k, v) in conf.getAll():
                self._options[k] = v
        return self
Example #18
Source File: tests.py From pyspark-elastic with Apache License 2.0 | 5 votes |
def setUpClass(cls):
    conf = SparkConf()
    conf.set('spark.ui.showConsoleProgress', 'false')
    cls.sc = EsSparkContext(conf=conf.setAppName("PySpark Elastic Test"))
Example #19
Source File: context.py From LearningApacheSpark with MIT License | 4 votes |
def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
             environment=None, batchSize=0, serializer=PickleSerializer(), conf=None,
             gateway=None, jsc=None, profiler_cls=BasicProfiler):
    """
    Create a new SparkContext. At least the master and app name should be set,
    either through the named parameters here or through C{conf}.

    :param master: Cluster URL to connect to
           (e.g. mesos://host:port, spark://host:port, local[4]).
    :param appName: A name for your job, to display on the cluster web UI.
    :param sparkHome: Location where Spark is installed on cluster nodes.
    :param pyFiles: Collection of .zip or .py files to send to the cluster
           and add to PYTHONPATH. These can be paths on the local file
           system or HDFS, HTTP, HTTPS, or FTP URLs.
    :param environment: A dictionary of environment variables to set on
           worker nodes.
    :param batchSize: The number of Python objects represented as a single
           Java object. Set 1 to disable batching, 0 to automatically choose
           the batch size based on object sizes, or -1 to use an unlimited
           batch size
    :param serializer: The serializer for RDDs.
    :param conf: A L{SparkConf} object setting Spark properties.
    :param gateway: Use an existing gateway and JVM, otherwise a new JVM
           will be instantiated.
    :param jsc: The JavaSparkContext instance (optional).
    :param profiler_cls: A class of custom Profiler used to do profiling
           (default is pyspark.profiler.BasicProfiler).

    >>> from pyspark.context import SparkContext
    >>> sc = SparkContext('local', 'test')

    >>> sc2 = SparkContext('local', 'test2')  # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
        ...
    ValueError:...
    """
    self._callsite = first_spark_call() or CallSite(None, None, None)
    SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
    try:
        self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
                      conf, jsc, profiler_cls)
    except:
        # If an error occurs, clean up in order to allow future SparkContext creation:
        self.stop()
        raise
Example #20
Source File: session.py From LearningApacheSpark with MIT License | 4 votes |
def getOrCreate(self):
    """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a
    new one based on the options set in this builder.

    This method first checks whether there is a valid global default SparkSession, and if
    yes, return that one. If no valid global default SparkSession exists, the method
    creates a new SparkSession and assigns the newly created SparkSession as the global
    default.

    >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
    >>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1"
    True

    In case an existing SparkSession is returned, the config options specified
    in this builder will be applied to the existing SparkSession.

    >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
    >>> s1.conf.get("k1") == s2.conf.get("k1")
    True
    >>> s1.conf.get("k2") == s2.conf.get("k2")
    True
    """
    with self._lock:
        from pyspark.context import SparkContext
        from pyspark.conf import SparkConf
        session = SparkSession._instantiatedSession
        if session is None or session._sc._jsc is None:
            sparkConf = SparkConf()
            for key, value in self._options.items():
                sparkConf.set(key, value)
            sc = SparkContext.getOrCreate(sparkConf)
            # This SparkContext may be an existing one.
            for key, value in self._options.items():
                # we need to propagate the confs
                # before we create the SparkSession. Otherwise, confs like
                # warehouse path and metastore url will not be set correctly (
                # these confs cannot be changed once the SparkSession is created).
                sc._conf.set(key, value)
            session = SparkSession(sc)
        for key, value in self._options.items():
            session._jsparkSession.sessionState().conf().setConfString(key, value)
        for key, value in self._options.items():
            session.sparkContext._conf.set(key, value)
        return session
Example #21
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 4 votes |
def getOrCreate(self):
    """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a
    new one based on the options set in this builder.

    This method first checks whether there is a valid global default SparkSession, and if
    yes, return that one. If no valid global default SparkSession exists, the method
    creates a new SparkSession and assigns the newly created SparkSession as the global
    default.

    >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
    >>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1"
    True

    In case an existing SparkSession is returned, the config options specified
    in this builder will be applied to the existing SparkSession.

    >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
    >>> s1.conf.get("k1") == s2.conf.get("k1")
    True
    >>> s1.conf.get("k2") == s2.conf.get("k2")
    True
    """
    with self._lock:
        from pyspark.context import SparkContext
        from pyspark.conf import SparkConf
        session = SparkSession._instantiatedSession
        if session is None or session._sc._jsc is None:
            sparkConf = SparkConf()
            for key, value in self._options.items():
                sparkConf.set(key, value)
            sc = SparkContext.getOrCreate(sparkConf)
            # This SparkContext may be an existing one.
            for key, value in self._options.items():
                # we need to propagate the confs
                # before we create the SparkSession. Otherwise, confs like
                # warehouse path and metastore url will not be set correctly (
                # these confs cannot be changed once the SparkSession is created).
                sc._conf.set(key, value)
            session = SparkSession(sc)
        for key, value in self._options.items():
            session._jsparkSession.sessionState().conf().setConfString(key, value)
        for key, value in self._options.items():
            session.sparkContext._conf.set(key, value)
        return session
Example #22
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 4 votes |
def getOrCreate(self):
    """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a
    new one based on the options set in this builder.

    This method first checks whether there is a valid global default SparkSession, and if
    yes, return that one. If no valid global default SparkSession exists, the method
    creates a new SparkSession and assigns the newly created SparkSession as the global
    default.

    >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
    >>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1"
    True

    In case an existing SparkSession is returned, the config options specified
    in this builder will be applied to the existing SparkSession.

    >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
    >>> s1.conf.get("k1") == s2.conf.get("k1")
    True
    >>> s1.conf.get("k2") == s2.conf.get("k2")
    True
    """
    with self._lock:
        from pyspark.context import SparkContext
        from pyspark.conf import SparkConf
        session = SparkSession._instantiatedSession
        if session is None or session._sc._jsc is None:
            sparkConf = SparkConf()
            for key, value in self._options.items():
                sparkConf.set(key, value)
            sc = SparkContext.getOrCreate(sparkConf)
            # This SparkContext may be an existing one.
            for key, value in self._options.items():
                # we need to propagate the confs
                # before we create the SparkSession. Otherwise, confs like
                # warehouse path and metastore url will not be set correctly (
                # these confs cannot be changed once the SparkSession is created).
                sc._conf.set(key, value)
            session = SparkSession(sc)
        for key, value in self._options.items():
            session._jsparkSession.sessionState().conf().setConfString(key, value)
        for key, value in self._options.items():
            session.sparkContext._conf.set(key, value)
        return session