Python pyspark.conf.SparkConf() Examples
The following are 22 code examples of pyspark.conf.SparkConf(), collected from open-source projects. Each example is preceded by a note of the project and source file it was taken from. You may also want to check out the other available functions and classes of the pyspark.conf module.
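Before the examples, here is a minimal, self-contained sketch of the SparkConf API itself; the master URL, application name, and property values below are arbitrary placeholders, not taken from any of the examples.

from pyspark import SparkConf, SparkContext

# Build a configuration object; setters return the conf, so calls can be chained.
conf = (SparkConf()
        .setMaster("local[2]")           # placeholder master URL
        .setAppName("sparkconf-demo")    # placeholder application name
        .set("spark.executor.memory", "1g"))

# Inspect the configuration before handing it to a SparkContext.
print(conf.get("spark.executor.memory"))   # '1g'
print(conf.toDebugString())                # all explicitly set properties

sc = SparkContext(conf=conf)
print(sc.getConf().get("spark.app.name"))  # 'sparkconf-demo'
sc.stop()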
Example #1
Source File: cassandra_example.py From pyspark-cassandra with Apache License 2.0 | 6 votes |
def main():
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: example <keyspace_name> <column_family_name>"
        sys.exit(-1)

    keyspace_name = sys.argv[1]
    column_family_name = sys.argv[2]

    # Valid config options here https://github.com/datastax/spark-cassandra-connector/blob/master/doc/1_connecting.md
    conf = SparkConf().set("spark.cassandra.connection.host", "127.0.0.1")

    sc = SparkContext(appName="Spark + Cassandra Example", conf=conf)

    # import time; time.sleep(30)
    java_import(sc._gateway.jvm, "com.datastax.spark.connector.CassandraJavaUtil")
    print sc._jvm.CassandraJavaUtil

    users = (
        ["Mike", "Sukmanowsky"],
        ["Andrew", "Montalenti"],
        ["Keith", "Bourgoin"],
    )
    rdd = sc.parallelize(users)
    print rdd.collect()
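The example above uses Python 2 print statements. A hedged Python 3 sketch of the same SparkConf setup, keeping only the standard PySpark and py4j calls (the Cassandra connector jar must still be on the classpath for the java_import to be useful), might look like:

import sys
from py4j.java_gateway import java_import
from pyspark import SparkConf, SparkContext

def main():
    # Same connection property as above; 127.0.0.1 assumes a local Cassandra node.
    conf = SparkConf().set("spark.cassandra.connection.host", "127.0.0.1")
    sc = SparkContext(appName="Spark + Cassandra Example", conf=conf)

    # Expose the connector's Java helper class to the JVM gateway, as in the original.
    java_import(sc._gateway.jvm, "com.datastax.spark.connector.CassandraJavaUtil")

    users = [["Mike", "Sukmanowsky"], ["Andrew", "Montalenti"], ["Keith", "Bourgoin"]]
    rdd = sc.parallelize(users)
    print(rdd.collect())
    sc.stop()

if __name__ == "__main__":
    main()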
Example #2
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 6 votes |
def _create_shell_session():
    """
    Initialize a SparkSession for a pyspark shell session. This is called from
    shell.py to make error handling simpler without needing to declare local
    variables in that script, which would expose those to users.
    """
    import py4j
    from pyspark.conf import SparkConf
    from pyspark.context import SparkContext
    try:
        # Try to access HiveConf, it will raise exception if Hive is not added
        conf = SparkConf()
        if conf.get('spark.sql.catalogImplementation', 'hive').lower() == 'hive':
            SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
            return SparkSession.builder\
                .enableHiveSupport()\
                .getOrCreate()
        else:
            return SparkSession.builder.getOrCreate()
    except (py4j.protocol.Py4JError, TypeError):
        if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
            warnings.warn("Fall back to non-hive support because failing to access HiveConf, "
                          "please make sure you build spark with hive")
        return SparkSession.builder.getOrCreate()
Example #3
Source File: session.py From LearningApacheSpark with MIT License | 6 votes |
def _create_shell_session():
    """
    Initialize a SparkSession for a pyspark shell session. This is called from
    shell.py to make error handling simpler without needing to declare local
    variables in that script, which would expose those to users.
    """
    import py4j
    from pyspark.conf import SparkConf
    from pyspark.context import SparkContext
    try:
        # Try to access HiveConf, it will raise exception if Hive is not added
        conf = SparkConf()
        if conf.get('spark.sql.catalogImplementation', 'hive').lower() == 'hive':
            SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
            return SparkSession.builder\
                .enableHiveSupport()\
                .getOrCreate()
        else:
            return SparkSession.builder.getOrCreate()
    except (py4j.protocol.Py4JError, TypeError):
        if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
            warnings.warn("Fall back to non-hive support because failing to access HiveConf, "
                          "please make sure you build spark with hive")
        return SparkSession.builder.getOrCreate()
Example #4
Source File: tests.py From LearningApacheSpark with MIT License | 6 votes |
def test_user_configuration(self):
    """Make sure user configuration is respected (SPARK-19307)"""
    script = self.createTempFile("test.py", """
        |from pyspark import SparkConf, SparkContext
        |
        |conf = SparkConf().set("spark.test_config", "1")
        |sc = SparkContext(conf = conf)
        |try:
        |    if sc._conf.get("spark.test_config") != "1":
        |        raise Exception("Cannot find spark.test_config in SparkContext's conf.")
        |finally:
        |    sc.stop()
        """)
    proc = subprocess.Popen(
        self.sparkSubmit + ["--master", "local", script],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    out, err = proc.communicate()
    self.assertEqual(0, proc.returncode, msg="Process failed with error:\n {0}".format(out))
Example #5
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def test_profiler_disabled(self):
    sc = SparkContext(conf=SparkConf().set("spark.python.profile", "false"))
    try:
        self.assertRaisesRegexp(
            RuntimeError,
            "'spark.python.profile' configuration must be set",
            lambda: sc.show_profiles())
        self.assertRaisesRegexp(
            RuntimeError,
            "'spark.python.profile' configuration must be set",
            lambda: sc.dump_profiles("/tmp/abc"))
    finally:
        sc.stop()
Example #6
Source File: startup.py From sparklingml with Apache License 2.0 | 5 votes |
def registerFunction(self, ssc, jsession, function_name, params):
    jvm = self.gateway.jvm
    # If we don't have a reference to a running SparkContext,
    # get the SparkContext from the provided SparkSession.
    if not self._sc:
        master = ssc.master()
        jsc = jvm.org.apache.spark.api.java.JavaSparkContext(ssc)
        jsparkConf = ssc.conf()
        sparkConf = SparkConf(_jconf=jsparkConf)
        self._sc = SparkContext(
            master=master,
            conf=sparkConf,
            gateway=self.gateway,
            jsc=jsc)
        self._session = SparkSession.builder.getOrCreate()
    if function_name in functions_info:
        function_info = functions_info[function_name]
        if params:
            evaledParams = ast.literal_eval(params)
        else:
            evaledParams = []
        func = function_info.func(*evaledParams)
        ret_type = function_info.returnType()
        self._count = self._count + 1
        registration_name = function_name + str(self._count)
        udf = UserDefinedFunction(func, ret_type, registration_name)
        # Used to allow non-default (e.g. Arrow) UDFS
        udf.evalType = function_info.evalType()
        judf = udf._judf
        return judf
    else:
        print("Could not find function")
        # We do this rather than raising an exception since Py4J debugging
        # is rough and we can check it.
        return None
Example #7
Source File: test_broadcast.py From LearningApacheSpark with MIT License | 5 votes |
def setUpClass(cls):
    gateway = launch_gateway(SparkConf())
    cls._jvm = gateway.jvm
    cls.longMessage = True
    random.seed(42)
Example #8
Source File: test_broadcast.py From LearningApacheSpark with MIT License | 5 votes |
def _test_multiple_broadcasts(self, *extra_confs):
    """
    Test broadcast variables make it OK to the executors.
    Tests multiple broadcast variables, and also multiple jobs.
    """
    conf = SparkConf()
    for key, value in extra_confs:
        conf.set(key, value)
    conf.setMaster("local-cluster[2,1,1024]")
    self.sc = SparkContext(conf=conf)
    self._test_encryption_helper([5])
    self._test_encryption_helper([5, 10, 20])
Example #9
Source File: context.py From LearningApacheSpark with MIT License | 5 votes |
def getConf(self):
    conf = SparkConf()
    conf.setAll(self._conf.getAll())
    return conf
Example #10
Source File: context.py From LearningApacheSpark with MIT License | 5 votes |
def getOrCreate(cls, conf=None):
    """
    Get or instantiate a SparkContext and register it as a singleton object.

    :param conf: SparkConf (optional)
    """
    with SparkContext._lock:
        if SparkContext._active_spark_context is None:
            SparkContext(conf=conf or SparkConf())
        return SparkContext._active_spark_context
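A short usage sketch of this classmethod follows; the master URL and app name are placeholders. Because the SparkContext is registered as a singleton, a second call returns the same instance and any conf passed to it is ignored.

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[2]").setAppName("getorcreate-demo")  # placeholders
sc1 = SparkContext.getOrCreate(conf)
sc2 = SparkContext.getOrCreate()   # returns the already-registered singleton
assert sc1 is sc2
sc1.stop()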
Example #11
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 5 votes |
def config(self, key=None, value=None, conf=None):
    """Sets a config option. Options set using this method are automatically propagated to
    both :class:`SparkConf` and :class:`SparkSession`'s own configuration.

    For an existing SparkConf, use `conf` parameter.

    >>> from pyspark.conf import SparkConf
    >>> SparkSession.builder.config(conf=SparkConf())
    <pyspark.sql.session...

    For a (key, value) pair, you can omit parameter names.

    >>> SparkSession.builder.config("spark.some.config.option", "some-value")
    <pyspark.sql.session...

    :param key: a key name string for configuration property
    :param value: a value for configuration property
    :param conf: an instance of :class:`SparkConf`
    """
    with self._lock:
        if conf is None:
            self._options[key] = str(value)
        else:
            for (k, v) in conf.getAll():
                self._options[k] = v
        return self
Example #12
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def setUp(self):
    self._old_sys_path = list(sys.path)
    class_name = self.__class__.__name__
    conf = SparkConf().set("spark.python.profile", "true")
    self.sc = SparkContext('local[4]', class_name, conf=conf)
Example #13
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def conf(cls):
    """
    Override this in subclasses to supply a more specific conf
    """
    return SparkConf()
Example #14
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def test_external_sort_in_rdd(self):
    conf = SparkConf().set("spark.python.worker.memory", "1m")
    sc = SparkContext(conf=conf)
    l = list(range(10240))
    random.shuffle(l)
    rdd = sc.parallelize(l, 4)
    self.assertEqual(sorted(l), rdd.sortBy(lambda x: x).collect())
    sc.stop()
Example #15
Source File: session.py From eva with Apache License 2.0 | 5 votes |
def init_spark_session(self, application_name, spark_master=None):
    """Setup a spark session.

    :param spark_master: A master parameter used by spark session builder.
        Use default value (None) to use system environment configured spark cluster.
        Use 'local[*]' to run on a local box.

    :return: spark_session: A spark session
    """
    eva_spark_conf = SparkConf()
    eva_spark_conf.set('spark.logConf', 'true')

    session_builder = SparkSession \
        .builder \
        .appName(application_name) \
        .config(conf=eva_spark_conf)

    if spark_master:
        session_builder.master(spark_master)

    # Gets an existing SparkSession or,
    # if there is no existing one, creates a new one based
    # on the options set in this builder.
    self._session = session_builder.getOrCreate()

    # Configure logging
    log4j_level = LoggingManager().getLog4JLevel()
    spark_context = self._session.sparkContext
    spark_context.setLogLevel(log4j_level)
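The same setup can be expressed directly against the standard builder API. A minimal sketch follows, assuming a local run; the app name, master URL, and log level are placeholders rather than values taken from the project above.

from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf().set('spark.logConf', 'true')   # log the effective conf at startup

spark = (SparkSession.builder
         .appName("eva-demo")      # placeholder application name
         .master("local[*]")       # placeholder master; omit to use the environment's cluster
         .config(conf=conf)
         .getOrCreate())

spark.sparkContext.setLogLevel("WARN")   # placeholder log level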
Example #16
Source File: startup.py From sparklingml with Apache License 2.0 | 5 votes |
def spark_jvm_imports(jvm):
    # Import the classes used by PySpark
    java_import(jvm, "org.apache.spark.SparkConf")
    java_import(jvm, "org.apache.spark.api.java.*")
    java_import(jvm, "org.apache.spark.api.python.*")
    java_import(jvm, "org.apache.spark.ml.python.*")
    java_import(jvm, "org.apache.spark.mllib.api.python.*")
    # TODO(davies): move into sql
    java_import(jvm, "org.apache.spark.sql.*")
    java_import(jvm, "org.apache.spark.sql.hive.*")
    java_import(jvm, "scala.Tuple2")
Example #17
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 5 votes |
def config(self, key=None, value=None, conf=None):
    """Sets a config option. Options set using this method are automatically propagated to
    both :class:`SparkConf` and :class:`SparkSession`'s own configuration.

    For an existing SparkConf, use `conf` parameter.

    >>> from pyspark.conf import SparkConf
    >>> SparkSession.builder.config(conf=SparkConf())
    <pyspark.sql.session...

    For a (key, value) pair, you can omit parameter names.

    >>> SparkSession.builder.config("spark.some.config.option", "some-value")
    <pyspark.sql.session...

    :param key: a key name string for configuration property
    :param value: a value for configuration property
    :param conf: an instance of :class:`SparkConf`
    """
    with self._lock:
        if conf is None:
            self._options[key] = str(value)
        else:
            for (k, v) in conf.getAll():
                self._options[k] = v
        return self
Example #18
Source File: tests.py From pyspark-elastic with Apache License 2.0 | 5 votes |
def setUpClass(cls):
    conf = SparkConf()
    conf.set('spark.ui.showConsoleProgress', 'false')
    cls.sc = EsSparkContext(conf=conf.setAppName("PySpark Elastic Test"))
Example #19
Source File: context.py From LearningApacheSpark with MIT License | 4 votes |
def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
             environment=None, batchSize=0, serializer=PickleSerializer(), conf=None,
             gateway=None, jsc=None, profiler_cls=BasicProfiler):
    """
    Create a new SparkContext. At least the master and app name should be set,
    either through the named parameters here or through C{conf}.

    :param master: Cluster URL to connect to
           (e.g. mesos://host:port, spark://host:port, local[4]).
    :param appName: A name for your job, to display on the cluster web UI.
    :param sparkHome: Location where Spark is installed on cluster nodes.
    :param pyFiles: Collection of .zip or .py files to send to the cluster
           and add to PYTHONPATH. These can be paths on the local file
           system or HDFS, HTTP, HTTPS, or FTP URLs.
    :param environment: A dictionary of environment variables to set on
           worker nodes.
    :param batchSize: The number of Python objects represented as a single
           Java object. Set 1 to disable batching, 0 to automatically choose
           the batch size based on object sizes, or -1 to use an unlimited
           batch size
    :param serializer: The serializer for RDDs.
    :param conf: A L{SparkConf} object setting Spark properties.
    :param gateway: Use an existing gateway and JVM, otherwise a new JVM
           will be instantiated.
    :param jsc: The JavaSparkContext instance (optional).
    :param profiler_cls: A class of custom Profiler used to do profiling
           (default is pyspark.profiler.BasicProfiler).

    >>> from pyspark.context import SparkContext
    >>> sc = SparkContext('local', 'test')

    >>> sc2 = SparkContext('local', 'test2')  # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
        ...
    ValueError:...
    """
    self._callsite = first_spark_call() or CallSite(None, None, None)
    SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
    try:
        self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
                      conf, jsc, profiler_cls)
    except:
        # If an error occurs, clean up in order to allow future SparkContext creation:
        self.stop()
        raise
Example #20
Source File: session.py From LearningApacheSpark with MIT License | 4 votes |
def getOrCreate(self):
    """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a
    new one based on the options set in this builder.

    This method first checks whether there is a valid global default SparkSession, and if
    yes, return that one. If no valid global default SparkSession exists, the method
    creates a new SparkSession and assigns the newly created SparkSession as the global
    default.

    >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
    >>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1"
    True

    In case an existing SparkSession is returned, the config options specified
    in this builder will be applied to the existing SparkSession.

    >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
    >>> s1.conf.get("k1") == s2.conf.get("k1")
    True
    >>> s1.conf.get("k2") == s2.conf.get("k2")
    True
    """
    with self._lock:
        from pyspark.context import SparkContext
        from pyspark.conf import SparkConf
        session = SparkSession._instantiatedSession
        if session is None or session._sc._jsc is None:
            sparkConf = SparkConf()
            for key, value in self._options.items():
                sparkConf.set(key, value)
            sc = SparkContext.getOrCreate(sparkConf)
            # This SparkContext may be an existing one.
            for key, value in self._options.items():
                # we need to propagate the confs
                # before we create the SparkSession. Otherwise, confs like
                # warehouse path and metastore url will not be set correctly (
                # these confs cannot be changed once the SparkSession is created).
                sc._conf.set(key, value)
            session = SparkSession(sc)
        for key, value in self._options.items():
            session._jsparkSession.sessionState().conf().setConfString(key, value)
        for key, value in self._options.items():
            session.sparkContext._conf.set(key, value)
        return session
Example #21
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 4 votes |
def getOrCreate(self):
    """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a
    new one based on the options set in this builder.

    This method first checks whether there is a valid global default SparkSession, and if
    yes, return that one. If no valid global default SparkSession exists, the method
    creates a new SparkSession and assigns the newly created SparkSession as the global
    default.

    >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
    >>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1"
    True

    In case an existing SparkSession is returned, the config options specified
    in this builder will be applied to the existing SparkSession.

    >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
    >>> s1.conf.get("k1") == s2.conf.get("k1")
    True
    >>> s1.conf.get("k2") == s2.conf.get("k2")
    True
    """
    with self._lock:
        from pyspark.context import SparkContext
        from pyspark.conf import SparkConf
        session = SparkSession._instantiatedSession
        if session is None or session._sc._jsc is None:
            sparkConf = SparkConf()
            for key, value in self._options.items():
                sparkConf.set(key, value)
            sc = SparkContext.getOrCreate(sparkConf)
            # This SparkContext may be an existing one.
            for key, value in self._options.items():
                # we need to propagate the confs
                # before we create the SparkSession. Otherwise, confs like
                # warehouse path and metastore url will not be set correctly (
                # these confs cannot be changed once the SparkSession is created).
                sc._conf.set(key, value)
            session = SparkSession(sc)
        for key, value in self._options.items():
            session._jsparkSession.sessionState().conf().setConfString(key, value)
        for key, value in self._options.items():
            session.sparkContext._conf.set(key, value)
        return session
Example #22
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 4 votes |
def getOrCreate(self):
    """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a
    new one based on the options set in this builder.

    This method first checks whether there is a valid global default SparkSession, and if
    yes, return that one. If no valid global default SparkSession exists, the method
    creates a new SparkSession and assigns the newly created SparkSession as the global
    default.

    >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
    >>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1"
    True

    In case an existing SparkSession is returned, the config options specified
    in this builder will be applied to the existing SparkSession.

    >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
    >>> s1.conf.get("k1") == s2.conf.get("k1")
    True
    >>> s1.conf.get("k2") == s2.conf.get("k2")
    True
    """
    with self._lock:
        from pyspark.context import SparkContext
        from pyspark.conf import SparkConf
        session = SparkSession._instantiatedSession
        if session is None or session._sc._jsc is None:
            sparkConf = SparkConf()
            for key, value in self._options.items():
                sparkConf.set(key, value)
            sc = SparkContext.getOrCreate(sparkConf)
            # This SparkContext may be an existing one.
            for key, value in self._options.items():
                # we need to propagate the confs
                # before we create the SparkSession. Otherwise, confs like
                # warehouse path and metastore url will not be set correctly (
                # these confs cannot be changed once the SparkSession is created).
                sc._conf.set(key, value)
            session = SparkSession(sc)
        for key, value in self._options.items():
            session._jsparkSession.sessionState().conf().setConfString(key, value)
        for key, value in self._options.items():
            session.sparkContext._conf.set(key, value)
        return session