Python pyspark.context.SparkContext() Examples

Example #1
Source: spark-cluster-deployment (Apache License 2.0), 6 votes
def test_module_dependency_on_cluster(self):
        """Submit and test a script with a dependency on another module on a cluster"""
        # NOTE(review): the file names and the closing delimiters of both
        # literals were lost in this excerpt.  "mylib.py" follows from the
        # script's ``from mylib import myfunc``; "test.py" is an assumed name.
        # The leading "|" margins are presumably stripped by the helpers.
        script = self.createTempFile("test.py", """
            |from pyspark import SparkContext
            |from mylib import myfunc
            |sc = SparkContext()
            |print sc.parallelize([1, 2, 3]).map(myfunc).collect()
            """)
        zip_path = self.createFileInZip("mylib.py", """
            |def myfunc(x):
            |    return x + 1
            """)
        # stdout must be piped, otherwise communicate() returns None for `out`
        # and the assertIn below can never succeed.
        proc = subprocess.Popen(
            [self.sparkSubmit, "--py-files", zip_path, "--master", "local-cluster[1,1,512]", script],
            stdout=subprocess.PIPE)
        out, err = proc.communicate()
        self.assertEqual(0, proc.returncode)
        self.assertIn("[2, 3, 4]", out)
Example #2
Source: spark-cluster-deployment (Apache License 2.0), 6 votes
def test_module_dependency(self):
        """Submit and test a script with a dependency on another module"""
        # NOTE(review): the file names and the closing delimiters of both
        # literals were lost in this excerpt.  "mylib.py" follows from the
        # script's ``from mylib import myfunc``; "test.py" is an assumed name.
        # The leading "|" margins are presumably stripped by the helpers.
        script = self.createTempFile("test.py", """
            |from pyspark import SparkContext
            |from mylib import myfunc
            |sc = SparkContext()
            |print sc.parallelize([1, 2, 3]).map(myfunc).collect()
            """)
        zip_path = self.createFileInZip("mylib.py", """
            |def myfunc(x):
            |    return x + 1
            """)
        # stdout must be piped, otherwise communicate() returns None for `out`
        # and the assertIn below can never succeed.
        proc = subprocess.Popen([self.sparkSubmit, "--py-files", zip_path, script],
                                stdout=subprocess.PIPE)
        out, err = proc.communicate()
        self.assertEqual(0, proc.returncode)
        self.assertIn("[2, 3, 4]", out)
Example #3
Source: sparklingpandas (Apache License 2.0), 5 votes
def __init__(self, spark_context, sql_ctx=None):
        """Initialize a PSparkContext with the associated spark context,
        and Spark SQL context if provided. This context is used to load
        data into L{DataFrame}s.

        Parameters
        ----------
        spark_context: SparkContext
            Initialized and configured spark context. If you are running in the
            PySpark shell, this is already created as "sc".
        sql_ctx: SQLContext, optional
            Initialized and configured SQL context, if not provided Sparkling
            Panda's will create one.

        Returns
        -------
        Correctly initialized SparklingPandasContext.
        """
        self.spark_ctx = spark_context
        if sql_ctx:
            self.sql_ctx = sql_ctx
        else:
            # NOTE(review): the callee that emitted this message was lost in
            # the excerpt; print() is assumed -- confirm against upstream.
            print("No sql context provided, creating")
            # Imported lazily so the SQL module is only needed on this path.
            from pyspark.sql import SQLContext
            self.sql_ctx = SQLContext(self.spark_ctx)
        # Register our magical functions
        # NOTE(review): the registration call that followed this comment is
        # not visible in this excerpt and has not been reconstructed.
Example #4
Source: spark-cluster-deployment (Apache License 2.0), 5 votes
def _test():
    """Run this module's doctests with a local SparkContext bound to ``sc``.

    Exits the interpreter with a non-zero status when any doctest fails.
    """
    import doctest
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2)
    try:
        (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    finally:
        # Always release the context, even if the doctest run itself raises.
        globs['sc'].stop()
    if failure_count:
        # NOTE(review): the body of this branch was lost in the excerpt; a
        # non-zero exit is assumed -- confirm against upstream.
        raise SystemExit(-1)
Example #5
Source: spark-cluster-deployment (Apache License 2.0), 5 votes
def _test():
    """Run this module's doctests with ``sc``, ``sqlCtx`` and ``rdd`` predefined.

    Exits the interpreter with a non-zero status when any doctest fails.
    """
    import doctest
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext('local[4]', 'PythonTest', batchSize=2)
    try:
        globs['sc'] = sc
        globs['sqlCtx'] = SQLContext(sc)
        globs['rdd'] = sc.parallelize([
            {"field1": 1, "field2": "row1"},
            {"field1": 2, "field2": "row2"},
            {"field1": 3, "field2": "row3"},
        ])
        (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    finally:
        # Always release the context, even if setup or the doctest run raises.
        sc.stop()
    if failure_count:
        # NOTE(review): the body of this branch was lost in the excerpt; a
        # non-zero exit is assumed -- confirm against upstream.
        raise SystemExit(-1)
Example #6
Source: spark-cluster-deployment (Apache License 2.0), 5 votes
def __init__(self, sparkContext, sqlContext=None):
        """Create a new SQLContext.

        @param sparkContext: The SparkContext to wrap.
        @param sqlContext: Optional existing SQL context; when truthy it is
            stored as C{_scala_SQLContext}.

        NOTE(review): the tails of the two tracebacks below (the "..." line
        and the exception line) were lost in this excerpt; C{ValueError} is
        assumed -- confirm against upstream.

        >>> srdd = sqlCtx.inferSchema(rdd)
        >>> sqlCtx.inferSchema(srdd) # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...

        >>> bad_rdd = sc.parallelize([1,2,3])
        >>> sqlCtx.inferSchema(bad_rdd) # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...

        >>> allTypes = sc.parallelize([{"int" : 1, "string" : "string", "double" : 1.0, "long": 1L,
        ... "boolean" : True}])
        >>> srdd = sqlCtx.inferSchema(allTypes).map(lambda x: (x.int, x.string, x.double, x.long,
        ... x.boolean))
        >>> srdd.collect()[0]
        (1, u'string', 1.0, 1, True)
        """
        self._sc = sparkContext
        self._jsc = self._sc._jsc
        self._jvm = self._sc._jvm
        self._pythonToJavaMap = self._jvm.PythonRDD.pythonToJavaMap

        if sqlContext:
            self._scala_SQLContext = sqlContext
Example #7
Source: spark-cluster-deployment (Apache License 2.0), 5 votes
def test_script_with_local_functions(self):
        """Submit and test a single script file calling a global function"""
        # NOTE(review): the script's file name and closing delimiter were lost
        # in this excerpt; "test.py" is an assumed name.  The leading "|"
        # margins are presumably stripped by createTempFile.
        script = self.createTempFile("test.py", """
            |from pyspark import SparkContext
            |def foo(x):
            |    return x * 3
            |sc = SparkContext()
            |print sc.parallelize([1, 2, 3]).map(foo).collect()
            """)
        proc = subprocess.Popen([self.sparkSubmit, script], stdout=subprocess.PIPE)
        out, err = proc.communicate()
        self.assertEqual(0, proc.returncode)
        self.assertIn("[3, 6, 9]", out)
Example #8
Source: spark-cluster-deployment (Apache License 2.0), 5 votes
def test_single_script(self):
        """Submit and test a single script file"""
        # NOTE(review): the script's file name and closing delimiter were lost
        # in this excerpt; "test.py" is an assumed name.  The leading "|"
        # margins are presumably stripped by createTempFile.
        script = self.createTempFile("test.py", """
            |from pyspark import SparkContext
            |sc = SparkContext()
            |print sc.parallelize([1, 2, 3]).map(lambda x: x * 2).collect()
            """)
        proc = subprocess.Popen([self.sparkSubmit, script], stdout=subprocess.PIPE)
        out, err = proc.communicate()
        self.assertEqual(0, proc.returncode)
        self.assertIn("[2, 4, 6]", out)
Example #9
Source: spark-cluster-deployment (Apache License 2.0), 5 votes
def setUp(self):
        """Snapshot ``sys.path`` and start a local SparkContext named after the test class."""
        # Copy, not alias, so later mutations of sys.path leave the snapshot intact.
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        # NOTE(review): the assignment target was fused into the previous
        # statement in this excerpt; ``self.sc`` is assumed -- confirm.
        self.sc = SparkContext('local[4]', class_name, batchSize=2)
Example #10
Source: sparklingpandas (Apache License 2.0), 5 votes
def stop(self):
        """Stop the underlying SparkContext
        """
        # NOTE(review): the body was lost in this excerpt; ``spark_ctx`` is
        # assumed to be the attribute holding the wrapped SparkContext --
        # confirm against the class's __init__.
        self.spark_ctx.stop()
Example #11
Source: sparklingpandas (Apache License 2.0), 5 votes
def simple(cls, *args, **kwargs):
        """Takes the same arguments as SparkContext and constructs a
        PSparkContext wrapping the newly created SparkContext.

        NOTE(review): ``cls`` is unused; the result is always a PSparkContext,
        even when called on a subclass.
        """
        return PSparkContext(SparkContext(*args, **kwargs))
Example #12
Source: pyspark2pmml (GNU Affero General Public License v3.0), 5 votes
def setUp(self):
		"""Create a fresh SparkContext and an SQLContext bound to it for each test."""
		# NOTE(review): the assignment target and the SQLContext argument were
		# lost in this excerpt; ``self.sc`` is assumed -- confirm.
		self.sc = SparkContext()
		self.sqlContext = SQLContext(self.sc)
Example #13
Source: lopq (Apache License 2.0), 5 votes
def compute_local_rotations(sc, data, model, num_buckets):
    """
    Analogous to the function of the same name in lopq.model.

    :param SparkContext sc:
        a SparkContext
    :param RDD data:
        an RDD of numpy arrays
    :param KMeansModel model:
        a KMeansModel instance for which to fit local rotations
    :param int num_buckets:
        the number of subvectors over which to balance residual variance

    :returns:
        the tuple ``(R, mu, count)``: ``R`` and ``mu`` as returned by
        ``compute_rotations_from_accumulators``, and ``count`` converted to
        an ndarray indexed by cluster
    """
    # Get estimators
    A, mu, count = accumulate_covariance_estimators(sc, data, model)

    # Format as ndarrays
    V = len(model.centers)
    A = dict_to_ndarray(A, V)
    mu = dict_to_ndarray(mu, V)
    count = dict_to_ndarray(count, V)

    # Compute params
    R, mu = compute_rotations_from_accumulators(A, mu, count, num_buckets)

    return R, mu, count
Example #14
Source: lopq (Apache License 2.0), 5 votes
def accumulate_covariance_estimators(sc, data, model):
    """
    Analogous function to function of the same name in lopq.model.

    :param SparkContext sc:
        a SparkContext
    :param RDD data:
        an RDD of numpy arrays
    :param KMeansModel model:
        a KMeansModel instance for which to fit local rotations

    :returns:
        the tuple ``(A, mu, count)`` of dicts keyed by cluster id: ``A`` is the
        sum of residual outer products, ``mu`` the sum of residuals, and
        ``count`` the number of points assigned to the cluster
    :raises ValueError:
        if ``data`` yields no residuals, so the dimension cannot be inferred
    """

    def get_residual(x):
        # Pair each point's residual with the cluster it was assigned to.
        cluster = model.predict(x)
        centroid = model.clusterCenters[cluster]
        residual = x - centroid
        return (cluster, residual)

    def seq_op(acc, x):
        # In-place accumulation avoids allocating a new (D, D) array per point.
        acc += np.outer(x, x)
        return acc

    # Compute (assignment, residual) k/v pairs
    # NOTE(review): the right-hand side was lost in this excerpt; mapping
    # get_residual over the data is inferred from the comment above and the
    # otherwise-unused helper.
    residuals = data.map(get_residual)
    # The RDD feeds three separate actions below; caching avoids recomputing
    # the cluster assignments each time.
    residuals.cache()
    try:
        # Collect counts and summed residuals
        count = residuals.countByKey()
        mu = residuals.reduceByKey(add).collectAsMap()

        if not mu:
            raise ValueError("cannot accumulate covariance estimators from empty data")

        # Extract the dimension of the data.  dict.values() is a view on
        # Python 3 and cannot be indexed, so take the first item by iteration.
        D = len(next(iter(mu.values())))

        # Collect accumulated outer products
        A = residuals.aggregateByKey(np.zeros((D, D)), seq_op, add).collectAsMap()
    finally:
        residuals.unpersist()

    return A, mu, count
Example #15
Source: sparklingml (Apache License 2.0), 5 votes
def registerFunction(self, ssc, jsession, function_name, params):
        """Build the UDF registered under ``function_name`` and return its Java handle.

        :param ssc: JVM-side Spark context; its ``master()`` and ``conf()`` are
            used to bootstrap a Python SparkContext on first use.
        :param jsession: unused in this method.
        :param function_name: key into ``functions_info``.
        :param params: string holding a Python literal of positional arguments
            for the function factory, or a falsy value for no arguments.
        :returns: the UDF's ``_judf`` handle, or ``None`` when ``function_name``
            is not registered in ``functions_info``.
        """
        jvm = self.gateway.jvm
        # If we don't have a reference to a running SparkContext
        # Get the SparkContext from the provided SparkSession.
        if not self._sc:
            master = ssc.master()
            # NOTE(review): this right-hand side and the SparkContext arguments
            # below were lost in the excerpt; reconstructed from the otherwise
            # unused locals (jvm, master, jsc, sparkConf) -- confirm upstream.
            jsc = jvm.org.apache.spark.api.java.JavaSparkContext(ssc)
            jsparkConf = ssc.conf()
            sparkConf = SparkConf(_jconf=jsparkConf)
            self._sc = SparkContext(
                master=master,
                conf=sparkConf,
                gateway=self.gateway,
                jsc=jsc)
            self._session = SparkSession.builder.getOrCreate()
        if function_name in functions_info:
            function_info = functions_info[function_name]
            if params:
                # literal_eval only accepts Python literals, not arbitrary code.
                evaledParams = ast.literal_eval(params)
            else:
                evaledParams = []
            func = function_info.func(*evaledParams)
            ret_type = function_info.returnType()
            # A running counter keeps each registration name distinct.
            self._count = self._count + 1
            registration_name = function_name + str(self._count)
            udf = UserDefinedFunction(func, ret_type, registration_name)
            # Used to allow non-default (e.g. Arrow) UDFS
            udf.evalType = function_info.evalType()
            judf = udf._judf
            return judf
        else:
            print("Could not find function")
            # We do this rather than raising an exception since Py4J debugging
            # is rough and we can check it.
            return None
Example #16
Source: pyspark-cassandra (Apache License 2.0), 5 votes
def _do_init(self, *args, **kwargs):
        """Run the base ``_do_init`` and then attach a Java-Cassandra
        SparkContext (``_jcsc``) to the instance.

        All positional and keyword arguments are forwarded unchanged to the
        base implementation.
        """
        super(CassandraSparkContext, self)._do_init(*args, **kwargs)

        # Make the connector classes resolvable by short name on the JVM view.
        connector_classes = (
            "com.datastax.spark.connector.CassandraJavaUtil",
            "com.datastax.spark.connector.RowConvertingIterator",
        )
        for class_name in connector_classes:
            java_import(self._jvm, class_name)

        cassandra_util = self._jvm.CassandraJavaUtil
        self._jcsc = cassandra_util.javaFunctions(self._jsc)
Example #17
Source: LearningApacheSpark (MIT License), 5 votes
def _test():
    """Run the doctests of ``pyspark.sql.readwriter`` against a local Spark session.

    Exits the interpreter with a non-zero status when any doctest fails.
    """
    import doctest
    import os
    import tempfile
    import py4j
    from pyspark.context import SparkContext
    from pyspark.sql import SparkSession, Row
    import pyspark.sql.readwriter

    # NOTE(review): the parquet path below is relative; a line that changed the
    # working directory may have been lost here -- confirm against upstream.

    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    try:
        try:
            spark = SparkSession.builder.getOrCreate()
        except py4j.protocol.Py4JError:
            # Fall back to wrapping the context directly when the builder fails.
            spark = SparkSession(sc)

        globs['tempfile'] = tempfile
        globs['os'] = os
        globs['sc'] = sc
        globs['spark'] = spark
        # NOTE(review): the reader call was lost in this excerpt;
        # ``spark.read.parquet`` is inferred from the path -- confirm.
        globs['df'] = spark.read.parquet('python/test_support/sql/parquet_partitioned')
        (failure_count, test_count) = doctest.testmod(
            pyspark.sql.readwriter, globs=globs,
            optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    finally:
        # Always release the context, even if setup or the doctest run raises.
        sc.stop()
    if failure_count:
        # NOTE(review): the body of this branch was lost in the excerpt; a
        # non-zero exit is assumed -- confirm against upstream.
        raise SystemExit(-1)
Example #18
Source: LearningApacheSpark (MIT License), 5 votes
def _test():
    """Run the doctests of ``pyspark.sql.context`` with sample data predefined.

    Exits the interpreter with a non-zero status when any doctest fails.
    """
    import os
    import doctest
    import tempfile
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.context

    globs = pyspark.sql.context.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    try:
        globs['tempfile'] = tempfile
        globs['os'] = os
        globs['sc'] = sc
        globs['sqlContext'] = SQLContext(sc)
        globs['rdd'] = rdd = sc.parallelize(
            [Row(field1=1, field2="row1"),
             Row(field1=2, field2="row2"),
             Row(field1=3, field2="row3")]
        )
        globs['df'] = rdd.toDF()
        # Adjacent string literals are concatenated, so this list holds three
        # JSON documents, not five fragments.
        jsonStrings = [
            '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
            '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
            '"field6":[{"field7": "row2"}]}',
            '{"field1" : null, "field2": "row3", '
            '"field3":{"field4":33, "field5": []}}'
        ]
        globs['jsonStrings'] = jsonStrings
        globs['json'] = sc.parallelize(jsonStrings)
        (failure_count, test_count) = doctest.testmod(
            pyspark.sql.context, globs=globs,
            optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    finally:
        # Always release the context, even if setup or the doctest run raises.
        sc.stop()
    if failure_count:
        # NOTE(review): the body of this branch was lost in the excerpt; a
        # non-zero exit is assumed -- confirm against upstream.
        raise SystemExit(-1)
Example #19
Source: LearningApacheSpark (MIT License), 5 votes
def newSession(self):
        """
        Returns a new SQLContext as new session, that has separate SQLConf,
        registered temporary views and UDFs, but shared SparkContext and
        table cache.
        """
        # self.__class__ keeps the result the same type as the receiver, so
        # subclasses get an instance of themselves back.
        return self.__class__(self._sc, self.sparkSession.newSession())
Example #20
Source: LearningApacheSpark (MIT License), 5 votes
def getOrCreate(cls, sc):
        """
        Get the existing SQLContext or create a new one with given SparkContext.

        :param sc: SparkContext
        :return: the value of ``cls._instantiatedContext``
        """
        if cls._instantiatedContext is None:
            # NOTE(review): the argument of this call was lost in the excerpt;
            # the JVM SparkContext behind ``sc`` is assumed -- confirm upstream.
            jsqlContext = sc._jvm.SQLContext.getOrCreate(sc._jsc.sc())
            sparkSession = SparkSession(sc, jsqlContext.sparkSession())
            # The result is deliberately discarded; the constructor is
            # presumably what populates cls._instantiatedContext -- verify.
            cls(sc, sparkSession, jsqlContext)
        return cls._instantiatedContext
Example #21
Source: LearningApacheSpark (MIT License), 5 votes
def _test_multiple_broadcasts(self, *extra_confs):
        """
        Test broadcast variables make it OK to the executors.  Tests multiple broadcast variables,
        and also multiple jobs.

        :param extra_confs: ``(key, value)`` pairs applied to the SparkConf
            before the context is created.
        """
        conf = SparkConf()
        for key, value in extra_confs:
            conf.set(key, value)
        conf.setMaster("local-cluster[2,1,1024]")
        # NOTE(review): the assignment target was fused into the previous
        # statement in this excerpt; ``self.sc`` is assumed -- confirm.
        self.sc = SparkContext(conf=conf)
        self._test_encryption_helper([5, 10, 20])