Python pyspark.context.SparkContext() Examples

The following are 21 code examples of pyspark.context.SparkContext(). The original project and source file are noted above each example. You may also want to check out all available functions and classes of the module pyspark.context.
Example #1
Source File: tests.py    From spark-cluster-deployment with Apache License 2.0
def test_module_dependency_on_cluster(self):
        """Submit and test a script with a dependency on another module on a cluster"""
        script = self.createTempFile("test.py", """
            |from pyspark import SparkContext
            |from mylib import myfunc
            |
            |sc = SparkContext()
            |print sc.parallelize([1, 2, 3]).map(myfunc).collect()
            """)
        zip = self.createFileInZip("mylib.py", """
            |def myfunc(x):
            |    return x + 1
            """)
        proc = subprocess.Popen(
            [self.sparkSubmit, "--py-files", zip, "--master", "local-cluster[1,1,512]", script],
            stdout=subprocess.PIPE)
        out, err = proc.communicate()
        self.assertEqual(0, proc.returncode)
        self.assertIn("[2, 3, 4]", out) 
Example #2
Source File: tests.py    From spark-cluster-deployment with Apache License 2.0
def test_module_dependency(self):
        """Submit and test a script with a dependency on another module"""
        script = self.createTempFile("test.py", """
            |from pyspark import SparkContext
            |from mylib import myfunc
            |
            |sc = SparkContext()
            |print sc.parallelize([1, 2, 3]).map(myfunc).collect()
            """)
        zip = self.createFileInZip("mylib.py", """
            |def myfunc(x):
            |    return x + 1
            """)
        proc = subprocess.Popen([self.sparkSubmit, "--py-files", zip, script],
            stdout=subprocess.PIPE)
        out, err = proc.communicate()
        self.assertEqual(0, proc.returncode)
        self.assertIn("[2, 3, 4]", out) 
Example #3
Source File: pcontext.py    From sparklingpandas with Apache License 2.0
def __init__(self, spark_context, sql_ctx=None):
        """Initialize a PSparkContext with the associacted spark context,
        and Spark SQL context if provided. This context is usef to load
        data into L{DataFrame}s.

        Parameters
        ----------
        spark_context: SparkContext
            Initialized and configured spark context. If you are running in the
            PySpark shell, this is already created as "sc".
        sql_ctx: SQLContext, optional
            Initialized and configured SQL context; if not provided,
            SparklingPandas will create one.
        Returns
        -------
        Correctly initialized SparklingPandasContext.
        """
        self.spark_ctx = spark_context
        if sql_ctx:
            self.sql_ctx = sql_ctx
        else:
            logging.info("No sql context provided, creating")
            from pyspark.sql import SQLContext
            self.sql_ctx = SQLContext(self.spark_ctx)
        # Register our magical functions
        register_sql_extensions(self.sql_ctx) 
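A minimal usage sketch for this constructor, assuming PSparkContext is importable from sparklingpandas.pcontext (the master, app name, and variable names below are illustrative, not from the project):

from pyspark import SparkContext
from sparklingpandas.pcontext import PSparkContext

sc = SparkContext("local[2]", "sparklingpandas-demo")
psc = PSparkContext(sc)   # a SQLContext is created internally when none is supplied
# ... load data into sparklingpandas DataFrames via psc ...
psc.stop()                # stops the wrapped SparkContext (see Example #10)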
Example #4
Source File: util.py    From spark-cluster-deployment with Apache License 2.0
def _test():
    import doctest
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1) 
Example #5
Source File: sql.py    From spark-cluster-deployment with Apache License 2.0
def _test():
    import doctest
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext('local[4]', 'PythonTest', batchSize=2)
    globs['sc'] = sc
    globs['sqlCtx'] = SQLContext(sc)
    globs['rdd'] = sc.parallelize([{"field1" : 1, "field2" : "row1"},
        {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}])
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1) 
Example #6
Source File: sql.py    From spark-cluster-deployment with Apache License 2.0
def __init__(self, sparkContext, sqlContext = None):
        """Create a new SQLContext.

        @param sparkContext: The SparkContext to wrap.

        >>> srdd = sqlCtx.inferSchema(rdd)
        >>> sqlCtx.inferSchema(srdd) # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...

        >>> bad_rdd = sc.parallelize([1,2,3])
        >>> sqlCtx.inferSchema(bad_rdd) # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...

        >>> allTypes = sc.parallelize([{"int" : 1, "string" : "string", "double" : 1.0, "long": 1L,
        ... "boolean" : True}])
        >>> srdd = sqlCtx.inferSchema(allTypes).map(lambda x: (x.int, x.string, x.double, x.long,
        ... x.boolean))
        >>> srdd.collect()[0]
        (1, u'string', 1.0, 1, True)
        """
        self._sc = sparkContext
        self._jsc = self._sc._jsc
        self._jvm = self._sc._jvm
        self._pythonToJavaMap = self._jvm.PythonRDD.pythonToJavaMap

        if sqlContext:
            self._scala_SQLContext = sqlContext 
Example #7
Source File: tests.py    From spark-cluster-deployment with Apache License 2.0
def test_script_with_local_functions(self):
        """Submit and test a single script file calling a global function"""
        script = self.createTempFile("test.py", """
            |from pyspark import SparkContext
            |
            |def foo(x):
            |    return x * 3
            |
            |sc = SparkContext()
            |print sc.parallelize([1, 2, 3]).map(foo).collect()
            """)
        proc = subprocess.Popen([self.sparkSubmit, script], stdout=subprocess.PIPE)
        out, err = proc.communicate()
        self.assertEqual(0, proc.returncode)
        self.assertIn("[3, 6, 9]", out) 
Example #8
Source File: tests.py    From spark-cluster-deployment with Apache License 2.0
def test_single_script(self):
        """Submit and test a single script file"""
        script = self.createTempFile("test.py", """
            |from pyspark import SparkContext
            |
            |sc = SparkContext()
            |print sc.parallelize([1, 2, 3]).map(lambda x: x * 2).collect()
            """)
        proc = subprocess.Popen([self.sparkSubmit, script], stdout=subprocess.PIPE)
        out, err = proc.communicate()
        self.assertEqual(0, proc.returncode)
        self.assertIn("[2, 4, 6]", out) 
Example #9
Source File: tests.py    From spark-cluster-deployment with Apache License 2.0
def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        self.sc = SparkContext('local[4]', class_name, batchSize=2)
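The saved copy of sys.path suggests a matching tearDown that restores it and stops the context; the project's actual teardown is not shown here, but a hedged sketch of such a counterpart could be:

def tearDown(self):
        self.sc.stop()                    # release the local[4] context created in setUp
        sys.path = self._old_sys_path     # undo any sys.path changes made by the test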
Example #10
Source File: pcontext.py    From sparklingpandas with Apache License 2.0
def stop(self):
        """Stop the underlying SparkContext
        """
        self.spark_ctx.stop() 
Example #11
Source File: pcontext.py    From sparklingpandas with Apache License 2.0
def simple(cls, *args, **kwargs):
        """Takes the same arguments as SparkContext and constructs a
        PSparkContext"""
        return PSparkContext(SparkContext(*args, **kwargs)) 
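A brief usage sketch of this factory method, under the same import assumption as the sketch after Example #3 (arguments are illustrative):

from sparklingpandas.pcontext import PSparkContext

# Arguments are forwarded unchanged to SparkContext, then the result is wrapped.
psc = PSparkContext.simple("local[2]", "simple-demo")
psc.stop()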
Example #12
Source File: __init__.py    From pyspark2pmml with GNU Affero General Public License v3.0
def setUp(self):
		self.sc = SparkContext()
		self.sqlContext = SQLContext(self.sc) 
Example #13
Source File: train_model.py    From lopq with Apache License 2.0
def compute_local_rotations(sc, data, model, num_buckets):
    """
    Analogous to the function of the same name in lopq.model.

    :param SparkContext sc:
        a SparkContext
    :param RDD data:
        an RDD of numpy arrays
    :param KMeansModel model:
        a KMeansModel instance for which to fit local rotations
    :param int num_buckets:
        the number of subvectors over which to balance residual variance
    """
    # Get estimators
    A, mu, count = accumulate_covariance_estimators(sc, data, model)

    # Format as ndarrays
    V = len(model.centers)
    A = dict_to_ndarray(A, V)
    mu = dict_to_ndarray(mu, V)
    count = dict_to_ndarray(count, V)

    # Compute params
    R, mu = compute_rotations_from_accumulators(A, mu, count, num_buckets)

    return R, mu, count 
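A hedged end-to-end sketch of calling this function, assuming compute_local_rotations and its helpers are importable from the project's train_model module and that pyspark.mllib is available (data sizes, dimensions, and names are illustrative):

import numpy as np
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans
from train_model import compute_local_rotations   # assumed import path

sc = SparkContext("local[2]", "lopq-rotations-demo")
data = sc.parallelize([np.random.randn(8) for _ in range(1000)])   # toy 8-d vectors
data.cache()
model = KMeans.train(data, k=4, seed=42)                           # coarse quantizer
R, mu, count = compute_local_rotations(sc, data, model, num_buckets=2)
sc.stop()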
Example #14
Source File: train_model.py    From lopq with Apache License 2.0
def accumulate_covariance_estimators(sc, data, model):
    """
    Analogous to the function of the same name in lopq.model.

    :param SparkContext sc:
        a SparkContext
    :param RDD data:
        an RDD of numpy arrays
    :param KMeansModel model:
        a KMeansModel instance for which to fit local rotations
    """

    def get_residual(x):
        cluster = model.predict(x)
        centroid = model.clusterCenters[cluster]
        residual = x - centroid
        return (cluster, residual)

    def seq_op(acc, x):
        acc += np.outer(x, x)
        return acc

    # Compute (assignment, residual) k/v pairs
    residuals = data.map(get_residual)
    residuals.cache()

    # Collect counts and mean residuals
    count = residuals.countByKey()
    mu = residuals.reduceByKey(add).collectAsMap()

    # Extract the dimension of the data
    D = len(mu.values()[0])

    # Collect accumulated outer products
    A = residuals.aggregateByKey(np.zeros((D, D)), seq_op, add).collectAsMap()

    residuals.unpersist()

    return A, mu, count 
Example #15
Source File: startup.py    From sparklingml with Apache License 2.0
def registerFunction(self, ssc, jsession, function_name, params):
        jvm = self.gateway.jvm
        # If we don't have a reference to a running SparkContext
        # Get the SparkContext from the provided SparkSession.
        if not self._sc:
            master = ssc.master()
            jsc = jvm.org.apache.spark.api.java.JavaSparkContext(ssc)
            jsparkConf = ssc.conf()
            sparkConf = SparkConf(_jconf=jsparkConf)
            self._sc = SparkContext(
                master=master,
                conf=sparkConf,
                gateway=self.gateway,
                jsc=jsc)
            self._session = SparkSession.builder.getOrCreate()
        if function_name in functions_info:
            function_info = functions_info[function_name]
            if params:
                evaledParams = ast.literal_eval(params)
            else:
                evaledParams = []
            func = function_info.func(*evaledParams)
            ret_type = function_info.returnType()
            self._count = self._count + 1
            registration_name = function_name + str(self._count)
            udf = UserDefinedFunction(func, ret_type, registration_name)
            # Used to allow non-default (e.g. Arrow) UDFS
            udf.evalType = function_info.evalType()
            judf = udf._judf
            return judf
        else:
            print("Could not find function")
            # We do this rather than raising an exception since Py4J debugging
            # is rough and we can check it.
            return None 
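For reference, outside of this Py4J gateway setup, the usual Python-side route from an existing SparkSession to its SparkContext is much shorter; a minimal sketch:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").appName("demo").getOrCreate()
sc = spark.sparkContext     # the underlying SparkContext
spark.stop()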
Example #16
Source File: pyspark_cassandra.py    From pyspark-cassandra with Apache License 2.0
def _do_init(self, *args, **kwargs):
        # Modifies base _do_init to add a Java-Cassandra SparkContext (jcsc)
        # to the instance
        super(CassandraSparkContext, self)._do_init(*args, **kwargs)
        java_import(self._jvm, "com.datastax.spark.connector.CassandraJavaUtil")
        java_import(self._jvm, "com.datastax.spark.connector.RowConvertingIterator")
        self._jcsc = self._jvm.CassandraJavaUtil.javaFunctions(self._jsc) 
Example #17
Source File: readwriter.py    From LearningApacheSpark with MIT License
def _test():
    import doctest
    import os
    import tempfile
    import py4j
    from pyspark.context import SparkContext
    from pyspark.sql import SparkSession, Row
    import pyspark.sql.readwriter

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    try:
        spark = SparkSession.builder.getOrCreate()
    except py4j.protocol.Py4JError:
        spark = SparkSession(sc)

    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['spark'] = spark
    globs['df'] = spark.read.parquet('python/test_support/sql/parquet_partitioned')
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.readwriter, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    sc.stop()
    if failure_count:
        sys.exit(-1) 
Example #18
Source File: context.py    From LearningApacheSpark with MIT License
def _test():
    import os
    import doctest
    import tempfile
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.context

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.context.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")]
    )
    globs['df'] = rdd.toDF()
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
        '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", '
        '"field3":{"field4":33, "field5": []}}'
    ]
    globs['jsonStrings'] = jsonStrings
    globs['json'] = sc.parallelize(jsonStrings)
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.context, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1) 
Example #19
Source File: context.py    From LearningApacheSpark with MIT License
def newSession(self):
        """
        Returns a new SQLContext as new session, that has separate SQLConf,
        registered temporary views and UDFs, but shared SparkContext and
        table cache.
        """
        return self.__class__(self._sc, self.sparkSession.newSession()) 
Example #20
Source File: context.py    From LearningApacheSpark with MIT License
def getOrCreate(cls, sc):
        """
        Get the existing SQLContext or create a new one with given SparkContext.

        :param sc: SparkContext
        """
        if cls._instantiatedContext is None:
            jsqlContext = sc._jvm.SQLContext.getOrCreate(sc._jsc.sc())
            sparkSession = SparkSession(sc, jsqlContext.sparkSession())
            cls(sc, sparkSession, jsqlContext)
        return cls._instantiatedContext 
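A brief usage sketch of this classmethod (master and app name are illustrative; note that SQLContext has since been superseded by SparkSession in newer Spark releases):

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local[2]", "sqlcontext-demo")
sqlContext = SQLContext.getOrCreate(sc)             # creates the singleton on first call
assert SQLContext.getOrCreate(sc) is sqlContext     # later calls return the same instance
sc.stop()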
Example #21
Source File: test_broadcast.py    From LearningApacheSpark with MIT License
def _test_multiple_broadcasts(self, *extra_confs):
        """
        Test broadcast variables make it OK to the executors.  Tests multiple broadcast variables,
        and also multiple jobs.
        """
        conf = SparkConf()
        for key, value in extra_confs:
            conf.set(key, value)
        conf.setMaster("local-cluster[2,1,1024]")
        self.sc = SparkContext(conf=conf)
        self._test_encryption_helper([5])
        self._test_encryption_helper([5, 10, 20])
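The _test_encryption_helper calls above are defined elsewhere in test_broadcast.py and are not shown here; in spirit, a broadcast round-trip with multiple variables looks roughly like this hedged sketch (names and sizes are illustrative):

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[2]").setAppName("broadcast-demo")
sc = SparkContext(conf=conf)
bs = [sc.broadcast(list(range(n))) for n in (5, 10, 20)]   # several broadcast variables
# Each task reads every broadcast value on the executors.
result = sc.parallelize(range(3)).map(lambda i: sum(len(b.value) for b in bs)).collect()
assert result == [35, 35, 35]
sc.stop()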