Python pyspark.context.SparkContext() Examples
The following are 21 code examples of pyspark.context.SparkContext(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.context, or try the search function.
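Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of constructing a SparkContext directly; the master string and application name are arbitrary choices for illustration:

from pyspark.context import SparkContext

# Minimal sketch: a local two-thread master and an arbitrary app name.
sc = SparkContext("local[2]", "SparkContextExample")
# Run a trivial job to confirm the context is working.
print(sc.parallelize([1, 2, 3]).map(lambda x: x * 2).collect())  # [2, 4, 6]
# Stop the context so another one can be created later in the same process.
sc.stop()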
Example #1
Source File: tests.py From spark-cluster-deployment with Apache License 2.0 | 6 votes |
def test_module_dependency_on_cluster(self):
    """Submit and test a script with a dependency on another module on a cluster"""
    script = self.createTempFile("test.py", """
        |from pyspark import SparkContext
        |from mylib import myfunc
        |
        |sc = SparkContext()
        |print sc.parallelize([1, 2, 3]).map(myfunc).collect()
        """)
    zip = self.createFileInZip("mylib.py", """
        |def myfunc(x):
        |    return x + 1
        """)
    proc = subprocess.Popen(
        [self.sparkSubmit, "--py-files", zip, "--master",
         "local-cluster[1,1,512]", script],
        stdout=subprocess.PIPE)
    out, err = proc.communicate()
    self.assertEqual(0, proc.returncode)
    self.assertIn("[2, 3, 4]", out)
Example #2
Source File: tests.py From spark-cluster-deployment with Apache License 2.0 | 6 votes |
def test_module_dependency(self):
    """Submit and test a script with a dependency on another module"""
    script = self.createTempFile("test.py", """
        |from pyspark import SparkContext
        |from mylib import myfunc
        |
        |sc = SparkContext()
        |print sc.parallelize([1, 2, 3]).map(myfunc).collect()
        """)
    zip = self.createFileInZip("mylib.py", """
        |def myfunc(x):
        |    return x + 1
        """)
    proc = subprocess.Popen([self.sparkSubmit, "--py-files", zip, script],
                            stdout=subprocess.PIPE)
    out, err = proc.communicate()
    self.assertEqual(0, proc.returncode)
    self.assertIn("[2, 3, 4]", out)
Example #3
Source File: pcontext.py From sparklingpandas with Apache License 2.0 | 5 votes |
def __init__(self, spark_context, sql_ctx=None):
    """Initialize a PSparkContext with the associated Spark context, and
    Spark SQL context if provided. This context is used to load data into
    L{DataFrame}s.

    Parameters
    ----------
    spark_context: SparkContext
        Initialized and configured Spark context. If you are running in the
        PySpark shell, this is already created as "sc".
    sql_ctx: SQLContext, optional
        Initialized and configured SQL context; if not provided,
        SparklingPandas will create one.

    Returns
    -------
    Correctly initialized SparklingPandasContext.
    """
    self.spark_ctx = spark_context
    if sql_ctx:
        self.sql_ctx = sql_ctx
    else:
        logging.info("No sql context provided, creating")
        from pyspark.sql import SQLContext
        self.sql_ctx = SQLContext(self.spark_ctx)
    # Register our magical functions
    register_sql_extensions(self.sql_ctx)
Example #4
Source File: util.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def _test():
    import doctest
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2)
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #5
Source File: sql.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def _test():
    import doctest
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext('local[4]', 'PythonTest', batchSize=2)
    globs['sc'] = sc
    globs['sqlCtx'] = SQLContext(sc)
    globs['rdd'] = sc.parallelize([{"field1" : 1, "field2" : "row1"},
                                   {"field1" : 2, "field2": "row2"},
                                   {"field1" : 3, "field2": "row3"}])
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #6
Source File: sql.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def __init__(self, sparkContext, sqlContext=None):
    """Create a new SQLContext.

    @param sparkContext: The SparkContext to wrap.

    >>> srdd = sqlCtx.inferSchema(rdd)
    >>> sqlCtx.inferSchema(srdd) # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
        ...
    ValueError:...

    >>> bad_rdd = sc.parallelize([1,2,3])
    >>> sqlCtx.inferSchema(bad_rdd) # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
        ...
    ValueError:...

    >>> allTypes = sc.parallelize([{"int" : 1, "string" : "string", "double" : 1.0, "long": 1L,
    ... "boolean" : True}])
    >>> srdd = sqlCtx.inferSchema(allTypes).map(lambda x: (x.int, x.string, x.double, x.long,
    ... x.boolean))
    >>> srdd.collect()[0]
    (1, u'string', 1.0, 1, True)
    """
    self._sc = sparkContext
    self._jsc = self._sc._jsc
    self._jvm = self._sc._jvm
    self._pythonToJavaMap = self._jvm.PythonRDD.pythonToJavaMap

    if sqlContext:
        self._scala_SQLContext = sqlContext
Example #7
Source File: tests.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def test_script_with_local_functions(self):
    """Submit and test a single script file calling a global function"""
    script = self.createTempFile("test.py", """
        |from pyspark import SparkContext
        |
        |def foo(x):
        |    return x * 3
        |
        |sc = SparkContext()
        |print sc.parallelize([1, 2, 3]).map(foo).collect()
        """)
    proc = subprocess.Popen([self.sparkSubmit, script], stdout=subprocess.PIPE)
    out, err = proc.communicate()
    self.assertEqual(0, proc.returncode)
    self.assertIn("[3, 6, 9]", out)
Example #8
Source File: tests.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def test_single_script(self):
    """Submit and test a single script file"""
    script = self.createTempFile("test.py", """
        |from pyspark import SparkContext
        |
        |sc = SparkContext()
        |print sc.parallelize([1, 2, 3]).map(lambda x: x * 2).collect()
        """)
    proc = subprocess.Popen([self.sparkSubmit, script], stdout=subprocess.PIPE)
    out, err = proc.communicate()
    self.assertEqual(0, proc.returncode)
    self.assertIn("[2, 4, 6]", out)
Example #9
Source File: tests.py From spark-cluster-deployment with Apache License 2.0 | 5 votes |
def setUp(self):
    self._old_sys_path = list(sys.path)
    class_name = self.__class__.__name__
    self.sc = SparkContext('local[4]', class_name, batchSize=2)
Example #10
Source File: pcontext.py From sparklingpandas with Apache License 2.0 | 5 votes |
def stop(self):
    """Stop the underlying SparkContext
    """
    self.spark_ctx.stop()
Example #11
Source File: pcontext.py From sparklingpandas with Apache License 2.0 | 5 votes |
def simple(cls, *args, **kwargs):
    """Takes the same arguments as SparkContext and constructs a
    PSparkContext"""
    return PSparkContext(SparkContext(*args, **kwargs))
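A usage sketch for the factory above (assuming that, as in the sparklingpandas source, simple is decorated as a @classmethod and PSparkContext is importable from sparklingpandas.pcontext; neither detail is shown in the excerpt):

from sparklingpandas.pcontext import PSparkContext

# The arguments are forwarded unchanged to the SparkContext constructor.
psc = PSparkContext.simple("local[2]", "SparklingPandasExample")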
Example #12
Source File: __init__.py From pyspark2pmml with GNU Affero General Public License v3.0 | 5 votes |
def setUp(self):
    self.sc = SparkContext()
    self.sqlContext = SQLContext(self.sc)
Example #13
Source File: train_model.py From lopq with Apache License 2.0 | 5 votes |
def compute_local_rotations(sc, data, model, num_buckets):
    """
    Analogous to the function of the same name in lopq.model.

    :param SparkContext sc:
        a SparkContext
    :param RDD data:
        an RDD of numpy arrays
    :param KMeansModel model:
        a KMeansModel instance for which to fit local rotations
    :param int num_buckets:
        the number of subvectors over which to balance residual variance
    """
    # Get estimators
    A, mu, count = accumulate_covariance_estimators(sc, data, model)

    # Format as ndarrays
    V = len(model.centers)
    A = dict_to_ndarray(A, V)
    mu = dict_to_ndarray(mu, V)
    count = dict_to_ndarray(count, V)

    # Compute params
    R, mu = compute_rotations_from_accumulators(A, mu, count, num_buckets)

    return R, mu, count
Example #14
Source File: train_model.py From lopq with Apache License 2.0 | 5 votes |
def accumulate_covariance_estimators(sc, data, model):
    """
    Analogous function to function of the same name in lopq.model.

    :param SparkContext sc:
        a SparkContext
    :param RDD data:
        an RDD of numpy arrays
    :param KMeansModel model:
        a KMeansModel instance for which to fit local rotations
    """

    def get_residual(x):
        cluster = model.predict(x)
        centroid = model.clusterCenters[cluster]
        residual = x - centroid
        return (cluster, residual)

    def seq_op(acc, x):
        acc += np.outer(x, x)
        return acc

    # Compute (assignment, residual) k/v pairs
    residuals = data.map(get_residual)
    residuals.cache()

    # Collect counts and mean residuals
    count = residuals.countByKey()
    mu = residuals.reduceByKey(add).collectAsMap()

    # Extract the dimension of the data
    D = len(mu.values()[0])

    # Collect accumulated outer products
    A = residuals.aggregateByKey(np.zeros((D, D)), seq_op, add).collectAsMap()

    residuals.unpersist()

    return A, mu, count
Example #15
Source File: startup.py From sparklingml with Apache License 2.0 | 5 votes |
def registerFunction(self, ssc, jsession, function_name, params):
    jvm = self.gateway.jvm
    # If we don't have a reference to a running SparkContext,
    # get the SparkContext from the provided SparkSession.
    if not self._sc:
        master = ssc.master()
        jsc = jvm.org.apache.spark.api.java.JavaSparkContext(ssc)
        jsparkConf = ssc.conf()
        sparkConf = SparkConf(_jconf=jsparkConf)
        self._sc = SparkContext(
            master=master,
            conf=sparkConf,
            gateway=self.gateway,
            jsc=jsc)
        self._session = SparkSession.builder.getOrCreate()
    if function_name in functions_info:
        function_info = functions_info[function_name]
        if params:
            evaledParams = ast.literal_eval(params)
        else:
            evaledParams = []
        func = function_info.func(*evaledParams)
        ret_type = function_info.returnType()
        self._count = self._count + 1
        registration_name = function_name + str(self._count)
        udf = UserDefinedFunction(func, ret_type, registration_name)
        # Used to allow non-default (e.g. Arrow) UDFS
        udf.evalType = function_info.evalType()
        judf = udf._judf
        return judf
    else:
        print("Could not find function")
        # We do this rather than raising an exception since Py4J debugging
        # is rough and we can check it.
        return None
Example #16
Source File: pyspark_cassandra.py From pyspark-cassandra with Apache License 2.0 | 5 votes |
def _do_init(self, *args, **kwargs):
    # Modifies base _do_init to add a Java-Cassandra SparkContext (jcsc)
    # to the instance
    super(CassandraSparkContext, self)._do_init(*args, **kwargs)
    java_import(self._jvm, "com.datastax.spark.connector.CassandraJavaUtil")
    java_import(self._jvm, "com.datastax.spark.connector.RowConvertingIterator")
    self._jcsc = self._jvm.CassandraJavaUtil.javaFunctions(self._jsc)
Example #17
Source File: readwriter.py From LearningApacheSpark with MIT License | 5 votes |
def _test():
    import doctest
    import os
    import tempfile
    import py4j
    from pyspark.context import SparkContext
    from pyspark.sql import SparkSession, Row
    import pyspark.sql.readwriter

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    try:
        spark = SparkSession.builder.getOrCreate()
    except py4j.protocol.Py4JError:
        spark = SparkSession(sc)

    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['spark'] = spark
    globs['df'] = spark.read.parquet('python/test_support/sql/parquet_partitioned')
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.readwriter, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    sc.stop()
    if failure_count:
        sys.exit(-1)
Example #18
Source File: context.py From LearningApacheSpark with MIT License | 5 votes |
def _test():
    import os
    import doctest
    import tempfile
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.context

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.context.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")]
    )
    globs['df'] = rdd.toDF()
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
        '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", '
        '"field3":{"field4":33, "field5": []}}'
    ]
    globs['jsonStrings'] = jsonStrings
    globs['json'] = sc.parallelize(jsonStrings)
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.context, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1)
Example #19
Source File: context.py From LearningApacheSpark with MIT License | 5 votes |
def newSession(self):
    """
    Returns a new SQLContext as new session, that has separate SQLConf,
    registered temporary views and UDFs, but shared SparkContext and
    table cache.
    """
    return self.__class__(self._sc, self.sparkSession.newSession())
Example #20
Source File: context.py From LearningApacheSpark with MIT License | 5 votes |
def getOrCreate(cls, sc):
    """
    Get the existing SQLContext or create a new one with given SparkContext.

    :param sc: SparkContext
    """
    if cls._instantiatedContext is None:
        jsqlContext = sc._jvm.SQLContext.getOrCreate(sc._jsc.sc())
        sparkSession = SparkSession(sc, jsqlContext.sparkSession())
        cls(sc, sparkSession, jsqlContext)
    return cls._instantiatedContext
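A usage sketch for getOrCreate (assuming a Spark version whose Python API still exposes SQLContext.getOrCreate; newer releases steer users toward SparkSession instead):

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local[2]", "GetOrCreateExample")
# Returns the already-instantiated SQLContext if one exists,
# otherwise creates one around the given SparkContext.
sqlContext = SQLContext.getOrCreate(sc)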
Example #21
Source File: test_broadcast.py From LearningApacheSpark with MIT License | 5 votes |
def _test_multiple_broadcasts(self, *extra_confs):
    """
    Test broadcast variables make it OK to the executors.
    Tests multiple broadcast variables, and also multiple jobs.
    """
    conf = SparkConf()
    for key, value in extra_confs:
        conf.set(key, value)
    conf.setMaster("local-cluster[2,1,1024]")
    self.sc = SparkContext(conf=conf)
    self._test_encryption_helper([5])
    self._test_encryption_helper([5, 10, 20])