Python pyspark.sql.SparkSession.builder() Examples
The following are 30 code examples of pyspark.sql.SparkSession.builder(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.SparkSession, or try the search function.
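Before the project-specific examples, here is a minimal self-contained sketch of the builder pattern they all share; the master URL, application name, and config key are illustrative placeholders rather than values taken from any project below.

from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session if one exists,
# otherwise it builds a new one with the settings accumulated on the builder.
spark = SparkSession.builder \
    .master("local[2]") \
    .appName("builder-example") \
    .config("spark.sql.shuffle.partitions", "4") \
    .getOrCreate()

spark.stop()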
Example #1
Source File: data_source_provider.py From marvin-python-toolbox with Apache License 2.0 | 6 votes |
def get_spark_session(enable_hive=False, app_name='marvin-engine', configs=[]):
    """Return a Spark Session object"""
    # Prepare spark context to be used
    import findspark
    findspark.init()

    from pyspark.sql import SparkSession

    # prepare spark session to be returned
    spark = SparkSession.builder
    spark = spark.appName(app_name)
    spark = spark.enableHiveSupport() if enable_hive else spark

    # apply any extra configs
    for config in configs:
        spark = spark.config(config)

    return spark.getOrCreate()
Example #2
Source File: functions.py From LearningApacheSpark with MIT License | 6 votes |
def _test():
    import doctest
    import sys

    from pyspark.sql import Row, SparkSession
    import pyspark.sql.functions

    globs = pyspark.sql.functions.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.functions tests")\
        .getOrCreate()
    sc = spark.sparkContext
    globs['sc'] = sc
    globs['spark'] = spark
    globs['df'] = spark.createDataFrame([Row(name='Alice', age=2), Row(name='Bob', age=5)])
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.functions, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    spark.stop()
    if failure_count:
        sys.exit(-1)
Example #3
Source File: distributed.py From LearningApacheSpark with MIT License | 6 votes |
def _test():
    import doctest
    import sys

    import numpy
    from pyspark.sql import SparkSession
    from pyspark.mllib.linalg import Matrices
    import pyspark.mllib.linalg.distributed

    try:
        # NumPy 1.14+ changed its string format.
        numpy.set_printoptions(legacy='1.13')
    except TypeError:
        pass
    globs = pyspark.mllib.linalg.distributed.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("mllib.linalg.distributed tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext
    globs['Matrices'] = Matrices
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    spark.stop()
    if failure_count:
        sys.exit(-1)
Example #4
Source File: udf.py From LearningApacheSpark with MIT License | 6 votes |
def _test():
    import doctest
    import sys

    from pyspark.sql import SparkSession
    import pyspark.sql.udf

    globs = pyspark.sql.udf.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.udf tests")\
        .getOrCreate()
    globs['spark'] = spark
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.udf, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    spark.stop()
    if failure_count:
        sys.exit(-1)
Example #5
Source File: column.py From LearningApacheSpark with MIT License | 6 votes |
def cast(self, dataType):
    """ Convert the column into type ``dataType``.

    >>> df.select(df.age.cast("string").alias('ages')).collect()
    [Row(ages=u'2'), Row(ages=u'5')]
    >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
    [Row(ages=u'2'), Row(ages=u'5')]
    """
    if isinstance(dataType, basestring):
        jc = self._jc.cast(dataType)
    elif isinstance(dataType, DataType):
        from pyspark.sql import SparkSession
        spark = SparkSession.builder.getOrCreate()
        jdt = spark._jsparkSession.parseDataType(dataType.json())
        jc = self._jc.cast(jdt)
    else:
        raise TypeError("unexpected type: %s" % type(dataType))
    return Column(jc)
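The doctests above rely on a pre-built df; here is a self-contained sketch of the same two call forms, with column names and values chosen purely for illustration.

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

spark = SparkSession.builder.master("local[1]").appName("cast-example").getOrCreate()
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], ['age', 'name'])

# The string form and the DataType form produce the same cast.
df.select(df.age.cast("string").alias('ages')).show()
df.select(df.age.cast(StringType()).alias('ages')).show()

spark.stop()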
Example #6
Source File: column.py From LearningApacheSpark with MIT License | 6 votes |
def _test():
    import doctest
    import sys

    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, IntegerType, StringType
    import pyspark.sql.column

    globs = pyspark.sql.column.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.column tests")\
        .getOrCreate()
    sc = spark.sparkContext
    globs['spark'] = spark
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.column, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    spark.stop()
    if failure_count:
        sys.exit(-1)
Example #7
Source File: fpm.py From LearningApacheSpark with MIT License | 6 votes |
def _test():
    import doctest
    import sys

    from pyspark.sql import SparkSession
    import pyspark.mllib.fpm

    globs = pyspark.mllib.fpm.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("mllib.fpm tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext

    import tempfile

    temp_path = tempfile.mkdtemp()
    globs['temp_path'] = temp_path
    try:
        (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
        spark.stop()
    finally:
        from shutil import rmtree
        try:
            rmtree(temp_path)
        except OSError:
            pass
    if failure_count:
        sys.exit(-1)
Example #8
Source File: evaluation.py From LearningApacheSpark with MIT License | 6 votes |
def _test():
    import doctest
    import sys

    import numpy
    from pyspark.sql import SparkSession
    import pyspark.mllib.evaluation

    try:
        # NumPy 1.14+ changed its string format.
        numpy.set_printoptions(legacy='1.13')
    except TypeError:
        pass
    globs = pyspark.mllib.evaluation.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("mllib.evaluation tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    spark.stop()
    if failure_count:
        sys.exit(-1)
Example #9
Source File: fixtures.py From pytest-spark with MIT License | 6 votes |
def _spark_session():
    """Internal fixture for SparkSession instance.

    Yields SparkSession instance if it is supported by the pyspark
    version, otherwise yields None.

    Required to correctly initialize `spark_context` fixture after
    `spark_session` fixture.

    ..note::
        It is not possible to create SparkSession from the existing
        SparkContext.
    """
    try:
        from pyspark.sql import SparkSession
    except ImportError:
        yield
    else:
        session = SparkSession.builder \
            .config(conf=SparkConfigBuilder().get()) \
            .getOrCreate()

        yield session
        session.stop()
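For context, a hedged sketch of how a test typically consumes the plugin's spark_session fixture once pytest-spark is installed; the DataFrame contents are illustrative.

def test_row_count(spark_session):
    # pytest-spark injects the shared SparkSession built by the fixture above.
    df = spark_session.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'label'])
    assert df.count() == 2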
Example #10
Source File: spark.py From mlflow with Apache License 2.0 | 6 votes |
def _load_pyfunc(path):
    """
    Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``.

    :param path: Local filesystem path to the MLflow Model with the ``spark`` flavor.
    """
    # NOTE: The getOrCreate() call below may change settings of the active session which we do not
    # intend to do here. In particular, setting master to local[1] can break distributed clusters.
    # To avoid this problem, we explicitly check for an active session. This is not ideal but there
    # is no good workaround at the moment.
    import pyspark

    spark = pyspark.sql.SparkSession._instantiatedSession
    if spark is None:
        spark = pyspark.sql.SparkSession.builder.config("spark.python.worker.reuse", True) \
            .master("local[1]").getOrCreate()
    return _PyFuncModelWrapper(spark, _load_model(model_uri=path))
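The note above motivates reusing an already-active session instead of letting getOrCreate() silently reconfigure it. A minimal sketch of the same idea using the public SparkSession.getActiveSession() API (available in PySpark 3.0+); the fallback master setting is illustrative and this is not the mlflow implementation.

from pyspark.sql import SparkSession

def _get_session_without_reconfiguring():
    # Prefer whatever session the surrounding application already created.
    active = SparkSession.getActiveSession()
    if active is not None:
        return active
    # Fall back to a small local session only when nothing is running yet.
    return SparkSession.builder.master("local[1]").getOrCreate()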
Example #11
Source File: populate_tables.py From data-testing-with-airflow with Apache License 2.0 | 6 votes |
def spark():
    spark = SparkSession.builder \
        .config('spark.sql.warehouse.dir', '/usr/local/airflow/spark_warehouse') \
        .config('spark.hadoop.javax.jdo.option.ConnectionURL',
                'jdbc:derby:;databaseName=/usr/local/airflow/metastore_db;create=true') \
        .enableHiveSupport() \
        .getOrCreate()

    # Now populate some tables
    for database_name in ['dev_app', 'tst_app', 'acc_app', 'prd_app', 'transaction_a', 'transaction_b']:
        spark.sql('DROP DATABASE IF EXISTS {0} CASCADE'.format(database_name)).collect()
        spark.sql('CREATE DATABASE {0}'.format(database_name)).collect()

    populate_transaction_a(spark)
    populate_transaction_b(spark)

    for environment in ['dev', 'tst', 'acc', 'prd']:
        populate_account_info(spark, environment)
        populate_countries(spark, environment)
Example #12
Source File: conftest.py From data-testing-with-airflow with Apache License 2.0 | 6 votes |
def spark(request):
    spark = SparkSession.builder \
        .master('local[*]') \
        .enableHiveSupport() \
        .getOrCreate()

    # Now populate some tables
    for database_name in ['tst_app', 'transaction_a', 'transaction_b']:
        spark.sql('DROP DATABASE IF EXISTS {0} CASCADE'.format(database_name)).collect()
        spark.sql('CREATE DATABASE {0}'.format(database_name))

    populate_transaction_a(spark)
    populate_transaction_b(spark)
    populate_account_info(spark)
    populate_countries(spark)

    return spark
Example #13
Source File: utils.py From mlflow with Apache License 2.0 | 5 votes |
def _get_or_create_spark_session(jars=None):
    jar_path = jars if jars is not None else _get_mlflow_spark_jar_path()
    return SparkSession.builder \
        .config("spark.jars", jar_path) \
        .master("local[*]") \
        .getOrCreate()
Example #14
Source File: utils.py From mlflow with Apache License 2.0 | 5 votes |
def spark_session():
    jar_path = _get_mlflow_spark_jar_path()
    session = SparkSession.builder \
        .config("spark.jars", jar_path) \
        .master("local[*]") \
        .getOrCreate()
    yield session
    session.stop()
Example #15
Source File: ga_chp_bq_ingest_avro_file.py From MorphL-Community-Edition with Apache License 2.0 | 5 votes |
def main():
    spark_session = (
        SparkSession.builder
        .appName(APPLICATION_NAME)
        .master(MASTER_URL)
        .config('spark.cassandra.connection.host', MORPHL_SERVER_IP_ADDRESS)
        .config('spark.cassandra.auth.username', MORPHL_CASSANDRA_USERNAME)
        .config('spark.cassandra.auth.password', MORPHL_CASSANDRA_PASSWORD)
        .config('spark.sql.shuffle.partitions', 16)
        .getOrCreate())

    log4j = spark_session.sparkContext._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

    avro_df = (
        spark_session
        .read
        .format('avro')
        .load(LOCAL_AVRO_FILE))

    save_options_ga_chp_bq_features_raw = {
        'keyspace': MORPHL_CASSANDRA_KEYSPACE,
        'table': ('ga_chp_bq_features_raw_t'
                  if TRAINING_OR_PREDICTION == 'training'
                  else 'ga_chp_bq_features_raw_p')
    }

    (avro_df
        .withColumn('day_of_data_capture', f.lit(DAY_OF_DATA_CAPTURE))
        .withColumn('website_url', f.lit(WEBSITE_URL))
        .write
        .format('org.apache.spark.sql.cassandra')
        .mode('append')
        .options(**save_options_ga_chp_bq_features_raw)
        .save())
Example #16
Source File: test_standardize.py From datadevops with MIT License | 5 votes |
def spark():
    """Spark Session fixture
    """
    from pyspark.sql import SparkSession

    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("Unit Testing")\
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    return spark
Example #17
Source File: test_transform.py From datadevops with MIT License | 5 votes |
def spark():
    """Spark Session fixture
    """
    from pyspark.sql import SparkSession

    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("Unit Testing")\
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    return spark
Example #18
Source File: dml_script.py From PerfKitBenchmarker with Apache License 2.0 | 5 votes |
def main():
    spark = (SparkSession.builder
             .appName('Setup Spark table')
             .enableHiveSupport()
             .getOrCreate())
    table = 'warehouse'
    table_dir = sys.argv[1]
    # clean up previous table
    spark.sql('drop table if exists ' + table)
    # register new table
    spark.catalog.createTable(table, table_dir, source='parquet')
Example #19
Source File: resources.py From dagster with Apache License 2.0 | 5 votes |
def spark_session_from_config(spark_conf=None):
    spark_conf = check.opt_dict_param(spark_conf, 'spark_conf')
    builder = SparkSession.builder
    flat = flatten_dict(spark_conf)
    for key, value in flat:
        builder = builder.config(key, value)

    return builder.getOrCreate()
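The helper above leans on dagster's check and flatten_dict utilities, which are not shown here. A rough equivalent driven by a plain flat dict of Spark options (the function and option names are illustrative, not part of the dagster source).

from pyspark.sql import SparkSession

def session_from_options(options=None):
    # options maps Spark config keys to values,
    # e.g. {"spark.executor.memory": "2g", "spark.sql.shuffle.partitions": "8"}.
    builder = SparkSession.builder
    for key, value in (options or {}).items():
        builder = builder.config(key, value)
    return builder.getOrCreate()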
Example #20
Source File: spark_table.py From PerfKitBenchmarker with Apache License 2.0 | 5 votes |
def main():
    spark = (SparkSession.builder
             .appName('Setup Spark tables')
             .enableHiveSupport()
             .getOrCreate())
    root_dir = sys.argv[1]
    tables = sys.argv[2].split(',')
    for table in tables:
        table_dir = os.path.join(root_dir, table)
        # clean up previous table
        spark.sql('drop table if exists ' + table)
        # register new table
        spark.catalog.createTable(table, table_dir, source='parquet')
Example #21
Source File: util.py From spark-sklearn with Apache License 2.0 | 5 votes |
def createLocalSparkSession(appName="spark-sklearn"): """Generates a :class:`SparkSession` utilizing all local cores with the progress bar disabled but otherwise default config.""" return SparkSession.builder \ .master("local[*]") \ .appName(appName) \ .config("spark.ui.showConsoleProgress", "false") \ .getOrCreate()
Example #22
Source File: test_gapply.py From spark-sklearn with Apache License 2.0 | 5 votes |
def setUpClass(cls):
    super(GapplyConfTests, cls).setUpClass()
    cls.spark = SparkSession.builder \
        .config("spark.sql.retainGroupColumns", "false") \
        .getOrCreate()
Example #23
Source File: group.py From LearningApacheSpark with MIT License | 5 votes |
def _test():
    import doctest
    import sys

    from pyspark.sql import Row, SparkSession
    from pyspark.sql.types import StructType, StructField, IntegerType, StringType
    import pyspark.sql.group

    globs = pyspark.sql.group.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.group tests")\
        .getOrCreate()
    sc = spark.sparkContext
    globs['sc'] = sc
    globs['spark'] = spark
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80),
                                   Row(name='Bob', age=5, height=85)]).toDF()
    globs['df4'] = sc.parallelize([Row(course="dotNET", year=2012, earnings=10000),
                                   Row(course="Java", year=2012, earnings=20000),
                                   Row(course="dotNET", year=2012, earnings=5000),
                                   Row(course="dotNET", year=2013, earnings=48000),
                                   Row(course="Java", year=2013, earnings=30000)]).toDF()
    globs['df5'] = sc.parallelize([
        Row(training="expert", sales=Row(course="dotNET", year=2012, earnings=10000)),
        Row(training="junior", sales=Row(course="Java", year=2012, earnings=20000)),
        Row(training="expert", sales=Row(course="dotNET", year=2012, earnings=5000)),
        Row(training="junior", sales=Row(course="dotNET", year=2013, earnings=48000)),
        Row(training="expert", sales=Row(course="Java", year=2013, earnings=30000))]).toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.group, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    spark.stop()
    if failure_count:
        sys.exit(-1)
Example #24
Source File: dl_runner.py From sparkflow with MIT License | 5 votes |
def create_testing_spark_session(cls):
    return (SparkSession.builder
            .master('local[2]')
            .appName('sparkflow')
            .getOrCreate())
Example #25
Source File: udf.py From LearningApacheSpark with MIT License | 5 votes |
def _create_judf(self):
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

    wrapped_func = _wrap_function(sc, self.func, self.returnType)
    jdt = spark._jsparkSession.parseDataType(self.returnType.json())
    judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
        self._name, wrapped_func, jdt, self.evalType, self.deterministic)
    return judf
Example #26
Source File: test_gapply.py From spark-sklearn with Apache License 2.0 | 5 votes |
def tearDownClass(cls):
    super(GapplyConfTests, cls).tearDownClass()
    # Creating a new SparkSession here seems confusing, but it is necessary because
    # the config is (for some stupid reason...) cached, which would make it get in
    # the way of other tests that expect a default configuration.
    cls.spark = SparkSession.builder \
        .config("spark.sql.retainGroupColumns", "true") \
        .getOrCreate()
Example #27
Source File: image.py From LearningApacheSpark with MIT License | 5 votes |
def readImages(self, path, recursive=False, numPartitions=-1,
               dropImageFailures=False, sampleRatio=1.0, seed=0):
    """
    Reads the directory of images from the local or remote source.

    .. note:: If multiple jobs are run in parallel with different sampleRatio or recursive flag,
        there may be a race condition where one job overwrites the hadoop configs of another.

    .. note:: If sample ratio is less than 1, sampling uses a PathFilter that is efficient but
        potentially non-deterministic.

    .. note:: Deprecated in 2.4.0. Use `spark.read.format("image").load(path)` instead and
        this `readImages` will be removed in 3.0.0.

    :param str path: Path to the image directory.
    :param bool recursive: Recursive search flag.
    :param int numPartitions: Number of DataFrame partitions.
    :param bool dropImageFailures: Drop the files that are not valid images.
    :param float sampleRatio: Fraction of the images loaded.
    :param int seed: Random number seed.
    :return: a :class:`DataFrame` with a single column of "images",
        see ImageSchema for details.

    >>> df = ImageSchema.readImages('data/mllib/images/origin/kittens', recursive=True)
    >>> df.count()
    5

    .. versionadded:: 2.3.0
    """
    warnings.warn("`ImageSchema.readImage` is deprecated. " +
                  "Use `spark.read.format(\"image\").load(path)` instead.",
                  DeprecationWarning)
    spark = SparkSession.builder.getOrCreate()
    image_schema = spark._jvm.org.apache.spark.ml.image.ImageSchema
    jsession = spark._jsparkSession
    jresult = image_schema.readImages(path, jsession, recursive, numPartitions,
                                      dropImageFailures, float(sampleRatio), seed)
    return DataFrame(jresult, spark._wrapped)
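As a companion to the deprecation note in the docstring, a short sketch of the replacement image data source available since Spark 2.4; the directory path is illustrative.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
image_df = spark.read.format("image").load("data/mllib/images/origin/kittens")
# Each row carries a struct column named "image" with origin, width, height, etc.
image_df.select("image.origin", "image.width", "image.height").show(truncate=False)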
Example #28
Source File: conftest.py From data-testing-with-airflow with Apache License 2.0 | 5 votes |
def spark(request):
    """
    Fixture to create the SparkSession.
    """
    spark = SparkSession.builder \
        .appName(APP_NAME) \
        .config('spark.sql.warehouse.dir', '/usr/local/airflow/spark_warehouse') \
        .config('spark.hadoop.javax.jdo.option.ConnectionURL',
                'jdbc:derby:;databaseName=/usr/local/airflow/metastore_db;create=true') \
        .enableHiveSupport() \
        .getOrCreate()

    request.addfinalizer(spark.stop)

    return spark
Example #29
Source File: spark_jdbc_script.py From airflow with Apache License 2.0 | 5 votes |
def _create_spark_session(arguments) -> SparkSession:
    return SparkSession.builder \
        .appName(arguments.name) \
        .enableHiveSupport() \
        .getOrCreate()
Example #30
Source File: test_sparktorch.py From sparktorch with MIT License | 5 votes |
def spark():
    return (SparkSession.builder
            .master('local[2]')
            .appName('sparktorch')
            .getOrCreate())