Python pyspark.sql.SparkSession.builder() Examples
The following are 30 code examples of pyspark.sql.SparkSession.builder(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.SparkSession, or try the search function.
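Before the project-specific examples, here is a minimal self-contained sketch of the builder pattern they all share; the master URL, application name, and config key are illustrative placeholders rather than values taken from any project below.

from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session if one exists,
# otherwise it builds a new one with the settings accumulated on the builder.
spark = SparkSession.builder \
    .master("local[2]") \
    .appName("builder-example") \
    .config("spark.sql.shuffle.partitions", "4") \
    .getOrCreate()

spark.stop()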
Example #1
Source File: data_source_provider.py From marvin-python-toolbox with Apache License 2.0 | 6 votes |
def get_spark_session(enable_hive=False, app_name='marvin-engine', configs=[]):
    """Return a Spark Session object"""
    # Prepare spark context to be used
    import findspark
    findspark.init()

    from pyspark.sql import SparkSession

    # prepare spark session to be returned
    spark = SparkSession.builder
    spark = spark.appName(app_name)
    spark = spark.enableHiveSupport() if enable_hive else spark

    # apply any extra configs
    for config in configs:
        spark = spark.config(config)

    return spark.getOrCreate()
Example #2
Source File: functions.py From LearningApacheSpark with MIT License | 6 votes |
def _test():
    import doctest
    import sys

    from pyspark.sql import Row, SparkSession
    import pyspark.sql.functions

    globs = pyspark.sql.functions.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.functions tests")\
        .getOrCreate()
    sc = spark.sparkContext
    globs['sc'] = sc
    globs['spark'] = spark
    globs['df'] = spark.createDataFrame([Row(name='Alice', age=2), Row(name='Bob', age=5)])
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.functions, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    spark.stop()
    if failure_count:
        sys.exit(-1)
Example #3
Source File: distributed.py From LearningApacheSpark with MIT License | 6 votes |
def _test():
    import doctest
    import sys

    import numpy
    from pyspark.sql import SparkSession
    from pyspark.mllib.linalg import Matrices
    import pyspark.mllib.linalg.distributed

    try:
        # NumPy 1.14+ changed its string format.
        numpy.set_printoptions(legacy='1.13')
    except TypeError:
        pass
    globs = pyspark.mllib.linalg.distributed.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("mllib.linalg.distributed tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext
    globs['Matrices'] = Matrices
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    spark.stop()
    if failure_count:
        sys.exit(-1)
Example #4
Source File: udf.py From LearningApacheSpark with MIT License | 6 votes |
def _test():
    import doctest
    import sys

    from pyspark.sql import SparkSession
    import pyspark.sql.udf

    globs = pyspark.sql.udf.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.udf tests")\
        .getOrCreate()
    globs['spark'] = spark
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.udf, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    spark.stop()
    if failure_count:
        sys.exit(-1)
Example #5
Source File: column.py From LearningApacheSpark with MIT License | 6 votes |
def cast(self, dataType):
    """ Convert the column into type ``dataType``.

    >>> df.select(df.age.cast("string").alias('ages')).collect()
    [Row(ages=u'2'), Row(ages=u'5')]
    >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
    [Row(ages=u'2'), Row(ages=u'5')]
    """
    if isinstance(dataType, basestring):
        jc = self._jc.cast(dataType)
    elif isinstance(dataType, DataType):
        from pyspark.sql import SparkSession
        spark = SparkSession.builder.getOrCreate()
        jdt = spark._jsparkSession.parseDataType(dataType.json())
        jc = self._jc.cast(jdt)
    else:
        raise TypeError("unexpected type: %s" % type(dataType))
    return Column(jc)
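The doctests above rely on a pre-built df; here is a self-contained sketch of the same two call forms, with column names and values chosen purely for illustration.

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

spark = SparkSession.builder.master("local[1]").appName("cast-example").getOrCreate()
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], ['age', 'name'])

# The string form and the DataType form produce the same cast.
df.select(df.age.cast("string").alias('ages')).show()
df.select(df.age.cast(StringType()).alias('ages')).show()

spark.stop()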
Example #6
Source File: column.py From LearningApacheSpark with MIT License | 6 votes |
def _test():
    import doctest
    import sys

    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, IntegerType, StringType
    import pyspark.sql.column

    globs = pyspark.sql.column.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.column tests")\
        .getOrCreate()
    sc = spark.sparkContext
    globs['spark'] = spark
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.column, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    spark.stop()
    if failure_count:
        sys.exit(-1)
Example #7
Source File: fpm.py From LearningApacheSpark with MIT License | 6 votes |
def _test():
    import doctest
    import sys

    from pyspark.sql import SparkSession
    import pyspark.mllib.fpm

    globs = pyspark.mllib.fpm.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("mllib.fpm tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext

    import tempfile

    temp_path = tempfile.mkdtemp()
    globs['temp_path'] = temp_path
    try:
        (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
        spark.stop()
    finally:
        from shutil import rmtree
        try:
            rmtree(temp_path)
        except OSError:
            pass
    if failure_count:
        sys.exit(-1)
Example #8
Source File: evaluation.py From LearningApacheSpark with MIT License | 6 votes |
def _test():
    import doctest
    import sys

    import numpy
    from pyspark.sql import SparkSession
    import pyspark.mllib.evaluation

    try:
        # NumPy 1.14+ changed its string format.
        numpy.set_printoptions(legacy='1.13')
    except TypeError:
        pass
    globs = pyspark.mllib.evaluation.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("mllib.evaluation tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    spark.stop()
    if failure_count:
        sys.exit(-1)
Example #9
Source File: fixtures.py From pytest-spark with MIT License | 6 votes |
def _spark_session():
    """Internal fixture for SparkSession instance.

    Yields SparkSession instance if it is supported by the pyspark
    version, otherwise yields None.

    Required to correctly initialize `spark_context` fixture after
    `spark_session` fixture.

    ..note::
        It is not possible to create SparkSession from the existing
        SparkContext.
    """
    try:
        from pyspark.sql import SparkSession
    except ImportError:
        yield
    else:
        session = SparkSession.builder \
            .config(conf=SparkConfigBuilder().get()) \
            .getOrCreate()

        yield session
        session.stop()
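For context, a hedged sketch of how a test typically consumes the plugin's spark_session fixture once pytest-spark is installed; the DataFrame contents are illustrative.

def test_row_count(spark_session):
    # pytest-spark injects the shared SparkSession built by the fixture above.
    df = spark_session.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'label'])
    assert df.count() == 2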
Example #10
Source File: spark.py From mlflow with Apache License 2.0 | 6 votes |
def _load_pyfunc(path):
    """
    Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``.

    :param path: Local filesystem path to the MLflow Model with the ``spark`` flavor.
    """
    # NOTE: The getOrCreate() call below may change settings of the active session which we do not
    # intend to do here. In particular, setting master to local[1] can break distributed clusters.
    # To avoid this problem, we explicitly check for an active session. This is not ideal but there
    # is no good workaround at the moment.
    import pyspark

    spark = pyspark.sql.SparkSession._instantiatedSession
    if spark is None:
        spark = pyspark.sql.SparkSession.builder.config("spark.python.worker.reuse", True) \
            .master("local[1]").getOrCreate()
    return _PyFuncModelWrapper(spark, _load_model(model_uri=path))
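The note above motivates reusing an already-active session instead of letting getOrCreate() silently reconfigure it. A minimal sketch of the same idea using the public SparkSession.getActiveSession() API (available in PySpark 3.0+); the fallback master setting is illustrative and this is not the mlflow implementation.

from pyspark.sql import SparkSession

def _get_session_without_reconfiguring():
    # Prefer whatever session the surrounding application already created.
    active = SparkSession.getActiveSession()
    if active is not None:
        return active
    # Fall back to a small local session only when nothing is running yet.
    return SparkSession.builder.master("local[1]").getOrCreate()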
Example #11
Source File: populate_tables.py From data-testing-with-airflow with Apache License 2.0 | 6 votes |
def spark():
    spark = SparkSession.builder \
        .config('spark.sql.warehouse.dir', '/usr/local/airflow/spark_warehouse') \
        .config('spark.hadoop.javax.jdo.option.ConnectionURL',
                'jdbc:derby:;databaseName=/usr/local/airflow/metastore_db;create=true') \
        .enableHiveSupport() \
        .getOrCreate()

    # Now populate some tables
    for database_name in ['dev_app', 'tst_app', 'acc_app', 'prd_app', 'transaction_a', 'transaction_b']:
        spark.sql('DROP DATABASE IF EXISTS {0} CASCADE'.format(database_name)).collect()
        spark.sql('CREATE DATABASE {0}'.format(database_name)).collect()

    populate_transaction_a(spark)
    populate_transaction_b(spark)

    for environment in ['dev', 'tst', 'acc', 'prd']:
        populate_account_info(spark, environment)
        populate_countries(spark, environment)
Example #12
Source File: conftest.py From data-testing-with-airflow with Apache License 2.0 | 6 votes |
def spark(request):
    spark = SparkSession.builder \
        .master('local[*]') \
        .enableHiveSupport() \
        .getOrCreate()

    # Now populate some tables
    for database_name in ['tst_app', 'transaction_a', 'transaction_b']:
        spark.sql('DROP DATABASE IF EXISTS {0} CASCADE'.format(database_name)).collect()
        spark.sql('CREATE DATABASE {0}'.format(database_name))

    populate_transaction_a(spark)
    populate_transaction_b(spark)
    populate_account_info(spark)
    populate_countries(spark)

    return spark
Example #13
Source File: utils.py From mlflow with Apache License 2.0 | 5 votes |
def _get_or_create_spark_session(jars=None):
    jar_path = jars if jars is not None else _get_mlflow_spark_jar_path()
    return SparkSession.builder \
        .config("spark.jars", jar_path) \
        .master("local[*]") \
        .getOrCreate()
Example #14
Source File: utils.py From mlflow with Apache License 2.0 | 5 votes |
def spark_session():
    jar_path = _get_mlflow_spark_jar_path()
    session = SparkSession.builder \
        .config("spark.jars", jar_path) \
        .master("local[*]") \
        .getOrCreate()
    yield session
    session.stop()
Example #15
Source File: ga_chp_bq_ingest_avro_file.py From MorphL-Community-Edition with Apache License 2.0 | 5 votes |
def main():
    spark_session = (
        SparkSession.builder
        .appName(APPLICATION_NAME)
        .master(MASTER_URL)
        .config('spark.cassandra.connection.host', MORPHL_SERVER_IP_ADDRESS)
        .config('spark.cassandra.auth.username', MORPHL_CASSANDRA_USERNAME)
        .config('spark.cassandra.auth.password', MORPHL_CASSANDRA_PASSWORD)
        .config('spark.sql.shuffle.partitions', 16)
        .getOrCreate())

    log4j = spark_session.sparkContext._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

    avro_df = (
        spark_session
        .read
        .format('avro')
        .load(LOCAL_AVRO_FILE))

    save_options_ga_chp_bq_features_raw = {
        'keyspace': MORPHL_CASSANDRA_KEYSPACE,
        'table': ('ga_chp_bq_features_raw_t'
                  if TRAINING_OR_PREDICTION == 'training'
                  else 'ga_chp_bq_features_raw_p')
    }

    (avro_df
        .withColumn('day_of_data_capture', f.lit(DAY_OF_DATA_CAPTURE))
        .withColumn('website_url', f.lit(WEBSITE_URL))
        .write
        .format('org.apache.spark.sql.cassandra')
        .mode('append')
        .options(**save_options_ga_chp_bq_features_raw)
        .save())
Example #16
Source File: test_standardize.py From datadevops with MIT License | 5 votes |
def spark():
    """Spark Session fixture
    """
    from pyspark.sql import SparkSession

    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("Unit Testing")\
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    return spark
Example #17
Source File: test_transform.py From datadevops with MIT License | 5 votes |
def spark():
    """Spark Session fixture
    """
    from pyspark.sql import SparkSession

    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("Unit Testing")\
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    return spark
Example #18
Source File: dml_script.py From PerfKitBenchmarker with Apache License 2.0 | 5 votes |
def main():
    spark = (SparkSession.builder
             .appName('Setup Spark table')
             .enableHiveSupport()
             .getOrCreate())
    table = 'warehouse'
    table_dir = sys.argv[1]
    # clean up previous table
    spark.sql('drop table if exists ' + table)
    # register new table
    spark.catalog.createTable(table, table_dir, source='parquet')
Example #19
Source File: resources.py From dagster with Apache License 2.0 | 5 votes |
def spark_session_from_config(spark_conf=None):
    spark_conf = check.opt_dict_param(spark_conf, 'spark_conf')
    builder = SparkSession.builder
    flat = flatten_dict(spark_conf)
    for key, value in flat:
        builder = builder.config(key, value)

    return builder.getOrCreate()
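The helper above leans on dagster's check and flatten_dict utilities, which are not shown here. A rough equivalent driven by a plain flat dict of Spark options (the function and option names are illustrative, not part of the dagster source).

from pyspark.sql import SparkSession

def session_from_options(options=None):
    # options maps Spark config keys to values,
    # e.g. {"spark.executor.memory": "2g", "spark.sql.shuffle.partitions": "8"}.
    builder = SparkSession.builder
    for key, value in (options or {}).items():
        builder = builder.config(key, value)
    return builder.getOrCreate()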
Example #20
Source File: spark_table.py From PerfKitBenchmarker with Apache License 2.0 | 5 votes |
def main():
    spark = (SparkSession.builder
             .appName('Setup Spark tables')
             .enableHiveSupport()
             .getOrCreate())
    root_dir = sys.argv[1]
    tables = sys.argv[2].split(',')
    for table in tables:
        table_dir = os.path.join(root_dir, table)
        # clean up previous table
        spark.sql('drop table if exists ' + table)
        # register new table
        spark.catalog.createTable(table, table_dir, source='parquet')
Example #21
Source File: util.py From spark-sklearn with Apache License 2.0 | 5 votes |
def createLocalSparkSession(appName="spark-sklearn"): """Generates a :class:`SparkSession` utilizing all local cores with the progress bar disabled but otherwise default config.""" return SparkSession.builder \ .master("local[*]") \ .appName(appName) \ .config("spark.ui.showConsoleProgress", "false") \ .getOrCreate()
Example #22
Source File: test_gapply.py From spark-sklearn with Apache License 2.0 | 5 votes |
def setUpClass(cls):
    super(GapplyConfTests, cls).setUpClass()
    cls.spark = SparkSession.builder \
        .config("spark.sql.retainGroupColumns", "false") \
        .getOrCreate()
Example #23
Source File: group.py From LearningApacheSpark with MIT License | 5 votes |
def _test():
    import doctest
    import sys

    from pyspark.sql import Row, SparkSession
    from pyspark.sql.types import StructType, StructField, IntegerType, StringType
    import pyspark.sql.group

    globs = pyspark.sql.group.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.group tests")\
        .getOrCreate()
    sc = spark.sparkContext
    globs['sc'] = sc
    globs['spark'] = spark
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80),
                                   Row(name='Bob', age=5, height=85)]).toDF()
    globs['df4'] = sc.parallelize([Row(course="dotNET", year=2012, earnings=10000),
                                   Row(course="Java", year=2012, earnings=20000),
                                   Row(course="dotNET", year=2012, earnings=5000),
                                   Row(course="dotNET", year=2013, earnings=48000),
                                   Row(course="Java", year=2013, earnings=30000)]).toDF()
    globs['df5'] = sc.parallelize([
        Row(training="expert", sales=Row(course="dotNET", year=2012, earnings=10000)),
        Row(training="junior", sales=Row(course="Java", year=2012, earnings=20000)),
        Row(training="expert", sales=Row(course="dotNET", year=2012, earnings=5000)),
        Row(training="junior", sales=Row(course="dotNET", year=2013, earnings=48000)),
        Row(training="expert", sales=Row(course="Java", year=2013, earnings=30000))]).toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.group, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    spark.stop()
    if failure_count:
        sys.exit(-1)
Example #24
Source File: dl_runner.py From sparkflow with MIT License | 5 votes |
def create_testing_spark_session(cls):
    return (SparkSession.builder
            .master('local[2]')
            .appName('sparkflow')
            .getOrCreate())
Example #25
Source File: udf.py From LearningApacheSpark with MIT License | 5 votes |
def _create_judf(self):
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

    wrapped_func = _wrap_function(sc, self.func, self.returnType)
    jdt = spark._jsparkSession.parseDataType(self.returnType.json())
    judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
        self._name, wrapped_func, jdt, self.evalType, self.deterministic)
    return judf
Example #26
Source File: test_gapply.py From spark-sklearn with Apache License 2.0 | 5 votes |
def tearDownClass(cls):
    super(GapplyConfTests, cls).tearDownClass()
    # Creating a new SparkSession here seems confusing, but it is necessary because
    # the config is (for some stupid reason...) cached, which would make it get in
    # the way of other tests that expect a default configuration.
    cls.spark = SparkSession.builder \
        .config("spark.sql.retainGroupColumns", "true") \
        .getOrCreate()
Example #27
Source File: image.py From LearningApacheSpark with MIT License | 5 votes |
def readImages(self, path, recursive=False, numPartitions=-1,
               dropImageFailures=False, sampleRatio=1.0, seed=0):
    """
    Reads the directory of images from the local or remote source.

    .. note:: If multiple jobs are run in parallel with different sampleRatio or recursive flag,
        there may be a race condition where one job overwrites the hadoop configs of another.

    .. note:: If sample ratio is less than 1, sampling uses a PathFilter that is efficient but
        potentially non-deterministic.

    .. note:: Deprecated in 2.4.0. Use `spark.read.format("image").load(path)` instead and
        this `readImages` will be removed in 3.0.0.

    :param str path: Path to the image directory.
    :param bool recursive: Recursive search flag.
    :param int numPartitions: Number of DataFrame partitions.
    :param bool dropImageFailures: Drop the files that are not valid images.
    :param float sampleRatio: Fraction of the images loaded.
    :param int seed: Random number seed.
    :return: a :class:`DataFrame` with a single column of "images",
        see ImageSchema for details.

    >>> df = ImageSchema.readImages('data/mllib/images/origin/kittens', recursive=True)
    >>> df.count()
    5

    .. versionadded:: 2.3.0
    """
    warnings.warn("`ImageSchema.readImage` is deprecated. " +
                  "Use `spark.read.format(\"image\").load(path)` instead.",
                  DeprecationWarning)
    spark = SparkSession.builder.getOrCreate()
    image_schema = spark._jvm.org.apache.spark.ml.image.ImageSchema
    jsession = spark._jsparkSession
    jresult = image_schema.readImages(path, jsession, recursive, numPartitions,
                                      dropImageFailures, float(sampleRatio), seed)
    return DataFrame(jresult, spark._wrapped)
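As a companion to the deprecation note in the docstring, a short sketch of the replacement image data source available since Spark 2.4; the directory path is illustrative.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
image_df = spark.read.format("image").load("data/mllib/images/origin/kittens")
# Each row carries a struct column named "image" with origin, width, height, etc.
image_df.select("image.origin", "image.width", "image.height").show(truncate=False)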
Example #28
Source File: conftest.py From data-testing-with-airflow with Apache License 2.0 | 5 votes |
def spark(request):
    """
    Fixture to create the SparkSession.
    """
    spark = SparkSession.builder \
        .appName(APP_NAME) \
        .config('spark.sql.warehouse.dir', '/usr/local/airflow/spark_warehouse') \
        .config('spark.hadoop.javax.jdo.option.ConnectionURL',
                'jdbc:derby:;databaseName=/usr/local/airflow/metastore_db;create=true') \
        .enableHiveSupport() \
        .getOrCreate()

    request.addfinalizer(spark.stop)

    return spark
Example #29
Source File: spark_jdbc_script.py From airflow with Apache License 2.0 | 5 votes |
def _create_spark_session(arguments) -> SparkSession:
    return SparkSession.builder \
        .appName(arguments.name) \
        .enableHiveSupport() \
        .getOrCreate()
Example #30
Source File: test_sparktorch.py From sparktorch with MIT License | 5 votes |
def spark():
    return (SparkSession.builder
            .master('local[2]')
            .appName('sparktorch')
            .getOrCreate())