Python pyspark.sql.SparkSession() Examples
The following are 30 code examples of pyspark.sql.SparkSession(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql, or try the search function.
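As a quick orientation before the project examples, here is a minimal, hedged sketch of the builder pattern that nearly all of them rely on; the application name, master URL, and config key below are placeholders, not taken from any of the projects that follow.

from pyspark.sql import SparkSession

# getOrCreate() returns the active session if one already exists,
# otherwise it builds a new one from the settings below.
spark = (
    SparkSession.builder
    .appName("example-app")                        # placeholder application name
    .master("local[*]")                            # placeholder; usually omitted when using spark-submit
    .config("spark.sql.shuffle.partitions", "8")   # optional tuning, shown for illustration only
    .getOrCreate()
)

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
df.show()
spark.stop()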
Example #1
Source File: recommend.py From treasure-boxes with MIT License

def _prepare_td_spark() -> TDSparkContext:
    """
    Create SparkSession with local mode setting td-spark specific configurations.

    :return: TDSparkContext
    """
    apikey = os.environ["TD_API_KEY"]
    endpoint = os.environ["TD_API_SERVER"]

    site = "us"
    if ".co.jp" in endpoint:
        site = "jp"
    elif "eu01" in endpoint:
        site = "eu01"

    builder = SparkSession.builder.appName("spark_als")
    td = (
        TDSparkContextBuilder(builder)
        .apikey(apikey)
        .site(site)
        .jars(TDSparkContextBuilder.default_jar_path())
        .build()
    )
    return td
Example #2
Source File: helpers.py From search-MjoLniR with MIT License

def require_training_files_partition(self):
    @self._post_process_args.append
    def post(spark: SparkSession, kwargs: Dict) -> None:
        kwargs['training_files'] = read_metadata(kwargs['training_files_path'])

    self.add_argument('--training-files-path', required=True)
Example #3
Source File: helpers.py From search-MjoLniR with MIT License

def require_feature_vectors_partition(self):
    @self._post_process_args.append
    def post(spark: SparkSession, kwargs: Dict) -> None:
        kwargs['feature_vectors'] = HivePartition(
            spark, kwargs['feature_vectors_table'], {
                'date': kwargs['date'],
                'feature_set': kwargs['feature_set']
            }, direct_parquet_read=True)

    self.add_argument('--feature-vectors-table', required=True)
    self.add_argument('--feature-set', required=True)
Example #4
Source File: hyperparam.py From search-MjoLniR with MIT License

def tune_wiki(
    spark: SparkSession,
    folds: List[Mapping[str, str]],
    initial_num_trees: int,
    final_num_trees: int,
    iterations: int,
    num_observations: int,
    num_cv_jobs: Optional[int] = None,
) -> List[Mapping]:
    if num_cv_jobs is None:
        # Default to running all cross validation in parallel, and all
        # hyperopt trials sequentially. Setting num_cv_jobs to a multiple
        # of folds will run multiple trials in parallel.
        num_cv_jobs = len(folds)
    results = mjolnir.training.xgboost.tune(
        folds, {'num_observations': num_observations},
        num_cv_jobs=num_cv_jobs,
        train_matrix='train',
        initial_num_trees=initial_num_trees,
        final_num_trees=final_num_trees,
        iterations=iterations,
        spark=spark)
    # Results contains a set of trials for each stage of tuning. We generally
    # only care about the final params and metrics for all evaluated models,
    # so flatten it out.
    flat_trials = cast(List[Mapping], [])
    for trials in results['trials'].values():
        for trial in trials:
            flat_trials.append(trial)
    return flat_trials
Example #5
Source File: helpers.py From search-MjoLniR with MIT License

def require_labeled_query_page_partition(self):
    @self._post_process_args.append
    def post(spark: SparkSession, kwargs: Dict) -> None:
        kwargs['labeled_query_page'] = HivePartition(
            spark, kwargs['labels_table'], {
                'date': kwargs['date'],
                'algorithm': kwargs['labeling_algorithm'],
            })

    self.add_argument('--labels-table', required=True)
    self.add_argument('--labeling-algorithm')
Example #6
Source File: helpers.py From search-MjoLniR with MIT License

def require_query_clustering_partition(self):
    @self._post_process_args.append
    def post(spark: SparkSession, kwargs: Dict) -> None:
        kwargs['query_clustering'] = HivePartition(
            spark, kwargs['clustering_table'], {
                'date': kwargs['date'],
                'algorithm': kwargs['clustering_algorithm'],
            })

    self.add_argument('--clustering-table', required=True)
    self.add_argument('--clustering-algorithm', required=True)
Example #7
Source File: helpers.py From search-MjoLniR with MIT License

def require_query_clicks_partition(self):
    @self._post_process_args.append
    def post(spark: SparkSession, kwargs: Dict) -> None:
        kwargs['query_clicks'] = HivePartition(
            spark, kwargs['clicks_table'], {
                'date': kwargs['date'],
            })

    self.add_argument('--clicks-table', required=True)
Example #8
Source File: sys_exec.py From cadCAD with MIT License

def to_spark_df(rdd: RDD, spark: SparkSession, init_condition: dict = None):
    # Typefull
    if init_condition is not None:
        return to_spark(rdd, init_condition)
    # Typeless
    else:
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
        warnings.simplefilter(action='ignore', category=UserWarning)
        pdf_from_rdd: DataFrame = to_pandas(rdd)
        result = spark.createDataFrame(pdf_from_rdd)
        del pdf_from_rdd
        return result
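A hedged usage sketch of the helper above; the records RDD is hypothetical and assumes the surrounding cadCAD helpers (to_pandas, to_spark) accept an RDD of simulation records.

# Hypothetical usage: convert an RDD of simulation records into a Spark DataFrame.
rdd = spark.sparkContext.parallelize([
    {"run": 1, "timestep": 0, "x": 0.0},
    {"run": 1, "timestep": 1, "x": 0.5},
])

# With no init_condition, the Arrow-backed pandas conversion path is taken.
sdf = to_spark_df(rdd, spark)
sdf.show()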
Example #9
Source File: copy_dataset.py From petastorm with Apache License 2.0

def _main(sys_argv):
    logging.basicConfig()

    args = args_parser().parse_args(sys_argv)

    # We set spark.sql.files.maxPartitionBytes to a large value since we typically have a small number
    # of rows per rowgroup. Reading a parquet store with default settings would result in an excessively
    # large number of partitions and inefficient processing.
    spark = configure_spark(SparkSession.builder.appName('petastorm-copy'), args) \
        .config('spark.sql.files.maxPartitionBytes', '1010612736') \
        .getOrCreate()

    copy_dataset(spark, args.source_url, args.target_url,
                 args.field_regex, args.not_null_fields, args.overwrite_output,
                 args.partition_count, args.row_group_size_mb,
                 hdfs_driver=args.hdfs_driver)

    spark.stop()
Example #10
Source File: streaming.py From LearningApacheSpark with MIT License

def schema(self, schema):
    """Specifies the input schema.

    Some data sources (e.g. JSON) can infer the input schema automatically from data.
    By specifying the schema here, the underlying data source can skip the schema
    inference step, and thus speed up data loading.

    .. note:: Evolving.

    :param schema: a :class:`pyspark.sql.types.StructType` object or a DDL-formatted string
                   (For example ``col0 INT, col1 DOUBLE``).

    >>> s = spark.readStream.schema(sdf_schema)
    >>> s = spark.readStream.schema("col0 INT, col1 DOUBLE")
    """
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()
    if isinstance(schema, StructType):
        jschema = spark._jsparkSession.parseDataType(schema.json())
        self._jreader = self._jreader.schema(jschema)
    elif isinstance(schema, basestring):
        self._jreader = self._jreader.schema(schema)
    else:
        raise TypeError("schema should be StructType or string")
    return self
Example #11
Source File: dataframe.py From LearningApacheSpark with MIT License

def registerTempTable(self, name):
    """Registers this DataFrame as a temporary table using the given name.

    The lifetime of this temporary table is tied to the :class:`SparkSession`
    that was used to create this :class:`DataFrame`.

    >>> df.registerTempTable("people")
    >>> df2 = spark.sql("select * from people")
    >>> sorted(df.collect()) == sorted(df2.collect())
    True
    >>> spark.catalog.dropTempView("people")

    .. note:: Deprecated in 2.0, use createOrReplaceTempView instead.
    """
    warnings.warn(
        "Deprecated in 2.0, use createOrReplaceTempView instead.",
        DeprecationWarning)
    self._jdf.createOrReplaceTempView(name)
Example #12
Source File: dataframe.py From LearningApacheSpark with MIT License

def createOrReplaceTempView(self, name):
    """Creates or replaces a local temporary view with this DataFrame.

    The lifetime of this temporary table is tied to the :class:`SparkSession`
    that was used to create this :class:`DataFrame`.

    >>> df.createOrReplaceTempView("people")
    >>> df2 = df.filter(df.age > 3)
    >>> df2.createOrReplaceTempView("people")
    >>> df3 = spark.sql("select * from people")
    >>> sorted(df3.collect()) == sorted(df2.collect())
    True
    >>> spark.catalog.dropTempView("people")
    """
    self._jdf.createOrReplaceTempView(name)
Example #13
Source File: pyspark.py From omniduct with MIT License

def _init(self, app_name='omniduct', config=None, master=None, enable_hive_support=False):
    """
    Args:
        app_name (str): The application name of the SparkSession.
        config (dict or None): Any additional configuration to pass through
            to the SparkSession builder.
        master (str): The Spark master URL to connect to (only necessary
            if environment specified configuration is missing).
        enable_hive_support (bool): Whether to enable Hive support for the
            Spark session.

    Note: Pyspark must be installed in order to use this backend.
    """
    self.app_name = app_name
    self.config = config or {}
    self.master = master
    self.enable_hive_support = enable_hive_support
    self._spark_session = None

# Connection management
Example #14
Source File: td-spark.py From treasure-boxes with MIT License

def upload_dataframe(
    database_name: str, table_name: str, td_spark: Optional[TDSparkContext] = None
) -> None:
    """Create Pandas DataFrame and upload it to Treasure Data

    :param database_name: Target database name on Treasure Data
    :param table_name: Target table name on Treasure Data
    :param td_spark: [Optional] TDSparkContext
    """
    import numpy as np
    import pandas as pd

    if td_spark is None:
        td_spark = _prepare_td_spark()

    spark = td_spark.spark

    df = pd.DataFrame({"c": np.random.binomial(10, 0.5, 10)})
    sdf = spark.createDataFrame(df)
    td_spark.create_database_if_not_exists(database_name)
    td_spark.create_or_replace(sdf, f"{database_name}.{table_name}")
Example #15
Source File: td-spark.py From treasure-boxes with MIT License

def process_data(
    database_name: str, table_name: str, td_spark: Optional[TDSparkContext] = None
) -> None:
    """Load a Treasure Data table and upload it to Treasure Data after PySpark processing.

    :param database_name: Target database name on Treasure Data
    :param table_name: Target table name on Treasure Data
    :param td_spark: [Optional] TDSparkContext
    """
    if td_spark is None:
        td_spark = _prepare_td_spark()

    # Read sample_datasets from TD table
    access_df = td_spark.table("sample_datasets.www_access").df()

    # Process with PySpark
    processed_df = access_df.filter("method = 'GET'").withColumn(
        "time_str", func.from_unixtime("time")
    )

    # Upload processed Spark DataFrame to TD
    td_spark.create_database_if_not_exists(database_name)
    td_spark.create_or_replace(processed_df, f"{database_name}.{table_name}")
Example #16
Source File: util.py From datadevops with MIT License

def save_overwrite_unmanaged_table(spark: SparkSession, dataframe: DataFrame, table_name: str, path: str):
    """When trying to read and overwrite the same table, you get this error:
    'Cannot overwrite table dw.dim_parking_bay that is also being read from;'
    This utility function works around this by saving to a temporary table first,
    prior to overwriting.
    """
    temp_table_name = table_name + "___temp"
    spark.sql("DROP TABLE IF EXISTS " + temp_table_name).collect()
    # Save temp table
    dataframe.write.saveAsTable(temp_table_name)
    # Read temp table and overwrite original table
    spark.read.table(temp_table_name)\
        .write.mode("overwrite")\
        .option("path", path)\
        .saveAsTable(table_name)
    # Drop temp table
    spark.sql("DROP TABLE " + temp_table_name).collect()
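A short usage sketch of the helper above, assuming the DataFrame being written was derived from the very table it overwrites; the table name and storage path are placeholders.

from pyspark.sql import functions as F

# Reading dw.dim_parking_bay and writing straight back to it would fail with the
# "Cannot overwrite table ... that is also being read from" error, so route the
# write through the temporary-table helper instead.
dim_df = (
    spark.read.table("dw.dim_parking_bay")            # placeholder table name
    .withColumn("loaded_on", F.current_timestamp())
)
save_overwrite_unmanaged_table(
    spark,
    dim_df,
    table_name="dw.dim_parking_bay",
    path="abfss://datalake/dw/dim_parking_bay",       # placeholder storage path
)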
Example #17
Source File: df_naive.py From example_dataproc_twitter with MIT License

def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    sess.udf.register("SQUARED", self.squared, returnType=(
        stypes.ArrayType(stypes.StructType(
            fields=[stypes.StructField('sku0', stypes.StringType()),
                    stypes.StructField('norm', stypes.FloatType())]))))

    sess.udf.register('INTERSECTIONS', self.process_intersections,
        returnType=stypes.ArrayType(stypes.StructType(fields=[
            stypes.StructField('sku0', stypes.StringType()),
            stypes.StructField('sku1', stypes.StringType()),
            stypes.StructField('cor', stypes.FloatType())])))
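Once registered, these UDFs are referenced by name inside SQL strings. The project's squared and process_intersections implementations are not shown here, so the illustration below registers a simpler, hypothetical UDF to show the call pattern:

from pyspark.sql import types as stypes

# Hypothetical stand-in UDF, registered under a name and then invoked from SQL.
spark.udf.register("PLUS_ONE", lambda x: x + 1, returnType=stypes.LongType())

spark.range(3).createOrReplaceTempView("nums")
spark.sql("SELECT id, PLUS_ONE(id) AS id_plus_one FROM nums").show()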
Example #18
Source File: __init__.py From listenbrainz-server with GNU General Public License v2.0

def init_spark_session(app_name):
    """Initializes a Spark Session with the given application name.

    Args:
        app_name (str): Name of the Spark application. This will also occur in the Spark UI.
    """
    global session, context, sql_context
    try:
        session = SparkSession \
            .builder \
            .appName(app_name) \
            .config("spark.hadoop.dfs.client.use.datanode.hostname", "true") \
            .config("spark.hadoop.dfs.datanode.use.datanode.hostname", "true") \
            .config("spark.driver.maxResultSize", "4g") \
            .getOrCreate()
        context = session.sparkContext
        context.setLogLevel("ERROR")
        sql_context = SQLContext(context)
    except Py4JJavaError as err:
        raise SparkSessionNotInitializedException(app_name, err.java_exception)
Example #19
Source File: customReportService.py From mmtf-pyspark with Apache License 2.0

def _concat_ids(spark, dataset, columnNames):
    """Concatenates structureId and chainId fields into a single key
    if chainId field is present

    Parameters
    ----------
    spark : :obj:`SparkSession <pyspark.sql.SparkSession>`
    dataset : Dataframe
    columnNames : list
    """
    if "chainId" in dataset.columns:
        dataset.createOrReplaceTempView("table")
        sql = "SELECT CONCAT(structureId,'.',chainId) as structureChainId," + \
              "structureId,chainId,%s" % ','.join(columnNames) + \
              " from table"
        dataset = spark.sql(sql)

    return dataset
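A hedged illustration of the helper above with a toy DataFrame; the column values are invented and only show that the concatenated structureChainId key is produced when a chainId column exists.

# Toy input with structureId and chainId columns (values are made up).
ds = spark.createDataFrame(
    [("1ABC", "A", 2.0), ("1ABC", "B", 2.0)],
    ["structureId", "chainId", "resolution"],
)

ds = _concat_ids(spark, ds, ["resolution"])
ds.show()   # structureChainId values such as "1ABC.A" appear alongside the original columns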
Example #20
Source File: testconfig.py From SMV with Apache License 2.0

def sparkSession(cls):
    if not hasattr(cls, "spark"):
        # We can't use the SparkSession Builder here, since we need to call
        # Scala side's SmvTestHive.createContext to create the HiveTestContext's
        # SparkSession.
        # So we need to
        #   * Create a java_gateway
        #   * Create a SparkConf using the jgw (since without it SparkContext will ignore the given conf)
        #   * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
        #   * Create Scala side HiveTestContext SparkSession
        #   * Create python SparkSession
        jgw = launch_gateway(None)
        jvm = jgw.jvm
        import tempfile
        import getpass
        hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())
        sConf = SparkConf(False, _jvm=jvm) \
            .set("spark.sql.test", "") \
            .set("spark.sql.hive.metastore.barrierPrefixes",
                 "org.apache.spark.sql.hive.execution.PairSerDe") \
            .set("spark.sql.warehouse.dir", hivedir) \
            .set("spark.ui.enabled", "false")
        sc = SparkContext(master="local[1]", appName="SMV Python Test",
                          conf=sConf, gateway=jgw).getOrCreate()
        jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
        cls.spark = SparkSession(sc, jss.sparkSession())
    return cls.spark
Example #21
Source File: readwriter.py From LearningApacheSpark with MIT License

def schema(self, schema):
    """Specifies the input schema.

    Some data sources (e.g. JSON) can infer the input schema automatically from data.
    By specifying the schema here, the underlying data source can skip the schema
    inference step, and thus speed up data loading.

    :param schema: a :class:`pyspark.sql.types.StructType` object or a DDL-formatted string
                   (For example ``col0 INT, col1 DOUBLE``).

    >>> s = spark.read.schema("col0 INT, col1 DOUBLE")
    """
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()
    if isinstance(schema, StructType):
        jschema = spark._jsparkSession.parseDataType(schema.json())
        self._jreader = self._jreader.schema(jschema)
    elif isinstance(schema, basestring):
        self._jreader = self._jreader.schema(schema)
    else:
        raise TypeError("schema should be StructType or string")
    return self
Example #22
Source File: dataframe.py From LearningApacheSpark with MIT License

def createTempView(self, name):
    """Creates a local temporary view with this DataFrame.

    The lifetime of this temporary table is tied to the :class:`SparkSession`
    that was used to create this :class:`DataFrame`.
    throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the
    catalog.

    >>> df.createTempView("people")
    >>> df2 = spark.sql("select * from people")
    >>> sorted(df.collect()) == sorted(df2.collect())
    True
    >>> df.createTempView("people")  # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    ...
    AnalysisException: u"Temporary table 'people' already exists;"
    >>> spark.catalog.dropTempView("people")
    """
    self._jdf.createTempView(name)
Example #23
Source File: __init__.py From listenbrainz-server with GNU General Public License v2.0

def init_test_session(app_name):
    global session, context, sql_context
    try:
        session = SparkSession \
            .builder \
            .master('local') \
            .appName(app_name) \
            .config("spark.hadoop.dfs.client.use.datanode.hostname", "true") \
            .config("spark.hadoop.dfs.datanode.use.datanode.hostname", "true") \
            .getOrCreate()
        context = session.sparkContext
        context.setLogLevel("ERROR")
        sql_context = SQLContext(context)
    except Py4JJavaError as err:
        raise SparkSessionNotInitializedException(app_name, err.java_exception)
Example #24
Source File: sql.py From koalas with Apache License 2.0

def __init__(self, scope: Dict[str, Any], statement: str, session: SparkSession):
    self._scope = scope
    self._statement = statement
    # All the temporary views created when executing this statement.
    # The key is the name of the variable in {}.
    # The value is the cached Spark DataFrame.
    self._temp_views = {}  # type: Dict[str, SDataFrame]
    # All the other variables, converted to a normalized form.
    # The normalized form is typically a string.
    self._cached_vars = {}  # type: Dict[str, Any]
    # The SQL statement after:
    # - all the dataframes have been registered as temporary views
    # - all the values have been normalized to equivalent SQL representations
    self._normalized_statement = None  # type: Optional[str]
    self._session = session
Example #25
Source File: trigger_maven.py From dagster with Apache License 2.0

def spark_session():
    spark = (
        SparkSession.builder.appName("DownloadStuff")
        .config(
            'spark.jars.packages',
            'com.databricks:spark-avro_2.11:3.0.0,'
            'com.databricks:spark-redshift_2.11:2.0.1,'
            'com.databricks:spark-csv_2.11:1.5.0,'
            'org.postgresql:postgresql:42.2.5,'
            'org.apache.hadoop:hadoop-aws:2.6.5,'
            'com.amazonaws:aws-java-sdk:1.7.4',
        )
        .getOrCreate()
    )
    return spark
Example #26
Source File: backend.py From joblib-spark with Apache License 2.0

def __init__(self, **backend_args):
    super(SparkDistributedBackend, self).__init__(**backend_args)
    self._pool = None
    self._n_jobs = None
    self._spark = SparkSession \
        .builder \
        .appName("JoblibSparkBackend") \
        .getOrCreate()
    self._job_group = "joblib-spark-job-group-" + str(uuid.uuid4())
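In normal use this backend is not constructed directly; it is registered with joblib and selected through parallel_backend. A hedged sketch, assuming the package's register_spark entry point:

from joblib import Parallel, delayed, parallel_backend
from joblibspark import register_spark   # assumed public entry point of joblib-spark

def square(x):
    return x * x

register_spark()   # makes the "spark" backend shown above available to joblib

with parallel_backend("spark", n_jobs=4):
    results = Parallel()(delayed(square)(i) for i in range(10))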
Example #27
Source File: addon_aggregates.py From python_mozetl with MIT License

def load_main_summary(spark, input_bucket, input_prefix, input_version):
    """
    Loads main_summary from the bucket constructed from
    input_bucket, input_prefix, input_version

    :param spark: SparkSession object
    :param input_bucket: s3 bucket (telemetry-parquet)
    :param input_prefix: s3 prefix (main_summary)
    :param input_version: dataset version (v4)
    :return SparkDF
    """
    dest = get_dest(input_bucket, input_prefix, input_version)
    print("loading...", dest)
    return spark.read.option("mergeSchema", True).parquet(dest)
Example #28
Source File: xgboost.py From search-MjoLniR with MIT License

def trainWithFilesRemote(
    spark: SparkSession,
    fold: Mapping[str, str],
    train_matrix: str,
    params: Mapping[str, Any],
    **kwargs
) -> 'XGBoostModel':
    """Train model on a single remote spark executor.

    Silly hack to train models inside the yarn cluster. To train multiple
    models in parallel python threads will need to be used. Wish pyspark
    had collectAsync.
    """
    nthread = int(spark.conf.get('spark.task.cpus', '1'))
    if 'nthread' not in params:
        params = dict(params, nthread=nthread)
    elif params['nthread'] != nthread:
        raise Exception("Executors have [{}] cpus but training requested [{}]".format(
            nthread, params['nthread']))
    return (
        spark.sparkContext
        .parallelize([1], 1)
        .map(lambda x: XGBoostModel.trainWithFiles(fold, train_matrix, params, **kwargs))
        .collect()[0]
    )
Example #29
Source File: gradient_descent_example.py From intro_ds with Apache License 2.0

def startSpark():
    """
    Create a SparkSession, which is the entry point of a Spark program.
    """
    spark = SparkSession.builder.appName("gd_example").getOrCreate()
    return spark
Example #30
Source File: pyspark.py From omniduct with MIT License

def _connect(self):
    from pyspark.sql import SparkSession

    builder = SparkSession.builder.appName(self.app_name)
    if self.master:
        builder.master(self.master)
    if self.enable_hive_support:
        builder.enableHiveSupport()
    if self.config:
        for key, value in self.config.items():
            builder.config(key, value)
    self._spark_session = builder.getOrCreate()