Python pyspark.sql.SparkSession() Examples
The following are 30 code examples of pyspark.sql.SparkSession(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql, or try the search function.
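As a quick orientation before the project examples, here is a minimal, hedged sketch of the builder pattern that nearly all of them rely on; the application name, master URL, and config key below are placeholders, not taken from any of the projects that follow.

from pyspark.sql import SparkSession

# getOrCreate() returns the active session if one already exists,
# otherwise it builds a new one from the settings below.
spark = (
    SparkSession.builder
    .appName("example-app")                        # placeholder application name
    .master("local[*]")                            # placeholder; usually omitted when using spark-submit
    .config("spark.sql.shuffle.partitions", "8")   # optional tuning, shown for illustration only
    .getOrCreate()
)

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
df.show()
spark.stop()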
Example #1
Source File: recommend.py From treasure-boxes with MIT License

def _prepare_td_spark() -> TDSparkContext:
    """
    Create SparkSession with local mode setting td-spark specific configurations.

    :return: TDSparkContext
    """
    apikey = os.environ["TD_API_KEY"]
    endpoint = os.environ["TD_API_SERVER"]

    site = "us"
    if ".co.jp" in endpoint:
        site = "jp"
    elif "eu01" in endpoint:
        site = "eu01"

    builder = SparkSession.builder.appName("spark_als")
    td = (
        TDSparkContextBuilder(builder)
        .apikey(apikey)
        .site(site)
        .jars(TDSparkContextBuilder.default_jar_path())
        .build()
    )
    return td
Example #2
Source File: helpers.py From search-MjoLniR with MIT License

def require_training_files_partition(self):
    @self._post_process_args.append
    def post(spark: SparkSession, kwargs: Dict) -> None:
        kwargs['training_files'] = read_metadata(kwargs['training_files_path'])

    self.add_argument('--training-files-path', required=True)
Example #3
Source File: helpers.py From search-MjoLniR with MIT License

def require_feature_vectors_partition(self):
    @self._post_process_args.append
    def post(spark: SparkSession, kwargs: Dict) -> None:
        kwargs['feature_vectors'] = HivePartition(
            spark, kwargs['feature_vectors_table'], {
                'date': kwargs['date'],
                'feature_set': kwargs['feature_set']
            }, direct_parquet_read=True)

    self.add_argument('--feature-vectors-table', required=True)
    self.add_argument('--feature-set', required=True)
Example #4
Source File: hyperparam.py From search-MjoLniR with MIT License

def tune_wiki(
    spark: SparkSession,
    folds: List[Mapping[str, str]],
    initial_num_trees: int,
    final_num_trees: int,
    iterations: int,
    num_observations: int,
    num_cv_jobs: Optional[int] = None,
) -> List[Mapping]:
    if num_cv_jobs is None:
        # Default to running all cross validation in parallel, and all
        # hyperopt trials sequentially. Setting num_cv_jobs to a multiple
        # of folds will run multiple trials in parallel.
        num_cv_jobs = len(folds)
    results = mjolnir.training.xgboost.tune(
        folds, {'num_observations': num_observations},
        num_cv_jobs=num_cv_jobs,
        train_matrix='train',
        initial_num_trees=initial_num_trees,
        final_num_trees=final_num_trees,
        iterations=iterations,
        spark=spark)
    # Results contains a set of trials for each stage of tuning. We generally
    # only care about the final params and metrics for all evaluated models,
    # so flatten it out.
    flat_trials = cast(List[Mapping], [])
    for trials in results['trials'].values():
        for trial in trials:
            flat_trials.append(trial)
    return flat_trials
Example #5
Source File: helpers.py From search-MjoLniR with MIT License

def require_labeled_query_page_partition(self):
    @self._post_process_args.append
    def post(spark: SparkSession, kwargs: Dict) -> None:
        kwargs['labeled_query_page'] = HivePartition(
            spark, kwargs['labels_table'], {
                'date': kwargs['date'],
                'algorithm': kwargs['labeling_algorithm'],
            })

    self.add_argument('--labels-table', required=True)
    self.add_argument('--labeling-algorithm')
Example #6
Source File: helpers.py From search-MjoLniR with MIT License

def require_query_clustering_partition(self):
    @self._post_process_args.append
    def post(spark: SparkSession, kwargs: Dict) -> None:
        kwargs['query_clustering'] = HivePartition(
            spark, kwargs['clustering_table'], {
                'date': kwargs['date'],
                'algorithm': kwargs['clustering_algorithm'],
            })

    self.add_argument('--clustering-table', required=True)
    self.add_argument('--clustering-algorithm', required=True)
Example #7
Source File: helpers.py From search-MjoLniR with MIT License

def require_query_clicks_partition(self):
    @self._post_process_args.append
    def post(spark: SparkSession, kwargs: Dict) -> None:
        kwargs['query_clicks'] = HivePartition(
            spark, kwargs['clicks_table'], {
                'date': kwargs['date'],
            })

    self.add_argument('--clicks-table', required=True)
Example #8
Source File: sys_exec.py From cadCAD with MIT License

def to_spark_df(rdd: RDD, spark: SparkSession, init_condition: dict = None):
    # Typefull
    if init_condition is not None:
        return to_spark(rdd, init_condition)
    # Typeless
    else:
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
        warnings.simplefilter(action='ignore', category=UserWarning)
        pdf_from_rdd: DataFrame = to_pandas(rdd)
        result = spark.createDataFrame(pdf_from_rdd)
        del pdf_from_rdd
        return result
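A hedged usage sketch of the helper above; the records RDD is hypothetical and assumes the surrounding cadCAD helpers (to_pandas, to_spark) accept an RDD of simulation records.

# Hypothetical usage: convert an RDD of simulation records into a Spark DataFrame.
rdd = spark.sparkContext.parallelize([
    {"run": 1, "timestep": 0, "x": 0.0},
    {"run": 1, "timestep": 1, "x": 0.5},
])

# With no init_condition, the Arrow-backed pandas conversion path is taken.
sdf = to_spark_df(rdd, spark)
sdf.show()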
Example #9
Source File: copy_dataset.py From petastorm with Apache License 2.0

def _main(sys_argv):
    logging.basicConfig()

    args = args_parser().parse_args(sys_argv)

    # We set spark.sql.files.maxPartitionBytes to a large value since we typically have a small number
    # of rows per rowgroup. Reading a parquet store with default settings would result in an excessively
    # large number of partitions and inefficient processing.
    spark = configure_spark(SparkSession.builder.appName('petastorm-copy'), args) \
        .config('spark.sql.files.maxPartitionBytes', '1010612736') \
        .getOrCreate()

    copy_dataset(spark, args.source_url, args.target_url,
                 args.field_regex, args.not_null_fields, args.overwrite_output,
                 args.partition_count, args.row_group_size_mb,
                 hdfs_driver=args.hdfs_driver)

    spark.stop()
Example #10
Source File: streaming.py From LearningApacheSpark with MIT License

def schema(self, schema):
    """Specifies the input schema.

    Some data sources (e.g. JSON) can infer the input schema automatically from data.
    By specifying the schema here, the underlying data source can skip the schema
    inference step, and thus speed up data loading.

    .. note:: Evolving.

    :param schema: a :class:`pyspark.sql.types.StructType` object or a DDL-formatted string
                   (For example ``col0 INT, col1 DOUBLE``).

    >>> s = spark.readStream.schema(sdf_schema)
    >>> s = spark.readStream.schema("col0 INT, col1 DOUBLE")
    """
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()
    if isinstance(schema, StructType):
        jschema = spark._jsparkSession.parseDataType(schema.json())
        self._jreader = self._jreader.schema(jschema)
    elif isinstance(schema, basestring):
        self._jreader = self._jreader.schema(schema)
    else:
        raise TypeError("schema should be StructType or string")
    return self
Example #11
Source File: dataframe.py From LearningApacheSpark with MIT License

def registerTempTable(self, name):
    """Registers this DataFrame as a temporary table using the given name.

    The lifetime of this temporary table is tied to the :class:`SparkSession`
    that was used to create this :class:`DataFrame`.

    >>> df.registerTempTable("people")
    >>> df2 = spark.sql("select * from people")
    >>> sorted(df.collect()) == sorted(df2.collect())
    True
    >>> spark.catalog.dropTempView("people")

    .. note:: Deprecated in 2.0, use createOrReplaceTempView instead.
    """
    warnings.warn(
        "Deprecated in 2.0, use createOrReplaceTempView instead.",
        DeprecationWarning)
    self._jdf.createOrReplaceTempView(name)
Example #12
Source File: dataframe.py From LearningApacheSpark with MIT License

def createOrReplaceTempView(self, name):
    """Creates or replaces a local temporary view with this DataFrame.

    The lifetime of this temporary table is tied to the :class:`SparkSession`
    that was used to create this :class:`DataFrame`.

    >>> df.createOrReplaceTempView("people")
    >>> df2 = df.filter(df.age > 3)
    >>> df2.createOrReplaceTempView("people")
    >>> df3 = spark.sql("select * from people")
    >>> sorted(df3.collect()) == sorted(df2.collect())
    True
    >>> spark.catalog.dropTempView("people")
    """
    self._jdf.createOrReplaceTempView(name)
Example #13
Source File: pyspark.py From omniduct with MIT License

def _init(self, app_name='omniduct', config=None, master=None, enable_hive_support=False):
    """
    Args:
        app_name (str): The application name of the SparkSession.
        config (dict or None): Any additional configuration to pass through
            to the SparkSession builder.
        master (str): The Spark master URL to connect to (only necessary
            if environment specified configuration is missing).
        enable_hive_support (bool): Whether to enable Hive support for the
            Spark session.

    Note: Pyspark must be installed in order to use this backend.
    """
    self.app_name = app_name
    self.config = config or {}
    self.master = master
    self.enable_hive_support = enable_hive_support
    self._spark_session = None

# Connection management
Example #14
Source File: td-spark.py From treasure-boxes with MIT License

def upload_dataframe(
    database_name: str, table_name: str, td_spark: Optional[TDSparkContext] = None
) -> None:
    """Create Pandas DataFrame and upload it to Treasure Data

    :param database_name: Target database name on Treasure Data
    :param table_name: Target table name on Treasure Data
    :param td_spark: [Optional] TDSparkContext
    """
    import numpy as np
    import pandas as pd

    if td_spark is None:
        td_spark = _prepare_td_spark()

    spark = td_spark.spark

    df = pd.DataFrame({"c": np.random.binomial(10, 0.5, 10)})
    sdf = spark.createDataFrame(df)
    td_spark.create_database_if_not_exists(database_name)
    td_spark.create_or_replace(sdf, f"{database_name}.{table_name}")
Example #15
Source File: td-spark.py From treasure-boxes with MIT License

def process_data(
    database_name: str, table_name: str, td_spark: Optional[TDSparkContext] = None
) -> None:
    """Load a Treasure Data table and upload it to Treasure Data after PySpark processing.

    :param database_name: Target database name on Treasure Data
    :param table_name: Target table name on Treasure Data
    :param td_spark: [Optional] TDSparkContext
    """
    if td_spark is None:
        td_spark = _prepare_td_spark()

    # Read sample_datasets from TD table
    access_df = td_spark.table("sample_datasets.www_access").df()

    # Process with PySpark
    processed_df = access_df.filter("method = 'GET'").withColumn(
        "time_str", func.from_unixtime("time")
    )

    # Upload processed Spark DataFrame to TD
    td_spark.create_database_if_not_exists(database_name)
    td_spark.create_or_replace(processed_df, f"{database_name}.{table_name}")
Example #16
Source File: util.py From datadevops with MIT License

def save_overwrite_unmanaged_table(spark: SparkSession, dataframe: DataFrame, table_name: str, path: str):
    """When trying to read and overwrite the same table, you get this error:
    'Cannot overwrite table dw.dim_parking_bay that is also being read from;'
    This utility function works around this by saving to a temporary table first,
    prior to overwriting.
    """
    temp_table_name = table_name + "___temp"
    spark.sql("DROP TABLE IF EXISTS " + temp_table_name).collect()
    # Save temp table
    dataframe.write.saveAsTable(temp_table_name)
    # Read temp table and overwrite original table
    spark.read.table(temp_table_name)\
        .write.mode("overwrite")\
        .option("path", path)\
        .saveAsTable(table_name)
    # Drop temp table
    spark.sql("DROP TABLE " + temp_table_name).collect()
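A short usage sketch of the helper above, assuming the DataFrame being written was derived from the very table it overwrites; the table name and storage path are placeholders.

from pyspark.sql import functions as F

# Reading dw.dim_parking_bay and writing straight back to it would fail with the
# "Cannot overwrite table ... that is also being read from" error, so route the
# write through the temporary-table helper instead.
dim_df = (
    spark.read.table("dw.dim_parking_bay")            # placeholder table name
    .withColumn("loaded_on", F.current_timestamp())
)
save_overwrite_unmanaged_table(
    spark,
    dim_df,
    table_name="dw.dim_parking_bay",
    path="abfss://datalake/dw/dim_parking_bay",       # placeholder storage path
)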
Example #17
Source File: df_naive.py From example_dataproc_twitter with MIT License

def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    sess.udf.register("SQUARED", self.squared, returnType=(
        stypes.ArrayType(stypes.StructType(
            fields=[stypes.StructField('sku0', stypes.StringType()),
                    stypes.StructField('norm', stypes.FloatType())]))))

    sess.udf.register('INTERSECTIONS', self.process_intersections,
        returnType=stypes.ArrayType(stypes.StructType(fields=[
            stypes.StructField('sku0', stypes.StringType()),
            stypes.StructField('sku1', stypes.StringType()),
            stypes.StructField('cor', stypes.FloatType())])))
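Once registered, these UDFs are referenced by name inside SQL strings. The project's squared and process_intersections implementations are not shown here, so the illustration below registers a simpler, hypothetical UDF to show the call pattern:

from pyspark.sql import types as stypes

# Hypothetical stand-in UDF, registered under a name and then invoked from SQL.
spark.udf.register("PLUS_ONE", lambda x: x + 1, returnType=stypes.LongType())

spark.range(3).createOrReplaceTempView("nums")
spark.sql("SELECT id, PLUS_ONE(id) AS id_plus_one FROM nums").show()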
Example #18
Source File: __init__.py From listenbrainz-server with GNU General Public License v2.0

def init_spark_session(app_name):
    """Initializes a Spark Session with the given application name.

    Args:
        app_name (str): Name of the Spark application. This will also occur in the Spark UI.
    """
    global session, context, sql_context
    try:
        session = SparkSession \
            .builder \
            .appName(app_name) \
            .config("spark.hadoop.dfs.client.use.datanode.hostname", "true") \
            .config("spark.hadoop.dfs.datanode.use.datanode.hostname", "true") \
            .config("spark.driver.maxResultSize", "4g") \
            .getOrCreate()
        context = session.sparkContext
        context.setLogLevel("ERROR")
        sql_context = SQLContext(context)
    except Py4JJavaError as err:
        raise SparkSessionNotInitializedException(app_name, err.java_exception)
Example #19
Source File: customReportService.py From mmtf-pyspark with Apache License 2.0

def _concat_ids(spark, dataset, columnNames):
    """Concatenates structureId and chainId fields into a single key
    if chainId field is present

    Parameters
    ----------
    spark : :obj:`SparkSession <pyspark.sql.SparkSession>`
    dataset : Dataframe
    columnNames : list
    """
    if "chainId" in dataset.columns:
        dataset.createOrReplaceTempView("table")
        sql = "SELECT CONCAT(structureId,'.',chainId) as structureChainId," + \
              "structureId,chainId,%s" % ','.join(columnNames) + \
              " from table"
        dataset = spark.sql(sql)

    return dataset
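A hedged illustration of the helper above with a toy DataFrame; the column values are invented and only show that the concatenated structureChainId key is produced when a chainId column exists.

# Toy input with structureId and chainId columns (values are made up).
ds = spark.createDataFrame(
    [("1ABC", "A", 2.0), ("1ABC", "B", 2.0)],
    ["structureId", "chainId", "resolution"],
)

ds = _concat_ids(spark, ds, ["resolution"])
ds.show()   # structureChainId values such as "1ABC.A" appear alongside the original columns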
Example #20
Source File: testconfig.py From SMV with Apache License 2.0

def sparkSession(cls):
    if not hasattr(cls, "spark"):
        # We can't use the SparkSession Builder here, since we need to call
        # Scala side's SmvTestHive.createContext to create the HiveTestContext's
        # SparkSession.
        # So we need to
        #   * Create a java_gateway
        #   * Create a SparkConf using the jgw (since without it SparkContext will ignore the given conf)
        #   * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
        #   * Create Scala side HiveTestContext SparkSession
        #   * Create python SparkSession
        jgw = launch_gateway(None)
        jvm = jgw.jvm
        import tempfile
        import getpass
        hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())
        sConf = SparkConf(False, _jvm=jvm) \
            .set("spark.sql.test", "") \
            .set("spark.sql.hive.metastore.barrierPrefixes",
                 "org.apache.spark.sql.hive.execution.PairSerDe") \
            .set("spark.sql.warehouse.dir", hivedir) \
            .set("spark.ui.enabled", "false")
        sc = SparkContext(master="local[1]", appName="SMV Python Test",
                          conf=sConf, gateway=jgw).getOrCreate()
        jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
        cls.spark = SparkSession(sc, jss.sparkSession())
    return cls.spark
Example #21
Source File: readwriter.py From LearningApacheSpark with MIT License

def schema(self, schema):
    """Specifies the input schema.

    Some data sources (e.g. JSON) can infer the input schema automatically from data.
    By specifying the schema here, the underlying data source can skip the schema
    inference step, and thus speed up data loading.

    :param schema: a :class:`pyspark.sql.types.StructType` object or a DDL-formatted string
                   (For example ``col0 INT, col1 DOUBLE``).

    >>> s = spark.read.schema("col0 INT, col1 DOUBLE")
    """
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()
    if isinstance(schema, StructType):
        jschema = spark._jsparkSession.parseDataType(schema.json())
        self._jreader = self._jreader.schema(jschema)
    elif isinstance(schema, basestring):
        self._jreader = self._jreader.schema(schema)
    else:
        raise TypeError("schema should be StructType or string")
    return self
Example #22
Source File: dataframe.py From LearningApacheSpark with MIT License

def createTempView(self, name):
    """Creates a local temporary view with this DataFrame.

    The lifetime of this temporary table is tied to the :class:`SparkSession`
    that was used to create this :class:`DataFrame`.
    throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the
    catalog.

    >>> df.createTempView("people")
    >>> df2 = spark.sql("select * from people")
    >>> sorted(df.collect()) == sorted(df2.collect())
    True
    >>> df.createTempView("people")  # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    ...
    AnalysisException: u"Temporary table 'people' already exists;"
    >>> spark.catalog.dropTempView("people")
    """
    self._jdf.createTempView(name)
Example #23
Source File: __init__.py From listenbrainz-server with GNU General Public License v2.0

def init_test_session(app_name):
    global session, context, sql_context
    try:
        session = SparkSession \
            .builder \
            .master('local') \
            .appName(app_name) \
            .config("spark.hadoop.dfs.client.use.datanode.hostname", "true") \
            .config("spark.hadoop.dfs.datanode.use.datanode.hostname", "true") \
            .getOrCreate()
        context = session.sparkContext
        context.setLogLevel("ERROR")
        sql_context = SQLContext(context)
    except Py4JJavaError as err:
        raise SparkSessionNotInitializedException(app_name, err.java_exception)
Example #24
Source File: sql.py From koalas with Apache License 2.0

def __init__(self, scope: Dict[str, Any], statement: str, session: SparkSession):
    self._scope = scope
    self._statement = statement
    # All the temporary views created when executing this statement.
    # The key is the name of the variable in {}.
    # The value is the cached Spark DataFrame.
    self._temp_views = {}  # type: Dict[str, SDataFrame]
    # All the other variables, converted to a normalized form.
    # The normalized form is typically a string.
    self._cached_vars = {}  # type: Dict[str, Any]
    # The SQL statement after:
    # - all the dataframes have been registered as temporary views
    # - all the values have been normalized to equivalent SQL representations
    self._normalized_statement = None  # type: Optional[str]
    self._session = session
Example #25
Source File: trigger_maven.py From dagster with Apache License 2.0

def spark_session():
    spark = (
        SparkSession.builder.appName("DownloadStuff")
        .config(
            'spark.jars.packages',
            'com.databricks:spark-avro_2.11:3.0.0,'
            'com.databricks:spark-redshift_2.11:2.0.1,'
            'com.databricks:spark-csv_2.11:1.5.0,'
            'org.postgresql:postgresql:42.2.5,'
            'org.apache.hadoop:hadoop-aws:2.6.5,'
            'com.amazonaws:aws-java-sdk:1.7.4',
        )
        .getOrCreate()
    )
    return spark
Example #26
Source File: backend.py From joblib-spark with Apache License 2.0

def __init__(self, **backend_args):
    super(SparkDistributedBackend, self).__init__(**backend_args)
    self._pool = None
    self._n_jobs = None
    self._spark = SparkSession \
        .builder \
        .appName("JoblibSparkBackend") \
        .getOrCreate()
    self._job_group = "joblib-spark-job-group-" + str(uuid.uuid4())
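In normal use this backend is not constructed directly; it is registered with joblib and selected through parallel_backend. A hedged sketch, assuming the package's register_spark entry point:

from joblib import Parallel, delayed, parallel_backend
from joblibspark import register_spark   # assumed public entry point of joblib-spark

def square(x):
    return x * x

register_spark()   # makes the "spark" backend shown above available to joblib

with parallel_backend("spark", n_jobs=4):
    results = Parallel()(delayed(square)(i) for i in range(10))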
Example #27
Source File: addon_aggregates.py From python_mozetl with MIT License

def load_main_summary(spark, input_bucket, input_prefix, input_version):
    """
    Loads main_summary from the bucket constructed from
    input_bucket, input_prefix, input_version

    :param spark: SparkSession object
    :param input_bucket: s3 bucket (telemetry-parquet)
    :param input_prefix: s3 prefix (main_summary)
    :param input_version: dataset version (v4)
    :return SparkDF
    """
    dest = get_dest(input_bucket, input_prefix, input_version)
    print("loading...", dest)
    return spark.read.option("mergeSchema", True).parquet(dest)
Example #28
Source File: xgboost.py From search-MjoLniR with MIT License

def trainWithFilesRemote(
    spark: SparkSession,
    fold: Mapping[str, str],
    train_matrix: str,
    params: Mapping[str, Any],
    **kwargs
) -> 'XGBoostModel':
    """Train model on a single remote spark executor.

    Silly hack to train models inside the yarn cluster. To train multiple
    models in parallel python threads will need to be used. Wish pyspark
    had collectAsync.
    """
    nthread = int(spark.conf.get('spark.task.cpus', '1'))
    if 'nthread' not in params:
        params = dict(params, nthread=nthread)
    elif params['nthread'] != nthread:
        raise Exception("Executors have [{}] cpus but training requested [{}]".format(
            nthread, params['nthread']))
    return (
        spark.sparkContext
        .parallelize([1], 1)
        .map(lambda x: XGBoostModel.trainWithFiles(fold, train_matrix, params, **kwargs))
        .collect()[0]
    )
Example #29
Source File: gradient_descent_example.py From intro_ds with Apache License 2.0

def startSpark():
    """
    Create a SparkSession, which is the entry point of a Spark program.
    """
    spark = SparkSession.builder.appName("gd_example").getOrCreate()
    return spark
Example #30
Source File: pyspark.py From omniduct with MIT License

def _connect(self):
    from pyspark.sql import SparkSession

    builder = SparkSession.builder.appName(self.app_name)
    if self.master:
        builder.master(self.master)
    if self.enable_hive_support:
        builder.enableHiveSupport()
    if self.config:
        for key, value in self.config.items():
            builder.config(key, value)
    self._spark_session = builder.getOrCreate()