Python pyspark.sql.functions.explode() Examples
The following are 13 code examples of pyspark.sql.functions.explode(), collected from open-source projects. You can go to the original project or source file by following the link above each example.
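Before the project examples, here is a minimal sketch of what explode() does, using a throwaway DataFrame and column names chosen purely for illustration: it produces one output row per element of an array (or per key/value pair of a map) column.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([(1, ["a", "b"]), (2, ["c"])], ["id", "letters"])

# One output row per array element: (1, 'a'), (1, 'b'), (2, 'c')
df.select("id", F.explode("letters").alias("letter")).show()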
Example #1
Source File: feature_vectors.py From search-MjoLniR with MIT License
def resample_clicks_to_query_page(
    df_cluster: DataFrame, random_seed: Optional[int], samples_per_wiki: int
) -> mt.Transformer:
    # Resamples the click log by proxy of resampling clusters, such
    # that a complete cluster is either included or excluded from the
    # resulting dataset.
    # TODO: Evaluate alternative resampling, such as perhaps only dropping from
    # clusters where all clicks were to the top result (implying an "easy" search).
    mt.check_schema(df_cluster, mt.QueryClustering)
    return mt.seq_transform([
        # Grab only the parts of the query log we need to make the resulting sampled QueryPage
        lambda df: df.select('query', 'wikiid', 'session_id', 'hit_page_ids'),
        mt.join_cluster_by_query(df_cluster),
        # [1] is because sample returns a tuple of (page_counts, df)
        mt.temp_rename_col(
            'cluster_id', 'norm_query_id',
            lambda df: mjolnir.sampling.sample(
                df, random_seed, samples_per_wiki)[1]),
        lambda df: df.withColumn(
            'page_id', F.explode('hit_page_ids')).drop('hit_page_ids')
    ])
Example #2
Source File: swissModelDataset.py From mmtf-pyspark with Apache License 2.0
def _flatten_dataset(ds):
    '''Flattens the original hierarchical data schema into a simple
    row-based schema. Some less useful data are excluded.

    Parameters
    ----------
    ds : dataset
       the original spark dataset

    Returns
    -------
    dataset
       flattened dataset
    '''
    ds = ds.withColumn("structures", explode(ds.result.structures))
    return ds.select(col("query.ac"), col("result.sequence"),
                     col("structures.from"), col("structures.to"),
                     col("structures.qmean"), col("structures.qmean_norm"),
                     col("structures.gmqe"), col("structures.coverage"),
                     col("structures.oligo-state"), col("structures.method"),
                     col("structures.template"), col("structures.identity"),
                     col("structures.similarity"), col("structures.coordinates"),
                     col("result.md5"), col("structures.md5"))
Example #3
Source File: dataset_utils.py From mmtf-pyspark with Apache License 2.0
def flatten_dataset(dataset: DataFrame):
    tmp = dataset
    for field in tmp.schema.fields:
        if isinstance(field.dataType, ArrayType):
            print(field.name, field.dataType)
            # Index the column by name; an attribute lookup like tmp.field.name
            # would not resolve to the intended array column.
            tmp = tmp.withColumn(field.name, explode(tmp[field.name]))
    return tmp
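One caveat worth noting here, which also applies to several examples below: explode() drops any row whose array is null or empty, so a generic flattener like this can silently shrink the dataset. explode_outer() (available since Spark 2.2) keeps such rows with a null element instead. A minimal sketch with made-up data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, explode_outer

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [(1, ["a", "b"]), (2, [])], "id INT, letters ARRAY<STRING>")

df.select("id", explode("letters")).count()        # 2 -- the id=2 row is dropped
df.select("id", explode_outer("letters")).count()  # 3 -- id=2 kept, element is null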
Example #4
Source File: norm_query_clustering.py From search-MjoLniR with MIT License
def with_unique_cluster_id(df: DataFrame) -> DataFrame:
    return (
        df
        .groupby('wikiid', 'norm_query', 'norm_query_group_id')
        .agg(F.collect_list('query').alias('queries'))
        .select(
            'wikiid', 'queries',
            F.monotonically_increasing_id().alias('cluster_id'))
        .select('wikiid', F.explode('queries').alias('query'), 'cluster_id'))
Example #5
Source File: advancedSearchDataset.py From mmtf-pyspark with Apache License 2.0
def __get_entity_to_chain_id():
    # get entityID to strandId mapping
    query = "SELECT pdbid, entity_id, pdbx_strand_id FROM entity_poly"
    mapping: DataFrame = pdbjMineDataset.get_dataset(query)

    # split one-to-many relationship into multiple records: 'A,B' -> ['A', 'B'] -> explode to separate rows
    mapping = mapping.withColumn("chainId", split(mapping.pdbx_strand_id, ","))
    mapping = mapping.withColumn("chainId", explode("chainId"))

    # create a structureChainId field, e.g. 1XYZ + A -> 1XYZ.A
    mapping = mapping.withColumn("pdbChainId", concat_ws(".", mapping.structureId, mapping.chainId))

    return mapping.select(mapping.entity_id, mapping.structureId, mapping.pdbChainId)
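The split-then-explode step above is the standard way to turn a comma-separated column into one row per value. A self-contained sketch of just that step, with hypothetical data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("1XYZ", "A,B")], ["pdbid", "pdbx_strand_id"])

# 'A,B' -> ['A', 'B'] -> one row per chain id: ('1XYZ', 'A') and ('1XYZ', 'B')
df.select("pdbid", explode(split("pdbx_strand_id", ",")).alias("chainId")).show()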
Example #6
Source File: g2sDataset.py From mmtf-pyspark with Apache License 2.0
def _flatten_dataframe(df): return df.withColumn("pdbPosition", explode(col("residueMapping.pdbPosition"))) \ .withColumn("pdbAminoAcid", explode(col("residueMapping.pdbAminoAcid")))
Example #7
Source File: myVariantDataset.py From mmtf-pyspark with Apache License 2.0
def _flatten_dataframe(df): return df.withColumn("variationId", explode(df.hits._id)) \ .select(col("variationId"), col("uniprotId"))
Example #8
Source File: data_fetcher.py From ReAgent with BSD 3-Clause "New" or "Revised" License
def get_distinct_keys(df, col_name, is_col_arr_map=False):
    """
    Return list of distinct keys.

    Set is_col_arr_map to be true if column is an array of Maps.
    Otherwise, assume column is a Map.
    """
    if is_col_arr_map:
        df = df.select(explode(col_name).alias(col_name))
    df = df.select(explode(map_keys(col_name)))
    return df.distinct().rdd.flatMap(lambda x: x).collect()
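The helper above handles two layouts: an array-of-maps column is first flattened with explode(), and then map_keys() plus a second explode() yields one row per key. A minimal sketch of the plain-map branch, with made-up data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, map_keys

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([({"a": 1.0, "b": 2.0},), ({"b": 3.0},)], ["features"])

# One row per key of each map, then deduplicate to get the distinct key set.
df.select(explode(map_keys("features"))).distinct().show()  # 'a' and 'b'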
Example #9
Source File: addon_aggregates.py From python_mozetl with MIT License
def ms_explode_addons(ms):
    """
    Explodes the active_addons object in the ms DataFrame
    and selects relevant fields

    :param ms: a subset of main_summary
    :return SparkDF
    """
    addons_df = (
        ms.select(MS_FIELDS + [fun.explode("active_addons").alias("addons")])
        .select(MS_FIELDS + ADDON_FIELDS)
        .withColumn("app_version", fun.substring("app_version", 1, 2))
    )
    return addons_df
Example #10
Source File: norm_query_clustering.py From search-MjoLniR with MIT License
def cluster_within_norm_query_groups(df: DataFrame) -> DataFrame:
    make_groups = F.udf(_make_query_groups, T.ArrayType(T.StructType([
        T.StructField('query', T.StringType(), nullable=False),
        T.StructField('norm_query_group_id', T.IntegerType(), nullable=False),
    ])))
    return (
        df
        .groupBy('wikiid', 'norm_query')
        .agg(F.collect_list(F.struct('query', 'hit_page_ids')).alias('source'))
        .select(
            'wikiid', 'norm_query',
            F.explode(make_groups('source')).alias('group'))
        .select('wikiid', 'norm_query', 'group.query', 'group.norm_query_group_id'))
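The pattern above, a UDF returning an array of structs that is exploded and then unpacked with dotted field references, works with any array-of-struct column. A small self-contained sketch with a hypothetical UDF and column names:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

spark = SparkSession.builder.master("local[*]").getOrCreate()

# Hypothetical UDF that tags each query with a group id.
make_groups = F.udf(
    lambda queries: [(q, i) for i, q in enumerate(queries)],
    T.ArrayType(T.StructType([
        T.StructField('query', T.StringType()),
        T.StructField('norm_query_group_id', T.IntegerType()),
    ])))

df = spark.createDataFrame([("enwiki", ["foo", "bar"])], ["wikiid", "queries"])
out = (
    df.select("wikiid", F.explode(make_groups("queries")).alias("group"))
      .select("wikiid", "group.query", "group.norm_query_group_id"))
# Rows: ('enwiki', 'foo', 0) and ('enwiki', 'bar', 1)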
Example #11
Source File: taar_lite_guidguid.py From telemetry-airflow with Mozilla Public License 2.0
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed
Example #12
Source File: taar_lite_guidguid.py From python_mozetl with MIT License
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed
Example #13
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License
def run(self, initial_scaffolds):
    randomized_scaffold_udf = psf.udf(self._generate_func, pst.ArrayType(pst.StringType()))
    get_attachment_points_udf = psf.udf(usc.get_attachment_points, pst.ArrayType(pst.IntegerType()))
    remove_attachment_point_numbers_udf = psf.udf(usc.remove_attachment_point_numbers, pst.StringType())

    results_df = self._initialize_results(initial_scaffolds)
    scaffolds_df = results_df.select("smiles", "scaffold", "decorations")
    i = 0
    while scaffolds_df.count() > 0:
        # generate randomized SMILES
        self._log("info", "Starting iteration #%d.", i)
        scaffolds_df = scaffolds_df.withColumn("randomized_scaffold", randomized_scaffold_udf("smiles"))\
            .select(
                "smiles", "scaffold", "decorations",
                psf.explode("randomized_scaffold").alias("randomized_scaffold"))\
            .withColumn("attachment_points", get_attachment_points_udf("randomized_scaffold"))\
            .withColumn("randomized_scaffold", remove_attachment_point_numbers_udf("randomized_scaffold"))\
            .withColumn("id", psf.monotonically_increasing_id())\
            .persist()
        self._log("info", "Generated %d randomized SMILES from %d scaffolds.",
                  scaffolds_df.count(), scaffolds_df.select("smiles").distinct().count())

        # sample each randomized scaffold N times
        scaffolds = scaffolds_df.select("id", "randomized_scaffold")\
            .rdd.map(lambda row: (row["id"], row["randomized_scaffold"])).toLocalIterator()
        self._sample_and_write_scaffolds_to_disk(scaffolds, scaffolds_df.count())
        self._log("info", "Sampled %d scaffolds.", scaffolds_df.count())

        # merge decorated molecules
        joined_df = self._join_results(scaffolds_df).persist()

        if joined_df.count() > 0:
            self._log("info", "Joined %d -> %d (valid) -> %d unique sampled scaffolds",
                      scaffolds_df.count(), joined_df.agg(psf.sum("count")).head()[0], joined_df.count())

        scaffolds_df = joined_df.join(results_df, on="smiles", how="left_anti")\
            .select("smiles", "scaffold", "decorations")\
            .where("smiles LIKE '%*%'")
        self._log("info", "Obtained %d scaffolds for next iteration.", scaffolds_df.count())

        results_df = results_df.union(joined_df)\
            .groupBy("smiles")\
            .agg(
                psf.first("scaffold").alias("scaffold"),
                psf.first("decorations").alias("decorations"),
                psf.sum("count").alias("count"))\
            .persist()
        i += 1

    return results_df