Python pyspark.sql.functions.explode() Examples
The following are 13 code examples of pyspark.sql.functions.explode(), collected from open-source projects. You can go to the original project or source file by following the link above each example.
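Before the project examples, here is a minimal sketch of what explode() does, using a throwaway DataFrame and column names chosen purely for illustration: it produces one output row per element of an array (or per key/value pair of a map) column.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([(1, ["a", "b"]), (2, ["c"])], ["id", "letters"])

# One output row per array element: (1, 'a'), (1, 'b'), (2, 'c')
df.select("id", F.explode("letters").alias("letter")).show()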
Example #1
Source File: feature_vectors.py From search-MjoLniR with MIT License
def resample_clicks_to_query_page(
    df_cluster: DataFrame, random_seed: Optional[int], samples_per_wiki: int
) -> mt.Transformer:
    # Resamples the click log by proxy of resampling clusters, such
    # that a complete cluster is either included or excluded from the
    # resulting dataset.
    # TODO: Evaluate alternative resampling, such as perhaps only dropping from
    # clusters where all clicks were to the top result (implying an "easy" search).
    mt.check_schema(df_cluster, mt.QueryClustering)
    return mt.seq_transform([
        # Grab only the parts of the query log we need to make the resulting sampled QueryPage
        lambda df: df.select('query', 'wikiid', 'session_id', 'hit_page_ids'),
        mt.join_cluster_by_query(df_cluster),
        # [1] is because sample returns a tuple of (page_counts, df)
        mt.temp_rename_col(
            'cluster_id', 'norm_query_id',
            lambda df: mjolnir.sampling.sample(
                df, random_seed, samples_per_wiki)[1]),
        lambda df: df.withColumn(
            'page_id', F.explode('hit_page_ids')).drop('hit_page_ids')
    ])
Example #2
Source File: swissModelDataset.py From mmtf-pyspark with Apache License 2.0
def _flatten_dataset(ds):
    '''Flattens the original hierarchical data schema into a simple
    row-based schema. Some less useful data are excluded.

    Parameters
    ----------
    ds : dataset
       the original spark dataset

    Returns
    -------
    dataset
       flattened dataset
    '''
    ds = ds.withColumn("structures", explode(ds.result.structures))
    return ds.select(col("query.ac"), col("result.sequence"),
                     col("structures.from"), col("structures.to"),
                     col("structures.qmean"), col("structures.qmean_norm"),
                     col("structures.gmqe"), col("structures.coverage"),
                     col("structures.oligo-state"), col("structures.method"),
                     col("structures.template"), col("structures.identity"),
                     col("structures.similarity"), col("structures.coordinates"),
                     col("result.md5"), col("structures.md5"))
Example #3
Source File: dataset_utils.py From mmtf-pyspark with Apache License 2.0
def flatten_dataset(dataset: DataFrame):
    tmp = dataset
    for field in tmp.schema.fields:
        if isinstance(field.dataType, ArrayType):
            print(field.name, field.dataType)
            # Index the column by name; an attribute lookup like tmp.field.name
            # would not resolve to the intended array column.
            tmp = tmp.withColumn(field.name, explode(tmp[field.name]))
    return tmp
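One caveat worth noting here, which also applies to several examples below: explode() drops any row whose array is null or empty, so a generic flattener like this can silently shrink the dataset. explode_outer() (available since Spark 2.2) keeps such rows with a null element instead. A minimal sketch with made-up data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, explode_outer

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [(1, ["a", "b"]), (2, [])], "id INT, letters ARRAY<STRING>")

df.select("id", explode("letters")).count()        # 2 -- the id=2 row is dropped
df.select("id", explode_outer("letters")).count()  # 3 -- id=2 kept, element is null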
Example #4
Source File: norm_query_clustering.py From search-MjoLniR with MIT License
def with_unique_cluster_id(df: DataFrame) -> DataFrame:
    return (
        df
        .groupby('wikiid', 'norm_query', 'norm_query_group_id')
        .agg(F.collect_list('query').alias('queries'))
        .select(
            'wikiid', 'queries',
            F.monotonically_increasing_id().alias('cluster_id'))
        .select('wikiid', F.explode('queries').alias('query'), 'cluster_id'))
Example #5
Source File: advancedSearchDataset.py From mmtf-pyspark with Apache License 2.0
def __get_entity_to_chain_id():
    # get entityID to strandId mapping
    query = "SELECT pdbid, entity_id, pdbx_strand_id FROM entity_poly"
    mapping: DataFrame = pdbjMineDataset.get_dataset(query)

    # split one-to-many relationship into multiple records: 'A,B' -> ['A', 'B'] -> explode to separate rows
    mapping = mapping.withColumn("chainId", split(mapping.pdbx_strand_id, ","))
    mapping = mapping.withColumn("chainId", explode("chainId"))

    # create a structureChainId field, e.g. 1XYZ + A -> 1XYZ.A
    mapping = mapping.withColumn("pdbChainId", concat_ws(".", mapping.structureId, mapping.chainId))

    return mapping.select(mapping.entity_id, mapping.structureId, mapping.pdbChainId)
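The split-then-explode step above is the standard way to turn a comma-separated column into one row per value. A self-contained sketch of just that step, with hypothetical data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("1XYZ", "A,B")], ["pdbid", "pdbx_strand_id"])

# 'A,B' -> ['A', 'B'] -> one row per chain id: ('1XYZ', 'A') and ('1XYZ', 'B')
df.select("pdbid", explode(split("pdbx_strand_id", ",")).alias("chainId")).show()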
Example #6
Source File: g2sDataset.py From mmtf-pyspark with Apache License 2.0
def _flatten_dataframe(df): return df.withColumn("pdbPosition", explode(col("residueMapping.pdbPosition"))) \ .withColumn("pdbAminoAcid", explode(col("residueMapping.pdbAminoAcid")))
Example #7
Source File: myVariantDataset.py From mmtf-pyspark with Apache License 2.0
def _flatten_dataframe(df): return df.withColumn("variationId", explode(df.hits._id)) \ .select(col("variationId"), col("uniprotId"))
Example #8
Source File: data_fetcher.py From ReAgent with BSD 3-Clause "New" or "Revised" License
def get_distinct_keys(df, col_name, is_col_arr_map=False):
    """
    Return list of distinct keys.

    Set is_col_arr_map to be true if column is an array of Maps.
    Otherwise, assume column is a Map.
    """
    if is_col_arr_map:
        df = df.select(explode(col_name).alias(col_name))
    df = df.select(explode(map_keys(col_name)))
    return df.distinct().rdd.flatMap(lambda x: x).collect()
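The helper above handles two layouts: an array-of-maps column is first flattened with explode(), and then map_keys() plus a second explode() yields one row per key. A minimal sketch of the plain-map branch, with made-up data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, map_keys

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([({"a": 1.0, "b": 2.0},), ({"b": 3.0},)], ["features"])

# One row per key of each map, then deduplicate to get the distinct key set.
df.select(explode(map_keys("features"))).distinct().show()  # 'a' and 'b'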
Example #9
Source File: addon_aggregates.py From python_mozetl with MIT License
def ms_explode_addons(ms):
    """
    Explodes the active_addons object in the ms DataFrame
    and selects relevant fields

    :param ms: a subset of main_summary
    :return SparkDF
    """
    addons_df = (
        ms.select(MS_FIELDS + [fun.explode("active_addons").alias("addons")])
        .select(MS_FIELDS + ADDON_FIELDS)
        .withColumn("app_version", fun.substring("app_version", 1, 2))
    )
    return addons_df
Example #10
Source File: norm_query_clustering.py From search-MjoLniR with MIT License
def cluster_within_norm_query_groups(df: DataFrame) -> DataFrame:
    make_groups = F.udf(_make_query_groups, T.ArrayType(T.StructType([
        T.StructField('query', T.StringType(), nullable=False),
        T.StructField('norm_query_group_id', T.IntegerType(), nullable=False),
    ])))
    return (
        df
        .groupBy('wikiid', 'norm_query')
        .agg(F.collect_list(F.struct('query', 'hit_page_ids')).alias('source'))
        .select(
            'wikiid', 'norm_query',
            F.explode(make_groups('source')).alias('group'))
        .select('wikiid', 'norm_query', 'group.query', 'group.norm_query_group_id'))
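The pattern above, a UDF returning an array of structs that is exploded and then unpacked with dotted field references, works with any array-of-struct column. A small self-contained sketch with a hypothetical UDF and column names:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

spark = SparkSession.builder.master("local[*]").getOrCreate()

# Hypothetical UDF that tags each query with a group id.
make_groups = F.udf(
    lambda queries: [(q, i) for i, q in enumerate(queries)],
    T.ArrayType(T.StructType([
        T.StructField('query', T.StringType()),
        T.StructField('norm_query_group_id', T.IntegerType()),
    ])))

df = spark.createDataFrame([("enwiki", ["foo", "bar"])], ["wikiid", "queries"])
out = (
    df.select("wikiid", F.explode(make_groups("queries")).alias("group"))
      .select("wikiid", "group.query", "group.norm_query_group_id"))
# Rows: ('enwiki', 'foo', 0) and ('enwiki', 'bar', 1)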
Example #11
Source File: taar_lite_guidguid.py From telemetry-airflow with Mozilla Public License 2.0
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed
Example #12
Source File: taar_lite_guidguid.py From python_mozetl with MIT License
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed
Example #13
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License
def run(self, initial_scaffolds):
    randomized_scaffold_udf = psf.udf(self._generate_func, pst.ArrayType(pst.StringType()))
    get_attachment_points_udf = psf.udf(usc.get_attachment_points, pst.ArrayType(pst.IntegerType()))
    remove_attachment_point_numbers_udf = psf.udf(usc.remove_attachment_point_numbers, pst.StringType())

    results_df = self._initialize_results(initial_scaffolds)
    scaffolds_df = results_df.select("smiles", "scaffold", "decorations")
    i = 0
    while scaffolds_df.count() > 0:
        # generate randomized SMILES
        self._log("info", "Starting iteration #%d.", i)
        scaffolds_df = scaffolds_df.withColumn("randomized_scaffold", randomized_scaffold_udf("smiles"))\
            .select(
                "smiles", "scaffold", "decorations",
                psf.explode("randomized_scaffold").alias("randomized_scaffold"))\
            .withColumn("attachment_points", get_attachment_points_udf("randomized_scaffold"))\
            .withColumn("randomized_scaffold", remove_attachment_point_numbers_udf("randomized_scaffold"))\
            .withColumn("id", psf.monotonically_increasing_id())\
            .persist()
        self._log("info", "Generated %d randomized SMILES from %d scaffolds.",
                  scaffolds_df.count(), scaffolds_df.select("smiles").distinct().count())

        # sample each randomized scaffold N times
        scaffolds = scaffolds_df.select("id", "randomized_scaffold")\
            .rdd.map(lambda row: (row["id"], row["randomized_scaffold"])).toLocalIterator()
        self._sample_and_write_scaffolds_to_disk(scaffolds, scaffolds_df.count())
        self._log("info", "Sampled %d scaffolds.", scaffolds_df.count())

        # merge decorated molecules
        joined_df = self._join_results(scaffolds_df).persist()

        if joined_df.count() > 0:
            self._log("info", "Joined %d -> %d (valid) -> %d unique sampled scaffolds",
                      scaffolds_df.count(), joined_df.agg(psf.sum("count")).head()[0], joined_df.count())

        scaffolds_df = joined_df.join(results_df, on="smiles", how="left_anti")\
            .select("smiles", "scaffold", "decorations")\
            .where("smiles LIKE '%*%'")
        self._log("info", "Obtained %d scaffolds for next iteration.", scaffolds_df.count())

        results_df = results_df.union(joined_df)\
            .groupBy("smiles")\
            .agg(
                psf.first("scaffold").alias("scaffold"),
                psf.first("decorations").alias("decorations"),
                psf.sum("count").alias("count"))\
            .persist()
        i += 1

    return results_df