Python pyspark.sql.functions.collect_list() Examples
The following are 19 code examples of pyspark.sql.functions.collect_list().
The examples are taken from open-source projects; the project and source file for each one are noted above the example. You may also want to check out the other available functions and classes of the pyspark.sql.functions module.
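Before diving into the examples, here is a minimal sketch of what collect_list() does, using an invented toy DataFrame (the column names and data are purely illustrative): it aggregates all values in a group into an array, keeping duplicates (unlike collect_set) and without guaranteeing element order.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("alice", "a"), ("alice", "b"), ("alice", "a"), ("bob", "c")],
    ["user", "item"],
)

# collect_list keeps duplicates (collect_set would drop them) and the
# element order inside the array is not guaranteed.
df.groupBy("user").agg(F.collect_list("item").alias("items")).show()
# alice ends up with something like [a, b, a], bob with [c].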
Example #1
Source File: metrics.py From search-MjoLniR with MIT License | 7 votes |
def _ndcg_at(k, label_col):
    def ndcg_at_k(predicted, actual):
        # TODO: Taking in rn and then re-sorting might not be necessary, but i can't
        # find any real guarantee that they would come in order after a groupBy + collect_list,
        # since they were only ordered within the window function.
        predicted = [row[label_col] for row in sorted(predicted, key=lambda r: r.rn)]
        actual = [row[label_col] for row in sorted(actual, key=lambda r: r.rn)]
        dcg = 0.
        for i, label in enumerate(predicted):
            # This form is used to match EvalNDCG in xgboost
            dcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        idcg = 0.
        for i, label in enumerate(actual):
            idcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        if idcg == 0:
            return 0
        else:
            return dcg / idcg
    return F.udf(ndcg_at_k, pyspark.sql.types.DoubleType())
Example #2
Source File: spark_evaluation.py From azure-python-labs with MIT License | 6 votes |
def _calculate_metrics(self):
    """Calculate ranking metrics."""
    self._items_for_user_pred = self.rating_pred

    self._items_for_user_true = (
        self.rating_true
        .groupBy(self.col_user)
        .agg(expr("collect_list(" + self.col_item + ") as ground_truth"))
        .select(self.col_user, "ground_truth")
    )

    self._items_for_user_all = self._items_for_user_pred.join(
        self._items_for_user_true, on=self.col_user
    ).drop(self.col_user)

    return RankingMetrics(self._items_for_user_all.rdd)
Example #3
Source File: spark_evaluation.py From azure-python-labs with MIT License | 6 votes |
def _calculate_metrics(self):
    """Calculate ranking metrics."""
    self._items_for_user_pred = self.rating_pred

    self._items_for_user_true = (
        self.rating_true
        .groupBy(self.col_user)
        .agg(expr("collect_list(" + self.col_item + ") as ground_truth"))
        .select(self.col_user, "ground_truth")
    )

    self._items_for_user_all = self._items_for_user_pred.join(
        self._items_for_user_true, on=self.col_user
    ).drop(self.col_user)

    return RankingMetrics(self._items_for_user_all.rdd)
Example #4
Source File: norm_query_clustering.py From search-MjoLniR with MIT License | 5 votes |
def with_unique_cluster_id(df: DataFrame) -> DataFrame:
    return (
        df
        .groupby('wikiid', 'norm_query', 'norm_query_group_id')
        .agg(F.collect_list('query').alias('queries'))
        .select(
            'wikiid', 'queries',
            F.monotonically_increasing_id().alias('cluster_id'))
        .select('wikiid', F.explode('queries').alias('query'), 'cluster_id'))
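The example above relies on a round trip that shows up often with collect_list(): collect the rows of a group into an array, attach a value computed once per group (here a unique cluster_id), then explode the array back out so every original row carries the group-level value. A rough sketch of the same idea, assuming an input DataFrame df with made-up columns group_key and member:

import pyspark.sql.functions as F

# Collect group members and tag each group with a unique id...
grouped = (
    df.groupBy("group_key")
    .agg(F.collect_list("member").alias("members"))
    .withColumn("group_id", F.monotonically_increasing_id())
)

# ...then explode back to one row per member, each tagged with its group_id.
per_member = grouped.select("group_id", F.explode("members").alias("member"))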
Example #5
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compile_group_concat(t, expr, scope, context=None, **kwargs):
    sep = expr.op().sep.op().value

    def fn(col):
        return F.concat_ws(sep, F.collect_list(col))

    return compile_aggregator(t, expr, scope, fn, context)
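Outside the ibis compiler, the same concat_ws-over-collect_list trick is a handy way to emulate SQL's GROUP_CONCAT / string_agg directly in PySpark. A small sketch, assuming a DataFrame df with illustrative columns group_key and name:

import pyspark.sql.functions as F

# Concatenate every `name` in a group into one comma-separated string.
concatenated = df.groupBy("group_key").agg(
    F.concat_ws(", ", F.collect_list("name")).alias("names")
)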
Example #6
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compile_array_collect(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    return F.collect_list(src_column)


# --------------------------- Null Operations -----------------------------
Example #7
Source File: spark_evaluation.py From azure-python-labs with MIT License | 5 votes |
def _get_relevant_items_by_timestamp(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently according to timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k: number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user
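Note that this example uses collect_list() as a window function over Window.partitionBy(col_user) rather than inside a groupBy aggregation: the window form appends the full per-user list to every remaining row (which is why dropDuplicates() is needed afterwards), whereas a groupBy collapses to one row per user. A rough sketch of the contrast, assuming a DataFrame df with hypothetical columns userID and itemID:

import pyspark.sql.functions as F
from pyspark.sql import Window

# Window form: every row for a user carries the same full item list.
w = Window.partitionBy("userID")
with_lists = df.withColumn("items", F.collect_list("itemID").over(w))

# groupBy form: one row per user holding the collected list.
per_user = df.groupBy("userID").agg(F.collect_list("itemID").alias("items"))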
Example #8
Source File: listening_activity.py From listenbrainz-server with GNU General Public License v2.0 | 5 votes |
def get_listening_activity():
    """ Calculate number of listens for each user in time ranges given in the 'time_range' table """
    # Calculate the number of listens in each time range for each user except the time ranges
    # which have zero listens.
    result_without_zero_days = run_query("""
            SELECT listens.user_name
                 , time_range.time_range
                 , count(listens.user_name) as listen_count
              FROM listens
              JOIN time_range
                ON listens.listened_at >= time_range.start
               AND listens.listened_at <= time_range.end
          GROUP BY listens.user_name
                 , time_range.time_range
            """)
    result_without_zero_days.createOrReplaceTempView('result_without_zero_days')

    # Add the time ranges which have zero listens to the previous dataframe
    result = run_query("""
            SELECT dist_user_name.user_name
                 , time_range.time_range
                 , to_unix_timestamp(time_range.start) as from_ts
                 , to_unix_timestamp(time_range.end) as to_ts
                 , ifnull(result_without_zero_days.listen_count, 0) as listen_count
              FROM (SELECT DISTINCT user_name FROM listens) dist_user_name
        CROSS JOIN time_range
         LEFT JOIN result_without_zero_days
                ON result_without_zero_days.user_name = dist_user_name.user_name
               AND result_without_zero_days.time_range = time_range.time_range
            """)

    # Create a table with a list of time ranges and corresponding listen count for each user
    iterator = result \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    return iterator
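The sort_array(collect_list(struct(...))) combination used here deserves a quick note: sort_array() orders the collected structs field by field, left to right, so placing listen_count first in the struct effectively sorts each user's activity list by listen count (pass asc=False for descending order, as Examples #14 and #17 below do). A minimal sketch with invented data:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("alice", 5, "week"), ("alice", 2, "month"), ("alice", 9, "year")],
    ["user_name", "listen_count", "time_range"],
)

result = (
    df.withColumn("activity", F.struct("listen_count", "time_range"))
    .groupBy("user_name")
    # Structs compare field by field, so the list comes back ordered by listen_count.
    .agg(F.sort_array(F.collect_list("activity")).alias("listening_activity"))
)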
Example #9
Source File: listening_activity.py From listenbrainz-server with GNU General Public License v2.0 | 5 votes |
def get_listening_activity_all_time() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate the number of listens for a user in each year starting from LAST_FM_FOUNDING_YEAR (2002). """
    current_app.logger.debug("Calculating listening_activity_all_time")

    to_date = get_latest_listen_ts()
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    result_without_zero_years = None
    for year in range(from_date.year, to_date.year + 1):
        year_start = datetime(year, 1, 1)
        year_end = get_year_end(year)
        try:
            _get_listens(year_start, year_end)
        except HDFSException:
            # Skip if no listens present in df
            continue
        year_df = run_query("""
                    SELECT user_name,
                           count(user_name) as listen_count
                      FROM listens
                  GROUP BY user_name
                  """)
        year_df = year_df.withColumn('time_range', lit(str(year))).withColumn(
            'from_ts', lit(year_start.timestamp())).withColumn('to_ts', lit(year_end.timestamp()))
        result_without_zero_years = result_without_zero_years.union(year_df) if result_without_zero_years else year_df

    # Create a table with a list of time ranges and corresponding listen count for each user
    data = result_without_zero_years \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    messages = create_messages(data=data, stats_range='all_time',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")

    return messages
Example #10
Source File: spark_evaluation.py From azure-python-labs with MIT License | 5 votes |
def _get_top_k_items(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark DataFrame,
    output a Spark DataFrame in the dense format of top k items for each user.

    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
            customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # this does not work for rating of the same value.
    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user
Example #11
Source File: spark_evaluation.py From azure-python-labs with MIT License | 5 votes |
def _get_relevant_items_by_threshold(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=PREDICTION_COL,
        threshold=DEFAULT_THRESHOLD
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those having ratings above a certain threshold.
    The threshold is defined as a statistical measure of the ratings for a user, e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    items_for_user = (
        dataframe
        .orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(
            col_user, col_item, col_rating
        )
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates()
    )

    return items_for_user
Example #12
Source File: spark_evaluation.py From azure-python-labs with MIT License | 5 votes |
def _get_relevant_items_by_timestamp(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently according to timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k: number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user
Example #13
Source File: spark_evaluation.py From azure-python-labs with MIT License | 5 votes |
def _get_relevant_items_by_threshold(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=PREDICTION_COL,
        threshold=DEFAULT_THRESHOLD
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those having ratings above a certain threshold.
    The threshold is defined as a statistical measure of the ratings for a user, e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    items_for_user = (
        dataframe
        .orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(
            col_user, col_item, col_rating
        )
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates()
    )

    return items_for_user
Example #14
Source File: artist.py From listenbrainz-server with GNU General Public License v2.0 | 4 votes |
def get_artists(table):
    """ Get artist information (artist_name, artist_msid etc) for every user
        ordered by listen count

        Args:
            table (str): name of the temporary table.

        Returns:
            iterator (iter): an iterator over result
                    {
                        user1: [{
                            'artist_name': str,
                            'artist_msid': str,
                            'artist_mbids': list(str),
                            'listen_count': int
                        }],
                        user2: [{...}],
                    }
    """
    result = run_query("""
            SELECT user_name
                 , artist_name
                 , CASE
                     WHEN cardinality(artist_mbids) > 0 THEN NULL
                     ELSE nullif(artist_msid, '')
                   END as artist_msid
                 , artist_mbids
                 , count(artist_name) as listen_count
              FROM {table}
          GROUP BY user_name
                 , artist_name
                 , artist_msid
                 , artist_mbids
            """.format(table=table))

    iterator = result \
        .withColumn("artists", struct("listen_count", "artist_name", "artist_msid", "artist_mbids")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("artists"), asc=False).alias("artists")) \
        .toLocalIterator()

    return iterator
Example #15
Source File: taar_lite_guidguid.py From python_mozetl with MIT License | 4 votes |
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed
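As a side note, on reasonably recent Spark versions the combine_and_map_cols UDF above is probably avoidable: F.struct() can build the (id, n) pairs natively before they are fed to collect_list(). A hedged sketch of that variant, reusing the addon_co_installations DataFrame from the example:

import pyspark.sql.functions as F

addon_co_installations_collapsed = (
    addon_co_installations
    .select(
        "key_addon",
        # Build the (id, n) struct without a Python UDF.
        F.struct(
            F.col("coinstalled_addon").alias("id"),
            F.col("count").alias("n"),
        ).alias("id_n"),
    )
    .groupBy("key_addon")
    .agg(F.collect_list("id_n").alias("coinstallation_counts"))
)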
Example #16
Source File: taar_lite_guidguid.py From telemetry-airflow with Mozilla Public License 2.0 | 4 votes |
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed
Example #17
Source File: release.py From listenbrainz-server with GNU General Public License v2.0 | 4 votes |
def get_releases(table):
    """ Get release information (release_name, release_mbid etc) for every user
        ordered by listen count (number of times a user has listened to tracks
        which belong to a particular release).

        Args:
            table: name of the temporary table

        Returns:
            iterator (iter): an iterator over result
                    {
                        'user1': [{
                            'release_name': str,
                            'release_msid': str,
                            'release_mbid': str,
                            'artist_name': str,
                            'artist_msid': str,
                            'artist_mbids': list(str),
                            'listen_count': int
                        }],
                        'user2': [{...}],
                    }
    """
    result = run_query("""
            SELECT user_name
                 , nullif(release_name, '') as release_name
                 , CASE
                     WHEN release_mbid IS NOT NULL AND release_mbid != '' THEN NULL
                     ELSE nullif(release_msid, '')
                   END as release_msid
                 , nullif(release_mbid, '') as release_mbid
                 , artist_name
                 , CASE
                     WHEN cardinality(artist_mbids) > 0 THEN NULL
                     ELSE nullif(artist_msid, '')
                   END as artist_msid
                 , artist_mbids
                 , count(release_name) as listen_count
              FROM {}
             WHERE release_name IS NOT NULL AND release_name != ''
          GROUP BY user_name
                 , release_name
                 , release_msid
                 , release_mbid
                 , artist_name
                 , artist_msid
                 , artist_mbids
            """.format(table))

    iterator = result \
        .withColumn("releases", struct("listen_count", "release_name", "release_msid", "release_mbid",
                                       "artist_name", "artist_msid", "artist_mbids")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("releases"), asc=False).alias("releases")) \
        .toLocalIterator()

    return iterator
Example #18
Source File: norm_query_clustering.py From search-MjoLniR with MIT License | 4 votes |
def cluster_within_norm_query_groups(df: DataFrame) -> DataFrame:
    make_groups = F.udf(_make_query_groups, T.ArrayType(T.StructType([
        T.StructField('query', T.StringType(), nullable=False),
        T.StructField('norm_query_group_id', T.IntegerType(), nullable=False),
    ])))
    return (
        df
        .groupBy('wikiid', 'norm_query')
        .agg(F.collect_list(F.struct('query', 'hit_page_ids')).alias('source'))
        .select(
            'wikiid', 'norm_query',
            F.explode(make_groups('source')).alias('group'))
        .select('wikiid', 'norm_query', 'group.query', 'group.norm_query_group_id'))
Example #19
Source File: metrics.py From search-MjoLniR with MIT License | 4 votes |
def ndcg(df, k, label_col='label', position_col='hit_position',
         wiki_col='wikiid', query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    query_cols : list of str
        Column names to group by, which indicate a unique query displayed to a user

    Returns
    -------
    float
        The ndcg@k value, always between 0 and 1
    """
    if wiki_col not in query_cols:
        query_cols = query_cols + [wiki_col]

    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (
        df
        .select(label_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))

    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (
        df
        .select(label_col, position_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('predictedTopAtK')))

    return {row[wiki_col]: row.ndcgAtK for row in topAtK
            .join(predictedTopAtK, query_cols, how='inner')
            .select(wiki_col, _ndcg_at(k, label_col)('predictedTopAtK', 'topAtK').alias('ndcgAtK'))
            .groupBy(wiki_col)
            .agg(F.mean('ndcgAtK').alias('ndcgAtK'))
            .collect()}
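For orientation, a hypothetical call to the function above might look like the following, assuming a DataFrame with the columns named in the docstring (the wiki keys and scores shown are invented):

# df is assumed to have columns: wikiid, query, session_id, hit_position, label
per_wiki_ndcg = ndcg(df, k=10)
# e.g. {'enwiki': 0.83, 'dewiki': 0.79}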