Python pyspark.sql.functions.row_number() Examples
The following are 18 code examples of pyspark.sql.functions.row_number(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.functions, or try the search function.
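Before the examples, a minimal sketch of the pattern most of them share: row_number() assigns consecutive integers starting at 1 within each window partition, in the order given by the window's orderBy, which makes "top-N per group" filters straightforward. The DataFrame and column names below are purely illustrative and do not come from any of the projects listed.

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# Toy data: one row per (user, item) pair with a score.
df = spark.createDataFrame(
    [("a", "x", 3.0), ("a", "y", 5.0), ("b", "x", 1.0), ("b", "z", 4.0)],
    ["user", "item", "score"],
)

# Number rows within each user, highest score first.
w = Window.partitionBy("user").orderBy(F.col("score").desc())

top_per_user = (
    df.withColumn("rn", F.row_number().over(w))
      .where(F.col("rn") == 1)   # keep only the best-scoring row per user
      .drop("rn")
)
top_per_user.show()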
Example #1
Source File: candidate_sets.py From listenbrainz-server with GNU General Public License v2.0 | 7 votes |
def get_top_artists(mapped_listens_subset, top_artist_limit):
    """ Get top artists listened to by users who have a listening history in the past X days
        where X = RECOMMENDATION_GENERATION_WINDOW.

        Args:
            mapped_listens_subset (dataframe): A subset of mapped_df containing user history.
            top_artist_limit (int): number of top artists to calculate

        Returns:
            top_artists_df (dataframe): Top Y artists listened to by a user for all users where
                Y = TOP_ARTISTS_LIMIT
    """
    df = mapped_listens_subset.select('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name') \
                              .groupBy('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name') \
                              .agg(func.count('mb_artist_credit_id').alias('count'))

    window = Window.partitionBy('user_name').orderBy(col('count').desc())

    top_artists_df = df.withColumn('rank', row_number().over(window)) \
                       .where(col('rank') <= top_artist_limit) \
                       .select('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name')

    return top_artists_df
Example #2
Source File: sampler.py From python_mozetl with MIT License | 5 votes |
def transform(landfill, n_documents=1000):
    meta_schema = StructType(
        [StructField(k, StringType(), True) for k in META_WHITELIST]
    )

    schema = StructType(
        [
            StructField("namespace", StringType(), False),
            StructField("doc_type", StringType(), False),
            StructField("doc_version", StringType(), True),
            StructField("doc_id", StringType(), True),
            StructField("meta", meta_schema, False),
            StructField("content", StringType(), False),
        ]
    )

    documents = (
        landfill.map(_process)
        .filter(lambda x: x[0] and x[1] and x[-2] and x[-1])
        .toDF(schema)
    )

    window_spec = Window.partitionBy("namespace", "doc_type", "doc_version").orderBy(
        "doc_id"
    )

    df = (
        documents.fillna("0", "doc_version")
        .withColumn("row_id", row_number().over(window_spec))
        .where(col("row_id") <= n_documents)
        .drop("row_id")
    )

    return df
Example #3
Source File: spark_evaluation.py From azure-python-labs with MIT License | 5 votes |
def _get_top_k_items(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.

    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
            customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # this does not work for rating of the same value.
    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user
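The inline comment above flags a real caveat: row_number() breaks ties arbitrarily, so when several items share the same rating the rank <= k cutoff may keep a different subset from run to run. Below is a hedged sketch of two common workarounds, reusing the names from the example; using col_item as a tie-breaker and switching to F.rank() are illustrative choices, not part of the original library.

# Option 1: make the ordering deterministic by adding an explicit tie-breaker column.
window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc(), col(col_item))

# Option 2: keep every tied item by using rank() instead of row_number();
# note this can return more than k rows per user when ties straddle the cutoff.
items_for_user = (
    dataframe.select(
        col_user, col_item, col_rating, F.rank().over(window_spec).alias("rank")
    )
    .where(col("rank") <= k)
    .groupby(col_user)
    .agg(F.collect_list(col_item).alias(col_prediction))
)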
Example #4
Source File: window.py From koalas with Apache License 2.0 | 5 votes |
def sum(self):
    def sum(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.sum(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(sum)
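This and the next four examples (#4 through #8) follow the same gating pattern: row_number() over an unbounded window counts how many rows have been seen so far, and the aggregate is replaced with null until that count reaches _min_periods. A standalone sketch of the same idea in plain PySpark follows; the column names, ordering column, and min_periods value are illustrative and independent of koalas internals, and the spark session, F, and Window imports are assumed from the first sketch above.

# Expanding sum that stays null until at least min_periods rows have been seen.
min_periods = 2

w_count = Window.orderBy("ts")  # used only to count rows seen so far
w_sum = Window.orderBy("ts").rowsBetween(Window.unboundedPreceding, Window.currentRow)

expanding_sum = F.when(
    F.row_number().over(w_count) >= min_periods,
    F.sum("value").over(w_sum),
).otherwise(F.lit(None))

toy = spark.createDataFrame([(1, 10.0), (2, 20.0), (3, 30.0)], ["ts", "value"])
toy.withColumn("expanding_sum", expanding_sum).show()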
Example #5
Source File: window.py From koalas with Apache License 2.0 | 5 votes |
def min(self):
    def min(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.min(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(min)
Example #6
Source File: window.py From koalas with Apache License 2.0 | 5 votes |
def max(self):
    def max(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.max(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(max)
Example #7
Source File: window.py From koalas with Apache License 2.0 | 5 votes |
def mean(self):
    def mean(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.mean(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(mean)
Example #8
Source File: window.py From koalas with Apache License 2.0 | 5 votes |
def std(self):
    def std(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.stddev(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(std)
Example #9
Source File: spark_evaluation.py From azure-python-labs with MIT License | 5 votes |
def _get_relevant_items_by_timestamp(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently according
    to timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k: number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user
Example #10
Source File: internal.py From koalas with Apache License 2.0 | 5 votes |
def attach_sequence_column(sdf, column_name):
    scols = [scol_for(sdf, column) for column in sdf.columns]
    sequential_index = F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) - 1
    return sdf.select(sequential_index.alias(column_name), *scols)
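A brief illustration of what the helper above produces: because the window has no partitionBy, Spark funnels all rows through a single partition to assign one global, 0-based index, so the pattern is best reserved for data that is small enough or for cases where a global ordinal is genuinely required. The usage below is hypothetical and reuses the spark session, F, and Window imports from the first sketch.

# Hypothetical stand-in for attach_sequence_column on a tiny DataFrame.
sdf = spark.createDataFrame([("a",), ("b",), ("c",)], ["value"])
sequential_index = F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) - 1
sdf.select(sequential_index.alias("row_id"), "*").show()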
Example #11
Source File: groupby.py From koalas with Apache License 2.0 | 4 votes |
def nsmallest(self, n=5):
    """
    Return the first n rows ordered by columns in ascending order in group.

    Return the first n rows with the smallest values in columns, in ascending order.
    The columns that are not specified are returned as well, but not used for ordering.

    Parameters
    ----------
    n : int
        Number of items to retrieve.

    See Also
    --------
    databricks.koalas.Series.nsmallest
    databricks.koalas.DataFrame.nsmallest

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

    >>> df.groupby(['a'])['b'].nsmallest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
    a
    1  0    1
    2  3    2
    3  6    3
    Name: b, dtype: int64
    """
    if len(self._kdf._internal.index_names) > 1:
        raise ValueError("nsmallest do not support multi-index now")

    sdf = self._kdf._internal.spark_frame
    name = self._agg_columns[0]._internal.data_spark_column_names[0]
    window = Window.partitionBy(self._groupkeys_scols).orderBy(
        self._agg_columns[0].spark.column, NATURAL_ORDER_COLUMN_NAME
    )
    sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)

    internal = InternalFrame(
        spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
        index_map=OrderedDict(
            [
                (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                for s in self._groupkeys
            ]
            + list(self._kdf._internal.index_map.items())
        ),
        data_spark_columns=[scol_for(sdf, name)],
    )
    return first_series(DataFrame(internal))

# TODO: add keep parameter
Example #12
Source File: spark_splitters.py From azure-python-labs with MIT License | 4 votes |
def spark_timestamp_split(
    data,
    ratio=0.75,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark timestamp based splitter
    The splitter splits the data into sets by timestamps without stratification on either
    user or item.
    The ratios are applied on the timestamp column which is divided accordingly into
    several partitions.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios are not summed to 1, they will be normalized.
            Earlier indexed splits will have earlier times
            (e.g. the latest time in split[0] <= the earliest time in split[1])
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps. Float number represented in
            seconds since Epoch.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    multi_split, ratio = process_split_ratio(ratio)

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.orderBy(col(col_timestamp))
    rating = data.withColumn("rank", row_number().over(window_spec))

    data_count = rating.count()
    rating_rank = rating.withColumn("rank", row_number().over(window_spec) / data_count)

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop(
                "rank"
            )
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            ).drop("rank")

        splits.append(rating_split)

    return splits
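The splitter above converts row_number() into an approximate quantile by dividing by the total row count, which costs an extra count() action. Below is a hedged alternative sketch using Spark's built-in percent_rank() over the same window; this is not how the original library implements it, and the boundary handling differs slightly because percent_rank() computes (rank - 1) / (rows - 1).

from pyspark.sql.functions import percent_rank

# Single-pass variant: no separate count() needed.
window_spec = Window.orderBy(col(col_timestamp))
rating_rank = data.withColumn("rank", percent_rank().over(window_spec))
first_split = rating_rank.filter(col("rank") <= ratio_index[0]).drop("rank")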
Example #13
Source File: series.py From koalas with Apache License 2.0 | 4 votes |
def _rank(self, method="average", ascending=True, part_cols=()):
    if method not in ["average", "min", "max", "first", "dense"]:
        msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
        raise ValueError(msg)

    if len(self._internal.index_spark_column_names) > 1:
        raise ValueError("rank do not support index now")

    if ascending:
        asc_func = lambda scol: scol.asc()
    else:
        asc_func = lambda scol: scol.desc()

    if method == "first":
        window = (
            Window.orderBy(
                asc_func(self.spark.column), asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)),
            )
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        scol = F.row_number().over(window)
    elif method == "dense":
        window = (
            Window.orderBy(asc_func(self.spark.column))
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        scol = F.dense_rank().over(window)
    else:
        if method == "average":
            stat_func = F.mean
        elif method == "min":
            stat_func = F.min
        elif method == "max":
            stat_func = F.max
        window1 = (
            Window.orderBy(asc_func(self.spark.column))
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        window2 = Window.partitionBy([self.spark.column] + list(part_cols)).rowsBetween(
            Window.unboundedPreceding, Window.unboundedFollowing
        )
        scol = stat_func(F.row_number().over(window1)).over(window2)
    kser = self._with_new_scol(scol).rename(self.name)
    return kser.astype(np.float64)
Example #14
Source File: groupby.py From koalas with Apache License 2.0 | 4 votes |
def nlargest(self, n=5):
    """
    Return the first n rows ordered by columns in descending order in group.

    Return the first n rows with the largest values in columns, in descending order.
    The columns that are not specified are returned as well, but not used for ordering.

    Parameters
    ----------
    n : int
        Number of items to retrieve.

    See Also
    --------
    databricks.koalas.Series.nlargest
    databricks.koalas.DataFrame.nlargest

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

    >>> df.groupby(['a'])['b'].nlargest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
    a
    1  1    2
    2  4    3
    3  7    4
    Name: b, dtype: int64
    """
    if len(self._kdf._internal.index_names) > 1:
        raise ValueError("nlargest do not support multi-index now")

    sdf = self._kdf._internal.spark_frame
    name = self._agg_columns[0]._internal.data_spark_column_names[0]
    window = Window.partitionBy(self._groupkeys_scols).orderBy(
        self._agg_columns[0].spark.column.desc(), NATURAL_ORDER_COLUMN_NAME
    )
    sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)

    internal = InternalFrame(
        spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
        index_map=OrderedDict(
            [
                (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                for s in self._groupkeys
            ]
            + list(self._kdf._internal.index_map.items())
        ),
        data_spark_columns=[scol_for(sdf, name)],
    )
    return first_series(DataFrame(internal))

# TODO: add bins, normalize parameter
Example #15
Source File: metrics.py From search-MjoLniR with MIT License | 4 votes |
def ndcg(df, k, label_col='label', position_col='hit_position', wiki_col='wikiid',
         query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    query_cols : list of str
        Column names to group by, which indicate a unique query displayed to a user

    Returns
    -------
    float
        The ndcg@k value, always between 0 and 1
    """
    if wiki_col not in query_cols:
        query_cols = query_cols + [wiki_col]

    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (
        df
        .select(label_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))

    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (
        df
        .select(label_col, position_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('predictedTopAtK')))

    return {row[wiki_col]: row.ndcgAtK for row in topAtK
            .join(predictedTopAtK, query_cols, how='inner')
            .select(wiki_col, _ndcg_at(k, label_col)('predictedTopAtK', 'topAtK').alias('ndcgAtK'))
            .groupBy(wiki_col)
            .agg(F.mean('ndcgAtK').alias('ndcgAtK'))
            .collect()}
Example #16
Source File: window.py From koalas with Apache License 2.0 | 4 votes |
def count(self):
    """
    The expanding count of any non-NaN observations inside the window.

    .. note:: the current implementation of this API uses Spark's Window without
        specifying partition specification. This leads to move all data into
        single partition in single machine and could cause serious
        performance degradation. Avoid this method against very large dataset.

    Returns
    -------
    Series or DataFrame
        Returned object type is determined by the caller of the expanding calculation.

    See Also
    --------
    Series.expanding : Calling object with Series data.
    DataFrame.expanding : Calling object with DataFrames.
    Series.count : Count of the full Series.
    DataFrame.count : Count of the full DataFrame.

    Examples
    --------
    >>> s = ks.Series([2, 3, float("nan"), 10])
    >>> s.expanding().count()
    0    1.0
    1    2.0
    2    2.0
    3    3.0
    Name: 0, dtype: float64

    >>> s.to_frame().expanding().count()
         0
    0  1.0
    1  2.0
    2  2.0
    3  3.0
    """

    def count(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.count(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(count).astype("float64")
Example #17
Source File: candidate_sets.py From listenbrainz-server with GNU General Public License v2.0 | 4 votes |
def get_top_similar_artists(top_artists_df, artists_relation_df, similar_artist_limit):
    """ Get artists similar to top artists.

        Args:
            top_artists_df: Dataframe containing top artists listened to by users
            artists_relation_df: Dataframe containing artists and similar artists.
                                 For columns refer to artist_relation_schema in
                                 listenbrainz_spark/schema.py.
            similar_artist_limit (int): number of similar artists to calculate

        Returns:
            top_similar_artists_df (dataframe): Top Z artists similar to top artists where
                Z = SIMILAR_ARTISTS_LIMIT.
    """
    condition = [top_artists_df.mb_artist_credit_id == artists_relation_df.id_0]

    df1 = top_artists_df.join(artists_relation_df, condition, 'inner') \
                        .select(col('id_0').alias('top_artist_credit_id'),
                                col('name_0').alias('top_artist_name'),
                                col('id_1').alias('similar_artist_credit_id'),
                                col('name_1').alias('similar_artist_name'),
                                'score',
                                'user_name')

    condition = [top_artists_df.mb_artist_credit_id == artists_relation_df.id_1]

    df2 = top_artists_df.join(artists_relation_df, condition, 'inner') \
                        .select(col('id_1').alias('top_artist_credit_id'),
                                col('name_1').alias('top_artist_name'),
                                col('id_0').alias('similar_artist_credit_id'),
                                col('name_0').alias('similar_artist_name'),
                                'score',
                                'user_name')

    similar_artists_df = df1.union(df2)

    window = Window.partitionBy('top_artist_credit_id', 'user_name')\
                   .orderBy(col('score').desc())

    top_similar_artists_df = similar_artists_df.withColumn('rank', row_number().over(window)) \
                                               .where(col('rank') <= similar_artist_limit)\
                                               .select('top_artist_credit_id', 'top_artist_name',
                                                       'similar_artist_credit_id', 'similar_artist_name',
                                                       'score', 'user_name')

    return top_similar_artists_df
Example #18
Source File: compiler.py From ibis with Apache License 2.0 | 4 votes |
def compile_row_number(t, expr, scope, *, window, **kwargs):
    return F.row_number().over(window).cast('long') - 1


# -------------------------- Temporal Operations ----------------------------

# Ibis value to PySpark value
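Spark's row_number() is 1-based, while Ibis follows pandas-style 0-based numbering, hence the cast to long and the subtraction in the compiler rule above. A one-line sketch of the same adjustment in plain PySpark; the ordering column and the Window/F imports are assumed, not taken from the Ibis source.

# 0-based row number over an illustrative ordering column.
zero_based_rn = (F.row_number().over(Window.orderBy("ts")) - 1).cast("long")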