Python pyspark.sql.functions.mean() Examples
The following are 17 code examples of pyspark.sql.functions.mean(), collected from open source projects such as ibis, koalas, dist-keras, python_mozetl, sparklingpandas, and search-MjoLniR.
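Before the project examples, here is a minimal sketch of how F.mean behaves, both as an aggregate and as a window function. The DataFrame and column names are illustrative, not taken from any of the projects below:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1.0), ("a", 3.0), ("b", 5.0)], ["key", "value"])

# As an aggregate: one output row per group, holding the group mean.
df.groupBy("key").agg(F.mean("value").alias("mean_value")).show()

# As a window function: the group mean is attached to every input row.
w = Window.partitionBy("key")
df.withColumn("mean_value", F.mean("value").over(w)).show()

Note that F.mean is an alias of F.avg, and an unaliased call produces a column named avg(<col>); that naming convention comes up again in Example #11.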
Example #1
Source File: test_basic.py From ibis with Apache License 2.0

def test_window(client):
    import pyspark.sql.functions as F
    from pyspark.sql.window import Window

    table = client.table('basic_table')
    w = ibis.window()
    result = table.mutate(
        grouped_demeaned=table['id'] - table['id'].mean().over(w)
    ).compile()

    spark_window = Window.partitionBy()
    spark_table = table.compile()
    expected = spark_table.withColumn(
        'grouped_demeaned',
        spark_table['id'] - F.mean(spark_table['id']).over(spark_window),
    )

    tm.assert_frame_equal(result.toPandas(), expected.toPandas())
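Here Window.partitionBy() with no arguments places all rows in a single partition, so F.mean(...).over(spark_window) attaches the grand mean of id to every row, and the mutation demeans the whole column.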
Example #2
Source File: series.py From koalas with Apache License 2.0

def mad(self):
    """
    Return the mean absolute deviation of values.

    Examples
    --------
    >>> s = ks.Series([1, 2, 3, 4])
    >>> s
    0    1
    1    2
    2    3
    3    4
    Name: 0, dtype: int64

    >>> s.mad()
    1.0
    """
    sdf = self._internal.spark_frame
    spark_column = self.spark.column
    avg = unpack_scalar(sdf.select(F.avg(spark_column)))
    mad = unpack_scalar(sdf.select(F.avg(F.abs(spark_column - avg))))

    return mad
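The method makes two passes over the data: one aggregate for the mean, then one for the mean of the absolute deviations from it. The same two-step computation in plain PySpark might look like this (df and value are assumed names):

import pyspark.sql.functions as F

# First pass: the plain mean of the column.
avg = df.select(F.avg("value")).first()[0]
# Second pass: the mean absolute deviation around that mean.
mad = df.select(F.avg(F.abs(F.col("value") - avg))).first()[0]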
Example #3
Source File: transformers.py From dist-keras with GNU General Public License v3.0

def transform(self, dataframe):
    """Applies standardization to the specified columns.

    # Arguments
        dataframe: dataframe. Spark Dataframe.
    """
    # Compute the means of the specified columns.
    means = [mean(x) for x in self.columns]
    means = dataframe.select(means).collect()[0].asDict()
    self.means = self.clean_mean_keys(means)
    # Compute the standard deviation of the specified columns.
    stddevs = [stddev_pop(x) for x in self.columns]
    stddevs = dataframe.select(stddevs).collect()[0].asDict()
    self.stddevs = self.clean_stddev_keys(stddevs)
    # For every feature, add a new column to the dataframe.
    for column in self.columns:
        self.current_column = column
        dataframe = dataframe.rdd.map(self._transform).toDF()

    return dataframe
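Note the design: the column means and population standard deviations are each computed once with a single select, and the row-level standardization is then applied through an RDD map (see _transform in Example #10 below), converting back to a DataFrame once per column.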
Example #4
Source File: compiler.py From ibis with Apache License 2.0

def compile_mean(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.mean, context, **kwargs)
Example #5
Source File: fields.py From python_mozetl with MIT License

def agg_mean(field_name, alias=None):
    field_alias = get_alias(field_name, alias, "mean")
    return F.mean(field_name).alias(field_alias)
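Assuming get_alias falls back to "<field>_mean" when no explicit alias is given, the helper would be used inside an aggregation like this (the DataFrame and column names are hypothetical):

df.groupBy("client_id").agg(
    agg_mean("active_hours"),                # -> column "active_hours_mean"
    agg_mean("active_hours", "avg_active"),  # -> column "avg_active"
)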
Example #6
Source File: groupby.py From sparklingpandas with Apache License 2.0

def mean(self):
    """Compute mean of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.mean)
    self._prep_pandas_groupby()
    return DataFrame.fromDataFrameRDD(
        self._regroup_mergedRDD().values().map(
            lambda x: x.mean()),
        self.sql_ctx)
Example #7
Source File: plot.py From koalas with Apache License 2.0

def _compute_stats(data, colname, whis, precision):
    # Computes mean, median, Q1 and Q3 with approx_percentile and precision
    pdf = data._kdf._internal.resolved_copy.spark_frame.agg(
        *[
            F.expr(
                "approx_percentile({}, {}, {})".format(colname, q, int(1.0 / precision))
            ).alias("{}_{}%".format(colname, int(q * 100)))
            for q in [0.25, 0.50, 0.75]
        ],
        F.mean(colname).alias("{}_mean".format(colname)),
    ).toPandas()

    # Computes IQR and Tukey's fences
    iqr = "{}_iqr".format(colname)
    p75 = "{}_75%".format(colname)
    p25 = "{}_25%".format(colname)
    pdf.loc[:, iqr] = pdf.loc[:, p75] - pdf.loc[:, p25]
    pdf.loc[:, "{}_lfence".format(colname)] = pdf.loc[:, p25] - whis * pdf.loc[:, iqr]
    pdf.loc[:, "{}_ufence".format(colname)] = pdf.loc[:, p75] + whis * pdf.loc[:, iqr]

    qnames = ["25%", "50%", "75%", "mean", "lfence", "ufence"]
    col_summ = pdf[["{}_{}".format(colname, q) for q in qnames]]
    col_summ.columns = qnames
    lfence, ufence = col_summ["lfence"], col_summ["ufence"]

    stats = {
        "mean": col_summ["mean"].values[0],
        "med": col_summ["50%"].values[0],
        "q1": col_summ["25%"].values[0],
        "q3": col_summ["75%"].values[0],
    }

    return stats, (lfence.values[0], ufence.values[0])
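The third argument to approx_percentile is its accuracy parameter, computed here as int(1.0 / precision); the default precision of 0.01 used by the caller in Example #15 therefore requests an accuracy of 100 (Spark's own default is 10000), trading precision for speed and memory.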
Example #8
Source File: generic.py From koalas with Apache License 2.0

def mean(self, axis=None, numeric_only=True):
    """
    Return the mean of the values.

    Parameters
    ----------
    axis : {index (0), columns (1)}
        Axis for the function to be applied on.
    numeric_only : bool, default True
        Include only float, int, boolean columns. False is not supported. This
        parameter is mainly for pandas compatibility.

    Returns
    -------
    mean : scalar for a Series, and a Series for a DataFrame.

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
    ...                   columns=['a', 'b'])

    On a DataFrame:

    >>> df.mean()
    a    2.0
    b    0.2
    Name: 0, dtype: float64

    >>> df.mean(axis=1)
    0    0.55
    1    1.10
    2    1.65
    3     NaN
    Name: 0, dtype: float64

    On a Series:

    >>> df['a'].mean()
    2.0
    """
    return self._reduce_for_stat_function(
        F.mean, name="mean", numeric_only=numeric_only, axis=axis
    )
Example #9
Source File: window.py From koalas with Apache License 2.0

def mean(self):
    def mean(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.mean(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(mean)
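The F.when guard reproduces pandas' min_periods semantics: rows seen before min_periods observations are available get null instead of a partial mean. A standalone version of the same pattern, under assumed column names and window bounds, might be:

import pyspark.sql.functions as F
from pyspark.sql.window import Window

min_periods = 3
# Counts how many rows have been seen so far, in timestamp order.
unbounded = Window.orderBy("ts").rowsBetween(Window.unboundedPreceding, Window.currentRow)
# A rolling window of size 3: the current row and the two before it.
rolling = Window.orderBy("ts").rowsBetween(-2, Window.currentRow)

df.withColumn(
    "rolling_mean",
    F.when(
        F.row_number().over(unbounded) >= min_periods,
        F.mean("value").over(rolling),
    ).otherwise(F.lit(None)),
)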
Example #10
Source File: transformers.py From dist-keras with GNU General Public License v3.0

def _transform(self, row):
    """Take the column, and normalize it with the computed means and std devs."""
    mean = self.means[self.current_column]
    stddev = self.stddevs[self.current_column]
    x = row[self.current_column]
    x_normalized = (x - mean) / stddev
    output_column = self.current_column + self.column_suffix
    new_row = new_dataframe_row(row, output_column, x_normalized)

    return new_row
Example #11
Source File: transformers.py From dist-keras with GNU General Public License v3.0

def clean_mean_keys(self, means):
    """Cleans the keys of the specified dictionary (mean)."""
    new_means = {}
    for k in means:
        new_means[k[4:-1]] = means[k]

    return new_means
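The k[4:-1] slice relies on Spark's default naming for unaliased aggregates: mean(x) compiles to a column literally named avg(x), so dropping the first four characters and the trailing parenthesis recovers the original column name. A quick illustration (df and x are assumed names):

df.select(mean("x")).columns   # ['avg(x)'] -> k[4:-1] yields 'x'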
Example #12
Source File: window.py From koalas with Apache License 2.0

def mean(self):
    """
    Calculate the expanding mean of the values.

    .. note:: the current implementation of this API uses Spark's Window without
        specifying partition specification. This leads to move all data into
        single partition in single machine and could cause serious
        performance degradation. Avoid this method against very large dataset.

    Returns
    -------
    Series or DataFrame
        Returned object type is determined by the caller of the expanding
        calculation.

    See Also
    --------
    Series.expanding : Calling object with Series data.
    DataFrame.expanding : Calling object with DataFrames.
    Series.mean : Equivalent method for Series.
    DataFrame.mean : Equivalent method for DataFrame.

    Examples
    --------
    The below examples will show expanding mean calculations with window sizes of
    two and three, respectively.

    >>> s = ks.Series([1, 2, 3, 4])
    >>> s.expanding(2).mean()
    0    NaN
    1    1.5
    2    2.0
    3    2.5
    Name: 0, dtype: float64

    >>> s.expanding(3).mean()
    0    NaN
    1    NaN
    2    2.0
    3    2.5
    Name: 0, dtype: float64
    """
    return super(Expanding, self).mean()
Example #13
Source File: window.py From koalas with Apache License 2.0

def mean(self):
    """
    Calculate the expanding mean of the values.

    Returns
    -------
    Series or DataFrame
        Returned object type is determined by the caller of the expanding
        calculation.

    See Also
    --------
    Series.expanding : Calling object with Series data.
    DataFrame.expanding : Calling object with DataFrames.
    Series.mean : Equivalent method for Series.
    DataFrame.mean : Equivalent method for DataFrame.

    Examples
    --------
    >>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
    >>> s.groupby(s).expanding(3).mean().sort_index()  # doctest: +NORMALIZE_WHITESPACE
    0
    2  0     NaN
       1     NaN
    3  2     NaN
       3     NaN
       4     3.0
    4  5     NaN
       6     NaN
       7     4.0
       8     4.0
    5  9     NaN
       10    NaN
    Name: 0, dtype: float64

    For DataFrame, each expanding mean is computed column-wise.

    >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
    >>> df.groupby(df.A).expanding(2).mean().sort_index()  # doctest: +NORMALIZE_WHITESPACE
              A     B
    A
    2 0     NaN   NaN
      1     2.0   4.0
    3 2     NaN   NaN
      3     3.0   9.0
      4     3.0   9.0
    4 5     NaN   NaN
      6     4.0  16.0
      7     4.0  16.0
      8     4.0  16.0
    5 9     NaN   NaN
      10    5.0  25.0
    """
    return super(ExpandingGroupby, self).mean()
Example #14
Source File: window.py From koalas with Apache License 2.0

def mean(self):
    """
    The rolling mean of any non-NaN observations inside the window.

    Returns
    -------
    Series or DataFrame
        Returned object type is determined by the caller of the rolling
        calculation.

    See Also
    --------
    Series.rolling : Calling object with Series data.
    DataFrame.rolling : Calling object with DataFrames.
    Series.mean : Mean of the full Series.
    DataFrame.mean : Mean of the full DataFrame.

    Examples
    --------
    >>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
    >>> s.groupby(s).rolling(3).mean().sort_index()  # doctest: +NORMALIZE_WHITESPACE
    0
    2  0     NaN
       1     NaN
    3  2     NaN
       3     NaN
       4     3.0
    4  5     NaN
       6     NaN
       7     4.0
       8     4.0
    5  9     NaN
       10    NaN
    Name: 0, dtype: float64

    For DataFrame, each rolling mean is computed column-wise.

    >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
    >>> df.groupby(df.A).rolling(2).mean().sort_index()  # doctest: +NORMALIZE_WHITESPACE
              A     B
    A
    2 0     NaN   NaN
      1     2.0   4.0
    3 2     NaN   NaN
      3     3.0   9.0
      4     3.0   9.0
    4 5     NaN   NaN
      6     4.0  16.0
      7     4.0  16.0
      8     4.0  16.0
    5 9     NaN   NaN
      10    5.0  25.0
    """
    return super(RollingGroupby, self).mean()
Example #15
Source File: plot.py From koalas with Apache License 2.0

def _compute_plot_data(self):
    colname = self.data.name
    data = self.data

    # Updates all props with the rc defaults from matplotlib
    self.kwds.update(KoalasBoxPlot.rc_defaults(**self.kwds))

    # Gets some important kwds
    showfliers = self.kwds.get("showfliers", False)
    whis = self.kwds.get("whis", 1.5)
    labels = self.kwds.get("labels", [colname])

    # This one is Koalas specific to control precision for approx_percentile
    precision = self.kwds.get("precision", 0.01)

    # Computes mean, median, Q1 and Q3 with approx_percentile and precision
    col_stats, col_fences = KoalasBoxPlot._compute_stats(data, colname, whis, precision)

    # Creates a column to flag rows as outliers or not
    outliers = KoalasBoxPlot._outliers(data, colname, *col_fences)

    # Computes min and max values of non-outliers - the whiskers
    whiskers = KoalasBoxPlot._calc_whiskers(colname, outliers)

    if showfliers:
        fliers = KoalasBoxPlot._get_fliers(colname, outliers)
    else:
        fliers = []

    # Builds bxpstats dict
    stats = []
    item = {
        "mean": col_stats["mean"],
        "med": col_stats["med"],
        "q1": col_stats["q1"],
        "q3": col_stats["q3"],
        "whislo": whiskers[0],
        "whishi": whiskers[1],
        "fliers": fliers,
        "label": labels[0],
    }
    stats.append(item)

    self.data = {labels[0]: stats}
Example #16
Source File: series.py From koalas with Apache License 2.0

def _rank(self, method="average", ascending=True, part_cols=()):
    if method not in ["average", "min", "max", "first", "dense"]:
        msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
        raise ValueError(msg)

    if len(self._internal.index_spark_column_names) > 1:
        raise ValueError("rank do not support index now")

    if ascending:
        asc_func = lambda scol: scol.asc()
    else:
        asc_func = lambda scol: scol.desc()

    if method == "first":
        window = (
            Window.orderBy(
                asc_func(self.spark.column), asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)),
            )
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        scol = F.row_number().over(window)
    elif method == "dense":
        window = (
            Window.orderBy(asc_func(self.spark.column))
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        scol = F.dense_rank().over(window)
    else:
        if method == "average":
            stat_func = F.mean
        elif method == "min":
            stat_func = F.min
        elif method == "max":
            stat_func = F.max
        window1 = (
            Window.orderBy(asc_func(self.spark.column))
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        window2 = Window.partitionBy([self.spark.column] + list(part_cols)).rowsBetween(
            Window.unboundedPreceding, Window.unboundedFollowing
        )
        scol = stat_func(F.row_number().over(window1)).over(window2)
    kser = self._with_new_scol(scol).rename(self.name)
    return kser.astype(np.float64)
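For the tie-handling methods, window1 assigns each row its ordinal position within the ordering, and window2 then applies stat_func across all rows sharing the same value: F.mean of those positions gives pandas' 'average' rank, while F.min and F.max give the 'min' and 'max' variants.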
Example #17
Source File: metrics.py From search-MjoLniR with MIT License

def ndcg(df, k, label_col='label', position_col='hit_position', wiki_col='wikiid',
         query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    wiki_col : str
        Column name containing the wiki identifier used to group the final scores
    query_cols : list of str
        Column names to group by, which indicate a unique query displayed to a user

    Returns
    -------
    dict
        Mapping from wiki id to its mean ndcg@k value, always between 0 and 1
    """
    if wiki_col not in query_cols:
        query_cols = query_cols + [wiki_col]

    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (
        df
        .select(label_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))

    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (
        df
        .select(label_col, position_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('predictedTopAtK')))

    return {row[wiki_col]: row.ndcgAtK for row in topAtK
            .join(predictedTopAtK, query_cols, how='inner')
            .select(wiki_col, _ndcg_at(k, label_col)('predictedTopAtK', 'topAtK').alias('ndcgAtK'))
            .groupBy(wiki_col)
            .agg(F.mean('ndcgAtK').alias('ndcgAtK'))
            .collect()}
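The function computes ndcg@k per query and then averages with F.mean within each wiki, yielding one score per wiki. A hypothetical call, assuming a DataFrame of search hits with the default column names:

per_wiki_scores = ndcg(hits_df, k=10)
# e.g. {'enwiki': 0.83, 'dewiki': 0.79}  (values illustrative)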