Python pyspark.sql.functions.mean() Examples
The following are 17 code examples of pyspark.sql.functions.mean(), collected from open source projects such as ibis, koalas, dist-keras, python_mozetl, sparklingpandas, and search-MjoLniR.
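Before the project examples, here is a minimal sketch of how F.mean behaves, both as an aggregate and as a window function. The DataFrame and column names are illustrative, not taken from any of the projects below:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1.0), ("a", 3.0), ("b", 5.0)], ["key", "value"])

# As an aggregate: one output row per group, holding the group mean.
df.groupBy("key").agg(F.mean("value").alias("mean_value")).show()

# As a window function: the group mean is attached to every input row.
w = Window.partitionBy("key")
df.withColumn("mean_value", F.mean("value").over(w)).show()

Note that F.mean is an alias of F.avg, and an unaliased call produces a column named avg(<col>); that naming convention comes up again in Example #11.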
Example #1
Source File: test_basic.py From ibis with Apache License 2.0

def test_window(client):
    import pyspark.sql.functions as F
    from pyspark.sql.window import Window

    table = client.table('basic_table')
    w = ibis.window()
    result = table.mutate(
        grouped_demeaned=table['id'] - table['id'].mean().over(w)
    ).compile()

    spark_window = Window.partitionBy()
    spark_table = table.compile()
    expected = spark_table.withColumn(
        'grouped_demeaned',
        spark_table['id'] - F.mean(spark_table['id']).over(spark_window),
    )

    tm.assert_frame_equal(result.toPandas(), expected.toPandas())
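Here Window.partitionBy() with no arguments places all rows in a single partition, so F.mean(...).over(spark_window) attaches the grand mean of id to every row, and the mutation demeans the whole column.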
Example #2
Source File: series.py From koalas with Apache License 2.0

def mad(self):
    """
    Return the mean absolute deviation of values.

    Examples
    --------
    >>> s = ks.Series([1, 2, 3, 4])
    >>> s
    0    1
    1    2
    2    3
    3    4
    Name: 0, dtype: int64

    >>> s.mad()
    1.0
    """
    sdf = self._internal.spark_frame
    spark_column = self.spark.column
    avg = unpack_scalar(sdf.select(F.avg(spark_column)))
    mad = unpack_scalar(sdf.select(F.avg(F.abs(spark_column - avg))))

    return mad
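The method makes two passes over the data: one aggregate for the mean, then one for the mean of the absolute deviations from it. The same two-step computation in plain PySpark might look like this (df and value are assumed names):

import pyspark.sql.functions as F

# First pass: the plain mean of the column.
avg = df.select(F.avg("value")).first()[0]
# Second pass: the mean absolute deviation around that mean.
mad = df.select(F.avg(F.abs(F.col("value") - avg))).first()[0]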
Example #3
Source File: transformers.py From dist-keras with GNU General Public License v3.0

def transform(self, dataframe):
    """Applies standardization to the specified columns.

    # Arguments
        dataframe: dataframe. Spark Dataframe.
    """
    # Compute the means of the specified columns.
    means = [mean(x) for x in self.columns]
    means = dataframe.select(means).collect()[0].asDict()
    self.means = self.clean_mean_keys(means)
    # Compute the standard deviation of the specified columns.
    stddevs = [stddev_pop(x) for x in self.columns]
    stddevs = dataframe.select(stddevs).collect()[0].asDict()
    self.stddevs = self.clean_stddev_keys(stddevs)
    # For every feature, add a new column to the dataframe.
    for column in self.columns:
        self.current_column = column
        dataframe = dataframe.rdd.map(self._transform).toDF()

    return dataframe
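Note the design: the column means and population standard deviations are each computed once with a single select, and the row-level standardization is then applied through an RDD map (see _transform in Example #10 below), converting back to a DataFrame once per column.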
Example #4
Source File: compiler.py From ibis with Apache License 2.0

def compile_mean(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.mean, context, **kwargs)
Example #5
Source File: fields.py From python_mozetl with MIT License

def agg_mean(field_name, alias=None):
    field_alias = get_alias(field_name, alias, "mean")
    return F.mean(field_name).alias(field_alias)
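Assuming get_alias falls back to "<field>_mean" when no explicit alias is given, the helper would be used inside an aggregation like this (the DataFrame and column names are hypothetical):

df.groupBy("client_id").agg(
    agg_mean("active_hours"),                # -> column "active_hours_mean"
    agg_mean("active_hours", "avg_active"),  # -> column "avg_active"
)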
Example #6
Source File: groupby.py From sparklingpandas with Apache License 2.0

def mean(self):
    """Compute mean of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.mean)
    self._prep_pandas_groupby()
    return DataFrame.fromDataFrameRDD(
        self._regroup_mergedRDD().values().map(
            lambda x: x.mean()),
        self.sql_ctx)
Example #7
Source File: plot.py From koalas with Apache License 2.0

def _compute_stats(data, colname, whis, precision):
    # Computes mean, median, Q1 and Q3 with approx_percentile and precision
    pdf = data._kdf._internal.resolved_copy.spark_frame.agg(
        *[
            F.expr(
                "approx_percentile({}, {}, {})".format(colname, q, int(1.0 / precision))
            ).alias("{}_{}%".format(colname, int(q * 100)))
            for q in [0.25, 0.50, 0.75]
        ],
        F.mean(colname).alias("{}_mean".format(colname)),
    ).toPandas()

    # Computes IQR and Tukey's fences
    iqr = "{}_iqr".format(colname)
    p75 = "{}_75%".format(colname)
    p25 = "{}_25%".format(colname)
    pdf.loc[:, iqr] = pdf.loc[:, p75] - pdf.loc[:, p25]
    pdf.loc[:, "{}_lfence".format(colname)] = pdf.loc[:, p25] - whis * pdf.loc[:, iqr]
    pdf.loc[:, "{}_ufence".format(colname)] = pdf.loc[:, p75] + whis * pdf.loc[:, iqr]

    qnames = ["25%", "50%", "75%", "mean", "lfence", "ufence"]
    col_summ = pdf[["{}_{}".format(colname, q) for q in qnames]]
    col_summ.columns = qnames
    lfence, ufence = col_summ["lfence"], col_summ["ufence"]

    stats = {
        "mean": col_summ["mean"].values[0],
        "med": col_summ["50%"].values[0],
        "q1": col_summ["25%"].values[0],
        "q3": col_summ["75%"].values[0],
    }

    return stats, (lfence.values[0], ufence.values[0])
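The third argument to approx_percentile is its accuracy parameter, computed here as int(1.0 / precision); the default precision of 0.01 used by the caller in Example #15 therefore requests an accuracy of 100 (Spark's own default is 10000), trading precision for speed and memory.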
Example #8
Source File: generic.py From koalas with Apache License 2.0

def mean(self, axis=None, numeric_only=True):
    """
    Return the mean of the values.

    Parameters
    ----------
    axis : {index (0), columns (1)}
        Axis for the function to be applied on.
    numeric_only : bool, default True
        Include only float, int, boolean columns. False is not supported. This
        parameter is mainly for pandas compatibility.

    Returns
    -------
    mean : scalar for a Series, and a Series for a DataFrame.

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
    ...                   columns=['a', 'b'])

    On a DataFrame:

    >>> df.mean()
    a    2.0
    b    0.2
    Name: 0, dtype: float64

    >>> df.mean(axis=1)
    0    0.55
    1    1.10
    2    1.65
    3     NaN
    Name: 0, dtype: float64

    On a Series:

    >>> df['a'].mean()
    2.0
    """
    return self._reduce_for_stat_function(
        F.mean, name="mean", numeric_only=numeric_only, axis=axis
    )
Example #9
Source File: window.py From koalas with Apache License 2.0

def mean(self):
    def mean(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.mean(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(mean)
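The F.when guard reproduces pandas' min_periods semantics: rows seen before min_periods observations are available get null instead of a partial mean. A standalone version of the same pattern, under assumed column names and window bounds, might be:

import pyspark.sql.functions as F
from pyspark.sql.window import Window

min_periods = 3
# Counts how many rows have been seen so far, in timestamp order.
unbounded = Window.orderBy("ts").rowsBetween(Window.unboundedPreceding, Window.currentRow)
# A rolling window of size 3: the current row and the two before it.
rolling = Window.orderBy("ts").rowsBetween(-2, Window.currentRow)

df.withColumn(
    "rolling_mean",
    F.when(
        F.row_number().over(unbounded) >= min_periods,
        F.mean("value").over(rolling),
    ).otherwise(F.lit(None)),
)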
Example #10
Source File: transformers.py From dist-keras with GNU General Public License v3.0

def _transform(self, row):
    """Take the column, and normalize it with the computed means and std devs."""
    mean = self.means[self.current_column]
    stddev = self.stddevs[self.current_column]
    x = row[self.current_column]
    x_normalized = (x - mean) / stddev
    output_column = self.current_column + self.column_suffix
    new_row = new_dataframe_row(row, output_column, x_normalized)

    return new_row
Example #11
Source File: transformers.py From dist-keras with GNU General Public License v3.0

def clean_mean_keys(self, means):
    """Cleans the keys of the specified dictionary (mean)."""
    new_means = {}
    for k in means:
        new_means[k[4:-1]] = means[k]

    return new_means
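The k[4:-1] slice relies on Spark's default naming for unaliased aggregates: mean(x) compiles to a column literally named avg(x), so dropping the first four characters and the trailing parenthesis recovers the original column name. A quick illustration (df and x are assumed names):

df.select(mean("x")).columns   # ['avg(x)'] -> k[4:-1] yields 'x'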
Example #12
Source File: window.py From koalas with Apache License 2.0

def mean(self):
    """
    Calculate the expanding mean of the values.

    .. note:: the current implementation of this API uses Spark's Window without
        specifying partition specification. This leads to move all data into
        single partition in single machine and could cause serious
        performance degradation. Avoid this method against very large dataset.

    Returns
    -------
    Series or DataFrame
        Returned object type is determined by the caller of the expanding
        calculation.

    See Also
    --------
    Series.expanding : Calling object with Series data.
    DataFrame.expanding : Calling object with DataFrames.
    Series.mean : Equivalent method for Series.
    DataFrame.mean : Equivalent method for DataFrame.

    Examples
    --------
    The below examples will show expanding mean calculations with window sizes of
    two and three, respectively.

    >>> s = ks.Series([1, 2, 3, 4])
    >>> s.expanding(2).mean()
    0    NaN
    1    1.5
    2    2.0
    3    2.5
    Name: 0, dtype: float64

    >>> s.expanding(3).mean()
    0    NaN
    1    NaN
    2    2.0
    3    2.5
    Name: 0, dtype: float64
    """
    return super(Expanding, self).mean()
Example #13
Source File: window.py From koalas with Apache License 2.0

def mean(self):
    """
    Calculate the expanding mean of the values.

    Returns
    -------
    Series or DataFrame
        Returned object type is determined by the caller of the expanding
        calculation.

    See Also
    --------
    Series.expanding : Calling object with Series data.
    DataFrame.expanding : Calling object with DataFrames.
    Series.mean : Equivalent method for Series.
    DataFrame.mean : Equivalent method for DataFrame.

    Examples
    --------
    >>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
    >>> s.groupby(s).expanding(3).mean().sort_index()  # doctest: +NORMALIZE_WHITESPACE
    0
    2  0     NaN
       1     NaN
    3  2     NaN
       3     NaN
       4     3.0
    4  5     NaN
       6     NaN
       7     4.0
       8     4.0
    5  9     NaN
       10    NaN
    Name: 0, dtype: float64

    For DataFrame, each expanding mean is computed column-wise.

    >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
    >>> df.groupby(df.A).expanding(2).mean().sort_index()  # doctest: +NORMALIZE_WHITESPACE
              A     B
    A
    2 0     NaN   NaN
      1     2.0   4.0
    3 2     NaN   NaN
      3     3.0   9.0
      4     3.0   9.0
    4 5     NaN   NaN
      6     4.0  16.0
      7     4.0  16.0
      8     4.0  16.0
    5 9     NaN   NaN
      10    5.0  25.0
    """
    return super(ExpandingGroupby, self).mean()
Example #14
Source File: window.py From koalas with Apache License 2.0

def mean(self):
    """
    The rolling mean of any non-NaN observations inside the window.

    Returns
    -------
    Series or DataFrame
        Returned object type is determined by the caller of the rolling
        calculation.

    See Also
    --------
    Series.rolling : Calling object with Series data.
    DataFrame.rolling : Calling object with DataFrames.
    Series.mean : Mean of the full Series.
    DataFrame.mean : Mean of the full DataFrame.

    Examples
    --------
    >>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
    >>> s.groupby(s).rolling(3).mean().sort_index()  # doctest: +NORMALIZE_WHITESPACE
    0
    2  0     NaN
       1     NaN
    3  2     NaN
       3     NaN
       4     3.0
    4  5     NaN
       6     NaN
       7     4.0
       8     4.0
    5  9     NaN
       10    NaN
    Name: 0, dtype: float64

    For DataFrame, each rolling mean is computed column-wise.

    >>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
    >>> df.groupby(df.A).rolling(2).mean().sort_index()  # doctest: +NORMALIZE_WHITESPACE
              A     B
    A
    2 0     NaN   NaN
      1     2.0   4.0
    3 2     NaN   NaN
      3     3.0   9.0
      4     3.0   9.0
    4 5     NaN   NaN
      6     4.0  16.0
      7     4.0  16.0
      8     4.0  16.0
    5 9     NaN   NaN
      10    5.0  25.0
    """
    return super(RollingGroupby, self).mean()
Example #15
Source File: plot.py From koalas with Apache License 2.0

def _compute_plot_data(self):
    colname = self.data.name
    data = self.data

    # Updates all props with the rc defaults from matplotlib
    self.kwds.update(KoalasBoxPlot.rc_defaults(**self.kwds))

    # Gets some important kwds
    showfliers = self.kwds.get("showfliers", False)
    whis = self.kwds.get("whis", 1.5)
    labels = self.kwds.get("labels", [colname])

    # This one is Koalas specific to control precision for approx_percentile
    precision = self.kwds.get("precision", 0.01)

    # Computes mean, median, Q1 and Q3 with approx_percentile and precision
    col_stats, col_fences = KoalasBoxPlot._compute_stats(data, colname, whis, precision)

    # Creates a column to flag rows as outliers or not
    outliers = KoalasBoxPlot._outliers(data, colname, *col_fences)

    # Computes min and max values of non-outliers - the whiskers
    whiskers = KoalasBoxPlot._calc_whiskers(colname, outliers)

    if showfliers:
        fliers = KoalasBoxPlot._get_fliers(colname, outliers)
    else:
        fliers = []

    # Builds bxpstats dict
    stats = []
    item = {
        "mean": col_stats["mean"],
        "med": col_stats["med"],
        "q1": col_stats["q1"],
        "q3": col_stats["q3"],
        "whislo": whiskers[0],
        "whishi": whiskers[1],
        "fliers": fliers,
        "label": labels[0],
    }
    stats.append(item)

    self.data = {labels[0]: stats}
Example #16
Source File: series.py From koalas with Apache License 2.0

def _rank(self, method="average", ascending=True, part_cols=()):
    if method not in ["average", "min", "max", "first", "dense"]:
        msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
        raise ValueError(msg)

    if len(self._internal.index_spark_column_names) > 1:
        raise ValueError("rank do not support index now")

    if ascending:
        asc_func = lambda scol: scol.asc()
    else:
        asc_func = lambda scol: scol.desc()

    if method == "first":
        window = (
            Window.orderBy(
                asc_func(self.spark.column), asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)),
            )
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        scol = F.row_number().over(window)
    elif method == "dense":
        window = (
            Window.orderBy(asc_func(self.spark.column))
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        scol = F.dense_rank().over(window)
    else:
        if method == "average":
            stat_func = F.mean
        elif method == "min":
            stat_func = F.min
        elif method == "max":
            stat_func = F.max
        window1 = (
            Window.orderBy(asc_func(self.spark.column))
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        window2 = Window.partitionBy([self.spark.column] + list(part_cols)).rowsBetween(
            Window.unboundedPreceding, Window.unboundedFollowing
        )
        scol = stat_func(F.row_number().over(window1)).over(window2)
    kser = self._with_new_scol(scol).rename(self.name)
    return kser.astype(np.float64)
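For the tie-handling methods, window1 assigns each row its ordinal position within the ordering, and window2 then applies stat_func across all rows sharing the same value: F.mean of those positions gives pandas' 'average' rank, while F.min and F.max give the 'min' and 'max' variants.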
Example #17
Source File: metrics.py From search-MjoLniR with MIT License

def ndcg(df, k, label_col='label', position_col='hit_position', wiki_col='wikiid',
         query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    wiki_col : str
        Column name containing the wiki identifier used to group the final scores
    query_cols : list of str
        Column names to group by, which indicate a unique query displayed to a user

    Returns
    -------
    dict
        Mapping from wiki id to its mean ndcg@k value, always between 0 and 1
    """
    if wiki_col not in query_cols:
        query_cols = query_cols + [wiki_col]

    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (
        df
        .select(label_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))

    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (
        df
        .select(label_col, position_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('predictedTopAtK')))

    return {row[wiki_col]: row.ndcgAtK for row in topAtK
            .join(predictedTopAtK, query_cols, how='inner')
            .select(wiki_col, _ndcg_at(k, label_col)('predictedTopAtK', 'topAtK').alias('ndcgAtK'))
            .groupBy(wiki_col)
            .agg(F.mean('ndcgAtK').alias('ndcgAtK'))
            .collect()}
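The function computes ndcg@k per query and then averages with F.mean within each wiki, yielding one score per wiki. A hypothetical call, assuming a DataFrame of search hits with the default column names:

per_wiki_scores = ndcg(hits_df, k=10)
# e.g. {'enwiki': 0.83, 'dewiki': 0.79}  (values illustrative)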