Python pyspark.sql.functions.first() Examples
The following are 22 code examples of pyspark.sql.functions.first(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.functions, or try the search function.
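Before working through the examples, the snippet below is a minimal, self-contained sketch of how pyspark.sql.functions.first() behaves as an aggregate: it returns the first value in each group (order-dependent unless the data has a stable order), and with ignorenulls=True it skips nulls. The DataFrame, column names, and SparkSession setup are illustrative only and are not taken from any of the projects listed here.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").appName("first_demo").getOrCreate()

# Toy data: group "a" has a null before its first real value.
df = spark.createDataFrame(
    [("a", None), ("a", 1), ("b", 2), ("b", 3)],
    ["key", "value"],
)

df.groupBy("key").agg(
    F.first("value").alias("first_value"),                       # may be null for group "a"
    F.first("value", ignorenulls=True).alias("first_non_null"),  # skips nulls
).show()

Because first() without an ordering depends on row order after shuffles, most of the examples below either apply it to columns whose values are identical within each group (a deduplication pattern, as in Examples #3, #6 and #22) or use it over an explicitly ordered window (Examples #5 and #19).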
Example #1
Source File: indexes.py From koalas with Apache License 2.0 | 6 votes |
def _summary(self, name=None):
    """
    Return a summarized representation.

    Parameters
    ----------
    name : str
        name to use in the summary representation

    Returns
    -------
    String with a summarized representation of the index
    """
    head, tail, total_count = self._internal.spark_frame.select(
        F.first(self.spark.column), F.last(self.spark.column), F.count(F.expr("*"))
    ).first()

    if total_count > 0:
        index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail))
    else:
        index_summary = ""

    if name is None:
        name = type(self).__name__
    return "%s: %s entries%s" % (name, total_count, index_summary)
Example #2
Source File: indexes.py From koalas with Apache License 2.0 | 6 votes |
def has_duplicates(self) -> bool:
    """
    If index has duplicates, return True, otherwise False.

    Examples
    --------
    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=list('aac'))
    >>> kdf.index.has_duplicates
    True

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')])
    >>> kdf.index.has_duplicates
    False

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('aac'), list('eef')])
    >>> kdf.index.has_duplicates
    True
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    scol = scol_for(sdf, sdf.columns[0])

    return sdf.select(F.count(scol) != F.countDistinct(scol)).first()[0]
Example #3
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License | 6 votes |
def _join_results(self, scaffolds_df):
    def _read_rows(row):
        idx, _, dec = row.split("\t")
        return ps.Row(id=idx, decoration_smi=dec)

    sampled_df = SPARK.createDataFrame(
        SC.textFile(self._tmp_path("sampled_decorations"), self.num_partitions)
        .map(_read_rows))

    if self.decorator_type == "single":
        processed_df = self._join_results_single(scaffolds_df, sampled_df)
    elif self.decorator_type == "multi":
        processed_df = self._join_results_multi(scaffolds_df, sampled_df)
    else:
        raise ValueError("decorator_type has an invalid value '{}'".format(self.decorator_type))

    return processed_df\
        .where("smiles IS NOT NULL")\
        .groupBy("smiles")\
        .agg(
            psf.first("scaffold").alias("scaffold"),
            psf.first("decorations").alias("decorations"),
            psf.count("smiles").alias("count"))
Example #4
Source File: series.py From koalas with Apache License 2.0 | 6 votes |
def __repr__(self):
    max_display_count = get_option("display.max_rows")
    if max_display_count is None:
        return self._to_internal_pandas().to_string(name=self.name, dtype=self.dtype)

    pser = self._kdf._get_or_create_repr_pandas_cache(max_display_count)[self.name]
    pser_length = len(pser)
    pser = pser.iloc[:max_display_count]
    if pser_length > max_display_count:
        repr_string = pser.to_string(length=True)
        rest, prev_footer = repr_string.rsplit("\n", 1)
        match = REPR_PATTERN.search(prev_footer)
        if match is not None:
            length = match.group("length")
            name = str(self.dtype.name)
            footer = "\nName: {name}, dtype: {dtype}\nShowing only the first {length}".format(
                length=length, name=self.name, dtype=pprint_thing(name)
            )
            return rest + footer
    return pser.to_string(name=self.name, dtype=self.dtype)
Example #5
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compile_first_value(t, expr, scope, *, window, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.first(src_column).over(window)
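Example #5 shows the ibis compiler translating a first_value window expression into F.first(...).over(window). The same pattern can be used directly on a DataFrame; the sketch below is illustrative only (the DataFrame, column names, and window specification are assumptions, not part of the ibis example):

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").appName("first_over_window").getOrCreate()

# Toy event data: two keys, ordered by a timestamp-like column.
df = spark.createDataFrame(
    [("a", 1, 10), ("a", 2, 20), ("b", 1, 30), ("b", 2, 40)],
    ["key", "ts", "value"],
)

# Equivalent to SQL: FIRST_VALUE(value) OVER (PARTITION BY key ORDER BY ts)
w = Window.partitionBy("key").orderBy("ts")
df.withColumn("first_value", F.first("value").over(w)).show()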
Example #6
Source File: slice_db.py From reinvent-scaffold-decorator with MIT License | 5 votes |
def run(self):
    def _enumerate(row, max_cuts=self.max_cuts, enumerator=self.enumerator):
        fields = row.split("\t")
        smiles = fields[0]
        mol = uc.to_mol(smiles)
        out_rows = []
        if mol:
            for cuts in range(1, max_cuts + 1):
                for sliced_mol in enumerator.enumerate(mol, cuts=cuts):
                    # normalize scaffold and decorations
                    scaff_smi, dec_smis = sliced_mol.to_smiles()
                    dec_smis = [smi for num, smi in sorted(dec_smis.items())]
                    out_rows.append(ps.Row(
                        scaffold=scaff_smi,
                        decorations=dec_smis,
                        smiles=uc.to_smiles(mol),
                        cuts=cuts
                    ))
        return out_rows

    enumeration_df = SPARK.createDataFrame(
        SC.textFile(self.input_path)
        .repartition(self.partitions)
        .flatMap(_enumerate))\
        .groupBy("scaffold", "decorations")\
        .agg(psf.first("cuts").alias("cuts"), psf.first("smiles").alias("smiles"))\
        .persist()
    self._log("info", "Obtained %d sliced molecules", enumeration_df.count())

    if self.output_path:
        enumeration_df.write.parquet(self.output_path)

    return enumeration_df
Example #7
Source File: fields.py From python_mozetl with MIT License | 5 votes |
def agg_first(field_name):
    return F.first(field_name, ignorenulls=True).alias(field_name)
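Example #7 wraps F.first(..., ignorenulls=True) in a small helper so that many columns can be aggregated the same way while keeping their original names. A hedged usage sketch follows; the SparkSession, DataFrame, and column names are made up for illustration and do not come from python_mozetl:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

def agg_first(field_name):
    return F.first(field_name, ignorenulls=True).alias(field_name)

spark = SparkSession.builder.master("local[1]").appName("agg_first_demo").getOrCreate()

# Toy per-record data keyed by client_id; take the first non-null value per client.
records = spark.createDataFrame(
    [("c1", None, "DE"), ("c1", "Windows", "DE"), ("c2", "Linux", None)],
    ["client_id", "os", "country"],
)

records.groupBy("client_id").agg(agg_first("os"), agg_first("country")).show()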
Example #8
Source File: groupby.py From sparklingpandas with Apache License 2.0 | 5 votes |
def first(self):
    """
    Pull out the first from each group. Note: this is different than
    Spark's first.
    """
    # If its possible to use Spark SQL grouping do it
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.first)
    myargs = self._myargs
    mykwargs = self._mykwargs
    self._prep_pandas_groupby()

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).first()

    def merge_value(x, y):
        return create_combiner(x)

    def merge_combiner(x, y):
        return x

    rddOfFirst = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfFirst, self.sql_ctx)
Example #9
Source File: series.py From koalas with Apache License 2.0 | 5 votes |
def first_series(df):
    """
    Takes a DataFrame and returns the first column of the DataFrame as a Series
    """
    assert isinstance(df, (DataFrame, pd.DataFrame)), type(df)
    if isinstance(df, DataFrame):
        return df._kser_for(df._internal.column_labels[0])
    else:
        return df[df.columns[0]]
Example #10
Source File: series.py From koalas with Apache License 2.0 | 5 votes |
def item(self):
    """
    Return the first element of the underlying data as a Python scalar.

    Returns
    -------
    scalar
        The first element of Series.

    Raises
    ------
    ValueError
        If the data is not length-1.

    Examples
    --------
    >>> kser = ks.Series([10])
    >>> kser.item()
    10
    """
    return self.head(2).to_pandas().item()
Example #11
Source File: series.py From koalas with Apache License 2.0 | 5 votes |
def head(self, n: int = 5) -> "Series":
    """
    Return the first n rows.

    This function returns the first n rows for the object based on position.
    It is useful for quickly testing if your object has the right type of data in it.

    Parameters
    ----------
    n : Integer, default = 5

    Returns
    -------
    The first n rows of the caller object.

    Examples
    --------
    >>> df = ks.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion']})
    >>> df.animal.head(2)  # doctest: +NORMALIZE_WHITESPACE
    0    alligator
    1          bee
    Name: animal, dtype: object
    """
    return first_series(self.to_dataframe().head(n))

# TODO: Categorical type isn't supported (due to PySpark's limitation) and
# some doctests related with timestamps were not added.
Example #12
Source File: indexes.py From koalas with Apache License 2.0 | 5 votes |
def __repr__(self):
    max_display_count = get_option("display.max_rows")
    if max_display_count is None:
        return repr(self.to_pandas())

    pindex = self._kdf._get_or_create_repr_pandas_cache(max_display_count).index

    pindex_length = len(pindex)
    repr_string = repr(pindex[:max_display_count])

    if pindex_length > max_display_count:
        footer = "\nShowing only the first {}".format(max_display_count)
        return repr_string + footer
    return repr_string
Example #13
Source File: indexes.py From koalas with Apache License 2.0 | 5 votes |
def argmin(self):
    """
    Return a minimum argument indexer.

    Parameters
    ----------
    skipna : bool, default True

    Returns
    -------
    minimum argument indexer

    Examples
    --------
    >>> kidx = ks.Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3])
    >>> kidx
    Int64Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3], dtype='int64')

    >>> kidx.argmin()
    7
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
    sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)

    return sdf.orderBy(self.spark.column.asc(), F.col(sequence_col).asc()).first()[0]
Example #14
Source File: indexes.py From koalas with Apache License 2.0 | 5 votes |
def argmax(self):
    """
    Return a maximum argument indexer.

    Parameters
    ----------
    skipna : bool, default True

    Returns
    -------
    maximum argument indexer

    Examples
    --------
    >>> kidx = ks.Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3])
    >>> kidx
    Int64Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3], dtype='int64')

    >>> kidx.argmax()
    4
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
    sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)
    # spark_frame here looks like below
    # +-----------------+---------------+
    # |__index_level_0__|__index_value__|
    # +-----------------+---------------+
    # |                0|             10|
    # |                4|            100|
    # |                2|              8|
    # |                3|              7|
    # |                6|              4|
    # |                5|              5|
    # |                7|              3|
    # |                8|            100|
    # |                1|              9|
    # +-----------------+---------------+

    return sdf.orderBy(self.spark.column.desc(), F.col(sequence_col).asc()).first()[0]
Example #15
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compile_arbitrary(t, expr, scope, context=None, **kwargs):
    how = expr.op().how

    if how == 'first':
        fn = functools.partial(F.first, ignorenulls=True)
    elif how == 'last':
        fn = functools.partial(F.last, ignorenulls=True)
    else:
        raise NotImplementedError("Does not support 'how': {}".format(how))

    return compile_aggregator(t, expr, scope, fn, context)
Example #16
Source File: series.py From koalas with Apache License 2.0 | 4 votes |
def nsmallest(self, n: int = 5) -> "Series":
    """
    Return the smallest `n` elements.

    Parameters
    ----------
    n : int, default 5
        Return this many ascending sorted values.

    Returns
    -------
    Series
        The `n` smallest values in the Series, sorted in increasing order.

    See Also
    --------
    Series.nlargest: Get the `n` largest elements.
    Series.sort_values: Sort Series by values.
    Series.head: Return the first `n` rows.

    Notes
    -----
    Faster than ``.sort_values().head(n)`` for small `n` relative to
    the size of the ``Series`` object.
    In Koalas, thanks to Spark's lazy execution and query optimizer,
    the two would have same performance.

    Examples
    --------
    >>> data = [1, 2, 3, 4, np.nan ,6, 7, 8]
    >>> s = ks.Series(data)
    >>> s
    0    1.0
    1    2.0
    2    3.0
    3    4.0
    4    NaN
    5    6.0
    6    7.0
    7    8.0
    Name: 0, dtype: float64

    The `n` largest elements where ``n=5`` by default.

    >>> s.nsmallest()
    0    1.0
    1    2.0
    2    3.0
    3    4.0
    5    6.0
    Name: 0, dtype: float64

    >>> s.nsmallest(3)
    0    1.0
    1    2.0
    2    3.0
    Name: 0, dtype: float64
    """
    return first_series(self.to_frame().nsmallest(n=n, columns=self.name))
Example #17
Source File: series.py From koalas with Apache License 2.0 | 4 votes |
def _rank(self, method="average", ascending=True, part_cols=()):
    if method not in ["average", "min", "max", "first", "dense"]:
        msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
        raise ValueError(msg)

    if len(self._internal.index_spark_column_names) > 1:
        raise ValueError("rank do not support index now")

    if ascending:
        asc_func = lambda scol: scol.asc()
    else:
        asc_func = lambda scol: scol.desc()

    if method == "first":
        window = (
            Window.orderBy(
                asc_func(self.spark.column), asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)),
            )
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        scol = F.row_number().over(window)
    elif method == "dense":
        window = (
            Window.orderBy(asc_func(self.spark.column))
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        scol = F.dense_rank().over(window)
    else:
        if method == "average":
            stat_func = F.mean
        elif method == "min":
            stat_func = F.min
        elif method == "max":
            stat_func = F.max
        window1 = (
            Window.orderBy(asc_func(self.spark.column))
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        window2 = Window.partitionBy([self.spark.column] + list(part_cols)).rowsBetween(
            Window.unboundedPreceding, Window.unboundedFollowing
        )
        scol = stat_func(F.row_number().over(window1)).over(window2)
    kser = self._with_new_scol(scol).rename(self.name)
    return kser.astype(np.float64)
Example #18
Source File: series.py From koalas with Apache License 2.0 | 4 votes |
def combine_first(self, other):
    """
    Combine Series values, choosing the calling Series's values first.

    Parameters
    ----------
    other : Series
        The value(s) to be combined with the `Series`.

    Returns
    -------
    Series
        The result of combining the Series with the other object.

    See Also
    --------
    Series.combine : Perform elementwise operation on two Series
        using a given function.

    Notes
    -----
    Result index will be the union of the two indexes.

    Examples
    --------
    >>> s1 = ks.Series([1, np.nan])
    >>> s2 = ks.Series([3, 4])
    >>> s1.combine_first(s2)
    0    1.0
    1    4.0
    Name: 0, dtype: float64
    """
    if not isinstance(other, ks.Series):
        raise ValueError("`combine_first` only allows `Series` for parameter `other`")
    if same_anchor(self, other):
        this = self.spark.column
        that = other.spark.column
        combined = self._kdf
    else:
        with option_context("compute.ops_on_diff_frames", True):
            combined = combine_frames(self.to_frame(), other)
        this = combined["this"]._internal.spark_column_for(self._column_label)
        that = combined["that"]._internal.spark_column_for(other._column_label)
    # If `self` has missing value, use value of `other`
    cond = F.when(this.isNull(), that).otherwise(this)
    # If `self` and `other` come from same frame, the anchor should be kept
    if same_anchor(self, other):
        return self._with_new_scol(cond).rename(self.name)
    index_scols = combined._internal.index_spark_columns
    sdf = combined._internal.spark_frame.select(
        *index_scols, cond.alias(self._internal.data_spark_column_names[0])
    ).distinct()
    internal = InternalFrame(
        spark_frame=sdf,
        index_map=self._internal.index_map,
        column_labels=self._internal.column_labels,
        data_spark_columns=[scol_for(sdf, self._internal.data_spark_column_names[0])],
        column_label_names=self._internal.column_label_names,
    )
    return first_series(ks.DataFrame(internal))
Example #19
Source File: series.py From koalas with Apache License 2.0 | 4 votes |
def _fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, part_cols=()):
    axis = validate_axis(axis)
    inplace = validate_bool_kwarg(inplace, "inplace")
    if axis != 0:
        raise NotImplementedError("fillna currently only works for axis=0 or axis='index'")
    if (value is None) and (method is None):
        raise ValueError("Must specify a fillna 'value' or 'method' parameter.")
    if (method is not None) and (method not in ["ffill", "pad", "backfill", "bfill"]):
        raise ValueError("Expecting 'pad', 'ffill', 'backfill' or 'bfill'.")

    scol = self.spark.column
    if isinstance(self.spark.data_type, (FloatType, DoubleType)):
        cond = scol.isNull() | F.isnan(scol)
    else:
        if not self.spark.nullable:
            if inplace:
                return
            else:
                return self
        cond = scol.isNull()

    if value is not None:
        if not isinstance(value, (float, int, str, bool)):
            raise TypeError("Unsupported type %s" % type(value))
        if limit is not None:
            raise ValueError("limit parameter for value is not support now")
        scol = F.when(cond, value).otherwise(scol)
    else:
        if method in ["ffill", "pad"]:
            func = F.last
            end = Window.currentRow - 1
            if limit is not None:
                begin = Window.currentRow - limit
            else:
                begin = Window.unboundedPreceding
        elif method in ["bfill", "backfill"]:
            func = F.first
            begin = Window.currentRow + 1
            if limit is not None:
                end = Window.currentRow + limit
            else:
                end = Window.unboundedFollowing

        window = (
            Window.partitionBy(*part_cols)
            .orderBy(NATURAL_ORDER_COLUMN_NAME)
            .rowsBetween(begin, end)
        )
        scol = F.when(cond, func(scol, True).over(window)).otherwise(scol)

    if inplace:
        self._kdf._update_internal_frame(
            self._kdf._internal.with_new_spark_column(self._column_label, scol)
        )
    else:
        return self._with_new_scol(scol).rename(self.name)
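Example #19 implements backfill ("bfill") by applying F.first(scol, True), i.e. first() with ignorenulls=True, over a window that spans the rows after the current one. Stripped of the Koalas internals, the idea looks roughly like the sketch below; the DataFrame, the ordering column, and the column names are illustrative assumptions, not part of the Koalas code:

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").appName("bfill_demo").getOrCreate()

# Toy series with gaps; `order` stands in for the natural-order column used by Koalas.
df = spark.createDataFrame(
    [(1, None), (2, None), (3, 30.0), (4, None), (5, 50.0)],
    ["order", "value"],
)

# Look only at the rows that come after the current row.
w = Window.orderBy("order").rowsBetween(Window.currentRow + 1, Window.unboundedFollowing)

# Backfill: where `value` is null, take the first non-null value that follows.
df.withColumn(
    "value_bfill",
    F.when(F.col("value").isNull(), F.first("value", ignorenulls=True).over(w))
    .otherwise(F.col("value")),
).show()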
Example #20
Source File: groupby.py From koalas with Apache License 2.0 | 4 votes |
def nlargest(self, n=5):
    """
    Return the first n rows ordered by columns in descending order in group.

    Return the first n rows with the smallest values in columns, in descending order.
    The columns that are not specified are returned as well, but not used for ordering.

    Parameters
    ----------
    n : int
        Number of items to retrieve.

    See Also
    --------
    databricks.koalas.Series.nlargest
    databricks.koalas.DataFrame.nlargest

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

    >>> df.groupby(['a'])['b'].nlargest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
    a
    1  1    2
    2  4    3
    3  7    4
    Name: b, dtype: int64
    """
    if len(self._kdf._internal.index_names) > 1:
        raise ValueError("nlargest do not support multi-index now")

    sdf = self._kdf._internal.spark_frame
    name = self._agg_columns[0]._internal.data_spark_column_names[0]
    window = Window.partitionBy(self._groupkeys_scols).orderBy(
        self._agg_columns[0].spark.column.desc(), NATURAL_ORDER_COLUMN_NAME
    )
    sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)
    internal = InternalFrame(
        spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
        index_map=OrderedDict(
            [
                (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                for s in self._groupkeys
            ]
            + list(self._kdf._internal.index_map.items())
        ),
        data_spark_columns=[scol_for(sdf, name)],
    )
    return first_series(DataFrame(internal))

# TODO: add bins, normalize parameter
Example #21
Source File: groupby.py From koalas with Apache License 2.0 | 4 votes |
def nsmallest(self, n=5):
    """
    Return the first n rows ordered by columns in ascending order in group.

    Return the first n rows with the smallest values in columns, in ascending order.
    The columns that are not specified are returned as well, but not used for ordering.

    Parameters
    ----------
    n : int
        Number of items to retrieve.

    See Also
    --------
    databricks.koalas.Series.nsmallest
    databricks.koalas.DataFrame.nsmallest

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

    >>> df.groupby(['a'])['b'].nsmallest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
    a
    1  0    1
    2  3    2
    3  6    3
    Name: b, dtype: int64
    """
    if len(self._kdf._internal.index_names) > 1:
        raise ValueError("nsmallest do not support multi-index now")

    sdf = self._kdf._internal.spark_frame
    name = self._agg_columns[0]._internal.data_spark_column_names[0]
    window = Window.partitionBy(self._groupkeys_scols).orderBy(
        self._agg_columns[0].spark.column, NATURAL_ORDER_COLUMN_NAME
    )
    sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)
    internal = InternalFrame(
        spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
        index_map=OrderedDict(
            [
                (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                for s in self._groupkeys
            ]
            + list(self._kdf._internal.index_map.items())
        ),
        data_spark_columns=[scol_for(sdf, name)],
    )
    return first_series(DataFrame(internal))

# TODO: add keep parameter
Example #22
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License | 4 votes |
def run(self, initial_scaffolds):
    randomized_scaffold_udf = psf.udf(self._generate_func, pst.ArrayType(pst.StringType()))
    get_attachment_points_udf = psf.udf(usc.get_attachment_points, pst.ArrayType(pst.IntegerType()))
    remove_attachment_point_numbers_udf = psf.udf(usc.remove_attachment_point_numbers, pst.StringType())

    results_df = self._initialize_results(initial_scaffolds)
    scaffolds_df = results_df.select("smiles", "scaffold", "decorations")
    i = 0
    while scaffolds_df.count() > 0:
        # generate randomized SMILES
        self._log("info", "Starting iteration #%d.", i)
        scaffolds_df = scaffolds_df.withColumn("randomized_scaffold", randomized_scaffold_udf("smiles"))\
            .select(
                "smiles", "scaffold", "decorations",
                psf.explode("randomized_scaffold").alias("randomized_scaffold"))\
            .withColumn("attachment_points", get_attachment_points_udf("randomized_scaffold"))\
            .withColumn("randomized_scaffold", remove_attachment_point_numbers_udf("randomized_scaffold"))\
            .withColumn("id", psf.monotonically_increasing_id())\
            .persist()
        self._log("info", "Generated %d randomized SMILES from %d scaffolds.",
                  scaffolds_df.count(), scaffolds_df.select("smiles").distinct().count())

        # sample each randomized scaffold N times
        scaffolds = scaffolds_df.select("id", "randomized_scaffold")\
            .rdd.map(lambda row: (row["id"], row["randomized_scaffold"])).toLocalIterator()
        self._sample_and_write_scaffolds_to_disk(scaffolds, scaffolds_df.count())
        self._log("info", "Sampled %d scaffolds.", scaffolds_df.count())

        # merge decorated molecules
        joined_df = self._join_results(scaffolds_df).persist()

        if joined_df.count() > 0:
            self._log("info", "Joined %d -> %d (valid) -> %d unique sampled scaffolds",
                      scaffolds_df.count(), joined_df.agg(psf.sum("count")).head()[0], joined_df.count())

        scaffolds_df = joined_df.join(results_df, on="smiles", how="left_anti")\
            .select("smiles", "scaffold", "decorations")\
            .where("smiles LIKE '%*%'")
        self._log("info", "Obtained %d scaffolds for next iteration.", scaffolds_df.count())

        results_df = results_df.union(joined_df)\
            .groupBy("smiles")\
            .agg(
                psf.first("scaffold").alias("scaffold"),
                psf.first("decorations").alias("decorations"),
                psf.sum("count").alias("count"))\
            .persist()
        i += 1

    return results_df