Python pyspark.sql.functions.pandas_udf() Examples

The following are 9 code examples of pyspark.sql.functions.pandas_udf(), collected from open source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the module pyspark.sql.functions.
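Before the project examples, here is a minimal, self-contained sketch of the scalar flavour of pandas_udf (the function and column names are illustrative, not taken from the projects below). A SCALAR pandas UDF receives and returns a pandas.Series, one Arrow batch at a time, rather than being called row by row.

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()

@pandas_udf('double', PandasUDFType.SCALAR)
def plus_one(s):
    # s is a pandas.Series holding one batch of the input column.
    return s + 1.0

spark.range(5).select(plus_one('id').alias('id_plus_one')).show()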
Example #1
Source File: series.py    From koalas with Apache License 2.0
def _cumprod(self, skipna, part_cols=()):
        from pyspark.sql.functions import pandas_udf

        def cumprod(scol):
            @pandas_udf(returnType=self.spark.data_type)
            def negative_check(s):
                assert len(s) == 0 or ((s > 0) | (s.isnull())).all(), (
                    "values should be bigger than 0: %s" % s
                )
                return s

            # Spark has no built-in cumulative product, so it is computed as
            # exp(sum(log(values))); the pandas UDF above only guards against
            # non-positive values.
            return F.sum(F.log(negative_check(scol)))

        kser = self._cum(cumprod, skipna, part_cols)
        return kser._with_new_scol(F.exp(kser.spark.column)).rename(self.name)

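In the snippet above, the pandas UDF only validates that the values are positive; the cumulative product itself is obtained as the exponential of a cumulative sum of logarithms. A minimal standalone sketch of the same trick in plain PySpark (column names are illustrative):

from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 2.0), (2, 3.0), (3, 4.0)], ['idx', 'x'])

w = Window.orderBy('idx').rowsBetween(Window.unboundedPreceding, Window.currentRow)
df.withColumn('cumprod', F.exp(F.sum(F.log('x')).over(w))).show()
# cumprod: 2.0, 6.0, 24.0 (up to floating-point rounding)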
Example #2
Source File: accessors.py    From koalas with Apache License 2.0
def _transform_batch(self, func, return_schema):
        from databricks.koalas.series import Series
        from databricks import koalas as ks

        if isinstance(func, np.ufunc):
            f = func
            func = lambda *args, **kwargs: f(*args, **kwargs)

        if return_schema is None:
            # TODO: In this case, it avoids the shortcut for now (but only infers schema)
            #  because it returns a series from a different DataFrame and it has a different
            #  anchor. We should fix this to allow the shortcut or only allow to infer
            #  schema.
            limit = ks.get_option("compute.shortcut_limit")
            pser = self._kser.head(limit)._to_internal_pandas()
            transformed = pser.transform(func)
            kser = Series(transformed)
            spark_return_type = kser.spark.data_type
        else:
            spark_return_type = return_schema

        pudf = pandas_udf(func, returnType=spark_return_type, functionType=PandasUDFType.SCALAR)
        return self._kser._with_new_scol(scol=pudf(self._kser.spark.column)).rename(self._kser.name) 
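The non-decorator form used above is the general pattern: an arbitrary Series-to-Series pandas function is wrapped into a SCALAR pandas UDF and applied to a column batch by batch. A rough standalone sketch (names and return type are illustrative):

from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()

func = lambda s: s.astype('float64') * 2   # any pandas Series -> Series function
pudf = pandas_udf(func, returnType='double', functionType=PandasUDFType.SCALAR)

spark.range(10).select(pudf('id').alias('doubled')).show()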
Example #3
Source File: udf.py    From ibis with Apache License 2.0
def create_udf_node(self, udf_func):
        """Create a new UDF node type and adds a corresponding compile rule.

        Parameters
        ----------
        udf_func : function
            Should be the result of calling pyspark.sql.functions.udf or
            pyspark.sql.functions.pandas_udf on the user-specified func

        Returns
        -------
        result : type
            A new SparkUDFNode or SparkUDAFNode subclass
        """
        name = udf_func.__name__
        definition = next(_udf_name_cache[name])
        external_name = '{}_{:d}'.format(name, definition)

        UDFNode = type(
            external_name,
            (self.base_class,),
            {
                'signature': sig.TypeSignature.from_dtypes(self.input_type),
                'return_type': self.output_type,
            },
        )

        # Add udf_func as a property. If added to the class namespace dict, it
        # would be incorrectly used as a bound method, i.e.
        # udf_func(t.column) would be a call to bound method func with t.column
        # interpreted as self.
        UDFNode.udf_func = property(lambda self, udf_func=udf_func: udf_func)

        @compiles(UDFNode)
        def compiles_udf_node(t, expr):
            return '{}({})'.format(
                UDFNode.__name__, ', '.join(map(t.translate, expr.op().args))
            )

        return UDFNode 
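The comment about bound methods is the subtle part of this example. A tiny illustration of the pitfall the property avoids (the class names here are hypothetical, not part of ibis):

identity = lambda col: col

class Direct:
    # A plain function in the class dict becomes a bound method on instance
    # access, so Direct().udf_func(x) passes the instance as `col` and then
    # fails with a TypeError for the extra argument.
    udf_func = identity

class ViaProperty:
    # Wrapping it in a property returns the original function untouched, so
    # ViaProperty().udf_func(x) == x.
    udf_func = property(lambda self, f=identity: f)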
Example #4
Source File: udf.py    From ibis with Apache License 2.0
def pyspark_udf(self, func):
        return f.pandas_udf(func, self.spark_output_type, self.pandas_udf_type) 
Example #5
Source File: compiler.py    From ibis with Apache License 2.0
def compile_strftime(t, expr, scope, **kwargs):
    op = expr.op()
    format_str = op.format_str.op().value

    @pandas_udf('string', PandasUDFType.SCALAR)
    def strftime(timestamps):
        return timestamps.dt.strftime(format_str)

    src_column = t.translate(op.arg, scope)
    return strftime(src_column) 
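Outside of the ibis compiler, the same kind of UDF can be applied directly to a PySpark column. A small standalone sketch (the data and format string are illustrative):

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('2024-01-15 10:30:00',)], ['ts_str'])
df = df.withColumn('ts', F.to_timestamp('ts_str'))

@pandas_udf('string', PandasUDFType.SCALAR)
def strftime(timestamps):
    # The timestamp column arrives as a pandas Series of datetime64 values.
    return timestamps.dt.strftime('%Y/%m/%d')

df.select(strftime('ts').alias('formatted')).show()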
Example #6
Source File: compiler.py    From ibis with Apache License 2.0
def compile_day_of_week_index(t, expr, scope, **kwargs):
    op = expr.op()

    @pandas_udf('short', PandasUDFType.SCALAR)
    def day_of_week(s):
        return s.dt.dayofweek

    src_column = t.translate(op.arg, scope)
    return day_of_week(src_column.cast('timestamp')) 
Example #7
Source File: compiler.py    From ibis with Apache License 2.0
def compiles_day_of_week_name(t, expr, scope, **kwargs):
    op = expr.op()

    @pandas_udf('string', PandasUDFType.SCALAR)
    def day_name(s):
        return s.dt.day_name()

    src_column = t.translate(op.arg, scope)
    return day_name(src_column.cast('timestamp')) 
Example #8
Source File: test_spark_dataset_converter.py    From petastorm with Apache License 2.0
def test_array_field(spark_test_ctx):
    @pandas_udf('array<float>')
    def gen_array(v):
        return v.map(lambda x: np.random.rand(10))
    df1 = spark_test_ctx.spark.range(10).withColumn('v', gen_array('id')).repartition(2)
    cv1 = make_spark_converter(df1)
    # we can auto infer one-dim array shape
    with cv1.make_tf_dataset(batch_size=4, num_epochs=1) as dataset:
        tf_iter = dataset.make_one_shot_iterator()
        next_op = tf_iter.get_next()
        with tf.Session() as sess:
            batch1 = sess.run(next_op)
        assert batch1.v.shape == (4, 10) 
Example #9
Source File: groupby.py    From koalas with Apache License 2.0
def _spark_group_map_apply(kdf, func, groupkeys_scols, return_schema, retain_index):
        output_func = GroupBy._make_pandas_df_builder_func(kdf, func, return_schema, retain_index)
        grouped_map_func = pandas_udf(return_schema, PandasUDFType.GROUPED_MAP)(output_func)
        sdf = kdf._internal.spark_frame.drop(*HIDDEN_COLUMNS)
        return sdf.groupby(*groupkeys_scols).apply(grouped_map_func)
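For comparison, a minimal standalone GROUPED_MAP sketch in plain PySpark (schema and names are illustrative): the wrapped function receives each group as a pandas DataFrame and must return a pandas DataFrame matching the declared schema.

from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0)], ['id', 'v'])

@pandas_udf('id long, v double', PandasUDFType.GROUPED_MAP)
def demean(pdf):
    # pdf contains all rows of one group as a pandas DataFrame.
    return pdf.assign(v=pdf.v - pdf.v.mean())

df.groupby('id').apply(demean).show()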