Python pyspark.sql.functions.pandas_udf() Examples
The following are 9 code examples of pyspark.sql.functions.pandas_udf(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.functions, or try the search function.
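For context, pandas_udf() wraps a function that operates on pandas Series so that Spark can execute it in vectorized batches via Apache Arrow. A minimal sketch of a scalar pandas UDF (using the Spark 2.x decorator style with PandasUDFType, as in the examples below, and a hypothetical column name):

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()

# A scalar pandas UDF receives a pandas Series per batch and must
# return a Series of the same length.
@pandas_udf('double', PandasUDFType.SCALAR)
def plus_one(v):
    return v + 1

df = spark.createDataFrame(pd.DataFrame({'x': [1.0, 2.0, 3.0]}))
df.select(plus_one(df['x'])).show()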
Example #1
Source File: series.py From koalas with Apache License 2.0 | 7 votes |
def _cumprod(self, skipna, part_cols=()):
    from pyspark.sql.functions import pandas_udf

    def cumprod(scol):
        @pandas_udf(returnType=self.spark.data_type)
        def negative_check(s):
            assert len(s) == 0 or ((s > 0) | (s.isnull())).all(), (
                "values should be bigger than 0: %s" % s
            )
            return s

        return F.sum(F.log(negative_check(scol)))

    kser = self._cum(cumprod, skipna, part_cols)
    return kser._with_new_scol(F.exp(kser.spark.column)).rename(self.name)

# ----------------------------------------------------------------------
# Accessor Methods
# ----------------------------------------------------------------------
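This helper relies on the identity prod(x) = exp(sum(log(x))), which only holds for positive (or null) values, hence the negative_check pandas UDF. A quick standalone illustration of that identity with pandas (not part of the koalas source):

import numpy as np
import pandas as pd

s = pd.Series([1.5, 2.0, 4.0])

# Cumulative product computed directly ...
direct = s.cumprod()
# ... and via the exp/log trick used above, which Spark can express
# as F.exp of a running F.sum(F.log(...)).
via_logs = np.exp(np.log(s).cumsum())

assert np.allclose(direct, via_logs)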
Example #2
Source File: accessors.py From koalas with Apache License 2.0 | 6 votes |
def _transform_batch(self, func, return_schema):
    from databricks.koalas.series import Series
    from databricks import koalas as ks

    if isinstance(func, np.ufunc):
        f = func
        func = lambda *args, **kwargs: f(*args, **kwargs)

    if return_schema is None:
        # TODO: In this case, it avoids the shortcut for now (but only infers schema)
        #  because it returns a series from a different DataFrame and it has a different
        #  anchor. We should fix this to allow the shortcut or only allow to infer
        #  schema.
        limit = ks.get_option("compute.shortcut_limit")
        pser = self._kser.head(limit)._to_internal_pandas()
        transformed = pser.transform(func)
        kser = Series(transformed)
        spark_return_type = kser.spark.data_type
    else:
        spark_return_type = return_schema

    pudf = pandas_udf(
        func, returnType=spark_return_type, functionType=PandasUDFType.SCALAR
    )
    return self._kser._with_new_scol(scol=pudf(self._kser.spark.column)).rename(
        self._kser.name
    )
Example #3
Source File: udf.py From ibis with Apache License 2.0 | 5 votes |
def create_udf_node(self, udf_func):
    """Create a new UDF node type and add a corresponding compile rule.

    Parameters
    ----------
    udf_func : function
        Should be the result of calling pyspark.sql.functions.udf or
        pyspark.sql.functions.pandas_udf on the user-specified func

    Returns
    -------
    result : type
        A new SparkUDFNode or SparkUDAFNode subclass
    """
    name = udf_func.__name__
    definition = next(_udf_name_cache[name])
    external_name = '{}_{:d}'.format(name, definition)

    UDFNode = type(
        external_name,
        (self.base_class,),
        {
            'signature': sig.TypeSignature.from_dtypes(self.input_type),
            'return_type': self.output_type,
        },
    )

    # Add udf_func as a property. If added to the class namespace dict, it
    # would be incorrectly used as a bound method, i.e.
    # udf_func(t.column) would be a call to bound method func with t.column
    # interpreted as self.
    UDFNode.udf_func = property(lambda self, udf_func=udf_func: udf_func)

    @compiles(UDFNode)
    def compiles_udf_node(t, expr):
        return '{}({})'.format(
            UDFNode.__name__, ', '.join(map(t.translate, expr.op().args))
        )

    return UDFNode
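The Python detail worth noting here is building the node class at runtime with the three-argument form of type() and exposing udf_func through a property so it is not rebound as an instance method. A stripped-down illustration of that pattern, independent of ibis and using hypothetical names:

def make_node(udf_func, base=object):
    # type(name, bases, namespace) creates a class dynamically.
    Node = type(udf_func.__name__ + '_node', (base,), {})

    # Stored directly in the namespace dict, a plain function would become a
    # bound method (the instance would be passed as its first argument); a
    # property returning the function avoids that.
    Node.udf_func = property(lambda self, udf_func=udf_func: udf_func)
    return Node

def double(x):
    return x * 2

node = make_node(double)()
assert node.udf_func(3) == 6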
Example #4
Source File: udf.py From ibis with Apache License 2.0 | 5 votes |
def pyspark_udf(self, func):
    return f.pandas_udf(func, self.spark_output_type, self.pandas_udf_type)
Example #5
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compile_strftime(t, expr, scope, **kwargs):
    op = expr.op()
    format_str = op.format_str.op().value

    @pandas_udf('string', PandasUDFType.SCALAR)
    def strftime(timestamps):
        return timestamps.dt.strftime(format_str)

    src_column = t.translate(op.arg, scope)
    return strftime(src_column)
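The same pattern works outside ibis: a scalar pandas UDF lets you apply pandas' strftime formatting (Python-style format codes) to a Spark timestamp column, whereas Spark's built-in date_format expects Java-style patterns. A small sketch with hypothetical data:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    pd.DataFrame({'ts': pd.to_datetime(['2020-01-01 12:30:00', '2020-06-15 08:00:00'])})
)

@pandas_udf('string', PandasUDFType.SCALAR)
def fmt(ts):
    # ts is a pandas Series of timestamps for the current batch
    return ts.dt.strftime('%Y-%m-%d %H:%M')

df.select(fmt(df['ts'])).show(truncate=False)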
Example #6
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compile_day_of_week_index(t, expr, scope, **kwargs):
    op = expr.op()

    @pandas_udf('short', PandasUDFType.SCALAR)
    def day_of_week(s):
        return s.dt.dayofweek

    src_column = t.translate(op.arg, scope)
    return day_of_week(src_column.cast('timestamp'))
Example #7
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compiles_day_of_week_name(t, expr, scope, **kwargs):
    op = expr.op()

    @pandas_udf('string', PandasUDFType.SCALAR)
    def day_name(s):
        return s.dt.day_name()

    src_column = t.translate(op.arg, scope)
    return day_name(src_column.cast('timestamp'))
Example #8
Source File: test_spark_dataset_converter.py From petastorm with Apache License 2.0 | 5 votes |
def test_array_field(spark_test_ctx):
    @pandas_udf('array<float>')
    def gen_array(v):
        return v.map(lambda x: np.random.rand(10))

    df1 = spark_test_ctx.spark.range(10).withColumn('v', gen_array('id')).repartition(2)
    cv1 = make_spark_converter(df1)
    # we can auto infer one-dim array shape
    with cv1.make_tf_dataset(batch_size=4, num_epochs=1) as dataset:
        tf_iter = dataset.make_one_shot_iterator()
        next_op = tf_iter.get_next()
        with tf.Session() as sess:
            batch1 = sess.run(next_op)
        assert batch1.v.shape == (4, 10)
Example #9
Source File: groupby.py From koalas with Apache License 2.0 | 5 votes |
def _spark_group_map_apply(kdf, func, groupkeys_scols, return_schema, retain_index):
    output_func = GroupBy._make_pandas_df_builder_func(kdf, func, return_schema, retain_index)
    grouped_map_func = pandas_udf(return_schema, PandasUDFType.GROUPED_MAP)(output_func)
    sdf = kdf._internal.spark_frame.drop(*HIDDEN_COLUMNS)
    return sdf.groupby(*groupkeys_scols).apply(grouped_map_func)
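The GROUPED_MAP variant used here passes each group to the wrapped function as a whole pandas DataFrame and expects a DataFrame back that matches return_schema. A minimal self-contained sketch of the same pattern in plain PySpark, with hypothetical column names:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    pd.DataFrame({'id': [1, 1, 2, 2], 'v': [1.0, 2.0, 3.0, 4.0]})
)

# Each group arrives as a pandas DataFrame; the returned DataFrame must
# conform to the declared schema.
@pandas_udf('id long, v double', PandasUDFType.GROUPED_MAP)
def subtract_mean(pdf):
    return pdf.assign(v=pdf.v - pdf.v.mean())

df.groupby('id').apply(subtract_mean).show()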