Python pyspark.sql.functions.countDistinct() Examples
The following are 6 code examples of pyspark.sql.functions.countDistinct(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.functions, or try the search function.
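countDistinct(col, *cols) builds an aggregate expression that counts the number of distinct non-null values in a column, or distinct combinations across several columns. A minimal sketch of typical usage follows; the SparkSession setup, column names, and sample data are illustrative assumptions, not taken from the examples below.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# Illustrative data: two keys, a repeated value, and a null.
df = spark.createDataFrame(
    [("a", 1), ("a", 1), ("a", None), ("b", 2)],
    ["key", "value"],
)

# Distinct non-null values of "value" over the whole frame.
df.select(F.countDistinct("value").alias("n_distinct")).show()

# Distinct values per group; countDistinct ignores nulls.
df.groupBy("key").agg(F.countDistinct("value").alias("n_distinct")).show()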
Example #1
Source File: indexes.py From koalas with Apache License 2.0 | 6 votes |
def has_duplicates(self) -> bool:
    """
    If index has duplicates, return True, otherwise False.

    Examples
    --------
    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=list('aac'))
    >>> kdf.index.has_duplicates
    True

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')])
    >>> kdf.index.has_duplicates
    False

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('aac'), list('eef')])
    >>> kdf.index.has_duplicates
    True
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    scol = scol_for(sdf, sdf.columns[0])

    return sdf.select(F.count(scol) != F.countDistinct(scol)).first()[0]
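The core trick in Example #1 is comparing count with countDistinct: if the two disagree, some value occurs more than once. A minimal sketch of the same check on a plain DataFrame (the data and column name here are assumptions for illustration):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a",), ("a",), ("c",)], ["idx"])

# True when the column contains at least one duplicated (non-null) value.
has_dupes = df.select(
    (F.count("idx") != F.countDistinct("idx")).alias("has_duplicates")
).first()["has_duplicates"]
print(has_dupes)  # True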
Example #2
Source File: indexes.py From koalas with Apache License 2.0 | 6 votes |
def levshape(self):
    """
    A tuple with the length of each level.

    Examples
    --------
    >>> midx = ks.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
    >>> midx  # doctest: +SKIP
    MultiIndex([('a', 'x'),
                ('b', 'y'),
                ('c', 'z')],
               )

    >>> midx.levshape
    (3, 3)
    """
    result = self._internal.spark_frame.agg(
        *(F.countDistinct(c) for c in self._internal.index_spark_columns)
    ).collect()[0]
    return tuple(result)
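Example #2 gets the distinct count of every index column in a single aggregation by unpacking a generator of countDistinct expressions into agg. A stripped-down sketch of that one-pass pattern over arbitrary columns (the DataFrame below is an assumption for illustration):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", "x"), ("b", "y"), ("c", "z")], ["level0", "level1"]
)

# One aggregation pass producing a distinct count per column.
row = df.agg(*(F.countDistinct(c) for c in df.columns)).collect()[0]
print(tuple(row))  # (3, 3)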
Example #3
Source File: base.py From koalas with Apache License 2.0 | 5 votes |
def _nunique(self, dropna=True, approx=False, rsd=0.05):
    colname = self._internal.data_spark_column_names[0]
    count_fn = partial(F.approx_count_distinct, rsd=rsd) if approx else F.countDistinct
    if dropna:
        return count_fn(self.spark.column).alias(colname)
    else:
        return (
            count_fn(self.spark.column)
            + F.when(
                F.count(F.when(self.spark.column.isNull(), 1).otherwise(None)) >= 1, 1
            ).otherwise(0)
        ).alias(colname)
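Example #3 switches between an exact and an approximate distinct count depending on the approx flag, with rsd as the maximum estimation error tolerated by approx_count_distinct. Roughly the same trade-off in plain PySpark, sketched on assumed data:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.range(0, 100000).withColumn("bucket", F.col("id") % 1000)

# Exact distinct count: precise, but must track every distinct value.
exact = df.select(F.countDistinct("bucket").alias("n")).first()["n"]

# Approximate distinct count (HyperLogLog++): cheaper on high-cardinality data,
# accurate to within the requested relative standard deviation.
approx = df.select(F.approx_count_distinct("bucket", rsd=0.05).alias("n")).first()["n"]

print(exact, approx)  # 1000 and an estimate close to 1000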
Example #4
Source File: series.py From koalas with Apache License 2.0 | 5 votes |
def is_unique(self):
    """
    Return boolean if values in the object are unique

    Returns
    -------
    is_unique : boolean

    >>> ks.Series([1, 2, 3]).is_unique
    True

    >>> ks.Series([1, 2, 2]).is_unique
    False

    >>> ks.Series([1, 2, 3, None]).is_unique
    True
    """
    scol = self.spark.column

    # Here we check:
    #   1. the distinct count without nulls and count without nulls for non-null values
    #   2. count null values and see if null is a distinct value.
    #
    # This workaround is in order to calculate the distinct count including nulls in
    # single pass. Note that COUNT(DISTINCT expr) in Spark is designed to ignore nulls.
    return self._internal.spark_frame.select(
        (F.count(scol) == F.countDistinct(scol))
        & (F.count(F.when(scol.isNull(), 1).otherwise(None)) <= 1)
    ).collect()[0][0]
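The comments in Example #4 rely on the fact that COUNT(DISTINCT expr) in Spark ignores nulls, which is why null rows have to be counted separately. A quick illustration of that behaviour on a throwaway DataFrame (the data is an assumption, for demonstration only):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (2,), (None,)], ["x"])

df.select(
    F.count("x").alias("count_non_null"),              # 2: count() also skips nulls
    F.countDistinct("x").alias("distinct_non_null"),    # 2: null is not counted as a value
    F.count(F.when(F.col("x").isNull(), 1)).alias("null_rows"),  # 1: nulls counted separately
).show()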
Example #5
Source File: groupby.py From koalas with Apache License 2.0 | 4 votes |
def nunique(self, dropna=True):
    """
    Return DataFrame with number of distinct observations per group for each column.

    Parameters
    ----------
    dropna : boolean, default True
        Don't include NaN in the counts.

    Returns
    -------
    nunique : DataFrame

    Examples
    --------
    >>> df = ks.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
    ...                           'ham', 'ham'],
    ...                    'value1': [1, 5, 5, 2, 5, 5],
    ...                    'value2': list('abbaxy')}, columns=['id', 'value1', 'value2'])
    >>> df
         id  value1 value2
    0  spam       1      a
    1   egg       5      b
    2   egg       5      b
    3  spam       2      a
    4   ham       5      x
    5   ham       5      y

    >>> df.groupby('id').nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE
          id  value1  value2
    id
    egg    1       1       1
    ham    1       1       2
    spam   1       2       1

    >>> df.groupby('id')['value1'].nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE
    id
    egg     1
    ham     1
    spam    2
    Name: value1, dtype: int64
    """
    if dropna:
        stat_function = lambda col: F.countDistinct(col)
    else:
        stat_function = lambda col: (
            F.countDistinct(col)
            + F.when(F.count(F.when(col.isNull(), 1).otherwise(None)) >= 1, 1).otherwise(0)
        )
    should_include_groupkeys = isinstance(self, DataFrameGroupBy)
    return self._reduce_for_stat_function(
        stat_function, only_numeric=False, should_include_groupkeys=should_include_groupkeys
    )
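Because countDistinct skips nulls, Example #5 emulates dropna=False by adding 1 whenever a group contains at least one null. The same trick on a plain grouped DataFrame might look like this sketch (data and names are illustrative assumptions):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", None), ("b", 2)], ["grp", "val"])

# nunique with dropna=False: add 1 if the group contains at least one null.
nunique_with_nulls = (
    F.countDistinct("val")
    + F.when(F.count(F.when(F.col("val").isNull(), 1)) >= 1, 1).otherwise(0)
).alias("nunique")

df.groupBy("grp").agg(nunique_with_nulls).show()
# grp "a" -> 2 (the value 1 plus the null), grp "b" -> 1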
Example #6
Source File: bookmark_validation.py From python_mozetl with MIT License | 4 votes |
def transform(spark):
    """Create the bookmark problem and summary tables."""
    query = """
    SELECT
        s.app_build_id,
        s.app_version,
        s.app_display_version,
        s.app_name,
        s.app_channel,
        s.uid,
        s.device_id AS device_id,
        s.submission_date_s3 AS submission_day,
        date_format(from_unixtime(s.when / 1000), 'YYYYMMdd') AS sync_day,
        s.when,
        s.status,
        e.name AS engine_name,
        e.status AS engine_status,
        e.failure_reason AS engine_failure_reason,
        e.validation.problems IS NOT NULL AS engine_has_problems,
        e.validation.version AS engine_validation_version,
        e.validation.checked AS engine_validation_checked,
        e.validation.took AS engine_validation_took,
        p.name AS engine_validation_problem_name,
        p.count AS engine_validation_problem_count
    FROM sync_summary s
    LATERAL VIEW explode(s.engines) AS e
    LATERAL VIEW OUTER explode(e.validation.problems) AS p
    WHERE s.failure_reason IS NULL
    """
    engine_validations = spark.sql(query)

    bookmark_validations = engine_validations.where(
        F.col("engine_name").isin("bookmarks", "bookmarks-buffered")
    )
    bookmark_validation_problems = bookmark_validations.where(
        F.col("engine_has_problems")
    )

    # Generate aggregates over all bookmarks
    bookmark_aggregates = (
        bookmark_validations.where(F.col("engine_validation_checked").isNotNull())
        # see bug 1410963 for submission date vs sync date
        .groupBy("submission_day").agg(
            F.countDistinct("uid", "device_id", "when").alias(
                "total_bookmark_validations"
            ),
            F.countDistinct("uid").alias("total_validated_users"),
            F.sum("engine_validation_checked").alias("total_bookmarks_checked"),
        )
    )

    bookmark_validation_problems.createOrReplaceTempView("bmk_validation_problems")
    bookmark_aggregates.createOrReplaceTempView("bmk_total_per_day")
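Example #6 passes several columns to countDistinct so that distinct (uid, device_id, when) combinations are counted per submission day. A stripped-down sketch of that multi-column pattern, with made-up rows standing in for the sync telemetry:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# Illustrative stand-in for the sync telemetry: one row per validation event.
events = spark.createDataFrame(
    [
        ("20240101", "u1", "d1", 100),
        ("20240101", "u1", "d1", 100),  # duplicate event
        ("20240101", "u1", "d2", 200),
        ("20240102", "u2", "d3", 300),
    ],
    ["submission_day", "uid", "device_id", "when"],
)

daily = events.groupBy("submission_day").agg(
    # Distinct (uid, device_id, when) triples, i.e. de-duplicated validations.
    F.countDistinct("uid", "device_id", "when").alias("total_validations"),
    # Distinct users per day.
    F.countDistinct("uid").alias("total_users"),
)
daily.show()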