Python pyspark.sql.functions.first() Examples
The following are 22 code examples of pyspark.sql.functions.first(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.functions, or try the search function.
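Before working through the examples, the snippet below is a minimal, self-contained sketch of how pyspark.sql.functions.first() behaves as an aggregate: it returns the first value in each group (order-dependent unless the data has a stable order), and with ignorenulls=True it skips nulls. The DataFrame, column names, and SparkSession setup are illustrative only and are not taken from any of the projects listed here.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").appName("first_demo").getOrCreate()

# Toy data: group "a" has a null before its first real value.
df = spark.createDataFrame(
    [("a", None), ("a", 1), ("b", 2), ("b", 3)],
    ["key", "value"],
)

df.groupBy("key").agg(
    F.first("value").alias("first_value"),                       # may be null for group "a"
    F.first("value", ignorenulls=True).alias("first_non_null"),  # skips nulls
).show()

Because first() without an ordering depends on row order after shuffles, most of the examples below either apply it to columns whose values are identical within each group (a deduplication pattern, as in Examples #3, #6 and #22) or use it over an explicitly ordered window (Examples #5 and #19).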
Example #1
Source File: indexes.py From koalas with Apache License 2.0 | 6 votes |
def _summary(self, name=None):
    """
    Return a summarized representation.

    Parameters
    ----------
    name : str
        name to use in the summary representation

    Returns
    -------
    String with a summarized representation of the index
    """
    head, tail, total_count = self._internal.spark_frame.select(
        F.first(self.spark.column), F.last(self.spark.column), F.count(F.expr("*"))
    ).first()

    if total_count > 0:
        index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail))
    else:
        index_summary = ""

    if name is None:
        name = type(self).__name__
    return "%s: %s entries%s" % (name, total_count, index_summary)
Example #2
Source File: indexes.py From koalas with Apache License 2.0 | 6 votes |
def has_duplicates(self) -> bool:
    """
    If index has duplicates, return True, otherwise False.

    Examples
    --------
    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=list('aac'))
    >>> kdf.index.has_duplicates
    True

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')])
    >>> kdf.index.has_duplicates
    False

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('aac'), list('eef')])
    >>> kdf.index.has_duplicates
    True
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    scol = scol_for(sdf, sdf.columns[0])

    return sdf.select(F.count(scol) != F.countDistinct(scol)).first()[0]
Example #3
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License | 6 votes |
def _join_results(self, scaffolds_df):
    def _read_rows(row):
        idx, _, dec = row.split("\t")
        return ps.Row(id=idx, decoration_smi=dec)

    sampled_df = SPARK.createDataFrame(
        SC.textFile(self._tmp_path("sampled_decorations"), self.num_partitions)
        .map(_read_rows))

    if self.decorator_type == "single":
        processed_df = self._join_results_single(scaffolds_df, sampled_df)
    elif self.decorator_type == "multi":
        processed_df = self._join_results_multi(scaffolds_df, sampled_df)
    else:
        raise ValueError("decorator_type has an invalid value '{}'".format(self.decorator_type))

    return processed_df\
        .where("smiles IS NOT NULL")\
        .groupBy("smiles")\
        .agg(
            psf.first("scaffold").alias("scaffold"),
            psf.first("decorations").alias("decorations"),
            psf.count("smiles").alias("count"))
Example #4
Source File: series.py From koalas with Apache License 2.0 | 6 votes |
def __repr__(self):
    max_display_count = get_option("display.max_rows")
    if max_display_count is None:
        return self._to_internal_pandas().to_string(name=self.name, dtype=self.dtype)

    pser = self._kdf._get_or_create_repr_pandas_cache(max_display_count)[self.name]
    pser_length = len(pser)
    pser = pser.iloc[:max_display_count]
    if pser_length > max_display_count:
        repr_string = pser.to_string(length=True)
        rest, prev_footer = repr_string.rsplit("\n", 1)
        match = REPR_PATTERN.search(prev_footer)
        if match is not None:
            length = match.group("length")
            name = str(self.dtype.name)
            footer = "\nName: {name}, dtype: {dtype}\nShowing only the first {length}".format(
                length=length, name=self.name, dtype=pprint_thing(name)
            )
            return rest + footer
    return pser.to_string(name=self.name, dtype=self.dtype)
Example #5
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compile_first_value(t, expr, scope, *, window, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.first(src_column).over(window)
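Example #5 shows the ibis compiler translating a first_value window expression into F.first(...).over(window). The same pattern can be used directly on a DataFrame; the sketch below is illustrative only (the DataFrame, column names, and window specification are assumptions, not part of the ibis example):

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").appName("first_over_window").getOrCreate()

# Toy event data: two keys, ordered by a timestamp-like column.
df = spark.createDataFrame(
    [("a", 1, 10), ("a", 2, 20), ("b", 1, 30), ("b", 2, 40)],
    ["key", "ts", "value"],
)

# Equivalent to SQL: FIRST_VALUE(value) OVER (PARTITION BY key ORDER BY ts)
w = Window.partitionBy("key").orderBy("ts")
df.withColumn("first_value", F.first("value").over(w)).show()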
Example #6
Source File: slice_db.py From reinvent-scaffold-decorator with MIT License | 5 votes |
def run(self):
    def _enumerate(row, max_cuts=self.max_cuts, enumerator=self.enumerator):
        fields = row.split("\t")
        smiles = fields[0]
        mol = uc.to_mol(smiles)
        out_rows = []
        if mol:
            for cuts in range(1, max_cuts + 1):
                for sliced_mol in enumerator.enumerate(mol, cuts=cuts):
                    # normalize scaffold and decorations
                    scaff_smi, dec_smis = sliced_mol.to_smiles()
                    dec_smis = [smi for num, smi in sorted(dec_smis.items())]
                    out_rows.append(ps.Row(
                        scaffold=scaff_smi,
                        decorations=dec_smis,
                        smiles=uc.to_smiles(mol),
                        cuts=cuts
                    ))
        return out_rows

    enumeration_df = SPARK.createDataFrame(
        SC.textFile(self.input_path)
        .repartition(self.partitions)
        .flatMap(_enumerate))\
        .groupBy("scaffold", "decorations")\
        .agg(psf.first("cuts").alias("cuts"), psf.first("smiles").alias("smiles"))\
        .persist()
    self._log("info", "Obtained %d sliced molecules", enumeration_df.count())

    if self.output_path:
        enumeration_df.write.parquet(self.output_path)

    return enumeration_df
Example #7
Source File: fields.py From python_mozetl with MIT License | 5 votes |
def agg_first(field_name):
    return F.first(field_name, ignorenulls=True).alias(field_name)
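Example #7 wraps F.first(..., ignorenulls=True) in a small helper so that many columns can be aggregated the same way while keeping their original names. A hedged usage sketch follows; the SparkSession, DataFrame, and column names are made up for illustration and do not come from python_mozetl:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

def agg_first(field_name):
    return F.first(field_name, ignorenulls=True).alias(field_name)

spark = SparkSession.builder.master("local[1]").appName("agg_first_demo").getOrCreate()

# Toy per-record data keyed by client_id; take the first non-null value per client.
records = spark.createDataFrame(
    [("c1", None, "DE"), ("c1", "Windows", "DE"), ("c2", "Linux", None)],
    ["client_id", "os", "country"],
)

records.groupBy("client_id").agg(agg_first("os"), agg_first("country")).show()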
Example #8
Source File: groupby.py From sparklingpandas with Apache License 2.0 | 5 votes |
def first(self):
    """
    Pull out the first from each group. Note: this is different than
    Spark's first.
    """
    # If its possible to use Spark SQL grouping do it
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.first)
    myargs = self._myargs
    mykwargs = self._mykwargs
    self._prep_pandas_groupby()

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).first()

    def merge_value(x, y):
        return create_combiner(x)

    def merge_combiner(x, y):
        return x

    rddOfFirst = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfFirst, self.sql_ctx)
Example #9
Source File: series.py From koalas with Apache License 2.0 | 5 votes |
def first_series(df):
    """
    Takes a DataFrame and returns the first column of the DataFrame as a Series
    """
    assert isinstance(df, (DataFrame, pd.DataFrame)), type(df)
    if isinstance(df, DataFrame):
        return df._kser_for(df._internal.column_labels[0])
    else:
        return df[df.columns[0]]
Example #10
Source File: series.py From koalas with Apache License 2.0 | 5 votes |
def item(self):
    """
    Return the first element of the underlying data as a Python scalar.

    Returns
    -------
    scalar
        The first element of Series.

    Raises
    ------
    ValueError
        If the data is not length-1.

    Examples
    --------
    >>> kser = ks.Series([10])
    >>> kser.item()
    10
    """
    return self.head(2).to_pandas().item()
Example #11
Source File: series.py From koalas with Apache License 2.0 | 5 votes |
def head(self, n: int = 5) -> "Series":
    """
    Return the first n rows.

    This function returns the first n rows for the object based on position.
    It is useful for quickly testing if your object has the right type of data in it.

    Parameters
    ----------
    n : Integer, default = 5

    Returns
    -------
    The first n rows of the caller object.

    Examples
    --------
    >>> df = ks.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion']})
    >>> df.animal.head(2)  # doctest: +NORMALIZE_WHITESPACE
    0    alligator
    1          bee
    Name: animal, dtype: object
    """
    return first_series(self.to_dataframe().head(n))

# TODO: Categorical type isn't supported (due to PySpark's limitation) and
# some doctests related with timestamps were not added.
Example #12
Source File: indexes.py From koalas with Apache License 2.0 | 5 votes |
def __repr__(self):
    max_display_count = get_option("display.max_rows")
    if max_display_count is None:
        return repr(self.to_pandas())

    pindex = self._kdf._get_or_create_repr_pandas_cache(max_display_count).index

    pindex_length = len(pindex)
    repr_string = repr(pindex[:max_display_count])

    if pindex_length > max_display_count:
        footer = "\nShowing only the first {}".format(max_display_count)
        return repr_string + footer
    return repr_string
Example #13
Source File: indexes.py From koalas with Apache License 2.0 | 5 votes |
def argmin(self):
    """
    Return a minimum argument indexer.

    Parameters
    ----------
    skipna : bool, default True

    Returns
    -------
    minimum argument indexer

    Examples
    --------
    >>> kidx = ks.Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3])
    >>> kidx
    Int64Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3], dtype='int64')

    >>> kidx.argmin()
    7
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
    sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)

    return sdf.orderBy(self.spark.column.asc(), F.col(sequence_col).asc()).first()[0]
Example #14
Source File: indexes.py From koalas with Apache License 2.0 | 5 votes |
def argmax(self):
    """
    Return a maximum argument indexer.

    Parameters
    ----------
    skipna : bool, default True

    Returns
    -------
    maximum argument indexer

    Examples
    --------
    >>> kidx = ks.Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3])
    >>> kidx
    Int64Index([10, 9, 8, 7, 100, 5, 4, 3, 100, 3], dtype='int64')

    >>> kidx.argmax()
    4
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    sequence_col = verify_temp_column_name(sdf, "__distributed_sequence_column__")
    sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name=sequence_col)
    # spark_frame here looks like below
    # +-----------------+---------------+
    # |__index_level_0__|__index_value__|
    # +-----------------+---------------+
    # |                0|             10|
    # |                4|            100|
    # |                2|              8|
    # |                3|              7|
    # |                6|              4|
    # |                5|              5|
    # |                7|              3|
    # |                8|            100|
    # |                1|              9|
    # +-----------------+---------------+

    return sdf.orderBy(self.spark.column.desc(), F.col(sequence_col).asc()).first()[0]
Example #15
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compile_arbitrary(t, expr, scope, context=None, **kwargs):
    how = expr.op().how

    if how == 'first':
        fn = functools.partial(F.first, ignorenulls=True)
    elif how == 'last':
        fn = functools.partial(F.last, ignorenulls=True)
    else:
        raise NotImplementedError("Does not support 'how': {}".format(how))

    return compile_aggregator(t, expr, scope, fn, context)
Example #16
Source File: series.py From koalas with Apache License 2.0 | 4 votes |
def nsmallest(self, n: int = 5) -> "Series":
    """
    Return the smallest `n` elements.

    Parameters
    ----------
    n : int, default 5
        Return this many ascending sorted values.

    Returns
    -------
    Series
        The `n` smallest values in the Series, sorted in increasing order.

    See Also
    --------
    Series.nlargest: Get the `n` largest elements.
    Series.sort_values: Sort Series by values.
    Series.head: Return the first `n` rows.

    Notes
    -----
    Faster than ``.sort_values().head(n)`` for small `n` relative to
    the size of the ``Series`` object.
    In Koalas, thanks to Spark's lazy execution and query optimizer,
    the two would have same performance.

    Examples
    --------
    >>> data = [1, 2, 3, 4, np.nan ,6, 7, 8]
    >>> s = ks.Series(data)
    >>> s
    0    1.0
    1    2.0
    2    3.0
    3    4.0
    4    NaN
    5    6.0
    6    7.0
    7    8.0
    Name: 0, dtype: float64

    The `n` largest elements where ``n=5`` by default.

    >>> s.nsmallest()
    0    1.0
    1    2.0
    2    3.0
    3    4.0
    5    6.0
    Name: 0, dtype: float64

    >>> s.nsmallest(3)
    0    1.0
    1    2.0
    2    3.0
    Name: 0, dtype: float64
    """
    return first_series(self.to_frame().nsmallest(n=n, columns=self.name))
Example #17
Source File: series.py From koalas with Apache License 2.0 | 4 votes |
def _rank(self, method="average", ascending=True, part_cols=()):
    if method not in ["average", "min", "max", "first", "dense"]:
        msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
        raise ValueError(msg)

    if len(self._internal.index_spark_column_names) > 1:
        raise ValueError("rank do not support index now")

    if ascending:
        asc_func = lambda scol: scol.asc()
    else:
        asc_func = lambda scol: scol.desc()

    if method == "first":
        window = (
            Window.orderBy(
                asc_func(self.spark.column), asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)),
            )
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        scol = F.row_number().over(window)
    elif method == "dense":
        window = (
            Window.orderBy(asc_func(self.spark.column))
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        scol = F.dense_rank().over(window)
    else:
        if method == "average":
            stat_func = F.mean
        elif method == "min":
            stat_func = F.min
        elif method == "max":
            stat_func = F.max
        window1 = (
            Window.orderBy(asc_func(self.spark.column))
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        window2 = Window.partitionBy([self.spark.column] + list(part_cols)).rowsBetween(
            Window.unboundedPreceding, Window.unboundedFollowing
        )
        scol = stat_func(F.row_number().over(window1)).over(window2)
    kser = self._with_new_scol(scol).rename(self.name)
    return kser.astype(np.float64)
Example #18
Source File: series.py From koalas with Apache License 2.0 | 4 votes |
def combine_first(self, other):
    """
    Combine Series values, choosing the calling Series's values first.

    Parameters
    ----------
    other : Series
        The value(s) to be combined with the `Series`.

    Returns
    -------
    Series
        The result of combining the Series with the other object.

    See Also
    --------
    Series.combine : Perform elementwise operation on two Series
        using a given function.

    Notes
    -----
    Result index will be the union of the two indexes.

    Examples
    --------
    >>> s1 = ks.Series([1, np.nan])
    >>> s2 = ks.Series([3, 4])
    >>> s1.combine_first(s2)
    0    1.0
    1    4.0
    Name: 0, dtype: float64
    """
    if not isinstance(other, ks.Series):
        raise ValueError("`combine_first` only allows `Series` for parameter `other`")
    if same_anchor(self, other):
        this = self.spark.column
        that = other.spark.column
        combined = self._kdf
    else:
        with option_context("compute.ops_on_diff_frames", True):
            combined = combine_frames(self.to_frame(), other)
        this = combined["this"]._internal.spark_column_for(self._column_label)
        that = combined["that"]._internal.spark_column_for(other._column_label)
    # If `self` has missing value, use value of `other`
    cond = F.when(this.isNull(), that).otherwise(this)
    # If `self` and `other` come from same frame, the anchor should be kept
    if same_anchor(self, other):
        return self._with_new_scol(cond).rename(self.name)
    index_scols = combined._internal.index_spark_columns
    sdf = combined._internal.spark_frame.select(
        *index_scols, cond.alias(self._internal.data_spark_column_names[0])
    ).distinct()
    internal = InternalFrame(
        spark_frame=sdf,
        index_map=self._internal.index_map,
        column_labels=self._internal.column_labels,
        data_spark_columns=[scol_for(sdf, self._internal.data_spark_column_names[0])],
        column_label_names=self._internal.column_label_names,
    )
    return first_series(ks.DataFrame(internal))
Example #19
Source File: series.py From koalas with Apache License 2.0 | 4 votes |
def _fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, part_cols=()):
    axis = validate_axis(axis)
    inplace = validate_bool_kwarg(inplace, "inplace")
    if axis != 0:
        raise NotImplementedError("fillna currently only works for axis=0 or axis='index'")
    if (value is None) and (method is None):
        raise ValueError("Must specify a fillna 'value' or 'method' parameter.")
    if (method is not None) and (method not in ["ffill", "pad", "backfill", "bfill"]):
        raise ValueError("Expecting 'pad', 'ffill', 'backfill' or 'bfill'.")

    scol = self.spark.column
    if isinstance(self.spark.data_type, (FloatType, DoubleType)):
        cond = scol.isNull() | F.isnan(scol)
    else:
        if not self.spark.nullable:
            if inplace:
                return
            else:
                return self
        cond = scol.isNull()

    if value is not None:
        if not isinstance(value, (float, int, str, bool)):
            raise TypeError("Unsupported type %s" % type(value))
        if limit is not None:
            raise ValueError("limit parameter for value is not support now")
        scol = F.when(cond, value).otherwise(scol)
    else:
        if method in ["ffill", "pad"]:
            func = F.last
            end = Window.currentRow - 1
            if limit is not None:
                begin = Window.currentRow - limit
            else:
                begin = Window.unboundedPreceding
        elif method in ["bfill", "backfill"]:
            func = F.first
            begin = Window.currentRow + 1
            if limit is not None:
                end = Window.currentRow + limit
            else:
                end = Window.unboundedFollowing

        window = (
            Window.partitionBy(*part_cols)
            .orderBy(NATURAL_ORDER_COLUMN_NAME)
            .rowsBetween(begin, end)
        )
        scol = F.when(cond, func(scol, True).over(window)).otherwise(scol)

    if inplace:
        self._kdf._update_internal_frame(
            self._kdf._internal.with_new_spark_column(self._column_label, scol)
        )
    else:
        return self._with_new_scol(scol).rename(self.name)
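Example #19 implements backfill ("bfill") by applying F.first(scol, True), i.e. first() with ignorenulls=True, over a window that spans the rows after the current one. Stripped of the Koalas internals, the idea looks roughly like the sketch below; the DataFrame, the ordering column, and the column names are illustrative assumptions, not part of the Koalas code:

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").appName("bfill_demo").getOrCreate()

# Toy series with gaps; `order` stands in for the natural-order column used by Koalas.
df = spark.createDataFrame(
    [(1, None), (2, None), (3, 30.0), (4, None), (5, 50.0)],
    ["order", "value"],
)

# Look only at the rows that come after the current row.
w = Window.orderBy("order").rowsBetween(Window.currentRow + 1, Window.unboundedFollowing)

# Backfill: where `value` is null, take the first non-null value that follows.
df.withColumn(
    "value_bfill",
    F.when(F.col("value").isNull(), F.first("value", ignorenulls=True).over(w))
    .otherwise(F.col("value")),
).show()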
Example #20
Source File: groupby.py From koalas with Apache License 2.0 | 4 votes |
def nlargest(self, n=5):
    """
    Return the first n rows ordered by columns in descending order in group.

    Return the first n rows with the smallest values in columns, in descending order.
    The columns that are not specified are returned as well, but not used for ordering.

    Parameters
    ----------
    n : int
        Number of items to retrieve.

    See Also
    --------
    databricks.koalas.Series.nlargest
    databricks.koalas.DataFrame.nlargest

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

    >>> df.groupby(['a'])['b'].nlargest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
    a
    1  1    2
    2  4    3
    3  7    4
    Name: b, dtype: int64
    """
    if len(self._kdf._internal.index_names) > 1:
        raise ValueError("nlargest do not support multi-index now")

    sdf = self._kdf._internal.spark_frame
    name = self._agg_columns[0]._internal.data_spark_column_names[0]
    window = Window.partitionBy(self._groupkeys_scols).orderBy(
        self._agg_columns[0].spark.column.desc(), NATURAL_ORDER_COLUMN_NAME
    )
    sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)
    internal = InternalFrame(
        spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
        index_map=OrderedDict(
            [
                (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                for s in self._groupkeys
            ]
            + list(self._kdf._internal.index_map.items())
        ),
        data_spark_columns=[scol_for(sdf, name)],
    )
    return first_series(DataFrame(internal))

# TODO: add bins, normalize parameter
Example #21
Source File: groupby.py From koalas with Apache License 2.0 | 4 votes |
def nsmallest(self, n=5):
    """
    Return the first n rows ordered by columns in ascending order in group.

    Return the first n rows with the smallest values in columns, in ascending order.
    The columns that are not specified are returned as well, but not used for ordering.

    Parameters
    ----------
    n : int
        Number of items to retrieve.

    See Also
    --------
    databricks.koalas.Series.nsmallest
    databricks.koalas.DataFrame.nsmallest

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

    >>> df.groupby(['a'])['b'].nsmallest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
    a
    1  0    1
    2  3    2
    3  6    3
    Name: b, dtype: int64
    """
    if len(self._kdf._internal.index_names) > 1:
        raise ValueError("nsmallest do not support multi-index now")

    sdf = self._kdf._internal.spark_frame
    name = self._agg_columns[0]._internal.data_spark_column_names[0]
    window = Window.partitionBy(self._groupkeys_scols).orderBy(
        self._agg_columns[0].spark.column, NATURAL_ORDER_COLUMN_NAME
    )
    sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)
    internal = InternalFrame(
        spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
        index_map=OrderedDict(
            [
                (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                for s in self._groupkeys
            ]
            + list(self._kdf._internal.index_map.items())
        ),
        data_spark_columns=[scol_for(sdf, name)],
    )
    return first_series(DataFrame(internal))

# TODO: add keep parameter
Example #22
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License | 4 votes |
def run(self, initial_scaffolds):
    randomized_scaffold_udf = psf.udf(self._generate_func, pst.ArrayType(pst.StringType()))
    get_attachment_points_udf = psf.udf(usc.get_attachment_points, pst.ArrayType(pst.IntegerType()))
    remove_attachment_point_numbers_udf = psf.udf(usc.remove_attachment_point_numbers, pst.StringType())

    results_df = self._initialize_results(initial_scaffolds)
    scaffolds_df = results_df.select("smiles", "scaffold", "decorations")
    i = 0
    while scaffolds_df.count() > 0:
        # generate randomized SMILES
        self._log("info", "Starting iteration #%d.", i)
        scaffolds_df = scaffolds_df.withColumn("randomized_scaffold", randomized_scaffold_udf("smiles"))\
            .select(
                "smiles", "scaffold", "decorations",
                psf.explode("randomized_scaffold").alias("randomized_scaffold"))\
            .withColumn("attachment_points", get_attachment_points_udf("randomized_scaffold"))\
            .withColumn("randomized_scaffold", remove_attachment_point_numbers_udf("randomized_scaffold"))\
            .withColumn("id", psf.monotonically_increasing_id())\
            .persist()
        self._log("info", "Generated %d randomized SMILES from %d scaffolds.",
                  scaffolds_df.count(), scaffolds_df.select("smiles").distinct().count())

        # sample each randomized scaffold N times
        scaffolds = scaffolds_df.select("id", "randomized_scaffold")\
            .rdd.map(lambda row: (row["id"], row["randomized_scaffold"])).toLocalIterator()
        self._sample_and_write_scaffolds_to_disk(scaffolds, scaffolds_df.count())
        self._log("info", "Sampled %d scaffolds.", scaffolds_df.count())

        # merge decorated molecules
        joined_df = self._join_results(scaffolds_df).persist()

        if joined_df.count() > 0:
            self._log("info", "Joined %d -> %d (valid) -> %d unique sampled scaffolds",
                      scaffolds_df.count(), joined_df.agg(psf.sum("count")).head()[0], joined_df.count())

        scaffolds_df = joined_df.join(results_df, on="smiles", how="left_anti")\
            .select("smiles", "scaffold", "decorations")\
            .where("smiles LIKE '%*%'")
        self._log("info", "Obtained %d scaffolds for next iteration.", scaffolds_df.count())

        results_df = results_df.union(joined_df)\
            .groupBy("smiles")\
            .agg(
                psf.first("scaffold").alias("scaffold"),
                psf.first("decorations").alias("decorations"),
                psf.sum("count").alias("count"))\
            .persist()
        i += 1

    return results_df