Python pyspark.sql.functions.lit() Examples
The following are 30 code examples of pyspark.sql.functions.lit(), collected from the open-source projects and source files credited above each example.
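Before working through the project examples below, here is a minimal sketch of what F.lit() does: it wraps a plain Python value as a Column expression so the value can be used anywhere Spark expects a column, such as in withColumn() or a comparison. The DataFrame and column names here are invented for illustration.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("b", 2)], ["key", "value"])

# lit() turns a Python constant into a Column expression.
df = df.withColumn("source", F.lit("manual"))           # constant string column
df = df.withColumn("flag", F.col("value") > F.lit(1))   # literal inside a comparison
df.show()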
Example #1
Source File: transform.py From search-MjoLniR with MIT License | 7 votes |
def write_partition(
    df: DataFrame, output_table: str, output_path: str,
    partition_spec: Mapping[str, str], mode: str = 'overwrite'
) -> None:
    """Write dataframe to disk as parquet and add to hive metastore"""
    for k, v in partition_spec.items():
        df = df.withColumn(k, F.lit(v))

    expect_schema = df.sql_ctx.read.table(output_table).schema
    errors = _verify_schema_equality(expect_schema, df.schema)
    if errors:
        raise Exception('Output table has incompatible schema: {}'.format(
            ', '.join(errors)))
    df.write.mode(mode).parquet(output_path)
    df.sql_ctx.sparkSession.sql(_add_partition_ql(
        output_table, output_path, partition_spec)).collect()


# Generic helpers for composing transformations
Example #2
Source File: feature_engineering.py From search-MjoLniR with MIT License | 6 votes |
def explode_features(df, features=None):
    """Convert feature vector into individual columns

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    features : list of str or None

    Returns
    -------
    pyspark.sql.DataFrame
    """
    if features is None:
        features = df.schema['features'].metadata['features']

    def extract_feature(features, idx):
        return float(features[idx])

    extract_feature_udf = F.udf(extract_feature, pyspark.sql.types.FloatType())
    cols = [extract_feature_udf('features', F.lit(idx)).alias(name)
            for idx, name in enumerate(features)]
    return df.select('*', *cols)
Example #3
Source File: test_tuning.py From search-MjoLniR with MIT License | 6 votes |
def test_split(spark):
    df = (
        spark
        .range(1, 100 * 100)
        # convert into 100 "queries" with 100 values each. We need a
        # sufficiently large number of queries, or the split won't have
        # enough data for partitions to even out.
        .select(F.lit('foowiki').alias('wikiid'),
                (F.col('id') / 100).cast('int').alias('norm_query_id')))

    with_folds = mjolnir.training.tuning.split(df, (0.8, 0.2)).collect()

    fold_0 = [row for row in with_folds if row.fold == 0]
    fold_1 = [row for row in with_folds if row.fold == 1]

    # Check the folds are pretty close to requested
    total_len = float(len(with_folds))
    assert 0.8 == pytest.approx(len(fold_0) / total_len, abs=0.015)
    assert 0.2 == pytest.approx(len(fold_1) / total_len, abs=0.015)

    # Check each norm query is only found on one side of the split
    queries_in_0 = set([row.norm_query_id for row in fold_0])
    queries_in_1 = set([row.norm_query_id for row in fold_1])
    assert len(queries_in_0.intersection(queries_in_1)) == 0
Example #4
Source File: compiler.py From ibis with Apache License 2.0 | 6 votes |
def compile_literal(t, expr, scope, raw=False, **kwargs):
    """If raw is True, don't wrap the result with F.lit()"""
    value = expr.op().value

    if raw:
        return value

    if isinstance(value, collections.abc.Set):
        # Don't wrap set with F.lit
        if isinstance(value, frozenset):
            # Spark doesn't like frozenset
            return set(value)
        else:
            return value
    elif isinstance(value, list):
        return F.array(*[F.lit(v) for v in value])
    else:
        return F.lit(expr.op().value)
Example #5
Source File: indexes.py From koalas with Apache License 2.0 | 6 votes |
def _is_monotonic_decreasing(self):
    scol = self.spark.column
    window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
    prev = F.lag(scol, 1).over(window)

    cond = F.lit(True)
    for field in self.spark.data_type[::-1]:
        left = scol.getField(field.name)
        right = prev.getField(field.name)
        compare = MultiIndex._comparator_for_monotonic_decreasing(field.dataType)
        cond = F.when(left.eqNullSafe(right), cond).otherwise(
            compare(left, right, spark.Column.__lt__)
        )

    cond = prev.isNull() | cond

    internal = InternalFrame(
        spark_frame=self._internal.spark_frame.select(
            self._internal.index_spark_columns + [cond]
        ),
        index_map=self._internal.index_map,
    )

    return first_series(DataFrame(internal))
Example #6
Source File: indexing.py From koalas with Apache License 2.0 | 6 votes |
def _select_rows_by_iterable(
    self, rows_sel: Iterable
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
    rows_sel = list(rows_sel)
    if len(rows_sel) == 0:
        return F.lit(False), None, None
    elif len(self._internal.index_spark_column_names) == 1:
        index_column = self._kdf_or_kser.index.to_series()
        index_data_type = index_column.spark.data_type
        if len(rows_sel) == 1:
            return (
                index_column.spark.column == F.lit(rows_sel[0]).cast(index_data_type),
                None,
                None,
            )
        else:
            return (
                index_column.spark.column.isin(
                    [F.lit(r).cast(index_data_type) for r in rows_sel]
                ),
                None,
                None,
            )
    else:
        raise LocIndexer._NotImplemented("Cannot select with MultiIndex with Spark.")
Example #7
Source File: drybell_spark.py From snorkel-tutorials with Apache License 2.0 | 6 votes |
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
Example #8
Source File: window.py From koalas with Apache License 2.0 | 5 votes |
def max(self):
    def max(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.max(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(max)
Example #9
Source File: window.py From koalas with Apache License 2.0 | 5 votes |
def mean(self):
    def mean(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.mean(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(mean)
Example #10
Source File: TutorialClasses.py From KDD2019-HandsOn-Tutorial with MIT License | 5 votes |
def getTopEntities(self, e, targetType = '', maxCount = 20, minScore = 0.0):
    df1 = self.df
    row1 = df1.where(df1.EntityId == e).first()
    self.raiseErrorIfNotFound(row1, e)
    if targetType == '':
        df2 = df1.where(df1.EntityId != e)
    else:
        df2 = df1.where((df1.EntityId != e) & (df1.EntityType == targetType))
    df3 = df2.select(df2.EntityId, df2.EntityType,
                     udfCosineSimilarity(F.lit(row1.Data), df2.Data).alias('Score'))
    return df3.where(df3.Score >= minScore).orderBy(df3.Score.desc()).limit(maxCount)

# COMMAND ----------

# MAGIC %md **PaperSimilarity** class to compute paper recommendations

# COMMAND ----------

# Parameters:
#   resource: resource stream path
#   container: container name in Azure Storage (AS) account
#   account: Azure Storage (AS) account
#   sas: complete 'Blob service SAS URL' of the shared access signature (sas) for the container
#   key: access key for the container, if sas is specified, key is ignored
#
# Note:
#   resource does not have header
#   you need to provide value for either sas or key
#
Example #11
Source File: window.py From koalas with Apache License 2.0 | 5 votes |
def std(self):
    def std(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.stddev(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(std)
Example #12
Source File: generic.py From koalas with Apache License 2.0 | 5 votes |
def _count_expr(col: spark.Column, spark_type: DataType) -> spark.Column:
    # Special handle floating point types because Spark's count treats nan as a valid value,
    # whereas pandas count doesn't include nan.
    if isinstance(spark_type, (FloatType, DoubleType)):
        return F.count(F.nanvl(col, F.lit(None)))
    else:
        return F.count(col)
Example #13
Source File: base.py From koalas with Apache License 2.0 | 5 votes |
def __add__(self, other):
    if isinstance(self.spark.data_type, StringType):
        # Concatenate string columns
        if isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType):
            return column_op(F.concat)(self, other)
        # Handle df['col'] + 'literal'
        elif isinstance(other, str):
            return column_op(F.concat)(self, F.lit(other))
        else:
            raise TypeError("string addition can only be applied to string series or literals.")
    else:
        return column_op(Column.__add__)(self, other)
Example #14
Source File: base.py From koalas with Apache License 2.0 | 5 votes |
def __truediv__(self, other):
    """
    __truediv__ has different behaviour between pandas and PySpark for several cases.
    1. When divide np.inf by zero, PySpark returns null whereas pandas returns np.inf
    2. When divide positive number by zero, PySpark returns null whereas pandas returns np.inf
    3. When divide -np.inf by zero, PySpark returns null whereas pandas returns -np.inf
    4. When divide negative number by zero, PySpark returns null whereas pandas returns -np.inf

    +-------------------------------------------+
    | dividend (divisor: 0) | PySpark |  pandas |
    |-----------------------|---------|---------|
    |         np.inf        |   null  |  np.inf |
    |        -np.inf        |   null  | -np.inf |
    |           10          |   null  |  np.inf |
    |          -10          |   null  | -np.inf |
    +-----------------------|---------|---------+
    """

    def truediv(left, right):
        return F.when(F.lit(right != 0) | F.lit(right).isNull(), left.__div__(right)).otherwise(
            F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
                F.lit(np.inf).__div__(left)
            )
        )

    return numpy_column_op(truediv)(self, other)
Example #15
Source File: base.py From koalas with Apache License 2.0 | 5 votes |
def __radd__(self, other):
    # Handle 'literal' + df['col']
    if isinstance(self.spark.data_type, StringType) and isinstance(other, str):
        return self._with_new_scol(F.concat(F.lit(other), self.spark.column))
    else:
        return column_op(Column.__radd__)(self, other)
Example #16
Source File: base.py From koalas with Apache License 2.0 | 5 votes |
def __floordiv__(self, other):
    """
    __floordiv__ has different behaviour between pandas and PySpark for several cases.
    1. When divide np.inf by zero, PySpark returns null whereas pandas returns np.inf
    2. When divide positive number by zero, PySpark returns null whereas pandas returns np.inf
    3. When divide -np.inf by zero, PySpark returns null whereas pandas returns -np.inf
    4. When divide negative number by zero, PySpark returns null whereas pandas returns -np.inf

    +-------------------------------------------+
    | dividend (divisor: 0) | PySpark |  pandas |
    |-----------------------|---------|---------|
    |         np.inf        |   null  |  np.inf |
    |        -np.inf        |   null  | -np.inf |
    |           10          |   null  |  np.inf |
    |          -10          |   null  | -np.inf |
    +-----------------------|---------|---------+
    """

    def floordiv(left, right):
        return F.when(F.lit(right is np.nan), np.nan).otherwise(
            F.when(
                F.lit(right != 0) | F.lit(right).isNull(), F.floor(left.__div__(right))
            ).otherwise(
                F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
                    F.lit(np.inf).__div__(left)
                )
            )
        )

    return numpy_column_op(floordiv)(self, other)
Example #17
Source File: base.py From koalas with Apache License 2.0 | 5 votes |
def __rfloordiv__(self, other):
    def rfloordiv(left, right):
        return F.when(F.lit(left == 0), F.lit(np.inf).__div__(right)).otherwise(
            F.when(F.lit(left) == np.nan, np.nan).otherwise(F.floor(F.lit(right).__div__(left)))
        )

    return numpy_column_op(rfloordiv)(self, other)
Example #18
Source File: indexing.py From koalas with Apache License 2.0 | 5 votes |
def _select_rows_by_iterable(
    self, rows_sel: Iterable
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
    sdf = self._internal.spark_frame

    if any(isinstance(key, (int, np.int, np.int64, np.int32)) and key < 0 for key in rows_sel):
        offset = sdf.count()
    else:
        offset = 0

    new_rows_sel = []
    for key in list(rows_sel):
        if not isinstance(key, (int, np.int, np.int64, np.int32)):
            raise TypeError(
                "cannot do positional indexing with these indexers [{}] of {}".format(
                    key, type(key)
                )
            )
        if key < 0:
            key = key + offset
        new_rows_sel.append(key)

    if len(new_rows_sel) != len(set(new_rows_sel)):
        raise NotImplementedError(
            "Duplicated row selection is not currently supported; "
            "however, normalised index was [%s]" % new_rows_sel
        )

    sequence_scol = sdf[self._sequence_col]
    cond = []
    for key in new_rows_sel:
        cond.append(sequence_scol == F.lit(int(key)).cast(LongType()))

    if len(cond) == 0:
        cond = [F.lit(False)]
    return reduce(lambda x, y: x | y, cond), None, None
Example #19
Source File: numpy_compat.py From koalas with Apache License 2.0 | 5 votes |
def maybe_dispatch_ufunc_to_spark_func(
    ser_or_index, ufunc: Callable, method: str, *inputs, **kwargs: Any
):
    from databricks.koalas.base import column_op

    op_name = ufunc.__name__

    if (
        method == "__call__"
        and (op_name in unary_np_spark_mappings or op_name in binary_np_spark_mappings)
        and kwargs.get("out") is None
    ):
        np_spark_map_func = unary_np_spark_mappings.get(op_name) or binary_np_spark_mappings.get(
            op_name
        )

        def convert_arguments(*args):
            args = [  # type: ignore
                F.lit(inp) if not isinstance(inp, Column) else inp for inp in args
            ]  # type: ignore
            return np_spark_map_func(*args)

        return column_op(convert_arguments)(*inputs)  # type: ignore
    else:
        return NotImplemented
Example #20
Source File: transform.py From datadevops with MIT License | 5 votes |
def process_fact_parking(sensordata_sdf: DataFrame,
                         dim_parkingbay_sdf: DataFrame,
                         dim_location_sdf: DataFrame,
                         dim_st_marker_sdf: DataFrame,
                         load_id, loaded_on):
    """Transform sensordata into fact_parking"""

    dim_date_id = loaded_on.strftime("%Y%M%d")
    midnight = loaded_on.replace(hour=0, minute=0, second=0, microsecond=0)
    dim_time_id = (midnight - loaded_on).seconds

    # Build fact
    fact_parking = sensordata_sdf\
        .join(dim_parkingbay_sdf.alias("pb"), "bay_id", "left_outer")\
        .join(dim_location_sdf.alias("l"), ["lat", "lon"], "left_outer")\
        .join(dim_st_marker_sdf.alias("st"), "st_marker_id", "left_outer")\
        .select(
            lit(dim_date_id).alias("dim_date_id"),
            lit(dim_time_id).alias("dim_time_id"),
            when(col("pb.dim_parking_bay_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("pb.dim_parking_bay_id")).alias("dim_parking_bay_id"),
            when(col("l.dim_location_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("l.dim_location_id")).alias("dim_location_id"),
            when(col("st.dim_st_marker_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("st.dim_st_marker_id")).alias("dim_st_marker_id"),
            "status",
            lit(load_id).alias("load_id"),
            lit(loaded_on.isoformat()).cast("timestamp").alias("loaded_on")
        )
    return fact_parking
Example #21
Source File: ga_chp_bq_ingest_avro_file.py From MorphL-Community-Edition with Apache License 2.0 | 5 votes |
def main():
    spark_session = (
        SparkSession.builder
        .appName(APPLICATION_NAME)
        .master(MASTER_URL)
        .config('spark.cassandra.connection.host', MORPHL_SERVER_IP_ADDRESS)
        .config('spark.cassandra.auth.username', MORPHL_CASSANDRA_USERNAME)
        .config('spark.cassandra.auth.password', MORPHL_CASSANDRA_PASSWORD)
        .config('spark.sql.shuffle.partitions', 16)
        .getOrCreate())

    log4j = spark_session.sparkContext._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

    avro_df = (
        spark_session
        .read
        .format('avro')
        .load(LOCAL_AVRO_FILE))

    save_options_ga_chp_bq_features_raw = {
        'keyspace': MORPHL_CASSANDRA_KEYSPACE,
        'table': 'ga_chp_bq_features_raw_t' if TRAINING_OR_PREDICTION == 'training' else 'ga_chp_bq_features_raw_p'
    }

    (avro_df
        .withColumn('day_of_data_capture', f.lit(DAY_OF_DATA_CAPTURE))
        .withColumn('website_url', f.lit(WEBSITE_URL))
        .write
        .format('org.apache.spark.sql.cassandra')
        .mode('append')
        .options(**save_options_ga_chp_bq_features_raw)
        .save())
Example #22
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compile_null_literal(t, expr, scope):
    return F.lit(None)
Example #23
Source File: TutorialClasses.py From KDD2019-HandsOn-Tutorial with MIT License | 5 votes |
def _getTopEntitiesByEmbedding(self, e, maxCount, minScore):
    df1 = self.df
    paperdf = self.mag.getDataframe('Papers')
    row1 = df1.where(df1.EntityId == e).first()
    df2 = df1.where(df1.EntityId != e)
    df3 = df2.select(df2.EntityId, udfCosineSimilarityN(F.lit(row1.Data), df2.Data).alias('Score'))
    return df3.join(paperdf, df3.EntityId == paperdf.PaperId, 'inner') \
        .select(paperdf.PaperId, paperdf.PaperTitle, df3.Score) \
        .where((~F.isnan(df3.Score)) & (df3.Score >= minScore)) \
        .orderBy(df3.Score.desc()) \
        .limit(maxCount)
Example #24
Source File: transform.py From search-MjoLniR with MIT License | 5 votes |
def read_partition(
    spark: SparkSession,
    table: str,
    partition_spec: Mapping[str, str],
    schema: Optional[T.StructType] = None,
    direct_parquet_read: bool = False
) -> DataFrame:
    """Read a single partition from a hive table.

    Verifies the partition specification describes a complete partition,
    that the partition exists, and optionally that the table is compatible
    with an expected schema. The partition could still be empty.
    """
    # We don't need to do anything with the result, our goal is to
    # trigger AnalysisException when the arguments are invalid.
    spark.sql(_describe_partition_ql(table, partition_spec)).collect()

    partition_cond = F.lit(True)
    for k, v in partition_spec.items():
        partition_cond &= F.col(k) == v
    df = spark.read.table(table).where(partition_cond)
    # The df we have now has types defined by the hive table, but this downgrades
    # non-standard types like VectorUDT() to its sql equivalent. Use the first
    # df to find the files, then read them directly.
    if direct_parquet_read:
        input_files = list(df._jdf.inputFiles())  # type: ignore
        input_dirs = set(os.path.dirname(path) for path in input_files)
        if len(input_dirs) != 1:
            raise Exception('Expected single directory containing partition data: [{}]'.format(
                '],['.join(input_files)))
        df = spark.read.parquet(list(input_dirs)[0])
    if schema is not None:
        # TODO: This only allows extra top level columns, anything
        # nested must be exactly the same. Fine for now.
        _verify_schema_compatability(schema, df.schema)
        df = df.select(*(field.name for field in schema))
    # Drop partitioning columns. These are not part of the mjolnir transformations, and
    # are only an implementation detail of putting them on disk and tracking history.
    return df.drop(*partition_spec.keys())
Example #25
Source File: __init__.py From search-MjoLniR with MIT License | 5 votes |
def at_least_n_distinct(col, limit):
    """Count distinct that works with windows

    The standard distinct count in spark sql can't be applied in
    a window. This implementation allows that to work
    """
    sc = SparkContext._active_spark_context
    j_cols = _to_seq(sc, [_to_java_column(col), _to_java_column(F.lit(limit))])
    jc = sc._jvm.org.wikimedia.search.mjolnir.AtLeastNDistinct().apply(j_cols)
    return Column(jc)
Example #26
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compile_sign(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)

    return F.when(src_column == 0, F.lit(0.0)).otherwise(
        F.when(src_column > 0, F.lit(1.0)).otherwise(-1.0)
    )
Example #27
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compile_string_find(t, expr, scope, **kwargs):
    op = expr.op()

    @F.udf('long')
    def str_find(s, substr, start, end):
        return s.find(substr, start, end)

    src_column = t.translate(op.arg, scope)
    substr_column = t.translate(op.substr, scope)
    start_column = t.translate(op.start, scope) if op.start else F.lit(None)
    end_column = t.translate(op.end, scope) if op.end else F.lit(None)
    return str_find(src_column, substr_column, start_column, end_column)
Example #28
Source File: window.py From koalas with Apache License 2.0 | 5 votes |
def min(self):
    def min(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.min(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(min)
Example #29
Source File: compiler.py From ibis with Apache License 2.0 | 5 votes |
def compile_null_if(t, expr, scope, **kwargs):
    op = expr.op()
    col = t.translate(op.arg, scope)
    nullif_col = t.translate(op.null_if_expr, scope)
    return F.when(col == nullif_col, F.lit(None)).otherwise(col)
Example #30
Source File: Provider.py From cccatalog with MIT License | 5 votes |
def getData(self):
    spk = SparkSession.builder.getOrCreate()
    dataDF = spk.read.parquet(self.input)
    providerDF = dataDF.select(
            concat(concat('provider_domain', 'content_path'),
                   when(col('content_query_string') != '',
                        concat(lit('?'), col('content_query_string')))
                   .otherwise(lit(''))).alias('url'),
            concat('warc_segment', lit('/warc/'), 'warc_filename').alias('warc_filename'),
            'content_offset', 'deflate_length')\
        .where(col('provider_domain').like('%{}'.format(self.domain)))\
        .dropDuplicates(['url'])

    # convert dataframe into a list of tab delimited elements
    providerData = providerDF.rdd.map(lambda row: '\t'.join([str(col) for col in row])).collect()

    return providerData