Python pyspark.sql.functions.lit() Examples

The following are 30 code examples of pyspark.sql.functions.lit(), taken from open-source projects. Each example notes its source file, project, and license so you can refer back to the original code. You may also want to look at the other functions and classes available in the pyspark.sql.functions module.
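Before the project examples, here is a minimal sketch of what lit() does (assuming a local SparkSession and a toy DataFrame): it wraps a plain Python value in a Column so the value can be used anywhere Spark expects a column expression.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.range(3)  # single column "id": 0, 1, 2

out = df.select(
    "id",
    F.lit("constant").alias("label"),         # same string on every row
    (F.col("id") + F.lit(10)).alias("id10"),  # literal inside an arithmetic expression
)
out.show()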
Example #1
Source File: transform.py    From search-MjoLniR with MIT License
def write_partition(
    df: DataFrame, output_table: str, output_path: str,
    partition_spec: Mapping[str, str], mode: str = 'overwrite'
) -> None:
    """Write dataframe to disk as parquet and add to hive metastore"""
    for k, v in partition_spec.items():
        df = df.withColumn(k, F.lit(v))

    expect_schema = df.sql_ctx.read.table(output_table).schema
    errors = _verify_schema_equality(expect_schema, df.schema)
    if errors:
        raise Exception('Output table has incompatible schema: {}'.format(
            ', '.join(errors)))
    df.write.mode(mode).parquet(output_path)
    df.sql_ctx.sparkSession.sql(_add_partition_ql(
        output_table, output_path, partition_spec)).collect()


# Generic helpers for composing transformations 
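The core lit() pattern in Example #1 is attaching each partition key as a constant column before writing. A standalone sketch of just that step, with a hypothetical partition_spec and helper name:

import pyspark.sql.functions as F

def add_partition_columns(df, partition_spec):
    # Attach each partition key/value as a constant column, as write_partition does above.
    for name, value in partition_spec.items():
        df = df.withColumn(name, F.lit(value))
    return df

# add_partition_columns(df, {"date": "2024-01-01", "wiki": "enwiki"})  # hypothetical spec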
Example #2
Source File: feature_engineering.py    From search-MjoLniR with MIT License
def explode_features(df, features=None):
    """Convert feature vector into individual columns

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    features : list of str or None

    Returns
    -------
    pyspark.sql.DataFrame
    """
    if features is None:
        features = df.schema['features'].metadata['features']

    def extract_feature(features, idx):
        return float(features[idx])
    extract_feature_udf = F.udf(extract_feature, pyspark.sql.types.FloatType())
    cols = [extract_feature_udf('features', F.lit(idx)).alias(name) for idx, name in enumerate(features)]
    return df.select('*', *cols) 
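The detail worth noting in Example #2 is that UDF arguments must be Columns, so the loop index is passed as F.lit(idx). A minimal sketch of the same pattern, with assumed feature names:

import pyspark.sql.functions as F
from pyspark.sql.types import FloatType

# UDF arguments must be Columns; F.lit(i) turns each loop index into one.
element_at_udf = F.udf(lambda vec, i: float(vec[i]), FloatType())

feature_names = ["f0", "f1", "f2"]  # assumed names
cols = [element_at_udf("features", F.lit(i)).alias(name)
        for i, name in enumerate(feature_names)]
# df.select("*", *cols) adds one column per feature, as explode_features does above.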
Example #3
Source File: test_tuning.py    From search-MjoLniR with MIT License
def test_split(spark):
    df = (
        spark
        .range(1, 100 * 100)
        # convert into 100 "queries" with 100 values each. We need a
        # sufficiently large number of queries, or the split won't have
        # enough data for partitions to even out.
        .select(F.lit('foowiki').alias('wikiid'),
                (F.col('id')/100).cast('int').alias('norm_query_id')))

    with_folds = mjolnir.training.tuning.split(df, (0.8, 0.2)).collect()

    fold_0 = [row for row in with_folds if row.fold == 0]
    fold_1 = [row for row in with_folds if row.fold == 1]

    # Check the folds are pretty close to requested
    total_len = float(len(with_folds))
    assert 0.8 == pytest.approx(len(fold_0) / total_len, abs=0.015)
    assert 0.2 == pytest.approx(len(fold_1) / total_len, abs=0.015)

    # Check each norm query is only found on one side of the split
    queries_in_0 = set([row.norm_query_id for row in fold_0])
    queries_in_1 = set([row.norm_query_id for row in fold_1])
    assert len(queries_in_0.intersection(queries_in_1)) == 0 
Example #4
Source File: compiler.py    From ibis with Apache License 2.0
def compile_literal(t, expr, scope, raw=False, **kwargs):
    """ If raw is True, don't wrap the result with F.lit()
    """
    value = expr.op().value

    if raw:
        return value

    if isinstance(value, collections.abc.Set):
        # Don't wrap set with F.lit
        if isinstance(value, frozenset):
            # Spark doesn't like frozenset
            return set(value)
        else:
            return value
    elif isinstance(value, list):
        return F.array(*[F.lit(v) for v in value])
    else:
        return F.lit(expr.op().value) 
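For list values, the compiler builds a Spark array of literals rather than a single literal. The same idiom in isolation:

import pyspark.sql.functions as F

values = [1, 2, 3]
array_col = F.array(*[F.lit(v) for v in values])  # one array(1, 2, 3) per row

# e.g. attach it as a constant array column:
# df.withColumn("allowed_ids", array_col)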
Example #5
Source File: indexes.py    From koalas with Apache License 2.0
def _is_monotonic_decreasing(self):
        scol = self.spark.column
        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
        prev = F.lag(scol, 1).over(window)

        cond = F.lit(True)
        for field in self.spark.data_type[::-1]:
            left = scol.getField(field.name)
            right = prev.getField(field.name)
            compare = MultiIndex._comparator_for_monotonic_decreasing(field.dataType)
            cond = F.when(left.eqNullSafe(right), cond).otherwise(
                compare(left, right, spark.Column.__lt__)
            )

        cond = prev.isNull() | cond

        internal = InternalFrame(
            spark_frame=self._internal.spark_frame.select(
                self._internal.index_spark_columns + [cond]
            ),
            index_map=self._internal.index_map,
        )

        return first_series(DataFrame(internal)) 
Example #6
Source File: indexing.py    From koalas with Apache License 2.0
def _select_rows_by_iterable(
        self, rows_sel: Iterable
    ) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
        rows_sel = list(rows_sel)
        if len(rows_sel) == 0:
            return F.lit(False), None, None
        elif len(self._internal.index_spark_column_names) == 1:
            index_column = self._kdf_or_kser.index.to_series()
            index_data_type = index_column.spark.data_type
            if len(rows_sel) == 1:
                return (
                    index_column.spark.column == F.lit(rows_sel[0]).cast(index_data_type),
                    None,
                    None,
                )
            else:
                return (
                    index_column.spark.column.isin(
                        [F.lit(r).cast(index_data_type) for r in rows_sel]
                    ),
                    None,
                    None,
                )
        else:
            raise LocIndexer._NotImplemented("Cannot select with MultiIndex with Spark.") 
Example #7
Source File: drybell_spark.py    From snorkel-tutorials with Apache License 2.0
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}") 
Example #8
Source File: window.py    From koalas with Apache License 2.0
def max(self):
        def max(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.max(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(max) 
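The otherwise(F.lit(None)) branch is how these rolling aggregations reproduce pandas' min_periods: rows that have not yet accumulated enough data get null. A standalone sketch of the same pattern, assuming a column ts for ordering and value to aggregate:

import pyspark.sql.functions as F
from pyspark.sql import Window

w = Window.orderBy("ts").rowsBetween(-3, Window.currentRow)  # rolling window of up to 4 rows
unbounded = Window.orderBy("ts").rowsBetween(Window.unboundedPreceding, Window.currentRow)
min_periods = 2

rolling_max = F.when(
    F.row_number().over(unbounded) >= min_periods,
    F.max("value").over(w),
).otherwise(F.lit(None))  # too few rows so far -> null, mirroring pandas' min_periods
# df.withColumn("rolling_max", rolling_max)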
Example #9
Source File: window.py    From koalas with Apache License 2.0
def mean(self):
        def mean(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.mean(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(mean) 
Example #10
Source File: TutorialClasses.py    From KDD2019-HandsOn-Tutorial with MIT License
def getTopEntities(self, e, targetType = '', maxCount = 20, minScore = 0.0):
    df1 = self.df
    row1 = df1.where(df1.EntityId == e).first()
    self.raiseErrorIfNotFound(row1, e)

    if targetType == '':
      df2 = df1.where(df1.EntityId != e)
    else:
      df2 = df1.where((df1.EntityId != e) & (df1.EntityType == targetType))

    df3 = df2.select(df2.EntityId, df2.EntityType, udfCosineSimilarity(F.lit(row1.Data), df2.Data).alias('Score'))
    return df3.where(df3.Score >= minScore).orderBy(df3.Score.desc()).limit(maxCount)

# COMMAND ----------

# MAGIC %md **PaperSimilarity** class to compute paper recommendations

# COMMAND ----------

#   Parameters:
#     resource: resource stream path
#     container: container name in Azure Storage (AS) account
#     account: Azure Storage (AS) account
#     sas: complete 'Blob service SAS URL' of the shared access signature (sas) for the container
#     key: access key for the container, if sas is specified, key is ignored
#
#   Note:
#     resource does not have header
#     you need to provide value for either sas or key
# 
Example #11
Source File: window.py    From koalas with Apache License 2.0
def std(self):
        def std(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.stddev(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(std) 
Example #12
Source File: generic.py    From koalas with Apache License 2.0
def _count_expr(col: spark.Column, spark_type: DataType) -> spark.Column:
        # Special handle floating point types because Spark's count treats nan as a valid value,
        # whereas pandas count doesn't include nan.
        if isinstance(spark_type, (FloatType, DoubleType)):
            return F.count(F.nanvl(col, F.lit(None)))
        else:
            return F.count(col) 
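F.lit(None) does the real work here: nanvl(col, lit(None)) rewrites NaN to null so that Spark's count() skips it, matching pandas. In isolation:

import pyspark.sql.functions as F

# Spark's count() treats NaN as a valid value; pandas does not.
pandas_style_count = F.count(F.nanvl(F.col("x"), F.lit(None)))
# df.agg(pandas_style_count.alias("n_valid"))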
Example #13
Source File: base.py    From koalas with Apache License 2.0
def __add__(self, other):
        if isinstance(self.spark.data_type, StringType):
            # Concatenate string columns
            if isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType):
                return column_op(F.concat)(self, other)
            # Handle df['col'] + 'literal'
            elif isinstance(other, str):
                return column_op(F.concat)(self, F.lit(other))
            else:
                raise TypeError("string addition can only be applied to string series or literals.")
        else:
            return column_op(Column.__add__)(self, other) 
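The string branch maps "series + literal" onto concat with a literal column. The equivalent in plain PySpark, assuming a string column named name:

import pyspark.sql.functions as F

with_suffix = F.concat(F.col("name"), F.lit("_suffix"))
# df.select(with_suffix.alias("name_suffixed"))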
Example #14
Source File: base.py    From koalas with Apache License 2.0
def __truediv__(self, other):
        """
        __truediv__ has different behaviour between pandas and PySpark for several cases.
        1. When divide np.inf by zero, PySpark returns null whereas pandas returns np.inf
        2. When divide positive number by zero, PySpark returns null whereas pandas returns np.inf
        3. When divide -np.inf by zero, PySpark returns null whereas pandas returns -np.inf
        4. When divide negative number by zero, PySpark returns null whereas pandas returns -np.inf

        +-------------------------------------------+
        | dividend (divisor: 0) | PySpark |  pandas |
        |-----------------------|---------|---------|
        |         np.inf        |   null  |  np.inf |
        |        -np.inf        |   null  | -np.inf |
        |           10          |   null  |  np.inf |
        |          -10          |   null  | -np.inf |
        +-----------------------|---------|---------+
        """

        def truediv(left, right):
            return F.when(F.lit(right != 0) | F.lit(right).isNull(), left.__div__(right)).otherwise(
                F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
                    F.lit(np.inf).__div__(left)
                )
            )

        return numpy_column_op(truediv)(self, other) 
Example #15
Source File: base.py    From koalas with Apache License 2.0
def __radd__(self, other):
        # Handle 'literal' + df['col']
        if isinstance(self.spark.data_type, StringType) and isinstance(other, str):
            return self._with_new_scol(F.concat(F.lit(other), self.spark.column))
        else:
            return column_op(Column.__radd__)(self, other) 
Example #16
Source File: base.py    From koalas with Apache License 2.0
def __floordiv__(self, other):
        """
        __floordiv__ has different behaviour between pandas and PySpark for several cases.
        1. When divide np.inf by zero, PySpark returns null whereas pandas returns np.inf
        2. When divide positive number by zero, PySpark returns null whereas pandas returns np.inf
        3. When divide -np.inf by zero, PySpark returns null whereas pandas returns -np.inf
        4. When divide negative number by zero, PySpark returns null whereas pandas returns -np.inf

        +-------------------------------------------+
        | dividend (divisor: 0) | PySpark |  pandas |
        |-----------------------|---------|---------|
        |         np.inf        |   null  |  np.inf |
        |        -np.inf        |   null  | -np.inf |
        |           10          |   null  |  np.inf |
        |          -10          |   null  | -np.inf |
        +-----------------------|---------|---------+
        """

        def floordiv(left, right):
            return F.when(F.lit(right is np.nan), np.nan).otherwise(
                F.when(
                    F.lit(right != 0) | F.lit(right).isNull(), F.floor(left.__div__(right))
                ).otherwise(
                    F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
                        F.lit(np.inf).__div__(left)
                    )
                )
            )

        return numpy_column_op(floordiv)(self, other) 
Example #17
Source File: base.py    From koalas with Apache License 2.0
def __rfloordiv__(self, other):
        def rfloordiv(left, right):
            return F.when(F.lit(left == 0), F.lit(np.inf).__div__(right)).otherwise(
                F.when(F.lit(left) == np.nan, np.nan).otherwise(F.floor(F.lit(right).__div__(left)))
            )

        return numpy_column_op(rfloordiv)(self, other) 
Example #18
Source File: indexing.py    From koalas with Apache License 2.0
def _select_rows_by_iterable(
        self, rows_sel: Iterable
    ) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
        sdf = self._internal.spark_frame

        if any(isinstance(key, (int, np.int, np.int64, np.int32)) and key < 0 for key in rows_sel):
            offset = sdf.count()
        else:
            offset = 0

        new_rows_sel = []
        for key in list(rows_sel):
            if not isinstance(key, (int, np.int, np.int64, np.int32)):
                raise TypeError(
                    "cannot do positional indexing with these indexers [{}] of {}".format(
                        key, type(key)
                    )
                )
            if key < 0:
                key = key + offset
            new_rows_sel.append(key)

        if len(new_rows_sel) != len(set(new_rows_sel)):
            raise NotImplementedError(
                "Duplicated row selection is not currently supported; "
                "however, normalised index was [%s]" % new_rows_sel
            )

        sequence_scol = sdf[self._sequence_col]
        cond = []
        for key in new_rows_sel:
            cond.append(sequence_scol == F.lit(int(key)).cast(LongType()))

        if len(cond) == 0:
            cond = [F.lit(False)]
        return reduce(lambda x, y: x | y, cond), None, None 
Example #19
Source File: numpy_compat.py    From koalas with Apache License 2.0
def maybe_dispatch_ufunc_to_spark_func(
    ser_or_index, ufunc: Callable, method: str, *inputs, **kwargs: Any
):
    from databricks.koalas.base import column_op

    op_name = ufunc.__name__

    if (
        method == "__call__"
        and (op_name in unary_np_spark_mappings or op_name in binary_np_spark_mappings)
        and kwargs.get("out") is None
    ):

        np_spark_map_func = unary_np_spark_mappings.get(op_name) or binary_np_spark_mappings.get(
            op_name
        )

        def convert_arguments(*args):
            args = [  # type: ignore
                F.lit(inp) if not isinstance(inp, Column) else inp for inp in args
            ]  # type: ignore
            return np_spark_map_func(*args)

        return column_op(convert_arguments)(*inputs)  # type: ignore
    else:
        return NotImplemented 
Example #20
Source File: transform.py    From datadevops with MIT License
def process_fact_parking(sensordata_sdf: DataFrame,
                         dim_parkingbay_sdf: DataFrame,
                         dim_location_sdf: DataFrame,
                         dim_st_marker_sdf: DataFrame,
                         load_id, loaded_on):
    """Transform sensordata into fact_parking"""

    dim_date_id = loaded_on.strftime("%Y%m%d")  # YYYYMMDD
    midnight = loaded_on.replace(hour=0, minute=0, second=0, microsecond=0)
    dim_time_id = (midnight - loaded_on).seconds

    # Build fact
    fact_parking = sensordata_sdf\
        .join(dim_parkingbay_sdf.alias("pb"), "bay_id", "left_outer")\
        .join(dim_location_sdf.alias("l"), ["lat", "lon"], "left_outer")\
        .join(dim_st_marker_sdf.alias("st"), "st_marker_id", "left_outer")\
        .select(
            lit(dim_date_id).alias("dim_date_id"),
            lit(dim_time_id).alias("dim_time_id"),
            when(col("pb.dim_parking_bay_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("pb.dim_parking_bay_id")).alias("dim_parking_bay_id"),
            when(col("l.dim_location_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("l.dim_location_id")).alias("dim_location_id"),
            when(col("st.dim_st_marker_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("st.dim_st_marker_id")).alias("dim_st_marker_id"),
            "status",
            lit(load_id).alias("load_id"),
            lit(loaded_on.isoformat()).cast("timestamp").alias("loaded_on")
        )
    return fact_parking 
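The repeated when(col.isNull(), lit(EMPTY_UUID)).otherwise(col) blocks are null-defaulting against a literal; F.coalesce expresses the same thing more compactly. A sketch with a hypothetical placeholder value:

from pyspark.sql.functions import coalesce, col, lit

EMPTY_UUID = "00000000-0000-0000-0000-000000000000"  # hypothetical placeholder; the real value is defined elsewhere in the module
dim_parking_bay_id = coalesce(col("pb.dim_parking_bay_id"), lit(EMPTY_UUID)).alias("dim_parking_bay_id")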
Example #21
Source File: ga_chp_bq_ingest_avro_file.py    From MorphL-Community-Edition with Apache License 2.0
def main():
    spark_session = (
        SparkSession.builder
        .appName(APPLICATION_NAME)
        .master(MASTER_URL)
        .config('spark.cassandra.connection.host', MORPHL_SERVER_IP_ADDRESS)
        .config('spark.cassandra.auth.username', MORPHL_CASSANDRA_USERNAME)
        .config('spark.cassandra.auth.password', MORPHL_CASSANDRA_PASSWORD)
        .config('spark.sql.shuffle.partitions', 16)
        .getOrCreate())

    log4j = spark_session.sparkContext._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

    avro_df = (
        spark_session
        .read
        .format('avro')
        .load(LOCAL_AVRO_FILE))

    save_options_ga_chp_bq_features_raw = {
        'keyspace': MORPHL_CASSANDRA_KEYSPACE,
        'table': ('ga_chp_bq_features_raw_t' if TRAINING_OR_PREDICTION == 'training'
                  else 'ga_chp_bq_features_raw_p')
    }

    (avro_df
     .withColumn('day_of_data_capture', f.lit(DAY_OF_DATA_CAPTURE))
     .withColumn('website_url', f.lit(WEBSITE_URL))
     .write
     .format('org.apache.spark.sql.cassandra')
     .mode('append')
     .options(**save_options_ga_chp_bq_features_raw)
     .save()) 
Example #22
Source File: compiler.py    From ibis with Apache License 2.0
def compile_null_literal(t, expr, scope):
    return F.lit(None) 
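F.lit(None) produces an untyped null (NullType); casting it yields a typed null, which is often needed when unioning DataFrames or matching a target schema:

import pyspark.sql.functions as F

null_col = F.lit(None)                    # untyped null literal (NullType)
typed_null = F.lit(None).cast("string")   # null with an explicit type
# df.withColumn("placeholder", typed_null)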
Example #23
Source File: TutorialClasses.py    From KDD2019-HandsOn-Tutorial with MIT License
def _getTopEntitiesByEmbedding(self, e, maxCount, minScore):
    df1 = self.df
    paperdf = self.mag.getDataframe('Papers')
    row1 = df1.where(df1.EntityId == e).first()
    df2 = df1.where(df1.EntityId != e)
    df3 = df2.select(df2.EntityId, udfCosineSimilarityN(F.lit(row1.Data), df2.Data).alias('Score'))
    return df3.join(paperdf, df3.EntityId == paperdf.PaperId, 'inner').select(paperdf.PaperId, paperdf.PaperTitle, df3.Score).where((~F.isnan(df3.Score)) & (df3.Score >= minScore)).orderBy(df3.Score.desc()).limit(maxCount) 
Example #24
Source File: transform.py    From search-MjoLniR with MIT License
def read_partition(
    spark: SparkSession,
    table: str,
    partition_spec: Mapping[str, str],
    schema: Optional[T.StructType] = None,
    direct_parquet_read: bool = False
) -> DataFrame:
    """Read a single partition from a hive table.

    Verifies the partition specification describes a complete partition,
    that the partition exists, and optionally that the table is compatible
    with an expected schema. The partition could still be empty.
    """
    # We don't need to do anything with the result, our goal is to
    # trigger AnalysisException when the arguments are invalid.
    spark.sql(_describe_partition_ql(table, partition_spec)).collect()

    partition_cond = F.lit(True)
    for k, v in partition_spec.items():
        partition_cond &= F.col(k) == v
    df = spark.read.table(table).where(partition_cond)
    # The df we have now has types defined by the hive table, but this downgrades
    # non-standard types like VectorUDT() to its SQL equivalent. Use the first
    # df to find the files, then read them directly.
    if direct_parquet_read:
        input_files = list(df._jdf.inputFiles())  # type: ignore
        input_dirs = set(os.path.dirname(path) for path in input_files)
        if len(input_dirs) != 1:
            raise Exception('Expected single directory containing partition data: [{}]'.format(
                '],['.join(input_files)))
        df = spark.read.parquet(list(input_dirs)[0])
    if schema is not None:
        # TODO: This only allows extra top level columns, anything
        # nested must be exactly the same. Fine for now.
        _verify_schema_compatability(schema, df.schema)
        df = df.select(*(field.name for field in schema))
    # Drop partitioning columns. These are not part of the mjolnir transformations, and
    # are only an implementation detail of putting them on disk and tracking history.
    return df.drop(*partition_spec.keys()) 
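The partition filter above is built by seeding a condition with F.lit(True) and AND-ing one equality per partition key onto it. The same pattern in isolation, with an assumed spec and table name:

import pyspark.sql.functions as F

partition_spec = {"date": "2024-01-01", "wiki": "enwiki"}  # assumed spec
cond = F.lit(True)
for k, v in partition_spec.items():
    cond &= F.col(k) == v
# spark.read.table("some_table").where(cond)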
Example #25
Source File: __init__.py    From search-MjoLniR with MIT License
def at_least_n_distinct(col, limit):
    """Count distinct that works with windows

    The standard distinct count in spark sql can't be applied in
    a window. This implementation allows that to work
    """
    sc = SparkContext._active_spark_context
    j_cols = _to_seq(sc, [_to_java_column(col), _to_java_column(F.lit(limit))])
    jc = sc._jvm.org.wikimedia.search.mjolnir.AtLeastNDistinct().apply(j_cols)
    return Column(jc) 
Example #26
Source File: compiler.py    From ibis with Apache License 2.0
def compile_sign(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)

    return F.when(src_column == 0, F.lit(0.0)).otherwise(
        F.when(src_column > 0, F.lit(1.0)).otherwise(-1.0)
    ) 
Example #27
Source File: compiler.py    From ibis with Apache License 2.0
def compile_string_find(t, expr, scope, **kwargs):
    op = expr.op()

    @F.udf('long')
    def str_find(s, substr, start, end):
        return s.find(substr, start, end)

    src_column = t.translate(op.arg, scope)
    substr_column = t.translate(op.substr, scope)
    start_column = t.translate(op.start, scope) if op.start else F.lit(None)
    end_column = t.translate(op.end, scope) if op.end else F.lit(None)
    return str_find(src_column, substr_column, start_column, end_column) 
Example #28
Source File: window.py    From koalas with Apache License 2.0
def min(self):
        def min(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.min(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(min) 
Example #29
Source File: compiler.py    From ibis with Apache License 2.0
def compile_null_if(t, expr, scope, **kwargs):
    op = expr.op()
    col = t.translate(op.arg, scope)
    nullif_col = t.translate(op.null_if_expr, scope)
    return F.when(col == nullif_col, F.lit(None)).otherwise(col) 
Example #30
Source File: Provider.py    From cccatalog with MIT License
def getData(self):
        spk         = SparkSession.builder.getOrCreate()
        dataDF      = spk.read.parquet(self.input)
        providerDF  = dataDF.select(concat(concat('provider_domain', 'content_path'), \
                            when(col('content_query_string') != '', concat(lit('?'), col('content_query_string')))\
                            .otherwise(lit(''))).alias('url'), \
                            concat('warc_segment', lit('/warc/'), 'warc_filename').alias('warc_filename'), \
                                     'content_offset', 'deflate_length')\
                            .where(col('provider_domain').like('%{}'.format(self.domain)))\
                            .dropDuplicates(['url'])

        providerData = providerDF.rdd.map(lambda row: '\t'.join([str(col) for col in row])).collect()  # convert dataframe into a list of tab-delimited elements

        return providerData
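The concat calls above splice literal fragments ('?' and '/warc/') between columns; the same technique in isolation:

import pyspark.sql.functions as F

warc_path = F.concat(F.col("warc_segment"), F.lit("/warc/"), F.col("warc_filename"))
url_suffix = F.when(F.col("content_query_string") != "",
                    F.concat(F.lit("?"), F.col("content_query_string"))).otherwise(F.lit(""))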