Python Examples of pyspark.SparkContext._active_spark

Source File: wrapper.py From LearningApacheSpark with MIT License

6 votes

def _new_java_array(pylist, java_class):
        """
        Create a Java array of given java_class type. Useful for
        calling a method with a Scala Array from Python with Py4J.

        :param pylist:
          Python list to convert to a Java Array.
        :param java_class:
          Java class to specify the type of Array. Should be in the
          form of sc._gateway.jvm.* (sc is a valid Spark Context).
        :return:
          Java Array of converted pylist.

        Example primitive Java classes:
          - basestring -> sc._gateway.jvm.java.lang.String
          - int -> sc._gateway.jvm.java.lang.Integer
          - float -> sc._gateway.jvm.java.lang.Double
          - bool -> sc._gateway.jvm.java.lang.Boolean
        """
        sc = SparkContext._active_spark_context
        java_array = sc._gateway.new_array(java_class, len(pylist))
        for i in xrange(len(pylist)):
            java_array[i] = pylist[i]
        return java_array

Source File: functions.py From LearningApacheSpark with MIT License

6 votes

def ntile(n):
    """
    Window function: returns the ntile group id (from 1 to `n` inclusive)
    in an ordered window partition. For example, if `n` is 4, the first
    quarter of the rows will get value 1, the second quarter will get 2,
    the third quarter will get 3, and the last quarter will get 4.

    This is equivalent to the NTILE function in SQL.

    :param n: an integer
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.ntile(int(n)))


# ---------------------- Date/Timestamp functions ------------------------------

Source File: functions.py From LearningApacheSpark with MIT License

6 votes

def grouping_id(*cols):
    """
    Aggregate function: returns the level of grouping, equals to

       (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)

    .. note:: The list of columns should match with grouping columns exactly, or empty (means all
        the grouping columns).

    >>> df.cube("name").agg(grouping_id(), sum("age")).orderBy("name").show()
    +-----+-------------+--------+
    | name|grouping_id()|sum(age)|
    +-----+-------------+--------+
    | null|            1|       7|
    |Alice|            0|       2|
    |  Bob|            0|       5|
    +-----+-------------+--------+
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.grouping_id(_to_seq(sc, cols, _to_java_column))
    return Column(jc)

Source File: functions.py From LearningApacheSpark with MIT License

6 votes

def approx_count_distinct(col, rsd=None):
    """Aggregate function: returns a new :class:`Column` for approximate distinct count of
    column `col`.

    :param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
        efficient to use :func:`countDistinct`

    >>> df.agg(approx_count_distinct(df.age).alias('distinct_ages')).collect()
    [Row(distinct_ages=2)]
    """
    sc = SparkContext._active_spark_context
    if rsd is None:
        jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col))
    else:
        jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col), rsd)
    return Column(jc)

Source File: functions.py From LearningApacheSpark with MIT License

6 votes

def log(arg1, arg2=None):
    """Returns the first argument-based logarithm of the second argument.

    If there is only one argument, then this takes the natural logarithm of the argument.

    >>> df.select(log(10.0, df.age).alias('ten')).rdd.map(lambda l: str(l.ten)[:7]).collect()
    ['0.30102', '0.69897']

    >>> df.select(log(df.age).alias('e')).rdd.map(lambda l: str(l.e)[:7]).collect()
    ['0.69314', '1.60943']
    """
    sc = SparkContext._active_spark_context
    if arg2 is None:
        jc = sc._jvm.functions.log(_to_java_column(arg1))
    else:
        jc = sc._jvm.functions.log(arg1, _to_java_column(arg2))
    return Column(jc)

Source File: functions.py From LearningApacheSpark with MIT License

6 votes

def months_between(date1, date2, roundOff=True):
    """
    Returns number of months between dates date1 and date2.
    If date1 is later than date2, then the result is positive.
    If date1 and date2 are on the same day of month, or both are the last day of month,
    returns an integer (time of day will be ignored).
    The result is rounded off to 8 digits unless `roundOff` is set to `False`.

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['date1', 'date2'])
    >>> df.select(months_between(df.date1, df.date2).alias('months')).collect()
    [Row(months=3.94959677)]
    >>> df.select(months_between(df.date1, df.date2, False).alias('months')).collect()
    [Row(months=3.9495967741935485)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.months_between(
        _to_java_column(date1), _to_java_column(date2), roundOff))

Source File: functions.py From LearningApacheSpark with MIT License

6 votes

def to_date(col, format=None):
    """Converts a :class:`Column` of :class:`pyspark.sql.types.StringType` or
    :class:`pyspark.sql.types.TimestampType` into :class:`pyspark.sql.types.DateType`
    using the optionally specified format. Specify formats according to
    `SimpleDateFormats <http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html>`_.
    By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format
    is omitted (equivalent to ``col.cast("date")``).

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t).alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]
    """
    sc = SparkContext._active_spark_context
    if format is None:
        jc = sc._jvm.functions.to_date(_to_java_column(col))
    else:
        jc = sc._jvm.functions.to_date(_to_java_column(col), format)
    return Column(jc)

Source File: wrapper.py From LearningApacheSpark with MIT License

6 votes

def _transfer_params_from_java(self):
        """
        Transforms the embedded params from the companion Java object.
        """
        sc = SparkContext._active_spark_context
        for param in self.params:
            if self._java_obj.hasParam(param.name):
                java_param = self._java_obj.getParam(param.name)
                # SPARK-14931: Only check set params back to avoid default params mismatch.
                if self._java_obj.isSet(java_param):
                    value = _java2py(sc, self._java_obj.getOrDefault(java_param))
                    self._set(**{param.name: value})
                # SPARK-10931: Temporary fix for params that have a default in Java
                if self._java_obj.hasDefault(java_param) and not self.isDefined(param):
                    value = _java2py(sc, self._java_obj.getDefault(java_param)).get()
                    self._setDefault(**{param.name: value})

Source File: wrapper.py From LearningApacheSpark with MIT License

6 votes

def _transfer_params_to_java(self):
        """
        Transforms the embedded params to the companion Java object.
        """
        pair_defaults = []
        for param in self.params:
            if self.isSet(param):
                pair = self._make_java_param_pair(param, self._paramMap[param])
                self._java_obj.set(pair)
            if self.hasDefault(param):
                pair = self._make_java_param_pair(param, self._defaultParamMap[param])
                pair_defaults.append(pair)
        if len(pair_defaults) > 0:
            sc = SparkContext._active_spark_context
            pair_defaults_seq = sc._jvm.PythonUtils.toSeq(pair_defaults)
            self._java_obj.setDefault(pair_defaults_seq)

Source File: functions.py From LearningApacheSpark with MIT License

6 votes

def to_timestamp(col, format=None):
    """Converts a :class:`Column` of :class:`pyspark.sql.types.StringType` or
    :class:`pyspark.sql.types.TimestampType` into :class:`pyspark.sql.types.DateType`
    using the optionally specified format. Specify formats according to
    `SimpleDateFormats <http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html>`_.
    By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType` if the format
    is omitted (equivalent to ``col.cast("timestamp")``).

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_timestamp(df.t).alias('dt')).collect()
    [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_timestamp(df.t, 'yyyy-MM-dd HH:mm:ss').alias('dt')).collect()
    [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
    """
    sc = SparkContext._active_spark_context
    if format is None:
        jc = sc._jvm.functions.to_timestamp(_to_java_column(col))
    else:
        jc = sc._jvm.functions.to_timestamp(_to_java_column(col), format)
    return Column(jc)

Source File: functions.py From LearningApacheSpark with MIT License

6 votes

def rand(seed=None):
    """Generates a random column with independent and identically distributed (i.i.d.) samples
    from U[0.0, 1.0].

    .. note:: The function is non-deterministic in general case.

    >>> df.withColumn('rand', rand(seed=42) * 3).collect()
    [Row(age=2, name=u'Alice', rand=1.1568609015300986),
     Row(age=5, name=u'Bob', rand=1.403379671529166)]
    """
    sc = SparkContext._active_spark_context
    if seed is not None:
        jc = sc._jvm.functions.rand(seed)
    else:
        jc = sc._jvm.functions.rand()
    return Column(jc)

Source File: functions.py From LearningApacheSpark with MIT License

6 votes

def randn(seed=None):
    """Generates a column with independent and identically distributed (i.i.d.) samples from
    the standard normal distribution.

    .. note:: The function is non-deterministic in general case.

    >>> df.withColumn('randn', randn(seed=42)).collect()
    [Row(age=2, name=u'Alice', randn=-0.7556247885860078),
    Row(age=5, name=u'Bob', randn=-0.0861619008451133)]
    """
    sc = SparkContext._active_spark_context
    if seed is not None:
        jc = sc._jvm.functions.randn(seed)
    else:
        jc = sc._jvm.functions.randn()
    return Column(jc)

Source File: functions.py From LearningApacheSpark with MIT License

6 votes

def when(condition, value):
    """Evaluates a list of conditions and returns one of multiple possible result expressions.
    If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.

    :param condition: a boolean :class:`Column` expression.
    :param value: a literal value, or a :class:`Column` expression.

    >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect()
    [Row(age=3), Row(age=4)]

    >>> df.select(when(df.age == 2, df.age + 1).alias("age")).collect()
    [Row(age=3), Row(age=None)]
    """
    sc = SparkContext._active_spark_context
    if not isinstance(condition, Column):
        raise TypeError("condition should be a Column")
    v = value._jc if isinstance(value, Column) else value
    jc = sc._jvm.functions.when(condition._jc, v)
    return Column(jc)

Source File: functions.py From LearningApacheSpark with MIT License

6 votes

def unix_timestamp(timestamp=None, format='yyyy-MM-dd HH:mm:ss'):
    """
    Convert time string with given pattern ('yyyy-MM-dd HH:mm:ss', by default)
    to Unix time stamp (in seconds), using the default timezone and the default
    locale, return null if fail.

    if `timestamp` is None, then it returns current timestamp.

    >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
    >>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> time_df.select(unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).collect()
    [Row(unix_time=1428476400)]
    >>> spark.conf.unset("spark.sql.session.timeZone")
    """
    sc = SparkContext._active_spark_context
    if timestamp is None:
        return Column(sc._jvm.functions.unix_timestamp())
    return Column(sc._jvm.functions.unix_timestamp(_to_java_column(timestamp), format))

Source File: feature.py From LearningApacheSpark with MIT License

6 votes

def from_labels(cls, labels, inputCol, outputCol=None, handleInvalid=None):
        """
        Construct the model directly from an array of label strings,
        requires an active SparkContext.
        """
        sc = SparkContext._active_spark_context
        java_class = sc._gateway.jvm.java.lang.String
        jlabels = StringIndexerModel._new_java_array(labels, java_class)
        model = StringIndexerModel._create_from_java_class(
            "org.apache.spark.ml.feature.StringIndexerModel", jlabels)
        model.setInputCol(inputCol)
        if outputCol is not None:
            model.setOutputCol(outputCol)
        if handleInvalid is not None:
            model.setHandleInvalid(handleInvalid)
        return model

Source File: feature.py From LearningApacheSpark with MIT License

6 votes

def from_vocabulary(cls, vocabulary, inputCol, outputCol=None, minTF=None, binary=None):
        """
        Construct the model directly from a vocabulary list of strings,
        requires an active SparkContext.
        """
        sc = SparkContext._active_spark_context
        java_class = sc._gateway.jvm.java.lang.String
        jvocab = CountVectorizerModel._new_java_array(vocabulary, java_class)
        model = CountVectorizerModel._create_from_java_class(
            "org.apache.spark.ml.feature.CountVectorizerModel", jvocab)
        model.setInputCol(inputCol)
        if outputCol is not None:
            model.setOutputCol(outputCol)
        if minTF is not None:
            model.setMinTF(minTF)
        if binary is not None:
            model.setBinary(binary)
        model._set(vocabSize=len(vocabulary))
        return model

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def date_trunc(format, timestamp):
    """
    Returns timestamp truncated to the unit specified by the format.

    :param format: 'year', 'yyyy', 'yy', 'month', 'mon', 'mm',
        'day', 'dd', 'hour', 'minute', 'second', 'week', 'quarter'

    >>> df = spark.createDataFrame([('1997-02-28 05:02:11',)], ['t'])
    >>> df.select(date_trunc('year', df.t).alias('year')).collect()
    [Row(year=datetime.datetime(1997, 1, 1, 0, 0))]
    >>> df.select(date_trunc('mon', df.t).alias('month')).collect()
    [Row(month=datetime.datetime(1997, 2, 1, 0, 0))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.date_trunc(format, _to_java_column(timestamp)))

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def add_months(start, months):
    """
    Returns the date that is `months` months after `start`

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(add_months(df.dt, 1).alias('next_month')).collect()
    [Row(next_month=datetime.date(2015, 5, 8))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.add_months(_to_java_column(start), months))

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def next_day(date, dayOfWeek):
    """
    Returns the first date which is later than the value of the date column.

    Day of the week parameter is case insensitive, and accepts:
        "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun".

    >>> df = spark.createDataFrame([('2015-07-27',)], ['d'])
    >>> df.select(next_day(df.d, 'Sun').alias('date')).collect()
    [Row(date=datetime.date(2015, 8, 2))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.next_day(_to_java_column(date), dayOfWeek))

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def year(col):
    """
    Extract the year of a given date as integer.

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(year('dt').alias('year')).collect()
    [Row(year=2015)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.year(_to_java_column(col)))

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def current_timestamp():
    """
    Returns the current timestamp as a :class:`TimestampType` column.
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.current_timestamp())

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def current_date():
    """
    Returns the current date as a :class:`DateType` column.
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.current_date())

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def hour(col):
    """
    Extract the hours of a given date as integer.

    >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['ts'])
    >>> df.select(hour('ts').alias('hour')).collect()
    [Row(hour=13)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.hour(_to_java_column(col)))

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def lead(col, count=1, default=None):
    """
    Window function: returns the value that is `offset` rows after the current row, and
    `defaultValue` if there is less than `offset` rows after the current row. For example,
    an `offset` of one will return the next row at any given point in the window partition.

    This is equivalent to the LEAD function in SQL.

    :param col: name of column or expression
    :param count: number of row to extend
    :param default: default value
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.lead(_to_java_column(col), count, default))

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def lag(col, count=1, default=None):
    """
    Window function: returns the value that is `offset` rows before the current row, and
    `defaultValue` if there is less than `offset` rows before the current row. For example,
    an `offset` of one will return the previous row at any given point in the window partition.

    This is equivalent to the LAG function in SQL.

    :param col: name of column or expression
    :param count: number of row to extend
    :param default: default value
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.lag(_to_java_column(col), count, default))

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def conv(col, fromBase, toBase):
    """
    Convert a number in a string column from one base to another.

    >>> df = spark.createDataFrame([("010101",)], ['n'])
    >>> df.select(conv(df.n, 2, 16).alias('hex')).collect()
    [Row(hex=u'15')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.conv(_to_java_column(col), fromBase, toBase))

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def log2(col):
    """Returns the base-2 logarithm of the argument.

    >>> spark.createDataFrame([(4,)], ['a']).select(log2('a').alias('log2')).collect()
    [Row(log2=2.0)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.log2(_to_java_column(col)))

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def least(*cols):
    """
    Returns the least value of the list of column names, skipping null values.
    This function takes at least 2 parameters. It will return null iff all parameters are null.

    >>> df = spark.createDataFrame([(1, 4, 3)], ['a', 'b', 'c'])
    >>> df.select(least(df.a, df.b, df.c).alias("least")).collect()
    [Row(least=1)]
    """
    if len(cols) < 2:
        raise ValueError("least should take at least two columns")
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.least(_to_seq(sc, cols, _to_java_column)))

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def struct(*cols):
    """Creates a new struct column.

    :param cols: list of column names (string) or list of :class:`Column` expressions

    >>> df.select(struct('age', 'name').alias("struct")).collect()
    [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
    >>> df.select(struct([df.age, df.name]).alias("struct")).collect()
    [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
    """
    sc = SparkContext._active_spark_context
    if len(cols) == 1 and isinstance(cols[0], (list, set)):
        cols = cols[0]
    jc = sc._jvm.functions.struct(_to_seq(sc, cols, _to_java_column))
    return Column(jc)

Source File: functions.py From LearningApacheSpark with MIT License

5 votes

def expr(str):
    """Parses the expression string into the column that it represents

    >>> df.select(expr("length(name)")).collect()
    [Row(length(name)=5), Row(length(name)=3)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.expr(str))

Python pyspark.SparkContext._active_spark_context() Examples