Python pyspark.sql.types.LongType() Examples

Example #1
def numpy_column_op(f):
    """Decorate ``f`` as a column operation that tolerates NumPy arguments.

    The returned wrapper converts arguments PySpark cannot handle into plain
    Python values and then delegates to ``column_op(f)``.

    :param f: the function to wrap; it is passed to ``column_op``.
    :return: a method-style wrapper taking ``self`` and positional arguments.
    """
    from functools import wraps

    @wraps(f)
    def wrapper(self, *args):
        # PySpark does not support NumPy type out of the box. For now, we convert NumPy types
        # into some primitive types understandable in PySpark.
        new_args = []
        for arg in args:
            # TODO: This is a quick hack to support NumPy type. We should revisit this.
            if isinstance(self.spark.data_type, LongType) and isinstance(arg, np.timedelta64):
                # Express the timedelta as a number of seconds.
                new_args.append(float(arg / np.timedelta64(1, "s")))
            else:
                # Every other argument must be forwarded untouched; without this
                # branch non-timedelta operands were silently dropped.
                new_args.append(arg)
        return column_op(f)(self, *new_args)

    return wrapper
Example #2
def infer_schema(example, binary_features=[]):
  """Given a tf.train.Example, infer the Spark DataFrame schema (StructFields).

  Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
  disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a "hint"
  from the caller in the ``binary_features`` argument.

  Args:
    :example: a tf.train.Example
    :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

  Returns:
    A DataFrame StructType schema
  """
  # NOTE: the mutable default for ``binary_features`` is only read (membership test), never mutated.
  def _infer_sql_type(k, v):
    # special handling for binary features
    if k in binary_features:
      return BinaryType()

    if v.int64_list.value:
      result = v.int64_list.value
      sql_type = LongType()
    elif v.float_list.value:
      result = v.float_list.value
      sql_type = DoubleType()
    else:
      # Neither ints nor floats: fall back to the bytes list (also covers empty features).
      result = v.bytes_list.value
      sql_type = StringType()

    if len(result) > 1:             # represent multi-item tensors as Spark SQL ArrayType() of base types
      return ArrayType(sql_type)
    else:                           # represent everything else as base types (and empty tensors as StringType())
      return sql_type

  # Fields are emitted in sorted feature-name order so the schema is deterministic.
  return StructType([StructField(k, _infer_sql_type(k, v), True) for k, v in sorted(example.features.feature.items())])
Example #3
def vertices_assign_ids(self, sc, sqlc, edges):
        # Builds an `ids` DataFrame from the union of `source` and `target`, gives it
        # an `id` column, writes it out and returns it.
        #
        # NOTE(review): this listing is incomplete and does not parse as shown. Several
        # statements have lost their receiver or closing brackets (e.g. the right-hand
        # sides of `source`/`target`, the `sqlf.udf(...)` call, `id_rdd`, `id_schema`,
        # and the save-as-text write chain), and the branch taken when
        # `vertex_partitions != 1` appears to have lost its `else:` line.
        # Restore the missing pieces from the upstream source before relying on it.
        #
        # What the visible code establishes:
        #   * with `self.args.validate_host_names`, `ids` is filtered through a UDF built
        #     from `HostLinksToGraph.reverse_host_is_valid`;
        #   * with a single vertex partition, rows are coalesced, sorted by `name` and
        #     numbered with `sqlf.monotonically_increasing_id()`;
        #   * an explicit (name: string, id: long) schema is used with
        #     `sqlc.createDataFrame(id_rdd, ...)`;
        #   * the result is saved as table `<self.args.output>_vertices` using the
        #     configured output format and compression.
        source ='name'))
        target ='name'))

        ids = source.union(target) \

        if self.args.validate_host_names:
            is_valid = sqlf.udf(HostLinksToGraph.reverse_host_is_valid,
            ids = ids.filter(is_valid(

        if self.args.vertex_partitions == 1:
            ids = ids \
                    .coalesce(1) \
                    .sort('name') \
                    .withColumn('id', sqlf.monotonically_increasing_id())
            id_rdd = \
                        .map(lambda row: tuple(row)[0]) \
                        .sortBy(lambda x: x, True,
                                self.args.vertex_partitions) \
            id_schema = StructType([
                StructField("name", StringType(), True),
                StructField("id", LongType(), True)
            ids = sqlc.createDataFrame(id_rdd, schema=id_schema)

        if self.args.save_as_text is not None:
            ids = ids.persist()
  '\t',, \
                .write \
                .text(os.path.join(self.args.save_as_text, "vertices"),
        ids.write \
           .format(self.args.output_format) \
           .option("compression", self.args.output_compression) \
           .saveAsTable(self.args.output + '_vertices')

        return ids 
Example #4
def test_spark_udf(spark, model_path):
    """Check that ``spark_udf`` matches the pyfunc model for every supported result type.

    For each entry of ``type_map`` the UDF is built with the Spark type object
    (scalar and array form) and, for scalars, also with the type name string;
    its output is compared with the predictions of the reloaded pyfunc model.
    """
    reloaded_pyfunc_model = mlflow.pyfunc.load_pyfunc(model_path)

    pandas_df = pd.DataFrame(data=np.ones((10, 10)), columns=[str(i) for i in range(10)])
    spark_df = spark.createDataFrame(pandas_df)

    # Test all supported return types
    # NOTE(review): the second element of the "long" entry was missing in the listing.
    # The builtin ``int`` is used here (what the former ``np.int`` alias resolved to) —
    # confirm against upstream.
    type_map = {"float": (FloatType(), np.number),
                "int": (IntegerType(), np.int32),
                "double": (DoubleType(), np.number),
                "long": (LongType(), int),
                "string": (StringType(), None)}

    for tname, tdef in type_map.items():
        spark_type, np_type = tdef
        prediction_df = reloaded_pyfunc_model.predict(pandas_df)
        for is_array in [True, False]:
            t = ArrayType(spark_type) if is_array else spark_type
            if tname == "string":
                expected = prediction_df.applymap(str)
            else:
                # Keep only the columns of the matching dtype; without this ``else``
                # the string expectation was always overwritten.
                expected = prediction_df.select_dtypes(np_type)
                if tname == "float":
                    expected = expected.astype(np.float32)

            # Array results compare whole rows, scalar results only the first column.
            expected = [list(row[1]) if is_array else row[1][0] for row in expected.iterrows()]
            pyfunc_udf = spark_udf(spark, model_path, result_type=t)
            new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
            actual = list(new_df.select("prediction").toPandas()['prediction'])
            assert expected == actual
            if not is_array:
                # Scalar types can also be requested by name.
                pyfunc_udf = spark_udf(spark, model_path, result_type=tname)
                new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
                actual = list(new_df.select("prediction").toPandas()['prediction'])
                assert expected == actual
Example #5
def read(self, file_path, spark_session, indexcol=0, schema=None):
    """
    Creates a dataframe from the csv file

    :param indexcol: if 0, the dataframe is returned as read; otherwise a
        tuple id column is appended as auto increment (starting at 1)
    :param schema: optional schema of file if known
    :param spark_session: The spark_session we created in Holoclean object
    :param file_path: The path to the file

    :return: dataframe
    """
    # ``reduce`` is not a builtin on Python 3; importing it here works on both 2 and 3.
    from functools import reduce

    # NOTE(review): the two read calls and the per-column StructField arguments were
    # truncated in the listing; they are restored below — confirm against upstream.
    if schema is None:
        df = spark_session.read.csv(file_path, header=True)
    else:
        df = spark_session.read.csv(file_path, header=True, schema=schema)

    if indexcol == 0:
        return df

    index_name = GlobalVariables.index_name

    new_cols = df.schema.names + [index_name]

    # Temporary positional names ("_0", "_1", ...) keep each column's data type;
    # the trailing LongType field receives the generated row index.
    list_schema = [
        StructField("_" + str(position), field.dataType, True)
        for position, field in enumerate(df.schema.fields)
    ]
    list_schema.append(
        StructField("_" + str(len(new_cols)), LongType(), True))

    schema = StructType(list_schema)
    # zipWithIndex yields (row, index) pairs; tuple-parameter lambdas are a syntax
    # error on Python 3, so the pair is indexed explicitly. Index is made 1-based.
    ix_df = df.rdd.zipWithIndex().map(
        lambda row_ix: row_ix[0] + (row_ix[1] + 1,)).toDF(schema)
    tmp_cols = ix_df.schema.names
    # Rename every temporary column back to its final name, one at a time.
    new_df = reduce(
        lambda data, idx: data.withColumnRenamed(tmp_cols[idx], new_cols[idx]),
        range(len(tmp_cols)), ix_df)
    new_df = self.checking_string_size(new_df)
    return new_df
Example #6
def len(self) -> "ks.Series":
    """
    Computes the length of each element in the Series.

    The element may be a sequence (such as a string, tuple or list).

    Returns
    -------
    Series of int
        A Series of integer values indicating the length of each element in
        the Series.

    Examples
    --------
    Returns the length (number of characters) in a string. Returns the
    number of entries for lists or tuples.

    >>> s1 = ks.Series(['dog', 'monkey'])
    >>> s1.str.len()
    0    3
    1    6
    Name: 0, dtype: int64

    >>> s2 = ks.Series([["a", "b", "c"], []])
    >>> s2.str.len()
    0    3
    1    0
    Name: 0, dtype: int64
    """
    # NOTE(review): the ``alias`` argument was missing in the listing;
    # ``self._data.name`` is restored here — confirm against upstream Koalas.
    if isinstance(self._data.spark.data_type, (ArrayType, MapType)):
        # Collections: number of entries.
        return column_op(lambda c: F.size(c).cast(LongType()))(self._data).alias(
            self._data.name
        )
    else:
        # Everything else is treated as a string: number of characters.
        return column_op(lambda c: F.length(c).cast(LongType()))(self._data).alias(
            self._data.name
        )
Example #7
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system

    :param tpe: a Python/NumPy type object or one of the recognised type-name strings.
    :return: the matching ``pyspark.sql.types`` instance.
    :raises TypeError: if ``tpe`` is not one of the recognised types.
    """
    # The former NumPy aliases ``np.int``, ``np.float`` and ``np.bool`` were plain
    # aliases of the builtins (already listed) and no longer exist in current NumPy,
    # so they are intentionally not referenced here.
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float"):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        # NOTE(review): the tuple was empty in the listing; ``datetime.date`` restored.
        return types.DateType()
    elif tpe in (bool, "boolean", "bool"):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
Example #8
def _select_rows_by_iterable(
    self, rows_sel: Iterable
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
    """Build a row filter for positional selection by an iterable of integers.

    :param rows_sel: iterable of integer positions; negative values count from the end.
    :return: ``(condition, None, None)`` where ``condition`` is the OR of equality
        checks between the sequence column and each requested position
        (a literal ``False`` when nothing is selected).
    :raises TypeError: if any key is not an integer.
    :raises NotImplementedError: if the normalised positions contain duplicates.
    """
    sdf = self._internal.spark_frame
    integer_types = (int, np.int64, np.int32)

    # Materialise once: ``rows_sel`` is scanned twice below, and a one-shot iterator
    # would otherwise be exhausted by the first scan.
    keys = list(rows_sel)

    # Counting rows triggers a Spark job, so only do it when a negative index needs it.
    if any(isinstance(key, integer_types) and key < 0 for key in keys):
        offset = sdf.count()
    else:
        offset = 0

    new_rows_sel = []
    for key in keys:
        if not isinstance(key, integer_types):
            raise TypeError(
                "cannot do positional indexing with these indexers [{}] of {}".format(
                    key, type(key)
                )
            )
        if key < 0:
            key = key + offset
        new_rows_sel.append(key)

    if len(new_rows_sel) != len(set(new_rows_sel)):
        raise NotImplementedError(
            "Duplicated row selection is not currently supported; "
            "however, normalised index was [%s]" % new_rows_sel
        )

    sequence_scol = sdf[self._sequence_col]
    cond = [sequence_scol == F.lit(int(key)).cast(LongType()) for key in new_rows_sel]

    if not cond:
        # Empty selection: a condition that matches no row.
        cond = [F.lit(False)]
    return reduce(lambda x, y: x | y, cond), None, None
Example #9
def week(self) -> "ks.Series":
    """
    The week ordinal of the year.
    """
    # NOTE(review): the ``alias`` argument was missing in the listing;
    # ``self._data.name`` is restored here — confirm against upstream Koalas.
    return column_op(lambda c: F.weekofyear(c).cast(LongType()))(self._data).alias(
        self._data.name
    )
Example #10
def minute(self) -> "ks.Series":
    """
    The minutes of the datetime.
    """
    # NOTE(review): the ``alias`` argument was missing in the listing;
    # ``self._data.name`` is restored here — confirm against upstream Koalas.
    return column_op(lambda c: F.minute(c).cast(LongType()))(self._data).alias(
        self._data.name
    )
Example #11
def hour(self) -> "ks.Series":
    """
    The hours of the datetime.
    """
    # NOTE(review): the ``alias`` argument was missing in the listing;
    # ``self._data.name`` is restored here — confirm against upstream Koalas.
    return column_op(lambda c: F.hour(c).cast(LongType()))(self._data).alias(
        self._data.name
    )
Example #12
def day(self) -> "ks.Series":
    """
    The days of the datetime.
    """
    # NOTE(review): the ``alias`` argument was missing in the listing;
    # ``self._data.name`` is restored here — confirm against upstream Koalas.
    return column_op(lambda c: F.dayofmonth(c).cast(LongType()))(self._data).alias(
        self._data.name
    )
Example #13
def month(self) -> "ks.Series":
    """
    The month of the timestamp as January = 1 December = 12.
    """
    # NOTE(review): the ``alias`` argument was missing in the listing;
    # ``self._data.name`` is restored here — confirm against upstream Koalas.
    return column_op(lambda c: F.month(c).cast(LongType()))(self._data).alias(
        self._data.name
    )
Example #14
def year(self) -> "ks.Series":
    """
    The year of the datetime.
    """
    # NOTE(review): the ``alias`` argument was missing in the listing;
    # ``self._data.name`` is restored here — confirm against upstream Koalas.
    return column_op(lambda c: F.year(c).cast(LongType()))(self._data).alias(
        self._data.name
    )
Example #15
def encode(self, unischema_field, value):
    """Convert a scalar ``value`` to the Python type matching ``self._spark_type``.

    :param unischema_field: field descriptor; its ``shape`` must be empty (scalar)
        and its ``name`` is used in error messages.
    :param value: a scalar or a NumPy array with ``shape == ()``.
    :return: ``int``, ``float``, ``bool`` or ``str`` depending on the Spark type;
        the value itself for any other Spark type.
    :raises TypeError: if ``value`` is a sized, non-string object.
    :raises ValueError: if the field declares a non-empty shape, or a string field
        receives a non-string value.
    """
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader). We should move all pyspark related code into a separate module
    import pyspark.sql.types as sql_types

    # We treat ndarrays with shape=() as scalars
    unsized_numpy_array = isinstance(value, np.ndarray) and value.shape == ()
    # Validate the input to be a scalar (or an unsized numpy array)
    if not unsized_numpy_array and hasattr(value, '__len__') and (not isinstance(value, str)):
        raise TypeError('Expected a scalar as a value for field \'{}\'. '
                        'Got a non-numpy type\'{}\''.format(unischema_field.name, type(value)))

    if unischema_field.shape:
        # The placeholders must be interpolated explicitly: exceptions, unlike
        # logging calls, do not apply %-formatting to extra positional arguments.
        raise ValueError('The shape field of unischema_field \'%s\' must be an empty tuple (i.e. \'()\' '
                         'to indicate a scalar. However, the actual shape is %s'
                         % (unischema_field.name, unischema_field.shape))
    if isinstance(self._spark_type, (sql_types.ByteType, sql_types.ShortType, sql_types.IntegerType,
                                     sql_types.LongType)):
        return int(value)
    if isinstance(self._spark_type, (sql_types.FloatType, sql_types.DoubleType)):
        return float(value)
    if isinstance(self._spark_type, sql_types.BooleanType):
        return bool(value)
    if isinstance(self._spark_type, sql_types.StringType):
        if not isinstance(value, str):
            raise ValueError(
                'Expected a string value for field {}. Got type {}'.format(unischema_field.name, type(value)))
        return str(value)

    return value
Example #16
def test_create_schema_view_fails_validate():
    """ Exercises code paths unischema.create_schema_view ValueError, and unischema.__str__."""
    # ``np.bytes_`` is the same type the former ``np.string_`` alias referred to;
    # the alias no longer exists in NumPy 2.
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.bytes_, (), ScalarCodec(StringType()), False),
    ])
    # A field that is not part of TestSchema must be rejected when building a view.
    with pytest.raises(ValueError, match='does not belong to the schema'):
        TestSchema.create_schema_view([UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)])
Example #17
def test_invalid_schema_field(synthetic_dataset, reader_factory):
    """Reading with a schema field unknown to the dataset must raise a ValueError naming it."""
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. Would expect to get a reasonable error message
    # ``np.bytes_`` is the same type the former ``np.string_`` alias referred to;
    # the alias no longer exists in NumPy 2.
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.bytes_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError, match='bogus_key'):
        # NOTE(review): the tail of this call was truncated in the listing; the
        # remaining keyword arguments are restored from the upstream test as
        # remembered — confirm before relying on them.
        reader_factory(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(),
                       shuffle_row_groups=False,
                       predicate=EqualPredicate(expected_values))
Example #18
# Source: petastorm (Apache License 2.0)
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary inorder to avoid instantiation
    of multiple objects in each call."""

    # Refer to the attribute of the function we use to cache the map using a name in the variable instead of a 'dot'
    # notation to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        # Imported lazily so that merely importing this module does not require pyspark.
        import pyspark.sql.types as T

        # ``np.bytes_`` / ``np.str_`` are the types the former ``np.string_`` /
        # ``np.unicode_`` aliases referred to on Python 3; the aliases no longer
        # exist in NumPy 2, so the canonical names are used as keys.
        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    # Unsigned types map to the next wider signed Spark type.
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.bytes_: T.StringType(),
                    np.str_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)

# TODO: Changing fields in this class or the UnischemaField will break reading due to the schema being pickled next to
# the dataset on disk 
Example #19
def schema(self):
    """
        Return the data type that represents a row from the received data list.
    """
    from pyspark.sql.types import IntegerType, LongType, StringType, StructField, StructType

    # All fields are nullable. Everything is a string except `duration` (long)
    # and the byte counters `scbytes` / `csbytes` (integer).
    return StructType(
        [
            StructField('p_date', StringType(), True),
            StructField('p_time', StringType(), True),
            StructField('clientip', StringType(), True),
            StructField('host', StringType(), True),
            StructField('reqmethod', StringType(), True),
            StructField('useragent', StringType(), True),
            StructField('resconttype', StringType(), True),
            StructField('duration', LongType(), True),
            StructField('username', StringType(), True),
            StructField('authgroup', StringType(), True),
            StructField('exceptionid', StringType(), True),
            StructField('filterresult', StringType(), True),
            StructField('webcat', StringType(), True),
            StructField('referer', StringType(), True),
            StructField('respcode', StringType(), True),
            StructField('action', StringType(), True),
            StructField('urischeme', StringType(), True),
            StructField('uriport', StringType(), True),
            StructField('uripath', StringType(), True),
            StructField('uriquery', StringType(), True),
            StructField('uriextension', StringType(), True),
            StructField('serverip', StringType(), True),
            StructField('scbytes', IntegerType(), True),
            StructField('csbytes', IntegerType(), True),
            StructField('virusid', StringType(), True),
            StructField('bcappname', StringType(), True),
            StructField('bcappoper', StringType(), True),
            StructField('fulluri', StringType(), True),
            StructField('y', StringType(), True),
            StructField('m', StringType(), True),
            StructField('d', StringType(), True),
            StructField('h', StringType(), True),
        ]
    )
Example #20
def transform(longitudinal_addons):
    # Derives co-installation data from `longitudinal_addons.installed_addons` and
    # returns `addon_co_installations_collapsed`.
    #
    # NOTE(review): this listing is incomplete and does not parse as shown. Several
    # parenthesised expressions have lost their leading call and/or closing lines
    # (`guid_set_unique`, `addon_co_installations`, the `F.udf(...)` call and
    # `addon_co_installations_collapsed`), and two bare string literals look like
    # the arguments of calls whose callee is missing — presumably logging; confirm.
    # Restore the missing pieces from the upstream source before relying on it.
    #
    # What the visible code establishes:
    #   * `restructured` is built by flat-mapping `key_all(x.installed_addons)` over
    #     the input RDD into columns `key_addon` / `coinstalled_addons`;
    #   * pairs are grouped by (`key_addon`, `coinstalled_addon`);
    #   * `combine_and_map_cols` is a UDF returning a struct (id: string, n: long)
    #     and is applied to the `coinstalled_addon` and `count` columns.
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
            "exploded", F.explode(longitudinal_addons.installed_addons)
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (  # noqa: E128
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
    )"Collecting final result of co-installations.")

    return addon_co_installations_collapsed 
Example #21
def transform(longitudinal_addons):
    # Derives co-installation data from `longitudinal_addons.installed_addons` and
    # returns `addon_co_installations_collapsed`.
    #
    # NOTE(review): this listing is incomplete and does not parse as shown. Several
    # parenthesised expressions have lost their leading call and/or closing lines
    # (`guid_set_unique`, `addon_co_installations`, the `F.udf(...)` call and
    # `addon_co_installations_collapsed`), and two bare string literals look like
    # the arguments of calls whose callee is missing — presumably logging; confirm.
    # Restore the missing pieces from the upstream source before relying on it.
    #
    # What the visible code establishes:
    #   * `restructured` is built by flat-mapping `key_all(x.installed_addons)` over
    #     the input RDD into columns `key_addon` / `coinstalled_addons`;
    #   * pairs are grouped by (`key_addon`, `coinstalled_addon`);
    #   * `combine_and_map_cols` is a UDF returning a struct (id: string, n: long)
    #     and is applied to the `coinstalled_addon` and `count` columns.
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
            "exploded", F.explode(longitudinal_addons.installed_addons)
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (  # noqa: E128
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
    )"Collecting final result of co-installations.")

    return addon_co_installations_collapsed 
Example #22
Source File:    From python_mozetl with MIT License 4 votes vote down vote up
    """"Generate a schema for sync_summary. This subset contains enough
    structure for testing bookmark validation. The schema is derived from

    failure_type = StructType([StructField("name", StringType(), False)])

    status_type = StructType([StructField("sync", StringType(), True)])

    validation_problems = StructType(
            StructField("name", StringType(), False),
            StructField("count", LongType(), False),

    validation_type = StructType(
            StructField("version", LongType(), False),
            StructField("checked", LongType(), False),
            StructField("took", LongType(), False),
            StructField("problems", ArrayType(validation_problems, False), True),

    engine_type = StructType(
            StructField("name", StringType(), False),
            StructField("status", StringType(), False),
            StructField("failure_reason", failure_type, True),
            StructField("validation", validation_type, True),

    return StructType(
            StructField("app_build_id", StringType(), True),
            StructField("app_version", StringType(), True),
            StructField("app_display_version", StringType(), True),
            StructField("app_name", StringType(), True),
            StructField("app_channel", StringType(), True),
            StructField("uid", StringType(), False),
            StructField("device_id", StringType(), True),
            StructField("when", LongType(), False),
            StructField("failure_reason", failure_type, True),
            StructField("status", status_type, False),
            StructField("engines", ArrayType(engine_type, False), True),
            StructField("submission_date_s3", StringType(), False),