Python pyspark.sql.types.LongType() Examples
The following are 22 code examples of pyspark.sql.types.LongType(), drawn from open-source projects. The originating project and source file are noted above each example. You may also want to look at the other functions and classes available in the pyspark.sql.types module.
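Before the project examples, here is a minimal sketch of the two ways LongType() most commonly appears in the snippets below: declaring a 64-bit integer column in an explicit schema, and casting an existing column to a long. The DataFrame contents and column names are made up purely for illustration.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import LongType, StringType, StructField, StructType

spark = SparkSession.builder.master("local[1]").appName("longtype-demo").getOrCreate()

# Declare a 64-bit integer column explicitly in a schema.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("id", LongType(), True),
])
df = spark.createDataFrame([("a", 1), ("b", 2)], schema=schema)

# Cast a derived column to LongType so the schema stays a 64-bit integer.
df = df.withColumn("id_plus_one", (F.col("id") + 1).cast(LongType()))
df.printSchema()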
Example #1
Source File: base.py From koalas with Apache License 2.0

def numpy_column_op(f):
    @wraps(f)
    def wrapper(self, *args):
        # PySpark does not support NumPy type out of the box. For now, we convert NumPy types
        # into some primitive types understandable in PySpark.
        new_args = []
        for arg in args:
            # TODO: This is a quick hack to support NumPy type. We should revisit this.
            if isinstance(self.spark.data_type, LongType) and isinstance(arg, np.timedelta64):
                new_args.append(float(arg / np.timedelta64(1, "s")))
            else:
                new_args.append(arg)
        return column_op(f)(self, *new_args)

    return wrapper
Example #2
Source File: dfutil.py From TensorFlowOnSpark with Apache License 2.0

def infer_schema(example, binary_features=[]):
    """Given a tf.train.Example, infer the Spark DataFrame schema (StructFields).

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a
    "hint" from the caller in the ``binary_features`` argument.

    Args:
      :example: a tf.train.Example
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      A DataFrame StructType schema
    """
    def _infer_sql_type(k, v):
        # special handling for binary features
        if k in binary_features:
            return BinaryType()

        if v.int64_list.value:
            result = v.int64_list.value
            sql_type = LongType()
        elif v.float_list.value:
            result = v.float_list.value
            sql_type = DoubleType()
        else:
            result = v.bytes_list.value
            sql_type = StringType()

        if len(result) > 1:     # represent multi-item tensors as Spark SQL ArrayType() of base types
            return ArrayType(sql_type)
        else:                   # represent everything else as base types (and empty tensors as StringType())
            return sql_type

    return StructType([StructField(k, _infer_sql_type(k, v), True)
                       for k, v in sorted(example.features.feature.items())])
Example #3
Source File: hostlinks_to_graph.py From cc-pyspark with MIT License

def vertices_assign_ids(self, sc, sqlc, edges):
    source = edges.select(edges.s.alias('name'))
    target = edges.select(edges.t.alias('name'))
    ids = source.union(target) \
        .distinct()
    if self.args.validate_host_names:
        is_valid = sqlf.udf(HostLinksToGraph.reverse_host_is_valid,
                            BooleanType())
        ids = ids.filter(is_valid(ids.name))

    if self.args.vertex_partitions == 1:
        ids = ids \
            .coalesce(1) \
            .sort('name') \
            .withColumn('id', sqlf.monotonically_increasing_id())
    else:
        id_rdd = ids.select(ids.name).rdd \
            .map(lambda row: tuple(row)[0]) \
            .sortBy(lambda x: x, True, self.args.vertex_partitions) \
            .zipWithIndex()
        id_schema = StructType([
            StructField("name", StringType(), True),
            StructField("id", LongType(), True)
        ])
        ids = sqlc.createDataFrame(id_rdd, schema=id_schema)

    if self.args.save_as_text is not None:
        ids = ids.persist()
        ids.select(sqlf.concat_ws('\t', ids.id, ids.name)) \
            .write \
            .text(os.path.join(self.args.save_as_text, "vertices"),
                  compression="gzip")
    ids.write \
        .format(self.args.output_format) \
        .option("compression", self.args.output_compression) \
        .saveAsTable(self.args.output + '_vertices')
    return ids
Example #4
Source File: test_spark.py From mlflow with Apache License 2.0

def test_spark_udf(spark, model_path):
    mlflow.pyfunc.save_model(
        path=model_path,
        loader_module=__name__,
        code_path=[os.path.dirname(tests.__file__)],
    )
    reloaded_pyfunc_model = mlflow.pyfunc.load_pyfunc(model_path)

    pandas_df = pd.DataFrame(data=np.ones((10, 10)), columns=[str(i) for i in range(10)])
    spark_df = spark.createDataFrame(pandas_df)

    # Test all supported return types
    type_map = {"float": (FloatType(), np.number),
                "int": (IntegerType(), np.int32),
                "double": (DoubleType(), np.number),
                "long": (LongType(), np.int),
                "string": (StringType(), None)}

    for tname, tdef in type_map.items():
        spark_type, np_type = tdef
        prediction_df = reloaded_pyfunc_model.predict(pandas_df)
        for is_array in [True, False]:
            t = ArrayType(spark_type) if is_array else spark_type
            if tname == "string":
                expected = prediction_df.applymap(str)
            else:
                expected = prediction_df.select_dtypes(np_type)
                if tname == "float":
                    expected = expected.astype(np.float32)

            expected = [list(row[1]) if is_array else row[1][0] for row in expected.iterrows()]

            pyfunc_udf = spark_udf(spark, model_path, result_type=t)
            new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
            actual = list(new_df.select("prediction").toPandas()['prediction'])
            assert expected == actual

            if not is_array:
                pyfunc_udf = spark_udf(spark, model_path, result_type=tname)
                new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
                actual = list(new_df.select("prediction").toPandas()['prediction'])
                assert expected == actual
Example #5
Source File: reader.py From HoloClean-Legacy-deprecated with Apache License 2.0

def read(self, file_path, spark_session, indexcol=0, schema=None):
    """
    Creates a dataframe from the csv file

    :param indexcol: if 1, create a tuple id column as auto increment
    :param schema: optional schema of file if known
    :param spark_session: The spark_session we created in Holoclean object
    :param file_path: The path to the file

    :return: dataframe
    """
    if schema is None:
        df = spark_session.read.csv(file_path, header=True)
    else:
        df = spark_session.read.csv(file_path, header=True, schema=schema)

    if indexcol == 0:
        return df

    index_name = GlobalVariables.index_name

    new_cols = df.schema.names + [index_name]
    list_schema = []
    for index_attribute in range(len(df.schema.names)):
        list_schema.append(StructField("_" + str(index_attribute),
                                       df.schema[index_attribute].dataType, True))
    list_schema.append(
        StructField("_" + str(len(new_cols)), LongType(), True))

    schema = StructType(list_schema)
    # Note: Python 2 only (tuple-unpacking lambda and xrange), as in the original project.
    ix_df = df.rdd.zipWithIndex().map(
        lambda (row, ix): row + (ix + 1,)).toDF(schema)
    tmp_cols = ix_df.schema.names
    new_df = reduce(lambda data, idx: data.withColumnRenamed(tmp_cols[idx],
                                                             new_cols[idx]),
                    xrange(len(tmp_cols)), ix_df)
    new_df = self.checking_string_size(new_df)
    return new_df
Example #6
Source File: strings.py From koalas with Apache License 2.0

def len(self) -> "ks.Series":
    """
    Computes the length of each element in the Series.

    The element may be a sequence (such as a string, tuple or list).

    Returns
    -------
    Series of int
        A Series of integer values indicating the length of each element in
        the Series.

    Examples
    --------
    Returns the length (number of characters) in a string. Returns the
    number of entries for lists or tuples.

    >>> s1 = ks.Series(['dog', 'monkey'])
    >>> s1.str.len()
    0    3
    1    6
    Name: 0, dtype: int64

    >>> s2 = ks.Series([["a", "b", "c"], []])
    >>> s2.str.len()
    0    3
    1    0
    Name: 0, dtype: int64
    """
    if isinstance(self._data.spark.data_type, (ArrayType, MapType)):
        return column_op(lambda c: F.size(c).cast(LongType()))(self._data).alias(
            self._data.name
        )
    else:
        return column_op(lambda c: F.length(c).cast(LongType()))(self._data).alias(
            self._data.name
        )
Example #7
Source File: typehints.py From koalas with Apache License 2.0

def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
Example #8
Source File: indexing.py From koalas with Apache License 2.0

def _select_rows_by_iterable(
    self, rows_sel: Iterable
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
    sdf = self._internal.spark_frame

    if any(isinstance(key, (int, np.int, np.int64, np.int32)) and key < 0 for key in rows_sel):
        offset = sdf.count()
    else:
        offset = 0

    new_rows_sel = []
    for key in list(rows_sel):
        if not isinstance(key, (int, np.int, np.int64, np.int32)):
            raise TypeError(
                "cannot do positional indexing with these indexers [{}] of {}".format(
                    key, type(key)
                )
            )
        if key < 0:
            key = key + offset
        new_rows_sel.append(key)

    if len(new_rows_sel) != len(set(new_rows_sel)):
        raise NotImplementedError(
            "Duplicated row selection is not currently supported; "
            "however, normalised index was [%s]" % new_rows_sel
        )

    sequence_scol = sdf[self._sequence_col]
    cond = []
    for key in new_rows_sel:
        cond.append(sequence_scol == F.lit(int(key)).cast(LongType()))

    if len(cond) == 0:
        cond = [F.lit(False)]
    return reduce(lambda x, y: x | y, cond), None, None
Example #9
Source File: datetimes.py From koalas with Apache License 2.0

def week(self) -> "ks.Series":
    """
    The week ordinal of the year.
    """
    return column_op(lambda c: F.weekofyear(c).cast(LongType()))(self._data).alias(
        self._data.name
    )
Example #10
Source File: datetimes.py From koalas with Apache License 2.0

def minute(self) -> "ks.Series":
    """
    The minutes of the datetime.
    """
    return column_op(lambda c: F.minute(c).cast(LongType()))(self._data).alias(self._data.name)
Example #11
Source File: datetimes.py From koalas with Apache License 2.0

def hour(self) -> "ks.Series":
    """
    The hours of the datetime.
    """
    return column_op(lambda c: F.hour(c).cast(LongType()))(self._data).alias(self._data.name)
Example #12
Source File: datetimes.py From koalas with Apache License 2.0

def day(self) -> "ks.Series":
    """
    The days of the datetime.
    """
    return column_op(lambda c: F.dayofmonth(c).cast(LongType()))(self._data).alias(
        self._data.name
    )
Example #13
Source File: datetimes.py From koalas with Apache License 2.0

def month(self) -> "ks.Series":
    """
    The month of the timestamp as January = 1 December = 12.
    """
    return column_op(lambda c: F.month(c).cast(LongType()))(self._data).alias(self._data.name)
Example #14
Source File: datetimes.py From koalas with Apache License 2.0

def year(self) -> "ks.Series":
    """
    The year of the datetime.
    """
    return column_op(lambda c: F.year(c).cast(LongType()))(self._data).alias(self._data.name)
Example #15
Source File: codecs.py From petastorm with Apache License 2.0

def encode(self, unischema_field, value):
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader). We should move all pyspark related code into a separate module
    import pyspark.sql.types as sql_types

    # We treat ndarrays with shape=() as scalars
    unsized_numpy_array = isinstance(value, np.ndarray) and value.shape == ()

    # Validate the input to be a scalar (or an unsized numpy array)
    if not unsized_numpy_array and hasattr(value, '__len__') and (not isinstance(value, str)):
        raise TypeError('Expected a scalar as a value for field \'{}\'. '
                        'Got a non-numpy type\'{}\''.format(unischema_field.name, type(value)))

    if unischema_field.shape:
        raise ValueError('The shape field of unischema_field \'%s\' must be an empty tuple (i.e. \'()\' '
                         'to indicate a scalar. However, the actual shape is %s',
                         unischema_field.name, unischema_field.shape)
    if isinstance(self._spark_type, (sql_types.ByteType, sql_types.ShortType, sql_types.IntegerType,
                                     sql_types.LongType)):
        return int(value)
    if isinstance(self._spark_type, (sql_types.FloatType, sql_types.DoubleType)):
        return float(value)
    if isinstance(self._spark_type, sql_types.BooleanType):
        return bool(value)
    if isinstance(self._spark_type, sql_types.StringType):
        if not isinstance(value, str):
            raise ValueError(
                'Expected a string value for field {}. Got type {}'.format(unischema_field.name, type(value)))
        return str(value)

    return value
Example #16
Source File: test_unischema.py From petastorm with Apache License 2.0

def test_create_schema_view_fails_validate():
    """ Exercises code paths unischema.create_schema_view ValueError, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    with pytest.raises(ValueError, match='does not belong to the schema'):
        TestSchema.create_schema_view([UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)])
Example #17
Source File: test_end_to_end.py From petastorm with Apache License 2.0

def test_invalid_schema_field(synthetic_dataset, reader_factory):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. Would expect to get a reasonable error message
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError, match='bogus_key'):
        reader_factory(synthetic_dataset.url,
                       schema_fields=BogusSchema.fields.values(),
                       shuffle_row_groups=False,
                       predicate=EqualPredicate(expected_values))
Example #18
Source File: unischema.py From petastorm with Apache License 2.0

def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary inorder to avoid
    instantiation of multiple objects in each call."""

    # Refer to the attribute of the function we use to cache the map using a name in the variable instead of a 'dot'
    # notation to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T
        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)


# TODO: Changing fields in this class or the UnischemaField will break reading due to the schema being pickled next to
# the dataset on disk
Example #19
Source File: streaming.py From incubator-spot with Apache License 2.0

def schema(self):
    '''
        Return the data type that represents a row from the received data list.
    '''
    from pyspark.sql.types import IntegerType, LongType, StringType, StructField, StructType

    return StructType(
        [
            StructField('p_date', StringType(), True),
            StructField('p_time', StringType(), True),
            StructField('clientip', StringType(), True),
            StructField('host', StringType(), True),
            StructField('reqmethod', StringType(), True),
            StructField('useragent', StringType(), True),
            StructField('resconttype', StringType(), True),
            StructField('duration', LongType(), True),
            StructField('username', StringType(), True),
            StructField('authgroup', StringType(), True),
            StructField('exceptionid', StringType(), True),
            StructField('filterresult', StringType(), True),
            StructField('webcat', StringType(), True),
            StructField('referer', StringType(), True),
            StructField('respcode', StringType(), True),
            StructField('action', StringType(), True),
            StructField('urischeme', StringType(), True),
            StructField('uriport', StringType(), True),
            StructField('uripath', StringType(), True),
            StructField('uriquery', StringType(), True),
            StructField('uriextension', StringType(), True),
            StructField('serverip', StringType(), True),
            StructField('scbytes', IntegerType(), True),
            StructField('csbytes', IntegerType(), True),
            StructField('virusid', StringType(), True),
            StructField('bcappname', StringType(), True),
            StructField('bcappoper', StringType(), True),
            StructField('fulluri', StringType(), True),
            StructField('y', StringType(), True),
            StructField('m', StringType(), True),
            StructField('d', StringType(), True),
            StructField('h', StringType(), True)
        ]
    )
Example #20
Source File: taar_lite_guidguid.py From telemetry-airflow with Mozilla Public License 2.0

def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed
Example #21
Source File: taar_lite_guidguid.py From python_mozetl with MIT License

def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed
Example #22
Source File: test_sync_bookmark.py From python_mozetl with MIT License

def sync_summary_schema():
    """Generate a schema for sync_summary. This subset contains enough
    structure for testing bookmark validation.

    The schema is derived from [`telemetry-batch-view`][1].

    [1]: https://git.io/vdQ5A
    """
    failure_type = StructType([StructField("name", StringType(), False)])

    status_type = StructType([StructField("sync", StringType(), True)])

    validation_problems = StructType(
        [
            StructField("name", StringType(), False),
            StructField("count", LongType(), False),
        ]
    )

    validation_type = StructType(
        [
            StructField("version", LongType(), False),
            StructField("checked", LongType(), False),
            StructField("took", LongType(), False),
            StructField("problems", ArrayType(validation_problems, False), True),
        ]
    )

    engine_type = StructType(
        [
            StructField("name", StringType(), False),
            StructField("status", StringType(), False),
            StructField("failure_reason", failure_type, True),
            StructField("validation", validation_type, True),
        ]
    )

    return StructType(
        [
            StructField("app_build_id", StringType(), True),
            StructField("app_version", StringType(), True),
            StructField("app_display_version", StringType(), True),
            StructField("app_name", StringType(), True),
            StructField("app_channel", StringType(), True),
            StructField("uid", StringType(), False),
            StructField("device_id", StringType(), True),
            StructField("when", LongType(), False),
            StructField("failure_reason", failure_type, True),
            StructField("status", status_type, False),
            StructField("engines", ArrayType(engine_type, False), True),
            StructField("submission_date_s3", StringType(), False),
        ]
    )