Python pyspark.sql.types.BinaryType() Examples
The following are code examples of pyspark.sql.types.BinaryType(), collected from open source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module pyspark.sql.types.
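For orientation before the project code, here is a minimal, self-contained sketch (names and values are illustrative, not taken from the examples below) of declaring and populating a BinaryType column:

from pyspark.sql import SparkSession
from pyspark.sql.types import BinaryType, StringType, StructField, StructType

spark = SparkSession.builder.master("local[1]").appName("binarytype-demo").getOrCreate()

schema = StructType([
    StructField("name", StringType(), False),
    StructField("payload", BinaryType(), False),   # column of raw bytes
])

df = spark.createDataFrame([("a", bytearray(b"\x00\x01\x02"))], schema)
df.printSchema()   # payload: binary (nullable = false)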
Example #1
Source File: imageIO.py From spark-deep-learning with Apache License 2.0 | 8 votes |
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema)
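A hedged usage sketch for filesToDF; it assumes an active SparkContext `sc` and that the function above (with its pyspark.sql.types imports) is in scope. The directory path and partition count are placeholders, not values from spark-deep-learning:

df = filesToDF(sc, "hdfs:///data/raw_images", numPartitions=64)   # hypothetical path
df.printSchema()                 # filePath: string, fileData: binary
first = df.first()
raw = bytes(first.fileData)      # fileData comes back as a bytearray per the schema above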
Example #2
Source File: test_imageIO.py From spark-deep-learning with Apache License 2.0 | 5 votes |
def test_filesTODF(self):
    df = imageIO.filesToDF(self.binaryFilesMock, "path", 217)
    self.assertEqual(df.rdd.getNumPartitions(), 217)
    # filesToDF builds a (filePath: StringType, fileData: BinaryType) schema; assert both columns.
    self.assertEqual(df.schema.fields[0].dataType, StringType())
    self.assertEqual(df.schema.fields[1].dataType, BinaryType())
    first = df.first()
    self.assertTrue(hasattr(first, "filePath"))
    self.assertEqual(type(first.fileData), bytearray)

# TODO: make unit tests for arrayToImageRow on arrays of varying shapes, channels, dtypes.
Example #3
Source File: codecs.py From petastorm with Apache License 2.0 | 5 votes |
def spark_dtype(self):
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader). We should move all pyspark related code
    # into a separate module.
    import pyspark.sql.types as sql_types
    return sql_types.BinaryType()
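The lazy import above is a deliberate design choice: code paths that only read data never require a pyspark installation. A hedged, generic sketch of the same pattern; the class name is hypothetical and not petastorm's actual codec machinery:

class ExampleBinaryCodec(object):
    """Hypothetical codec following the deferred-import pattern shown above."""

    def spark_dtype(self):
        # pyspark is imported only when a Spark schema is actually requested,
        # so merely importing the defining module does not require pyspark.
        import pyspark.sql.types as sql_types
        return sql_types.BinaryType()

print(ExampleBinaryCodec().spark_dtype())   # a BinaryType instance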
Example #4
Source File: typehints.py From koalas with Apache License 2.0 | 5 votes |
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.

    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
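A hedged illustration of the mapping above; it assumes as_spark_type is in scope (its import path differs across koalas versions) and the expected results follow directly from the branches shown:

import numpy as np
from pyspark.sql import types

assert isinstance(as_spark_type(bytes), types.BinaryType)       # bytes -> BinaryType
assert isinstance(as_spark_type("bigint"), types.LongType)      # string alias -> LongType
assert isinstance(as_spark_type(np.ndarray), types.ArrayType)   # arrays of strings (see TODO)

try:
    as_spark_type(dict)              # no branch matches
except TypeError as err:
    print(err)                       # Type <class 'dict'> was not understood.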
Example #5
Source File: strings.py From koalas with Apache License 2.0 | 5 votes |
def __init__(self, series: "ks.Series"):
    if not isinstance(series.spark.data_type, (StringType, BinaryType, ArrayType)):
        raise ValueError("Cannot call StringMethods on type {}".format(series.spark.data_type))
    self._data = series
    self.name = self._data.name

    # Methods
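A hedged sketch of how this guard surfaces to users, assuming databricks.koalas is importable and a Spark session can be created; the data values are made up. Note that the check also admits BinaryType and ArrayType columns, not only StringType:

import databricks.koalas as ks

s = ks.Series(["spark", "koalas"])   # StringType column: passes the isinstance check
upper = s.str.upper()                # string methods work as expected

try:
    ks.Series([1, 2, 3]).str         # LongType column: rejected by the guard above
except ValueError as err:
    print(err)                       # Cannot call StringMethods on type LongType ...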
Example #6
Source File: dfutil.py From TensorFlowOnSpark with Apache License 2.0 | 5 votes |
def loadTFRecords(sc, input_dir, binary_features=[]):
    """Load TFRecords from disk into a Spark DataFrame.

    This will attempt to automatically convert the tf.train.Example features into Spark DataFrame columns of equivalent types.

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a "hint"
    from the caller in the ``binary_features`` argument.

    Args:
      :sc: SparkContext
      :input_dir: location of TFRecords on disk.
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      A Spark DataFrame mirroring the tf.train.Example schema.
    """
    import tensorflow as tf

    tfr_rdd = sc.newAPIHadoopFile(input_dir, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                                  keyClass="org.apache.hadoop.io.BytesWritable",
                                  valueClass="org.apache.hadoop.io.NullWritable")

    # infer Spark SQL types from tf.Example
    record = tfr_rdd.take(1)[0]
    example = tf.train.Example()
    example.ParseFromString(bytes(record[0]))
    schema = infer_schema(example, binary_features)

    # convert serialized protobuf to tf.Example to Row
    example_rdd = tfr_rdd.mapPartitions(lambda x: fromTFExample(x, binary_features))

    # create a Spark DataFrame from RDD[Row]
    df = example_rdd.toDF(schema)

    # save reference of this dataframe
    loadedDF[df] = input_dir
    return df
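A hedged usage sketch; it assumes an active SparkContext `sc`, that the tensorflow-hadoop input format jar is on the Spark classpath (required by newAPIHadoopFile above), and that dfutil is importable from the tensorflowonspark package. The HDFS path and feature name are placeholders:

from tensorflowonspark import dfutil

# Keep the "image_raw" BytesList feature as BinaryType instead of decoding it as a UTF-8 string.
df = dfutil.loadTFRecords(sc, "hdfs:///data/train.tfrecords",
                          binary_features=["image_raw"])      # placeholder path and feature
df.printSchema()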
Example #7
Source File: dfutil.py From TensorFlowOnSpark with Apache License 2.0 | 5 votes |
def infer_schema(example, binary_features=[]):
    """Given a tf.train.Example, infer the Spark DataFrame schema (StructFields).

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a "hint"
    from the caller in the ``binary_features`` argument.

    Args:
      :example: a tf.train.Example
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      A DataFrame StructType schema
    """
    def _infer_sql_type(k, v):
        # special handling for binary features
        if k in binary_features:
            return BinaryType()

        if v.int64_list.value:
            result = v.int64_list.value
            sql_type = LongType()
        elif v.float_list.value:
            result = v.float_list.value
            sql_type = DoubleType()
        else:
            result = v.bytes_list.value
            sql_type = StringType()

        if len(result) > 1:       # represent multi-item tensors as Spark SQL ArrayType() of base types
            return ArrayType(sql_type)
        else:                     # represent everything else as base types (and empty tensors as StringType())
            return sql_type

    return StructType([StructField(k, _infer_sql_type(k, v), True)
                       for k, v in sorted(example.features.feature.items())])
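A hedged worked example for infer_schema; the feature names and values are invented, and infer_schema is assumed to be in scope. It shows how the binary_features hint flips a BytesList feature from StringType to BinaryType:

import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[3])),
    "image_raw": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"\x89PNG\r\n"])),
}))

schema = infer_schema(example, binary_features=["image_raw"])
# image_raw -> BinaryType (hinted); label -> LongType (single int64 value).
# Without the hint, image_raw would be inferred as StringType.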
Example #8
Source File: dfutil.py From TensorFlowOnSpark with Apache License 2.0 | 4 votes |
def fromTFExample(iter, binary_features=[]):
    """mapPartition function to convert an RDD of serialized tf.train.Example bytestring into an RDD of Row.

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a "hint"
    from the caller in the ``binary_features`` argument.

    Args:
      :iter: the RDD partition iterator
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      An array/iterator of DataFrame Row with features converted into columns.
    """
    # convert from protobuf-like dict to DataFrame-friendly dict
    def _get_value(k, v):
        if v.int64_list.value:
            result = v.int64_list.value
        elif v.float_list.value:
            result = v.float_list.value
        else:  # string or bytearray
            if k in binary_features:
                return bytearray(v.bytes_list.value[0])
            else:
                return v.bytes_list.value[0].decode('utf-8')

        if len(result) > 1:       # represent multi-item tensors as python lists
            return list(result)
        elif len(result) == 1:    # extract scalars from single-item tensors
            return result[0]
        else:                     # represent empty tensors as python None
            return None

    results = []
    for record in iter:
        example = tf.train.Example()
        example.ParseFromString(bytes(record[0]))       # record is (bytestr, None)
        d = {k: _get_value(k, v) for k, v in sorted(example.features.feature.items())}
        row = Row(**d)
        results.append(row)
    return results
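A hedged sketch of driving fromTFExample directly, outside mapPartitions; it reuses the `example` object built in the previous sketch and feeds records in the (bytestring, None) shape that loadTFRecords produces:

records = [(example.SerializeToString(), None)]       # mimic (bytestr, None) pairs from the TFRecord RDD
rows = fromTFExample(iter(records), binary_features=["image_raw"])
print(rows[0].label)              # 3
print(type(rows[0].image_raw))    # <class 'bytearray'>, because of the binary_features hint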