Python pyspark.Row() Examples
The following are 14 code examples of pyspark.Row().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pyspark, or try the search function.
Example #1
Source File: tf_image.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def _convertOutputToImage(self, df, tfs_output_col, output_shape):
    """
    Replace the numeric output column of `df` with an image-struct column.

    :param df: DataFrame holding both the input image column and `tfs_output_col`.
    :param tfs_output_col: str, name of the numeric output column; dropped from the result.
    :param output_shape: 4-element shape; entries 1 and 2 are used as image height and width.
    :return: DataFrame with the output image column added and `tfs_output_col` removed.
    """
    assert len(output_shape) == 4, str(output_shape) + " does not have 4 dimensions"
    height = int(output_shape[1])
    width = int(output_shape[2])

    def to_image(orig_image, numeric_data):
        # The produced image is assumed to carry float pixels with the input's channel count.
        image_type = imageIO.imageTypeByName('CV_32FC%d' % orig_image.nChannels)
        float_pixels = np.array(numeric_data).astype(np.float32)
        pixel_bytes = bytearray(float_pixels.tobytes())
        channel_count = orig_image.nChannels
        return Row(
            origin="",
            mode=image_type.ord,
            height=height,
            width=width,
            nChannels=channel_count,
            data=pixel_bytes)

    image_struct_type = ImageSchema.imageSchema['image'].dataType
    to_image_udf = udf(to_image, image_struct_type)
    output_col = self.getOutputCol()
    image_col = to_image_udf(df[self.getInputCol()], df[tfs_output_col])
    return df.withColumn(output_col, image_col).drop(tfs_output_col)
Example #2
Source File: imageIO.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def imageArrayToStruct(imgArray, origin=""):
    """
    Build the row representation of an image from its array data.

    :param imgArray: ndarray, image data; a 4-d array must have a first dimension of 1.
    :param origin: value stored in the ``origin`` field of the row (defaults to "").
    :return: Row, image as a DataFrame Row with schema==ImageSchema.
    :raises ValueError: if a 4-d array has a first dimension other than 1.
    """
    shape = imgArray.shape
    # A leading "batch-size" dimension is only accepted when it is 1; strip it in that case.
    if len(shape) == 4:
        if shape[0] != 1:
            raise ValueError(
                "The first dimension of a 4-d image array is expected to be 1.")
        imgArray = imgArray.reshape(shape[1:])

    ocv_mode = _arrayToOcvMode(imgArray)
    rows, cols, channels = imgArray.shape
    return Row(
        origin=origin,
        mode=ocv_mode.ord,
        height=rows,
        width=cols,
        nChannels=channels,
        data=bytearray(imgArray.tobytes()))
Example #3
Source File: imageIO.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def imageStructToPIL(imageRow):
    """
    Convert an image from the image schema struct to a PIL image.

    :param imageRow: Row, must have ImageSchema
    :return: PIL image
    :raises ValueError: if the pixel dtype is not 'uint8', or the channel count is not 1, 3 or 4.
    """
    imgType = imageTypeByOrdinal(imageRow.mode)
    if imgType.dtype != 'uint8':
        raise ValueError("Can not convert image of type " + imgType.dtype +
                         " to PIL, can only deal with 8U format")

    pixels = imageStructToArray(imageRow)
    channel_count = imgType.nChannels
    if channel_count == 1:
        return Image.fromarray(obj=pixels, mode='L')

    # PIL expects RGB order while the image schema is BGR, so every
    # multi-channel image has its channel order flipped before conversion.
    pixels = _reverseChannels(pixels)
    pil_mode = {3: 'RGB', 4: 'RGBA'}.get(channel_count)
    if pil_mode is None:
        raise ValueError("don't know how to convert " + imgType.name + " to PIL")
    return Image.fromarray(obj=pixels, mode=pil_mode)
Example #4
Source File: process.py From kafka-compose with MIT License | 6 votes |
def process(timestamp, rdd):
    """
    Count the words of one batch with Spark SQL and print the result.

    Processing is best-effort: any ordinary failure is logged and swallowed so the
    caller keeps running. Unlike a bare ``except``, interpreter-level signals
    (``KeyboardInterrupt``, ``SystemExit``) are not suppressed.

    :param timestamp: batch time supplied by the caller; only used in the log message.
    :param rdd: RDD whose elements are iterables of words (flattened before counting).
    :return: None
    """
    import logging  # local import so the block does not depend on a file-level change

    try:
        # Get the singleton instance of SparkSession
        spark = get_session(rdd.context.getConf())

        # Convert RDD[List[String]] to RDD[Row] to DataFrame
        rows = rdd.flatMap(lambda a: a).map(lambda w: Row(word=w))
        words_df = spark.createDataFrame(rows)

        # Creates a temporary view using the DataFrame
        words_df.createOrReplaceTempView('words')

        # Do word count on table using SQL and print it
        sql = "SELECT word, COUNT(1) AS total FROM words GROUP BY word"
        word_count_df = spark.sql(sql)
        word_count_df.show()
    except Exception:
        # Keep the original "skip this batch" behaviour, but leave a trace instead of
        # silently discarding the error.
        logging.getLogger(__name__).warning(
            "Skipping batch %s: word count failed", timestamp, exc_info=True)
Example #5
Source File: imageIO.py From spark-deep-learning with Apache License 2.0 | 5 votes |
def imageStructToArray(imageRow):
    """
    Build a numpy array over the pixel data of an image row.

    :param imageRow: Row, must use imageSchema.
    :return: ndarray of shape (height, width, nChannels) backed by ``imageRow.data``.
    """
    image_type = imageTypeByOrdinal(imageRow.mode)
    dims = (imageRow.height, imageRow.width, imageRow.nChannels)
    return np.ndarray(shape=dims, dtype=image_type.dtype, buffer=imageRow.data)
Example #6
Source File: vocabulary2id.py From code2vec with Apache License 2.0 | 5 votes |
def build_vocabularies(self, rows: RDD):
    """
    Gather values and paths from the rows, together with their frequencies.

    :param rows: row structure is ((key, doc), val) where:
        * key: str with the path context
        * doc: file name
        * val: number of occurrences of key in doc
    :return: tuple (value2index, path2index, value2freq, path2freq); keys whose type is
        ``str`` go to the value dictionaries, keys whose type is ``tuple`` to the path ones.
    """
    def _flatten_row(row: Row):
        # Emit each of the first three components of the parsed path context with count 1.
        context = Vocabulary2Id._unstringify_path_context(row)
        return [(context[position], 1) for position in range(3)]

    # Persisted because the aggregated counts are scanned twice below.
    counted = rows.flatMap(_flatten_row).reduceByKey(operator.add).persist()

    values = counted.filter(lambda pair: type(pair[0]) == str).collect()
    paths = counted.filter(lambda pair: type(pair[0]) == tuple).collect()

    value2index = {token: index for index, (token, _) in enumerate(values)}
    path2index = {token: index for index, (token, _) in enumerate(paths)}
    value2freq = {token: freq for token, freq in values}
    path2freq = {token: freq for token, freq in paths}

    counted.unpersist()
    return value2index, path2index, value2freq, path2freq
Example #7
Source File: vocabulary2id.py From code2vec with Apache License 2.0 | 5 votes |
def build_doc2pc(self, value2index: dict, path2index: dict, rows: RDD):
    """
    Process rows and build elements (doc, [path_context_1, path_context_2, ...])

    :param value2index: value -> id
    :param path2index: path -> id
    :param rows: RDD of rows; ``row[0][1]`` is taken as the doc of each row
    :return: RDD of (doc, list of distinct (start id, path id, end id) triples)
    """
    bc_value2index = self.sc.broadcast(value2index)
    bc_path2index = self.sc.broadcast(path2index)

    def _doc2pc(row: Row):
        start, path, end = Vocabulary2Id._unstringify_path_context(row)
        doc = row[0][1]
        value_ids = bc_value2index.value
        return doc, (value_ids[start], bc_path2index.value[path], value_ids[end])

    def _new_list(value):
        return [value]

    def _append(collected, value):
        return collected + [value]

    def _concat(left, right):
        return left + right

    grouped = rows.map(_doc2pc).distinct().combineByKey(_new_list, _append, _concat)

    bc_value2index.unpersist(blocking=True)
    bc_path2index.unpersist(blocking=True)
    return grouped
Example #8
Source File: sys_exec.py From cadCAD with MIT License | 5 votes |
def align_type(init_condition: dict):
    """
    Build a converter that casts a record's values to the types used in ``init_condition``.

    :param init_condition: dict whose values supply the target type for each key.
    :return: function taking a dict ``d``; for every key of ``init_condition`` it overwrites
        ``d[key]`` in place with the value cast to that key's type, then returns ``Row(**d)``.
    """
    def f(d):
        for key in init_condition:
            target_type = type(init_condition[key])
            d[key] = target_type(d[key])
        return Row(**d)

    return f


### Typefull Conversion: to Spark
# rdd -> spark
### Typeless Conversion: to Spark
# (rdd -> pandas) -> spark
Example #9
Source File: test_unischema.py From petastorm with Apache License 2.0 | 5 votes |
def test_dict_to_spark_row_field_validation_scalar_types():
    """Check the validation dict_to_spark_row applies to a non-nullable scalar string field"""
    string_field = UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False)
    TestSchema = Unischema('TestSchema', [string_field])

    valid_row = dict_to_spark_row(TestSchema, {'string_field': 'abc'})
    assert isinstance(valid_row, Row)

    # Not a nullable field
    with pytest.raises(ValueError):
        dict_to_spark_row(TestSchema, {'string_field': None})

    # Wrong field type
    with pytest.raises(TypeError):
        dict_to_spark_row(TestSchema, {'string_field': []})
Example #10
Source File: test_unischema.py From petastorm with Apache License 2.0 | 5 votes |
def test_dict_to_spark_row_field_validation_scalar_nullable():
    """Check that nullable scalar fields accept None, whether passed explicitly or left out"""
    nullable_fields = [
        UnischemaField(field_name, np.string_, (), ScalarCodec(StringType()), True)
        for field_name in ('string_field', 'nullable_implicitly_set')
    ]
    TestSchema = Unischema('TestSchema', nullable_fields)

    # 'nullable_implicitly_set' is deliberately absent from the input dictionary.
    converted = dict_to_spark_row(TestSchema, {'string_field': None})
    assert isinstance(converted, Row)
Example #11
Source File: test_unischema.py From petastorm with Apache License 2.0 | 5 votes |
def test_dict_to_spark_row_field_validation_ndarrays():
    """Test various validations done on ndarray fields when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('tensor3d', np.float32, (10, 20, 30), NdarrayCodec(), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((10, 20, 30), dtype=np.float32)}), Row)

    # The negative cases must address the schema's own field ('tensor3d'). With any other
    # key the ValueError would come from the field-name mismatch check, and the
    # nullability / shape validations would never be exercised.

    # Null value into not nullable field
    with pytest.raises(ValueError):
        dict_to_spark_row(TestSchema, {'tensor3d': None})

    # Wrong dimensions
    with pytest.raises(ValueError):
        dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((1, 2, 3), dtype=np.float32)})
Example #12
Source File: main.py From python_mozetl with MIT License | 5 votes |
def ping_to_row(ping):
    """
    Build a Row holding the client id and operating-system name of a ping.

    :param ping: mapping indexed by "clientId" and "environment/system/os/name".
    :return: Row with fields ``client_id`` and ``os``.
    """
    client_id = ping["clientId"]
    os_name = ping["environment/system/os/name"]
    return Row(client_id=client_id, os=os_name)
Example #13
Source File: unischema.py From petastorm with Apache License 2.0 | 4 votes |
def dict_to_spark_row(unischema, row_dict):
    """Converts a single row into a spark Row object.

    Verifies that the data conforms with unischema definition types and encodes the data using the codec specified
    by the unischema.

    The parameters are keywords to allow use of functools.partial.

    :param unischema: an instance of Unischema object
    :param row_dict: a dictionary where the keys match name of fields in the unischema.
    :return: a single pyspark.Row object
    :raises ValueError: if the dictionary keys (after explicit nulls are inserted) do not match the schema
      fields, or if a field that is not nullable holds ``None``.
    """

    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader)
    import pyspark

    assert isinstance(unischema, Unischema)
    # Add null fields. Be careful not to mutate the input dictionary - that would be an unexpected side effect
    copy_row_dict = copy.copy(row_dict)
    insert_explicit_nulls(unischema, copy_row_dict)

    if set(copy_row_dict.keys()) != set(unischema.fields.keys()):
        raise ValueError('Dictionary fields \n{}\n do not match schema fields \n{}'.format(
            '\n'.join(sorted(copy_row_dict.keys())), '\n'.join(unischema.fields.keys())))

    encoded_dict = {}
    for field_name, value in copy_row_dict.items():
        schema_field = unischema.fields[field_name]
        if value is None:
            if not schema_field.nullable:
                # The placeholder is now filled in, so the message names the offending field.
                raise ValueError('Field {} is not "nullable", but got passes a None value'.format(field_name))
        if schema_field.codec:
            encoded_dict[field_name] = schema_field.codec.encode(schema_field, value) if value is not None else None
        else:
            # Without a codec, numpy scalars are converted with tolist(); other values pass through unchanged.
            if isinstance(value, (np.generic,)):
                encoded_dict[field_name] = value.tolist()
            else:
                encoded_dict[field_name] = value

    field_list = list(unischema.fields.keys())
    # generate a value list which match the schema column order.
    value_list = [encoded_dict[name] for name in field_list]
    # create a row by value list
    row = pyspark.Row(*value_list)
    # set row fields
    row.__fields__ = field_list
    return row
Example #14
Source File: test_common.py From petastorm with Apache License 2.0 | 4 votes |
def create_many_columns_non_petastorm_dataset(output_url, num_rows, num_columns=1000, num_files=4, spark=None):
    """Writes a wide, integer-only parquet dataset (used in tests) with the following properties:

    1. Has ``num_columns`` columns named ``col_0``, ``col_1``, ... (1000 by default)
    2. Every column of row ``i`` holds the integer ``i * 10000``
    3. The data is coalesced into ``num_files`` partitions before being written (4 by default)

    :param output_url: The dataset is written to this url (e.g. ``file:///tmp/some_directory``)
    :param num_rows: Number of rows in the generated dataset
    :param num_columns: Number of columns (1000 is the default)
    :param num_files: Number of parquet files that will be created in the store
    :param spark: An instance of SparkSession object. A new instance will be created if non specified
    :return: The generated rows as a list of dictionaries (column name -> value)
    """
    # A session created here is also stopped here; a caller-supplied one is left running.
    owns_session = not spark
    if owns_session:
        builder = SparkSession.builder.appName('petastorm_end_to_end_test').master('local[*]')
        spark = builder.getOrCreate()

    column_names = ['col_{}'.format(col_id) for col_id in range(num_columns)]

    def generate_row(i):
        return {name: i * 10000 for name in column_names}

    expected_data = [generate_row(row_number) for row_number in range(num_rows)]
    rows = [Row(**row) for row in expected_data]

    # WARNING: surprisingly, schema fields and row fields are matched only by order and not name.
    fields = [StructField(name, IntegerType(), False) for name in column_names]
    dataframe = spark.createDataFrame(rows, StructType(fields))

    writer = dataframe.coalesce(num_files).write
    writer.option('compression', 'none').mode('overwrite').parquet(output_url)

    if owns_session:
        spark.stop()

    return expected_data