Python pyarrow.parquet.ParquetFile() Examples
The following are 19 code examples of pyarrow.parquet.ParquetFile().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pyarrow.parquet, or try the search function.
Example #1
Source File: test_client.py From json2parquet with MIT License | 6 votes |
def test_convert_json():
    """Convert the simple JSON fixture to Parquet and compare it with the reference file.

    The freshly written file must agree with ``tests/fixtures/simple.parquet`` on
    column count, row count, schema and the contents of the first row group.
    """
    schema = pa.schema([pa.field("foo", pa.int32()), pa.field("bar", pa.int64())])

    # Fixture paths are resolved relative to the current working directory.
    working_dir = os.getcwd()
    input_path = "{}/tests/fixtures/simple_json.txt".format(working_dir)
    expected_file = "{}/tests/fixtures/simple.parquet".format(working_dir)

    with tempfile.NamedTemporaryFile() as tmp:
        # Everything that reads the output stays inside the ``with`` block,
        # because the temporary file is removed when the block exits.
        client.convert_json(input_path, tmp.name, schema)
        actual = pq.ParquetFile(tmp.name)
        reference = pq.ParquetFile(expected_file)

        assert actual.metadata.num_columns == reference.metadata.num_columns
        assert actual.metadata.num_rows == reference.metadata.num_rows
        assert actual.schema.equals(reference.schema)

        actual_rows = actual.read_row_group(0).to_pydict()
        reference_rows = reference.read_row_group(0).to_pydict()
        assert actual_rows == reference_rows
Example #2
Source File: test_parquet.py From kartothek with MIT License | 6 votes |
def test_predicate_accept_in(store, predicate_value, expected):
    """Check ``_predicate_accepts`` for an ``in`` predicate on column ``A``.

    A four-row frame is serialised into ``store``; the predicate is then evaluated
    against the metadata of the first row group and compared with ``expected``.
    """
    frame = pd.DataFrame({"A": [0, 4, 13, 29]})  # min = 0, max = 29
    key = ParquetSerializer(chunk_size=None).store(store, "prefix", frame)
    parquet_file = ParquetFile(store.open(key))

    accepted = _predicate_accepts(
        ("A", "in", predicate_value),
        row_meta=parquet_file.metadata.row_group(0),
        arrow_schema=parquet_file.schema.to_arrow_schema(),
        parquet_reader=parquet_file.reader,
    )
    assert accepted == expected
Example #3
Source File: parquet.py From ibis with Apache License 2.0 | 6 votes |
def table(self, name: str, path: Optional[str] = None) -> ir.TableExpr:
    """Return a table expression backed by the Parquet file called ``name``.

    :param name: table name; must be one of ``self.list_tables(path)``.
    :param path: location containing ``<name>.parquet``; ``self.root`` is used when ``None``.
    :return: the result of ``self.table_class(name, schema, self).to_expr()``.
    :raises AttributeError: if ``name`` is not among the listed tables.
    """
    if name not in self.list_tables(path):
        raise AttributeError(name)

    # NOTE(review): ``path`` is annotated as ``str`` but is combined with ``/`` below,
    # which a plain ``str`` does not support -- presumably callers pass a path object; confirm.
    base_dir = self.root if path is None else path

    # The schema is inferred from the schema stored in the Parquet file itself.
    parquet_path = base_dir / "{}.parquet".format(name)
    schema = sch.infer(pq.ParquetFile(str(parquet_path)).schema)

    expr = self.table_class(name, schema, self).to_expr()
    self.dictionary[name] = parquet_path
    return expr
Example #4
Source File: test_write.py From csv2parquet with Apache License 2.0 | 6 votes |
def test_write_from_csv():
    """Convert ``csvs/simple.csv`` with default options and verify the Parquet output.

    Checks that a single row group is written, that both columns (``a`` and ``b``)
    have the ``STRING`` logical type, and that the three rows round-trip unchanged.
    """
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    assert pqf.num_row_groups == 1

    schema = pqf.schema
    assert schema.names == ['a', 'b']
    assert schema.column(0).logical_type.type == 'STRING'
    assert schema.column(1).logical_type.type == 'STRING'

    # The row group is read and its size asserted once; the original repeated
    # this read/assert pair verbatim, which added nothing to the check.
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 3

    col_a = row_group.column(0).to_pylist()
    assert col_a == ['1', '2', '3']
    col_b = row_group.column(1).to_pylist()
    assert col_b == ['a', 'b', 'c']
Example #5
Source File: test_write.py From csv2parquet with Apache License 2.0 | 5 votes |
def test_opt_invalid_types():
    """Convert ``csvs/invalid-types.csv`` with ``?``-suffixed type overrides on every column.

    After conversion the first row holds typed values and the second row reads
    back as ``None`` for every column except ``string``.
    """
    cli_args = ['csvs/invalid-types.csv', '--type', 'bool=bool?', 'float32=float32?',
                'float64=float64?', 'int8=int8?', 'int16=int16?', 'int32=int32?',
                'int64=int64?', 'string=string?', 'timestamp=timestamp?']
    csv2parquet.main_with_args(csv2parquet.convert, cli_args)

    pqf = pq.ParquetFile('csvs/invalid-types.parquet')
    assert pqf.schema.names == ['bool', 'float32', 'float64', 'int8', 'int16',
                                'int32', 'int64', 'string', 'timestamp']

    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 2

    def values(index):
        """Return the Python values of the column at ``index`` in the row group."""
        return row_group.column(index).to_pylist()

    assert values(0) == [True, None]

    # float32 is compared approximately because 0.5 is stored in single precision.
    float32_values = values(1)
    assert len(float32_values) == 2
    assert float32_values[0] == pytest.approx(0.5)
    assert float32_values[1] is None

    assert values(2) == [0.75, None]
    assert values(3) == [12, None]
    assert values(4) == [400, None]
    assert values(5) == [132000, None]
    assert values(6) == [6000000000, None]
    assert values(7) == ['string', 'blah']
    assert values(8) == [datetime(2018, 7, 9, 0, 0), None]
Example #6
Source File: test_parquet.py From kartothek with MIT License | 5 votes |
def test_rowgroup_writing(store, use_categorical, chunk_size):
    """Store a four-row frame with ``chunk_size=2`` and expect two row groups.

    :param store: store the serialised frame is written to and read back from.
    :param use_categorical: when truthy, the ``string`` column is cast to ``category`` before writing.
    :param chunk_size: NOTE(review): not used in the body -- the serialiser is always
        created with ``chunk_size=2``; confirm whether that is intentional.
    """
    df = pd.DataFrame({"string": ["abc", "affe", "banane", "buchstabe"]})
    serialiser = ParquetSerializer(chunk_size=2)

    # Arrow 0.9.0 has a bug in writing categorical columns to more than a single
    # RowGroup: "ArrowIOError: Column 2 had 2 while previous column had 4".
    # We have special handling for that in pandas-serialiser that should be
    # removed once we switch to 0.10.0
    df_write = df.astype({"string": "category"}) if use_categorical else df

    key = serialiser.store(store, "prefix", df_write)
    assert ParquetFile(store.open(key)).num_row_groups == 2
Example #7
Source File: parquet.py From gcr-catalogs with BSD 3-Clause "New" or "Revised" License | 5 votes |
def handle(self):
    """Return the ``pq.ParquetFile`` for ``self.path``, creating it on first access.

    The object is stored in ``self._handle`` and reused on subsequent calls.
    """
    cached = self._handle
    if cached is not None:
        return cached

    self._handle = pq.ParquetFile(self.path)
    return self._handle
Example #8
Source File: test_write.py From csv2parquet with Apache License 2.0 | 5 votes |
def test_required_types():
    """Convert ``csvs/types.csv`` with an explicit type override for every column.

    Both rows must read back with the expected typed values for each of the
    nine columns.
    """
    cli_args = ['csvs/types.csv', '--type', 'bool=bool', 'float32=float32',
                'float64=float64', 'int8=int8', 'int16=int16', 'int32=int32',
                'int64=int64', 'string=string', 'timestamp=timestamp']
    csv2parquet.main_with_args(csv2parquet.convert, cli_args)

    pqf = pq.ParquetFile('csvs/types.parquet')
    assert pqf.schema.names == ['bool', 'float32', 'float64', 'int8', 'int16',
                                'int32', 'int64', 'string', 'timestamp']

    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 2

    def values(index):
        """Return the Python values of the column at ``index`` in the row group."""
        return row_group.column(index).to_pylist()

    assert values(0) == [True, False]
    # float32 values are compared approximately (single-precision storage).
    assert values(1) == pytest.approx([0.5, 0.6])
    assert values(2) == [0.75, 1.75]
    assert values(3) == [12, 13]
    assert values(4) == [400, 401]
    assert values(5) == [132000, 132001]
    assert values(6) == [6000000000, 6000000001]
    assert values(7) == ['string', 'string']
    assert values(8) == [datetime(2018, 7, 9, 0, 0), datetime(2018, 7, 10, 0, 0)]
Example #9
Source File: test_write.py From csv2parquet with Apache License 2.0 | 5 votes |
def test_write_exclude_by_index():
    """``--exclude 0`` drops the first column, leaving only ``b`` with its three rows."""
    cli_args = ['csvs/simple.csv', '--exclude', '0']
    csv2parquet.main_with_args(csv2parquet.convert, cli_args)

    pqf = pq.ParquetFile('csvs/simple.parquet')
    assert pqf.schema.names == ['b']

    first_group = pqf.read_row_group(0)
    assert first_group.num_rows == 3
    assert first_group.column(0).to_pylist() == ['a', 'b', 'c']
Example #10
Source File: test_write.py From csv2parquet with Apache License 2.0 | 5 votes |
def test_write_exclude_by_name():
    """``--exclude a`` drops column ``a``, leaving only ``b`` with its three rows."""
    cli_args = ['csvs/simple.csv', '--exclude', 'a']
    csv2parquet.main_with_args(csv2parquet.convert, cli_args)

    pqf = pq.ParquetFile('csvs/simple.parquet')
    assert pqf.schema.names == ['b']

    first_group = pqf.read_row_group(0)
    assert first_group.num_rows == 3
    assert first_group.column(0).to_pylist() == ['a', 'b', 'c']
Example #11
Source File: test_write.py From csv2parquet with Apache License 2.0 | 5 votes |
def test_write_include_by_index():
    """``--include 0`` keeps only the first column, ``a``, with its three rows."""
    cli_args = ['csvs/simple.csv', '--include', '0']
    csv2parquet.main_with_args(csv2parquet.convert, cli_args)

    pqf = pq.ParquetFile('csvs/simple.parquet')
    assert pqf.schema.names == ['a']

    first_group = pqf.read_row_group(0)
    assert first_group.num_rows == 3
    assert first_group.column(0).to_pylist() == ['1', '2', '3']
Example #12
Source File: test_write.py From csv2parquet with Apache License 2.0 | 5 votes |
def test_write_limit():
    """``--rows 1`` produces a first row group containing exactly one row."""
    cli_args = ['csvs/simple.csv', '--rows', '1']
    csv2parquet.main_with_args(csv2parquet.convert, cli_args)

    first_group = pq.ParquetFile('csvs/simple.parquet').read_row_group(0)
    assert first_group.num_rows == 1
Example #13
Source File: test_write.py From csv2parquet with Apache License 2.0 | 5 votes |
def test_write_row_group_size():
    """``--row-group-size 1`` splits ``csvs/simple.csv`` into three row groups."""
    cli_args = ['csvs/simple.csv', '--row-group-size', '1']
    csv2parquet.main_with_args(csv2parquet.convert, cli_args)

    written = pq.ParquetFile('csvs/simple.parquet')
    assert written.num_row_groups == 3
Example #14
Source File: test_write.py From csv2parquet with Apache License 2.0 | 5 votes |
def test_write_rename():
    """``--rename`` accepts a column index (``0=alpha``) as well as a column name (``b=bee``)."""
    cli_args = ['csvs/simple.csv', '--rename', '0=alpha', 'b=bee']
    csv2parquet.main_with_args(csv2parquet.convert, cli_args)

    written = pq.ParquetFile('csvs/simple.parquet')
    assert written.schema.names == ['alpha', 'bee']
Example #15
Source File: test_write.py From csv2parquet with Apache License 2.0 | 5 votes |
def test_write_from_tsv():
    """Convert ``csvs/simple2.tsv`` with default options and verify the Parquet output.

    Expects one row group, two ``STRING`` columns named ``a`` and ``b``, and a
    single row ``('1', 'b')``.
    """
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple2.tsv'])

    written = pq.ParquetFile('csvs/simple2.parquet')
    assert written.num_row_groups == 1

    file_schema = written.schema
    assert file_schema.names == ['a', 'b']
    assert file_schema.column(0).logical_type.type == 'STRING'
    assert file_schema.column(1).logical_type.type == 'STRING'

    first_group = written.read_row_group(0)
    assert first_group.num_rows == 1
    assert first_group.column(0).to_pylist() == ['1']
    assert first_group.column(1).to_pylist() == ['b']
Example #16
Source File: test_schema.py From ibis with Apache License 2.0 | 4 votes |
def parquet_schema():
    """Build a 100-row frame covering many dtypes and return its Parquet schema.

    The frame is converted to an Arrow table, written to a temporary file and
    re-opened with ``pq.ParquetFile``; the ``schema`` of that file is returned.
    """
    np.random.seed(0)  # fixes the values of the random 'bool' column below
    size = 100

    column_order = [
        'uint8',
        'uint16',
        'uint32',
        'uint64',
        'int8',
        'int16',
        'int32',
        'int64',
        'float32',
        'float64',
        'bool',
        'datetime',
        'str',
        'str_with_nulls',
        'empty_str',
        'bytes',
    ]

    data = {
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        # NOTE(review): the 'int8' column is created with dtype int16, not int8.
        # Left as-is because the returned schema depends on it -- confirm intent.
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        # TODO(wesm): Test other timestamp resolutions now that arrow
        # supports them
        'datetime': np.arange(
            "2016-01-01T00:00:00.001", size, dtype='datetime64[ms]'
        ),
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size,
        'bytes': [b'foo'] * size,
    }
    frame = pd.DataFrame(data, columns=column_order)

    with tempfile.TemporaryFile() as path:
        # The schema is read back before the temporary file is closed.
        pq.write_table(pa.Table.from_pandas(frame), path)
        return pq.ParquetFile(path).schema
Example #17
Source File: arrow_reader_worker.py From petastorm with Apache License 2.0 | 4 votes |
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
    """Main worker function. Loads and returns all rows matching the predicate from a rowgroup

    Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
    columns needed by the predicate are loaded first. If no rows in the rowgroup matches the predicate
    criteria the rest of the columns are not loaded.

    :param piece_index: index into ``self._split_pieces`` selecting the piece to load.
    :param worker_predicate: when truthy, rows are loaded through ``self._load_rows_with_predicate``;
        otherwise they are fetched through ``self._local_cache``.
    :param shuffle_row_drop_partition: A tuple 2 of the current row drop partition and the total number
        of partitions.
    :return: None; a non-empty result is handed to ``self.publish_func``.
    :raises RuntimeError: if a local cache other than ``NullCache`` is combined with a predicate or
        with more than one row drop partition.
    """
    # The dataset object is created on first use and kept on the instance.
    if not self._dataset:
        self._dataset = pq.ParquetDataset(
            self._dataset_path_or_paths,
            filesystem=self._filesystem,
            validate_schema=False)

    if self._dataset.partitions is None:
        # When read from parquet file list, the `dataset.partitions` will be None.
        # But other petastorm code require at least an empty `ParquetPartitions` object.
        self._dataset.partitions = pq.ParquetPartitions()

    piece = self._split_pieces[piece_index]

    # Create pyarrow file system
    # NOTE(review): the handle returned by ``fs.open`` is not explicitly closed here -- confirm
    # whether it is released elsewhere.
    parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

    caching_enabled = not isinstance(self._local_cache, NullCache)
    if caching_enabled and worker_predicate:
        raise RuntimeError('Local cache is not supported together with predicates, '
                           'unless the dataset is partitioned by the column the predicate operates on.')
    if caching_enabled and shuffle_row_drop_partition[1] != 1:
        raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

    if worker_predicate:
        all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate,
                                                  shuffle_row_drop_partition)
    else:
        # Using hash of the dataset path with the relative path in order to:
        #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
        #  2. Dataset path is hashed, to make sure we don't create too long keys, which maybe incompatible
        #     with some cache implementations
        #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
        dataset_paths = self._dataset_path_or_paths
        path_str = ','.join(dataset_paths) if isinstance(dataset_paths, list) else dataset_paths
        path_digest = hashlib.md5(path_str.encode('utf-8')).hexdigest()
        cache_key = '{}:{}:{}'.format(path_digest, piece.path, piece_index)

        def load_rows():
            """Load the rows of this piece; passed to the cache as its second argument."""
            return self._load_rows(parquet_file, piece, shuffle_row_drop_partition)

        all_cols = self._local_cache.get(cache_key, load_rows)

    if all_cols:
        self.publish_func(all_cols)
Example #18
Source File: py_dict_reader_worker.py From petastorm with Apache License 2.0 | 4 votes |
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
    """Main worker function. Loads and returns all rows matching the predicate from a rowgroup

    Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
    columns needed by the predicate are loaded first. If no rows in the rowgroup matches the predicate
    criteria the rest of the columns are not loaded.

    :param piece_index: index into ``self._split_pieces`` selecting the piece to load.
    :param worker_predicate: when truthy, rows are loaded through ``self._load_rows_with_predicate``;
        otherwise they are fetched through ``self._local_cache``.
    :param shuffle_row_drop_partition: A tuple 2 of the current row drop partition and the total number
        of partitions.
    :return: None; a non-empty result is handed to ``self.publish_func``.
    :raises RuntimeError: if a local cache other than ``NullCache`` is combined with a predicate or
        with more than one row drop partition.
    """
    # The dataset object is created on first use and kept on the instance.
    if not self._dataset:
        self._dataset = pq.ParquetDataset(
            self._dataset_path,
            filesystem=self._filesystem,
            validate_schema=False)

    piece = self._split_pieces[piece_index]

    # Create pyarrow file system
    # NOTE(review): the handle returned by ``fs.open`` is not explicitly closed here -- confirm
    # whether it is released elsewhere.
    parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

    caching_enabled = not isinstance(self._local_cache, NullCache)
    if caching_enabled and worker_predicate:
        raise RuntimeError('Local cache is not supported together with predicates, '
                           'unless the dataset is partitioned by the column the predicate operates on.')
    if caching_enabled and shuffle_row_drop_partition[1] != 1:
        raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

    if worker_predicate:
        all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate,
                                                  shuffle_row_drop_partition)
    else:
        # Using hash of the dataset path with the relative path in order to:
        #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
        #  2. Dataset path is hashed, to make sure we don't create too long keys, which maybe incompatible
        #     with some cache implementations
        #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
        path_digest = hashlib.md5(self._dataset_path.encode('utf-8')).hexdigest()
        cache_key = '{}:{}:{}'.format(path_digest, piece.path, piece_index)

        def load_rows():
            """Load the rows of this piece; passed to the cache as its second argument."""
            return self._load_rows(parquet_file, piece, shuffle_row_drop_partition)

        all_cols = self._local_cache.get(cache_key, load_rows)

    # When an ngram is configured, the loaded rows are regrouped through it before publishing.
    if self._ngram:
        all_cols = self._ngram.form_ngram(data=all_cols, schema=self._schema)

    if all_cols:
        self.publish_func(all_cols)
Example #19
Source File: test_common_metadata.py From kartothek with MIT License | 4 votes |
def test_store_schema_metadata(store, df_all_types):
    """Store schema metadata for ``df_all_types`` and check the ``_common_metadata`` file.

    The file must appear under ``some_uuid/some_table/_common_metadata`` and, once
    metadata is stripped, its Arrow schema must equal the expected field list.
    """
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()

    actual_schema = pq.ParquetFile(store.open(key)).schema.to_arrow_schema()

    # (name, Arrow type) pairs in the order the fields are expected to appear.
    expected_columns = [
        ("array_float32", pa.list_(pa.float64())),
        ("array_float64", pa.list_(pa.float64())),
        ("array_int16", pa.list_(pa.int64())),
        ("array_int32", pa.list_(pa.int64())),
        ("array_int64", pa.list_(pa.int64())),
        ("array_int8", pa.list_(pa.int64())),
        ("array_uint16", pa.list_(pa.uint64())),
        ("array_uint32", pa.list_(pa.uint64())),
        ("array_uint64", pa.list_(pa.uint64())),
        ("array_uint8", pa.list_(pa.uint64())),
        ("array_unicode", pa.list_(pa.string())),
        ("bool", pa.bool_()),
        ("byte", pa.binary()),
        ("date", pa.date32()),
        ("datetime64", pa.timestamp("us")),
        ("float32", pa.float64()),
        ("float64", pa.float64()),
        ("int16", pa.int64()),
        ("int32", pa.int64()),
        ("int64", pa.int64()),
        ("int8", pa.int64()),
        ("null", pa.null()),
        ("uint16", pa.uint64()),
        ("uint32", pa.uint64()),
        ("uint64", pa.uint64()),
        ("uint8", pa.uint64()),
        ("unicode", pa.string()),
    ]
    expected_schema = pa.schema(
        [pa.field(column_name, arrow_type) for column_name, arrow_type in expected_columns]
    )

    assert actual_schema.remove_metadata() == expected_schema