Python pyarrow.parquet.ParquetDataset() Examples
The following are 15 code examples of pyarrow.parquet.ParquetDataset(). The original project and source file for each example are noted above it.
You may also want to check out all available functions and classes of the pyarrow.parquet module.
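For orientation before the examples, here is a minimal sketch of the legacy ParquetDataset API that most of them use. The dataset path and the 'id' column name below are hypothetical placeholders, not taken from any of the projects listed:

import pyarrow.parquet as pq

# Open a single Parquet file or a directory of Parquet files
# ('/tmp/example_dataset' is a hypothetical path).
dataset = pq.ParquetDataset('/tmp/example_dataset', validate_schema=False)

# Inspect the dataset pieces (one per file) and the combined Arrow schema.
print(dataset.pieces)
print(dataset.schema.to_arrow_schema())

# Read selected columns into an Arrow table and convert to pandas
# ('id' is a hypothetical column name).
table = dataset.read(columns=['id'], use_threads=True)
df = table.to_pandas()

The Petastorm examples below pass the same constructor an explicit filesystem= and validate_schema=False when reading datasets over HDFS.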
Example #1
Source File: dataset_metadata.py From petastorm with Apache License 2.0 | 6 votes |
def get_schema_from_dataset_url(dataset_url_or_urls, hdfs_driver='libhdfs3'):
    """Returns a :class:`petastorm.unischema.Unischema` object loaded from a dataset specified by a url.

    :param dataset_url_or_urls: a url to a parquet directory or a url list (with the same scheme) to parquet files.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: A :class:`petastorm.unischema.Unischema` object
    """
    fs, path_or_paths = get_filesystem_and_path_or_paths(dataset_url_or_urls, hdfs_driver)

    dataset = pq.ParquetDataset(path_or_paths, filesystem=fs, validate_schema=False, metadata_nthreads=10)

    # Get a unischema stored in the dataset metadata.
    stored_schema = get_schema(dataset)

    return stored_schema
Example #2
Source File: test_parquet_reader.py From petastorm with Apache License 2.0 | 6 votes |
def test_asymetric_parquet_pieces(reader_factory, tmpdir):
    """Check that all rows of a dataset whose parquet files have different numbers of row groups can be fully read."""
    url = 'file://' + tmpdir.strpath

    ROWS_COUNT = 1000
    # id_div_700 forces an asymmetric split between partitions and hopefully gets us files with
    # different numbers of row groups
    create_test_scalar_dataset(url, ROWS_COUNT, partition_by=['id_div_700'])

    # We verify we have pieces with different numbers of row groups
    dataset = pq.ParquetDataset(tmpdir.strpath)
    row_group_counts = set(compat_get_metadata(piece, dataset.fs.open).num_row_groups for piece in dataset.pieces)
    assert len(row_group_counts) > 1

    # Make sure we are not missing any rows.
    with reader_factory(url, schema_fields=['id']) as reader:
        row_ids_batched = [row.id for row in reader]
    actual_row_ids = list(itertools.chain(*row_ids_batched))
    assert ROWS_COUNT == len(actual_row_ids)
Example #3
Source File: test_generate_metadata.py From petastorm with Apache License 2.0 | 6 votes |
def test_regenerate_metadata(synthetic_dataset, tmpdir):
    a_moved_path = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, a_moved_path)

    # Make sure we can read dataset before
    _check_reader(a_moved_path)

    # Delete both metadata files
    dataset = pq.ParquetDataset(a_moved_path)
    os.remove(dataset.common_metadata_path)

    # make_reader should not be able to read a dataset without Petastorm metadata.
    with pytest.raises(RuntimeError, match='make_reader supports reading only Petastorm datasets'):
        _check_reader(a_moved_path)

    # Regenerate all metadata including unischema information
    petastorm_generate_metadata._main([
        '--dataset_url', 'file://{}'.format(a_moved_path),
        '--unischema_class', 'petastorm.tests.test_common.TestSchema',
    ])

    # Reader should now work again (row group selector will not since we removed all metadata)
    _check_reader(a_moved_path)
Example #4
Source File: test_parquet.py From recruit with Apache License 2.0 | 5 votes |
def test_partition_cols_supported(self, pa, df_full):
    # GH #23283
    partition_cols = ['bool', 'int']
    df = df_full
    with tm.ensure_clean_dir() as path:
        df.to_parquet(path, partition_cols=partition_cols, compression=None)
        import pyarrow.parquet as pq
        dataset = pq.ParquetDataset(path, validate_schema=False)
        assert len(dataset.partitions.partition_names) == 2
        assert dataset.partitions.partition_names == set(partition_cols)
Example #5
Source File: filesystem.py From mars with Apache License 2.0 | 5 votes |
def read_parquet(self, path, columns=None, metadata=None, schema=None,
                 use_threads=True, use_pandas_metadata=False):
    """
    Read Parquet data from path in file system. Can read from a
    single file or a directory of files

    Parameters
    ----------
    path : str
        Single file path or directory
    columns : List[str], optional
        Subset of columns to read
    metadata : pyarrow.parquet.FileMetaData
        Known metadata to validate files against
    schema : pyarrow.parquet.Schema
        Known schema to validate files against. Alternative to metadata argument
    use_threads : boolean, default True
        Perform multi-threaded column reads
    use_pandas_metadata : boolean, default False
        If True and file has custom pandas schema metadata, ensure that
        index columns are also loaded

    Returns
    -------
    table : pyarrow.Table
    """
    from pyarrow.parquet import ParquetDataset

    dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                             filesystem=self)
    return dataset.read(columns=columns, use_threads=use_threads,
                        use_pandas_metadata=use_pandas_metadata)
Example #6
Source File: reader.py From petastorm with Apache License 2.0 | 5 votes |
def _filter_row_groups(self, dataset, row_groups, predicate, rowgroup_selector, cur_shard,
                       shard_count):
    """Calculates which rowgroups will be read.

    The following filters are applied:
    - predicates;
    - row-group selector (our indexing mechanism);
    - training partition

    :param dataset: ParquetDataset instance
    :param row_groups: a list of row groups (a list of ParquetDatasetPiece objects)
    :param predicate: instance of predicate object to filter rows to be returned by reader.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param cur_shard: An int denoting the current shard number used. Each node should pass in a unique
        partition number in the range [0, shard_count).
    :param shard_count: An int denoting the number of reader shards
    :return: (filtered_row_group_indexes, worker_predicate): filtered_row_group_indexes is an integer index
        into the row_groups array. worker_predicate contains only predicates that could not be resolved on the
        partitioned fields and need to be evaluated by workers.
    """
    filtered_row_group_indexes, worker_predicate = \
        self._apply_predicate_to_row_groups(dataset, row_groups, predicate)

    if rowgroup_selector:
        filtered_row_group_indexes = self._apply_row_group_selector(dataset, rowgroup_selector,
                                                                    filtered_row_group_indexes)

    if cur_shard is not None or shard_count is not None:
        filtered_row_group_indexes = self._partition_row_groups(dataset, row_groups, shard_count,
                                                                cur_shard, filtered_row_group_indexes)

    if not filtered_row_group_indexes:
        warnings.warn('No matching data is available for loading after rowgroup '
                      'selector were applied and the data was sharded.')

    return filtered_row_group_indexes, worker_predicate
Example #7
Source File: dataset_metadata.py From petastorm with Apache License 2.0 | 5 votes |
def _generate_unischema_metadata(dataset, schema):
    """
    Generates the serialized unischema and adds it to the dataset parquet metadata to be used upon reading.
    :param dataset: (ParquetDataset) Dataset to attach schema
    :param schema: (Unischema) Schema to attach to dataset
    :return: None
    """
    # TODO(robbieg): Simply pickling unischema will break if the UnischemaField class is changed,
    # or the codec classes are changed. We likely need something more robust.
    assert schema
    serialized_schema = pickle.dumps(schema)
    utils.add_to_dataset_metadata(dataset, UNISCHEMA_KEY, serialized_schema)
Example #8
Source File: dataset_metadata.py From petastorm with Apache License 2.0 | 5 votes |
def _generate_num_row_groups_per_file(dataset, spark_context, filesystem_factory):
    """
    Generates the metadata file containing the number of row groups in each file for the parquet dataset located
    at the dataset_url. It does this in spark by opening all parquet files in the dataset on the executors and
    collecting the number of row groups in each file back on the driver.
    :param dataset: :class:`pyarrow.parquet.ParquetDataset`
    :param spark_context: spark context to use for retrieving the number of row groups in each parquet file in
        parallel
    :return: None, upon successful completion the metadata file will exist.
    """
    if not isinstance(dataset.paths, str):
        raise ValueError('Expected dataset.paths to be a single path, not a list of paths')

    # Get the common prefix of all the base path in order to retrieve a relative path
    paths = [piece.path for piece in dataset.pieces]

    # Needed pieces from the dataset must be extracted for spark because the dataset object is not serializable
    base_path = dataset.paths

    def get_row_group_info(path):
        fs = filesystem_factory()
        relative_path = os.path.relpath(path, base_path)
        pq_file = fs.open(path)
        num_row_groups = pq.read_metadata(pq_file).num_row_groups
        pq_file.close()
        return relative_path, num_row_groups

    row_groups = spark_context.parallelize(paths, len(paths)) \
        .map(get_row_group_info) \
        .collect()

    num_row_groups_str = json.dumps(dict(row_groups))
    # Add the dict for the number of row groups in each file to the parquet file metadata footer
    utils.add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY, num_row_groups_str)
Example #9
Source File: test_reader.py From petastorm with Apache License 2.0 | 5 votes |
def test_normalize_shuffle_partitions(synthetic_dataset):
    dataset = pq.ParquetDataset(synthetic_dataset.path)
    row_drop_partitions = Reader._normalize_shuffle_options(2, dataset)
    assert row_drop_partitions == 2

    row_drop_partitions = Reader._normalize_shuffle_options(1000, dataset)
    assert row_drop_partitions == 10
Example #10
Source File: test_parquet.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_partition_cols_supported(self, pa, df_full):
    # GH #23283
    partition_cols = ['bool', 'int']
    df = df_full
    with tm.ensure_clean_dir() as path:
        df.to_parquet(path, partition_cols=partition_cols, compression=None)
        import pyarrow.parquet as pq
        dataset = pq.ParquetDataset(path, validate_schema=False)
        assert len(dataset.partitions.partition_names) == 2
        assert dataset.partitions.partition_names == set(partition_cols)
Example #11
Source File: parquet_pio.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def parquet_file_schema(file_name):
    import pyarrow.parquet as pq
    col_names = []
    col_types = []

    pq_dataset = pq.ParquetDataset(file_name)
    col_names = pq_dataset.schema.names
    pa_schema = pq_dataset.schema.to_arrow_schema()

    col_types = [_get_numba_typ_from_pa_typ(pa_schema.field_by_name(c).type)
                 for c in col_names]
    # TODO: close file?
    return col_names, col_types
Example #12
Source File: io_exp.py From modin with Apache License 2.0 | 5 votes |
def _read_parquet_columns(path, columns, num_splits, kwargs):  # pragma: no cover
    """Use a Ray task to read columns from Parquet into a Pandas DataFrame.

    Note: Ray functions are not detected by codecov (thus pragma: no cover)

    Args:
        path: The path of the Parquet file.
        columns: The list of column names to read.
        num_splits: The number of partitions to split the column into.

    Returns:
        A list containing the split Pandas DataFrames and the Index as the last
        element. If there is not `index_col` set, then we just return the length.
        This is used to determine the total length of the DataFrame to build a
        default Index.
    """
    import pyarrow.parquet as pq

    df = (
        pq.ParquetDataset(path, **kwargs)
        .read(columns=columns, use_pandas_metadata=True)
        .to_pandas()
    )
    df = df[columns]
    # Append the length of the index here to build it externally
    return _split_result_for_readers(0, num_splits, df) + [len(df.index)]
Example #13
Source File: arrow_reader_worker.py From petastorm with Apache License 2.0 | 4 votes |
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
    """Main worker function. Loads and returns all rows matching the predicate from a rowgroup.

    Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
    columns needed by the predicate are loaded first. If no rows in the rowgroup match the predicate
    criteria, the rest of the columns are not loaded.

    :param piece_index:
    :param shuffle_row_drop_partition: A 2-tuple of the current row drop partition and the total number of
        partitions.
    :return:
    """
    if not self._dataset:
        self._dataset = pq.ParquetDataset(
            self._dataset_path_or_paths,
            filesystem=self._filesystem,
            validate_schema=False)

        if self._dataset.partitions is None:
            # When reading from a parquet file list, `dataset.partitions` will be None,
            # but other petastorm code requires at least an empty `ParquetPartitions` object.
            self._dataset.partitions = pq.ParquetPartitions()

    piece = self._split_pieces[piece_index]

    # Create pyarrow file system
    parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

    if not isinstance(self._local_cache, NullCache):
        if worker_predicate:
            raise RuntimeError('Local cache is not supported together with predicates, '
                               'unless the dataset is partitioned by the column the predicate operates on.')
        if shuffle_row_drop_partition[1] != 1:
            raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

    if worker_predicate:
        all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate, shuffle_row_drop_partition)
    else:
        # Using hash of the dataset path with the relative path in order to:
        #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
        #  2. Dataset path is hashed, to make sure we don't create too long keys, which may be incompatible with
        #     some cache implementations
        #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
        if isinstance(self._dataset_path_or_paths, list):
            path_str = ','.join(self._dataset_path_or_paths)
        else:
            path_str = self._dataset_path_or_paths
        cache_key = '{}:{}:{}'.format(hashlib.md5(path_str.encode('utf-8')).hexdigest(),
                                      piece.path, piece_index)
        all_cols = self._local_cache.get(cache_key,
                                         lambda: self._load_rows(parquet_file, piece, shuffle_row_drop_partition))

    if all_cols:
        self.publish_func(all_cols)
Example #14
Source File: py_dict_reader_worker.py From petastorm with Apache License 2.0 | 4 votes |
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
    """Main worker function. Loads and returns all rows matching the predicate from a rowgroup.

    Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
    columns needed by the predicate are loaded first. If no rows in the rowgroup match the predicate
    criteria, the rest of the columns are not loaded.

    :param piece_index:
    :param shuffle_row_drop_partition: A 2-tuple of the current row drop partition and the total number of
        partitions.
    :return:
    """
    if not self._dataset:
        self._dataset = pq.ParquetDataset(
            self._dataset_path,
            filesystem=self._filesystem,
            validate_schema=False)

    piece = self._split_pieces[piece_index]

    # Create pyarrow file system
    parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

    if not isinstance(self._local_cache, NullCache):
        if worker_predicate:
            raise RuntimeError('Local cache is not supported together with predicates, '
                               'unless the dataset is partitioned by the column the predicate operates on.')
        if shuffle_row_drop_partition[1] != 1:
            raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

    if worker_predicate:
        all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate, shuffle_row_drop_partition)
    else:
        # Using hash of the dataset path with the relative path in order to:
        #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
        #  2. Dataset path is hashed, to make sure we don't create too long keys, which may be incompatible with
        #     some cache implementations
        #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
        cache_key = '{}:{}:{}'.format(hashlib.md5(self._dataset_path.encode('utf-8')).hexdigest(),
                                      piece.path, piece_index)
        all_cols = self._local_cache.get(cache_key,
                                         lambda: self._load_rows(parquet_file, piece, shuffle_row_drop_partition))

    if self._ngram:
        all_cols = self._ngram.form_ngram(data=all_cols, schema=self._schema)

    if all_cols:
        self.publish_func(all_cols)
Example #15
Source File: rowgroup_indexing.py From petastorm with Apache License 2.0 | 4 votes |
def build_rowgroup_index(dataset_url, spark_context, indexers, hdfs_driver='libhdfs3'):
    """
    Build an index for a given list of fields to use for fast rowgroup selection.
    :param dataset_url: (str) the url for the dataset (or a path if you would like to use the default hdfs config)
    :param spark_context: (SparkContext)
    :param indexers: list of objects to build row group indexes. Should support the RowGroupIndexerBase interface
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None, upon successful completion the rowgroup predicates will be saved to the _metadata file
    """
    if dataset_url and dataset_url[-1] == '/':
        dataset_url = dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(dataset_url, spark_context._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver, user=spark_context.sparkUser())
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    split_pieces = dataset_metadata.load_row_groups(dataset)
    schema = dataset_metadata.get_schema(dataset)

    # We need a direct reference to the partitions object
    partitions = dataset.partitions
    pieces_num = len(split_pieces)
    piece_info_list = []
    for piece_index in range(pieces_num):
        # Indexes rely on the ordering of the split dataset pieces. This ordering depends on how the dataset
        # pieces are split and sorted; although it should not change, it still might, and that could break this.
        piece = split_pieces[piece_index]
        piece_info_list.append(PieceInfo(piece_index, piece.path, piece.row_group, piece.partition_keys))

    start_time = time.time()
    piece_info_rdd = spark_context.parallelize(piece_info_list, min(len(piece_info_list), PARALLEL_SLICE_NUM))
    indexer_rdd = piece_info_rdd.map(lambda piece_info: _index_columns(piece_info, dataset_url, partitions,
                                                                       indexers, schema, hdfs_driver=hdfs_driver))
    indexer_list = indexer_rdd.reduce(_combine_indexers)

    indexer_dict = {indexer.index_name: indexer for indexer in indexer_list}
    serialized_indexers = pickle.dumps(indexer_dict, pickle.HIGHEST_PROTOCOL)
    utils.add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY, serialized_indexers)
    logger.info("Elapsed time of index creation: %f s", (time.time() - start_time))