Python pyarrow.parquet.ParquetDataset() Examples

The following are 15 code examples of pyarrow.parquet.ParquetDataset(). Each example notes the source file, the project it comes from, and that project's license, so you can follow up with the original project or source file for more context.
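As a minimal sketch of the API the examples below revolve around (the path is hypothetical and the legacy ParquetDataset interface is assumed):

import pyarrow.parquet as pq

# Open a directory of parquet files as a single logical dataset (hypothetical path).
dataset = pq.ParquetDataset('/tmp/example_dataset')

# Read the whole dataset into a pyarrow.Table and convert it to pandas.
table = dataset.read()
df = table.to_pandas()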
Example #1
Source File: dataset_metadata.py    From petastorm with Apache License 2.0
def get_schema_from_dataset_url(dataset_url_or_urls, hdfs_driver='libhdfs3'):
    """Returns a :class:`petastorm.unischema.Unischema` object loaded from a dataset specified by a url.

    :param dataset_url_or_urls: a url to a parquet directory, or a list of urls (all with the same scheme) pointing to parquet files.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: A :class:`petastorm.unischema.Unischema` object
    """
    fs, path_or_paths = get_filesystem_and_path_or_paths(dataset_url_or_urls, hdfs_driver)

    dataset = pq.ParquetDataset(path_or_paths, filesystem=fs, validate_schema=False, metadata_nthreads=10)

    # Get a unischema stored in the dataset metadata.
    stored_schema = get_schema(dataset)

    return stored_schema 
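A hedged usage sketch of the function above, assuming the module path petastorm.etl.dataset_metadata and a dataset that was written by petastorm (the URL is hypothetical):

from petastorm.etl.dataset_metadata import get_schema_from_dataset_url

# Load the Unischema stored in the dataset's parquet metadata (hypothetical URL).
schema = get_schema_from_dataset_url('file:///tmp/my_petastorm_dataset')
print(list(schema.fields.keys()))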
Example #2
Source File: test_parquet_reader.py    From petastorm with Apache License 2.0
def test_asymetric_parquet_pieces(reader_factory, tmpdir):
    """Check that datasets with parquet files that all rows in datasets that have different number of rowgroups can
    be fully read """
    url = 'file://' + tmpdir.strpath

    ROWS_COUNT = 1000
    # id_div_700 forces an asymmetric split between partitions and hopefully gets us files with
    # different numbers of row groups
    create_test_scalar_dataset(url, ROWS_COUNT, partition_by=['id_div_700'])

    # Verify that we have pieces with different numbers of row groups
    dataset = pq.ParquetDataset(tmpdir.strpath)
    row_group_counts = set(compat_get_metadata(piece, dataset.fs.open).num_row_groups for piece in dataset.pieces)
    assert len(row_group_counts) > 1

    # Make sure we are not missing any rows.
    with reader_factory(url, schema_fields=['id']) as reader:
        row_ids_batched = [row.id for row in reader]
        actual_row_ids = list(itertools.chain(*row_ids_batched))

    assert ROWS_COUNT == len(actual_row_ids) 
Example #3
Source File: test_generate_metadata.py    From petastorm with Apache License 2.0
def test_regenerate_metadata(synthetic_dataset, tmpdir):
    a_moved_path = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, a_moved_path)

    # Make sure we can read dataset before
    _check_reader(a_moved_path)

    # Delete the common metadata file
    dataset = pq.ParquetDataset(a_moved_path)
    os.remove(dataset.common_metadata_path)

    # make_reader should not be able to read a dataset without Petastorm metadata.
    with pytest.raises(RuntimeError, match='make_reader supports reading only Petastorm datasets'):
        _check_reader(a_moved_path)

    # Regenerate all metadata including unischema information
    petastorm_generate_metadata._main([
        '--dataset_url', 'file://{}'.format(a_moved_path),
        '--unischema_class', 'petastorm.tests.test_common.TestSchema',
    ])

    # Reader should now work again (row group selector will not since we removed all metadata)
    _check_reader(a_moved_path) 
Example #4
Source File: test_parquet.py    From recruit with Apache License 2.0
def test_partition_cols_supported(self, pa, df_full):
        # GH #23283
        partition_cols = ['bool', 'int']
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, partition_cols=partition_cols,
                          compression=None)
            import pyarrow.parquet as pq
            dataset = pq.ParquetDataset(path, validate_schema=False)
            assert len(dataset.partitions.partition_names) == 2
            assert dataset.partitions.partition_names == set(partition_cols) 
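For context, a hedged sketch of producing a partitioned dataset directly with pyarrow and inspecting the discovered partitions, assuming the legacy ParquetDataset behavior these tests target (paths and values are illustrative):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'bool': [True, False], 'int': [1, 2], 'value': [0.1, 0.2]})
table = pa.Table.from_pandas(df)

# Write a hive-style partitioned dataset (illustrative root path).
pq.write_to_dataset(table, root_path='/tmp/partitioned_example',
                    partition_cols=['bool', 'int'])

dataset = pq.ParquetDataset('/tmp/partitioned_example', validate_schema=False)
print(dataset.partitions.partition_names)  # expected: {'bool', 'int'}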
Example #5
Source File: filesystem.py    From mars with Apache License 2.0
def read_parquet(self, path, columns=None, metadata=None, schema=None,
                         use_threads=True, use_pandas_metadata=False):
            """
            Read Parquet data from path in file system. Can read from a single file
            or a directory of files

            Parameters
            ----------
            path : str
                Single file path or directory
            columns : List[str], optional
                Subset of columns to read
            metadata : pyarrow.parquet.FileMetaData
                Known metadata to validate files against
            schema : pyarrow.parquet.Schema
                Known schema to validate files against. Alternative to metadata
                argument
            use_threads : boolean, default True
                Perform multi-threaded column reads
            use_pandas_metadata : boolean, default False
                If True and file has custom pandas schema metadata, ensure that
                index columns are also loaded

            Returns
            -------
            table : pyarrow.Table
            """
            from pyarrow.parquet import ParquetDataset
            dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                                     filesystem=self)
            return dataset.read(columns=columns, use_threads=use_threads,
                                use_pandas_metadata=use_pandas_metadata) 
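The same column-subset read can be sketched against ParquetDataset directly (path and column names are hypothetical):

from pyarrow.parquet import ParquetDataset

dataset = ParquetDataset('/data/events')  # hypothetical directory of parquet files
table = dataset.read(columns=['user_id', 'timestamp'],
                     use_threads=True,
                     use_pandas_metadata=False)
print(table.num_rows, table.schema)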
Example #6
Source File: reader.py    From petastorm with Apache License 2.0
def _filter_row_groups(self, dataset, row_groups, predicate, rowgroup_selector, cur_shard,
                           shard_count):
        """Calculates which rowgroups will be read during.

        The following filters are applied:
        - predicates;
        - row-group selector (our indexing mechanism);
        - training partition

        :param dataset: ParquetDataset instance
        :param row_groups: a list of row groups (a list of ParquetDatasetPiece objects)
        :param predicate: instance of predicate object to filter rows to be returned by reader.
        :param rowgroup_selector: instance of row group selector object to select row groups to be read
        :param cur_shard: An int denoting the current shard number used. Each node should
                       pass in a unique partition number in the range [0, shard_count).
        :param shard_count: An int denoting the number of reader shards
        :return: (filtered_row_group_indexes, worker_predicate): filtered_row_group_indexes is a list of integer
            indexes into the row_groups array; worker_predicate contains only the predicates that could not be
            resolved on the partitioned fields and need to be evaluated by the workers.
        """

        filtered_row_group_indexes, worker_predicate = \
            self._apply_predicate_to_row_groups(dataset, row_groups, predicate)

        if rowgroup_selector:
            filtered_row_group_indexes = self._apply_row_group_selector(dataset, rowgroup_selector,
                                                                        filtered_row_group_indexes)

        if cur_shard is not None or shard_count is not None:
            filtered_row_group_indexes = self._partition_row_groups(dataset, row_groups, shard_count,
                                                                    cur_shard,
                                                                    filtered_row_group_indexes)

        if not filtered_row_group_indexes:
            warnings.warn('No matching data is available for loading after the rowgroup '
                          'selector was applied and the data was sharded.')

        return filtered_row_group_indexes, worker_predicate 
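The sharding step itself is not shown above; as a hedged illustration only (not petastorm's actual _partition_row_groups implementation), one simple scheme keeps every shard_count-th filtered index for the current shard:

def shard_row_group_indexes(filtered_row_group_indexes, cur_shard, shard_count):
    """Illustrative sharding only: keep every shard_count-th index for this shard."""
    if cur_shard is None or shard_count is None:
        return filtered_row_group_indexes
    return [index for position, index in enumerate(filtered_row_group_indexes)
            if position % shard_count == cur_shard]

# e.g. shard_row_group_indexes([0, 1, 2, 3, 4], cur_shard=1, shard_count=2) returns [1, 3]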
Example #7
Source File: dataset_metadata.py    From petastorm with Apache License 2.0
def _generate_unischema_metadata(dataset, schema):
    """
    Generates the serialized unischema and adds it to the dataset parquet metadata to be used upon reading.
    :param dataset: (ParquetDataset) Dataset to attach schema
    :param schema:  (Unischema) Schema to attach to dataset
    :return: None
    """
    # TODO(robbieg): Simply pickling unischema will break if the UnischemaField class is changed,
    #  or the codec classes are changed. We likely need something more robust.
    assert schema
    serialized_schema = pickle.dumps(schema)
    utils.add_to_dataset_metadata(dataset, UNISCHEMA_KEY, serialized_schema) 
Example #8
Source File: dataset_metadata.py    From petastorm with Apache License 2.0
def _generate_num_row_groups_per_file(dataset, spark_context, filesystem_factory):
    """
    Generates the metadata file containing the number of row groups in each file
    for the parquet dataset located at the dataset_url. It does this in spark by
    opening all parquet files in the dataset on the executors and collecting the
    number of row groups in each file back on the driver.
    :param dataset: :class:`pyarrow.parquet.ParquetDataset`
    :param spark_context: spark context to use for retrieving the number of row groups
    in each parquet file in parallel
    :param filesystem_factory: a factory function returning a filesystem object, used on the executors
    to open the parquet files
    :return: None, upon successful completion the metadata file will exist.
    """
    if not isinstance(dataset.paths, str):
        raise ValueError('Expected dataset.paths to be a single path, not a list of paths')

    # Collect the path of each piece; paths will later be made relative to the dataset base path
    paths = [piece.path for piece in dataset.pieces]

    # Needed pieces from the dataset must be extracted for spark because the dataset object is not serializable
    base_path = dataset.paths

    def get_row_group_info(path):
        fs = filesystem_factory()
        relative_path = os.path.relpath(path, base_path)
        pq_file = fs.open(path)
        num_row_groups = pq.read_metadata(pq_file).num_row_groups
        pq_file.close()
        return relative_path, num_row_groups

    row_groups = spark_context.parallelize(paths, len(paths)) \
        .map(get_row_group_info) \
        .collect()
    num_row_groups_str = json.dumps(dict(row_groups))
    # Add the dict for the number of row groups in each file to the parquet file metadata footer
    utils.add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY, num_row_groups_str) 
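Without Spark, the same per-file row-group counts can be sketched locally by walking the dataset pieces (legacy ParquetDataset attributes such as .pieces and .fs are assumed, as in the examples above):

import os
import pyarrow.parquet as pq

def count_row_groups_per_file(dataset_path):
    """Return {relative_file_path: num_row_groups} for a local parquet dataset (illustrative helper)."""
    dataset = pq.ParquetDataset(dataset_path)
    counts = {}
    for piece in dataset.pieces:
        with dataset.fs.open(piece.path) as pq_file:
            metadata = pq.read_metadata(pq_file)
        counts[os.path.relpath(piece.path, dataset_path)] = metadata.num_row_groups
    return counts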
Example #9
Source File: test_reader.py    From petastorm with Apache License 2.0
def test_normalize_shuffle_partitions(synthetic_dataset):
    dataset = pq.ParquetDataset(synthetic_dataset.path)
    row_drop_partitions = Reader._normalize_shuffle_options(2, dataset)
    assert row_drop_partitions == 2

    row_drop_partitions = Reader._normalize_shuffle_options(1000, dataset)
    assert row_drop_partitions == 10 
Example #10
Source File: test_parquet.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_partition_cols_supported(self, pa, df_full):
        # GH #23283
        partition_cols = ['bool', 'int']
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, partition_cols=partition_cols,
                          compression=None)
            import pyarrow.parquet as pq
            dataset = pq.ParquetDataset(path, validate_schema=False)
            assert len(dataset.partitions.partition_names) == 2
            assert dataset.partitions.partition_names == set(partition_cols) 
Example #11
Source File: parquet_pio.py    From sdc with BSD 2-Clause "Simplified" License
def parquet_file_schema(file_name):
    import pyarrow.parquet as pq
    col_names = []
    col_types = []

    pq_dataset = pq.ParquetDataset(file_name)
    col_names = pq_dataset.schema.names
    pa_schema = pq_dataset.schema.to_arrow_schema()

    col_types = [_get_numba_typ_from_pa_typ(pa_schema.field_by_name(c).type)
                 for c in col_names]
    # TODO: close file?
    return col_names, col_types 
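A hedged interactive sketch of the same schema inspection (the file path is hypothetical; the legacy ParquetSchema API used above is assumed):

import pyarrow.parquet as pq

pq_dataset = pq.ParquetDataset('/tmp/example.parquet')  # hypothetical path
arrow_schema = pq_dataset.schema.to_arrow_schema()

for name in pq_dataset.schema.names:
    print(name, arrow_schema.field(name).type)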
Example #12
Source File: io_exp.py    From modin with Apache License 2.0
def _read_parquet_columns(path, columns, num_splits, kwargs):  # pragma: no cover
    """Use a Ray task to read columns from Parquet into a Pandas DataFrame.

    Note: Ray functions are not detected by codecov (thus pragma: no cover)

    Args:
        path: The path of the Parquet file.
        columns: The list of column names to read.
        num_splits: The number of partitions to split the column into.

    Returns:
            A list containing the split Pandas DataFrames and the Index as the last
            element. If there is no `index_col` set, then we just return the length.
            This is used to determine the total length of the DataFrame to build a
            default Index.
    """
    import pyarrow.parquet as pq

    df = (
        pq.ParquetDataset(path, **kwargs)
        .read(columns=columns, use_pandas_metadata=True)
        .to_pandas()
    )
    df = df[columns]
    # Append the length of the index here to build it externally
    return _split_result_for_readers(0, num_splits, df) + [len(df.index)] 
Example #13
Source File: arrow_reader_worker.py    From petastorm with Apache License 2.0
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
        """Main worker function. Loads and returns all rows matching the predicate from a rowgroup

        Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
        columns needed by the predicate are loaded first. If no rows in the rowgroup match the predicate criteria,
        the rest of the columns are not loaded.

        :param piece_index: index of the requested piece (row group) in the list of split pieces
        :param worker_predicate: predicate to be evaluated by the worker on the loaded rows (may be None)
        :param shuffle_row_drop_partition: a 2-tuple of the current row drop partition and the total number
            of partitions.
        :return:
        """

        if not self._dataset:
            self._dataset = pq.ParquetDataset(
                self._dataset_path_or_paths,
                filesystem=self._filesystem,
                validate_schema=False)

        if self._dataset.partitions is None:
            # When read from parquet file list, the `dataset.partitions` will be None.
            # But other petastorm code require at least an empty `ParquetPartitions` object.
            self._dataset.partitions = pq.ParquetPartitions()

        piece = self._split_pieces[piece_index]

        # Open the piece through the dataset's filesystem and wrap it in a ParquetFile
        parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

        if not isinstance(self._local_cache, NullCache):
            if worker_predicate:
                raise RuntimeError('Local cache is not supported together with predicates, '
                                   'unless the dataset is partitioned by the column the predicate operates on.')
            if shuffle_row_drop_partition[1] != 1:
                raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

        if worker_predicate:
            all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate, shuffle_row_drop_partition)
        else:
            # Using hash of the dataset path with the relative path in order to:
            #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
            #  2. Dataset path is hashed, to make sure we don't create too long keys, which may be incompatible with
            #     some cache implementations
            #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
            if isinstance(self._dataset_path_or_paths, list):
                path_str = ','.join(self._dataset_path_or_paths)
            else:
                path_str = self._dataset_path_or_paths
            cache_key = '{}:{}:{}'.format(hashlib.md5(path_str.encode('utf-8')).hexdigest(),
                                          piece.path, piece_index)
            all_cols = self._local_cache.get(cache_key,
                                             lambda: self._load_rows(parquet_file, piece, shuffle_row_drop_partition))

        if all_cols:
            self.publish_func(all_cols) 
Example #14
Source File: py_dict_reader_worker.py    From petastorm with Apache License 2.0
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
        """Main worker function. Loads and returns all rows matching the predicate from a rowgroup

        Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
        columns needed by the predicate are loaded first. If no rows in the rowgroup match the predicate criteria,
        the rest of the columns are not loaded.

        :param piece_index: index of the requested piece (row group) in the list of split pieces
        :param worker_predicate: predicate to be evaluated by the worker on the loaded rows (may be None)
        :param shuffle_row_drop_partition: a 2-tuple of the current row drop partition and the total number
            of partitions.
        :return:
        """

        if not self._dataset:
            self._dataset = pq.ParquetDataset(
                self._dataset_path,
                filesystem=self._filesystem,
                validate_schema=False)

        piece = self._split_pieces[piece_index]

        # Open the piece through the dataset's filesystem and wrap it in a ParquetFile
        parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

        if not isinstance(self._local_cache, NullCache):
            if worker_predicate:
                raise RuntimeError('Local cache is not supported together with predicates, '
                                   'unless the dataset is partitioned by the column the predicate operates on.')
            if shuffle_row_drop_partition[1] != 1:
                raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

        if worker_predicate:
            all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate, shuffle_row_drop_partition)
        else:
            # Using hash of the dataset path with the relative path in order to:
            #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
            #  2. Dataset path is hashed, to make sure we don't create too long keys, which may be incompatible with
            #     some cache implementations
            #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
            cache_key = '{}:{}:{}'.format(hashlib.md5(self._dataset_path.encode('utf-8')).hexdigest(),
                                          piece.path, piece_index)
            all_cols = self._local_cache.get(cache_key,
                                             lambda: self._load_rows(parquet_file, piece, shuffle_row_drop_partition))

        if self._ngram:
            all_cols = self._ngram.form_ngram(data=all_cols, schema=self._schema)

        if all_cols:
            self.publish_func(all_cols) 
Example #15
Source File: rowgroup_indexing.py    From petastorm with Apache License 2.0
def build_rowgroup_index(dataset_url, spark_context, indexers, hdfs_driver='libhdfs3'):
    """
    Builds an index for the given list of fields, used for fast rowgroup selection.
    :param dataset_url: (str) the url for the dataset (or a path if you would like to use the default hdfs config)
    :param spark_context: (SparkContext)
    :param indexers: list of objects to build row groups indexes. Should support RowGroupIndexerBase interface
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
    libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None, upon successful completion the rowgroup predicates will be saved to _metadata file
    """

    if dataset_url and dataset_url[-1] == '/':
        dataset_url = dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(dataset_url, spark_context._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver, user=spark_context.sparkUser())
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    split_pieces = dataset_metadata.load_row_groups(dataset)
    schema = dataset_metadata.get_schema(dataset)

    # We need direct reference on partitions object
    partitions = dataset.partitions
    pieces_num = len(split_pieces)
    piece_info_list = []
    for piece_index in range(pieces_num):
        # Indexes rely on the ordering of the split dataset pieces. That ordering depends on how the
        # pieces are split and sorted; it should not change, but if it ever does, this code would break.
        piece = split_pieces[piece_index]
        piece_info_list.append(PieceInfo(piece_index, piece.path, piece.row_group, piece.partition_keys))

    start_time = time.time()
    piece_info_rdd = spark_context.parallelize(piece_info_list, min(len(piece_info_list), PARALLEL_SLICE_NUM))
    indexer_rdd = piece_info_rdd.map(lambda piece_info: _index_columns(piece_info, dataset_url, partitions,
                                                                       indexers, schema, hdfs_driver=hdfs_driver))
    indexer_list = indexer_rdd.reduce(_combine_indexers)

    indexer_dict = {indexer.index_name: indexer for indexer in indexer_list}
    serialized_indexers = pickle.dumps(indexer_dict, pickle.HIGHEST_PROTOCOL)
    utils.add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY, serialized_indexers)
    logger.info("Elapsed time of index creation: %f s", (time.time() - start_time))