org.apache.parquet.filter2.compat.FilterCompat.Filter Java Examples
The following examples show how to use
org.apache.parquet.filter2.compat.FilterCompat.Filter.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ParquetReader.java From tajo with Apache License 2.0 | 6 votes |
/**
 * Builds a reader for {@code file}: stores the collaborators, reads the footers of the
 * files listed under the path, and adds up the row counts of all their blocks.
 *
 * @param conf        configuration used to resolve the file system and to read the footers
 * @param file        path whose status listing (filtered by {@code HiddenFileFilter.INSTANCE}) is read
 * @param readSupport read support kept for later use by this reader
 * @param filter      record filter; passed through {@code checkNotNull}
 * @throws IOException if the file system lookup, the listing, or the footer read fails
 */
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fileSystem = file.getFileSystem(conf);
  List<FileStatus> listedFiles = Arrays.asList(fileSystem.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> allFooters =
      ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, listedFiles, false);
  this.footersIterator = allFooters.iterator();

  // Accumulate the number of rows across every block of every footer.
  for (Footer currentFooter : allFooters) {
    List<BlockMetaData> footerBlocks = currentFooter.getParquetMetadata().getBlocks();
    for (BlockMetaData footerBlock : footerBlocks) {
      totalRowCount += footerBlock.getRowCount();
    }
  }
}
Example #2
Source File: PhoneBookWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Reads all records of the given file that the reader returns for {@code filter}.
 *
 * <p>The reader is opened in a try-with-resources statement so that it is closed
 * whether reading finishes normally or throws.
 *
 * @param f      the file to read
 * @param filter the filter handed to the reader
 * @return the records in the order the reader produced them; empty if there are none
 * @throws IOException if creating, reading from, or closing the reader fails
 */
public static List<Group> readFile(File f, Filter filter) throws IOException {
  List<Group> users = new ArrayList<>();
  try (ParquetReader<Group> reader = createReader(new Path(f.getAbsolutePath()), filter)) {
    // read() returning null marks the end of the data.
    for (Group current = reader.read(); current != null; current = reader.read()) {
      users.add(current);
    }
  }
  return users;
}
Example #3
Source File: PhoneBookWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Creates a {@code Group} reader for {@code file} using a fresh configuration on which
 * the class-level {@code schema} has been registered through {@code GroupWriteSupport}.
 *
 * @param file   the file to open
 * @param filter the filter applied by the reader
 * @return the built reader
 * @throws IOException if building the reader fails
 */
private static ParquetReader<Group> createReader(Path file, Filter filter) throws IOException {
  Configuration readerConf = new Configuration();
  GroupWriteSupport.setSchema(schema, readerConf);

  return ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(readerConf)
      .withFilter(filter)
      .build();
}
Example #4
Source File: TestColumnIndexFiltering.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Reads users from {@code file} with the given filter, restricting the read schema to
 * {@code schema} via {@code ReadSupport.PARQUET_READ_SCHEMA}.
 *
 * @param filter               the filter handed to the reader builder
 * @param schema               the schema whose string form is set as the read schema
 * @param useOtherFiltering    toggles the dictionary, stats and record filters together
 * @param useColumnIndexFilter toggles the column index filter
 * @return the users produced by {@code PhoneBookWriter.readUsers}
 * @throws IOException if reading fails
 */
private List<User> readUsersWithProjection(Filter filter, MessageType schema, boolean useOtherFiltering,
                                           boolean useColumnIndexFilter) throws IOException {
  return PhoneBookWriter.readUsers(
      ParquetReader.builder(new GroupReadSupport(), file)
          .withFilter(filter)
          .useDictionaryFilter(useOtherFiltering)
          .useStatsFilter(useOtherFiltering)
          .useRecordFilter(useOtherFiltering)
          .useColumnIndexFilter(useColumnIndexFilter)
          .set(ReadSupport.PARQUET_READ_SCHEMA, schema.toString()));
}
Example #5
Source File: TestColumnIndexFiltering.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Reads users from {@code file} with the given filter and filtering switches.
 *
 * @param filter               the filter handed to the reader builder
 * @param useOtherFiltering    toggles the dictionary, stats and record filters together
 * @param useColumnIndexFilter toggles the column index filter
 * @return the users produced by {@code PhoneBookWriter.readUsers}
 * @throws IOException if reading fails
 */
private List<User> readUsers(Filter filter, boolean useOtherFiltering, boolean useColumnIndexFilter)
    throws IOException {
  return PhoneBookWriter.readUsers(
      ParquetReader.builder(new GroupReadSupport(), file)
          .withFilter(filter)
          .useDictionaryFilter(useOtherFiltering)
          .useStatsFilter(useOtherFiltering)
          .useRecordFilter(useOtherFiltering)
          .useColumnIndexFilter(useColumnIndexFilter));
}
Example #6
Source File: ParquetReader.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Delegates to the list-based constructor with a single input file derived from
 * {@code file} and read options built from {@code conf} carrying the record filter.
 *
 * @param conf        configuration used to open the file and to build the read options
 * @param file        the file to read
 * @param readSupport the read support forwarded to the delegate constructor
 * @param filter      the record filter; a null value is rejected with "filter cannot be null"
 * @throws IOException if the delegate constructor or the input file creation fails
 */
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, FilterCompat.Filter filter)
    throws IOException {
  this(
      Collections.<InputFile>singletonList(HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(Objects.requireNonNull(filter, "filter cannot be null"))
          .build(),
      readSupport);
}
Example #7
Source File: TestMultipleWriteRead.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Validates {@code file} against a compound filter: {@code id >= 0}, {@code name < "b"}
 * and {@code comment != null}. The expected records are computed by applying the
 * equivalent in-memory predicate to {@code data}.
 *
 * @param file the file to validate
 * @param data the records the file is expected to contain before filtering
 * @throws IOException if reading the file fails
 */
private void validateFileWithComplexFilter(Path file, List<Group> data) throws IOException {
  Binary nameUpperBound = fromString("b");

  Filter filter = FilterCompat.get(
      and(
          gtEq(intColumn("id"), 0),
          and(
              lt(binaryColumn("name"), nameUpperBound),
              notEq(binaryColumn("comment"), null))));

  // In-memory mirror of the filter above, checked in the same order.
  Predicate<Group> matches = g -> {
    if (!(g.getInteger("id", 0) >= 0)) {
      return false;
    }
    if (!(BINARY_COMPARATOR.compare(g.getBinary("name", 0), nameUpperBound) < 0)) {
      return false;
    }
    return g.getFieldRepetitionCount("comment") > 0;
  };

  validateFile(file, filter, data.stream().filter(matches));
}
Example #8
Source File: TestMultipleWriteRead.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Reads {@code file} with {@code filter} and asserts that each record read matches, by
 * string form, the next element of {@code data}.
 *
 * <p>Only as many records as {@code data} supplies are read from the file.
 *
 * @param file   the file to read
 * @param filter the filter applied by the reader
 * @param data   the expected records, in order
 * @throws IOException if building, reading from, or closing the reader fails
 */
private void validateFile(Path file, Filter filter, Stream<Group> data) throws IOException {
  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
      .withFilter(filter)
      .build()) {
    Iterator<Group> expectedRecords = data.iterator();
    while (expectedRecords.hasNext()) {
      String expectedText = expectedRecords.next().toString();
      String actualText = reader.read().toString();
      assertEquals(expectedText, actualText);
    }
  }
}
Example #9
Source File: InternalParquetRecordReader.java From tajo with Apache License 2.0 | 4 votes |
/**
 * Creates a record reader with the given read support and record filter.
 *
 * @param readSupport object that helps read files of a given type, e.g. Thrift, Avro
 * @param filter      filter applied to individual records; passed through {@code checkNotNull}
 */
public InternalParquetRecordReader(ReadSupport<T> readSupport, Filter filter) {
  this.readSupport = readSupport;
  checkNotNull(filter, "filter");
  this.filter = filter;
}
Example #10
Source File: ThriftParquetReader.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Sets the filter on this builder.
 *
 * @param filter the filter to use; must not be null
 * @return this builder, for chaining
 * @throws NullPointerException if {@code filter} is null
 */
public Builder<T> withFilter(Filter filter) {
  Objects.requireNonNull(filter, "filter cannot be null");
  this.filter = filter;
  return this;
}
Example #11
Source File: TestColumnIndexFiltering.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Convenience overload of {@code readUsers(Filter, boolean, boolean)} that always
 * enables the column index filter.
 *
 * @param filter            the filter handed to the reader
 * @param useOtherFiltering forwarded unchanged to the three-argument overload
 * @return the users read
 * @throws IOException if reading fails
 */
private List<User> readUsers(Filter filter, boolean useOtherFiltering) throws IOException {
  final boolean columnIndexFilterEnabled = true;
  return readUsers(filter, useOtherFiltering, columnIndexFilterEnabled);
}
Example #12
Source File: RowGroupFilter.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Applies {@code filter} to {@code blocks} by letting the filter accept a
 * {@code RowGroupFilter} visitor built from the given levels, blocks and reader.
 *
 * @param levels the filter levels passed to the visitor
 * @param filter the filter to apply; must not be null
 * @param blocks the block metadata to filter
 * @param reader the file reader passed to the visitor
 * @return the result of the filter accepting the visitor
 * @throws NullPointerException if {@code filter} is null
 */
public static List<BlockMetaData> filterRowGroups(List<FilterLevel> levels, Filter filter,
                                                  List<BlockMetaData> blocks, ParquetFileReader reader) {
  Objects.requireNonNull(filter, "filter cannot be null");
  RowGroupFilter rowGroupVisitor = new RowGroupFilter(levels, blocks, reader);
  return filter.accept(rowGroupVisitor);
}
Example #13
Source File: ParquetInputFormat.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Generates input splits for the given footers, dropping blocks that the configured
 * filter rejects and logging how many were dropped.
 *
 * @param configuration configuration used to obtain the filter and the file systems
 * @param footers       the footers whose blocks are turned into splits
 * @param maxSplitSize  forwarded to {@code generateSplits}
 * @param minSplitSize  forwarded to {@code generateSplits}
 * @param readContext   supplies the requested schema and the read-support metadata
 * @return the splits generated for every file that kept at least one block
 * @throws IOException if a file system call fails
 */
List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers,
                                  long maxSplitSize, long minSplitSize, ReadContext readContext)
    throws IOException {
  List<ParquetInputSplit> collectedSplits = new ArrayList<>();
  Filter filter = ParquetInputFormat.getFilter(configuration);

  long droppedCount = 0;
  long seenCount = 0;

  for (Footer footer : footers) {
    final Path file = footer.getFile();
    LOG.debug("{}", file);

    FileSystem fs = file.getFileSystem(configuration);
    FileStatus status = fs.getFileStatus(file);
    ParquetMetadata metadata = footer.getParquetMetadata();
    List<BlockMetaData> allBlocks = metadata.getBlocks();
    seenCount += allBlocks.size();

    List<BlockMetaData> keptBlocks =
        RowGroupFilter.filterRowGroups(filter, allBlocks, metadata.getFileMetaData().getSchema());
    droppedCount += allBlocks.size() - keptBlocks.size();

    // Files with no surviving blocks contribute no splits.
    if (!keptBlocks.isEmpty()) {
      BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());
      collectedSplits.addAll(
          generateSplits(
              keptBlocks,
              locations,
              status,
              readContext.getRequestedSchema().toString(),
              readContext.getReadSupportMetadata(),
              minSplitSize,
              maxSplitSize));
    }
  }

  if (droppedCount <= 0 || seenCount <= 0) {
    LOG.info("There were no row groups that could be dropped due to filter predicates");
  } else {
    double droppedFraction = ((double) droppedCount) / seenCount;
    int percentDropped = (int) (droppedFraction * 100);
    LOG.info("Dropping {} row groups that do not pass filter predicate! ({}%)", droppedCount, percentDropped);
  }

  return collectedSplits;
}
Example #14
Source File: ParquetReader.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Sets the filter on this builder and registers it as the record filter on the
 * underlying options builder.
 *
 * @param filter the filter to use (stored as given; no null check is performed here)
 * @return this builder, for chaining
 */
public Builder<T> withFilter(Filter filter) {
  this.filter = filter;
  // Keep the options builder in step with the field above.
  this.optionsBuilder.withRecordFilter(filter);
  return this;
}
Example #15
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Creates a record reader with the given read support and record filter.
 *
 * @param readSupport object that helps read files of a given type, e.g. Thrift, Avro
 * @param filter      filter applied to individual records; must not be null
 * @throws NullPointerException if {@code filter} is null
 */
public InternalParquetRecordReader(ReadSupport<T> readSupport, Filter filter) {
  this.readSupport = readSupport;
  Objects.requireNonNull(filter, "filter cannot be null");
  this.filter = filter;
}
Example #16
Source File: MessageColumnIO.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Creates a record reader over {@code columns}, choosing the implementation according
 * to the kind of {@code filter} supplied.
 *
 * <p>When this column IO has no leaves an {@code EmptyRecordReader} is returned without
 * consulting the filter. Otherwise the filter is visited:
 * <ul>
 *   <li>a filter predicate yields a {@code RecordReaderImplementation} that uses a
 *       {@code FilteringRecordMaterializer};</li>
 *   <li>an unbound record filter yields a {@code FilteredRecordReader};</li>
 *   <li>the no-op filter yields a plain {@code RecordReaderImplementation}.</li>
 * </ul>
 *
 * @param columns            the page store to read from; must not be null
 * @param recordMaterializer materializes the records; must not be null
 * @param filter             the filter to apply; must not be null
 * @param <T>                the type of the materialized records
 * @return a record reader matching the filter kind
 * @throws NullPointerException if any argument is null
 */
public <T> RecordReader<T> getRecordReader(final PageReadStore columns,
                                           final RecordMaterializer<T> recordMaterializer,
                                           final Filter filter) {
  Objects.requireNonNull(columns, "columns cannot be null");
  Objects.requireNonNull(recordMaterializer, "recordMaterializer cannot be null");
  Objects.requireNonNull(filter, "filter cannot be null");

  if (leaves.isEmpty()) {
    return new EmptyRecordReader<>(recordMaterializer);
  }

  Visitor<RecordReader<T>> readerFactory = new Visitor<RecordReader<T>>() {
    @Override
    public RecordReader<T> visit(FilterPredicateCompat filterPredicateCompat) {
      FilterPredicate userPredicate = filterPredicateCompat.getFilterPredicate();
      IncrementallyUpdatedFilterPredicateBuilder predicateBuilder =
          new IncrementallyUpdatedFilterPredicateBuilder(leaves);
      IncrementallyUpdatedFilterPredicate incrementalPredicate = predicateBuilder.build(userPredicate);

      // Wrap the caller's materializer with one that consults the incremental predicate.
      RecordMaterializer<T> filteringMaterializer = new FilteringRecordMaterializer<>(
          recordMaterializer,
          leaves,
          predicateBuilder.getValueInspectorsByColumn(),
          incrementalPredicate);

      ColumnReadStoreImpl columnStore = new ColumnReadStoreImpl(
          columns, filteringMaterializer.getRootConverter(), getType(), createdBy);
      return new RecordReaderImplementation<>(
          MessageColumnIO.this, filteringMaterializer, validating, columnStore);
    }

    @Override
    public RecordReader<T> visit(UnboundRecordFilterCompat unboundRecordFilterCompat) {
      ColumnReadStoreImpl columnStore = new ColumnReadStoreImpl(
          columns, recordMaterializer.getRootConverter(), getType(), createdBy);
      return new FilteredRecordReader<>(
          MessageColumnIO.this,
          recordMaterializer,
          validating,
          columnStore,
          unboundRecordFilterCompat.getUnboundRecordFilter(),
          columns.getRowCount());
    }

    @Override
    public RecordReader<T> visit(NoOpFilter noOpFilter) {
      ColumnReadStoreImpl columnStore = new ColumnReadStoreImpl(
          columns, recordMaterializer.getRootConverter(), getType(), createdBy);
      return new RecordReaderImplementation<>(
          MessageColumnIO.this, recordMaterializer, validating, columnStore);
    }
  };

  return filter.accept(readerFactory);
}
Example #17
Source File: ParquetRecordReader.java From flink with Apache License 2.0 | 4 votes |
/**
 * Creates a record reader from the given read support, read schema and filter.
 *
 * <p>Each argument is passed through {@code checkNotNull} together with its own
 * parameter name, so that a failure message identifies the argument that was null.
 *
 * @param readSupport the read support to use; must not be null
 * @param readSchema  the schema to read with; must not be null
 * @param filter      the record filter; must not be null
 */
public ParquetRecordReader(ReadSupport<T> readSupport, MessageType readSchema, Filter filter) {
  // The check order (filter, readSupport, readSchema) is kept from the original code.
  this.filter = checkNotNull(filter, "filter");
  this.readSupport = checkNotNull(readSupport, "readSupport");
  this.readSchema = checkNotNull(readSchema, "readSchema");
}
Example #18
Source File: ParquetReader.java From tajo with Apache License 2.0 | 4 votes |
/**
 * Sets the filter on this builder.
 *
 * @param filter the filter to use; passed through {@code checkNotNull}
 * @return this builder, for chaining
 */
public Builder<T> withFilter(Filter filter) {
  checkNotNull(filter, "filter");
  this.filter = filter;
  return this;
}
Example #19
Source File: ParquetRecordReader.java From flink with Apache License 2.0 | 4 votes |
/**
 * Creates a record reader from the given read support, read schema and filter.
 *
 * <p>Each argument is passed through {@code checkNotNull} together with its own
 * parameter name, so that a failure message identifies the argument that was null.
 *
 * @param readSupport the read support to use; must not be null
 * @param readSchema  the schema to read with; must not be null
 * @param filter      the record filter; must not be null
 */
public ParquetRecordReader(ReadSupport<T> readSupport, MessageType readSchema, Filter filter) {
  // The check order (filter, readSupport, readSchema) is kept from the original code.
  this.filter = checkNotNull(filter, "filter");
  this.readSupport = checkNotNull(readSupport, "readSupport");
  this.readSchema = checkNotNull(readSchema, "readSchema");
}
Example #20
Source File: RowGroupFilter.java From parquet-mr with Apache License 2.0 | 2 votes |
/**
 * Applies {@code filter} to {@code blocks} by letting the filter accept a
 * {@code RowGroupFilter} visitor built from the blocks and the file schema.
 *
 * @param filter a filter; must not be null
 * @param blocks a list of block metadata to filter
 * @param schema the file schema
 * @return a filtered list of block metadata
 * @throws NullPointerException if {@code filter} is null
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public static List<BlockMetaData> filterRowGroups(Filter filter, List<BlockMetaData> blocks,
                                                  MessageType schema) {
  Objects.requireNonNull(filter, "filter cannot be null");
  RowGroupFilter rowGroupVisitor = new RowGroupFilter(blocks, schema);
  return filter.accept(rowGroupVisitor);
}
Example #21
Source File: ParquetRecordReader.java From parquet-mr with Apache License 2.0 | 2 votes |
/**
 * Creates a record reader backed by a new {@code InternalParquetRecordReader}.
 *
 * @param readSupport object that helps read files of a given type, e.g. Thrift, Avro
 * @param filter      filter applied to individual records
 */
public ParquetRecordReader(ReadSupport<T> readSupport, Filter filter) {
  this.internalReader = new InternalParquetRecordReader<>(readSupport, filter);
}
Example #22
Source File: ParquetInputFormat.java From parquet-mr with Apache License 2.0 | 2 votes |
/**
 * Builds the filter described by {@code conf}.
 *
 * <p>The result is never null: it wraps either a FilterPredicate, an
 * UnboundRecordFilter, or a no-op filter.
 *
 * @param conf a configuration
 * @return a filter for the filter predicate or unbound record filter specified in conf
 */
public static Filter getFilter(Configuration conf) {
  return FilterCompat.get(
      getFilterPredicate(conf),
      getUnboundRecordFilterInstance(conf));
}