org.apache.parquet.filter2.compat.FilterCompat Java Examples
The following examples show how to use
org.apache.parquet.filter2.compat.FilterCompat.
Each example is drawn from an open-source project; the source file, project, and license are noted above each listing.
Example #1
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testFilterOnInteger() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // Get first record
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", equalTo(10L))));

  readOne(recordReader, "r2 filtered out", r1);

  // Get second record
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", equalTo(20L))));

  readOne(recordReader, "r1 filtered out", r2);
}
Example #2
Source File: ColumnIndexFilter.java From parquet-mr with Apache License 2.0
/**
 * Calculates the row ranges containing the indexes of the rows that might match the specified filter.
 *
 * @param filter
 *          to be used for filtering the rows
 * @param columnIndexStore
 *          the store for providing column/offset indexes
 * @param paths
 *          the paths of the columns used in the actual projection; a column not being part of the projection will be
 *          handled as containing {@code null} values only even if the column has values written in the file
 * @param rowCount
 *          the total number of rows in the row-group
 * @return the ranges of the possible matching row indexes; the returned ranges will contain all the rows if any of
 *         the required offset indexes is missing
 */
public static RowRanges calculateRowRanges(FilterCompat.Filter filter, ColumnIndexStore columnIndexStore,
    Set<ColumnPath> paths, long rowCount) {
  return filter.accept(new FilterCompat.Visitor<RowRanges>() {
    @Override
    public RowRanges visit(FilterPredicateCompat filterPredicateCompat) {
      try {
        return filterPredicateCompat.getFilterPredicate()
            .accept(new ColumnIndexFilter(columnIndexStore, paths, rowCount));
      } catch (MissingOffsetIndexException e) {
        LOGGER.info(e.getMessage());
        return RowRanges.createSingle(rowCount);
      }
    }

    @Override
    public RowRanges visit(UnboundRecordFilterCompat unboundRecordFilterCompat) {
      return RowRanges.createSingle(rowCount);
    }

    @Override
    public RowRanges visit(NoOpFilter noOpFilter) {
      return RowRanges.createSingle(rowCount);
    }
  });
}
Example #3
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testPaged() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 6);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(page(4, 4)));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting records " + all, 4, all.size());
  for (int i = 0; i < all.size(); i++) {
    assertEquals("expecting record", (i % 2 == 0 ? r2 : r1).toString(), all.get(i).toString());
  }
}
Example #4
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testFilteredAndPaged() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 8);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(and(column("DocId", equalTo(10L)), page(2, 4))));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting 4 records " + all, 4, all.size());
  for (int i = 0; i < all.size(); i++) {
    assertEquals("expecting record1", r1.toString(), all.get(i).toString());
  }
}
Example #5
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testFilteredOrPaged() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 8);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(or(column("DocId", equalTo(10L)),
              column("DocId", equalTo(20L)))));

  List<Group> all = readAll(recordReader);
  // 8 copies of each record were written and the OR filter matches both DocIds
  assertEquals("expecting 16 records " + all, 16, all.size());
  for (int i = 0; i < all.size() / 2; i++) {
    assertEquals("expecting record1", r1.toString(), all.get(2 * i).toString());
    assertEquals("expecting record2", r2.toString(), all.get(2 * i + 1).toString());
  }
}
Example #6
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testFilteredNotPaged() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 8);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(not(column("DocId", equalTo(10L)))));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting 8 records " + all, 8, all.size());
  for (int i = 0; i < all.size(); i++) {
    assertEquals("expecting record2", r2.toString(), all.get(i).toString());
  }
}
Example #7
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
private List<BlockMetaData> filterRowGroups(List<BlockMetaData> blocks) throws IOException {
  // set up data filters based on configured levels
  List<RowGroupFilter.FilterLevel> levels = new ArrayList<>();

  if (options.useStatsFilter()) {
    levels.add(STATISTICS);
  }

  if (options.useDictionaryFilter()) {
    levels.add(DICTIONARY);
  }

  if (options.useBloomFilter()) {
    levels.add(BLOOMFILTER);
  }

  FilterCompat.Filter recordFilter = options.getRecordFilter();
  if (recordFilter != null) {
    return RowGroupFilter.filterRowGroups(levels, recordFilter, blocks, this);
  }

  return blocks;
}
Example #8
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testApplyFunctionFilterOnLong() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // Get first record
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", equalTo(10L))));

  readOne(recordReader, "r2 filtered out", r1);

  // Get second record
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", applyFunctionToLong(new LongGreaterThan15Predicate()))));

  readOne(recordReader, "r1 filtered out", r2);
}
Example #9
Source File: HadoopReadOptions.java From parquet-mr with Apache License 2.0
private HadoopReadOptions(boolean useSignedStringMinMax,
                          boolean useStatsFilter,
                          boolean useDictionaryFilter,
                          boolean useRecordFilter,
                          boolean useColumnIndexFilter,
                          boolean usePageChecksumVerification,
                          boolean useBloomFilter,
                          FilterCompat.Filter recordFilter,
                          MetadataFilter metadataFilter,
                          CompressionCodecFactory codecFactory,
                          ByteBufferAllocator allocator,
                          int maxAllocationSize,
                          Map<String, String> properties,
                          Configuration conf) {
  super(
      useSignedStringMinMax, useStatsFilter, useDictionaryFilter, useRecordFilter,
      useColumnIndexFilter, usePageChecksumVerification, useBloomFilter, recordFilter,
      metadataFilter, codecFactory, allocator, maxAllocationSize, properties
  );
  this.conf = conf;
}
Example #10
Source File: TestColumnIndexFilter.java From parquet-mr with Apache License 2.0
@Test
public void testFilteringWithAllNullPages() {
  Set<ColumnPath> paths = paths("column1", "column5");

  assertAllRows(calculateRowRanges(FilterCompat.get(
      notEq(longColumn("column5"), 1234567L)),
      STORE, paths, TOTAL_ROW_COUNT), TOTAL_ROW_COUNT);
  assertAllRows(calculateRowRanges(FilterCompat.get(
      or(gtEq(intColumn("column1"), 10),
          notEq(longColumn("column5"), 1234567L))),
      STORE, paths, TOTAL_ROW_COUNT), TOTAL_ROW_COUNT);
  assertRows(calculateRowRanges(FilterCompat.get(
      eq(longColumn("column5"), 1234567L)),
      STORE, paths, TOTAL_ROW_COUNT));
  assertRows(calculateRowRanges(FilterCompat.get(
      and(lt(intColumn("column1"), 20),
          gtEq(longColumn("column5"), 1234567L))),
      STORE, paths, TOTAL_ROW_COUNT));
}
Example #11
Source File: TestColumnIndexFiltering.java From parquet-mr with Apache License 2.0
@Test
public void testNoFiltering() throws IOException {
  // Column index filtering with no-op filter
  assertEquals(DATA, readUsers(FilterCompat.NOOP, false));
  assertEquals(DATA, readUsers(FilterCompat.NOOP, true));

  // Column index filtering turned off
  assertEquals(DATA.stream().filter(user -> user.getId() == 1234).collect(Collectors.toList()),
      readUsers(eq(longColumn("id"), 1234L), true, false));
  assertEquals(DATA.stream().filter(user -> "miller".equals(user.getName())).collect(Collectors.toList()),
      readUsers(eq(binaryColumn("name"), Binary.fromString("miller")), true, false));
  assertEquals(DATA.stream().filter(user -> user.getName() == null).collect(Collectors.toList()),
      readUsers(eq(binaryColumn("name"), null), true, false));

  // Every filtering mechanism turned off
  assertEquals(DATA, readUsers(eq(longColumn("id"), 1234L), false, false));
  assertEquals(DATA, readUsers(eq(binaryColumn("name"), Binary.fromString("miller")), false, false));
  assertEquals(DATA, readUsers(eq(binaryColumn("name"), null), false, false));
}
Example #12
Source File: TestColumnIndexFiltering.java From parquet-mr with Apache License 2.0
@Test
public void testFilteringWithProjection() throws IOException {
  // All rows shall be retrieved because all values in column 'name' shall be handled as null values
  assertEquals(
      DATA.stream().map(user -> user.cloneWithName(null)).collect(toList()),
      readUsersWithProjection(FilterCompat.get(eq(binaryColumn("name"), null)), SCHEMA_WITHOUT_NAME, true, true));

  // Column index filter shall drop all pages because all values in column 'name' shall be handled as null values
  assertEquals(
      emptyList(),
      readUsersWithProjection(FilterCompat.get(notEq(binaryColumn("name"), null)), SCHEMA_WITHOUT_NAME, false, true));
  assertEquals(
      emptyList(),
      readUsersWithProjection(FilterCompat.get(userDefined(binaryColumn("name"), NameStartsWithVowel.class)),
          SCHEMA_WITHOUT_NAME, false, true));
}
Example #13
Source File: TestFiltersWithMissingColumns.java From parquet-mr with Apache License 2.0
public static long countFilteredRecords(Path path, FilterPredicate pred) throws IOException {
  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), path)
      .withFilter(FilterCompat.get(pred))
      .build();

  long count = 0;
  try {
    while (reader.read() != null) {
      count += 1;
    }
  } finally {
    reader.close();
  }
  return count;
}
Example #14
Source File: TestRecordLevelFilters.java From parquet-mr with Apache License 2.0
@Test
public void testUserDefinedByInstance() throws Exception {
  LongColumn name = longColumn("id");

  final HashSet<Long> h = new HashSet<Long>();
  h.add(20L);
  h.add(27L);
  h.add(28L);

  FilterPredicate pred = userDefined(name, new SetInFilter(h));

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(pred));

  assertFilter(found, new UserFilter() {
    @Override
    public boolean keep(User u) {
      return u != null && h.contains(u.getId());
    }
  });
}
Example #15
Source File: ParquetColumnarRowSplitReader.java From flink with Apache License 2.0
public ParquetColumnarRowSplitReader(
    boolean utcTimestamp,
    boolean caseSensitive,
    Configuration conf,
    LogicalType[] selectedTypes,
    String[] selectedFieldNames,
    ColumnBatchGenerator generator,
    int batchSize,
    Path path,
    long splitStart,
    long splitLength) throws IOException {
  this.utcTimestamp = utcTimestamp;
  this.selectedTypes = selectedTypes;
  this.batchSize = batchSize;
  // then we need to apply the predicate push down filter
  ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength));
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(conf);
  List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);

  this.fileSchema = footer.getFileMetaData().getSchema();
  this.requestedSchema = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive);
  this.reader = new ParquetFileReader(
      conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());

  long totalRowCount = 0;
  for (BlockMetaData block : blocks) {
    totalRowCount += block.getRowCount();
  }
  this.totalRowCount = totalRowCount;
  this.nextRow = 0;
  this.rowsInBatch = 0;
  this.rowsReturned = 0;

  checkSchema();

  this.writableVectors = createWritableVectors();
  this.columnarBatch = generator.generate(createReadableVectors());
  this.row = new ColumnarRowData(columnarBatch);
}
Example #16
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testFilterOnString() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // First try matching against the A url in record 1
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://A"))));

  readOne(recordReader, "r2 filtered out", r1);

  // Second try matching against the B url in record 1 - it should fail as we only match
  // against the first instance of a url
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://B"))));

  List<Group> all = readAll(recordReader);
  assertEquals("There should be no matching records: " + all, 0, all.size());

  // Finally try matching against the C url in record 2
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://C"))));

  readOne(recordReader, "r1 filtered out", r2);
}
Example #17
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testApplyFunctionFilterOnString() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // First try matching against the A url in record 1
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", applyFunctionToString(new StringEndsWithAPredicate()))));

  readOne(recordReader, "r2 filtered out", r1);

  // Second try matching against the B url in record 1 - it should fail as we only match
  // against the first instance of a url
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://B"))));

  List<Group> all = readAll(recordReader);
  assertEquals("There should be no matching records: " + all, 0, all.size());

  // Finally try matching against the C url in record 2
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://C"))));

  readOne(recordReader, "r1 filtered out", r2);
}
Example #18
Source File: ParquetFileAccessor.java From pxf with Apache License 2.0
/**
 * Opens the resource for read.
 *
 * @throws IOException if opening the resource failed
 */
@Override
public boolean openForRead() throws IOException {
  file = new Path(context.getDataSource());
  FileSplit fileSplit = HdfsUtilities.parseFileSplit(context);

  // Read the original schema from the parquet file
  MessageType originalSchema = getSchema(file, fileSplit);
  // Get a map of column names to Types for the given schema
  Map<String, Type> originalFieldsMap = getOriginalFieldsMap(originalSchema);
  // Get the read schema. This is either the full set or a subset (in
  // case of column projection) of the greenplum schema.
  MessageType readSchema = buildReadSchema(originalFieldsMap, originalSchema);
  // Get the record filter in case of predicate push-down
  FilterCompat.Filter recordFilter = getRecordFilter(context.getFilterString(), originalFieldsMap, readSchema);

  // add column projection
  configuration.set(PARQUET_READ_SCHEMA, readSchema.toString());

  fileReader = ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(configuration)
      // Create reader for a given split, read a range in file
      .withFileRange(fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength())
      .withFilter(recordFilter)
      .build();
  context.setMetadata(readSchema);
  return true;
}
Example #19
Source File: ParquetFileAccessor.java From pxf with Apache License 2.0
/**
 * Returns the parquet record filter for the given filter string
 *
 * @param filterString      the filter string
 * @param originalFieldsMap a map of field names to types
 * @param schema            the parquet schema
 * @return the parquet record filter for the given filter string
 */
private FilterCompat.Filter getRecordFilter(String filterString, Map<String, Type> originalFieldsMap,
    MessageType schema) {
  if (StringUtils.isBlank(filterString)) {
    return FilterCompat.NOOP;
  }

  ParquetRecordFilterBuilder filterBuilder = new ParquetRecordFilterBuilder(
      context.getTupleDescription(), originalFieldsMap);
  TreeVisitor pruner = new ParquetOperatorPrunerAndTransformer(
      context.getTupleDescription(), originalFieldsMap, SUPPORTED_OPERATORS);

  try {
    // Parse the filter string into an expression tree Node
    Node root = new FilterParser().parse(filterString);
    // Prune the parsed tree with valid supported operators and then
    // traverse the pruned tree with the ParquetRecordFilterBuilder to
    // produce a record filter for parquet
    TRAVERSER.traverse(root, pruner, filterBuilder);
    return filterBuilder.getRecordFilter();
  } catch (Exception e) {
    LOG.error(String.format("%s-%d: %s--%s Unable to generate Parquet Record Filter for filter",
        context.getTransactionId(),
        context.getSegmentId(),
        context.getDataSource(),
        context.getFilterString()), e);
    return FilterCompat.NOOP;
  }
}
Example #20
Source File: ParquetRecordFilterBuilder.java From pxf with Apache License 2.0
/**
 * Returns the built record filter
 *
 * @return the built record filter
 */
public FilterCompat.Filter getRecordFilter() {
  FilterPredicate predicate = filterQueue.poll();
  if (!filterQueue.isEmpty()) {
    throw new IllegalStateException("Filter queue is not empty after visiting all nodes");
  }
  return predicate != null ? FilterCompat.get(predicate) : FilterCompat.NOOP;
}
Example #21
Source File: ParquetInputFormat.java From flink with Apache License 2.0
@Override
public void open(FileInputSplit split) throws IOException {
  // reset the flag when opening a new split
  this.skipThisSplit = false;
  org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(split.getPath().toUri()), configuration);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);
  MessageType fileSchema = fileReader.getFileMetaData().getSchema();
  MessageType readSchema = getReadSchema(fileSchema, split.getPath());
  if (skipThisSplit) {
    LOG.warn(String.format(
        "Escaped the file split [%s] due to mismatch of file schema to expected result schema",
        split.getPath().toString()));
  } else {
    this.parquetRecordReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema,
        filterPredicate == null ? FilterCompat.NOOP : FilterCompat.get(filterPredicate));
    this.parquetRecordReader.initialize(fileReader, configuration);
    this.parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord);

    if (this.recordConsumed == null) {
      this.recordConsumed = getRuntimeContext().getMetricGroup().counter("parquet-records-consumed");
    }

    LOG.debug(String.format("Open ParquetInputFormat with FileInputSplit [%s]", split.getPath().toString()));
  }
}
Example #22
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
private void checkRead() throws IOException {
  if (current == totalCountLoadedSoFar) {
    if (current != 0) {
      totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
      if (LOG.isInfoEnabled()) {
        LOG.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount
            + " columns in " + totalTimeSpentProcessingRecords + " ms: "
            + ((float) totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, "
            + ((float) totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
        final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
        if (totalTime != 0) {
          final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
          final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
          LOG.info("time spent so far " + percentReading + "% reading (" + totalTimeSpentReadingBytes
              + " ms) and " + percentProcessing + "% processing (" + totalTimeSpentProcessingRecords + " ms)");
        }
      }
    }

    LOG.info("at row " + current + ". reading next block");
    long t0 = System.currentTimeMillis();
    PageReadStore pages = reader.readNextFilteredRowGroup();
    if (pages == null) {
      throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
    }
    long timeSpentReading = System.currentTimeMillis() - t0;
    totalTimeSpentReadingBytes += timeSpentReading;
    BenchmarkCounter.incrementTime(timeSpentReading);
    if (LOG.isInfoEnabled()) {
      LOG.info("block read in memory in {} ms. row count = {}", timeSpentReading, pages.getRowCount());
    }
    LOG.debug("initializing Record assembly with requested schema {}", requestedSchema);
    MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
    recordReader = columnIO.getRecordReader(pages, recordConverter,
        filterRecords ? filter : FilterCompat.NOOP);
    startedAssemblingCurrentBlockAt = System.currentTimeMillis();
    totalCountLoadedSoFar += pages.getRowCount();
    ++currentBlock;
  }
}
Example #23
Source File: ParquetReader.java From parquet-mr with Apache License 2.0
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, FilterCompat.Filter filter)
    throws IOException {
  this(Collections.singletonList((InputFile) HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(Objects.requireNonNull(filter, "filter cannot be null"))
          .build(),
      readSupport);
}
Example #24
Source File: ParquetReadOptions.java From parquet-mr with Apache License 2.0
ParquetReadOptions(boolean useSignedStringMinMax,
                   boolean useStatsFilter,
                   boolean useDictionaryFilter,
                   boolean useRecordFilter,
                   boolean useColumnIndexFilter,
                   boolean usePageChecksumVerification,
                   boolean useBloomFilter,
                   FilterCompat.Filter recordFilter,
                   ParquetMetadataConverter.MetadataFilter metadataFilter,
                   CompressionCodecFactory codecFactory,
                   ByteBufferAllocator allocator,
                   int maxAllocationSize,
                   Map<String, String> properties) {
  this.useSignedStringMinMax = useSignedStringMinMax;
  this.useStatsFilter = useStatsFilter;
  this.useDictionaryFilter = useDictionaryFilter;
  this.useRecordFilter = useRecordFilter;
  this.useColumnIndexFilter = useColumnIndexFilter;
  this.usePageChecksumVerification = usePageChecksumVerification;
  this.useBloomFilter = useBloomFilter;
  this.recordFilter = recordFilter;
  this.metadataFilter = metadataFilter;
  this.codecFactory = codecFactory;
  this.allocator = allocator;
  this.maxAllocationSize = maxAllocationSize;
  this.properties = Collections.unmodifiableMap(properties);
}
Example #25
Source File: TestBloomFiltering.java From parquet-mr with Apache License 2.0
private List<PhoneBookWriter.User> readUsers(FilterPredicate filter, boolean useOtherFiltering,
    boolean useBloomFilter) throws IOException {
  return PhoneBookWriter.readUsers(ParquetReader.builder(new GroupReadSupport(), file)
      .withFilter(FilterCompat.get(filter))
      .useDictionaryFilter(useOtherFiltering)
      .useStatsFilter(useOtherFiltering)
      .useRecordFilter(useOtherFiltering)
      .useBloomFilter(useBloomFilter)
      .useColumnIndexFilter(useOtherFiltering));
}
Example #26
Source File: TestMultipleWriteRead.java From parquet-mr with Apache License 2.0
private void validateFileWithComplexFilter(Path file, List<Group> data) throws IOException {
  Binary binaryValueB = fromString("b");
  Filter filter = FilterCompat.get(
      and(
          gtEq(intColumn("id"), 0),
          and(
              lt(binaryColumn("name"), binaryValueB),
              notEq(binaryColumn("comment"), null))));
  Predicate<Group> predicate = group -> group.getInteger("id", 0) >= 0
      && BINARY_COMPARATOR.compare(group.getBinary("name", 0), binaryValueB) < 0
      && group.getFieldRepetitionCount("comment") > 0;
  validateFile(file, filter, data.stream().filter(predicate));
}
Example #27
Source File: TestRecordLevelFilters.java From parquet-mr with Apache License 2.0
@Test
public void testNoFilter() throws Exception {
  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.NOOP);
  assertFilter(found, new UserFilter() {
    @Override
    public boolean keep(User u) {
      return true;
    }
  });
}
Example #28
Source File: TestRecordLevelFilters.java From parquet-mr with Apache License 2.0
@Test
public void testAllFilter() throws Exception {
  BinaryColumn name = binaryColumn("name");

  FilterPredicate pred = eq(name, Binary.fromString("no matches"));

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(pred));
  assertEquals(new ArrayList<Group>(), found);
}
Example #29
Source File: TestRecordLevelFilters.java From parquet-mr with Apache License 2.0
@Test
public void testNameNotNull() throws Exception {
  BinaryColumn name = binaryColumn("name");

  FilterPredicate pred = notEq(name, null);

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(pred));
  assertFilter(found, new UserFilter() {
    @Override
    public boolean keep(User u) {
      return u.getName() != null;
    }
  });
}
Example #30
Source File: TestRecordLevelFilters.java From parquet-mr with Apache License 2.0
@Test
public void testNameNotStartWithP() throws Exception {
  BinaryColumn name = binaryColumn("name");

  FilterPredicate pred = not(userDefined(name, StartWithP.class));

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(pred));
  assertFilter(found, new UserFilter() {
    @Override
    public boolean keep(User u) {
      return u.getName() == null || !u.getName().startsWith("p");
    }
  });
}