org.apache.parquet.filter2.compat.FilterCompat Java Examples
The following examples show how to use
org.apache.parquet.filter2.compat.FilterCompat.
Each example is drawn from an open-source project; the source file, project, and license are noted above each listing.
Example #1
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testFilterOnInteger() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // Get first record
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", equalTo(10L))));

  readOne(recordReader, "r2 filtered out", r1);

  // Get second record
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", equalTo(20L))));

  readOne(recordReader, "r1 filtered out", r2);
}
Example #2
Source File: ColumnIndexFilter.java From parquet-mr with Apache License 2.0
/**
 * Calculates the row ranges containing the indexes of the rows that might match the specified filter.
 *
 * @param filter
 *          to be used for filtering the rows
 * @param columnIndexStore
 *          the store for providing column/offset indexes
 * @param paths
 *          the paths of the columns used in the actual projection; a column not being part of the projection will be
 *          handled as containing {@code null} values only even if the column has values written in the file
 * @param rowCount
 *          the total number of rows in the row-group
 * @return the ranges of the possible matching row indexes; the returned ranges will contain all the rows if any of
 *         the required offset indexes is missing
 */
public static RowRanges calculateRowRanges(FilterCompat.Filter filter, ColumnIndexStore columnIndexStore,
    Set<ColumnPath> paths, long rowCount) {
  return filter.accept(new FilterCompat.Visitor<RowRanges>() {
    @Override
    public RowRanges visit(FilterPredicateCompat filterPredicateCompat) {
      try {
        return filterPredicateCompat.getFilterPredicate()
            .accept(new ColumnIndexFilter(columnIndexStore, paths, rowCount));
      } catch (MissingOffsetIndexException e) {
        LOGGER.info(e.getMessage());
        return RowRanges.createSingle(rowCount);
      }
    }

    @Override
    public RowRanges visit(UnboundRecordFilterCompat unboundRecordFilterCompat) {
      return RowRanges.createSingle(rowCount);
    }

    @Override
    public RowRanges visit(NoOpFilter noOpFilter) {
      return RowRanges.createSingle(rowCount);
    }
  });
}
Example #3
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testPaged() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 6);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(page(4, 4)));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting records " + all, 4, all.size());
  for (int i = 0; i < all.size(); i++) {
    assertEquals("expecting record", (i % 2 == 0 ? r2 : r1).toString(), all.get(i).toString());
  }
}
Example #4
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testFilteredAndPaged() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 8);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(and(column("DocId", equalTo(10L)), page(2, 4))));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting 4 records " + all, 4, all.size());
  for (int i = 0; i < all.size(); i++) {
    assertEquals("expecting record1", r1.toString(), all.get(i).toString());
  }
}
Example #5
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testFilteredOrPaged() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 8);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(or(column("DocId", equalTo(10L)),
              column("DocId", equalTo(20L)))));

  List<Group> all = readAll(recordReader);
  // 8 copies of each record were written and the OR filter matches both DocIds
  assertEquals("expecting 16 records " + all, 16, all.size());
  for (int i = 0; i < all.size() / 2; i++) {
    assertEquals("expecting record1", r1.toString(), all.get(2 * i).toString());
    assertEquals("expecting record2", r2.toString(), all.get(2 * i + 1).toString());
  }
}
Example #6
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testFilteredNotPaged() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 8);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(not(column("DocId", equalTo(10L)))));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting 8 records " + all, 8, all.size());
  for (int i = 0; i < all.size(); i++) {
    assertEquals("expecting record2", r2.toString(), all.get(i).toString());
  }
}
Example #7
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
private List<BlockMetaData> filterRowGroups(List<BlockMetaData> blocks) throws IOException {
  // set up data filters based on configured levels
  List<RowGroupFilter.FilterLevel> levels = new ArrayList<>();

  if (options.useStatsFilter()) {
    levels.add(STATISTICS);
  }

  if (options.useDictionaryFilter()) {
    levels.add(DICTIONARY);
  }

  if (options.useBloomFilter()) {
    levels.add(BLOOMFILTER);
  }

  FilterCompat.Filter recordFilter = options.getRecordFilter();
  if (recordFilter != null) {
    return RowGroupFilter.filterRowGroups(levels, recordFilter, blocks, this);
  }

  return blocks;
}
Example #8
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testApplyFunctionFilterOnLong() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // Get first record
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", equalTo(10L))));

  readOne(recordReader, "r2 filtered out", r1);

  // Get second record
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", applyFunctionToLong(new LongGreaterThan15Predicate()))));

  readOne(recordReader, "r1 filtered out", r2);
}
Example #9
Source File: HadoopReadOptions.java From parquet-mr with Apache License 2.0
private HadoopReadOptions(boolean useSignedStringMinMax,
                          boolean useStatsFilter,
                          boolean useDictionaryFilter,
                          boolean useRecordFilter,
                          boolean useColumnIndexFilter,
                          boolean usePageChecksumVerification,
                          boolean useBloomFilter,
                          FilterCompat.Filter recordFilter,
                          MetadataFilter metadataFilter,
                          CompressionCodecFactory codecFactory,
                          ByteBufferAllocator allocator,
                          int maxAllocationSize,
                          Map<String, String> properties,
                          Configuration conf) {
  super(
      useSignedStringMinMax, useStatsFilter, useDictionaryFilter, useRecordFilter,
      useColumnIndexFilter, usePageChecksumVerification, useBloomFilter, recordFilter,
      metadataFilter, codecFactory, allocator, maxAllocationSize, properties
  );
  this.conf = conf;
}
Example #10
Source File: TestColumnIndexFilter.java From parquet-mr with Apache License 2.0
@Test
public void testFilteringWithAllNullPages() {
  Set<ColumnPath> paths = paths("column1", "column5");

  assertAllRows(calculateRowRanges(FilterCompat.get(
      notEq(longColumn("column5"), 1234567L)),
      STORE, paths, TOTAL_ROW_COUNT), TOTAL_ROW_COUNT);
  assertAllRows(calculateRowRanges(FilterCompat.get(
      or(gtEq(intColumn("column1"), 10),
          notEq(longColumn("column5"), 1234567L))),
      STORE, paths, TOTAL_ROW_COUNT), TOTAL_ROW_COUNT);
  assertRows(calculateRowRanges(FilterCompat.get(
      eq(longColumn("column5"), 1234567L)),
      STORE, paths, TOTAL_ROW_COUNT));
  assertRows(calculateRowRanges(FilterCompat.get(
      and(lt(intColumn("column1"), 20),
          gtEq(longColumn("column5"), 1234567L))),
      STORE, paths, TOTAL_ROW_COUNT));
}
Example #11
Source File: TestColumnIndexFiltering.java From parquet-mr with Apache License 2.0
@Test
public void testNoFiltering() throws IOException {
  // Column index filtering with no-op filter
  assertEquals(DATA, readUsers(FilterCompat.NOOP, false));
  assertEquals(DATA, readUsers(FilterCompat.NOOP, true));

  // Column index filtering turned off
  assertEquals(DATA.stream().filter(user -> user.getId() == 1234).collect(Collectors.toList()),
      readUsers(eq(longColumn("id"), 1234L), true, false));
  assertEquals(DATA.stream().filter(user -> "miller".equals(user.getName())).collect(Collectors.toList()),
      readUsers(eq(binaryColumn("name"), Binary.fromString("miller")), true, false));
  assertEquals(DATA.stream().filter(user -> user.getName() == null).collect(Collectors.toList()),
      readUsers(eq(binaryColumn("name"), null), true, false));

  // Every filtering mechanism turned off
  assertEquals(DATA, readUsers(eq(longColumn("id"), 1234L), false, false));
  assertEquals(DATA, readUsers(eq(binaryColumn("name"), Binary.fromString("miller")), false, false));
  assertEquals(DATA, readUsers(eq(binaryColumn("name"), null), false, false));
}
Example #12
Source File: TestColumnIndexFiltering.java From parquet-mr with Apache License 2.0
@Test
public void testFilteringWithProjection() throws IOException {
  // All rows shall be retrieved because all values in column 'name' shall be handled as null values
  assertEquals(
      DATA.stream().map(user -> user.cloneWithName(null)).collect(toList()),
      readUsersWithProjection(FilterCompat.get(eq(binaryColumn("name"), null)), SCHEMA_WITHOUT_NAME, true, true));

  // Column index filter shall drop all pages because all values in column 'name' shall be handled as null values
  assertEquals(
      emptyList(),
      readUsersWithProjection(FilterCompat.get(notEq(binaryColumn("name"), null)), SCHEMA_WITHOUT_NAME, false, true));
  assertEquals(
      emptyList(),
      readUsersWithProjection(FilterCompat.get(userDefined(binaryColumn("name"), NameStartsWithVowel.class)),
          SCHEMA_WITHOUT_NAME, false, true));
}
Example #13
Source File: TestFiltersWithMissingColumns.java From parquet-mr with Apache License 2.0
public static long countFilteredRecords(Path path, FilterPredicate pred) throws IOException {
  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), path)
      .withFilter(FilterCompat.get(pred))
      .build();

  long count = 0;
  try {
    while (reader.read() != null) {
      count += 1;
    }
  } finally {
    reader.close();
  }
  return count;
}
Example #14
Source File: TestRecordLevelFilters.java From parquet-mr with Apache License 2.0
@Test
public void testUserDefinedByInstance() throws Exception {
  LongColumn name = longColumn("id");

  final HashSet<Long> h = new HashSet<Long>();
  h.add(20L);
  h.add(27L);
  h.add(28L);

  FilterPredicate pred = userDefined(name, new SetInFilter(h));

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(pred));

  assertFilter(found, new UserFilter() {
    @Override
    public boolean keep(User u) {
      return u != null && h.contains(u.getId());
    }
  });
}
Example #15
Source File: ParquetColumnarRowSplitReader.java From flink with Apache License 2.0
public ParquetColumnarRowSplitReader(
    boolean utcTimestamp,
    boolean caseSensitive,
    Configuration conf,
    LogicalType[] selectedTypes,
    String[] selectedFieldNames,
    ColumnBatchGenerator generator,
    int batchSize,
    Path path,
    long splitStart,
    long splitLength) throws IOException {
  this.utcTimestamp = utcTimestamp;
  this.selectedTypes = selectedTypes;
  this.batchSize = batchSize;
  // then we need to apply the predicate push down filter
  ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength));
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(conf);
  List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);

  this.fileSchema = footer.getFileMetaData().getSchema();
  this.requestedSchema = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive);
  this.reader = new ParquetFileReader(
      conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());

  long totalRowCount = 0;
  for (BlockMetaData block : blocks) {
    totalRowCount += block.getRowCount();
  }
  this.totalRowCount = totalRowCount;
  this.nextRow = 0;
  this.rowsInBatch = 0;
  this.rowsReturned = 0;

  checkSchema();

  this.writableVectors = createWritableVectors();
  this.columnarBatch = generator.generate(createReadableVectors());
  this.row = new ColumnarRowData(columnarBatch);
}
Example #16
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testFilterOnString() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // First try matching against the A url in record 1
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://A"))));

  readOne(recordReader, "r2 filtered out", r1);

  // Second try matching against the B url in record 1 - it should fail as we only match
  // against the first instance of a url
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://B"))));

  List<Group> all = readAll(recordReader);
  assertEquals("There should be no matching records: " + all, 0, all.size());

  // Finally try matching against the C url in record 2
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://C"))));

  readOne(recordReader, "r1 filtered out", r2);
}
Example #17
Source File: TestFiltered.java From parquet-mr with Apache License 2.0
@Test
public void testApplyFunctionFilterOnString() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // First try matching against the A url in record 1
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", applyFunctionToString(new StringEndsWithAPredicate()))));

  readOne(recordReader, "r2 filtered out", r1);

  // Second try matching against the B url in record 1 - it should fail as we only match
  // against the first instance of a url
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://B"))));

  List<Group> all = readAll(recordReader);
  assertEquals("There should be no matching records: " + all, 0, all.size());

  // Finally try matching against the C url in record 2
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://C"))));

  readOne(recordReader, "r1 filtered out", r2);
}
Example #18
Source File: ParquetFileAccessor.java From pxf with Apache License 2.0
/**
 * Opens the resource for read.
 *
 * @throws IOException if opening the resource failed
 */
@Override
public boolean openForRead() throws IOException {
  file = new Path(context.getDataSource());
  FileSplit fileSplit = HdfsUtilities.parseFileSplit(context);

  // Read the original schema from the parquet file
  MessageType originalSchema = getSchema(file, fileSplit);
  // Get a map of column names to Types for the given schema
  Map<String, Type> originalFieldsMap = getOriginalFieldsMap(originalSchema);
  // Get the read schema. This is either the full set or a subset (in
  // case of column projection) of the greenplum schema.
  MessageType readSchema = buildReadSchema(originalFieldsMap, originalSchema);
  // Get the record filter in case of predicate push-down
  FilterCompat.Filter recordFilter = getRecordFilter(context.getFilterString(), originalFieldsMap, readSchema);

  // add column projection
  configuration.set(PARQUET_READ_SCHEMA, readSchema.toString());

  fileReader = ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(configuration)
      // Create reader for a given split, read a range in file
      .withFileRange(fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength())
      .withFilter(recordFilter)
      .build();
  context.setMetadata(readSchema);
  return true;
}
Example #19
Source File: ParquetFileAccessor.java From pxf with Apache License 2.0
/**
 * Returns the parquet record filter for the given filter string
 *
 * @param filterString      the filter string
 * @param originalFieldsMap a map of field names to types
 * @param schema            the parquet schema
 * @return the parquet record filter for the given filter string
 */
private FilterCompat.Filter getRecordFilter(String filterString, Map<String, Type> originalFieldsMap,
    MessageType schema) {
  if (StringUtils.isBlank(filterString)) {
    return FilterCompat.NOOP;
  }

  ParquetRecordFilterBuilder filterBuilder = new ParquetRecordFilterBuilder(
      context.getTupleDescription(), originalFieldsMap);
  TreeVisitor pruner = new ParquetOperatorPrunerAndTransformer(
      context.getTupleDescription(), originalFieldsMap, SUPPORTED_OPERATORS);

  try {
    // Parse the filter string into an expression tree Node
    Node root = new FilterParser().parse(filterString);
    // Prune the parsed tree with valid supported operators and then
    // traverse the pruned tree with the ParquetRecordFilterBuilder to
    // produce a record filter for parquet
    TRAVERSER.traverse(root, pruner, filterBuilder);
    return filterBuilder.getRecordFilter();
  } catch (Exception e) {
    LOG.error(String.format("%s-%d: %s--%s Unable to generate Parquet Record Filter for filter",
        context.getTransactionId(),
        context.getSegmentId(),
        context.getDataSource(),
        context.getFilterString()), e);
    return FilterCompat.NOOP;
  }
}
Example #20
Source File: ParquetRecordFilterBuilder.java From pxf with Apache License 2.0
/**
 * Returns the built record filter
 *
 * @return the built record filter
 */
public FilterCompat.Filter getRecordFilter() {
  FilterPredicate predicate = filterQueue.poll();
  if (!filterQueue.isEmpty()) {
    throw new IllegalStateException("Filter queue is not empty after visiting all nodes");
  }
  return predicate != null ? FilterCompat.get(predicate) : FilterCompat.NOOP;
}
Example #21
Source File: ParquetInputFormat.java From flink with Apache License 2.0
@Override
public void open(FileInputSplit split) throws IOException {
  // reset the flag when opening a new split
  this.skipThisSplit = false;
  org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(split.getPath().toUri()), configuration);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);
  MessageType fileSchema = fileReader.getFileMetaData().getSchema();
  MessageType readSchema = getReadSchema(fileSchema, split.getPath());
  if (skipThisSplit) {
    LOG.warn(String.format(
        "Escaped the file split [%s] due to mismatch of file schema to expected result schema",
        split.getPath().toString()));
  } else {
    this.parquetRecordReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema,
        filterPredicate == null ? FilterCompat.NOOP : FilterCompat.get(filterPredicate));
    this.parquetRecordReader.initialize(fileReader, configuration);
    this.parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord);

    if (this.recordConsumed == null) {
      this.recordConsumed = getRuntimeContext().getMetricGroup().counter("parquet-records-consumed");
    }

    LOG.debug(String.format("Open ParquetInputFormat with FileInputSplit [%s]", split.getPath().toString()));
  }
}
Example #22
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
private void checkRead() throws IOException {
  if (current == totalCountLoadedSoFar) {
    if (current != 0) {
      totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
      if (LOG.isInfoEnabled()) {
        LOG.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount
            + " columns in " + totalTimeSpentProcessingRecords + " ms: "
            + ((float) totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, "
            + ((float) totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
        final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
        if (totalTime != 0) {
          final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
          final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
          LOG.info("time spent so far " + percentReading + "% reading (" + totalTimeSpentReadingBytes
              + " ms) and " + percentProcessing + "% processing (" + totalTimeSpentProcessingRecords + " ms)");
        }
      }
    }

    LOG.info("at row " + current + ". reading next block");
    long t0 = System.currentTimeMillis();
    PageReadStore pages = reader.readNextFilteredRowGroup();
    if (pages == null) {
      throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
    }
    long timeSpentReading = System.currentTimeMillis() - t0;
    totalTimeSpentReadingBytes += timeSpentReading;
    BenchmarkCounter.incrementTime(timeSpentReading);
    if (LOG.isInfoEnabled()) {
      LOG.info("block read in memory in {} ms. row count = {}", timeSpentReading, pages.getRowCount());
    }
    LOG.debug("initializing Record assembly with requested schema {}", requestedSchema);
    MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
    recordReader = columnIO.getRecordReader(pages, recordConverter,
        filterRecords ? filter : FilterCompat.NOOP);
    startedAssemblingCurrentBlockAt = System.currentTimeMillis();
    totalCountLoadedSoFar += pages.getRowCount();
    ++currentBlock;
  }
}
Example #23
Source File: ParquetReader.java From parquet-mr with Apache License 2.0
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, FilterCompat.Filter filter)
    throws IOException {
  this(Collections.singletonList((InputFile) HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(Objects.requireNonNull(filter, "filter cannot be null"))
          .build(),
      readSupport);
}
Example #24
Source File: ParquetReadOptions.java From parquet-mr with Apache License 2.0
ParquetReadOptions(boolean useSignedStringMinMax,
                   boolean useStatsFilter,
                   boolean useDictionaryFilter,
                   boolean useRecordFilter,
                   boolean useColumnIndexFilter,
                   boolean usePageChecksumVerification,
                   boolean useBloomFilter,
                   FilterCompat.Filter recordFilter,
                   ParquetMetadataConverter.MetadataFilter metadataFilter,
                   CompressionCodecFactory codecFactory,
                   ByteBufferAllocator allocator,
                   int maxAllocationSize,
                   Map<String, String> properties) {
  this.useSignedStringMinMax = useSignedStringMinMax;
  this.useStatsFilter = useStatsFilter;
  this.useDictionaryFilter = useDictionaryFilter;
  this.useRecordFilter = useRecordFilter;
  this.useColumnIndexFilter = useColumnIndexFilter;
  this.usePageChecksumVerification = usePageChecksumVerification;
  this.useBloomFilter = useBloomFilter;
  this.recordFilter = recordFilter;
  this.metadataFilter = metadataFilter;
  this.codecFactory = codecFactory;
  this.allocator = allocator;
  this.maxAllocationSize = maxAllocationSize;
  this.properties = Collections.unmodifiableMap(properties);
}
Example #25
Source File: TestBloomFiltering.java From parquet-mr with Apache License 2.0
private List<PhoneBookWriter.User> readUsers(FilterPredicate filter, boolean useOtherFiltering,
    boolean useBloomFilter) throws IOException {
  return PhoneBookWriter.readUsers(ParquetReader.builder(new GroupReadSupport(), file)
      .withFilter(FilterCompat.get(filter))
      .useDictionaryFilter(useOtherFiltering)
      .useStatsFilter(useOtherFiltering)
      .useRecordFilter(useOtherFiltering)
      .useBloomFilter(useBloomFilter)
      .useColumnIndexFilter(useOtherFiltering));
}
Example #26
Source File: TestMultipleWriteRead.java From parquet-mr with Apache License 2.0
private void validateFileWithComplexFilter(Path file, List<Group> data) throws IOException {
  Binary binaryValueB = fromString("b");
  Filter filter = FilterCompat.get(
      and(
          gtEq(intColumn("id"), 0),
          and(
              lt(binaryColumn("name"), binaryValueB),
              notEq(binaryColumn("comment"), null))));
  Predicate<Group> predicate = group -> group.getInteger("id", 0) >= 0
      && BINARY_COMPARATOR.compare(group.getBinary("name", 0), binaryValueB) < 0
      && group.getFieldRepetitionCount("comment") > 0;
  validateFile(file, filter, data.stream().filter(predicate));
}
Example #27
Source File: TestRecordLevelFilters.java From parquet-mr with Apache License 2.0
@Test
public void testNoFilter() throws Exception {
  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.NOOP);
  assertFilter(found, new UserFilter() {
    @Override
    public boolean keep(User u) {
      return true;
    }
  });
}
Example #28
Source File: TestRecordLevelFilters.java From parquet-mr with Apache License 2.0
@Test
public void testAllFilter() throws Exception {
  BinaryColumn name = binaryColumn("name");

  FilterPredicate pred = eq(name, Binary.fromString("no matches"));

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(pred));
  assertEquals(new ArrayList<Group>(), found);
}
Example #29
Source File: TestRecordLevelFilters.java From parquet-mr with Apache License 2.0
@Test
public void testNameNotNull() throws Exception {
  BinaryColumn name = binaryColumn("name");

  FilterPredicate pred = notEq(name, null);

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(pred));
  assertFilter(found, new UserFilter() {
    @Override
    public boolean keep(User u) {
      return u.getName() != null;
    }
  });
}
Example #30
Source File: TestRecordLevelFilters.java From parquet-mr with Apache License 2.0
@Test
public void testNameNotStartWithP() throws Exception {
  BinaryColumn name = binaryColumn("name");

  FilterPredicate pred = not(userDefined(name, StartWithP.class));

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(pred));
  assertFilter(found, new UserFilter() {
    @Override
    public boolean keep(User u) {
      return u.getName() == null || !u.getName().startsWith("p");
    }
  });
}