org.apache.parquet.hadoop.metadata.ColumnPath Java Examples

Source File: TestFilterApiMethods.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Walks the shared {@code predicate} and checks its shape, and(not(or(eq, notEq)), gt),
 * together with the value and column path stored in each leaf.
 */
@Test
public void testFilterPredicateCreation() {
  FilterPredicate root = predicate;
  assertTrue(root instanceof And);
  And rootAnd = (And) root;

  FilterPredicate negation = rootAnd.getLeft();
  FilterPredicate greaterThan = rootAnd.getRight();
  assertTrue(negation instanceof Not);

  FilterPredicate disjunction = ((Not) negation).getPredicate();
  assertTrue(disjunction instanceof Or);
  Or innerOr = (Or) disjunction;

  FilterPredicate leftLeaf = innerOr.getLeft();
  FilterPredicate rightLeaf = innerOr.getRight();
  assertTrue(leftLeaf instanceof Eq);
  assertTrue(rightLeaf instanceof NotEq);

  // both leaves of the "or" refer to the same column a.b.c
  Eq eqLeaf = (Eq) leftLeaf;
  NotEq notEqLeaf = (NotEq) rightLeaf;
  ColumnPath abc = ColumnPath.get("a", "b", "c");
  assertEquals(7, eqLeaf.getValue());
  assertEquals(17, notEqLeaf.getValue());
  assertEquals(abc, eqLeaf.getColumn().getColumnPath());
  assertEquals(abc, notEqLeaf.getColumn().getColumnPath());

  // right-hand side of the outer "and"
  assertTrue(greaterThan instanceof Gt);
  Gt gtLeaf = (Gt) greaterThan;
  assertEquals(100.0, gtLeaf.getValue());
  assertEquals(ColumnPath.get("x", "y", "z"), gtLeaf.getColumn().getColumnPath());
}

Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Starts a column inside a block and resets the per-chunk bookkeeping fields of this writer.
 *
 * @param descriptor the column descriptor; its path and primitive type are recorded for the chunk
 * @param valueCount the value count in this column
 * @param compressionCodecName a compression codec name
 * @throws IOException if there is an error while writing
 */
public void startColumn(ColumnDescriptor descriptor,
                        long valueCount,
                        CompressionCodecName compressionCodecName) throws IOException {
  // move the writer state forward before touching any per-chunk field
  state = state.startColumn();
  // start from empty encoding bookkeeping for the new chunk
  encodingStatsBuilder.clear();
  currentEncodings = new HashSet<Encoding>();
  // remember what is being written: path, type, codec and value count
  currentChunkPath = ColumnPath.get(descriptor.getPath());
  currentChunkType = descriptor.getPrimitiveType();
  currentChunkCodec = compressionCodecName;
  currentChunkValueCount = valueCount;
  // the current output position is recorded as the chunk's first data page position
  currentChunkFirstDataPage = out.getPos();
  compressedLength = 0;
  uncompressedLength = 0;
  // The statistics will be copied from the first one added at writeDataPage(s) so we have the correct typed one
  currentStatistics = null;

  // fresh index builders for this chunk; -1 is the initial value of firstPageOffset
  columnIndexBuilder = ColumnIndexBuilder.getBuilder(currentChunkType, columnIndexTruncateLength);
  offsetIndexBuilder = OffsetIndexBuilder.getBuilder();
  firstPageOffset = -1;
}

Source File: CompressionConverter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Reads the row groups of {@code reader} one after another and, for every column chunk listed in
 * the matching block of {@code meta}, starts a column on {@code writer} with {@code codecName},
 * delegates the chunk to {@code processChunk} and ends the column.
 *
 * @param reader source of the row groups
 * @param writer destination writer; one block is started and ended per row group
 * @param meta metadata whose blocks are looked up by row-group position
 * @param schema schema used to resolve the column descriptors
 * @param createdBy passed through to the column read store and to {@code processChunk}
 * @param codecName codec name passed to {@code startColumn} and {@code processChunk}
 * @throws IOException if reading or writing fails
 */
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType schema,
                           String createdBy, CompressionCodecName codecName) throws IOException {
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();

  // The lookup depends only on the schema, so it is built once instead of once per row group.
  Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
    Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));

  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    // columns are processed in the order recorded in the block metadata
    for (ColumnChunkMetaData chunk : blockMetaData.getColumns()) {
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}

Source File: PruneColumnsCommand.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Prunes a single field.
 *
 * <p>The field name is pushed onto {@code currentPath} for the duration of the call and popped
 * again before returning.
 *
 * @param field the field to examine
 * @param currentPath mutable path of the enclosing fields; restored to its input content on return
 * @param prunePaths the column paths to remove
 * @return {@code null} if the field's path is in {@code prunePaths} or if it is a group with no
 *         remaining children; the field itself if it is a kept primitive; otherwise a copy of the
 *         group holding only the remaining children
 */
private Type pruneColumnsInField(Type field, List<String> currentPath, Set<ColumnPath> prunePaths) {
  currentPath.add(field.getName());
  ColumnPath path = ColumnPath.get(currentPath.toArray(new String[0]));
  Type prunedField = null;
  if (!prunePaths.contains(path)) {
    if (field.isPrimitive()) {
      prunedField = field;
    } else {
      GroupType group = (GroupType) field;
      List<Type> prunedFields = pruneColumnsInFields(group.getFields(), currentPath, prunePaths);
      if (!prunedFields.isEmpty()) {
        prunedField = group.withNewFields(prunedFields);
      }
    }
  }

  // Pop by position: remove(Object) would delete the first element equal to the name, which is
  // the wrong entry whenever the same name occurs earlier in the path (e.g. a.b.a).
  currentPath.remove(currentPath.size() - 1);
  return prunedField;
}

Source File: FilteringRecordMaterializer.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Wraps {@code delegate} so that its root converter is proxied by a {@link FilteringGroupConverter}.
 *
 * @param delegate the materializer whose root converter is wrapped; must not be null
 * @param columnIOs the primitive columns, indexed here by their index field path; must not be null
 * @param valueInspectorsByColumn inspectors per column path, handed to the root proxy; must not be null
 * @param filterPredicate the predicate kept by this materializer; must not be null
 */
public FilteringRecordMaterializer(
    RecordMaterializer<T> delegate,
    List<PrimitiveColumnIO> columnIOs,
    Map<ColumnPath, List<ValueInspector>> valueInspectorsByColumn,
    IncrementallyUpdatedFilterPredicate filterPredicate) {

  // argument checks, in the same order as the messages below
  Objects.requireNonNull(columnIOs, "columnIOs cannot be null");
  Objects.requireNonNull(valueInspectorsByColumn, "valueInspectorsByColumn cannot be null");
  Objects.requireNonNull(filterPredicate, "filterPredicate cannot be null");
  Objects.requireNonNull(delegate, "delegate cannot be null");
  this.filterPredicate = filterPredicate;
  this.delegate = delegate;

  // index path (as a list of boxed ints) -> primitive column reached through it
  Map<List<Integer>, PrimitiveColumnIO> leavesByIndexPath = new HashMap<>();
  columnIOs.forEach(leaf -> leavesByIndexPath.put(
      Arrays.stream(leaf.getIndexFieldPath()).boxed().collect(Collectors.toList()),
      leaf));

  // the root proxy starts with an empty index path
  this.rootConverter = new FilteringGroupConverter(
      delegate.getRootConverter(), Collections.emptyList(),
      valueInspectorsByColumn, leavesByIndexPath);
}

Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Creates a reader from caller-supplied file metadata, blocks and columns; this constructor does
 * not read a footer itself.
 *
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read; passed through {@code filterRowGroups} before being stored
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  // the input stream is opened here, in the constructor
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  // both per-block lists are sized from the filtered block list
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  // register each requested column under its ColumnPath
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  // a CRC32 instance is only created when page checksum verification is enabled
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}

Source File: FilteringGroupConverter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Returns a filtering proxy around the delegate's converter for {@code fieldIndex}: a
 * {@link FilteringGroupConverter} for group converters, a {@link FilteringPrimitiveConverter}
 * carrying the column's value inspectors for primitive ones.
 */
@Override
public Converter getConverter(int fieldIndex) {
  Converter wrapped = Objects.requireNonNull(delegate.getConverter(fieldIndex), "delegate converter cannot be null");

  // path of the proxy being created = this converter's path followed by the requested field index
  List<Integer> childPath = new ArrayList<>(indexFieldPath.size() + 1);
  childPath.addAll(indexFieldPath);
  childPath.add(fieldIndex);

  if (!wrapped.isPrimitive()) {
    return new FilteringGroupConverter(wrapped.asGroupConverter(), childPath, valueInspectorsByColumn, columnIOsByIndexFieldPath);
  }

  // primitive leaf: resolve its column path to find the inspectors to attach
  ColumnPath columnPath = ColumnPath.get(getColumnIO(childPath).getColumnDescriptor().getPath());
  ValueInspector[] inspectors = getValueInspectors(columnPath);
  return new FilteringPrimitiveConverter(wrapped.asPrimitiveConverter(), inspectors);
}

Source File: TestColumnIndexFilter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Exercises filters on column5 (alone and combined with column1) against the shared STORE.
 */
@Test
public void testFilteringWithAllNullPages() {
  Set<ColumnPath> paths = paths("column1", "column5");

  // the next two filters are asserted to keep all TOTAL_ROW_COUNT rows
  FilterCompat.Filter notEqFilter = FilterCompat.get(
      notEq(longColumn("column5"), 1234567L));
  assertAllRows(calculateRowRanges(notEqFilter, STORE, paths, TOTAL_ROW_COUNT), TOTAL_ROW_COUNT);

  FilterCompat.Filter orFilter = FilterCompat.get(
      or(gtEq(intColumn("column1"), 10),
          notEq(longColumn("column5"), 1234567L)));
  assertAllRows(calculateRowRanges(orFilter, STORE, paths, TOTAL_ROW_COUNT), TOTAL_ROW_COUNT);

  // the last two filters are asserted with no expected row indexes
  FilterCompat.Filter eqFilter = FilterCompat.get(
      eq(longColumn("column5"), 1234567L));
  assertRows(calculateRowRanges(eqFilter, STORE, paths, TOTAL_ROW_COUNT));

  FilterCompat.Filter andFilter = FilterCompat.get(
      and(lt(intColumn("column1"), 20),
          gtEq(longColumn("column5"), 1234567L)));
  assertRows(calculateRowRanges(andFilter, STORE, paths, TOTAL_ROW_COUNT));
}

Source File: ReadConf.java From iceberg with Apache License 2.0

6 votes

/**
 * Builds one map per row group, in row-group order. For a row group flagged in
 * {@code shouldSkip} the map is empty; otherwise it holds the chunk metadata of every projected
 * column, keyed by column path.
 */
private List<Map<ColumnPath, ColumnChunkMetaData>> getColumnChunkMetadataForRowGroups() {
  Set<ColumnPath> projectedColumns = projection.getColumns().stream()
      .map(columnDescriptor -> ColumnPath.get(columnDescriptor.getPath()))
      .collect(Collectors.toSet());

  ImmutableList.Builder<Map<ColumnPath, ColumnChunkMetaData>> perRowGroup = ImmutableList.builder();
  for (int i = 0; i < rowGroups.size(); i++) {
    if (shouldSkip[i]) {
      // keep list positions aligned with row-group indexes
      perRowGroup.add(ImmutableMap.of());
      continue;
    }

    ImmutableMap.Builder<ColumnPath, ColumnChunkMetaData> chunksByPath = ImmutableMap.builder();
    for (ColumnChunkMetaData chunk : rowGroups.get(i).getColumns()) {
      if (projectedColumns.contains(chunk.getPath())) {
        chunksByPath.put(chunk.getPath(), chunk);
      }
    }
    perRowGroup.add(chunksByPath.build());
  }
  return perRowGroup.build();
}

Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Opens a stream on {@code file}, reads the footer and initialises the reader from it.
 *
 * @param file the file to read
 * @param options the read options; also used to build the metadata converter
 * @throws IOException if the stream cannot be opened or the footer cannot be read
 */
public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
  this.converter = new ParquetMetadataConverter(options);
  this.file = file;
  this.f = file.newStream();
  this.options = options;
  try {
    this.footer = readFooter(file, options, f, converter);
  } catch (Exception e) {
    // In case that reading footer throws an exception in the constructor, the new stream
    // should be closed. Otherwise, there's no way to close this outside.
    try {
      f.close();
    } catch (Exception closeFailure) {
      // keep the footer failure as the primary error instead of letting close() replace it
      e.addSuppressed(closeFailure);
    }
    throw e;
  }
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  // both per-block lists are sized from the filtered block list
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  // register every column of the file schema under its ColumnPath
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  // a CRC32 instance is only created when page checksum verification is enabled
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}

Source File: ColumnIndexFilter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Computes the ranges of row indexes that might match {@code filter}.
 *
 * <p>Only predicate-based filters are evaluated through a {@link ColumnIndexFilter}; for the other
 * filter kinds, and when a {@link MissingOffsetIndexException} is raised during evaluation, a single
 * range built from {@code rowCount} is returned.
 *
 * @param filter
 *          to be used for filtering the rows
 * @param columnIndexStore
 *          the store for providing column/offset indexes
 * @param paths
 *          the paths of the columns used in the actual projection; a column not being part of the projection will be
 *          handled as containing {@code null} values only even if the column has values written in the file
 * @param rowCount
 *          the total number of rows in the row-group
 * @return the ranges of the possible matching row indexes; the returned ranges will contain all the rows if any of
 *         the required offset index is missing
 */
public static RowRanges calculateRowRanges(FilterCompat.Filter filter, ColumnIndexStore columnIndexStore,
    Set<ColumnPath> paths, long rowCount) {
  FilterCompat.Visitor<RowRanges> rangeCalculator = new FilterCompat.Visitor<RowRanges>() {
    @Override
    public RowRanges visit(NoOpFilter noOpFilter) {
      return RowRanges.createSingle(rowCount);
    }

    @Override
    public RowRanges visit(UnboundRecordFilterCompat unboundRecordFilterCompat) {
      return RowRanges.createSingle(rowCount);
    }

    @Override
    public RowRanges visit(FilterPredicateCompat filterPredicateCompat) {
      RowRanges matching;
      try {
        matching = filterPredicateCompat.getFilterPredicate()
            .accept(new ColumnIndexFilter(columnIndexStore, paths, rowCount));
      } catch (MissingOffsetIndexException e) {
        // log the reason and fall back to the single range built from rowCount
        LOGGER.info(e.getMessage());
        matching = RowRanges.createSingle(rowCount);
      }
      return matching;
    }
  };
  return filter.accept(rangeCalculator);
}

Source File: TestColumnIndexFilter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Returns the fixture offset index for "column1" … "column5"; any other column name results in a
 * {@link MissingOffsetIndexException}.
 */
@Override
public OffsetIndex getOffsetIndex(ColumnPath column) {
  String name = column.toDotString();
  if (name.equals("column1")) {
    return COLUMN1_OI;
  }
  if (name.equals("column2")) {
    return COLUMN2_OI;
  }
  if (name.equals("column3")) {
    return COLUMN3_OI;
  }
  if (name.equals("column4")) {
    return COLUMN4_OI;
  }
  if (name.equals("column5")) {
    return COLUMN5_OI;
  }
  throw new MissingOffsetIndexException(column);
}

Source File: TestColumnIndexFilter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Returns the fixture column index for "column1" … "column5", or {@code null} for any other
 * column name.
 */
@Override
public ColumnIndex getColumnIndex(ColumnPath column) {
  String name = column.toDotString();
  if (name.equals("column1")) {
    return COLUMN1_CI;
  }
  if (name.equals("column2")) {
    return COLUMN2_CI;
  }
  if (name.equals("column3")) {
    return COLUMN3_CI;
  }
  if (name.equals("column4")) {
    return COLUMN4_CI;
  }
  if (name.equals("column5")) {
    return COLUMN5_CI;
  }
  return null;
}

Source File: TestStatisticsFilter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Builds chunk metadata for the INT32 column "int.column" (GZIP, PLAIN encoding) carrying the
 * given statistics and value count; every other numeric argument is zero.
 */
private static ColumnChunkMetaData getIntColumnMeta(org.apache.parquet.column.statistics.Statistics<?> stats,
    long valueCount) {
  ColumnPath path = ColumnPath.get("int", "column");
  HashSet<Encoding> encodings = new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN));
  return ColumnChunkMetaData.get(
      path, PrimitiveTypeName.INT32, CompressionCodecName.GZIP, encodings, stats,
      0L, 0L, valueCount, 0L, 0L);
}

Source File: TestInputFormat.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates a block with a single INT32 column "foo" (GZIP, PLAIN encoding) that carries
 * {@code stats}; the block's row count is set to {@code valueCount} and its total byte size to 200.
 */
public static BlockMetaData makeBlockFromStats(IntStatistics stats, long valueCount) {
  HashSet<Encoding> encodings = new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN));
  ColumnChunkMetaData fooColumn = ColumnChunkMetaData.get(
      ColumnPath.get("foo"), PrimitiveTypeName.INT32, CompressionCodecName.GZIP, encodings, stats,
      100L, 100L, valueCount, 100L, 100L);

  BlockMetaData block = new BlockMetaData();
  block.addColumn(fooColumn);
  block.setTotalByteSize(200L);
  block.setRowCount(valueCount);
  return block;
}

Source File: ParquetUtil.java From iceberg with Apache License 2.0

5 votes

/**
 * Follows {@code columnPath} name by name through the struct types of {@code schema}.
 *
 * @return {@code true} only if every step before the last resolves to a struct type and the type
 *         reached at the end of the path is a non-null primitive type
 */
private static boolean shouldStoreBounds(ColumnPath columnPath, Schema schema) {
  Iterator<String> names = columnPath.iterator();
  Type cursor = schema.asStruct();

  while (names.hasNext()) {
    // can only descend further through an existing struct
    boolean canDescend = cursor != null && cursor.isStructType();
    if (!canDescend) {
      return false;
    }
    String segment = names.next();
    cursor = cursor.asStructType().fieldType(segment);
  }

  if (cursor == null) {
    return false;
  }
  return cursor.isPrimitiveType();
}

Source File: IncrementallyUpdatedFilterPredicateBuilderBase.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Registers, for every leaf column, the comparator of its primitive type under the column's path.
 *
 * @param leaves the primitive columns to register
 */
public IncrementallyUpdatedFilterPredicateBuilderBase(List<PrimitiveColumnIO> leaves) {
  for (PrimitiveColumnIO leaf : leaves) {
    ColumnDescriptor column = leaf.getColumnDescriptor();
    comparatorsByColumn.put(
        ColumnPath.get(column.getPath()),
        column.getPrimitiveType().comparator());
  }
}

Source File: VectorizedArrowReader.java From iceberg with Apache License 2.0

5 votes

/**
 * Looks up this reader's column chunk metadata in {@code metadata} and hands the column's page
 * reader to the vectorized iterator, together with a flag that is true when the chunk has no
 * non-dictionary pages; the iterator's return value is stored as {@code dictionary}.
 */
@Override
public void setRowGroupInfo(PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
  ColumnPath key = ColumnPath.get(columnDescriptor.getPath());
  ColumnChunkMetaData chunk = metadata.get(key);
  this.dictionary = vectorizedColumnIterator.setRowGroupInfo(
      source.getPageReader(columnDescriptor),
      !ParquetUtil.hasNonDictionaryPages(chunk));
}

Source File: ColumnarBatchReader.java From iceberg with Apache License 2.0

5 votes

/**
 * Forwards the row group information to every non-null reader in {@code readers}.
 */
@Override
public final void setRowGroupInfo(PageReadStore pageStore, Map<ColumnPath, ColumnChunkMetaData> metaData) {
  for (VectorizedArrowReader columnReader : readers) {
    if (columnReader == null) {
      // null entries are skipped
      continue;
    }
    columnReader.setRowGroupInfo(pageStore, metaData);
  }
}

Source File: TestInputFormat.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates a block with one BINARY column "foo" (GZIP, PLAIN encoding, empty statistics) whose
 * uncompressed size is twice {@code compressedBlockSize}; that size is also the block's total byte size.
 */
private BlockMetaData newBlock(long start, long compressedBlockSize) {
  // assuming the compression ratio is 2
  long uncompressedSize = compressedBlockSize * 2;
  HashSet<Encoding> encodings = new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN));
  ColumnChunkMetaData fooColumn = ColumnChunkMetaData.get(
      ColumnPath.get("foo"), PrimitiveTypeName.BINARY, CompressionCodecName.GZIP, encodings,
      new BinaryStatistics(),
      start, 0L, 0L, compressedBlockSize, uncompressedSize);

  BlockMetaData block = new BlockMetaData();
  block.addColumn(fooColumn);
  block.setTotalByteSize(uncompressedSize);
  return block;
}

Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Builds metadata for a file with the schema {@code optional int32 col (INT_32)} and a single
 * block holding one column chunk, whose encoding stats contain {@code dataEncoding} and, when it is
 * not null, {@code dicEncoding} as dictionary encoding.
 */
private static ParquetMetadata createParquetMetaData(Encoding dicEncoding,
  Encoding dataEncoding) {
  MessageType schema =
    parseMessageType("message schema { optional int32 col (INT_32); }");
  org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
    new org.apache.parquet.hadoop.metadata.FileMetaData(schema,
      new HashMap<String, String>(), null);

  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  BlockMetaData block = new BlockMetaData();

  EncodingStats.Builder statsBuilder = new EncodingStats.Builder();
  if (dicEncoding != null) {
    // the object returned by this build() call is not used
    statsBuilder.addDictEncoding(dicEncoding).build();
  }
  statsBuilder.addDataEncoding(dataEncoding);
  EncodingStats encodingStats = statsBuilder.build();

  // chunk description: empty encoding set, INT32 type, uncompressed, fresh BinaryStatistics
  Set<org.apache.parquet.column.Encoding> encodings =
    new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName type = PrimitiveTypeName.INT32;
  ColumnPath path = ColumnPath.get("col");
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  BinaryStatistics statistics = new BinaryStatistics();
  ColumnChunkMetaData chunk =
    ColumnChunkMetaData.get(path, type, codec, encodingStats, encodings, statistics, 20, 30, 0, 0, 0);

  block.addColumn(chunk);
  blocks.add(block);
  return new ParquetMetadata(fileMetaData, blocks);
}

Source File: IncrementallyUpdatedFilterPredicateBuilderBase.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Appends {@code valueInspector} to the list registered for {@code columnPath}, creating and
 * registering that list first when the column has none yet.
 */
protected final void addValueInspector(ColumnPath columnPath, ValueInspector valueInspector) {
  valueInspectorsByColumn
      .computeIfAbsent(columnPath, unused -> new ArrayList<>())
      .add(valueInspector);
}

Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates chunk metadata for the BINARY column "foo" (GZIP, empty encoding set, fresh
 * BinaryStatistics) with all numeric arguments set to zero.
 */
private ColumnChunkMetaData createColumnChunkMetaData() {
  ColumnPath path = ColumnPath.get("foo");
  PrimitiveTypeName type = PrimitiveTypeName.BINARY;
  CompressionCodecName codec = CompressionCodecName.GZIP;
  Set<org.apache.parquet.column.Encoding> encodings = new HashSet<org.apache.parquet.column.Encoding>();
  BinaryStatistics statistics = new BinaryStatistics();
  return ColumnChunkMetaData.get(path, type, codec, encodings, statistics,
          0, 0, 0, 0, 0);
}

Source File: ColumnIndexStoreImpl.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates a store for the given block and paths; if construction raises a
 * {@link MissingOffsetIndexException}, the shared {@code EMPTY} store is returned instead.
 */
static ColumnIndexStore create(ParquetFileReader reader, BlockMetaData block, Set<ColumnPath> paths) {
  ColumnIndexStore store;
  try {
    store = new ColumnIndexStoreImpl(reader, block, paths);
  } catch (MissingOffsetIndexException e) {
    store = EMPTY;
  }
  return store;
}

Source File: FilteringGroupConverter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Stores the four collaborators of this converter proxy; none of them may be null.
 *
 * @param delegate the group converter being wrapped
 * @param indexFieldPath the index path of this converter
 * @param valueInspectorsByColumn value inspectors keyed by column path
 * @param columnIOsByIndexFieldPath primitive columns keyed by index path
 */
public FilteringGroupConverter(
    GroupConverter delegate,
    List<Integer> indexFieldPath,
    Map<ColumnPath, List<ValueInspector>> valueInspectorsByColumn,
    Map<List<Integer>, PrimitiveColumnIO> columnIOsByIndexFieldPath) {

  // validate first, in the same order as the messages, then assign
  Objects.requireNonNull(delegate, "delegate cannot be null");
  Objects.requireNonNull(indexFieldPath, "indexFieldPath cannot be null");
  Objects.requireNonNull(columnIOsByIndexFieldPath, "columnIOsByIndexFieldPath cannot be null");
  Objects.requireNonNull(valueInspectorsByColumn, "valueInspectorsByColumn cannot be null");

  this.delegate = delegate;
  this.indexFieldPath = indexFieldPath;
  this.columnIOsByIndexFieldPath = columnIOsByIndexFieldPath;
  this.valueInspectorsByColumn = valueInspectorsByColumn;
}

Source File: SchemaCompatibilityValidator.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Indexes every column descriptor of {@code schema} by its column path.
 */
private SchemaCompatibilityValidator(MessageType schema) {
  for (ColumnDescriptor descriptor : schema.getColumns()) {
    columnsAccordingToSchema.put(ColumnPath.get(descriptor.getPath()), descriptor);
  }
}

Source File: SchemaCompatibilityValidator.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Validates one column reference of a predicate.
 *
 * @throws IllegalArgumentException if the column was already seen with a different type in this
 *         predicate, or if the schema describes it as repeated
 */
private <T extends Comparable<T>> void validateColumn(Column<T> column) {
  ColumnPath path = column.getColumnPath();

  Class<?> previousType = columnTypesEncountered.get(path);
  if (previousType == null) {
    // first time this column is seen: remember the type it was used with
    columnTypesEncountered.put(path, column.getColumnType());
  } else if (!previousType.equals(column.getColumnType())) {
    throw new IllegalArgumentException("Column: "
        + path.toDotString()
        + " was provided with different types in the same predicate."
        + " Found both: (" + previousType + ", " + column.getColumnType() + ")");
  }

  ColumnDescriptor descriptor = getColumnDescriptor(path);
  if (descriptor == null) {
    // The column is missing from the schema; nothing further is validated for it here.
    return;
  }

  boolean repeated = descriptor.getMaxRepetitionLevel() > 0;
  if (repeated) {
    throw new IllegalArgumentException("FilterPredicates do not currently support repeated columns. "
        + "Column " + path.toDotString() + " is repeated.");
  }

  ValidTypeMap.assertTypeValid(column, descriptor.getType());
}

Source File: TestColumnIndexFilter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Exercises filters that reference "missing_column", which is not among the projected paths.
 */
@Test
public void testFilteringOnMissingColumns() {
  Set<ColumnPath> paths = paths("column1", "column2", "column3", "column4");

  // Missing column filter is always true
  FilterCompat.Filter missingNotEq = FilterCompat.get(
      notEq(intColumn("missing_column"), 0));
  assertAllRows(calculateRowRanges(missingNotEq, STORE, paths, TOTAL_ROW_COUNT), TOTAL_ROW_COUNT);

  FilterCompat.Filter rangeAndMissingIsNull = FilterCompat.get(
      and(
          and(
              gtEq(intColumn("column1"), 7),
              lt(intColumn("column1"), 11)),
          eq(binaryColumn("missing_column"), null)));
  assertRows(calculateRowRanges(rangeAndMissingIsNull, STORE, paths, TOTAL_ROW_COUNT),
      7, 8, 9, 10, 11, 12, 13);

  // Missing column filter is always false
  FilterCompat.Filter rangeOrMissingNotNull = FilterCompat.get(
      or(
          and(
              gtEq(intColumn("column1"), 7),
              lt(intColumn("column1"), 11)),
          notEq(binaryColumn("missing_column"), null)));
  assertRows(calculateRowRanges(rangeOrMissingNotNull, STORE, paths, TOTAL_ROW_COUNT),
      7, 8, 9, 10, 11, 12, 13);

  FilterCompat.Filter missingGt = FilterCompat.get(
      gt(intColumn("missing_column"), 0));
  assertRows(calculateRowRanges(missingGt, STORE, paths, TOTAL_ROW_COUNT));
}

Source File: TestColumnIndexFilter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Uses a filter that references "column_wo_oi" and asserts that all TOTAL_ROW_COUNT rows are kept.
 */
@Test
public void testFilteringWithMissingOffsetIndex() {
  Set<ColumnPath> paths = paths("column1", "column2", "column3", "column4", "column_wo_oi");

  FilterCompat.Filter filter = FilterCompat.get(
      and(
          and(
              gtEq(intColumn("column1"), 7),
              lt(intColumn("column1"), 11)),
          and(
              gt(binaryColumn("column2"), fromString("Romeo")),
              ltEq(binaryColumn("column_wo_oi"), fromString("Tango")))));

  assertAllRows(calculateRowRanges(filter, STORE, paths, TOTAL_ROW_COUNT), TOTAL_ROW_COUNT);
}

Source File: ColumnIndexValidator.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Records a contract violation together with the place where it was found.
 *
 * @param violatedContract the contract that was violated
 * @param referenceValue the reference value, as text
 * @param offendingValue the offending value, as text
 * @param rowGroupNumber number of the row group
 * @param columnNumber number of the column
 * @param columnPath path of the column
 * @param pageNumber number of the page
 */
public ContractViolation(Contract violatedContract, String referenceValue, String offendingValue,
    int rowGroupNumber, int columnNumber, ColumnPath columnPath, int pageNumber) {
  // what was violated
  this.violatedContract = violatedContract;
  this.offendingValue = offendingValue;
  this.referenceValue = referenceValue;

  // where it was found
  this.columnPath = columnPath;
  this.rowGroupNumber = rowGroupNumber;
  this.columnNumber = columnNumber;
  this.pageNumber = pageNumber;
}

org.apache.parquet.hadoop.metadata.ColumnPath Java Examples