org.apache.parquet.column.statistics.Statistics#hasNonNullValue

Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0

6 votes

/**
 * Decides whether rows may satisfy {@code ref <= lit} using the value counts and column
 * statistics held by this evaluator.
 *
 * @param ref bound reference whose field id selects the column
 * @param lit literal the column values are compared against
 * @return {@code ROWS_CANNOT_MATCH} when the metrics rule out a match, otherwise
 *     {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // no value count recorded: the column is absent, so every value is null
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // without statistics nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // a lower bound above the literal means no value can be <= the literal
  T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) > 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}

Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0

6 votes

/**
 * Decides whether rows may satisfy {@code ref < lit} using the value counts and column
 * statistics held by this evaluator.
 *
 * @param ref bound reference whose field id selects the column
 * @param lit literal the column values are compared against
 * @return {@code ROWS_CANNOT_MATCH} when the metrics rule out a match, otherwise
 *     {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // no value count recorded: the column is absent, so every value is null
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // without statistics nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // a lower bound at or above the literal means no value can be < the literal
  T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) >= 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}

Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0

6 votes

/**
 * Decides whether rows may satisfy {@code ref >= lit} using the value counts and column
 * statistics held by this evaluator.
 *
 * @param ref bound reference whose field id selects the column
 * @param lit literal the column values are compared against
 * @return {@code ROWS_CANNOT_MATCH} when the metrics rule out a match, otherwise
 *     {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // no value count recorded: the column is absent, so every value is null
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // without statistics nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // an upper bound below the literal means no value can be >= the literal
  T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) < 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}

Source File: ColumnIndexBuilder.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Records one page's statistics in this builder.
 *
 * <p>A page without any non-null value is flagged as a null page and contributes no min/max
 * entry; otherwise its min and max are appended and the current page index is remembered.
 * The null count is recorded in both cases and the page counter advances by one.
 *
 * @param stats
 *          the statistics to be added
 */
public void add(Statistics<?> stats) {
  boolean nullPage = !stats.hasNonNullValue();
  nullPages.add(nullPage);
  if (!nullPage) {
    Object minValue = stats.genericGetMin();
    Object maxValue = stats.genericGetMax();
    addMinMax(minValue, maxValue);
    pageIndexes.add(nextPageIndex);
    // accumulated separately, exactly as two independent additions
    minMaxSize += sizeOf(minValue);
    minMaxSize += sizeOf(maxValue);
  }
  nullCounts.add(stats.getNumNulls());
  ++nextPageIndex;
}

Source File: Util.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Renders the min and max of the given statistics as a single display string.
 *
 * @param stats statistics to render; may be {@code null}
 * @return {@code "no stats"} when {@code stats} is {@code null}, an empty string when the
 *     statistics hold no non-null value, otherwise the min and max joined by {@code " / "},
 *     each passed through {@code humanReadable(..., 30)}
 */
public static String minMaxAsString(Statistics<?> stats) {
  if (stats == null) {
    return "no stats";
  }
  if (!stats.hasNonNullValue()) {
    // no min/max to show
    return "";
  }
  String minText = humanReadable(stats.minAsString(), 30);
  String maxText = humanReadable(stats.maxAsString(), 30);
  return String.format("%s / %s", minText, maxText);
}

Source File: StatisticsFilter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Evaluates a greater-than predicate against the statistics of the referenced column chunk.
 *
 * @param gt the predicate holding the column and the comparison value
 * @return {@code BLOCK_CANNOT_MATCH} or {@code BLOCK_MIGHT_MATCH} for the early exits;
 *     otherwise whether the chunk's max compares less than or equal to the value
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(Gt<T> gt) {
  Column<T> column = gt.getColumn();
  ColumnChunkMetaData chunk = getColumnChunk(column.getColumnPath());

  if (chunk == null) {
    // a missing column is always null, and null is never > x for any x
    return BLOCK_CANNOT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();

  if (chunkStats.isEmpty()) {
    // no statistics at all: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (isAllNulls(chunk)) {
    // v > someValue can never hold in a chunk made only of nulls
    return BLOCK_CANNOT_MATCH;
  }

  if (!chunkStats.hasNonNullValue()) {
    // min/max are not available: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // drop if value >= max
  return chunkStats.compareMaxToValue(gt.getValue()) <= 0;
}

Source File: StatisticsFilter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Evaluates a less-than-or-equal predicate against the statistics of the referenced column
 * chunk.
 *
 * @param ltEq the predicate holding the column and the comparison value
 * @return {@code BLOCK_CANNOT_MATCH} or {@code BLOCK_MIGHT_MATCH} for the early exits;
 *     otherwise whether the chunk's min compares greater than the value
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) {
  Column<T> column = ltEq.getColumn();
  ColumnChunkMetaData chunk = getColumnChunk(column.getColumnPath());

  if (chunk == null) {
    // a missing column is always null, and null is never <= x for any x
    return BLOCK_CANNOT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();

  if (chunkStats.isEmpty()) {
    // no statistics at all: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (isAllNulls(chunk)) {
    // v <= someValue can never hold in a chunk made only of nulls
    return BLOCK_CANNOT_MATCH;
  }

  if (!chunkStats.hasNonNullValue()) {
    // min/max are not available: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // drop if value < min
  return chunkStats.compareMinToValue(ltEq.getValue()) > 0;
}

Source File: StatisticsFilter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Evaluates a less-than predicate against the statistics of the referenced column chunk.
 *
 * @param lt the predicate holding the column and the comparison value
 * @return {@code BLOCK_CANNOT_MATCH} or {@code BLOCK_MIGHT_MATCH} for the early exits;
 *     otherwise whether the chunk's min compares greater than or equal to the value
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(Lt<T> lt) {
  Column<T> column = lt.getColumn();
  ColumnChunkMetaData chunk = getColumnChunk(column.getColumnPath());

  if (chunk == null) {
    // a missing column is always null, and null is never < x for any x
    return BLOCK_CANNOT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();

  if (chunkStats.isEmpty()) {
    // no statistics at all: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (isAllNulls(chunk)) {
    // v < someValue can never hold in a chunk made only of nulls
    return BLOCK_CANNOT_MATCH;
  }

  if (!chunkStats.hasNonNullValue()) {
    // min/max are not available: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // drop if value <= min
  return chunkStats.compareMinToValue(lt.getValue()) >= 0;
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Captures the comparator and, when present, the min/max from the statistics found in the
 * given page's header.
 *
 * @param page data page whose header statistics are read
 */
public StatsValidator(DataPage page) {
  Statistics<T> pageStats = getStatisticsFromPageHeader(page);
  this.comparator = pageStats.comparator();
  this.hasNonNull = pageStats.hasNonNullValue();
  // min/max are only read when the statistics report a non-null value
  this.min = hasNonNull ? pageStats.genericGetMin() : null;
  this.max = hasNonNull ? pageStats.genericGetMax() : null;
}

Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0

5 votes

/**
 * Decides whether rows may satisfy {@code ref == lit} using the value counts and column
 * statistics held by this evaluator.
 *
 * @param ref bound reference whose field id selects the column
 * @param lit literal the column values are compared against
 * @return {@code ROWS_CANNOT_MATCH} when the metrics rule out a match, otherwise
 *     {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();
  Types.NestedField topLevelField = struct.field(fieldId);
  Preconditions.checkNotNull(topLevelField, "Cannot filter by nested column: %s", schema.findField(fieldId));

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // no value count recorded: the column is absent, so every value is null
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // without statistics nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // the literal must lie within [lower, upper] for a match to be possible
  T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) > 0) {
    return ROWS_CANNOT_MATCH;
  }

  T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) < 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}

Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0

5 votes

/**
 * Decides whether rows may satisfy {@code ref >= lit} using the value counts and column
 * statistics held by this evaluator.
 *
 * @param ref bound reference whose field id selects the column
 * @param lit literal the column values are compared against
 * @return {@code ROWS_CANNOT_MATCH} when the metrics rule out a match, otherwise
 *     {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();
  Types.NestedField topLevelField = struct.field(fieldId);
  Preconditions.checkNotNull(topLevelField, "Cannot filter by nested column: %s", schema.findField(fieldId));

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // no value count recorded: the column is absent, so every value is null
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // without statistics nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // an upper bound below the literal means no value can be >= the literal
  T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) < 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}

Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Builds a validator from the statistics stored in the header of the given page, keeping
 * their comparator and, if a non-null value is reported, their min and max.
 *
 * @param page data page whose header statistics are read
 */
public StatsValidator(DataPage page) {
  Statistics<T> headerStats = getStatisticsFromPageHeader(page);
  this.comparator = headerStats.comparator();
  this.hasNonNull = headerStats.hasNonNullValue();
  // bounds stay null when the statistics carry no non-null value
  this.min = hasNonNull ? headerStats.genericGetMin() : null;
  this.max = hasNonNull ? headerStats.genericGetMax() : null;
}

Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0

5 votes

/**
 * Decides whether rows may satisfy {@code ref <= lit} using the value counts and column
 * statistics held by this evaluator.
 *
 * @param ref bound reference whose field id selects the column
 * @param lit literal the column values are compared against
 * @return {@code ROWS_CANNOT_MATCH} when the metrics rule out a match, otherwise
 *     {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();
  Types.NestedField topLevelField = struct.field(fieldId);
  Preconditions.checkNotNull(topLevelField, "Cannot filter by nested column: %s", schema.findField(fieldId));

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // no value count recorded: the column is absent, so every value is null
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // without statistics nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // a lower bound above the literal means no value can be <= the literal
  T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) > 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}

Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0

5 votes

/**
 * Decides whether rows may satisfy {@code ref < lit} using the value counts and column
 * statistics held by this evaluator.
 *
 * @param ref bound reference whose field id selects the column
 * @param lit literal the column values are compared against
 * @return {@code ROWS_CANNOT_MATCH} when the metrics rule out a match, otherwise
 *     {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();
  Types.NestedField topLevelField = struct.field(fieldId);
  Preconditions.checkNotNull(topLevelField, "Cannot filter by nested column: %s", schema.findField(fieldId));

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // no value count recorded: the column is absent, so every value is null
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // without statistics nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // a lower bound at or above the literal means no value can be < the literal
  T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) >= 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}

Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0

5 votes

/**
 * Decides whether rows may hold a value starting with the given prefix, by comparing the
 * prefix bytes with the column's min and max truncated to at most the prefix length.
 *
 * @param ref bound reference whose field id selects the column
 * @param lit literal providing the prefix via {@code toByteBuffer()}
 * @return {@code ROWS_CANNOT_MATCH} when the metrics rule out a match, otherwise
 *     {@code ROWS_MIGHT_MATCH}
 */
@Override
@SuppressWarnings("unchecked")
public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) {
  int fieldId = ref.fieldId();

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // no value count recorded: the column is absent, so every value is null
    return ROWS_CANNOT_MATCH;
  }

  Statistics<Binary> binaryStats = (Statistics<Binary>) stats.get(fieldId);
  if (binaryStats == null || binaryStats.isEmpty()) {
    // without statistics nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!binaryStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  ByteBuffer prefix = lit.toByteBuffer();
  Comparator<ByteBuffer> byteOrder = Comparators.unsignedBytes();

  // compare only the leading bytes of the lower bound, never more than the prefix length
  Binary minValue = binaryStats.genericGetMin();
  int minLength = Math.min(prefix.remaining(), minValue.length());
  ByteBuffer minHead = BinaryUtil.truncateBinary(minValue.toByteBuffer(), minLength);
  if (byteOrder.compare(minHead, prefix) > 0) {
    return ROWS_CANNOT_MATCH;
  }

  // same treatment for the upper bound
  Binary maxValue = binaryStats.genericGetMax();
  int maxLength = Math.min(prefix.remaining(), maxValue.length());
  ByteBuffer maxHead = BinaryUtil.truncateBinary(maxValue.toByteBuffer(), maxLength);
  if (byteOrder.compare(maxHead, prefix) < 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}

Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0

5 votes

/**
 * Decides whether rows may hold any value of the given literal set, by discarding literals
 * that fall below the column's lower bound and then those above its upper bound.
 *
 * @param ref bound reference whose field id selects the column and whose comparator orders
 *     the values
 * @param literalSet candidate values of the {@code IN} predicate
 * @return {@code ROWS_CANNOT_MATCH} when the metrics rule out a match, otherwise
 *     {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
  Integer fieldId = ref.fieldId();

  // When filtering nested types notNull() is implicit filter passed even though complex
  // filters aren't pushed down in Parquet. Leave all nested column type filters to be
  // evaluated post scan.
  if (schema.findType(fieldId) instanceof Type.NestedType) {
    return ROWS_MIGHT_MATCH;
  }

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // no value count recorded: the column is absent, so every value is null
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // without statistics nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // keep only the literals that are not below the lower bound
  T lowerBound = min(columnStats, fieldId);
  Collection<T> atOrAboveLower = literalSet.stream()
      .filter(candidate -> ref.comparator().compare(lowerBound, candidate) <= 0)
      .collect(Collectors.toList());
  if (atOrAboveLower.isEmpty()) {
    // every literal is less than the lower bound
    return ROWS_CANNOT_MATCH;
  }

  // of those, keep only the literals that are not above the upper bound
  T upperBound = max(columnStats, fieldId);
  Collection<T> withinBounds = atOrAboveLower.stream()
      .filter(candidate -> ref.comparator().compare(upperBound, candidate) >= 0)
      .collect(Collectors.toList());
  if (withinBounds.isEmpty()) {
    // every remaining literal is greater than the upper bound
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}

Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0

5 votes

/**
 * Decides whether rows may satisfy {@code ref == lit} using the value counts and column
 * statistics held by this evaluator; nested-type columns are never ruled out here.
 *
 * @param ref bound reference whose field id selects the column
 * @param lit literal the column values are compared against
 * @return {@code ROWS_CANNOT_MATCH} when the metrics rule out a match, otherwise
 *     {@code ROWS_MIGHT_MATCH}
 */
@Override
public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
  Integer fieldId = ref.fieldId();

  // When filtering nested types notNull() is implicit filter passed even though complex
  // filters aren't pushed down in Parquet. Leave all nested column type filters to be
  // evaluated post scan.
  if (schema.findType(fieldId) instanceof Type.NestedType) {
    return ROWS_MIGHT_MATCH;
  }

  Long count = valueCounts.get(fieldId);
  if (count == null) {
    // no value count recorded: the column is absent, so every value is null
    return ROWS_CANNOT_MATCH;
  }

  Statistics<?> columnStats = stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // without statistics nothing can be ruled out
    return ROWS_MIGHT_MATCH;
  }

  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  // the literal must lie within [lower, upper] for a match to be possible
  T lowerBound = min(columnStats, fieldId);
  if (lit.comparator().compare(lowerBound, lit.value()) > 0) {
    return ROWS_CANNOT_MATCH;
  }

  T upperBound = max(columnStats, fieldId);
  if (lit.comparator().compare(upperBound, lit.value()) < 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}

Source File: StatisticsFilter.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Evaluates an equality predicate against the statistics of the referenced column chunk.
 * A {@code null} comparison value is treated as a search for nulls.
 *
 * @param eq the predicate holding the column and the comparison value
 * @return {@code BLOCK_CANNOT_MATCH} or {@code BLOCK_MIGHT_MATCH} for the early exits;
 *     otherwise whether the value lies outside the chunk's min/max range
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(Eq<T> eq) {
  Column<T> column = eq.getColumn();
  ColumnChunkMetaData chunk = getColumnChunk(column.getColumnPath());

  T target = eq.getValue();

  if (chunk == null) {
    // the column isn't in this file, so every value is null;
    // only a search for null can still match
    if (target == null) {
      return BLOCK_MIGHT_MATCH;
    }
    return BLOCK_CANNOT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();

  if (chunkStats.isEmpty()) {
    // no statistics at all: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (target == null) {
    if (!chunkStats.isNumNullsSet()) {
      // the null count is unknown, so nulls cannot be excluded
      return BLOCK_MIGHT_MATCH;
    }
    // looking for v eq(null): drop when the chunk has no nulls
    return !hasNulls(chunk);
  }

  if (isAllNulls(chunk)) {
    // looking for v eq(someNonNull) in a chunk made only of nulls
    return BLOCK_CANNOT_MATCH;
  }

  if (!chunkStats.hasNonNullValue()) {
    // min/max are not available: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // drop if value < min || value > max
  return chunkStats.compareMinToValue(target) > 0 || chunkStats.compareMaxToValue(target) < 0;
}

Source File: StatisticsFilter.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Evaluates a not-equal predicate against the statistics of the referenced column chunk.
 * A {@code null} comparison value is treated as a search for non-null values.
 *
 * @param notEq the predicate holding the column and the comparison value
 * @return {@code BLOCK_CANNOT_MATCH} or {@code BLOCK_MIGHT_MATCH} for the early exits;
 *     otherwise whether both the chunk's min and max compare equal to the value
 */
@Override
@SuppressWarnings("unchecked")
public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
  Column<T> column = notEq.getColumn();
  ColumnChunkMetaData chunk = getColumnChunk(column.getColumnPath());

  T target = notEq.getValue();

  if (chunk == null) {
    if (target != null) {
      return BLOCK_MIGHT_MATCH;
    }
    // null is always equal to null
    return BLOCK_CANNOT_MATCH;
  }

  Statistics<T> chunkStats = chunk.getStatistics();

  if (chunkStats.isEmpty()) {
    // no statistics at all: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (target == null) {
    // looking for v notEq(null): drop when the chunk is made only of nulls
    return isAllNulls(chunk);
  }

  if (chunkStats.isNumNullsSet() && hasNulls(chunk)) {
    // looking for v notEq(someNonNull) and the chunk contains nulls: cannot drop
    return BLOCK_MIGHT_MATCH;
  }

  if (!chunkStats.hasNonNullValue()) {
    // min/max are not available: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // drop if this is a column where min = max = value
  return chunkStats.compareMinToValue(target) == 0 && chunkStats.compareMaxToValue(target) == 0;
}

Source File: StatisticsFilter.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Evaluates a user-defined predicate, or its inverse, against the statistics of the
 * referenced column chunk.
 *
 * @param ud the user-defined predicate node holding the column and the predicate
 * @param inverted whether the inverse of the predicate is being evaluated
 * @return the predicate's own answer when it is consulted, or {@code BLOCK_MIGHT_MATCH}
 *     when no usable statistics are available
 */
private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(UserDefined<T, U> ud, boolean inverted) {
  Column<T> column = ud.getColumn();
  ColumnChunkMetaData chunk = getColumnChunk(column.getColumnPath());
  U predicate = ud.getUserDefinedPredicate();

  if (chunk == null) {
    // the column isn't in this file so all values are null.
    // ask the predicate whether it keeps null or not.
    return inverted ? predicate.acceptsNullValue() : !predicate.acceptsNullValue();
  }

  Statistics<T> chunkStats = chunk.getStatistics();

  if (chunkStats.isEmpty()) {
    // no statistics at all: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  if (isAllNulls(chunk)) {
    // only nulls in this chunk: ask the predicate whether it keeps null or not.
    return inverted ? predicate.acceptsNullValue() : !predicate.acceptsNullValue();
  }

  if (!chunkStats.hasNonNullValue()) {
    // min/max are not available: nothing can be dropped
    return BLOCK_MIGHT_MATCH;
  }

  // hand min, max and comparator to the predicate in its own statistics type
  org.apache.parquet.filter2.predicate.Statistics<T> predicateStats =
    new org.apache.parquet.filter2.predicate.Statistics<T>(
      chunkStats.genericGetMin(), chunkStats.genericGetMax(), chunkStats.comparator());

  return inverted ? predicate.inverseCanDrop(predicateStats) : predicate.canDrop(predicateStats);
}

Java Code Examples for org.apache.parquet.column.statistics.Statistics#hasNonNullValue()