Java Code Examples for org.apache.parquet.column.statistics.Statistics#hasNonNullValue()
The following examples show how to use
org.apache.parquet.column.statistics.Statistics#hasNonNullValue().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0 | 6 votes |
@Override public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
Example 2
Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0 | 6 votes |
@Override public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp >= 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
Example 3
Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0 | 6 votes |
@Override public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); int cmp = lit.comparator().compare(upper, lit.value()); if (cmp < 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
Example 4
Source File: ColumnIndexBuilder.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Records one page's statistics in this builder: whether the page is a null page, its min/max
 * values (when it has any non-null value), and its null count.
 *
 * @param stats
 *          the statistics to be added
 */
public void add(Statistics<?> stats) {
  final boolean pageHasValues = stats.hasNonNullValue();

  // A page without any non-null value is registered as a "null page".
  nullPages.add(!pageHasValues);

  if (pageHasValues) {
    Object lowest = stats.genericGetMin();
    Object highest = stats.genericGetMax();
    addMinMax(lowest, highest);
    pageIndexes.add(nextPageIndex);
    minMaxSize += sizeOf(lowest);
    minMaxSize += sizeOf(highest);
  }

  // The null count and page counter advance for every page, null page or not.
  nullCounts.add(stats.getNumNulls());
  ++nextPageIndex;
}
Example 5
Source File: Util.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Renders the min and max of the given statistics as {@code "min / max"}.
 *
 * @param stats the statistics to render; may be {@code null}
 * @return {@code "no stats"} for a {@code null} argument, an empty string when the statistics
 *     hold no non-null value, otherwise the formatted min/max pair
 */
public static String minMaxAsString(Statistics stats) {
  if (stats == null) {
    return "no stats";
  }
  if (stats.hasNonNullValue()) {
    String shownMin = humanReadable(stats.minAsString(), 30);
    String shownMax = humanReadable(stats.maxAsString(), 30);
    return String.format("%s / %s", shownMin, shownMax);
  }
  return "";
}
Example 6
Source File: StatisticsFilter.java From parquet-mr with Apache License 2.0 | 5 votes |
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(Gt<T> gt) { Column<T> filterColumn = gt.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); if (meta == null) { // the column is missing and always null, which is never greater than a // value. for all x, null is never > x. return BLOCK_CANNOT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(meta)) { // we are looking for records where v > someValue // this chunk is all nulls, so we can drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = gt.getValue(); // drop if value >= max return stats.compareMaxToValue(value) <= 0; }
Example 7
Source File: StatisticsFilter.java From parquet-mr with Apache License 2.0 | 5 votes |
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(LtEq<T> ltEq) { Column<T> filterColumn = ltEq.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); if (meta == null) { // the column is missing and always null, which is never less than or // equal to a value. for all x, null is never <= x. return BLOCK_CANNOT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(meta)) { // we are looking for records where v <= someValue // this chunk is all nulls, so we can drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = ltEq.getValue(); // drop if value < min return stats.compareMinToValue(value) > 0; }
Example 8
Source File: StatisticsFilter.java From parquet-mr with Apache License 2.0 | 5 votes |
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(Lt<T> lt) { Column<T> filterColumn = lt.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); if (meta == null) { // the column is missing and always null, which is never less than a // value. for all x, null is never < x. return BLOCK_CANNOT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(meta)) { // we are looking for records where v < someValue // this chunk is all nulls, so we can drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } T value = lt.getValue(); // drop if value <= min return stats.compareMinToValue(value) >= 0; }
Example 9
Source File: TestStatistics.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Captures the comparator and min/max bounds from the statistics in the page header.
 *
 * @param page the data page whose header statistics are read; min and max are left
 *     {@code null} when the statistics hold no non-null value
 */
public StatsValidator(DataPage page) {
  Statistics<T> headerStats = getStatisticsFromPageHeader(page);
  this.comparator = headerStats.comparator();
  this.hasNonNull = headerStats.hasNonNullValue();
  // Bounds are only meaningful when at least one non-null value was recorded.
  this.min = hasNonNull ? headerStats.genericGetMin() : null;
  this.max = hasNonNull ? headerStats.genericGetMax() : null;
}
Example 10
Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0 | 5 votes |
@Override public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); cmp = lit.comparator().compare(upper, lit.value()); if (cmp < 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
Example 11
Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0 | 5 votes |
@Override public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); int cmp = lit.comparator().compare(upper, lit.value()); if (cmp < 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
Example 12
Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Captures the comparator and min/max bounds from the statistics in the page header.
 *
 * @param page the data page whose header statistics are read; min and max are left
 *     {@code null} when the statistics hold no non-null value
 */
public StatsValidator(DataPage page) {
  Statistics<T> headerStats = getStatisticsFromPageHeader(page);
  this.comparator = headerStats.comparator();
  this.hasNonNull = headerStats.hasNonNullValue();
  // Bounds are only meaningful when at least one non-null value was recorded.
  this.min = hasNonNull ? headerStats.genericGetMin() : null;
  this.max = hasNonNull ? headerStats.genericGetMax() : null;
}
Example 13
Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0 | 5 votes |
@Override public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
Example 14
Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0 | 5 votes |
@Override public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp >= 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
Example 15
Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0 | 5 votes |
@Override @SuppressWarnings("unchecked") public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) { int id = ref.fieldId(); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<Binary> colStats = (Statistics<Binary>) stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } ByteBuffer prefixAsBytes = lit.toByteBuffer(); Comparator<ByteBuffer> comparator = Comparators.unsignedBytes(); Binary lower = colStats.genericGetMin(); // truncate lower bound so that its length in bytes is not greater than the length of prefix int lowerLength = Math.min(prefixAsBytes.remaining(), lower.length()); int lowerCmp = comparator.compare(BinaryUtil.truncateBinary(lower.toByteBuffer(), lowerLength), prefixAsBytes); if (lowerCmp > 0) { return ROWS_CANNOT_MATCH; } Binary upper = colStats.genericGetMax(); // truncate upper bound so that its length in bytes is not greater than the length of prefix int upperLength = Math.min(prefixAsBytes.remaining(), upper.length()); int upperCmp = comparator.compare(BinaryUtil.truncateBinary(upper.toByteBuffer(), upperLength), prefixAsBytes); if (upperCmp < 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
Example 16
Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0 | 5 votes |
@Override public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) { Integer id = ref.fieldId(); // When filtering nested types notNull() is implicit filter passed even though complex // filters aren't pushed down in Parquet. Leave all nested column type filters to be // evaluated post scan. if (schema.findType(id) instanceof Type.NestedType) { return ROWS_MIGHT_MATCH; } Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } Collection<T> literals = literalSet; T lower = min(colStats, id); literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList()); if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match. return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); literals = literals.stream().filter(v -> ref.comparator().compare(upper, v) >= 0).collect(Collectors.toList()); if (literals.isEmpty()) { // if all remaining values are greater than upper bound, rows cannot match. return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
Example 17
Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0 | 5 votes |
@Override public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) { Integer id = ref.fieldId(); // When filtering nested types notNull() is implicit filter passed even though complex // filters aren't pushed down in Parquet. Leave all nested column type filters to be // evaluated post scan. if (schema.findType(id) instanceof Type.NestedType) { return ROWS_MIGHT_MATCH; } Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<?> colStats = stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } T lower = min(colStats, id); int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); cmp = lit.comparator().compare(upper, lit.value()); if (cmp < 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
Example 18
Source File: StatisticsFilter.java From parquet-mr with Apache License 2.0 | 4 votes |
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(Eq<T> eq) { Column<T> filterColumn = eq.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); T value = eq.getValue(); if (meta == null) { // the column isn't in this file so all values are null. if (value != null) { // non-null is never null return BLOCK_CANNOT_MATCH; } return BLOCK_MIGHT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (value == null) { // We don't know anything about the nulls in this chunk if (!stats.isNumNullsSet()) { return BLOCK_MIGHT_MATCH; } // we are looking for records where v eq(null) // so drop if there are no nulls in this chunk return !hasNulls(meta); } if (isAllNulls(meta)) { // we are looking for records where v eq(someNonNull) // and this is a column of all nulls, so drop it return BLOCK_CANNOT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } // drop if value < min || value > max return stats.compareMinToValue(value) > 0 || stats.compareMaxToValue(value) < 0; }
Example 19
Source File: StatisticsFilter.java From parquet-mr with Apache License 2.0 | 4 votes |
@Override @SuppressWarnings("unchecked") public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) { Column<T> filterColumn = notEq.getColumn(); ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath()); T value = notEq.getValue(); if (meta == null) { if (value == null) { // null is always equal to null return BLOCK_CANNOT_MATCH; } return BLOCK_MIGHT_MATCH; } Statistics<T> stats = meta.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (value == null) { // we are looking for records where v notEq(null) // so, if this is a column of all nulls, we can drop it return isAllNulls(meta); } if (stats.isNumNullsSet() && hasNulls(meta)) { // we are looking for records where v notEq(someNonNull) // but this chunk contains nulls, we cannot drop it return BLOCK_MIGHT_MATCH; } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } // drop if this is a column where min = max = value return stats.compareMinToValue(value) == 0 && stats.compareMaxToValue(value) == 0; }
Example 20
Source File: StatisticsFilter.java From parquet-mr with Apache License 2.0 | 4 votes |
private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean visit(UserDefined<T, U> ud, boolean inverted) { Column<T> filterColumn = ud.getColumn(); ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath()); U udp = ud.getUserDefinedPredicate(); if (columnChunk == null) { // the column isn't in this file so all values are null. // lets run the udp with null value to see if it keeps null or not. if (inverted) { return udp.acceptsNullValue(); } else { return !udp.acceptsNullValue(); } } Statistics<T> stats = columnChunk.getStatistics(); if (stats.isEmpty()) { // we have no statistics available, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } if (isAllNulls(columnChunk)) { // lets run the udp with null value to see if it keeps null or not. if (inverted) { return udp.acceptsNullValue(); } else { return !udp.acceptsNullValue(); } } if (!stats.hasNonNullValue()) { // stats does not contain min/max values, we cannot drop any chunks return BLOCK_MIGHT_MATCH; } org.apache.parquet.filter2.predicate.Statistics<T> udpStats = new org.apache.parquet.filter2.predicate.Statistics<T>(stats.genericGetMin(), stats.genericGetMax(), stats.comparator()); if (inverted) { return udp.inverseCanDrop(udpStats); } else { return udp.canDrop(udpStats); } }