org.apache.parquet.column.statistics.IntStatistics Java Examples

Source File: TupleDomainParquetPredicate.java From presto with Apache License 2.0

6 votes

/**
 * Converts integer-typed Parquet column statistics into a {@link ParquetIntegerStatistics} range.
 *
 * <p>Both {@link LongStatistics} and {@link IntStatistics} are accepted. When the recorded minimum
 * is greater than the recorded maximum, {@code failWithCorruptionException} is called with
 * {@code failOnCorruptedParquetStatistics}; if that call returns normally, an empty result is
 * produced instead of a range.
 *
 * @param statistics column statistics to convert
 * @param id identifier of the data source, forwarded to the corruption handler
 * @param column column name, forwarded to the corruption handler
 * @param failOnCorruptedParquetStatistics flag forwarded to the corruption handler
 * @return the min/max range, or {@link Optional#empty()} when min is greater than max
 * @throws ParquetCorruptionException declared by this method; presumably raised by
 *         {@code failWithCorruptionException} when the flag is set -- confirm against that helper
 * @throws IllegalArgumentException if {@code statistics} is neither long nor int statistics
 */
private static Optional<ParquetIntegerStatistics> toParquetIntegerStatistics(Statistics<?> statistics, ParquetDataSourceId id, String column, boolean failOnCorruptedParquetStatistics)
        throws ParquetCorruptionException
{
    if (statistics instanceof LongStatistics) {
        LongStatistics longStats = (LongStatistics) statistics;
        // A well-formed range is returned directly; anything else goes to the corruption handler.
        if (longStats.genericGetMin() <= longStats.genericGetMax()) {
            return Optional.of(new ParquetIntegerStatistics(longStats.genericGetMin(), longStats.genericGetMax()));
        }
        failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, longStats);
        return Optional.empty();
    }

    if (!(statistics instanceof IntStatistics)) {
        throw new IllegalArgumentException("Cannot convert statistics of type " + statistics.getClass().getName());
    }

    IntStatistics intStats = (IntStatistics) statistics;
    if (intStats.genericGetMin() <= intStats.genericGetMax()) {
        // Widen the 32-bit bounds to long for the shared integer-statistics holder.
        return Optional.of(new ParquetIntegerStatistics((long) intStats.getMin(), (long) intStats.getMax()));
    }
    failWithCorruptionException(failOnCorruptedParquetStatistics, column, id, intStats);
    return Optional.empty();
}

Source File: TestMetadataReader.java From presto with Apache License 2.0

6 votes

/**
 * Checks that INT32 statistics with byte-encoded min/max are read back as {@link IntStatistics}
 * carrying the expected null count and bounds.
 */
@Test(dataProvider = "allCreatedBy")
public void testReadStatsInt32(Optional<String> fileCreatedBy)
{
    // The hex strings are the byte encodings that the assertions below expect to decode to -10 and 42042.
    Statistics rawStatistics = new Statistics();
    rawStatistics.setNull_count(13);
    rawStatistics.setMin(fromHex("F6FFFFFF"));
    rawStatistics.setMax(fromHex("3AA40000"));

    PrimitiveType int32Column = new PrimitiveType(OPTIONAL, INT32, "Test column");

    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.of(rawStatistics), int32Column))
            .isInstanceOfSatisfying(IntStatistics.class, intStats -> {
                assertEquals(intStats.getNumNulls(), 13);
                // primitive accessors
                assertEquals(intStats.getMin(), -10);
                assertEquals(intStats.getMax(), 42042);
                // boxed accessors
                assertEquals(intStats.genericGetMin(), Integer.valueOf(-10));
                assertEquals(intStats.genericGetMax(), Integer.valueOf(42042));
            });
}

Source File: TestStatisticsFilter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Exercises {@code eq(column, null)}: the assertions expect a drop when the int column reports
 * zero nulls, and no drop when it reports some nulls, when the column is missing, or when
 * min/max are unavailable.
 */
@Test
public void testEqNull() {
  IntStatistics withoutNulls = new IntStatistics();
  withoutNulls.setMinMax(10, 100);
  withoutNulls.setNumNulls(0);
  List<ColumnChunkMetaData> withoutNullsMetas = Arrays.asList(
      getIntColumnMeta(withoutNulls, 177L),
      getDoubleColumnMeta(doubleStats, 177L));

  IntStatistics withNulls = new IntStatistics();
  withNulls.setMinMax(10, 100);
  withNulls.setNumNulls(3);
  List<ColumnChunkMetaData> withNullsMetas = Arrays.asList(
      getIntColumnMeta(withNulls, 177L),
      getDoubleColumnMeta(doubleStats, 177L));

  // int column: zero nulls recorded vs. three nulls recorded
  assertTrue(canDrop(eq(intColumn, null), withoutNullsMetas));
  assertFalse(canDrop(eq(intColumn, null), withNullsMetas));

  // column not present in the metadata
  assertFalse(canDrop(eq(missingColumn, null), columnMetas));

  // metadata without min/max
  assertFalse(canDrop(eq(intColumn, null), missingMinMaxColumnMetas));
  assertFalse(canDrop(eq(doubleColumn, null), missingMinMaxColumnMetas));
}

Source File: TestStatisticsFilter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Verifies that evaluating a predicate still containing {@code not(...)} fails with an
 * {@link IllegalArgumentException} whose message names the offending sub-predicate.
 */
@Test
public void testClearExceptionForNots() {
  List<ColumnChunkMetaData> emptyStatsMetas = Arrays.asList(
      getDoubleColumnMeta(new DoubleStatistics(), 0L),
      getIntColumnMeta(new IntStatistics(), 0L));

  // Deliberately NOT passed through LogicalInverseRewriter.
  FilterPredicate predicateWithNot = and(not(eq(doubleColumn, 12.0)), eq(intColumn, 17));

  String expectedMessage =
      "This predicate contains a not! Did you forget to run this predicate through LogicalInverseRewriter?"
          + " not(eq(double.column, 12.0))";

  try {
    canDrop(predicateWithNot, emptyStatsMetas);
    fail("This should throw");
  } catch (IllegalArgumentException expected) {
    assertEquals(expectedMessage, expected.getMessage());
  }
}

Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Builds int statistics spanning the full 32-bit range with 3004 nulls, converts them with the
 * given helper and checks that min, max and null count survive the conversion.
 *
 * @param helper conversion strategy under test
 */
private void testIntegerStats(StatsHelper helper) {
  IntStatistics intStats = new IntStatistics();
  intStats.incrementNumNulls(3004);
  intStats.updateStats(Integer.MIN_VALUE);
  intStats.updateStats(Integer.MAX_VALUE);

  org.apache.parquet.format.Statistics converted = helper.toParquetStatistics(intStats);

  int decodedMin = BytesUtils.bytesToInt(converted.getMin());
  Assert.assertEquals("Min should match", Integer.MIN_VALUE, decodedMin);

  int decodedMax = BytesUtils.bytesToInt(converted.getMax());
  Assert.assertEquals("Max should match", Integer.MAX_VALUE, decodedMax);

  Assert.assertEquals("Num nulls should match", 3004, converted.getNull_count());
}

Source File: TestMetadataReader.java From presto with Apache License 2.0

5 votes

/**
 * Checks that reading absent statistics yields an empty statistics object of the class that
 * matches the column's primitive type (int, long, and binary with and without UTF8).
 */
@Test(dataProvider = "allCreatedBy")
public void testReadNullStats(Optional<String> fileCreatedBy)
{
    // integer
    PrimitiveType integerType = new PrimitiveType(OPTIONAL, INT32, "Test column");
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), integerType))
            .isInstanceOfSatisfying(IntStatistics.class, stats -> assertTrue(stats.isEmpty()));

    // bigint
    PrimitiveType bigintType = new PrimitiveType(OPTIONAL, INT64, "Test column");
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), bigintType))
            .isInstanceOfSatisfying(LongStatistics.class, stats -> assertTrue(stats.isEmpty()));

    // varchar
    PrimitiveType varcharType = new PrimitiveType(OPTIONAL, BINARY, "Test column", OriginalType.UTF8);
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), varcharType))
            .isInstanceOfSatisfying(BinaryStatistics.class, stats -> assertTrue(stats.isEmpty()));

    // varbinary
    PrimitiveType varbinaryType = new PrimitiveType(OPTIONAL, BINARY, "Test column");
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), varbinaryType))
            .isInstanceOfSatisfying(BinaryStatistics.class, stats -> assertTrue(stats.isEmpty()));
}

Source File: TestInputFormat.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates a block (row group) holding a single GZIP-compressed, PLAIN-encoded INT32 column
 * named {@code foo} that carries the supplied statistics.
 *
 * <p>Apart from {@code valueCount}, the numeric arguments handed to
 * {@code ColumnChunkMetaData.get} are fixed placeholder values of 100, and the block's total
 * byte size is fixed at 200.
 *
 * @param stats statistics to attach to the column chunk
 * @param valueCount used both as the chunk's value count and as the block's row count
 * @return the populated block metadata
 */
public static BlockMetaData makeBlockFromStats(IntStatistics stats, long valueCount) {
  ColumnChunkMetaData fooChunk = ColumnChunkMetaData.get(
      ColumnPath.get("foo"),
      PrimitiveTypeName.INT32,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats,
      100L, 100L, valueCount, 100L, 100L);

  BlockMetaData block = new BlockMetaData();
  block.addColumn(fooChunk);
  block.setTotalByteSize(200L);
  block.setRowCount(valueCount);
  return block;
}

Source File: TestStatisticsFilter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Exercises {@code notEq(column, value)} for non-null values: the assertions expect a drop only
 * when min == max == value and no nulls have been recorded on the statistics.
 */
@Test
public void testNotEqNonNull() {
  // Against the shared column metadata none of these values permits a drop.
  for (int probe : new int[] {9, 10, 100, 101}) {
    assertFalse(canDrop(notEq(intColumn, probe), columnMetas));
  }

  IntStatistics onlySevens = new IntStatistics();
  onlySevens.setMinMax(7, 7);

  // No null count set yet.
  assertTrue(canDrop(notEq(intColumn, 7), Arrays.asList(
      getIntColumnMeta(onlySevens, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  // The same statistics object is mutated between checks, so the order below matters.
  onlySevens.setNumNulls(100L);
  assertFalse(canDrop(notEq(intColumn, 7), Arrays.asList(
      getIntColumnMeta(onlySevens, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  onlySevens.setNumNulls(177L);
  assertFalse(canDrop(notEq(intColumn, 7), Arrays.asList(
      getIntColumnMeta(onlySevens, 177L),
      getDoubleColumnMeta(doubleStats, 177L))));

  // column not present in the metadata
  assertFalse(canDrop(notEq(missingColumn, fromString("any")), columnMetas));

  // metadata without min/max
  assertFalse(canDrop(notEq(intColumn, 50), missingMinMaxColumnMetas));
  assertFalse(canDrop(notEq(doubleColumn, 50.0), missingMinMaxColumnMetas));
}

Source File: TestStatisticsFilter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Exercises {@code notEq(column, null)}: the assertions expect a drop when the null count equals
 * the value count passed to the metadata helper (177) or when the column is missing, and no
 * drop otherwise.
 */
@Test
public void testNotEqNull() {
  IntStatistics withoutNulls = new IntStatistics();
  withoutNulls.setMinMax(10, 100);
  withoutNulls.setNumNulls(0);
  List<ColumnChunkMetaData> withoutNullsMetas = Arrays.asList(
      getIntColumnMeta(withoutNulls, 177L),
      getDoubleColumnMeta(doubleStats, 177L));

  IntStatistics withNulls = new IntStatistics();
  withNulls.setMinMax(10, 100);
  withNulls.setNumNulls(3);
  List<ColumnChunkMetaData> withNullsMetas = Arrays.asList(
      getIntColumnMeta(withNulls, 177L),
      getDoubleColumnMeta(doubleStats, 177L));

  // Null count equals the value count of 177 used for the column metadata.
  IntStatistics onlyNulls = new IntStatistics();
  onlyNulls.setMinMax(0, 0);
  onlyNulls.setNumNulls(177);
  List<ColumnChunkMetaData> onlyNullsMetas = Arrays.asList(
      getIntColumnMeta(onlyNulls, 177L),
      getDoubleColumnMeta(doubleStats, 177L));

  assertFalse(canDrop(notEq(intColumn, null), withoutNullsMetas));
  assertFalse(canDrop(notEq(intColumn, null), withNullsMetas));
  assertTrue(canDrop(notEq(intColumn, null), onlyNullsMetas));

  // column not present in the metadata
  assertTrue(canDrop(notEq(missingColumn, null), columnMetas));

  // metadata without min/max
  assertFalse(canDrop(notEq(intColumn, null), missingMinMaxColumnMetas));
  assertFalse(canDrop(notEq(doubleColumn, null), missingMinMaxColumnMetas));
}

Source File: ParquetReaderUtility.java From Bats with Apache License 2.0

4 votes

/**
 * Detect corrupt date values by looking at the min/max values in the metadata.
 *
 * This should only be used when a file does not have enough metadata to determine if
 * the data was written with an external tool or an older version of Drill
 * ({@link org.apache.drill.exec.store.parquet.ParquetRecordWriter#WRITER_VERSION_PROPERTY} &lt;
 * {@link org.apache.drill.exec.store.parquet.ParquetReaderUtility#DRILL_WRITER_VERSION_STD_DATE_FORMAT})
 *
 * This method only checks the first Row Group, because Drill has only ever written
 * a single Row Group per file. The result is decided by the first selected DATE column
 * that has a column chunk in that row group; later columns are not inspected.
 *
 * @param footer parquet footer
 * @param columns list of columns schema path
 * @param autoCorrectCorruptDates user setting to allow enabling/disabling of auto-correction
 *                                of corrupt dates. There are some rare cases (storing dates thousands
 *                                of years into the future, with tools other than Drill writing files)
 *                                that would result in the date values being "corrected" into bad values.
 * @return {@link DateCorruptionStatus#META_SHOWS_NO_CORRUPTION} when auto-correction is disabled or
 *         no selected DATE column has a chunk in the first row group;
 *         {@link DateCorruptionStatus#META_SHOWS_CORRUPTION} when the inspected column's statistics
 *         hold a non-null value and their max exceeds
 *         {@link ParquetReaderUtility#DATE_CORRUPTION_THRESHOLD};
 *         {@link DateCorruptionStatus#META_UNCLEAR_TEST_VALUES} otherwise
 */
public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(ParquetMetadata footer,
                                                            List<SchemaPath> columns,
                                                            boolean autoCorrectCorruptDates) {
  // Users can turn-off date correction in cases where we are detecting corruption based on the date values
  // that are unlikely to appear in common datasets. In this case report that no correction needs to happen
  // during the file read
  if (!autoCorrectCorruptDates) {
    return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
  }
  // Drill produced files have only ever have a single row group, if this changes in the future it won't matter
  // as we will know from the Drill version written in the files that the dates are correct
  int rowGroupIndex = 0;
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);

  // Neither value depends on the loop variables, so compute them once instead of per iteration.
  boolean starQuery = Utilities.isStarQuery(columns);
  List<ColumnDescriptor> parquetColumns = footer.getFileMetaData().getSchema().getColumns();

  for (SchemaPath schemaPath : columns) {
    for (ColumnDescriptor column : parquetColumns) {
      String fullColumnPath = getFullColumnPath(column);
      // this reader only supports flat data, this is restricted in the ParquetScanBatchCreator
      // creating a NameSegment makes sure we are using the standard code for comparing names,
      // currently it is all case-insensitive
      if (!starQuery && !fullColumnPath.equalsIgnoreCase(schemaPath.getUnIndexed().toString())) {
        continue;
      }

      // Only DATE columns are candidates; a null converted type is skipped as well.
      ConvertedType convertedType = schemaElements.get(fullColumnPath).getConverted_type();
      if (!ConvertedType.DATE.equals(convertedType)) {
        continue;
      }

      ColumnPath columnPath = ColumnPath.get(column.getPath());
      List<ColumnChunkMetaData> colChunkList = footer.getBlocks().get(rowGroupIndex).getColumns();
      for (ColumnChunkMetaData colChunk : colChunkList) {
        if (colChunk.getPath().equals(columnPath)) {
          // The first matching chunk decides the outcome for the whole file.
          IntStatistics statistics = (IntStatistics) colChunk.getStatistics();
          return (statistics.hasNonNullValue() && statistics.compareMaxToValue(ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) > 0) ?
              DateCorruptionStatus.META_SHOWS_CORRUPTION : DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
        }
      }
      // column does not appear in this file, skip it
    }
  }
  return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
}

Source File: TestTupleDomainParquetPredicate.java From presto with Apache License 2.0

4 votes

/**
 * Builds {@link IntStatistics} whose min and max are set to the given bounds.
 *
 * @param minimum value recorded as the minimum
 * @param maximum value recorded as the maximum
 * @return a new statistics object with only min/max set
 */
private static IntStatistics intColumnStats(int minimum, int maximum)
{
    IntStatistics intStats = new IntStatistics();
    intStats.setMinMax(minimum, maximum);
    return intStats;
}

Source File: TestStatisticsFilter.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Exercises user-defined predicates and their logical inverses against int statistics that
 * contain only sevens, only eights, or neither, including predicates on a column that is
 * missing from the metadata.
 */
@Test
public void testUdp() {
  FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class);
  FilterPredicate invPred = LogicalInverseRewriter.rewrite(not(userDefined(intColumn, SevensAndEightsUdp.class)));

  FilterPredicate udpDropMissingColumn = userDefined(missingColumn2, DropNullUdp.class);
  FilterPredicate invUdpDropMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, DropNullUdp.class)));

  FilterPredicate udpKeepMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class);
  FilterPredicate invUdpKeepMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, SevensAndEightsUdp.class)));

  FilterPredicate allPositivePred = userDefined(doubleColumn, AllPositiveUdp.class);

  IntStatistics sevenStats = new IntStatistics();
  sevenStats.setMinMax(7, 7);
  List<ColumnChunkMetaData> sevenMetas = Arrays.asList(
      getIntColumnMeta(sevenStats, 177L),
      getDoubleColumnMeta(doubleStats, 177L));

  IntStatistics eightStats = new IntStatistics();
  eightStats.setMinMax(8, 8);
  List<ColumnChunkMetaData> eightMetas = Arrays.asList(
      getIntColumnMeta(eightStats, 177L),
      getDoubleColumnMeta(doubleStats, 177L));

  IntStatistics neitherStats = new IntStatistics();
  neitherStats.setMinMax(1, 2);
  List<ColumnChunkMetaData> neitherMetas = Arrays.asList(
      getIntColumnMeta(neitherStats, 177L),
      getDoubleColumnMeta(doubleStats, 177L));

  List<List<ColumnChunkMetaData>> allMetas = Arrays.asList(sevenMetas, eightMetas, neitherMetas);

  // pred: only the all-sevens chunk is expected to be dropped.
  assertTrue(canDrop(pred, sevenMetas));
  assertFalse(canDrop(pred, eightMetas));
  assertFalse(canDrop(pred, neitherMetas));

  // invPred: only the all-eights chunk is expected to be dropped.
  assertFalse(canDrop(invPred, sevenMetas));
  assertTrue(canDrop(invPred, eightMetas));
  assertFalse(canDrop(invPred, neitherMetas));

  // udpDropMissingColumn drops null column.
  for (List<ColumnChunkMetaData> metas : allMetas) {
    assertTrue(canDrop(udpDropMissingColumn, metas));
  }

  // invUdpDropMissingColumn (i.e., not(udpDropMissingColumn)) keeps null column.
  for (List<ColumnChunkMetaData> metas : allMetas) {
    assertFalse(canDrop(invUdpDropMissingColumn, metas));
  }

  // udpKeepMissingColumn keeps null column.
  for (List<ColumnChunkMetaData> metas : allMetas) {
    assertFalse(canDrop(udpKeepMissingColumn, metas));
  }

  // invUdpKeepMissingColumn (i.e., not(udpKeepMissingColumn)) drops null column.
  for (List<ColumnChunkMetaData> metas : allMetas) {
    assertTrue(canDrop(invUdpKeepMissingColumn, metas));
  }

  assertFalse(canDrop(allPositivePred, missingMinMaxColumnMetas));
}

Source File: TestRowGroupFilter.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Builds six row groups with different int statistics for column {@code foo} and checks which
 * of them {@code RowGroupFilter.filterRowGroups} keeps for eq/notEq predicates on 50, null and 0.
 */
@Test
public void testApplyRowGroupFilters() {
  // min 10, max 100, 4 nulls out of 301 values
  IntStatistics tenToHundred = new IntStatistics();
  tenToHundred.setMinMax(10, 100);
  tenToHundred.setNumNulls(4);
  BlockMetaData block1 = makeBlockFromStats(tenToHundred, 301);

  // min 8, max 102, no nulls out of 302 values
  IntStatistics eightToHundredTwo = new IntStatistics();
  eightToHundredTwo.setMinMax(8, 102);
  eightToHundredTwo.setNumNulls(0);
  BlockMetaData block2 = makeBlockFromStats(eightToHundredTwo, 302);

  // min 100, max 102, 12 nulls out of 303 values
  IntStatistics hundredToHundredTwo = new IntStatistics();
  hundredToHundredTwo.setMinMax(100, 102);
  hundredToHundredTwo.setNumNulls(12);
  BlockMetaData block3 = makeBlockFromStats(hundredToHundredTwo, 303);

  // min/max 0, null count equal to the value count of 304
  IntStatistics onlyNulls = new IntStatistics();
  onlyNulls.setMinMax(0, 0);
  onlyNulls.setNumNulls(304);
  BlockMetaData block4 = makeBlockFromStats(onlyNulls, 304);

  // min/max 50, 7 nulls out of 305 values
  IntStatistics onlyFifties = new IntStatistics();
  onlyFifties.setMinMax(50, 50);
  onlyFifties.setNumNulls(7);
  BlockMetaData block5 = makeBlockFromStats(onlyFifties, 305);

  // min/max 0, 12 nulls out of 306 values
  IntStatistics zerosWithNulls = new IntStatistics();
  zerosWithNulls.setMinMax(0, 0);
  zerosWithNulls.setNumNulls(12);
  BlockMetaData block6 = makeBlockFromStats(zerosWithNulls, 306);

  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>(
      Arrays.asList(block1, block2, block3, block4, block5, block6));

  MessageType schema = MessageTypeParser.parseMessageType("message Document { optional int32 foo; }");
  IntColumn fooColumn = intColumn("foo");

  List<BlockMetaData> keptForEqFifty =
      RowGroupFilter.filterRowGroups(FilterCompat.get(eq(fooColumn, 50)), blocks, schema);
  assertEquals(Arrays.asList(block1, block2, block5), keptForEqFifty);

  List<BlockMetaData> keptForNotEqFifty =
      RowGroupFilter.filterRowGroups(FilterCompat.get(notEq(fooColumn, 50)), blocks, schema);
  assertEquals(Arrays.asList(block1, block2, block3, block4, block5, block6), keptForNotEqFifty);

  List<BlockMetaData> keptForEqNull =
      RowGroupFilter.filterRowGroups(FilterCompat.get(eq(fooColumn, null)), blocks, schema);
  assertEquals(Arrays.asList(block1, block3, block4, block5, block6), keptForEqNull);

  List<BlockMetaData> keptForNotEqNull =
      RowGroupFilter.filterRowGroups(FilterCompat.get(notEq(fooColumn, null)), blocks, schema);
  assertEquals(Arrays.asList(block1, block2, block3, block5, block6), keptForNotEqNull);

  List<BlockMetaData> keptForEqZero =
      RowGroupFilter.filterRowGroups(FilterCompat.get(eq(fooColumn, 0)), blocks, schema);
  assertEquals(Arrays.asList(block6), keptForEqZero);
}

org.apache.parquet.column.statistics.IntStatistics Java Examples