org.apache.parquet.column.page.DictionaryPageReadStore Java Examples

The following examples show how to use org.apache.parquet.column.page.DictionaryPageReadStore. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that no comparison predicate on "plain_int32_field" lets the block
 * be dropped, and that the mocked dictionary store is never touched.
 */
@Test
public void testColumnWithoutDictionary() throws Exception {
  final String failureMessage = "Should never drop block using plain encoding";
  DictionaryPageReadStore untouchedStore = mock(DictionaryPageReadStore.class);
  IntColumn plainColumn = intColumn("plain_int32_field");

  // Predicates comparing against -10.
  boolean droppedByEq = canDrop(eq(plainColumn, -10), ccmd, untouchedStore);
  assertFalse(failureMessage, droppedByEq);

  boolean droppedByLt = canDrop(lt(plainColumn, -10), ccmd, untouchedStore);
  assertFalse(failureMessage, droppedByLt);

  boolean droppedByLtEq = canDrop(ltEq(plainColumn, -10), ccmd, untouchedStore);
  assertFalse(failureMessage, droppedByLtEq);

  // Predicates comparing against nElements + 10.
  boolean droppedByGt = canDrop(gt(plainColumn, nElements + 10), ccmd, untouchedStore);
  assertFalse(failureMessage, droppedByGt);

  boolean droppedByGtEq = canDrop(gtEq(plainColumn, nElements + 10), ccmd, untouchedStore);
  assertFalse(failureMessage, droppedByGtEq);

  boolean droppedByNotEq = canDrop(notEq(plainColumn, nElements + 10), ccmd, untouchedStore);
  assertFalse(failureMessage, droppedByNotEq);

  // None of the evaluations above may have read from the dictionary store.
  verifyZeroInteractions(untouchedStore);
}
 
Example #2
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that no comparison predicate on "fallback_binary_field" lets the
 * block be dropped, and that the mocked dictionary store is never touched.
 */
@Test
public void testColumnWithDictionaryAndPlainEncodings() throws Exception {
  final String failureMessage = "Should never drop block using plain encoding";
  DictionaryPageReadStore untouchedStore = mock(DictionaryPageReadStore.class);
  // NOTE(review): the field name suggests a binary column but it is addressed
  // through an int column here -- confirm this is intentional.
  IntColumn fallbackColumn = intColumn("fallback_binary_field");

  // Predicates comparing against -10.
  boolean droppedByEq = canDrop(eq(fallbackColumn, -10), ccmd, untouchedStore);
  assertFalse(failureMessage, droppedByEq);

  boolean droppedByLt = canDrop(lt(fallbackColumn, -10), ccmd, untouchedStore);
  assertFalse(failureMessage, droppedByLt);

  boolean droppedByLtEq = canDrop(ltEq(fallbackColumn, -10), ccmd, untouchedStore);
  assertFalse(failureMessage, droppedByLtEq);

  // Predicates comparing against nElements + 10.
  boolean droppedByGt = canDrop(gt(fallbackColumn, nElements + 10), ccmd, untouchedStore);
  assertFalse(failureMessage, droppedByGt);

  boolean droppedByGtEq = canDrop(gtEq(fallbackColumn, nElements + 10), ccmd, untouchedStore);
  assertFalse(failureMessage, droppedByGtEq);

  boolean droppedByNotEq = canDrop(notEq(fallbackColumn, nElements + 10), ccmd, untouchedStore);
  assertFalse(failureMessage, droppedByNotEq);

  // None of the evaluations above may have read from the dictionary store.
  verifyZeroInteractions(untouchedStore);
}
 
Example #3
Source File: DictionaryFilter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a filter over the given column chunks and dictionary store.
 *
 * @param columnsList column chunk metadata, registered in {@code columns} keyed by chunk path
 * @param dictionaries the dictionary page store kept for later use
 */
private DictionaryFilter(List<ColumnChunkMetaData> columnsList, DictionaryPageReadStore dictionaries) {
  this.dictionaries = dictionaries;

  // Register every chunk under its own path.
  columnsList.forEach(meta -> columns.put(meta.getPath(), meta));
}
 
Example #4
Source File: DictionaryFilter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Evaluates a filter predicate by having it accept a new {@code DictionaryFilter}
 * built from the given column chunks and dictionary store.
 *
 * @param pred the predicate to evaluate; must not be null
 * @param columns column chunk metadata for the block; must not be null
 * @param dictionaries the dictionary page store handed to the filter
 * @return the result of the predicate's visit (presumably {@code true} when the
 *         block can be skipped -- confirm against callers)
 * @throws NullPointerException if {@code pred} or {@code columns} is null
 */
public static boolean canDrop(FilterPredicate pred, List<ColumnChunkMetaData> columns, DictionaryPageReadStore dictionaries) {
  Objects.requireNonNull(pred, "pred cannot be null");
  Objects.requireNonNull(columns, "columns cannot be null");
  return pred.accept(new DictionaryFilter(columns, dictionaries));
}
 
Example #5
Source File: ShowDictionaryCommand.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Prints the dictionary of the selected column for each row group of a
 * single Parquet file.
 *
 * @return 0 once every row group has been processed
 * @throws IOException if the file cannot be opened, read, or closed
 * @throws IllegalArgumentException if the target list does not hold exactly
 *         one file, or the column's primitive type has no supported dictionary
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);

  // try-with-resources: the reader holds an open file and must be closed
  // even when decoding or printing fails part-way through.
  try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
    MessageType schema = reader.getFileMetaData().getSchema();
    ColumnDescriptor descriptor = Util.descriptor(column, schema);
    PrimitiveType type = Util.primitive(column, schema);
    Preconditions.checkNotNull(type);

    DictionaryPageReadStore dictionaryReader;
    int rowGroup = 0;
    while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
      DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);

      if (page != null) {
        Dictionary dict = page.getEncoding().initDictionary(descriptor, page);

        console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column);
        printDictionary(type, dict);
      } else {
        // A column chunk without a dictionary page yields null; report it
        // instead of failing with a NullPointerException.
        console.info("\nRow group {} has no dictionary for \"{}\"", rowGroup, column);
      }

      // Advance to the next row group so the next dictionary reader is fetched.
      reader.skipNextRowGroup();

      rowGroup += 1;
    }
  }

  console.info("");

  return 0;
}

/** Logs every entry of {@code dict}, decoded according to the column's primitive type. */
private void printDictionary(PrimitiveType type, Dictionary dict) {
  for (int i = 0; i <= dict.getMaxId(); i += 1) {
    String id = String.format("%6d", i);
    switch (type.getPrimitiveTypeName()) {
      case BINARY:
        if (type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation) {
          console.info("{}: {}", id,
              Util.humanReadable(dict.decodeToBinary(i).toStringUsingUTF8(), 70));
        } else {
          console.info("{}: {}", id,
              Util.humanReadable(dict.decodeToBinary(i).getBytesUnsafe(), 70));
        }
        break;
      case INT32:
        console.info("{}: {}", id, dict.decodeToInt(i));
        break;
      case INT64:
        console.info("{}: {}", id, dict.decodeToLong(i));
        break;
      case FLOAT:
        console.info("{}: {}", id, dict.decodeToFloat(i));
        break;
      case DOUBLE:
        console.info("{}: {}", id, dict.decodeToDouble(i));
        break;
      default:
        throw new IllegalArgumentException(
            "Unknown dictionary type: " + type.getPrimitiveTypeName());
    }
  }
}
 
Example #6
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0 3 votes vote down vote up
/**
 * Gives access to the dictionary pages of the row group that
 * {@link #readNextRowGroup()} would return next, or that
 * {@link #skipNextRowGroup()} would skip next. The store is created on first
 * request and the same instance is handed back on later calls.
 *
 * @return a {@link DictionaryPageReadStore} for the next row group, or
 *         {@code null} when no store is cached and no block remains
 */
public DictionaryPageReadStore getNextDictionaryReader() {
  // Already created for the upcoming row group: reuse it.
  if (nextDictionaryReader != null) {
    return nextDictionaryReader;
  }

  // Every block has been consumed, so there is nothing to create.
  if (currentBlock >= blocks.size()) {
    return null;
  }

  nextDictionaryReader = getDictionaryReader(blocks.get(currentBlock));
  return nextDictionaryReader;
}
 
Example #7
Source File: ParquetDictionaryRowGroupFilter.java    From iceberg with Apache License 2.0 2 votes vote down vote up
/**
 * Decides whether a row group's dictionaries may contain records matching the expression.
 *
 * @param fileSchema schema of the Parquet file
 * @param rowGroup block metadata of the row group under test
 * @param dictionaries dictionary page read store for that row group
 * @return false when the row group cannot contain matching rows, true otherwise
 */
public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup,
                          DictionaryPageReadStore dictionaries) {
  // The evaluation itself is delegated to the visitor.
  boolean mayContainMatches = visitor().eval(fileSchema, rowGroup, dictionaries);
  return mayContainMatches;
}
 
Example #8
Source File: ParquetDictionaryRowGroupFilter.java    From iceberg with Apache License 2.0 2 votes vote down vote up
/**
 * Decides whether a row group's dictionaries may contain records matching the expression.
 *
 * @param fileSchema schema of the Parquet file
 * @param rowGroup block metadata of the row group under test
 * @param dictionaries dictionary page read store for that row group
 * @return false when the row group cannot contain matching rows, true otherwise
 */
public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup,
                          DictionaryPageReadStore dictionaries) {
  // The evaluation itself is delegated to the visitor.
  boolean mayContainMatches = visitor().eval(fileSchema, rowGroup, dictionaries);
  return mayContainMatches;
}