Java Code Examples for org.apache.parquet.schema.MessageType#getColumns()
The following examples show how to use org.apache.parquet.schema.MessageType#getColumns(). Each example names its original project and source file.
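Before the project sources, here is a minimal, self-contained sketch of the method itself (the schema string and field names are illustrative only, not from any of the projects below). getColumns() flattens a possibly nested schema into one ColumnDescriptor per leaf primitive column, each carrying the column's path and its maximum repetition and definition levels:

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class GetColumnsExample {
  public static void main(String[] args) {
    // Parse a small schema; the names here are made up for illustration.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int32 id; optional group info { optional binary name (UTF8); } }");
    // One descriptor per leaf primitive column, in schema order.
    for (ColumnDescriptor col : schema.getColumns()) {
      System.out.println(String.join(".", col.getPath())
          + "  maxRepetitionLevel=" + col.getMaxRepetitionLevel()
          + "  maxDefinitionLevel=" + col.getMaxDefinitionLevel());
    }
  }
}

Running this prints one line each for id and info.name, showing that getColumns() descends into groups rather than stopping at top-level fields.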
Example 1
Source File: ColumnWriteStoreBase.java From parquet-mr with Apache License 2.0
ColumnWriteStoreBase(
    MessageType schema,
    PageWriteStore pageWriteStore,
    ParquetProperties props) {
  this.props = props;
  this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);
  Map<ColumnDescriptor, ColumnWriterBase> mcolumns = new TreeMap<>();
  for (ColumnDescriptor path : schema.getColumns()) {
    PageWriter pageWriter = pageWriteStore.getPageWriter(path);
    mcolumns.put(path, createColumnWriter(path, pageWriter, null, props));
  }
  this.columns = unmodifiableMap(mcolumns);

  this.rowCountForNextSizeCheck = min(props.getMinRowCountForPageSizeCheck(), props.getPageRowCountLimit());

  columnWriterProvider = new ColumnWriterProvider() {
    @Override
    public ColumnWriter getColumnWriter(ColumnDescriptor path) {
      return columns.get(path);
    }
  };
}
Example 2
Source File: ParquetColumnChunkPageWriteStore.java From Bats with Apache License 2.0
public ParquetColumnChunkPageWriteStore(BytesCompressor compressor,
                                        MessageType schema,
                                        int initialSlabSize,
                                        int maxCapacityHint,
                                        ByteBufferAllocator allocator) {
  this.schema = schema;
  for (ColumnDescriptor path : schema.getColumns()) {
    writers.put(path, new ColumnChunkPageWriter(path, compressor, initialSlabSize, maxCapacityHint, allocator));
  }
}
Example 3
Source File: ParquetFilePOJOReaderTest.java From attic-apex-malhar with Apache License 2.0
public POJOWriteSupport(MessageType schema, Class<?> klass) {
  this.schema = schema;
  this.cols = schema.getColumns();
  this.klass = klass;
  init();
}
Example 4
Source File: ColumnWriteStoreBase.java From parquet-mr with Apache License 2.0
ColumnWriteStoreBase(
    MessageType schema,
    PageWriteStore pageWriteStore,
    BloomFilterWriteStore bloomFilterWriteStore,
    ParquetProperties props) {
  this.props = props;
  this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);
  Map<ColumnDescriptor, ColumnWriterBase> mcolumns = new TreeMap<>();
  for (ColumnDescriptor path : schema.getColumns()) {
    PageWriter pageWriter = pageWriteStore.getPageWriter(path);
    if (props.isBloomFilterEnabled(path)) {
      BloomFilterWriter bloomFilterWriter = bloomFilterWriteStore.getBloomFilterWriter(path);
      mcolumns.put(path, createColumnWriter(path, pageWriter, bloomFilterWriter, props));
    } else {
      mcolumns.put(path, createColumnWriter(path, pageWriter, null, props));
    }
  }
  this.columns = unmodifiableMap(mcolumns);

  this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck();

  columnWriterProvider = new ColumnWriterProvider() {
    @Override
    public ColumnWriter getColumnWriter(ColumnDescriptor path) {
      return columns.get(path);
    }
  };
}
Example 5
Source File: SchemaCompatibilityValidator.java From parquet-mr with Apache License 2.0
private SchemaCompatibilityValidator(MessageType schema) {
  for (ColumnDescriptor cd : schema.getColumns()) {
    ColumnPath columnPath = ColumnPath.get(cd.getPath());
    columnsAccordingToSchema.put(columnPath, cd);
  }
}
Example 6
Source File: ColumnChunkPageWriteStore.java From parquet-mr with Apache License 2.0
public ColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, ByteBufferAllocator allocator,
                                 int columnIndexTruncateLength, boolean pageWriteChecksumEnabled) {
  this.schema = schema;
  for (ColumnDescriptor path : schema.getColumns()) {
    writers.put(path, new ColumnChunkPageWriter(path, compressor, allocator, columnIndexTruncateLength,
        pageWriteChecksumEnabled));
  }
}
Example 7
Source File: TestStatistics.java From parquet-mr with Apache License 2.0
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor desc : schema.getColumns()) {
    PageReader reader = store.getPageReader(desc);
    DictionaryPage dict = reader.readDictionaryPage();
    DataPage page;
    while ((page = reader.readPage()) != null) {
      validateStatsForPage(page, dict, desc);
    }
  }
}
Example 8
Source File: ParquetReaderUtility.java From Bats with Apache License 2.0
/**
 * Check whether any of the columns in the given list is either nested or repetitive.
 *
 * @param footer Parquet file schema
 * @param columns list of query SchemaPath objects
 */
public static boolean containsComplexColumn(ParquetMetadata footer, List<SchemaPath> columns) {
  MessageType schema = footer.getFileMetaData().getSchema();

  if (Utilities.isStarQuery(columns)) {
    for (Type type : schema.getFields()) {
      if (!type.isPrimitive()) {
        return true;
      }
    }
    for (ColumnDescriptor col : schema.getColumns()) {
      if (col.getMaxRepetitionLevel() > 0) {
        return true;
      }
    }
    return false;
  } else {
    Map<String, ColumnDescriptor> colDescMap = ParquetReaderUtility.getColNameToColumnDescriptorMapping(footer);
    Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);

    for (SchemaPath schemaPath : columns) {
      // A schema path which is non-leaf is a complex column.
      if (!schemaPath.isLeaf()) {
        logger.trace("rowGroupScan contains complex column: {}", schemaPath.getUnIndexed().toString());
        return true;
      }

      // A column descriptor lookup failure may mean two cases, depending on the subsequent
      // SchemaElement lookup:
      // 1. success: queried column is complex, i.e. GroupType
      // 2. failure: queried column is not in the schema and thus is non-complex
      ColumnDescriptor column = colDescMap.get(schemaPath.getUnIndexed().toString().toLowerCase());

      if (column == null) {
        SchemaElement schemaElement = schemaElements.get(schemaPath.getUnIndexed().toString().toLowerCase());
        if (schemaElement != null) {
          return true;
        }
      } else {
        if (column.getMaxRepetitionLevel() > 0) {
          logger.trace("rowGroupScan contains repetitive column: {}", schemaPath.getUnIndexed().toString());
          return true;
        }
      }
    }
  }
  return false;
}
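The star-query branch above distills into a rule that needs nothing beyond parquet-mr itself (org.apache.parquet.schema.MessageType, org.apache.parquet.schema.Type, and org.apache.parquet.column.ColumnDescriptor). A hedged sketch; the helper name is hypothetical, not part of either project:

// Hypothetical helper mirroring the star-query branch above: a schema is
// "complex" if any top-level field is a group or any leaf column repeats.
static boolean hasComplexColumn(MessageType schema) {
  for (Type field : schema.getFields()) {
    if (!field.isPrimitive()) {
      return true; // nested GroupType
    }
  }
  for (ColumnDescriptor col : schema.getColumns()) {
    if (col.getMaxRepetitionLevel() > 0) {
      return true; // repeated column
    }
  }
  return false;
}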
Example 9
Source File: ColumnIndexValidator.java From parquet-mr with Apache License 2.0
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
  List<ContractViolation> violations = new ArrayList<>();
  try (ParquetFileReader reader = ParquetFileReader.open(file)) {
    FileMetaData meta = reader.getFooter().getFileMetaData();
    MessageType schema = meta.getSchema();
    List<ColumnDescriptor> columns = schema.getColumns();

    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    int rowGroupNumber = 0;
    PageReadStore rowGroup = reader.readNextRowGroup();
    while (rowGroup != null) {
      ColumnReadStore columnReadStore = new ColumnReadStoreImpl(rowGroup,
          new DummyRecordConverter(schema).getRootConverter(), schema, null);
      List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
      assert (columnChunks.size() == columns.size());

      for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
        ColumnDescriptor column = columns.get(columnNumber);
        ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
        ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
        if (columnIndex == null) {
          continue;
        }

        ColumnPath columnPath = columnChunk.getPath();
        OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
        List<ByteBuffer> minValues = columnIndex.getMinValues();
        List<ByteBuffer> maxValues = columnIndex.getMaxValues();
        BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
        List<Long> nullCounts = columnIndex.getNullCounts();
        List<Boolean> nullPages = columnIndex.getNullPages();
        long rowNumber = 0;
        ColumnReader columnReader = columnReadStore.getColumnReader(column);
        ByteBuffer prevMinValue = null;
        ByteBuffer prevMaxValue = null;
        for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
          boolean isNullPage = nullPages.get(pageNumber);
          ByteBuffer minValue = minValues.get(pageNumber);
          ByteBuffer maxValue = maxValues.get(pageNumber);
          PageValidator pageValidator = new PageValidator(
              column.getPrimitiveType(),
              rowGroupNumber, columnNumber, columnPath, pageNumber,
              violations, columnReader,
              minValue, maxValue,
              prevMinValue, prevMaxValue,
              boundaryOrder,
              nullCounts.get(pageNumber), isNullPage);
          if (!isNullPage) {
            prevMinValue = minValue;
            prevMaxValue = maxValue;
          }
          long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
          while (rowNumber <= lastRowNumberInPage) {
            pageValidator.validateValuesBelongingToRow();
            ++rowNumber;
          }
          pageValidator.finishPage();
        }
      }
      rowGroup = reader.readNextRowGroup();
      rowGroupNumber++;
    }
  }
  return violations;
}
Example 10
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
public void setRequestedSchema(MessageType projection) {
  paths.clear();
  for (ColumnDescriptor col : projection.getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
}
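In practice the projection passed to setRequestedSchema(...) is a pruned copy of the file schema, so only the listed leaf columns are materialized when row groups are read. A hedged usage sketch; the schema string is illustrative and "reader" is assumed to be an already-open ParquetFileReader:

// Assumes the file schema contains at least the "id" field; the schema
// string here is made up for illustration.
MessageType projection = MessageTypeParser.parseMessageType(
    "message doc { required int64 id; }");
reader.setRequestedSchema(projection);  // subsequent row groups expose only "id"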
Example 11
Source File: TestColumnChunkPageWriteStore.java From parquet-mr with Apache License 2.0
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = Statistics
      .getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary"))
      .build();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  statistics.incrementNumNulls(nullCount);
  statistics.setMinMaxFromBytes(new byte[] {0, 1, 2}, new byte[] {0, 1, 2, 3});
  long pageOffset;
  long pageSize;
  {
    OutputFileForTesting outputFile = new OutputFileForTesting(file, conf);
    ParquetFileWriter writer = new ParquetFileWriter(outputFile, schema, Mode.CREATE,
        ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.MAX_PADDING_SIZE_DEFAULT);
    writer.start();
    writer.startBlock(rowCount);
    pageOffset = outputFile.out().getPos();
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema,
          new HeapByteBufferAllocator(), Integer.MAX_VALUE);
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(rowCount, nullCount, valueCount,
          repetitionLevels, definitionLevels,
          dataEncoding, data,
          statistics);
      store.flushToFileWriter(writer);
      pageSize = outputFile.out().getPos() - pageOffset;
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }
  {
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file,
        footer.getBlocks(), schema.getColumns());
    PageReadStore rowGroup = reader.readNextRowGroup();
    PageReader pageReader = rowGroup.getPageReader(col);
    DataPageV2 page = (DataPageV2) pageReader.readPage();
    assertEquals(rowCount, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(valueCount, page.getValueCount());
    assertEquals(d, intValue(page.getDefinitionLevels()));
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));

    // Checking column/offset indexes for the one page
    ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0);
    ColumnIndex columnIndex = reader.readColumnIndex(column);
    assertArrayEquals(statistics.getMinBytes(), columnIndex.getMinValues().get(0).array());
    assertArrayEquals(statistics.getMaxBytes(), columnIndex.getMaxValues().get(0).array());
    assertEquals(statistics.getNumNulls(), columnIndex.getNullCounts().get(0).longValue());
    assertFalse(columnIndex.getNullPages().get(0));
    OffsetIndex offsetIndex = reader.readOffsetIndex(column);
    assertEquals(1, offsetIndex.getPageCount());
    assertEquals(pageSize, offsetIndex.getCompressedPageSize(0));
    assertEquals(0, offsetIndex.getFirstRowIndex(0));
    assertEquals(pageOffset, offsetIndex.getOffset(0));
    reader.close();
  }
}