Java Code Examples for org.apache.parquet.schema.MessageType#getColumns()
The following examples show how to use org.apache.parquet.schema.MessageType#getColumns(). Each example names its original project and source file.
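Before the project sources, here is a minimal, self-contained sketch of the method itself (the schema string and field names are illustrative only, not from any of the projects below). getColumns() flattens a possibly nested schema into one ColumnDescriptor per leaf primitive column, each carrying the column's path and its maximum repetition and definition levels:

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class GetColumnsExample {
  public static void main(String[] args) {
    // Parse a small schema; the names here are made up for illustration.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int32 id; optional group info { optional binary name (UTF8); } }");
    // One descriptor per leaf primitive column, in schema order.
    for (ColumnDescriptor col : schema.getColumns()) {
      System.out.println(String.join(".", col.getPath())
          + "  maxRepetitionLevel=" + col.getMaxRepetitionLevel()
          + "  maxDefinitionLevel=" + col.getMaxDefinitionLevel());
    }
  }
}

Running this prints one line each for id and info.name, showing that getColumns() descends into groups rather than stopping at top-level fields.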
Example 1
Source File: ColumnWriteStoreBase.java From parquet-mr with Apache License 2.0
ColumnWriteStoreBase(
    MessageType schema,
    PageWriteStore pageWriteStore,
    ParquetProperties props) {
  this.props = props;
  this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);
  Map<ColumnDescriptor, ColumnWriterBase> mcolumns = new TreeMap<>();
  for (ColumnDescriptor path : schema.getColumns()) {
    PageWriter pageWriter = pageWriteStore.getPageWriter(path);
    mcolumns.put(path, createColumnWriter(path, pageWriter, null, props));
  }
  this.columns = unmodifiableMap(mcolumns);

  this.rowCountForNextSizeCheck = min(props.getMinRowCountForPageSizeCheck(), props.getPageRowCountLimit());

  columnWriterProvider = new ColumnWriterProvider() {
    @Override
    public ColumnWriter getColumnWriter(ColumnDescriptor path) {
      return columns.get(path);
    }
  };
}
Example 2
Source File: ParquetColumnChunkPageWriteStore.java From Bats with Apache License 2.0
public ParquetColumnChunkPageWriteStore(BytesCompressor compressor,
                                        MessageType schema,
                                        int initialSlabSize,
                                        int maxCapacityHint,
                                        ByteBufferAllocator allocator) {
  this.schema = schema;
  for (ColumnDescriptor path : schema.getColumns()) {
    writers.put(path, new ColumnChunkPageWriter(path, compressor, initialSlabSize, maxCapacityHint, allocator));
  }
}
Example 3
Source File: ParquetFilePOJOReaderTest.java From attic-apex-malhar with Apache License 2.0
public POJOWriteSupport(MessageType schema, Class<?> klass) {
  this.schema = schema;
  this.cols = schema.getColumns();
  this.klass = klass;
  init();
}
Example 4
Source File: ColumnWriteStoreBase.java From parquet-mr with Apache License 2.0
ColumnWriteStoreBase(
    MessageType schema,
    PageWriteStore pageWriteStore,
    BloomFilterWriteStore bloomFilterWriteStore,
    ParquetProperties props) {
  this.props = props;
  this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);
  Map<ColumnDescriptor, ColumnWriterBase> mcolumns = new TreeMap<>();
  for (ColumnDescriptor path : schema.getColumns()) {
    PageWriter pageWriter = pageWriteStore.getPageWriter(path);
    if (props.isBloomFilterEnabled(path)) {
      BloomFilterWriter bloomFilterWriter = bloomFilterWriteStore.getBloomFilterWriter(path);
      mcolumns.put(path, createColumnWriter(path, pageWriter, bloomFilterWriter, props));
    } else {
      mcolumns.put(path, createColumnWriter(path, pageWriter, null, props));
    }
  }
  this.columns = unmodifiableMap(mcolumns);

  this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck();

  columnWriterProvider = new ColumnWriterProvider() {
    @Override
    public ColumnWriter getColumnWriter(ColumnDescriptor path) {
      return columns.get(path);
    }
  };
}
Example 5
Source File: SchemaCompatibilityValidator.java From parquet-mr with Apache License 2.0
private SchemaCompatibilityValidator(MessageType schema) {
  for (ColumnDescriptor cd : schema.getColumns()) {
    ColumnPath columnPath = ColumnPath.get(cd.getPath());
    columnsAccordingToSchema.put(columnPath, cd);
  }
}
Example 6
Source File: ColumnChunkPageWriteStore.java From parquet-mr with Apache License 2.0
public ColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, ByteBufferAllocator allocator,
                                 int columnIndexTruncateLength, boolean pageWriteChecksumEnabled) {
  this.schema = schema;
  for (ColumnDescriptor path : schema.getColumns()) {
    writers.put(path, new ColumnChunkPageWriter(path, compressor, allocator, columnIndexTruncateLength,
        pageWriteChecksumEnabled));
  }
}
Example 7
Source File: TestStatistics.java From parquet-mr with Apache License 2.0
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor desc : schema.getColumns()) {
    PageReader reader = store.getPageReader(desc);
    DictionaryPage dict = reader.readDictionaryPage();
    DataPage page;
    while ((page = reader.readPage()) != null) {
      validateStatsForPage(page, dict, desc);
    }
  }
}
Example 8
Source File: ParquetReaderUtility.java From Bats with Apache License 2.0
/**
 * Check whether any of the columns in the given list is either nested or repetitive.
 *
 * @param footer Parquet file schema
 * @param columns list of query SchemaPath objects
 */
public static boolean containsComplexColumn(ParquetMetadata footer, List<SchemaPath> columns) {
  MessageType schema = footer.getFileMetaData().getSchema();

  if (Utilities.isStarQuery(columns)) {
    for (Type type : schema.getFields()) {
      if (!type.isPrimitive()) {
        return true;
      }
    }
    for (ColumnDescriptor col : schema.getColumns()) {
      if (col.getMaxRepetitionLevel() > 0) {
        return true;
      }
    }
    return false;
  } else {
    Map<String, ColumnDescriptor> colDescMap = ParquetReaderUtility.getColNameToColumnDescriptorMapping(footer);
    Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);

    for (SchemaPath schemaPath : columns) {
      // A schema path which is non-leaf is a complex column.
      if (!schemaPath.isLeaf()) {
        logger.trace("rowGroupScan contains complex column: {}", schemaPath.getUnIndexed().toString());
        return true;
      }

      // A column descriptor lookup failure may mean two cases, depending on the subsequent
      // SchemaElement lookup:
      // 1. success: queried column is complex, i.e. GroupType
      // 2. failure: queried column is not in the schema and thus is non-complex
      ColumnDescriptor column = colDescMap.get(schemaPath.getUnIndexed().toString().toLowerCase());

      if (column == null) {
        SchemaElement schemaElement = schemaElements.get(schemaPath.getUnIndexed().toString().toLowerCase());
        if (schemaElement != null) {
          return true;
        }
      } else {
        if (column.getMaxRepetitionLevel() > 0) {
          logger.trace("rowGroupScan contains repetitive column: {}", schemaPath.getUnIndexed().toString());
          return true;
        }
      }
    }
  }
  return false;
}
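The star-query branch above distills into a rule that needs nothing beyond parquet-mr itself (org.apache.parquet.schema.MessageType, org.apache.parquet.schema.Type, and org.apache.parquet.column.ColumnDescriptor). A hedged sketch; the helper name is hypothetical, not part of either project:

// Hypothetical helper mirroring the star-query branch above: a schema is
// "complex" if any top-level field is a group or any leaf column repeats.
static boolean hasComplexColumn(MessageType schema) {
  for (Type field : schema.getFields()) {
    if (!field.isPrimitive()) {
      return true; // nested GroupType
    }
  }
  for (ColumnDescriptor col : schema.getColumns()) {
    if (col.getMaxRepetitionLevel() > 0) {
      return true; // repeated column
    }
  }
  return false;
}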
Example 9
Source File: ColumnIndexValidator.java From parquet-mr with Apache License 2.0
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
  List<ContractViolation> violations = new ArrayList<>();
  try (ParquetFileReader reader = ParquetFileReader.open(file)) {
    FileMetaData meta = reader.getFooter().getFileMetaData();
    MessageType schema = meta.getSchema();
    List<ColumnDescriptor> columns = schema.getColumns();

    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    int rowGroupNumber = 0;
    PageReadStore rowGroup = reader.readNextRowGroup();
    while (rowGroup != null) {
      ColumnReadStore columnReadStore = new ColumnReadStoreImpl(rowGroup,
          new DummyRecordConverter(schema).getRootConverter(), schema, null);
      List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
      assert (columnChunks.size() == columns.size());

      for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
        ColumnDescriptor column = columns.get(columnNumber);
        ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
        ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
        if (columnIndex == null) {
          continue;
        }

        ColumnPath columnPath = columnChunk.getPath();
        OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
        List<ByteBuffer> minValues = columnIndex.getMinValues();
        List<ByteBuffer> maxValues = columnIndex.getMaxValues();
        BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
        List<Long> nullCounts = columnIndex.getNullCounts();
        List<Boolean> nullPages = columnIndex.getNullPages();
        long rowNumber = 0;
        ColumnReader columnReader = columnReadStore.getColumnReader(column);
        ByteBuffer prevMinValue = null;
        ByteBuffer prevMaxValue = null;
        for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
          boolean isNullPage = nullPages.get(pageNumber);
          ByteBuffer minValue = minValues.get(pageNumber);
          ByteBuffer maxValue = maxValues.get(pageNumber);
          PageValidator pageValidator = new PageValidator(
              column.getPrimitiveType(),
              rowGroupNumber, columnNumber, columnPath, pageNumber,
              violations, columnReader,
              minValue, maxValue,
              prevMinValue, prevMaxValue,
              boundaryOrder,
              nullCounts.get(pageNumber), isNullPage);
          if (!isNullPage) {
            prevMinValue = minValue;
            prevMaxValue = maxValue;
          }
          long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
          while (rowNumber <= lastRowNumberInPage) {
            pageValidator.validateValuesBelongingToRow();
            ++rowNumber;
          }
          pageValidator.finishPage();
        }
      }
      rowGroup = reader.readNextRowGroup();
      rowGroupNumber++;
    }
  }
  return violations;
}
Example 10
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
public void setRequestedSchema(MessageType projection) {
  paths.clear();
  for (ColumnDescriptor col : projection.getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
}
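In practice the projection passed to setRequestedSchema(...) is a pruned copy of the file schema, so only the listed leaf columns are materialized when row groups are read. A hedged usage sketch; the schema string is illustrative and "reader" is assumed to be an already-open ParquetFileReader:

// Assumes the file schema contains at least the "id" field; the schema
// string here is made up for illustration.
MessageType projection = MessageTypeParser.parseMessageType(
    "message doc { required int64 id; }");
reader.setRequestedSchema(projection);  // subsequent row groups expose only "id"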
Example 11
Source File: TestColumnChunkPageWriteStore.java From parquet-mr with Apache License 2.0
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = Statistics
      .getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary"))
      .build();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  statistics.incrementNumNulls(nullCount);
  statistics.setMinMaxFromBytes(new byte[] {0, 1, 2}, new byte[] {0, 1, 2, 3});
  long pageOffset;
  long pageSize;
  {
    OutputFileForTesting outputFile = new OutputFileForTesting(file, conf);
    ParquetFileWriter writer = new ParquetFileWriter(outputFile, schema, Mode.CREATE,
        ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.MAX_PADDING_SIZE_DEFAULT);
    writer.start();
    writer.startBlock(rowCount);
    pageOffset = outputFile.out().getPos();
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema,
          new HeapByteBufferAllocator(), Integer.MAX_VALUE);
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(rowCount, nullCount, valueCount,
          repetitionLevels, definitionLevels,
          dataEncoding, data,
          statistics);
      store.flushToFileWriter(writer);
      pageSize = outputFile.out().getPos() - pageOffset;
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }
  {
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file,
        footer.getBlocks(), schema.getColumns());
    PageReadStore rowGroup = reader.readNextRowGroup();
    PageReader pageReader = rowGroup.getPageReader(col);
    DataPageV2 page = (DataPageV2) pageReader.readPage();
    assertEquals(rowCount, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(valueCount, page.getValueCount());
    assertEquals(d, intValue(page.getDefinitionLevels()));
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));

    // Checking column/offset indexes for the one page
    ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0);
    ColumnIndex columnIndex = reader.readColumnIndex(column);
    assertArrayEquals(statistics.getMinBytes(), columnIndex.getMinValues().get(0).array());
    assertArrayEquals(statistics.getMaxBytes(), columnIndex.getMaxValues().get(0).array());
    assertEquals(statistics.getNumNulls(), columnIndex.getNullCounts().get(0).longValue());
    assertFalse(columnIndex.getNullPages().get(0));
    OffsetIndex offsetIndex = reader.readOffsetIndex(column);
    assertEquals(1, offsetIndex.getPageCount());
    assertEquals(pageSize, offsetIndex.getCompressedPageSize(0));
    assertEquals(0, offsetIndex.getFirstRowIndex(0));
    assertEquals(pageOffset, offsetIndex.getOffset(0));
    reader.close();
  }
}