org.apache.parquet.column.ColumnDescriptor Java Examples
The following examples show how to use
org.apache.parquet.column.ColumnDescriptor.
Each example is drawn from an open-source project; the project, source file, and license are noted above it.
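A ColumnDescriptor identifies one leaf column of a Parquet schema: its path (one string per nesting level), its primitive type, and its maximum repetition and definition levels. Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) that parses a schema and prints that information for every column:

import java.util.Arrays;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnDescriptorDemo {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message doc { required binary title; optional group links { repeated int64 backward; } }");
    // the schema exposes exactly one ColumnDescriptor per leaf field
    for (ColumnDescriptor desc : schema.getColumns()) {
      System.out.printf("path=%s type=%s R=%d D=%d%n",
          Arrays.toString(desc.getPath()),
          desc.getPrimitiveType().getPrimitiveTypeName(),
          desc.getMaxRepetitionLevel(),
          desc.getMaxDefinitionLevel());
    }
  }
}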
Example #1
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

/**
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param footer a {@link ParquetMetadata} footer already read from the file
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, ParquetMetadata footer) throws IOException {
  this.converter = new ParquetMetadataConverter(conf);
  this.file = HadoopInputFile.fromPath(file, conf);
  this.f = this.file.newStream();
  this.options = HadoopReadOptions.builder(conf).build();
  this.footer = footer;
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  // index every ColumnDescriptor by its column path for fast lookup while reading chunks
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
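Since this constructor is deprecated, newer code typically goes through the static factory instead. A minimal sketch of that path, assuming the same conf and file as above:

// open via the non-deprecated entry point and enumerate column descriptors
try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(file, conf))) {
  ParquetMetadata footer = reader.getFooter();
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    System.out.println(ColumnPath.get(col.getPath()).toDotString());
  }
}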
Example #2
Source File: ColumnWriteStoreBase.java From parquet-mr with Apache License 2.0

@Deprecated
ColumnWriteStoreBase(
    final PageWriteStore pageWriteStore,
    final ParquetProperties props) {
  this.props = props;
  this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);
  this.columns = new TreeMap<>();
  this.rowCountForNextSizeCheck = min(props.getMinRowCountForPageSizeCheck(), props.getPageRowCountLimit());
  columnWriterProvider = new ColumnWriterProvider() {
    @Override
    public ColumnWriter getColumnWriter(ColumnDescriptor path) {
      ColumnWriterBase column = columns.get(path);
      if (column == null) {
        // lazily create one writer per column, keyed by its descriptor
        column = createColumnWriter(path, pageWriteStore.getPageWriter(path), null, props);
        columns.put(path, column);
      }
      return column;
    }
  };
}
Example #3
Source File: PredicateUtils.java From presto with Apache License 2.0

public static boolean predicateMatches(
    Predicate parquetPredicate,
    BlockMetaData block,
    ParquetDataSource dataSource,
    Map<List<String>, RichColumnDescriptor> descriptorsByPath,
    TupleDomain<ColumnDescriptor> parquetTupleDomain,
    boolean failOnCorruptedParquetStatistics)
    throws ParquetCorruptionException {
  Map<ColumnDescriptor, Statistics<?>> columnStatistics = getStatistics(block, descriptorsByPath);
  if (!parquetPredicate.matches(block.getRowCount(), columnStatistics, dataSource.getId(), failOnCorruptedParquetStatistics)) {
    return false;
  }
  return dictionaryPredicatesMatch(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain);
}
Example #4
Source File: DefaultV2ValuesWriterFactory.java From parquet-mr with Apache License 2.0

private ValuesWriter getDoubleValuesWriter(ColumnDescriptor path) {
  ValuesWriter fallbackWriter = null;
  if (this.parquetProperties.isByteStreamSplitEnabled()) {
    fallbackWriter = new ByteStreamSplitValuesWriter.DoubleByteStreamSplitValuesWriter(
        parquetProperties.getInitialSlabSize(),
        parquetProperties.getPageSizeThreshold(),
        parquetProperties.getAllocator());
  } else {
    fallbackWriter = new PlainValuesWriter(
        parquetProperties.getInitialSlabSize(),
        parquetProperties.getPageSizeThreshold(),
        parquetProperties.getAllocator());
  }
  return DefaultValuesWriterFactory.dictWriterWithFallBack(
      path, parquetProperties, getEncodingForDictionaryPage(), getEncodingForDataPage(), fallbackWriter);
}
Example #5
Source File: GlobalDictionaryBuilder.java From dremio-oss with Apache License 2.0

private static VectorContainer buildLongGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict,
    ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final Field field = new Field(SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath(),
      true, new ArrowType.Int(64, true), null);
  final VectorContainer input = new VectorContainer(bufferAllocator);
  final BigIntVector longVector = input.addOrGet(field);
  longVector.allocateNew();
  // collect distinct values from every row-group dictionary, sorted
  SortedSet<Long> values = Sets.newTreeSet();
  for (Dictionary dictionary : dictionaries) {
    for (int i = 0; i <= dictionary.getMaxId(); ++i) {
      values.add(dictionary.decodeToLong(i));
    }
  }
  // merge in values from an existing global dictionary, if one was supplied
  if (existingDict != null) {
    final BigIntVector existingDictValues = existingDict.getValueAccessorById(BigIntVector.class, 0).getValueVector();
    for (int i = 0; i < existingDict.getRecordCount(); ++i) {
      values.add(existingDictValues.get(i));
    }
  }
  final Iterator<Long> iter = values.iterator();
  int recordCount = 0;
  while (iter.hasNext()) {
    longVector.setSafe(recordCount++, iter.next());
  }
  longVector.setValueCount(recordCount);
  input.setRecordCount(recordCount);
  input.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return input;
}
Example #6
Source File: ColumnReaderFactory.java From dremio-oss with Apache License 2.0

static VarLengthValuesColumn<?> getReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize,
    ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, ValueVector v,
    SchemaElement schemaElement) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0: // no definition levels: required column, use non-nullable readers
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new VarLengthColumnReaders.Decimal28Column(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default: // definition levels present: use nullable readers
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new NullableDecimalColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
  }
}
Example #7
Source File: VarLengthValuesColumn.java From dremio-oss with Apache License 2.0

VarLengthValuesColumn(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v, SchemaElement schemaElement)
    throws ExecutionSetupException {
  super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  variableWidthVector = (VariableWidthVector) valueVec;
  // dictionary usage is detected from the chunk's recorded encodings
  usingDictionary = columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY);
}
Example #8
Source File: Util.java From parquet-mr with Apache License 2.0

public static String encodingsAsString(Set<Encoding> encodings, ColumnDescriptor desc) {
  StringBuilder sb = new StringBuilder();
  if (encodings.contains(RLE) || encodings.contains(BIT_PACKED)) {
    // explicit level encodings recorded: one flag for definition levels, one for repetition levels
    // ("B" when the column has no levels of that kind, "R" when they are RLE-encoded)
    sb.append(desc.getMaxDefinitionLevel() == 0 ? "B" : "R");
    sb.append(desc.getMaxRepetitionLevel() == 0 ? "B" : "R");
    if (encodings.contains(PLAIN_DICTIONARY)) {
      sb.append("R"); // dictionary-encoded values
    }
    if (encodings.contains(PLAIN)) {
      sb.append("_"); // plain-encoded values
    }
  } else {
    sb.append("RR"); // levels are RLE-encoded
    if (encodings.contains(RLE_DICTIONARY)) {
      sb.append("R");
    }
    if (encodings.contains(PLAIN)) {
      sb.append("_");
    }
    if (encodings.contains(DELTA_BYTE_ARRAY)
        || encodings.contains(DELTA_BINARY_PACKED)
        || encodings.contains(DELTA_LENGTH_BYTE_ARRAY)) {
      sb.append("D"); // one of the delta encodings
    }
  }
  return sb.toString();
}
Example #9
Source File: MetadataUtils.java From parquet-mr with Apache License 2.0

private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container,
    List<String> cpath, boolean showOriginalTypes) {
  String name = Strings.repeat(".", depth) + type.getName();
  Repetition rep = type.getRepetition();
  PrimitiveTypeName ptype = type.getPrimitiveTypeName();
  out.format("%s: %s %s", name, rep, ptype);
  if (showOriginalTypes) {
    OriginalType otype;
    try {
      otype = type.getOriginalType();
    } catch (Exception e) {
      otype = null;
    }
    if (otype != null) out.format(" O:%s", otype);
  } else {
    LogicalTypeAnnotation ltype = type.getLogicalTypeAnnotation();
    if (ltype != null) out.format(" L:%s", ltype);
  }
  if (container != null) {
    cpath.add(type.getName());
    String[] paths = cpath.toArray(new String[0]);
    cpath.remove(cpath.size() - 1);
    ColumnDescriptor desc = container.getColumnDescription(paths);
    int defl = desc.getMaxDefinitionLevel();
    int repl = desc.getMaxRepetitionLevel();
    out.format(" R:%d D:%d", repl, defl);
  }
  out.println();
}
Example #10
Source File: LocalDictionariesReader.java From dremio-oss with Apache License 2.0

/**
 * Return dictionary per row group for all binary columns in given parquet file.
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @param codecFactory factory used to obtain decompressors for dictionary pages
 * @return pair of dictionaries found for binary fields and set of binary fields which are not dictionary encoded.
 * @throws IOException
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(
    FileSystem fs, Path filePath, CompressionCodecFactory codecFactory) throws IOException {
  // Passing the max footer length is not required in this case as the parquet reader would already have failed.
  final ParquetMetadata parquetMetadata = SingletonParquetFooterCache.readFooter(fs, filePath,
      ParquetMetadataConverter.NO_FILTER, ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(format(
        "Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
        parquetMetadata.getBlocks().size(), filePath));
  }
  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  // index descriptors by column path so chunk metadata can be matched back to its descriptor
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();
  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }
  // columns which are found in parquet file but are not dictionary encoded
  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet();
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  try (final FSInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader,
              codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
Example #11
Source File: ColumnChunkIncReadStore.java From Bats with Apache License 2.0

public void addColumn(ColumnDescriptor descriptor, ColumnChunkMetaData metaData) throws IOException {
  FSDataInputStream in = fs.open(path);
  streams.add(in);
  in.seek(metaData.getStartingPos());
  ColumnChunkIncPageReader reader = new ColumnChunkIncPageReader(metaData, descriptor, in);
  columns.put(descriptor, reader);
}
Example #12
Source File: MetadataUtils.java From parquet-mr with Apache License 2.0

public static void showDetails(PrettyPrintWriter out, ColumnDescriptor desc) {
  String path = Joiner.on(".").skipNulls().join(desc.getPath());
  PrimitiveTypeName type = desc.getType();
  int defl = desc.getMaxDefinitionLevel();
  int repl = desc.getMaxRepetitionLevel();
  out.format("column desc: %s T:%s R:%d D:%d%n", path, type, repl, defl);
}
Example #13
Source File: IncrementallyUpdatedFilterPredicateBuilderBase.java From parquet-mr with Apache License 2.0

public IncrementallyUpdatedFilterPredicateBuilderBase(List<PrimitiveColumnIO> leaves) {
  for (PrimitiveColumnIO leaf : leaves) {
    ColumnDescriptor descriptor = leaf.getColumnDescriptor();
    ColumnPath path = ColumnPath.get(descriptor.getPath());
    PrimitiveComparator<?> comparator = descriptor.getPrimitiveType().comparator();
    comparatorsByColumn.put(path, comparator);
  }
}
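The map built here is what downstream filter evaluation consults to compare raw values using each column's own type ordering. A hypothetical usage (the lookup key, the unchecked cast, and the sample values are illustrative, not part of the original class):

@SuppressWarnings("unchecked")
PrimitiveComparator<Binary> cmp =
    (PrimitiveComparator<Binary>) comparatorsByColumn.get(ColumnPath.get("foo"));
// PrimitiveComparator implements java.util.Comparator, so compare() applies the
// column's defined ordering to two raw values
boolean less = cmp.compare(Binary.fromString("a"), Binary.fromString("b")) < 0;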
Example #14
Source File: ParquetReader.java From presto with Apache License 2.0

private ColumnChunkMetaData getColumnChunkMetaData(BlockMetaData blockMetaData, ColumnDescriptor columnDescriptor)
    throws IOException {
  for (ColumnChunkMetaData metadata : blockMetaData.getColumns()) {
    if (metadata.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))) {
      return metadata;
    }
  }
  throw new ParquetCorruptionException("Metadata is missing for column: %s", columnDescriptor);
}
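This helper scans the row group's column chunks linearly on every call. Examples #1 and #10 above instead build a ColumnPath-keyed map once; the same idea applies here if lookups become hot. A sketch of that variant (the variable names are illustrative):

Map<ColumnPath, ColumnChunkMetaData> chunksByPath = new HashMap<>();
for (ColumnChunkMetaData metadata : blockMetaData.getColumns()) {
  chunksByPath.put(metadata.getPath(), metadata);
}
// each subsequent lookup is then a constant-time map access
ColumnChunkMetaData match = chunksByPath.get(ColumnPath.get(columnDescriptor.getPath()));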
Example #15
Source File: TestColumnReaderImpl.java From parquet-mr with Apache License 2.0

@Test
public void testOptional() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { optional binary foo; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  MemPageWriter pageWriter = new MemPageWriter();
  ColumnWriterV2 columnWriterV2 = new ColumnWriterV2(col, pageWriter,
      ParquetProperties.builder()
          .withDictionaryPageSize(1024).withWriterVersion(PARQUET_2_0)
          .withPageSize(2048).build());
  // write all-null values, flushing a page every 1000 rows
  for (int i = 0; i < rows; i++) {
    columnWriterV2.writeNull(0, 0);
    if ((i + 1) % 1000 == 0) {
      columnWriterV2.writePage();
    }
  }
  columnWriterV2.writePage();
  columnWriterV2.finalizeColumnChunk();
  List<DataPage> pages = pageWriter.getPages();
  int valueCount = 0;
  int rowCount = 0;
  for (DataPage dataPage : pages) {
    valueCount += dataPage.getValueCount();
    rowCount += ((DataPageV2) dataPage).getRowCount();
  }
  assertEquals(rows, rowCount);
  assertEquals(rows, valueCount);
  // read the pages back and verify the levels for every row
  MemPageReader pageReader = new MemPageReader(rows, pages.iterator(), pageWriter.getDictionaryPage());
  ValidatingConverter converter = new ValidatingConverter();
  ColumnReader columnReader = new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
  for (int i = 0; i < rows; i++) {
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    columnReader.consume();
  }
  assertEquals(0, converter.count);
}
Example #16
Source File: TestStatistics.java From parquet-mr with Apache License 2.0

public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor desc : schema.getColumns()) {
    PageReader reader = store.getPageReader(desc);
    DictionaryPage dict = reader.readDictionaryPage();
    DataPage page;
    while ((page = reader.readPage()) != null) {
      validateStatsForPage(page, dict, desc);
    }
  }
}
Example #17
Source File: ColumnChunkPageWriteStore.java From parquet-mr with Apache License 2.0

public ColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, ByteBufferAllocator allocator,
    int columnIndexTruncateLength, boolean pageWriteChecksumEnabled) {
  this.schema = schema;
  for (ColumnDescriptor path : schema.getColumns()) {
    writers.put(path, new ColumnChunkPageWriter(path, compressor, allocator, columnIndexTruncateLength, pageWriteChecksumEnabled));
  }
}
Example #18
Source File: ReadState.java From Bats with Apache License 2.0

/**
 * Create the readers needed to read columns: fixed-length or variable length.
 *
 * @param reader the parent Parquet record reader
 * @param output the output mutator used to create value vectors
 * @throws Exception
 */
@SuppressWarnings("unchecked")
public void buildReader(ParquetRecordReader reader, OutputMutator output) throws Exception {
  final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
  // initialize all of the column read status objects
  BlockMetaData rowGroupMetadata = schema.getRowGroupMetadata();
  Map<String, Integer> columnChunkMetadataPositionsInList = schema.buildChunkMap(rowGroupMetadata);
  for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
    ColumnDescriptor column = columnMetadata.column;
    columnMetadata.columnChunkMetaData = rowGroupMetadata.getColumns().get(
        columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
    columnMetadata.buildVector(output);
    if (!columnMetadata.isFixedLength()) {
      // create a reader and add it to the appropriate list
      varLengthColumns.add(columnMetadata.makeVariableWidthReader(reader));
    } else if (columnMetadata.isRepeated()) {
      varLengthColumns.add(columnMetadata.makeRepeatedFixedWidthReader(reader));
    } else {
      fixedLenColumnReaders.add(columnMetadata.makeFixedWidthReader(reader));
    }
  }
  varLengthReader = new VarLenBinaryReader(reader, varLengthColumns);
  if (!schema.isStarQuery()) {
    schema.createNonExistentColumns(output, nullFilledVectors);
  }
}
Example #19
Source File: TestTupleDomainParquetPredicate.java From presto with Apache License 2.0

@Test
public void testVarcharMatchesWithDictionaryDescriptor() {
  ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] {"path"}, BINARY, 0, 0);
  RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, new PrimitiveType(OPTIONAL, BINARY, "Test column"));
  TupleDomain<ColumnDescriptor> effectivePredicate = getEffectivePredicate(column, createVarcharType(255), EMPTY_SLICE);
  TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));
  DictionaryPage page = new DictionaryPage(Slices.wrappedBuffer(new byte[] {0, 0, 0, 0}), 1, PLAIN_DICTIONARY);
  assertTrue(parquetPredicate.matches(new DictionaryDescriptor(column, Optional.of(page))));
}
Example #20
Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0

private static void validateFirstToLast(int rowGroupID, DictionaryPage dictPage, List<DataPage> pageGroup,
    ColumnDescriptor desc, List<?> expectedValues) {
  int rowsRead = 0, pageID = 0;
  for (DataPage page : pageGroup) {
    List<?> expectedPageValues = expectedValues.subList(rowsRead, rowsRead + page.getValueCount());
    PageValuesValidator.validateValuesForPage(rowGroupID, pageID, dictPage, page, desc, expectedPageValues);
    rowsRead += page.getValueCount();
    pageID++;
  }
}
Example #21
Source File: ColumnChunkPageReadStore.java From parquet-mr with Apache License 2.0

@Override
public DictionaryPage readDictionaryPage(ColumnDescriptor descriptor) {
  return readers.get(descriptor).readDictionaryPage();
}
Example #22
Source File: PageIterator.java From iceberg with Apache License 2.0

@SuppressWarnings("unchecked")
static <T> PageIterator<T> newIterator(ColumnDescriptor desc, String writerVersion) {
  switch (desc.getPrimitiveType().getPrimitiveTypeName()) {
    case BOOLEAN:
      return (PageIterator<T>) new PageIterator<Boolean>(desc, writerVersion) {
        @Override
        public Boolean next() {
          return nextBoolean();
        }
      };
    case INT32:
      return (PageIterator<T>) new PageIterator<Integer>(desc, writerVersion) {
        @Override
        public Integer next() {
          return nextInteger();
        }
      };
    case INT64:
      return (PageIterator<T>) new PageIterator<Long>(desc, writerVersion) {
        @Override
        public Long next() {
          return nextLong();
        }
      };
    case FLOAT:
      return (PageIterator<T>) new PageIterator<Float>(desc, writerVersion) {
        @Override
        public Float next() {
          return nextFloat();
        }
      };
    case DOUBLE:
      return (PageIterator<T>) new PageIterator<Double>(desc, writerVersion) {
        @Override
        public Double next() {
          return nextDouble();
        }
      };
    case FIXED_LEN_BYTE_ARRAY:
    case BINARY:
      return (PageIterator<T>) new PageIterator<Binary>(desc, writerVersion) {
        @Override
        public Binary next() {
          return nextBinary();
        }
      };
    default:
      throw new UnsupportedOperationException("Unsupported primitive type: "
          + desc.getPrimitiveType().getPrimitiveTypeName());
  }
}
Example #23
Source File: GlobalDictionaryBuilder.java From dremio-oss with Apache License 2.0

public static Path dictionaryFilePath(Path dictionaryRootDir, ColumnDescriptor columnDescriptor) {
  return dictionaryRootDir.resolve(dictionaryFileName(columnDescriptor));
}
Example #24
Source File: GenericParquetReaders.java From iceberg with Apache License 2.0

private TimestamptzReader(ColumnDescriptor desc) {
  super(desc);
}
Example #25
Source File: VectorizedPageIterator.java From iceberg with Apache License 2.0

@Override
protected void initDefinitionLevelsReader(DataPageV2 dataPageV2, ColumnDescriptor desc) {
  this.vectorizedDefinitionLevelReader = newVectorizedDefinitionLevelReader(desc);
}
Example #26
Source File: GenericParquetWriter.java From iceberg with Apache License 2.0

private TimestamptzWriter(ColumnDescriptor desc) {
  super(desc);
}
Example #27
Source File: FixedByteAlignedReader.java From Bats with Apache License 2.0

FixedByteAlignedReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
    ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v, SchemaElement schemaElement)
    throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
}
Example #28
Source File: GenericParquetReaders.java From iceberg with Apache License 2.0

private TimestamptzMillisReader(ColumnDescriptor desc) {
  super(desc);
}
Example #29
Source File: DefaultValuesWriterFactoryTest.java From parquet-mr with Apache License 2.0

private ColumnDescriptor createColumnDescriptor(PrimitiveTypeName typeName) {
  return createColumnDescriptor(typeName, "fake_" + typeName.name().toLowerCase() + "_col");
}
Example #30
Source File: ParquetValueWriters.java From iceberg with Apache License 2.0

public static UnboxedWriter<Short> shorts(ColumnDescriptor desc) {
  return new ShortWriter(desc);
}