Java Code Examples for org.apache.parquet.hadoop.metadata.BlockMetaData#getRowCount()
The following examples show how to use org.apache.parquet.hadoop.metadata.BlockMetaData#getRowCount().
Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
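Before the project examples, here is a minimal sketch of the basic pattern: open a Parquet file, then sum getRowCount() over all row groups in its footer. This sketch is not taken from any of the projects below; it assumes a reasonably recent parquet-mr release (for ParquetFileReader.open and HadoopInputFile) and a hypothetical input path.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class RowCountExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/data.parquet"); // hypothetical input file
    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
      long totalRows = 0;
      // each BlockMetaData describes one row group; getRowCount() is its row count
      for (BlockMetaData block : reader.getRowGroups()) {
        totalRows += block.getRowCount();
      }
      System.out.println("Total rows: " + totalRows);
    }
  }
}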
Example 1
Source File: ParquetMetricsRowGroupFilter.java, from iceberg (Apache License 2.0)

private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, ParquetConversions.converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visitEvaluator(expr, this);
}
Example 2
Source File: ParquetMetricsRowGroupFilter.java, from iceberg (Apache License 2.0)

private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visit(expr, this);
}
Example 3
Source File: ParquetMetadataConverter.java, from parquet-mr (Apache License 2.0)

public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));
  return fileMetaData;
}
Example 4
Source File: InternalParquetRecordReader.java, from tajo (Apache License 2.0)

public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
Example 5
Source File: ParquetReader.java, from tajo (Apache License 2.0)

private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();

  for (Footer footer : footers) {
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      totalRowCount += block.getRowCount();
    }
  }
}
Example 6
Source File: PrintFooter.java, from parquet-mr (Apache License 2.0)

private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
Example 7
Source File: TestConvertAvroToParquet.java, from nifi (Apache License 2.0)

@Test
public void test_Meta_Info() throws Exception {
  FileInputStream fileInputStream = new FileInputStream(tmpAvro);
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  int readedBytes;
  byte[] buf = new byte[1024];
  while ((readedBytes = fileInputStream.read(buf)) > 0) {
    out.write(buf, 0, readedBytes);
  }
  out.close();

  Map<String, String> attributes = new HashMap<String, String>() {{
    put(CoreAttributes.FILENAME.key(), "test.avro");
  }};
  runner.enqueue(out.toByteArray(), attributes);
  runner.run();

  MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToParquet.SUCCESS).get(0);

  // Save the flowfile
  byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
  FileOutputStream fos = new FileOutputStream(tmpParquet);
  fos.write(resultContents);
  fos.flush();
  fos.close();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);
  ParquetMetadata metaData;
  metaData = ParquetFileReader.readFooter(conf, new Path(tmpParquet.getAbsolutePath()), NO_FILTER);

  // number of records
  long nParquetRecords = 0;
  for (BlockMetaData meta : metaData.getBlocks()) {
    nParquetRecords += meta.getRowCount();
  }
  long nAvroRecord = records.size();

  assertEquals(nParquetRecords, nAvroRecord);
}
Example 8
Source File: ParquetMetadataCommand.java, from parquet-mr (Apache License 2.0)

private void printRowGroup(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
  long start = rowGroup.getStartingPos();
  long rowCount = rowGroup.getRowCount();
  long compressedSize = rowGroup.getCompressedSize();
  long uncompressedSize = rowGroup.getTotalByteSize();
  String filePath = rowGroup.getPath();
  console.info(String.format("\nRow group %d: count: %d %s records start: %d total: %s%s\n%s",
      index, rowCount,
      humanReadable(((float) compressedSize) / rowCount),
      start, humanReadable(compressedSize),
      filePath != null ? " path: " + filePath : "",
      new TextStringBuilder(80).appendPadding(80, '-')));

  int size = maxSize(Iterables.transform(rowGroup.getColumns(),
      new Function<ColumnChunkMetaData, String>() {
        @Override
        public String apply(@Nullable ColumnChunkMetaData input) {
          return input == null ? "" : input.getPath().toDotString();
        }
      }));

  console.info(String.format("%-" + size + "s %-9s %-9s %-9s %-10s %-7s %s",
      "", "type", "encodings", "count", "avg size", "nulls", "min / max"));
  for (ColumnChunkMetaData column : rowGroup.getColumns()) {
    printColumnChunk(console, size, column, schema);
  }
}
Example 9
Source File: ParquetFileReader.java, from parquet-mr (Apache License 2.0)

public long getRecordCount() {
  long total = 0;
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  return total;
}
Example 10
Source File: RowCountCommand.java, from parquet-mr (Apache License 2.0)

@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  long rowCount = 0;

  for (FileStatus fs : inputFileStatuses) {
    long fileRowCount = 0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        rowCount += b.getRowCount();
        fileRowCount += b.getRowCount();
      }
    }
    if (options.hasOption('d')) {
      out.format("%s row count: %d\n", fs.getPath().getName(), fileRowCount);
    }
  }

  out.format("Total RowCount: %d", rowCount);
  out.println();
}
Example 11
Source File: MetadataUtils.java, from parquet-mr (Apache License 2.0)

private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) {
  long rows = meta.getRowCount();
  long tbs = meta.getTotalByteSize();
  long offset = meta.getStartingPos();

  out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", (num == null ? "" : " " + num), rows, tbs, offset);
  out.rule('-');
  showDetails(out, meta.getColumns());
}
Example 12
Source File: ParquetColumnarRowSplitReader.java, from flink (Apache License 2.0)

/**
 * Seek to a particular row number.
 */
public void seekToRow(long rowCount) throws IOException {
  if (totalCountLoadedSoFar != 0) {
    throw new UnsupportedOperationException("Only support seek at first.");
  }

  List<BlockMetaData> blockMetaData = reader.getRowGroups();

  for (BlockMetaData metaData : blockMetaData) {
    if (metaData.getRowCount() > rowCount) {
      break;
    } else {
      reader.skipNextRowGroup();
      rowsReturned += metaData.getRowCount();
      totalCountLoadedSoFar += metaData.getRowCount();
      rowsInBatch = (int) metaData.getRowCount();
      nextRow = (int) metaData.getRowCount();
      rowCount -= metaData.getRowCount();
    }
  }

  for (int i = 0; i < rowCount; i++) {
    boolean end = reachedEnd();
    if (end) {
      throw new RuntimeException("Seek to many rows.");
    }
    nextRecord();
  }
}
Example 13
Source File: ParquetColumnarRowSplitReader.java, from flink (Apache License 2.0)

public ParquetColumnarRowSplitReader(
    boolean utcTimestamp,
    boolean caseSensitive,
    Configuration conf,
    LogicalType[] selectedTypes,
    String[] selectedFieldNames,
    ColumnBatchGenerator generator,
    int batchSize,
    Path path,
    long splitStart,
    long splitLength) throws IOException {
  this.utcTimestamp = utcTimestamp;
  this.selectedTypes = selectedTypes;
  this.batchSize = batchSize;
  // then we need to apply the predicate push down filter
  ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength));
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(conf);
  List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);

  this.fileSchema = footer.getFileMetaData().getSchema();
  this.requestedSchema = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive);
  this.reader = new ParquetFileReader(
      conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());

  long totalRowCount = 0;
  for (BlockMetaData block : blocks) {
    totalRowCount += block.getRowCount();
  }
  this.totalRowCount = totalRowCount;
  this.nextRow = 0;
  this.rowsInBatch = 0;
  this.rowsReturned = 0;

  checkSchema();

  this.writableVectors = createWritableVectors();
  this.columnarBatch = generator.generate(createReadableVectors());
  this.row = new ColumnarRowData(columnarBatch);
}
Example 14
Source File: ParquetFileReader.java, from parquet-mr (Apache License 2.0)

/**
 * Reads all the columns requested from the row group at the current file position.
 * @throws IOException if an error occurs while reading
 * @return the PageReadStore which can provide PageReaders for each column.
 */
public PageReadStore readNextRowGroup() throws IOException {
  if (currentBlock == blocks.size()) {
    return null;
  }
  BlockMetaData block = blocks.get(currentBlock);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  this.currentRowGroup = new ColumnChunkPageReadStore(block.getRowCount());
  // prepare the list of consecutive parts to read them in one scan
  List<ConsecutivePartList> allParts = new ArrayList<ConsecutivePartList>();
  ConsecutivePartList currentParts = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    BenchmarkCounter.incrementTotalBytes(mc.getTotalSize());
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      long startingPos = mc.getStartingPos();
      // first part or not consecutive => new list
      if (currentParts == null || currentParts.endPos() != startingPos) {
        currentParts = new ConsecutivePartList(startingPos);
        allParts.add(currentParts);
      }
      currentParts.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int) mc.getTotalSize()));
    }
  }
  // actually read all the chunks
  ChunkListBuilder builder = new ChunkListBuilder();
  for (ConsecutivePartList consecutiveChunks : allParts) {
    consecutiveChunks.readAll(f, builder);
  }
  for (Chunk chunk : builder.build()) {
    currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
  }

  // avoid re-reading bytes the dictionary reader is used after this call
  if (nextDictionaryReader != null) {
    nextDictionaryReader.setRowGroup(currentRowGroup);
  }

  advanceToNextBlock();

  return currentRowGroup;
}
Example 15
Source File: ParquetFileReader.java, from parquet-mr (Apache License 2.0)

/**
 * Reads all the columns requested from the row group at the current file position. It may skip specific pages based
 * on the column indexes according to the actual filter. As the rows are not aligned among the pages of the different
 * columns row synchronization might be required. See the documentation of the class SynchronizingColumnReader for
 * details.
 *
 * @return the PageReadStore which can provide PageReaders for each column
 * @throws IOException if any I/O error occurs while reading
 */
public PageReadStore readNextFilteredRowGroup() throws IOException {
  if (currentBlock == blocks.size()) {
    return null;
  }
  if (!options.useColumnIndexFilter()) {
    return readNextRowGroup();
  }
  BlockMetaData block = blocks.get(currentBlock);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  ColumnIndexStore ciStore = getColumnIndexStore(currentBlock);
  RowRanges rowRanges = getRowRanges(currentBlock);
  long rowCount = rowRanges.rowCount();
  if (rowCount == 0) {
    // There are no matching rows -> skipping this row-group
    advanceToNextBlock();
    return readNextFilteredRowGroup();
  }
  if (rowCount == block.getRowCount()) {
    // All rows are matching -> fall back to the non-filtering path
    return readNextRowGroup();
  }
  this.currentRowGroup = new ColumnChunkPageReadStore(rowRanges);
  // prepare the list of consecutive parts to read them in one scan
  ChunkListBuilder builder = new ChunkListBuilder();
  List<ConsecutivePartList> allParts = new ArrayList<ConsecutivePartList>();
  ConsecutivePartList currentParts = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());
      OffsetIndex filteredOffsetIndex = filterOffsetIndex(offsetIndex, rowRanges, block.getRowCount());
      for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) {
        BenchmarkCounter.incrementTotalBytes(range.getLength());
        long startingPos = range.getOffset();
        // first part or not consecutive => new list
        if (currentParts == null || currentParts.endPos() != startingPos) {
          currentParts = new ConsecutivePartList(startingPos);
          allParts.add(currentParts);
        }
        ChunkDescriptor chunkDescriptor = new ChunkDescriptor(columnDescriptor, mc, startingPos, (int) range.getLength());
        currentParts.addChunk(chunkDescriptor);
        builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex);
      }
    }
  }
  // actually read all the chunks
  for (ConsecutivePartList consecutiveChunks : allParts) {
    consecutiveChunks.readAll(f, builder);
  }
  for (Chunk chunk : builder.build()) {
    currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
  }

  // avoid re-reading bytes the dictionary reader is used after this call
  if (nextDictionaryReader != null) {
    nextDictionaryReader.setRowGroup(currentRowGroup);
  }

  advanceToNextBlock();

  return currentRowGroup;
}
Example 16
Source File: Metadata.java, from dremio-oss (Apache License 2.0)

private ParquetFileMetadata getParquetFileMetadata(FileAttributes file, AtomicInteger currentNumSplits, long maxSplits) throws IOException {
  final ParquetMetadata metadata =
      SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER, maxFooterLength);
  final int numSplits = currentNumSplits.addAndGet(metadata.getBlocks().size());
  if (numSplits > maxSplits) {
    throw new TooManySplitsException(
        String.format("Too many splits encountered when processing parquet metadata at file %s, maximum is %d but encountered %d splits thus far.",
            file.getPath(), maxSplits, numSplits));
  }

  final MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  schema.getPaths();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
      ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if (logger.isDebugEnabled()) {
    logger.debug(containsCorruptDates.toString());
  }
  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  int rowGroupIdx = 0;
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;

      // statistics might just have the non-null counts with no min/max they might be
      // initialized to zero instead of null.
      // check statistics actually have non null values (or) column has all nulls.
      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty()
          && (col.getStatistics().hasNonNullValue()) || col.getStatistics().getNumNulls() == rowGroup.getRowCount());

      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));

      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal==maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null
            && stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        // log it under trace to avoid lot of log entries.
        logger.trace("Stats are not available for column {}, rowGroupIdx {}, file {}",
            columnSchemaName, rowGroupIdx, file.getPath());
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(fs, file, rowGroup.getStartingPos(), length), columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
    rowGroupIdx++;
  }

  return new ParquetFileMetadata(file, file.size(), rowGroupMetadataList, columnTypeInfo);
}
Example 17
Source File: ParquetMetadataConverter.java, from parquet-mr (Apache License 2.0)

private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) {
  //rowGroup.total_byte_size = ;
  List<ColumnChunkMetaData> columns = block.getColumns();
  List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
  for (ColumnChunkMetaData columnMetaData : columns) {
    ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset
    columnChunk.file_path = block.getPath(); // they are in the same file for now
    columnChunk.meta_data = new ColumnMetaData(
        getType(columnMetaData.getType()),
        toFormatEncodings(columnMetaData.getEncodings()),
        Arrays.asList(columnMetaData.getPath().toArray()),
        toFormatCodec(columnMetaData.getCodec()),
        columnMetaData.getValueCount(),
        columnMetaData.getTotalUncompressedSize(),
        columnMetaData.getTotalSize(),
        columnMetaData.getFirstDataPageOffset());
    if (columnMetaData.getEncodingStats() != null && columnMetaData.getEncodingStats().hasDictionaryPages()) {
      columnChunk.meta_data.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset());
    }
    columnChunk.meta_data.setBloom_filter_offset(columnMetaData.getBloomFilterOffset());
    if (!columnMetaData.getStatistics().isEmpty()) {
      columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics(), this.statisticsTruncateLength));
    }
    if (columnMetaData.getEncodingStats() != null) {
      columnChunk.meta_data.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats()));
    }
    // columnChunk.meta_data.index_page_offset = ;
    // columnChunk.meta_data.key_value_metadata = ; // nothing yet

    IndexReference columnIndexRef = columnMetaData.getColumnIndexReference();
    if (columnIndexRef != null) {
      columnChunk.setColumn_index_offset(columnIndexRef.getOffset());
      columnChunk.setColumn_index_length(columnIndexRef.getLength());
    }
    IndexReference offsetIndexRef = columnMetaData.getOffsetIndexReference();
    if (offsetIndexRef != null) {
      columnChunk.setOffset_index_offset(offsetIndexRef.getOffset());
      columnChunk.setOffset_index_length(offsetIndexRef.getLength());
    }

    parquetColumns.add(columnChunk);
  }
  RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
  rowGroups.add(rowGroup);
}
Example 18
Source File: ParquetReader.java, from iceberg (Apache License 2.0)

@SuppressWarnings("unchecked")
ReadConf(InputFile file, ParquetReadOptions options, Schema expectedSchema, Expression filter,
         Function<MessageType, ParquetValueReader<?>> readerFunc, boolean reuseContainers) {
  this.file = file;
  this.options = options;
  this.reader = newReader(file, options);

  MessageType fileSchema = reader.getFileMetaData().getSchema();

  boolean hasIds = hasIds(fileSchema);
  MessageType typeWithIds = hasIds ? fileSchema : addFallbackIds(fileSchema);

  this.projection = hasIds ?
      pruneColumns(fileSchema, expectedSchema) :
      pruneColumnsFallback(fileSchema, expectedSchema);
  this.model = (ParquetValueReader<T>) readerFunc.apply(typeWithIds);
  this.rowGroups = reader.getRowGroups();
  this.shouldSkip = new boolean[rowGroups.size()];

  ParquetMetricsRowGroupFilter statsFilter = null;
  ParquetDictionaryRowGroupFilter dictFilter = null;
  if (filter != null) {
    statsFilter = new ParquetMetricsRowGroupFilter(expectedSchema, filter);
    dictFilter = new ParquetDictionaryRowGroupFilter(expectedSchema, filter);
  }

  long totalValues = 0L;
  for (int i = 0; i < shouldSkip.length; i += 1) {
    BlockMetaData rowGroup = rowGroups.get(i);
    boolean shouldRead = filter == null || (
        statsFilter.shouldRead(typeWithIds, rowGroup) &&
        dictFilter.shouldRead(typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup)));
    this.shouldSkip[i] = !shouldRead;
    if (shouldRead) {
      totalValues += rowGroup.getRowCount();
    }
  }

  this.totalValues = totalValues;
  this.reuseContainers = reuseContainers;
}
Example 19
Source File: ReadConf.java, from iceberg (Apache License 2.0)

@SuppressWarnings("unchecked")
ReadConf(InputFile file, ParquetReadOptions options, Schema expectedSchema, Expression filter,
         Function<MessageType, ParquetValueReader<?>> readerFunc,
         Function<MessageType, VectorizedReader<?>> batchedReaderFunc,
         NameMapping nameMapping, boolean reuseContainers,
         boolean caseSensitive, Integer bSize) {
  this.file = file;
  this.options = options;
  this.reader = newReader(file, options);
  MessageType fileSchema = reader.getFileMetaData().getSchema();

  MessageType typeWithIds;
  if (ParquetSchemaUtil.hasIds(fileSchema)) {
    typeWithIds = fileSchema;
    this.projection = ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema);
  } else if (nameMapping != null) {
    typeWithIds = ParquetSchemaUtil.applyNameMapping(fileSchema, nameMapping);
    this.projection = ParquetSchemaUtil.pruneColumns(typeWithIds, expectedSchema);
  } else {
    typeWithIds = ParquetSchemaUtil.addFallbackIds(fileSchema);
    this.projection = ParquetSchemaUtil.pruneColumnsFallback(fileSchema, expectedSchema);
  }

  this.rowGroups = reader.getRowGroups();
  this.shouldSkip = new boolean[rowGroups.size()];

  ParquetMetricsRowGroupFilter statsFilter = null;
  ParquetDictionaryRowGroupFilter dictFilter = null;
  if (filter != null) {
    statsFilter = new ParquetMetricsRowGroupFilter(expectedSchema, filter, caseSensitive);
    dictFilter = new ParquetDictionaryRowGroupFilter(expectedSchema, filter, caseSensitive);
  }

  long computedTotalValues = 0L;
  for (int i = 0; i < shouldSkip.length; i += 1) {
    BlockMetaData rowGroup = rowGroups.get(i);
    boolean shouldRead = filter == null || (
        statsFilter.shouldRead(typeWithIds, rowGroup) &&
        dictFilter.shouldRead(typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup)));
    this.shouldSkip[i] = !shouldRead;
    if (shouldRead) {
      computedTotalValues += rowGroup.getRowCount();
    }
  }
  this.totalValues = computedTotalValues;

  if (readerFunc != null) {
    this.model = (ParquetValueReader<T>) readerFunc.apply(typeWithIds);
    this.vectorizedModel = null;
    this.columnChunkMetaDataForRowGroups = null;
  } else {
    this.model = null;
    this.vectorizedModel = (VectorizedReader<T>) batchedReaderFunc.apply(typeWithIds);
    this.columnChunkMetaDataForRowGroups = getColumnChunkMetadataForRowGroups();
  }

  this.reuseContainers = reuseContainers;
  this.batchSize = bSize;
}