Java Code Examples for org.apache.parquet.hadoop.metadata.BlockMetaData#getRowCount()
The following examples show how to use org.apache.parquet.hadoop.metadata.BlockMetaData#getRowCount().
Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
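Before the project examples, here is a minimal sketch of the basic pattern: open a Parquet file, then sum getRowCount() over all row groups in its footer. This sketch is not taken from any of the projects below; it assumes a reasonably recent parquet-mr release (for ParquetFileReader.open and HadoopInputFile) and a hypothetical input path.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class RowCountExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/data.parquet"); // hypothetical input file
    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
      long totalRows = 0;
      // each BlockMetaData describes one row group; getRowCount() is its row count
      for (BlockMetaData block : reader.getRowGroups()) {
        totalRows += block.getRowCount();
      }
      System.out.println("Total rows: " + totalRows);
    }
  }
}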
Example 1
Source File: ParquetMetricsRowGroupFilter.java, from iceberg (Apache License 2.0)

private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, ParquetConversions.converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visitEvaluator(expr, this);
}
Example 2
Source File: ParquetMetricsRowGroupFilter.java, from iceberg (Apache License 2.0)

private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visit(expr, this);
}
Example 3
Source File: ParquetMetadataConverter.java, from parquet-mr (Apache License 2.0)

public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));
  return fileMetaData;
}
Example 4
Source File: InternalParquetRecordReader.java, from tajo (Apache License 2.0)

public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
Example 5
Source File: ParquetReader.java, from tajo (Apache License 2.0)

private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();

  for (Footer footer : footers) {
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      totalRowCount += block.getRowCount();
    }
  }
}
Example 6
Source File: PrintFooter.java, from parquet-mr (Apache License 2.0)

private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
Example 7
Source File: TestConvertAvroToParquet.java, from nifi (Apache License 2.0)

@Test
public void test_Meta_Info() throws Exception {
  FileInputStream fileInputStream = new FileInputStream(tmpAvro);
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  int readedBytes;
  byte[] buf = new byte[1024];
  while ((readedBytes = fileInputStream.read(buf)) > 0) {
    out.write(buf, 0, readedBytes);
  }
  out.close();

  Map<String, String> attributes = new HashMap<String, String>() {{
    put(CoreAttributes.FILENAME.key(), "test.avro");
  }};
  runner.enqueue(out.toByteArray(), attributes);
  runner.run();

  MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToParquet.SUCCESS).get(0);

  // Save the flowfile
  byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
  FileOutputStream fos = new FileOutputStream(tmpParquet);
  fos.write(resultContents);
  fos.flush();
  fos.close();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);
  ParquetMetadata metaData;
  metaData = ParquetFileReader.readFooter(conf, new Path(tmpParquet.getAbsolutePath()), NO_FILTER);

  // number of records
  long nParquetRecords = 0;
  for (BlockMetaData meta : metaData.getBlocks()) {
    nParquetRecords += meta.getRowCount();
  }
  long nAvroRecord = records.size();

  assertEquals(nParquetRecords, nAvroRecord);
}
Example 8
Source File: ParquetMetadataCommand.java, from parquet-mr (Apache License 2.0)

private void printRowGroup(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
  long start = rowGroup.getStartingPos();
  long rowCount = rowGroup.getRowCount();
  long compressedSize = rowGroup.getCompressedSize();
  long uncompressedSize = rowGroup.getTotalByteSize();
  String filePath = rowGroup.getPath();
  console.info(String.format("\nRow group %d: count: %d %s records start: %d total: %s%s\n%s",
      index, rowCount,
      humanReadable(((float) compressedSize) / rowCount),
      start, humanReadable(compressedSize),
      filePath != null ? " path: " + filePath : "",
      new TextStringBuilder(80).appendPadding(80, '-')));

  int size = maxSize(Iterables.transform(rowGroup.getColumns(),
      new Function<ColumnChunkMetaData, String>() {
        @Override
        public String apply(@Nullable ColumnChunkMetaData input) {
          return input == null ? "" : input.getPath().toDotString();
        }
      }));

  console.info(String.format("%-" + size + "s %-9s %-9s %-9s %-10s %-7s %s",
      "", "type", "encodings", "count", "avg size", "nulls", "min / max"));
  for (ColumnChunkMetaData column : rowGroup.getColumns()) {
    printColumnChunk(console, size, column, schema);
  }
}
Example 9
Source File: ParquetFileReader.java, from parquet-mr (Apache License 2.0)

public long getRecordCount() {
  long total = 0;
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  return total;
}
Example 10
Source File: RowCountCommand.java, from parquet-mr (Apache License 2.0)

@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  long rowCount = 0;

  for (FileStatus fs : inputFileStatuses) {
    long fileRowCount = 0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        rowCount += b.getRowCount();
        fileRowCount += b.getRowCount();
      }
    }
    if (options.hasOption('d')) {
      out.format("%s row count: %d\n", fs.getPath().getName(), fileRowCount);
    }
  }

  out.format("Total RowCount: %d", rowCount);
  out.println();
}
Example 11
Source File: MetadataUtils.java, from parquet-mr (Apache License 2.0)

private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) {
  long rows = meta.getRowCount();
  long tbs = meta.getTotalByteSize();
  long offset = meta.getStartingPos();

  out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", (num == null ? "" : " " + num), rows, tbs, offset);
  out.rule('-');
  showDetails(out, meta.getColumns());
}
Example 12
Source File: ParquetColumnarRowSplitReader.java, from flink (Apache License 2.0)

/**
 * Seek to a particular row number.
 */
public void seekToRow(long rowCount) throws IOException {
  if (totalCountLoadedSoFar != 0) {
    throw new UnsupportedOperationException("Only support seek at first.");
  }

  List<BlockMetaData> blockMetaData = reader.getRowGroups();

  for (BlockMetaData metaData : blockMetaData) {
    if (metaData.getRowCount() > rowCount) {
      break;
    } else {
      reader.skipNextRowGroup();
      rowsReturned += metaData.getRowCount();
      totalCountLoadedSoFar += metaData.getRowCount();
      rowsInBatch = (int) metaData.getRowCount();
      nextRow = (int) metaData.getRowCount();
      rowCount -= metaData.getRowCount();
    }
  }

  for (int i = 0; i < rowCount; i++) {
    boolean end = reachedEnd();
    if (end) {
      throw new RuntimeException("Seek to many rows.");
    }
    nextRecord();
  }
}
Example 13
Source File: ParquetColumnarRowSplitReader.java, from flink (Apache License 2.0)

public ParquetColumnarRowSplitReader(
    boolean utcTimestamp,
    boolean caseSensitive,
    Configuration conf,
    LogicalType[] selectedTypes,
    String[] selectedFieldNames,
    ColumnBatchGenerator generator,
    int batchSize,
    Path path,
    long splitStart,
    long splitLength) throws IOException {
  this.utcTimestamp = utcTimestamp;
  this.selectedTypes = selectedTypes;
  this.batchSize = batchSize;
  // then we need to apply the predicate push down filter
  ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength));
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(conf);
  List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);

  this.fileSchema = footer.getFileMetaData().getSchema();
  this.requestedSchema = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive);
  this.reader = new ParquetFileReader(
      conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());

  long totalRowCount = 0;
  for (BlockMetaData block : blocks) {
    totalRowCount += block.getRowCount();
  }
  this.totalRowCount = totalRowCount;
  this.nextRow = 0;
  this.rowsInBatch = 0;
  this.rowsReturned = 0;

  checkSchema();

  this.writableVectors = createWritableVectors();
  this.columnarBatch = generator.generate(createReadableVectors());
  this.row = new ColumnarRowData(columnarBatch);
}
Example 14
Source File: ParquetFileReader.java, from parquet-mr (Apache License 2.0)

/**
 * Reads all the columns requested from the row group at the current file position.
 * @throws IOException if an error occurs while reading
 * @return the PageReadStore which can provide PageReaders for each column.
 */
public PageReadStore readNextRowGroup() throws IOException {
  if (currentBlock == blocks.size()) {
    return null;
  }
  BlockMetaData block = blocks.get(currentBlock);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  this.currentRowGroup = new ColumnChunkPageReadStore(block.getRowCount());
  // prepare the list of consecutive parts to read them in one scan
  List<ConsecutivePartList> allParts = new ArrayList<ConsecutivePartList>();
  ConsecutivePartList currentParts = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    BenchmarkCounter.incrementTotalBytes(mc.getTotalSize());
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      long startingPos = mc.getStartingPos();
      // first part or not consecutive => new list
      if (currentParts == null || currentParts.endPos() != startingPos) {
        currentParts = new ConsecutivePartList(startingPos);
        allParts.add(currentParts);
      }
      currentParts.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int) mc.getTotalSize()));
    }
  }
  // actually read all the chunks
  ChunkListBuilder builder = new ChunkListBuilder();
  for (ConsecutivePartList consecutiveChunks : allParts) {
    consecutiveChunks.readAll(f, builder);
  }
  for (Chunk chunk : builder.build()) {
    currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
  }

  // avoid re-reading bytes the dictionary reader is used after this call
  if (nextDictionaryReader != null) {
    nextDictionaryReader.setRowGroup(currentRowGroup);
  }

  advanceToNextBlock();

  return currentRowGroup;
}
Example 15
Source File: ParquetFileReader.java, from parquet-mr (Apache License 2.0)

/**
 * Reads all the columns requested from the row group at the current file position. It may skip specific pages based
 * on the column indexes according to the actual filter. As the rows are not aligned among the pages of the different
 * columns row synchronization might be required. See the documentation of the class SynchronizingColumnReader for
 * details.
 *
 * @return the PageReadStore which can provide PageReaders for each column
 * @throws IOException if any I/O error occurs while reading
 */
public PageReadStore readNextFilteredRowGroup() throws IOException {
  if (currentBlock == blocks.size()) {
    return null;
  }
  if (!options.useColumnIndexFilter()) {
    return readNextRowGroup();
  }
  BlockMetaData block = blocks.get(currentBlock);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  ColumnIndexStore ciStore = getColumnIndexStore(currentBlock);
  RowRanges rowRanges = getRowRanges(currentBlock);
  long rowCount = rowRanges.rowCount();
  if (rowCount == 0) {
    // There are no matching rows -> skipping this row-group
    advanceToNextBlock();
    return readNextFilteredRowGroup();
  }
  if (rowCount == block.getRowCount()) {
    // All rows are matching -> fall back to the non-filtering path
    return readNextRowGroup();
  }
  this.currentRowGroup = new ColumnChunkPageReadStore(rowRanges);
  // prepare the list of consecutive parts to read them in one scan
  ChunkListBuilder builder = new ChunkListBuilder();
  List<ConsecutivePartList> allParts = new ArrayList<ConsecutivePartList>();
  ConsecutivePartList currentParts = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());
      OffsetIndex filteredOffsetIndex = filterOffsetIndex(offsetIndex, rowRanges, block.getRowCount());
      for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) {
        BenchmarkCounter.incrementTotalBytes(range.getLength());
        long startingPos = range.getOffset();
        // first part or not consecutive => new list
        if (currentParts == null || currentParts.endPos() != startingPos) {
          currentParts = new ConsecutivePartList(startingPos);
          allParts.add(currentParts);
        }
        ChunkDescriptor chunkDescriptor = new ChunkDescriptor(columnDescriptor, mc, startingPos, (int) range.getLength());
        currentParts.addChunk(chunkDescriptor);
        builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex);
      }
    }
  }
  // actually read all the chunks
  for (ConsecutivePartList consecutiveChunks : allParts) {
    consecutiveChunks.readAll(f, builder);
  }
  for (Chunk chunk : builder.build()) {
    currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
  }

  // avoid re-reading bytes the dictionary reader is used after this call
  if (nextDictionaryReader != null) {
    nextDictionaryReader.setRowGroup(currentRowGroup);
  }

  advanceToNextBlock();

  return currentRowGroup;
}
Example 16
Source File: Metadata.java, from dremio-oss (Apache License 2.0)

private ParquetFileMetadata getParquetFileMetadata(FileAttributes file, AtomicInteger currentNumSplits, long maxSplits) throws IOException {
  final ParquetMetadata metadata =
      SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER, maxFooterLength);
  final int numSplits = currentNumSplits.addAndGet(metadata.getBlocks().size());
  if (numSplits > maxSplits) {
    throw new TooManySplitsException(
        String.format("Too many splits encountered when processing parquet metadata at file %s, maximum is %d but encountered %d splits thus far.",
            file.getPath(), maxSplits, numSplits));
  }

  final MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  schema.getPaths();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
      ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if (logger.isDebugEnabled()) {
    logger.debug(containsCorruptDates.toString());
  }
  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  int rowGroupIdx = 0;
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;

      // statistics might just have the non-null counts with no min/max they might be
      // initialized to zero instead of null.
      // check statistics actually have non null values (or) column has all nulls.
      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty()
          && (col.getStatistics().hasNonNullValue()) || col.getStatistics().getNumNulls() == rowGroup.getRowCount());

      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));

      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal==maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null
            && stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        // log it under trace to avoid lot of log entries.
        logger.trace("Stats are not available for column {}, rowGroupIdx {}, file {}",
            columnSchemaName, rowGroupIdx, file.getPath());
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(fs, file, rowGroup.getStartingPos(), length), columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
    rowGroupIdx++;
  }

  return new ParquetFileMetadata(file, file.size(), rowGroupMetadataList, columnTypeInfo);
}
Example 17
Source File: ParquetMetadataConverter.java, from parquet-mr (Apache License 2.0)

private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) {
  //rowGroup.total_byte_size = ;
  List<ColumnChunkMetaData> columns = block.getColumns();
  List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
  for (ColumnChunkMetaData columnMetaData : columns) {
    ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset
    columnChunk.file_path = block.getPath(); // they are in the same file for now
    columnChunk.meta_data = new ColumnMetaData(
        getType(columnMetaData.getType()),
        toFormatEncodings(columnMetaData.getEncodings()),
        Arrays.asList(columnMetaData.getPath().toArray()),
        toFormatCodec(columnMetaData.getCodec()),
        columnMetaData.getValueCount(),
        columnMetaData.getTotalUncompressedSize(),
        columnMetaData.getTotalSize(),
        columnMetaData.getFirstDataPageOffset());
    if (columnMetaData.getEncodingStats() != null && columnMetaData.getEncodingStats().hasDictionaryPages()) {
      columnChunk.meta_data.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset());
    }
    columnChunk.meta_data.setBloom_filter_offset(columnMetaData.getBloomFilterOffset());
    if (!columnMetaData.getStatistics().isEmpty()) {
      columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics(), this.statisticsTruncateLength));
    }
    if (columnMetaData.getEncodingStats() != null) {
      columnChunk.meta_data.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats()));
    }
    // columnChunk.meta_data.index_page_offset = ;
    // columnChunk.meta_data.key_value_metadata = ; // nothing yet

    IndexReference columnIndexRef = columnMetaData.getColumnIndexReference();
    if (columnIndexRef != null) {
      columnChunk.setColumn_index_offset(columnIndexRef.getOffset());
      columnChunk.setColumn_index_length(columnIndexRef.getLength());
    }
    IndexReference offsetIndexRef = columnMetaData.getOffsetIndexReference();
    if (offsetIndexRef != null) {
      columnChunk.setOffset_index_offset(offsetIndexRef.getOffset());
      columnChunk.setOffset_index_length(offsetIndexRef.getLength());
    }

    parquetColumns.add(columnChunk);
  }
  RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
  rowGroups.add(rowGroup);
}
Example 18
Source File: ParquetReader.java, from iceberg (Apache License 2.0)

@SuppressWarnings("unchecked")
ReadConf(InputFile file, ParquetReadOptions options, Schema expectedSchema, Expression filter,
         Function<MessageType, ParquetValueReader<?>> readerFunc, boolean reuseContainers) {
  this.file = file;
  this.options = options;
  this.reader = newReader(file, options);

  MessageType fileSchema = reader.getFileMetaData().getSchema();

  boolean hasIds = hasIds(fileSchema);
  MessageType typeWithIds = hasIds ? fileSchema : addFallbackIds(fileSchema);

  this.projection = hasIds ?
      pruneColumns(fileSchema, expectedSchema) :
      pruneColumnsFallback(fileSchema, expectedSchema);
  this.model = (ParquetValueReader<T>) readerFunc.apply(typeWithIds);
  this.rowGroups = reader.getRowGroups();
  this.shouldSkip = new boolean[rowGroups.size()];

  ParquetMetricsRowGroupFilter statsFilter = null;
  ParquetDictionaryRowGroupFilter dictFilter = null;
  if (filter != null) {
    statsFilter = new ParquetMetricsRowGroupFilter(expectedSchema, filter);
    dictFilter = new ParquetDictionaryRowGroupFilter(expectedSchema, filter);
  }

  long totalValues = 0L;
  for (int i = 0; i < shouldSkip.length; i += 1) {
    BlockMetaData rowGroup = rowGroups.get(i);
    boolean shouldRead = filter == null || (
        statsFilter.shouldRead(typeWithIds, rowGroup) &&
        dictFilter.shouldRead(typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup)));
    this.shouldSkip[i] = !shouldRead;
    if (shouldRead) {
      totalValues += rowGroup.getRowCount();
    }
  }

  this.totalValues = totalValues;
  this.reuseContainers = reuseContainers;
}
Example 19
Source File: ReadConf.java, from iceberg (Apache License 2.0)

@SuppressWarnings("unchecked")
ReadConf(InputFile file, ParquetReadOptions options, Schema expectedSchema, Expression filter,
         Function<MessageType, ParquetValueReader<?>> readerFunc,
         Function<MessageType, VectorizedReader<?>> batchedReaderFunc,
         NameMapping nameMapping, boolean reuseContainers,
         boolean caseSensitive, Integer bSize) {
  this.file = file;
  this.options = options;
  this.reader = newReader(file, options);
  MessageType fileSchema = reader.getFileMetaData().getSchema();

  MessageType typeWithIds;
  if (ParquetSchemaUtil.hasIds(fileSchema)) {
    typeWithIds = fileSchema;
    this.projection = ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema);
  } else if (nameMapping != null) {
    typeWithIds = ParquetSchemaUtil.applyNameMapping(fileSchema, nameMapping);
    this.projection = ParquetSchemaUtil.pruneColumns(typeWithIds, expectedSchema);
  } else {
    typeWithIds = ParquetSchemaUtil.addFallbackIds(fileSchema);
    this.projection = ParquetSchemaUtil.pruneColumnsFallback(fileSchema, expectedSchema);
  }

  this.rowGroups = reader.getRowGroups();
  this.shouldSkip = new boolean[rowGroups.size()];

  ParquetMetricsRowGroupFilter statsFilter = null;
  ParquetDictionaryRowGroupFilter dictFilter = null;
  if (filter != null) {
    statsFilter = new ParquetMetricsRowGroupFilter(expectedSchema, filter, caseSensitive);
    dictFilter = new ParquetDictionaryRowGroupFilter(expectedSchema, filter, caseSensitive);
  }

  long computedTotalValues = 0L;
  for (int i = 0; i < shouldSkip.length; i += 1) {
    BlockMetaData rowGroup = rowGroups.get(i);
    boolean shouldRead = filter == null || (
        statsFilter.shouldRead(typeWithIds, rowGroup) &&
        dictFilter.shouldRead(typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup)));
    this.shouldSkip[i] = !shouldRead;
    if (shouldRead) {
      computedTotalValues += rowGroup.getRowCount();
    }
  }
  this.totalValues = computedTotalValues;

  if (readerFunc != null) {
    this.model = (ParquetValueReader<T>) readerFunc.apply(typeWithIds);
    this.vectorizedModel = null;
    this.columnChunkMetaDataForRowGroups = null;
  } else {
    this.model = null;
    this.vectorizedModel = (VectorizedReader<T>) batchedReaderFunc.apply(typeWithIds);
    this.columnChunkMetaDataForRowGroups = getColumnChunkMetadataForRowGroups();
  }

  this.reuseContainers = reuseContainers;
  this.batchSize = bSize;
}