org.apache.parquet.hadoop.metadata.FileMetaData Java Examples
The following examples show how to use org.apache.parquet.hadoop.metadata.FileMetaData.
Each snippet is taken from an open-source project; the source file, project, and license are noted above each example.
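FileMetaData carries the file-level footer information of a Parquet file: the message schema, the application-supplied key/value metadata, and the created-by string of the writer that produced the file. Before the project-sourced examples, here is a minimal sketch of reading it from an existing file (the class name and path argument are illustrative; it assumes a Hadoop-accessible Parquet file):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class ShowFileMetaData {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]);  // e.g. file:///tmp/data.parquet

    // open the file, parse the footer, and print the file-level metadata
    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
      FileMetaData meta = reader.getFooter().getFileMetaData();
      System.out.println("created by: " + meta.getCreatedBy());
      System.out.println("schema:     " + meta.getSchema());
      System.out.println("extra:      " + meta.getKeyValueMetaData());
    }
  }
}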
Example #1
Source File: HiveProtoParquetWriterWithOffsetTest.java From garmadon with Apache License 2.0
@Before
public void setup() throws IOException {
  protoParquetWriterWithOffset = mock(ProtoParquetWriterWithOffset.class);
  hiveClient = mock(HiveClient.class);

  when(protoParquetWriterWithOffset.getEventName()).thenReturn(eventName);
  when(protoParquetWriterWithOffset.getFinalHdfsDir()).thenReturn(finalPath);

  ProtoParquetWriter<Message> writerMock = mock(ProtoParquetWriter.class);
  when(protoParquetWriterWithOffset.getWriter()).thenReturn(writerMock);

  ParquetMetadata parquetMetadata = mock(ParquetMetadata.class);
  when(writerMock.getFooter()).thenReturn(parquetMetadata);

  PrimitiveType appId = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.BINARY, "app_id");
  schema = new MessageType("fs", appId);
  FileMetaData fileMetaData = new FileMetaData(schema, new HashMap<String, String>(), "test");
  when(parquetMetadata.getFileMetaData()).thenReturn(fileMetaData);

  when(protoParquetWriterWithOffset.getDayStartTime()).thenReturn(LocalDateTime.of(2019, 9, 10, 10, 10, 10));
}
Example #2
Source File: ParquetHdfsFileSink.java From components with Apache License 2.0
@Override
protected void mergeOutput(FileSystem fs, String sourceFolder, String targetFile) throws IOException {
  FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
  List<Path> sourceFiles = new ArrayList<>();
  for (FileStatus sourceStatus : sourceStatuses) {
    sourceFiles.add(sourceStatus.getPath());
  }
  FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(sourceFiles, fs.getConf()).getFileMetaData();
  ParquetFileWriter writer = new ParquetFileWriter(fs.getConf(), mergedMeta.getSchema(),
      new Path(targetFile), ParquetFileWriter.Mode.CREATE);
  writer.start();
  for (Path input : sourceFiles) {
    writer.appendFile(fs.getConf(), input);
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
Example #3
Source File: InternalParquetRecordReader.java From tajo with Apache License 2.0
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
Example #4
Source File: MetadataUtils.java From parquet-mr with Apache License 2.0
public static void showDetails(PrettyPrintWriter out, FileMetaData meta) {
  out.format("creator: %s%n", meta.getCreatedBy());

  Map<String,String> extra = meta.getKeyValueMetaData();
  if (extra != null) {
    for (Map.Entry<String,String> entry : meta.getKeyValueMetaData().entrySet()) {
      out.print("extra: ");
      out.incrementTabLevel();
      out.format("%s = %s%n", entry.getKey(), entry.getValue());
      out.decrementTabLevel();
    }
  }

  out.println();
  out.format("file schema: %s%n", meta.getSchema().getName());
  out.rule('-');
  showDetails(out, meta.getSchema());
}
Example #5
Source File: MetadataUtils.java From parquet-mr with Apache License 2.0
static void showDetails(PrettyPrintWriter out, FileMetaData meta, boolean showOriginalTypes) {
  out.format("creator: %s%n", meta.getCreatedBy());

  Map<String,String> extra = meta.getKeyValueMetaData();
  if (extra != null) {
    for (Map.Entry<String,String> entry : meta.getKeyValueMetaData().entrySet()) {
      out.print("extra: ");
      out.incrementTabLevel();
      out.format("%s = %s%n", entry.getKey(), entry.getValue());
      out.decrementTabLevel();
    }
  }

  out.println();
  out.format("file schema: %s%n", meta.getSchema().getName());
  out.rule('-');
  showDetails(out, meta.getSchema(), showOriginalTypes);
}
Example #6
Source File: ParquetRecordReader.java From parquet-mr with Apache License 2.0
private void checkDeltaByteArrayProblem(FileMetaData meta, Configuration conf, BlockMetaData block) {
  // splitting files?
  if (conf.getBoolean(ParquetInputFormat.SPLIT_FILES, true)) {
    // this is okay if not using DELTA_BYTE_ARRAY with the bug
    Set<Encoding> encodings = new HashSet<Encoding>();
    for (ColumnChunkMetaData column : block.getColumns()) {
      encodings.addAll(column.getEncodings());
    }
    for (Encoding encoding : encodings) {
      if (CorruptDeltaByteArrays.requiresSequentialReads(meta.getCreatedBy(), encoding)) {
        throw new ParquetDecodingException("Cannot read data due to " +
            "PARQUET-246: to read safely, set " + SPLIT_FILES + " to false");
      }
    }
  }
}
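For context: PARQUET-246 tracked a writer bug in early parquet-mr releases where the DELTA_BYTE_ARRAY encoder did not reset its state between pages, so a page could only be decoded after the pages before it in the same column chunk. Reading such a chunk from the middle of a file split could silently produce corrupt values, which is why the check above (keyed off the footer's created-by string) refuses to split affected files.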
Example #7
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata
 * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
 * @param files a list of files to merge metadata from
 * @param conf a configuration
 * @return merged parquet metadata for the files
 * @throws IOException if there is an error while writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();

  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }

  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
Example #8
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
Example #9
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
Example #10
Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), path, ParquetMetadataConverter.NO_FILTER);

  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(
        meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);

    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(
        meta.getSchema().getColumns(),
        new Predicate<ColumnDescriptor>() {
          @Override
          public boolean apply(@Nullable ColumnDescriptor input) {
            return input != null && input.getType() == BINARY;
          }
        }));

    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(),
        fakeMeta, path, footer.getBlocks(), columns);

    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null;
           pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }

  return null;
}
Example #11
Source File: ParquetRecordReader.java From flink with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration) {
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  // real schema of parquet file
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), readSchema));

  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.recordMaterializer = readSupport.prepareForRead(
      configuration, fileMetadata, readSchema, readContext);
  this.numTotalRecords = reader.getRecordCount();
}
Example #12
Source File: ParquetRecordReaderWrapper.java From parquet-mr with Apache License 2.0
/**
 * gets a ParquetInputSplit corresponding to a split given by Hive
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
protected ParquetInputSplit getSplit(
    final InputSplit oldSplit,
    final JobConf conf) throws IOException {
  if (oldSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) oldSplit;
    final long splitStart = fileSplit.getStart();
    final long splitLength = fileSplit.getLength();
    final Path finalPath = fileSplit.getPath();
    final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());

    final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
    final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();

    final ReadContext readContext = new DataWritableReadSupport()
        .init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());
    schemaSize = MessageTypeParser.parseMessageType(
        readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)
        ).getFieldCount();

    return new ParquetInputSplit(
        finalPath,
        splitStart,
        splitStart + splitLength,
        splitLength,
        fileSplit.getLocations(),
        null);
  } else {
    throw new IllegalArgumentException("Unknown split type: " + oldSplit);
  }
}
Example #13
Source File: TestInputFormat.java From parquet-mr with Apache License 2.0
@Before
public void setUp() {
  blocks = new ArrayList<BlockMetaData>();
  for (int i = 0; i < 10; i++) {
    blocks.add(newBlock(i * 10, 10));
  }
  schema = MessageTypeParser.parseMessageType("message doc { required binary foo; }");
  fileMetaData = new FileMetaData(schema, new HashMap<String, String>(), "parquet-mr");
}
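The fixtures in Examples #1 and #13 construct a FileMetaData directly from its three constructor arguments: a MessageType schema, a key/value metadata map, and a created-by string. A minimal standalone sketch of the same construction, using the Types builder instead of the schema parser (the class name and values here are illustrative, not taken from the projects above):

import java.util.Collections;

import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

public class FileMetaDataSketch {
  public static void main(String[] args) {
    // equivalent to MessageTypeParser.parseMessageType("message doc { required binary foo; }")
    MessageType schema = Types.buildMessage()
        .required(PrimitiveTypeName.BINARY).named("foo")
        .named("doc");

    // schema + application key/value metadata + created-by string
    FileMetaData meta = new FileMetaData(schema, Collections.<String, String>emptyMap(), "test");
    System.out.println(meta.getSchema());
    System.out.println(meta.getCreatedBy());
  }
}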
Example #14
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
static GlobalMetaData mergeInto(
    FileMetaData toMerge,
    GlobalMetaData mergedMetadata,
    boolean strict) {
  MessageType schema = null;
  Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
  Set<String> createdBy = new HashSet<String>();
  if (mergedMetadata != null) {
    schema = mergedMetadata.getSchema();
    newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
    createdBy.addAll(mergedMetadata.getCreatedBy());
  }
  if ((schema == null && toMerge.getSchema() != null)
      || (schema != null && !schema.equals(toMerge.getSchema()))) {
    schema = mergeInto(toMerge.getSchema(), schema, strict);
  }
  for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
    Set<String> values = newKeyValues.get(entry.getKey());
    if (values == null) {
      values = new LinkedHashSet<String>();
      newKeyValues.put(entry.getKey(), values);
    }
    values.add(entry.getValue());
  }
  createdBy.add(toMerge.getCreatedBy());
  return new GlobalMetaData(
      schema,
      newKeyValues,
      createdBy);
}
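Note that each key's values are accumulated into a LinkedHashSet rather than overwritten, so differing values from different files survive side by side in the GlobalMetaData; it is GlobalMetaData.merge() (used in Example #7) that later collapses them back into a single FileMetaData, throwing if they are not compatible.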
Example #15
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
private static void serializeFooter(ParquetMetadata footer, PositionOutputStream out) throws IOException {
  long footerIndex = out.getPos();
  ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
  org.apache.parquet.format.FileMetaData parquetMetadata = metadataConverter.toParquetMetadata(CURRENT_VERSION, footer);
  writeFileMetaData(parquetMetadata, out);
  LOG.debug("{}: footer length = {}", out.getPos(), (out.getPos() - footerIndex));
  BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex));
  out.write(MAGIC);
}
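This also documents the physical layout of the end of a Parquet file: the serialized file metadata, followed by a 4-byte little-endian footer length and the trailing "PAR1" magic bytes.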
Example #16
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
/**
 * ends a file once all blocks have been written.
 * closes the file.
 * @param extraMetaData the extra meta data to write in the footer
 * @throws IOException if there is an error while writing
 */
public void end(Map<String, String> extraMetaData) throws IOException {
  state = state.end();
  serializeColumnIndexes(columnIndexes, blocks, out);
  serializeOffsetIndexes(offsetIndexes, blocks, out);
  serializeBloomFilters(bloomFilters, blocks, out);
  LOG.debug("{}: end", out.getPos());
  this.footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
  serializeFooter(footer, out);
  out.close();
}
Example #17
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
Example #18
Source File: MergeCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  // Prepare arguments
  List<String> args = options.getArgList();
  List<Path> inputFiles = getInputFiles(args.subList(0, args.size() - 1));
  Path outputFile = new Path(args.get(args.size() - 1));

  // Merge schema and extraMeta
  FileMetaData mergedMeta = mergedMetadata(inputFiles);
  PrintWriter out = new PrintWriter(Main.out, true);

  // Merge data
  ParquetFileWriter writer = new ParquetFileWriter(conf,
      mergedMeta.getSchema(), outputFile, ParquetFileWriter.Mode.CREATE);
  writer.start();
  boolean tooSmallFilesMerged = false;
  for (Path input : inputFiles) {
    if (input.getFileSystem(conf).getFileStatus(input).getLen() < TOO_SMALL_FILE_THRESHOLD) {
      out.format("Warning: file %s is too small, length: %d\n", input,
          input.getFileSystem(conf).getFileStatus(input).getLen());
      tooSmallFilesMerged = true;
    }

    writer.appendFile(HadoopInputFile.fromPath(input, conf));
  }

  if (tooSmallFilesMerged) {
    out.println("Warning: you merged too small files. " +
        "Although the size of the merged file is bigger, it STILL contains small row groups, " +
        "thus you don't have the advantage of big row groups, " +
        "which usually leads to bad query performance!");
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
Example #19
Source File: PruneColumnsCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  List<String> args = options.getArgList();
  Path inputFile = new Path(args.get(0));
  Path outputFile = new Path(args.get(1));
  List<String> cols = args.subList(2, args.size());

  Set<ColumnPath> prunePaths = convertToColumnPaths(cols);

  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, inputFile, ParquetMetadataConverter.NO_FILTER);
  FileMetaData metaData = pmd.getFileMetaData();
  MessageType schema = metaData.getSchema();
  List<String> paths = new ArrayList<>();
  getPaths(schema, paths, null);

  for (String col : cols) {
    if (!paths.contains(col)) {
      LOG.warn("Input column name {} doesn't show up in the schema of file {}", col, inputFile.getName());
    }
  }

  ParquetFileWriter writer = new ParquetFileWriter(conf,
      pruneColumnsInSchema(schema, prunePaths), outputFile, ParquetFileWriter.Mode.CREATE);
  writer.start();
  writer.appendFile(HadoopInputFile.fromPath(inputFile, conf));
  writer.end(metaData.getKeyValueMetaData());
}
Example #20
Source File: ParquetFileAccessor.java From pxf with Apache License 2.0
/**
 * Reads the original schema from the parquet file.
 *
 * @param parquetFile the path to the parquet file
 * @param fileSplit the file split we are accessing
 * @return the original schema from the parquet file
 * @throws IOException when there's an IOException while reading the schema
 */
private MessageType getSchema(Path parquetFile, FileSplit fileSplit) throws IOException {
  final long then = System.nanoTime();
  ParquetMetadataConverter.MetadataFilter filter = ParquetMetadataConverter.range(
      fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength());
  ParquetReadOptions parquetReadOptions = HadoopReadOptions
      .builder(configuration)
      .withMetadataFilter(filter)
      .build();
  HadoopInputFile inputFile = HadoopInputFile.fromPath(parquetFile, configuration);
  try (ParquetFileReader parquetFileReader =
           ParquetFileReader.open(inputFile, parquetReadOptions)) {
    FileMetaData metadata = parquetFileReader.getFileMetaData();
    if (LOG.isDebugEnabled()) {
      LOG.debug("{}-{}: Reading file {} with {} records in {} RowGroups",
          context.getTransactionId(), context.getSegmentId(), parquetFile.getName(),
          parquetFileReader.getRecordCount(), parquetFileReader.getRowGroups().size());
    }

    final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - then);
    LOG.debug("{}-{}: Read schema in {} ms", context.getTransactionId(),
        context.getSegmentId(), millis);
    return metadata.getSchema();
  } catch (Exception e) {
    throw new IOException(e);
  }
}
Example #21
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
public FileMetaData getFileMetaData() {
  if (fileMetaData != null) {
    return fileMetaData;
  }
  return getFooter().getFileMetaData();
}
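In other words, a reader constructed with explicit metadata (as in the deprecated constructor of Example #9) returns it directly, and only otherwise falls back to the metadata parsed from the file footer.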
Example #22
Source File: ColumnIndexValidator.java From parquet-mr with Apache License 2.0
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
  List<ContractViolation> violations = new ArrayList<>();
  try (ParquetFileReader reader = ParquetFileReader.open(file)) {
    FileMetaData meta = reader.getFooter().getFileMetaData();
    MessageType schema = meta.getSchema();
    List<ColumnDescriptor> columns = schema.getColumns();

    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    int rowGroupNumber = 0;
    PageReadStore rowGroup = reader.readNextRowGroup();
    while (rowGroup != null) {
      ColumnReadStore columnReadStore = new ColumnReadStoreImpl(rowGroup,
          new DummyRecordConverter(schema).getRootConverter(), schema, null);
      List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
      assert (columnChunks.size() == columns.size());
      for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
        ColumnDescriptor column = columns.get(columnNumber);
        ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
        ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
        if (columnIndex == null) {
          continue;
        }
        ColumnPath columnPath = columnChunk.getPath();
        OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
        List<ByteBuffer> minValues = columnIndex.getMinValues();
        List<ByteBuffer> maxValues = columnIndex.getMaxValues();
        BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
        List<Long> nullCounts = columnIndex.getNullCounts();
        List<Boolean> nullPages = columnIndex.getNullPages();
        long rowNumber = 0;
        ColumnReader columnReader = columnReadStore.getColumnReader(column);
        ByteBuffer prevMinValue = null;
        ByteBuffer prevMaxValue = null;
        for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
          boolean isNullPage = nullPages.get(pageNumber);
          ByteBuffer minValue = minValues.get(pageNumber);
          ByteBuffer maxValue = maxValues.get(pageNumber);
          PageValidator pageValidator = new PageValidator(
              column.getPrimitiveType(),
              rowGroupNumber, columnNumber, columnPath, pageNumber,
              violations, columnReader,
              minValue, maxValue,
              prevMinValue, prevMaxValue,
              boundaryOrder,
              nullCounts.get(pageNumber), isNullPage);
          if (!isNullPage) {
            prevMinValue = minValue;
            prevMaxValue = maxValue;
          }
          long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
          while (rowNumber <= lastRowNumberInPage) {
            pageValidator.validateValuesBelongingToRow();
            ++rowNumber;
          }
          pageValidator.finishPage();
        }
      }
      rowGroup = reader.readNextRowGroup();
      rowGroupNumber++;
    }
  }
  return violations;
}
Example #23
Source File: MergeCommand.java From parquet-mr with Apache License 2.0
private FileMetaData mergedMetadata(List<Path> inputFiles) throws IOException {
  return ParquetFileWriter.mergeMetadataFiles(inputFiles, conf).getFileMetaData();
}
Example #24
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
/**
 * Will return the result of merging toMerge into mergedMetadata
 * @param toMerge the metadata toMerge
 * @param mergedMetadata the reference metadata to merge into
 * @return the result of the merge
 */
static GlobalMetaData mergeInto(
    FileMetaData toMerge,
    GlobalMetaData mergedMetadata) {
  return mergeInto(toMerge, mergedMetadata, true);
}