Java Code Examples for org.apache.parquet.hadoop.metadata.FileMetaData#getSchema()
The following examples show how to use org.apache.parquet.hadoop.metadata.FileMetaData#getSchema(). Each example comes from an open-source project; the source file, project, and license are noted above each snippet.
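Before the project examples, here is a minimal, self-contained sketch of the typical pattern (not taken from any of the projects below): open a ParquetFileReader, fetch the FileMetaData from the footer, and call getSchema() to obtain the file's MessageType. The class name and the file path "example.parquet" are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

public class GetSchemaExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder path; point this at a real Parquet file.
    Path path = new Path("example.parquet");
    ParquetReadOptions options = HadoopReadOptions.builder(conf).build();
    HadoopInputFile inputFile = HadoopInputFile.fromPath(path, conf);
    try (ParquetFileReader reader = ParquetFileReader.open(inputFile, options)) {
      // FileMetaData#getSchema() returns the file's MessageType schema.
      FileMetaData metaData = reader.getFooter().getFileMetaData();
      MessageType schema = metaData.getSchema();
      System.out.println(schema);
    }
  }
}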
Example 1
Source File: ParquetHdfsFileSink.java From components with Apache License 2.0
@Override
protected void mergeOutput(FileSystem fs, String sourceFolder, String targetFile) throws IOException {
    FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
    List<Path> sourceFiles = new ArrayList<>();
    for (FileStatus sourceStatus : sourceStatuses) {
        sourceFiles.add(sourceStatus.getPath());
    }
    // merge the footers of all source files and reuse the merged schema and key/value metadata for the target file
    FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(sourceFiles, fs.getConf()).getFileMetaData();
    ParquetFileWriter writer = new ParquetFileWriter(fs.getConf(), mergedMeta.getSchema(),
        new Path(targetFile), ParquetFileWriter.Mode.CREATE);
    writer.start();
    for (Path input : sourceFiles) {
        writer.appendFile(fs.getConf(), input);
    }
    writer.end(mergedMeta.getKeyValueMetaData());
}
Example 2
Source File: InternalParquetRecordReader.java From tajo with Apache License 2.0
public void initialize(FileMetaData parquetFileMetadata, Path file,
                       List<BlockMetaData> blocks, Configuration configuration) throws IOException {
    // initialize a ReadContext for this file
    Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
        configuration, toSetMultiMap(fileMetadata), fileSchema));
    this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
    this.requestedSchema = readContext.getRequestedSchema();
    this.fileSchema = parquetFileMetadata.getSchema();
    this.file = file;
    this.columnCount = requestedSchema.getPaths().size();
    this.recordConverter = readSupport.prepareForRead(
        configuration, fileMetadata, fileSchema, readContext);
    this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
    for (BlockMetaData block : blocks) {
      total += block.getRowCount();
    }
    this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
    LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
Example 3
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
    // initialize a ReadContext for this file
    this.reader = reader;
    FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
    this.fileSchema = parquetFileMetadata.getSchema();
    Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
        configuration, toSetMultiMap(fileMetadata), fileSchema));
    this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
    this.requestedSchema = readContext.getRequestedSchema();
    this.columnCount = requestedSchema.getPaths().size();
    // Setting the projection schema before running any filtering (e.g. getting filtered record count)
    // because projection impacts filtering
    reader.setRequestedSchema(requestedSchema);
    this.recordConverter = readSupport.prepareForRead(
        configuration, fileMetadata, fileSchema, readContext);
    this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
    this.total = reader.getFilteredRecordCount();
    this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
    this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
    LOG.info("RecordReader initialized will read a total of {} records.", total);
}
Example 4
Source File: ParquetRecordReader.java From flink with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration) {
    this.reader = reader;
    FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
    // real schema of parquet file
    this.fileSchema = parquetFileMetadata.getSchema();
    Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
        configuration, toSetMultiMap(fileMetadata), readSchema));

    this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
    this.recordMaterializer = readSupport.prepareForRead(
        configuration, fileMetadata, readSchema, readContext);
    this.numTotalRecords = reader.getRecordCount();
}
Example 5
Source File: ParquetFileAccessor.java From pxf with Apache License 2.0
/**
 * Reads the original schema from the parquet file.
 *
 * @param parquetFile the path to the parquet file
 * @param fileSplit   the file split we are accessing
 * @return the original schema from the parquet file
 * @throws IOException when there's an IOException while reading the schema
 */
private MessageType getSchema(Path parquetFile, FileSplit fileSplit) throws IOException {

    final long then = System.nanoTime();
    ParquetMetadataConverter.MetadataFilter filter = ParquetMetadataConverter.range(
            fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength());
    ParquetReadOptions parquetReadOptions = HadoopReadOptions
            .builder(configuration)
            .withMetadataFilter(filter)
            .build();
    HadoopInputFile inputFile = HadoopInputFile.fromPath(parquetFile, configuration);
    try (ParquetFileReader parquetFileReader =
                 ParquetFileReader.open(inputFile, parquetReadOptions)) {
        FileMetaData metadata = parquetFileReader.getFileMetaData();
        if (LOG.isDebugEnabled()) {
            LOG.debug("{}-{}: Reading file {} with {} records in {} RowGroups",
                    context.getTransactionId(), context.getSegmentId(), parquetFile.getName(),
                    parquetFileReader.getRecordCount(), parquetFileReader.getRowGroups().size());
        }
        final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - then);
        LOG.debug("{}-{}: Read schema in {} ms",
                context.getTransactionId(), context.getSegmentId(), millis);
        return metadata.getSchema();
    } catch (Exception e) {
        throw new IOException(e);
    }
}
Example 6
Source File: PruneColumnsCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
    List<String> args = options.getArgList();
    Path inputFile = new Path(args.get(0));
    Path outputFile = new Path(args.get(1));
    List<String> cols = args.subList(2, args.size());
    Set<ColumnPath> prunePaths = convertToColumnPaths(cols);

    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, inputFile, ParquetMetadataConverter.NO_FILTER);
    FileMetaData metaData = pmd.getFileMetaData();
    MessageType schema = metaData.getSchema();
    List<String> paths = new ArrayList<>();
    getPaths(schema, paths, null);

    for (String col : cols) {
      if (!paths.contains(col)) {
        LOG.warn("Input column name {} doesn't show up in the schema of file {}", col, inputFile.getName());
      }
    }

    ParquetFileWriter writer = new ParquetFileWriter(conf,
        pruneColumnsInSchema(schema, prunePaths), outputFile, ParquetFileWriter.Mode.CREATE);
    writer.start();
    writer.appendFile(HadoopInputFile.fromPath(inputFile, conf));
    writer.end(metaData.getKeyValueMetaData());
}
Example 7
Source File: MergeCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
    // Prepare arguments
    List<String> args = options.getArgList();
    List<Path> inputFiles = getInputFiles(args.subList(0, args.size() - 1));
    Path outputFile = new Path(args.get(args.size() - 1));

    // Merge schema and extraMeta
    FileMetaData mergedMeta = mergedMetadata(inputFiles);
    PrintWriter out = new PrintWriter(Main.out, true);

    // Merge data
    ParquetFileWriter writer = new ParquetFileWriter(conf,
        mergedMeta.getSchema(), outputFile, ParquetFileWriter.Mode.CREATE);
    writer.start();
    boolean tooSmallFilesMerged = false;
    for (Path input: inputFiles) {
      if (input.getFileSystem(conf).getFileStatus(input).getLen() < TOO_SMALL_FILE_THRESHOLD) {
        out.format("Warning: file %s is too small, length: %d\n",
            input,
            input.getFileSystem(conf).getFileStatus(input).getLen());
        tooSmallFilesMerged = true;
      }

      writer.appendFile(HadoopInputFile.fromPath(input, conf));
    }

    if (tooSmallFilesMerged) {
      out.println("Warning: you merged too small files. " +
          "Although the size of the merged file is bigger, it STILL contains small row groups, thus you don't have the advantage of big row groups, " +
          "which usually leads to bad query performance!");
    }
    writer.end(mergedMeta.getKeyValueMetaData());
}
Example 8
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
    // copy custom configuration to the Configuration passed to the ReadSupport
    Configuration conf = new Configuration();
    if (options instanceof HadoopReadOptions) {
      conf = ((HadoopReadOptions) options).getConf();
    }
    for (String property : options.getPropertyNames()) {
      conf.set(property, options.getProperty(property));
    }

    // initialize a ReadContext for this file
    this.reader = reader;
    FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
    this.fileSchema = parquetFileMetadata.getSchema();
    Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
    this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
    this.requestedSchema = readContext.getRequestedSchema();
    this.columnCount = requestedSchema.getPaths().size();
    // Setting the projection schema before running any filtering (e.g. getting filtered record count)
    // because projection impacts filtering
    reader.setRequestedSchema(requestedSchema);
    this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
    this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
    this.total = reader.getFilteredRecordCount();
    this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
    this.filterRecords = options.useRecordFilter();
    LOG.info("RecordReader initialized will read a total of {} records.", total);
}
Example 9
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
static GlobalMetaData mergeInto(
    FileMetaData toMerge,
    GlobalMetaData mergedMetadata,
    boolean strict) {
  MessageType schema = null;
  Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
  Set<String> createdBy = new HashSet<String>();
  if (mergedMetadata != null) {
    schema = mergedMetadata.getSchema();
    newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
    createdBy.addAll(mergedMetadata.getCreatedBy());
  }
  if ((schema == null && toMerge.getSchema() != null)
      || (schema != null && !schema.equals(toMerge.getSchema()))) {
    schema = mergeInto(toMerge.getSchema(), schema, strict);
  }
  for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
    Set<String> values = newKeyValues.get(entry.getKey());
    if (values == null) {
      values = new LinkedHashSet<String>();
      newKeyValues.put(entry.getKey(), values);
    }
    values.add(entry.getValue());
  }
  createdBy.add(toMerge.getCreatedBy());
  return new GlobalMetaData(
      schema,
      newKeyValues,
      createdBy);
}
Example 10
Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0
private String check(String file) throws IOException {
    Path path = qualifiedPath(file);
    ParquetMetadata footer = ParquetFileReader.readFooter(
        getConf(), path, ParquetMetadataConverter.NO_FILTER);

    FileMetaData meta = footer.getFileMetaData();
    String createdBy = meta.getCreatedBy();
    if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
      // create fake metadata that will read corrupt stats and return them
      FileMetaData fakeMeta = new FileMetaData(
          meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);

      // get just the binary columns
      List<ColumnDescriptor> columns = Lists.newArrayList();
      Iterables.addAll(columns, Iterables.filter(
          meta.getSchema().getColumns(),
          new Predicate<ColumnDescriptor>() {
            @Override
            public boolean apply(@Nullable ColumnDescriptor input) {
              return input != null && input.getType() == BINARY;
            }
          }));

      // now check to see if the data is actually corrupt
      ParquetFileReader reader = new ParquetFileReader(getConf(),
          fakeMeta, path, footer.getBlocks(), columns);

      try {
        PageStatsValidator validator = new PageStatsValidator();
        for (PageReadStore pages = reader.readNextRowGroup(); pages != null;
             pages = reader.readNextRowGroup()) {
          validator.validate(columns, pages);
        }
      } catch (BadStatsException e) {
        return e.getMessage();
      }
    }

    return null;
}
Example 11
Source File: ColumnIndexValidator.java From parquet-mr with Apache License 2.0
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
  List<ContractViolation> violations = new ArrayList<>();
  try (ParquetFileReader reader = ParquetFileReader.open(file)) {
    FileMetaData meta = reader.getFooter().getFileMetaData();
    MessageType schema = meta.getSchema();
    List<ColumnDescriptor> columns = schema.getColumns();

    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    int rowGroupNumber = 0;
    PageReadStore rowGroup = reader.readNextRowGroup();
    while (rowGroup != null) {
      ColumnReadStore columnReadStore = new ColumnReadStoreImpl(rowGroup,
          new DummyRecordConverter(schema).getRootConverter(), schema, null);
      List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
      assert (columnChunks.size() == columns.size());
      for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
        ColumnDescriptor column = columns.get(columnNumber);
        ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
        ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
        if (columnIndex == null) {
          continue;
        }
        ColumnPath columnPath = columnChunk.getPath();
        OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
        List<ByteBuffer> minValues = columnIndex.getMinValues();
        List<ByteBuffer> maxValues = columnIndex.getMaxValues();
        BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
        List<Long> nullCounts = columnIndex.getNullCounts();
        List<Boolean> nullPages = columnIndex.getNullPages();
        long rowNumber = 0;
        ColumnReader columnReader = columnReadStore.getColumnReader(column);
        ByteBuffer prevMinValue = null;
        ByteBuffer prevMaxValue = null;
        for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
          boolean isNullPage = nullPages.get(pageNumber);
          ByteBuffer minValue = minValues.get(pageNumber);
          ByteBuffer maxValue = maxValues.get(pageNumber);
          PageValidator pageValidator = new PageValidator(
              column.getPrimitiveType(),
              rowGroupNumber, columnNumber, columnPath, pageNumber,
              violations, columnReader,
              minValue,
              maxValue,
              prevMinValue,
              prevMaxValue,
              boundaryOrder,
              nullCounts.get(pageNumber),
              isNullPage);
          if (!isNullPage) {
            prevMinValue = minValue;
            prevMaxValue = maxValue;
          }
          long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
          while (rowNumber <= lastRowNumberInPage) {
            pageValidator.validateValuesBelongingToRow();
            ++rowNumber;
          }
          pageValidator.finishPage();
        }
      }
      rowGroup = reader.readNextRowGroup();
      rowGroupNumber++;
    }
  }
  return violations;
}