org.apache.parquet.hadoop.metadata.ParquetMetadata Java Examples
The following examples show how to use
org.apache.parquet.hadoop.metadata.ParquetMetadata.
The original project and source file are noted above each example.
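Before the individual examples, here is a minimal orientation sketch showing the pattern most of them share: read a file's footer into a ParquetMetadata object, then inspect the file-level schema and the row-group (block) metadata. This is not taken from any of the projects below; it is a composite sketch, the file path is a placeholder, and ParquetFileReader.readFooter is the call used throughout these examples even though it is deprecated in newer parquet-mr releases.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

public class FooterInspector {

  // Minimal sketch: read the footer of a Parquet file and print basic metadata.
  // The path "/tmp/example.parquet" is a placeholder, not a file from the examples.
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/example.parquet");

    // readFooter(conf, path, filter) is the (deprecated) API used by most examples below.
    ParquetMetadata footer =
        ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);

    // File-level metadata: schema, created-by string, key/value metadata.
    MessageType schema = footer.getFileMetaData().getSchema();
    System.out.println("Schema: " + schema);
    System.out.println("Created by: " + footer.getFileMetaData().getCreatedBy());

    // Row-group (block) metadata: row counts and per-column chunk sizes.
    for (BlockMetaData block : footer.getBlocks()) {
      System.out.println("Row group with " + block.getRowCount() + " rows");
      for (ColumnChunkMetaData column : block.getColumns()) {
        System.out.println("  " + column.getPath().toDotString()
            + ": " + column.getTotalSize() + " bytes");
      }
    }
  }
}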
Example #1
Source File: ParquetReaderUtility.java From Bats with Apache License 2.0
/**
 * Map full schema paths in format `a`.`b`.`c` to respective SchemaElement objects.
 *
 * @param footer Parquet file metadata
 * @return schema full path to SchemaElement map
 */
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  Map<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

  Iterator<SchemaElement> iter = fileMetaData.getSchema().iterator();

  // First element in collection is default `root` element. We skip it to maintain key in `a` format instead of `root`.`a`,
  // and thus to avoid the need to cut it out again when comparing with SchemaPath string representation
  if (iter.hasNext()) {
    iter.next();
  }

  while (iter.hasNext()) {
    addSchemaElementMapping(iter, new StringBuilder(), schemaElements);
  }
  return schemaElements;
}
Example #2
Source File: TestParquetReader.java From dremio-oss with Apache License 2.0
@Test
public void testArrowSchemaOldInFooter() throws Exception {
  URL badparquet = getClass().getResource("/types.parquet");

  Path filePathBad = Path.of(badparquet.toURI());
  ParquetMetadata parquetMetadataBad =
      SingletonParquetFooterCache.readFooter(localFs, filePathBad, ParquetMetadataConverter.NO_FILTER,
          ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  Map<String, String> metadataBad = parquetMetadataBad.getFileMetaData().getKeyValueMetaData();

  // should have DREMIO_ARROW_SCHEMA field, but no DREMIO_ARROW_SCHEMA_2_1
  assertTrue(metadataBad.containsKey(DREMIO_ARROW_SCHEMA));
  assertFalse(metadataBad.containsKey(DREMIO_ARROW_SCHEMA_2_1));

  try {
    DremioArrowSchema.fromMetaData(metadataBad);
    fail("Should not be able to process arrow schema");
  } catch (Exception e) {
    // ok
  }
}
Example #3
Source File: Schemas.java From parquet-mr with Apache License 2.0
public static Schema fromParquet(Configuration conf, URI location) throws IOException {
  Path path = new Path(location);
  FileSystem fs = path.getFileSystem(conf);

  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), path);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
Example #4
Source File: TestParquetReader.java From dremio-oss with Apache License 2.0
@Test
public void testArrowSchema210InFooter() throws Exception {
  URL parquet210 = getClass().getResource("/dremio-region-210.parquet");
  Path filePath210 = Path.of(parquet210.toURI());
  ParquetMetadata parquetMetadata210 =
      SingletonParquetFooterCache.readFooter(localFs, filePath210, ParquetMetadataConverter.NO_FILTER,
          ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  Map<String, String> metadata210 = parquetMetadata210.getFileMetaData().getKeyValueMetaData();

  // should not have DREMIO_ARROW_SCHEMA field, but should have DREMIO_ARROW_SCHEMA_2_1
  assertFalse(metadata210.containsKey(DREMIO_ARROW_SCHEMA));
  assertTrue(metadata210.containsKey(DREMIO_ARROW_SCHEMA_2_1));

  Schema schema210 = DremioArrowSchema.fromMetaData(metadata210);

  assertNotNull(schema210);
}
Example #5
Source File: TestStatistics.java From parquet-mr with Apache License 2.0
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
      metadata.getFileMetaData(),
      super.fsPath,
      metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
Example #6
Source File: ParquetReaderUtility.java From dremio-oss with Apache License 2.0
/**
 * Get the list of row group numbers for given file input split. Logic used here is same as how Hive's parquet input
 * format finds the row group numbers for input split.
 */
public static List<Integer> getRowGroupNumbersFromFileSplit(final long splitStart, final long splitLength,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
Example #7
Source File: HiveProtoParquetWriterWithOffsetTest.java From garmadon with Apache License 2.0
@Before
public void setup() throws IOException {
  protoParquetWriterWithOffset = mock(ProtoParquetWriterWithOffset.class);
  hiveClient = mock(HiveClient.class);

  when(protoParquetWriterWithOffset.getEventName()).thenReturn(eventName);
  when(protoParquetWriterWithOffset.getFinalHdfsDir()).thenReturn(finalPath);

  ProtoParquetWriter<Message> writerMock = mock(ProtoParquetWriter.class);
  when(protoParquetWriterWithOffset.getWriter()).thenReturn(writerMock);

  ParquetMetadata parquetMetadata = mock(ParquetMetadata.class);
  when(writerMock.getFooter()).thenReturn(parquetMetadata);

  PrimitiveType appId = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.BINARY, "app_id");
  schema = new MessageType("fs", appId);
  FileMetaData fileMetaData = new FileMetaData(schema, new HashMap<String, String>(), "test");
  when(parquetMetadata.getFileMetaData()).thenReturn(fileMetaData);

  when(protoParquetWriterWithOffset.getDayStartTime()).thenReturn(LocalDateTime.of(2019, 9, 10, 10, 10, 10));
}
Example #8
Source File: CompressionConverter.java From parquet-mr with Apache License 2.0
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta,
                          MessageType schema, String createdBy, CompressionCodecName codecName) throws IOException {
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    List<ColumnChunkMetaData> columnsInOrder = blockMetaData.getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
        Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
Example #9
Source File: DumpCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];

  Configuration conf = new Configuration();
  Path inpath = new Path(input);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();

  boolean showmd = !options.hasOption('m');
  boolean showdt = !options.hasOption('d');
  boolean cropoutput = !options.hasOption('n');

  Set<String> showColumns = null;
  if (options.hasOption('c')) {
    String[] cols = options.getOptionValues('c');
    showColumns = new HashSet<String>(Arrays.asList(cols));
  }

  PrettyPrintWriter out = prettyPrintWriter(cropoutput);
  dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
Example #10
Source File: TransCompressionCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  List<String> args = options.getArgList();
  Path inPath = new Path(args.get(0));
  Path outPath = new Path(args.get(1));
  CompressionCodecName codecName = CompressionCodecName.valueOf(args.get(2));

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
Example #11
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata
 * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
 * @param files a list of files to merge metadata from
 * @param conf a configuration
 * @return merged parquet metadata for the files
 * @throws IOException if there is an error while writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();

  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }

  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
Example #12
Source File: PentahoApacheInputFormat.java From pentaho-hadoop-shims with Apache License 2.0
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), conf );
    FileStatus fileStatus = fs.getFileStatus( filePath );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.buildInputFields( schema );
    }
  } );
}
Example #13
Source File: TestParquetWriterAppendBlocks.java From parquet-mr with Apache License 2.0
@Test
public void testFailDroppingColumns() throws IOException {
  MessageType droppedColumnSchema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("string")
      .named("AppendTest");

  final ParquetMetadata footer = ParquetFileReader.readFooter(
      CONF, file1, NO_FILTER);
  final FSDataInputStream incoming = file1.getFileSystem(CONF).open(file1);

  Path droppedColumnFile = newTemp();
  final ParquetFileWriter writer = new ParquetFileWriter(
      CONF, droppedColumnSchema, droppedColumnFile);
  writer.start();

  TestUtils.assertThrows("Should complain that id column is dropped",
      IllegalArgumentException.class, (Callable<Void>) () -> {
        writer.appendRowGroups(incoming, footer.getBlocks(), false);
        return null;
      });
}
Example #14
Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0
@Test
public void testPruneMultiColumns() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove columns
  String cargs[] = {inputFile, outputFile, "Name", "Gender"};
  executeCommandLine(cargs);

  // Verify the schema are not changed for the columns not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 2);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Links");

  List<Type> subFields = fields.get(1).asGroupType().getFields();
  assertEquals(subFields.size(), 2);
  assertEquals(subFields.get(0).getName(), "Backward");
  assertEquals(subFields.get(1).getName(), "Forward");

  // Verify the data are not changed for the columns not pruned
  List<String> prunePaths = Arrays.asList("Name", "Gender");
  validateColumns(inputFile, prunePaths);
}
Example #15
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Test
public void testParquetMetadataConverterWithoutDictionary() throws IOException {
  ParquetMetadata parquetMetaData = createParquetMetaData(null, Encoding.PLAIN);

  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

  // Flag should be false
  fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> {
    assertFalse(column.meta_data.isSetDictionary_page_offset());
  }));

  ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream();
  Util.writeFileMetaData(fmd1, metaDataOutputStream);
  ByteArrayInputStream metaDataInputStream =
      new ByteArrayInputStream(metaDataOutputStream.toByteArray());
  FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream);
  ParquetMetadata pmd2 = converter.fromParquetMetadata(fmd2);

  long dicOffsetConverted = pmd2.getBlocks().get(0).getColumns().get(0).getDictionaryPageOffset();

  Assert.assertEquals(0, dicOffsetConverted);
}
Example #16
Source File: ParquetUtils.java From incubator-pinot with Apache License 2.0
/**
 * Returns the schema for the given Parquet file path.
 */
public static Schema getParquetSchema(Path path) throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(getConfiguration(), path, ParquetMetadataConverter.NO_FILTER);
  Map<String, String> metaData = footer.getFileMetaData().getKeyValueMetaData();
  String schemaString = metaData.get("parquet.avro.schema");
  if (schemaString == null) {
    // Try the older property
    schemaString = metaData.get("avro.schema");
  }
  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
  }
}
Example #17
Source File: PrintFooter.java From parquet-mr with Apache License 2.0
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;

    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();

    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
Example #18
Source File: MetadataUtils.java From parquet-mr with Apache License 2.0
static void showDetails(PrettyPrintWriter out, ParquetMetadata meta, boolean showOriginalTypes) {
  showDetails(out, meta.getFileMetaData(), showOriginalTypes);

  long i = 1;
  for (BlockMetaData bmeta : meta.getBlocks()) {
    out.println();
    showDetails(out, bmeta, i++);
  }
}
Example #19
Source File: StreamPerColumnProvider.java From dremio-oss with Apache License 2.0
@Override
public ParquetMetadata getFooter() throws IOException {
  if(footer == null) {
    SingletonParquetFooterCache footerCache = new SingletonParquetFooterCache();
    footer = footerCache.getFooter(getStream(null), path.toString(), length, fs, maxFooterLen);
  }
  return footer;
}
Example #20
Source File: DictionaryFilterTest.java From parquet-mr with Apache License 2.0
@Before
public void setUp() throws Exception {
  reader = ParquetFileReader.open(conf, file);
  ParquetMetadata meta = reader.getFooter();
  ccmd = meta.getBlocks().get(0).getColumns();
  dictionaries = reader.getDictionaryReader(meta.getBlocks().get(0));
}
Example #21
Source File: ColumnSizeCommand.java From parquet-mr with Apache License 2.0
public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
Example #22
Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0
@Test
public void testPruneOneColumn() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove column
  String cargs[] = {inputFile, outputFile, "Gender"};
  executeCommandLine(cargs);

  // Verify the schema are not changed for the columns not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 3);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Name");
  assertEquals(fields.get(2).getName(), "Links");

  List<Type> subFields = fields.get(2).asGroupType().getFields();
  assertEquals(subFields.size(), 2);
  assertEquals(subFields.get(0).getName(), "Backward");
  assertEquals(subFields.get(1).getName(), "Forward");

  // Verify the data are not changed for the columns not pruned
  List<String> prunePaths = Arrays.asList("Gender");
  validateColumns(inputFile, prunePaths);
}
Example #23
Source File: ParquetReaderFactory.java From dremio-oss with Apache License 2.0
RecordReader newReader(OperatorContext context,
                       ParquetScanProjectedColumns projectedColumns,
                       String path,
                       CompressionCodecFactory codecFactory,
                       List<ParquetFilterCondition> conditions,
                       ParquetFilterCreator filterCreator,
                       ParquetDictionaryConvertor dictionaryConvertor,
                       boolean enableDetailedTracing,
                       ParquetMetadata footer,
                       int rowGroupIndex,
                       SimpleIntVector deltas,
                       SchemaDerivationHelper schemaHelper,
                       InputStreamProvider inputStreamProvider);
Example #24
Source File: TestConvertAvroToParquet.java From nifi with Apache License 2.0
@Test
public void test_Meta_Info() throws Exception {
  FileInputStream fileInputStream = new FileInputStream(tmpAvro);
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  int readedBytes;
  byte[] buf = new byte[1024];
  while ((readedBytes = fileInputStream.read(buf)) > 0) {
    out.write(buf, 0, readedBytes);
  }
  out.close();

  Map<String, String> attributes = new HashMap<String, String>() {{
    put(CoreAttributes.FILENAME.key(), "test.avro");
  }};
  runner.enqueue(out.toByteArray(), attributes);
  runner.run();

  MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToParquet.SUCCESS).get(0);

  // Save the flowfile
  byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
  FileOutputStream fos = new FileOutputStream(tmpParquet);
  fos.write(resultContents);
  fos.flush();
  fos.close();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);
  ParquetMetadata metaData;
  metaData = ParquetFileReader.readFooter(conf, new Path(tmpParquet.getAbsolutePath()), NO_FILTER);

  // #number of records
  long nParquetRecords = 0;
  for(BlockMetaData meta : metaData.getBlocks()){
    nParquetRecords += meta.getRowCount();
  }
  long nAvroRecord = records.size();

  assertEquals(nParquetRecords, nAvroRecord);
}
Example #25
Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0
private static List<PageReadStore> readBlocksFromFile(Path file) throws IOException {
  List<PageReadStore> rowGroups = new ArrayList<PageReadStore>();

  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration, file, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader fileReader = new ParquetFileReader(configuration, metadata.getFileMetaData(), file, metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageReadStore group;
  while ((group = fileReader.readNextRowGroup()) != null) {
    rowGroups.add(group);
  }

  return rowGroups;
}
Example #26
Source File: TestParquetWriter.java From dremio-oss with Apache License 2.0
public void runTestAndValidate(String selection, String validationSelection, String inputTable, String outputFile, boolean sort) throws Exception {
  try {
    deleteTableIfExists(outputFile);
    test("use dfs_test");
//    test("ALTER SESSION SET \"planner.add_producer_consumer\" = false");
    String query = select(selection, inputTable, sort);
    System.out.println(outputFile);
    String create = "CREATE TABLE " + outputFile + " AS " + query;
    String validateQuery = select(validationSelection, outputFile, sort);
    test(create);
    test(validateQuery); // TODO: remove
    testBuilder()
        .unOrdered()
        .sqlQuery(validateQuery)
        .sqlBaselineQuery(query)
        .go();

    Configuration hadoopConf = new Configuration();
    Path output = new Path(getDfsTestTmpSchemaLocation(), outputFile);
    FileSystem fs = output.getFileSystem(hadoopConf);
    for (FileStatus file : fs.listStatus(output)) {
      ParquetMetadata footer = ParquetFileReader.readFooter(hadoopConf, file, SKIP_ROW_GROUPS);
      String version = footer.getFileMetaData().getKeyValueMetaData().get(DREMIO_VERSION_PROPERTY);
      assertEquals(DremioVersionInfo.getVersion(), version);
      PageHeaderUtil.validatePageHeaders(file.getPath(), footer);
    }
  } finally {
    deleteTableIfExists(outputFile);
  }
}
Example #27
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
/**
 * ends a file once all blocks have been written.
 * closes the file.
 * @param extraMetaData the extra meta data to write in the footer
 * @throws IOException if there is an error while writing
 */
public void end(Map<String, String> extraMetaData) throws IOException {
  state = state.end();
  serializeColumnIndexes(columnIndexes, blocks, out);
  serializeOffsetIndexes(offsetIndexes, blocks, out);
  serializeBloomFilters(bloomFilters, blocks, out);
  LOG.debug("{}: end", out.getPos());
  this.footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
  serializeFooter(footer, out);
  out.close();
}
Example #28
Source File: ParquetReaderUtility.java From Bats with Apache License 2.0
/**
 * Map full column paths to all ColumnDescriptors in file schema
 *
 * @param footer Parquet file metadata
 * @return column full path to ColumnDescriptor object map
 */
public static Map<String, ColumnDescriptor> getColNameToColumnDescriptorMapping(ParquetMetadata footer) {
  Map<String, ColumnDescriptor> colDescMap = new HashMap<>();
  List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();

  for (ColumnDescriptor column : columns) {
    colDescMap.put(getFullColumnPath(column), column);
  }
  return colDescMap;
}
Example #29
Source File: IcebergParquetReader.java From dremio-oss with Apache License 2.0
public IcebergParquetReader(
    OperatorContext context,
    ParquetReaderFactory readerFactory,
    BatchSchema tableSchema,
    ParquetScanProjectedColumns projectedColumns,
    Map<String, GlobalDictionaryFieldInfo> globalDictionaryFieldInfoMap,
    List<ParquetFilterCondition> filterConditions,
    ParquetProtobuf.ParquetDatasetSplitScanXAttr readEntry,
    FileSystem fs,
    ParquetMetadata footer,
    GlobalDictionaries dictionaries,
    SchemaDerivationHelper schemaHelper,
    boolean vectorize,
    boolean enableDetailedTracing,
    boolean supportsColocatedReads,
    InputStreamProvider inputStreamProvider) {
  this.context = context;
  this.readerFactory = readerFactory;
  this.tableSchema = tableSchema;
  this.projectedColumns = projectedColumns;
  this.globalDictionaryFieldInfoMap = globalDictionaryFieldInfoMap;
  this.filterConditions = filterConditions;
  this.readEntry = readEntry;
  this.fs = fs;
  this.footer = footer;
  this.dictionaries = dictionaries;
  this.schemaHelper = schemaHelper;
  this.vectorize = vectorize;
  this.enableDetailedTracing = enableDetailedTracing;
  this.supportsColocatedReads = supportsColocatedReads;
  this.inputStreamProvider = inputStreamProvider;
}
Example #30
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
  GlobalMetaData fileMetaData = null;
  for (Footer footer : footers) {
    ParquetMetadata currentMetadata = footer.getParquetMetadata();
    fileMetaData = mergeInto(currentMetadata.getFileMetaData(), fileMetaData, strict);
  }
  return fileMetaData;
}