org.apache.parquet.hadoop.ParquetFileReader Java Examples
The following examples show how to use
org.apache.parquet.hadoop.ParquetFileReader.
Each example notes the source file and the project it comes from.
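Before the individual examples, here is a minimal sketch of the access pattern most of them share: open a reader, inspect the footer metadata, then iterate row groups and materialize records. It is assembled only from API calls that appear in the examples below (ParquetFileReader.open, HadoopInputFile.fromPath, getFooter, readNextRowGroup); the file path and the printing of records are placeholders, not taken from any of the projects listed here.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

import java.io.IOException;

public class ParquetFileReaderSketch {
  public static void main(String[] args) throws IOException {
    // Hypothetical input path; replace with a real Parquet file.
    Path path = new Path("/tmp/example.parquet");
    Configuration conf = new Configuration();

    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
      // Footer metadata: file schema, key/value metadata, row group info.
      ParquetMetadata footer = reader.getFooter();
      MessageType schema = footer.getFileMetaData().getSchema();
      System.out.println(schema);

      // Row group data: iterate row groups and materialize records.
      MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
      PageReadStore rowGroup;
      while ((rowGroup = reader.readNextRowGroup()) != null) {
        RecordReader<?> recordReader =
            columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
        for (long i = 0; i < rowGroup.getRowCount(); i++) {
          System.out.println(recordReader.read());
        }
      }
    }
  }
}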
Example #1
Source File: TransCompressionCommand.java From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(input != null && output != null,
      "Both input and output parquet file paths are required.");
  Preconditions.checkArgument(codec != null,
      "The codec cannot be null");

  Path inPath = new Path(input);
  Path outPath = new Path(output);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(getConf(), inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(getConf(), schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, getConf()),
      HadoopReadOptions.builder(getConf()).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
  return 0;
}
Example #2
Source File: HdfsOffsetComputer.java From garmadon with Apache License 2.0
protected Long getMaxOffset(Map<String, FinalEventPartitionFile> dateFinalEventPartitionFile) {
  // Get max offset from all files for a partition
  return dateFinalEventPartitionFile
      .values()
      .stream()
      .flatMap(finalEventPartitionFile -> {
        try (ParquetFileReader pFR = ParquetFileReader.open(fs.getConf(), finalEventPartitionFile.getFilePath())) {
          return pFR.getFooter().getBlocks().stream();
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      })
      .map(b -> b.getColumns().stream()
          .filter(column -> Arrays.stream(column.getPath().toArray()).allMatch(path -> path.equals("kafka_offset")))
          .findFirst()
          .map(ColumnChunkMetaData::getStatistics)
          .map(Statistics::genericGetMax)
          .map(Long.class::cast)
          .orElse(NO_OFFSET))
      .mapToLong(Long::longValue)
      .max()
      .orElse(NO_OFFSET);
}
Example #3
Source File: Schemas.java From parquet-mr with Apache License 2.0
public static Schema fromParquet(Configuration conf, URI location) throws IOException {
  Path path = new Path(location);
  FileSystem fs = path.getFileSystem(conf);

  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), path);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
Example #4
Source File: ShowMetaCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  boolean showOriginalTypes = options.hasOption('o');

  Configuration conf = new Configuration();
  Path inputPath = new Path(input);
  FileStatus inputFileStatus = inputPath.getFileSystem(conf).getFileStatus(inputPath);
  List<Footer> footers = ParquetFileReader.readFooters(conf, inputFileStatus, false);

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
      .withColumnPadding(1)
      .build();

  for (Footer f : footers) {
    out.format("file: %s%n", f.getFile());
    MetadataUtils.showDetails(out, f.getParquetMetadata(), showOriginalTypes);
    out.flushColumns();
  }
}
Example #5
Source File: TransCompressionCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  List<String> args = options.getArgList();
  Path inPath = new Path(args.get(0));
  Path outPath = new Path(args.get(1));
  CompressionCodecName codecName = CompressionCodecName.valueOf(args.get(2));

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf),
      HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
Example #6
Source File: ParquetUtils.java From incubator-pinot with Apache License 2.0
/**
 * Returns the schema for the given Parquet file path.
 */
public static Schema getParquetSchema(Path path) throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(getConfiguration(), path, ParquetMetadataConverter.NO_FILTER);
  Map<String, String> metaData = footer.getFileMetaData().getKeyValueMetaData();
  String schemaString = metaData.get("parquet.avro.schema");
  if (schemaString == null) {
    // Try the older property
    schemaString = metaData.get("avro.schema");
  }
  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
  }
}
Example #7
Source File: TestMetricsRowGroupFilterTypes.java From iceberg with Apache License 2.0
public void createParquetInputFile(List<Record> records) throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericParquetWriter::buildWriter)
      .build()) {
    appender.addAll(records);
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);
  try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
    Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
    rowGroupMetadata = reader.getRowGroups().get(0);
    parquetSchema = reader.getFileMetaData().getSchema();
  }

  PARQUET_FILE.deleteOnExit();
}
Example #8
Source File: ParquetResolverTest.java From pxf with Apache License 2.0
@SuppressWarnings("deprecation") private List<Group> readParquetFile(String file, long expectedSize, MessageType schema) throws IOException { List<Group> result = new ArrayList<>(); String parquetFile = Objects.requireNonNull(getClass().getClassLoader().getResource("parquet/" + file)).getPath(); Path path = new Path(parquetFile); ParquetFileReader fileReader = new ParquetFileReader(new Configuration(), path, ParquetMetadataConverter.NO_FILTER); PageReadStore rowGroup; while ((rowGroup = fileReader.readNextRowGroup()) != null) { MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema); RecordReader<Group> recordReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema)); long rowCount = rowGroup.getRowCount(); for (long i = 0; i < rowCount; i++) { result.add(recordReader.read()); } } fileReader.close(); assertEquals(expectedSize, result.size()); return result; }
Example #9
Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0
@Test
public void testPruneMultiColumns() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove columns
  String cargs[] = {inputFile, outputFile, "Name", "Gender"};
  executeCommandLine(cargs);

  // Verify the schema are not changed for the columns not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 2);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Links");
  List<Type> subFields = fields.get(1).asGroupType().getFields();
  assertEquals(subFields.size(), 2);
  assertEquals(subFields.get(0).getName(), "Backward");
  assertEquals(subFields.get(1).getName(), "Forward");

  // Verify the data are not changed for the columns not pruned
  List<String> prunePaths = Arrays.asList("Name", "Gender");
  validateColumns(inputFile, prunePaths);
}
Example #10
Source File: PentahoApacheInputFormat.java From pentaho-hadoop-shims with Apache License 2.0
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), conf );
    FileStatus fileStatus = fs.getFileStatus( filePath );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.buildInputFields( schema );
    }
  } );
}
Example #11
Source File: CompressionConveterTest.java From parquet-mr with Apache License 2.0
private void convertCompression(Configuration conf, String inputFile, String outputFile, String codec) throws IOException {
  Path inPath = new Path(inputFile);
  Path outPath = new Path(outputFile);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf),
      HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
Example #12
Source File: SchemaCommand.java From parquet-mr with Apache License 2.0
private String getParquetSchema(String source) throws IOException {
  Formats.Format format;
  try (SeekableInput in = openSeekable(source)) {
    format = Formats.detectFormat((InputStream) in);
    in.seek(0);

    switch (format) {
      case PARQUET:
        return new ParquetFileReader(
            getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)
            .getFileMetaData().getSchema().toString();
      default:
        throw new IllegalArgumentException(String.format(
            "Could not get a Parquet schema for format %s: %s", format, source));
    }
  }
}
Example #13
Source File: ParquetReader.java From tajo with Apache License 2.0
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();

  for (Footer footer : footers) {
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      totalRowCount += block.getRowCount();
    }
  }
}
Example #14
Source File: TestStatistics.java From parquet-mr with Apache License 2.0
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
      metadata.getFileMetaData(),
      super.fsPath,
      metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
Example #15
Source File: Schemas.java From kite with Apache License 2.0
public static Schema fromParquet(FileSystem fs, Path location) throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), location);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
Example #16
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0
@Test
public void testReadSimpleGroup() throws IOException {
  Long[] array = {1L};
  GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
      .set("bar", "test")
      .set("foo", 32L)
      .set("arr", array).build();

  Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, Collections.singletonList(record));
  MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(3, row.getArity());
  assertEquals(32L, row.getField(0));
  assertEquals("test", row.getField(1));
  assertArrayEquals(array, (Long[]) row.getField(2));
  assertTrue(rowReader.reachEnd());
}
Example #17
Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0
@Test
public void testPruneNestedColumn() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove nested column
  String cargs[] = {inputFile, outputFile, "Links.Backward"};
  executeCommandLine(cargs);

  // Verify the schema are not changed for the columns not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 4);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Name");
  assertEquals(fields.get(2).getName(), "Gender");
  assertEquals(fields.get(3).getName(), "Links");
  List<Type> subFields = fields.get(3).asGroupType().getFields();
  assertEquals(subFields.size(), 1);
  assertEquals(subFields.get(0).getName(), "Forward");

  // Verify the data are not changed for the columns not pruned
  List<String> prunePaths = Arrays.asList("Links.Backward");
  validateColumns(inputFile, prunePaths);
}
Example #18
Source File: DictionaryFilterTest.java From parquet-mr with Apache License 2.0
@Before
public void setUp() throws Exception {
  reader = ParquetFileReader.open(conf, file);
  ParquetMetadata meta = reader.getFooter();
  ccmd = meta.getBlocks().get(0).getColumns();
  dictionaries = reader.getDictionaryReader(meta.getBlocks().get(0));
}
Example #19
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0
@Test
public void testMapGroup() throws IOException {
  Preconditions.checkState(unWrapSchema(NESTED_SCHEMA.getField("spamMap").schema())
      .getType().equals(Schema.Type.MAP));
  ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
  map.put("testKey", "testValue");

  GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
      .set("foo", 32L)
      .set("spamMap", map.build())
      .build();

  Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
  MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(7, row.getArity());

  assertEquals(32L, row.getField(0));
  Map<?, ?> result = (Map<?, ?>) row.getField(1);
  assertEquals(result.get("testKey").toString(), "testValue");
  assertTrue(rowReader.reachEnd());
}
Example #20
Source File: TestConvertAvroToParquet.java From nifi with Apache License 2.0
@Test
public void test_Meta_Info() throws Exception {
  FileInputStream fileInputStream = new FileInputStream(tmpAvro);
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  int readedBytes;
  byte[] buf = new byte[1024];
  while ((readedBytes = fileInputStream.read(buf)) > 0) {
    out.write(buf, 0, readedBytes);
  }
  out.close();

  Map<String, String> attributes = new HashMap<String, String>() {{
    put(CoreAttributes.FILENAME.key(), "test.avro");
  }};
  runner.enqueue(out.toByteArray(), attributes);
  runner.run();

  MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToParquet.SUCCESS).get(0);

  // Save the flowfile
  byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
  FileOutputStream fos = new FileOutputStream(tmpParquet);
  fos.write(resultContents);
  fos.flush();
  fos.close();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);
  ParquetMetadata metaData;
  metaData = ParquetFileReader.readFooter(conf, new Path(tmpParquet.getAbsolutePath()), NO_FILTER);

  // #number of records
  long nParquetRecords = 0;
  for (BlockMetaData meta : metaData.getBlocks()) {
    nParquetRecords += meta.getRowCount();
  }
  long nAvroRecord = records.size();
  assertEquals(nParquetRecords, nAvroRecord);
}
Example #21
Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0
private static List<PageReadStore> readBlocksFromFile(Path file) throws IOException {
  List<PageReadStore> rowGroups = new ArrayList<PageReadStore>();

  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration, file,
      ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader fileReader = new ParquetFileReader(configuration, metadata.getFileMetaData(), file,
      metadata.getBlocks(), metadata.getFileMetaData().getSchema().getColumns());

  PageReadStore group;
  while ((group = fileReader.readNextRowGroup()) != null) {
    rowGroups.add(group);
  }

  return rowGroups;
}
Example #22
Source File: TestThriftToParquetFileWriter.java From parquet-mr with Apache License 2.0
private ParquetReader<Group> createRecordReader(Path parquetFilePath) throws IOException {
  Configuration configuration = new Configuration(true);

  GroupReadSupport readSupport = new GroupReadSupport();
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, parquetFilePath);
  MessageType schema = readFooter.getFileMetaData().getSchema();

  readSupport.init(configuration, null, schema);
  return new ParquetReader<Group>(parquetFilePath, readSupport);
}
Example #23
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0
@Test
public void testReadNestedGroup() throws IOException {
  Schema schema = unWrapSchema(NESTED_SCHEMA.getField("bar").schema());
  GenericData.Record barRecord = new GenericRecordBuilder(schema)
      .set("spam", 31L).build();

  GenericData.Record record = new GenericRecordBuilder(NESTED_SCHEMA)
      .set("foo", 32L)
      .set("bar", barRecord)
      .build();

  Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
  MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(7, row.getArity());
  assertEquals(32L, row.getField(0));
  assertEquals(31L, ((Row) row.getField(2)).getField(0));
  assertTrue(rowReader.reachEnd());
}
Example #24
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0
@Test
public void testReadMultipleSimpleGroup() throws IOException {
  Long[] array = {1L};

  List<IndexedRecord> records = new ArrayList<>();
  for (int i = 0; i < 100; i++) {
    GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
        .set("bar", "test")
        .set("foo", i)
        .set("arr", array).build();
    records.add(record);
  }

  Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, records);
  MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertTrue(!rowReader.reachEnd());

  for (long i = 0; i < 100; i++) {
    assertFalse(rowReader.reachEnd());
    Row row = rowReader.nextRecord();
    assertEquals(3, row.getArity());
    assertEquals(i, row.getField(0));
    assertEquals("test", row.getField(1));
    assertArrayEquals(array, (Long[]) row.getField(2));
  }
  assertTrue(rowReader.reachEnd());
}
Example #25
Source File: ColumnSizeCommand.java From parquet-mr with Apache License 2.0
public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
Example #26
Source File: ParquetInputFormat.java From flink with Apache License 2.0
@Override
public void open(FileInputSplit split) throws IOException {
  // reset the flag when open a new split
  this.skipThisSplit = false;
  org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(split.getPath().toUri()), configuration);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);
  MessageType fileSchema = fileReader.getFileMetaData().getSchema();
  MessageType readSchema = getReadSchema(fileSchema, split.getPath());
  if (skipThisSplit) {
    LOG.warn(String.format(
        "Escaped the file split [%s] due to mismatch of file schema to expected result schema",
        split.getPath().toString()));
  } else {
    this.parquetRecordReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema,
        filterPredicate == null ? FilterCompat.NOOP : FilterCompat.get(filterPredicate));
    this.parquetRecordReader.initialize(fileReader, configuration);
    this.parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord);

    if (this.recordConsumed == null) {
      this.recordConsumed = getRuntimeContext().getMetricGroup().counter("parquet-records-consumed");
    }

    LOG.debug(String.format("Open ParquetInputFormat with FileInputSplit [%s]", split.getPath().toString()));
  }
}
Example #27
Source File: ParquetColumnarRowSplitReader.java From flink with Apache License 2.0
public ParquetColumnarRowSplitReader(
    boolean utcTimestamp,
    boolean caseSensitive,
    Configuration conf,
    LogicalType[] selectedTypes,
    String[] selectedFieldNames,
    ColumnBatchGenerator generator,
    int batchSize,
    Path path,
    long splitStart,
    long splitLength) throws IOException {
  this.utcTimestamp = utcTimestamp;
  this.selectedTypes = selectedTypes;
  this.batchSize = batchSize;
  // then we need to apply the predicate push down filter
  ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength));
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(conf);
  List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);

  this.fileSchema = footer.getFileMetaData().getSchema();
  this.requestedSchema = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive);
  this.reader = new ParquetFileReader(
      conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());

  long totalRowCount = 0;
  for (BlockMetaData block : blocks) {
    totalRowCount += block.getRowCount();
  }
  this.totalRowCount = totalRowCount;
  this.nextRow = 0;
  this.rowsInBatch = 0;
  this.rowsReturned = 0;

  checkSchema();

  this.writableVectors = createWritableVectors();
  this.columnarBatch = generator.generate(createReadableVectors());
  this.row = new ColumnarRowData(columnarBatch);
}
Example #28
Source File: ParquetRecordReader.java From flink with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration) {
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  // real schema of parquet file
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), readSchema));

  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.recordMaterializer = readSupport.prepareForRead(
      configuration, fileMetadata, readSchema, readContext);
  this.numTotalRecords = reader.getRecordCount();
}
Example #29
Source File: SparkModelParser.java From ignite with Apache License 2.0
/**
 * Load logistic regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLogRegModel(String pathToMdl, LearningEnvironment learningEnvironment) {
  Vector coefficients = null;
  double interceptor = 0;

  try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    PageReadStore pages;
    final MessageType schema = r.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

    while (null != (pages = r.readNextRowGroup())) {
      final long rows = pages.getRowCount();
      final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
      for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup)recordReader.read();
        interceptor = readInterceptor(g);
        coefficients = readCoefficients(g);
      }
    }
  } catch (IOException e) {
    String msg = "Error reading parquet file: " + e.getMessage();
    learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
    e.printStackTrace();
  }

  return new LogisticRegressionModel(coefficients, interceptor);
}
Example #30
Source File: FooterGatherer.java From Bats with Apache License 2.0
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<TimedCallable<Footer>> readers = new ArrayList<>();
  final List<Footer> foundFooters = new ArrayList<>();
  for (FileStatus status : statuses) {

    if (status.isDirectory()) {
      // first we check for summary file.
      FileSystem fs = status.getPath().getFileSystem(conf);

      final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
      if (fs.exists(summaryPath)) {
        FileStatus summaryStatus = fs.getFileStatus(summaryPath);
        foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
        continue;
      }

      // else we handle as normal file.
      for (FileStatus inStatus : DrillFileSystemUtil.listFiles(fs, status.getPath(), false)) {
        readers.add(new FooterReader(conf, inStatus));
      }
    } else {
      readers.add(new FooterReader(conf, status));
    }
  }
  if (!readers.isEmpty()) {
    foundFooters.addAll(TimedCallable.run("Fetch Parquet Footers", logger, readers, parallelism));
  }
  return foundFooters;
}