org.apache.parquet.io.InputFile Java Examples
The following examples show how to use
org.apache.parquet.io.InputFile.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Opens a stream on {@code file}, reads its footer and indexes the schema's columns by path.
 *
 * <p>If reading the footer fails, the newly opened stream is closed before the failure is
 * rethrown, because the caller never receives a reader it could close. A failure of that close
 * is attached to the original exception as suppressed instead of replacing it.
 *
 * @param file the Parquet file to read
 * @param options read options (used for the metadata converter, the footer read and to decide
 *     whether page checksums are verified)
 * @throws IOException if the stream cannot be opened or the footer cannot be read
 */
public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
  this.converter = new ParquetMetadataConverter(options);
  this.file = file;
  this.f = file.newStream();
  this.options = options;
  try {
    this.footer = readFooter(file, options, f, converter);
  } catch (Exception e) {
    // In case that reading footer throws an exception in the constructor, the new stream
    // should be closed. Otherwise, there's no way to close this outside.
    try {
      f.close();
    } catch (IOException | RuntimeException closeFailure) {
      // Keep the footer failure as the primary error; a failing close must not mask it.
      e.addSuppressed(closeFailure);
    }
    throw e;
  }
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  // A CRC32 instance is only needed when page checksum verification is enabled.
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
Example #2
Source File: ParquetReader.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Builds a reader from this builder's state.
 *
 * <p>Without a {@code path}, the reader wraps the single {@code file} given to the builder. With a
 * {@code path} that is a file, the reader wraps that one file; otherwise it wraps every entry
 * that {@code listStatus} returns for the path under {@code HiddenFileFilter.INSTANCE}.
 *
 * @return a new reader over the resolved input file(s)
 * @throws IOException if the file system cannot be queried
 */
public ParquetReader<T> build() throws IOException {
  final ParquetReadOptions readOptions = optionsBuilder.build();

  // No Hadoop path configured: read the InputFile handed to the builder.
  if (path == null) {
    return new ParquetReader<>(Collections.singletonList(file), readOptions, getReadSupport());
  }

  final FileSystem fileSystem = path.getFileSystem(conf);
  final FileStatus pathStatus = fileSystem.getFileStatus(path);
  if (pathStatus.isFile()) {
    final InputFile single = HadoopInputFile.fromStatus(pathStatus, conf);
    return new ParquetReader<>(Collections.singletonList(single), readOptions, getReadSupport());
  }

  // Not a plain file: collect every listed entry that passes the filter.
  final List<InputFile> inputs = new ArrayList<>();
  for (FileStatus entry : fileSystem.listStatus(path, HiddenFileFilter.INSTANCE)) {
    inputs.add(HadoopInputFile.fromStatus(entry, conf));
  }
  return new ParquetReader<>(inputs, readOptions, getReadSupport());
}
Example #3
Source File: ParquetReader.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Closes the current record reader, if any, and opens one for the next file in the iterator.
 *
 * <p>When no files remain, {@code reader} is left {@code null}.
 *
 * @throws IOException if closing the previous reader or opening the next file fails
 */
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }

  // Nothing left to read: leave the reader unset.
  if (!filesIterator.hasNext()) {
    return;
  }

  final ParquetFileReader nextFileReader = ParquetFileReader.open(filesIterator.next(), options);
  reader = new InternalParquetRecordReader<>(readSupport, options.getRecordFilter());
  reader.initialize(nextFileReader, options);
}
Example #4
Source File: ParquetInputFormat.java From flink with Apache License 2.0 | 5 votes |
@Override public void open(FileInputSplit split) throws IOException { // reset the flag when open a new split this.skipThisSplit = false; org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration(); InputFile inputFile = HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(split.getPath().toUri()), configuration); ParquetReadOptions options = ParquetReadOptions.builder().build(); ParquetFileReader fileReader = new ParquetFileReader(inputFile, options); MessageType fileSchema = fileReader.getFileMetaData().getSchema(); MessageType readSchema = getReadSchema(fileSchema, split.getPath()); if (skipThisSplit) { LOG.warn(String.format( "Escaped the file split [%s] due to mismatch of file schema to expected result schema", split.getPath().toString())); } else { this.parquetRecordReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema, filterPredicate == null ? FilterCompat.NOOP : FilterCompat.get(filterPredicate)); this.parquetRecordReader.initialize(fileReader, configuration); this.parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord); if (this.recordConsumed == null) { this.recordConsumed = getRuntimeContext().getMetricGroup().counter("parquet-records-consumed"); } LOG.debug(String.format("Open ParquetInputFormat with FileInputSplit [%s]", split.getPath().toString())); } }
Example #5
Source File: TestParquetRecordSetWriter.java From nifi with Apache License 2.0 | 5 votes |
/**
 * Reads {@code parquetFile} to the end and asserts how many records it contains.
 *
 * @param parquetFile the Parquet file to read
 * @param expectedRecordCount the number of records the file must contain
 * @throws IOException if the file cannot be opened or read
 */
private void verifyParquetRecords(final File parquetFile, final int expectedRecordCount) throws IOException {
  final Configuration conf = new Configuration();
  final InputFile inputFile = HadoopInputFile.fromPath(new Path(parquetFile.getPath()), conf);

  try (final ParquetReader<GenericRecord> reader =
      AvroParquetReader.<GenericRecord>builder(inputFile).withConf(conf).build()) {
    int seen = 0;
    // read() returning null ends the loop.
    for (GenericRecord current = reader.read(); current != null; current = reader.read()) {
      seen++;
    }
    assertEquals(expectedRecordCount, seen);
  }
}
Example #6
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0 | 5 votes |
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f, ParquetMetadataConverter converter) throws IOException { long fileLen = file.getLength(); LOG.debug("File length {}", fileLen); int FOOTER_LENGTH_SIZE = 4; if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC throw new RuntimeException(file.toString() + " is not a Parquet file (too small length: " + fileLen + ")"); } long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length; LOG.debug("reading footer index at {}", footerLengthIndex); f.seek(footerLengthIndex); int footerLength = readIntLittleEndian(f); byte[] magic = new byte[MAGIC.length]; f.readFully(magic); if (!Arrays.equals(MAGIC, magic)) { throw new RuntimeException(file.toString() + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic)); } long footerIndex = footerLengthIndex - footerLength; LOG.debug("read footer length: {}, footer index: {}", footerLength, footerIndex); if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) { throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerIndex); } f.seek(footerIndex); // Read all the footer bytes in one time to avoid multiple read operations, // since it can be pretty time consuming for a single read operation in HDFS. ByteBuffer footerBytesBuffer = ByteBuffer.allocate(footerLength); f.readFully(footerBytesBuffer); LOG.debug("Finished to read all footer bytes."); footerBytesBuffer.flip(); InputStream footerBytesStream = ByteBufferInputStream.wrap(footerBytesBuffer); return converter.readParquetMetadata(footerBytesStream, options.getMetadataFilter()); }
Example #7
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Reads the footer metadata of {@code file}, opening and closing a stream of its own.
 *
 * <p>For a {@link HadoopInputFile} the read options are built from that file's configuration;
 * for any other {@link InputFile} default read options are used. In both cases {@code filter}
 * is installed as the metadata filter.
 *
 * @param file a {@link InputFile} to read
 * @param filter the metadata filter to apply while reading the footer
 * @return the metadata blocks in the footer
 * @throws IOException if an error occurs while reading the file
 * @deprecated will be removed in 2.0.0;
 *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
 */
@Deprecated
public static final ParquetMetadata readFooter(InputFile file, MetadataFilter filter) throws IOException {
  final ParquetReadOptions options = (file instanceof HadoopInputFile)
      ? HadoopReadOptions.builder(((HadoopInputFile) file).getConfiguration())
          .withMetadataFilter(filter).build()
      : ParquetReadOptions.builder().withMetadataFilter(filter).build();

  try (SeekableInputStream in = file.newStream()) {
    return readFooter(file, options, in);
  }
}
Example #8
Source File: ParquetReader.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Creates a builder that reads from an {@link InputFile} rather than a Hadoop path.
 *
 * <p>The configuration is taken from the file when it is a {@link HadoopInputFile}; otherwise a
 * fresh {@code Configuration} is created. {@code readSupport} and {@code path} are left unset.
 *
 * @param file the file to read; must not be {@code null}
 * @throws NullPointerException if {@code file} is {@code null}
 */
protected Builder(InputFile file) {
  this.file = Objects.requireNonNull(file, "file cannot be null");
  this.path = null;
  this.readSupport = null;
  this.conf = (file instanceof HadoopInputFile)
      ? ((HadoopInputFile) file).getConfiguration()
      : new Configuration();
  optionsBuilder = HadoopReadOptions.builder(conf);
}
Example #9
Source File: ParquetReader.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Creates a reader over the given files; this constructor only stores its arguments and an
 * iterator over {@code files}, it does not open any file.
 *
 * @param files the files to read, in iteration order
 * @param options the read options kept for later use
 * @param readSupport the read support kept for later use
 * @throws IOException declared by the signature; nothing in this body raises it
 */
private ParquetReader(List<InputFile> files, ParquetReadOptions options, ReadSupport<T> readSupport) throws IOException {
  this.filesIterator = files.iterator();
  this.options = options;
  this.readSupport = readSupport;
}
Example #10
Source File: ParquetReader.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Creates a reader for a single Hadoop path by delegating to the list-based constructor.
 *
 * <p>The path is wrapped in a {@link HadoopInputFile}, and the read options are built from
 * {@code conf} with {@code filter} installed as the record filter.
 *
 * @param conf the Hadoop configuration used to resolve the path and build the read options
 * @param file the path of the file to read
 * @param readSupport the read support passed through to the delegate constructor
 * @param filter the record filter; must not be {@code null}
 * @throws IOException if the input file cannot be created from the path
 * @throws NullPointerException if {@code filter} is {@code null}
 */
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, FilterCompat.Filter filter) throws IOException {
  this(Collections.singletonList((InputFile) HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(Objects.requireNonNull(filter, "filter cannot be null"))
          .build(),
      readSupport);
}
Example #11
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0 | 5 votes |
/**
 * Writes one record whose {@code nestedMap} field maps a key to a nested record, reads it back
 * as a {@link Row} and checks the nested row's fields.
 */
@Test
public void testNestedMapGroup() throws IOException {
  Schema nestedMapSchema = unWrapSchema(NESTED_SCHEMA.getField("nestedMap").schema());
  Preconditions.checkState(nestedMapSchema.getType().equals(Schema.Type.MAP));

  Schema mapValueSchema = nestedMapSchema.getValueType();
  GenericRecord mapValue = new GenericRecordBuilder(mapValueSchema)
      .set("type", "nested")
      .set("value", "nested_value").build();

  ImmutableMap.Builder<String, GenericRecord> map = ImmutableMap.builder();
  map.put("testKey", mapValue);

  GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
      .set("nestedMap", map.build())
      .set("foo", 34L).build();

  Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
  MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(7, row.getArity());

  assertEquals(34L, row.getField(0));
  // Wildcard type instead of a raw Map, matching testMapGroup in the same test class.
  Map<?, ?> result = (Map<?, ?>) row.getField(5);

  Row nestedRow = (Row) result.get("testKey");
  assertEquals("nested", nestedRow.getField(0));
  assertEquals("nested_value", nestedRow.getField(1));
}
Example #12
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0 | 5 votes |
/**
 * Writes one record whose {@code spamMap} field holds a single string entry, reads it back as a
 * {@link Row} and checks the map entry and that the reader is exhausted afterwards.
 */
@Test
public void testMapGroup() throws IOException {
  Preconditions.checkState(unWrapSchema(NESTED_SCHEMA.getField("spamMap").schema())
      .getType().equals(Schema.Type.MAP));
  ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
  map.put("testKey", "testValue");

  GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
      .set("foo", 32L)
      .set("spamMap", map.build())
      .build();

  Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
  MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(7, row.getArity());

  assertEquals(32L, row.getField(0));
  Map<?, ?> result = (Map<?, ?>) row.getField(1);
  // Expected value first, as in every other assertion here, so failure messages read correctly.
  assertEquals("testValue", result.get("testKey").toString());
  assertTrue(rowReader.reachEnd());
}
Example #13
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0 | 5 votes |
/**
 * Writes one record with a nested {@code bar} record, reads it back as a {@link Row} and checks
 * the top-level and nested values and that the reader is exhausted afterwards.
 */
@Test
public void testReadNestedGroup() throws IOException {
  Schema barSchema = unWrapSchema(NESTED_SCHEMA.getField("bar").schema());
  GenericData.Record bar = new GenericRecordBuilder(barSchema).set("spam", 31L).build();
  GenericData.Record written = new GenericRecordBuilder(NESTED_SCHEMA)
      .set("foo", 32L)
      .set("bar", bar)
      .build();

  Path parquetPath =
      createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(written));
  MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(parquetPath.toUri());
  InputFile inputFile = HadoopInputFile.fromPath(hadoopPath, testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(7, row.getArity());
  assertEquals(32L, row.getField(0));

  // Field 2 carries the nested "bar" record.
  Row nested = (Row) row.getField(2);
  assertEquals(31L, nested.getField(0));
  assertTrue(rowReader.reachEnd());
}
Example #14
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0 | 5 votes |
/**
 * Writes 100 simple records, reads them back one by one as {@link Row}s and checks each row's
 * fields and that the reader reports the end only after the last record.
 */
@Test
public void testReadMultipleSimpleGroup() throws IOException {
  Long[] array = {1L};

  List<IndexedRecord> records = new ArrayList<>();
  for (int i = 0; i < 100; i++) {
    GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
        .set("bar", "test")
        .set("foo", i)
        .set("arr", array).build();
    records.add(record);
  }

  Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, records);
  MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  // assertFalse states the intent directly and matches the sibling tests.
  assertFalse(rowReader.reachEnd());

  for (long i = 0; i < 100; i++) {
    assertFalse(rowReader.reachEnd());
    Row row = rowReader.nextRecord();
    assertEquals(3, row.getArity());
    assertEquals(i, row.getField(0));
    assertEquals("test", row.getField(1));
    assertArrayEquals(array, (Long[]) row.getField(2));
  }

  assertTrue(rowReader.reachEnd());
}
Example #15
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0 | 5 votes |
/**
 * Writes one record with a nested {@code bar} record, reads it back as a {@link Row} and checks
 * the top-level and nested values and that the reader is exhausted afterwards.
 */
@Test
public void testReadNestedGroup() throws IOException {
  Schema barSchema = unWrapSchema(NESTED_SCHEMA.getField("bar").schema());
  GenericData.Record bar = new GenericRecordBuilder(barSchema).set("spam", 31L).build();
  GenericData.Record written = new GenericRecordBuilder(NESTED_SCHEMA)
      .set("foo", 32L)
      .set("bar", bar)
      .build();

  Path parquetPath =
      createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(written));
  MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(parquetPath.toUri());
  InputFile inputFile = HadoopInputFile.fromPath(hadoopPath, testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(7, row.getArity());
  assertEquals(32L, row.getField(0));

  // Field 2 carries the nested "bar" record.
  Row nested = (Row) row.getField(2);
  assertEquals(31L, nested.getField(0));
  assertTrue(rowReader.reachEnd());
}
Example #16
Source File: ParquetStreamingFileSinkITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/**
 * Reads every record of a Parquet file into a list using the given Avro data model.
 *
 * @param file the local Parquet file to read
 * @param dataModel the Avro data model handed to the reader builder
 * @param <T> the record type produced by the reader
 * @return all records in the order the reader returned them
 * @throws IOException if the file cannot be opened or read
 */
private static <T> List<T> readParquetFile(File file, GenericData dataModel) throws IOException {
  org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(file.toURI());
  InputFile inFile = HadoopInputFile.fromPath(hadoopPath, new Configuration());

  List<T> records = new ArrayList<>();
  try (ParquetReader<T> reader = AvroParquetReader.<T>builder(inFile).withDataModel(dataModel).build()) {
    // read() returning null ends the loop.
    for (T current = reader.read(); current != null; current = reader.read()) {
      records.add(current);
    }
  }
  return records;
}
Example #17
Source File: ParquetStreamingFileSinkITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Reads every record of a Parquet file into a list using the given Avro data model.
 *
 * @param file the local Parquet file to read
 * @param dataModel the Avro data model handed to the reader builder
 * @param <T> the record type produced by the reader
 * @return all records in the order the reader returned them
 * @throws IOException if the file cannot be opened or read
 */
private static <T> List<T> readParquetFile(File file, GenericData dataModel) throws IOException {
  org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(file.toURI());
  InputFile inFile = HadoopInputFile.fromPath(hadoopPath, new Configuration());

  List<T> records = new ArrayList<>();
  try (ParquetReader<T> reader = AvroParquetReader.<T>builder(inFile).withDataModel(dataModel).build()) {
    // read() returning null ends the loop.
    for (T current = reader.read(); current != null; current = reader.read()) {
      records.add(current);
    }
  }
  return records;
}
Example #18
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0 | 5 votes |
/**
 * Writes a single simple record, reads it back as a {@link Row} and checks its three fields and
 * that the reader is exhausted afterwards.
 */
@Test
public void testReadSimpleGroup() throws IOException {
  Long[] expectedArray = {1L};
  GenericData.Record written = new GenericRecordBuilder(SIMPLE_SCHEMA)
      .set("bar", "test")
      .set("foo", 32L)
      .set("arr", expectedArray)
      .build();

  Path parquetPath =
      createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, Collections.singletonList(written));
  MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(parquetPath.toUri());
  InputFile inputFile = HadoopInputFile.fromPath(hadoopPath, testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(3, row.getArity());
  assertEquals(32L, row.getField(0));
  assertEquals("test", row.getField(1));
  assertArrayEquals(expectedArray, (Long[]) row.getField(2));

  // Only one record was written, so the reader must now be at the end.
  assertTrue(rowReader.reachEnd());
}
Example #19
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0 | 5 votes |
/**
 * Writes 100 simple records, reads them back one by one as {@link Row}s and checks each row's
 * fields and that the reader reports the end only after the last record.
 */
@Test
public void testReadMultipleSimpleGroup() throws IOException {
  Long[] array = {1L};

  List<IndexedRecord> records = new ArrayList<>();
  for (int i = 0; i < 100; i++) {
    GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
        .set("bar", "test")
        .set("foo", i)
        .set("arr", array).build();
    records.add(record);
  }

  Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, records);
  MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  // assertFalse states the intent directly and matches the sibling tests.
  assertFalse(rowReader.reachEnd());

  for (long i = 0; i < 100; i++) {
    assertFalse(rowReader.reachEnd());
    Row row = rowReader.nextRecord();
    assertEquals(3, row.getArity());
    assertEquals(i, row.getField(0));
    assertEquals("test", row.getField(1));
    assertArrayEquals(array, (Long[]) row.getField(2));
  }

  assertTrue(rowReader.reachEnd());
}
Example #20
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0 | 5 votes |
/**
 * Writes a single simple record, reads it back as a {@link Row} and checks its three fields and
 * that the reader is exhausted afterwards.
 */
@Test
public void testReadSimpleGroup() throws IOException {
  Long[] expectedArray = {1L};
  GenericData.Record written = new GenericRecordBuilder(SIMPLE_SCHEMA)
      .set("bar", "test")
      .set("foo", 32L)
      .set("arr", expectedArray)
      .build();

  Path parquetPath =
      createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, Collections.singletonList(written));
  MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(parquetPath.toUri());
  InputFile inputFile = HadoopInputFile.fromPath(hadoopPath, testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(3, row.getArity());
  assertEquals(32L, row.getField(0));
  assertEquals("test", row.getField(1));
  assertArrayEquals(expectedArray, (Long[]) row.getField(2));

  // Only one record was written, so the reader must now be at the end.
  assertTrue(rowReader.reachEnd());
}
Example #21
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0 | 5 votes |
/**
 * Writes one record whose {@code spamMap} field holds a single string entry, reads it back as a
 * {@link Row} and checks the map entry and that the reader is exhausted afterwards.
 */
@Test
public void testMapGroup() throws IOException {
  Preconditions.checkState(unWrapSchema(NESTED_SCHEMA.getField("spamMap").schema())
      .getType().equals(Schema.Type.MAP));
  ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
  map.put("testKey", "testValue");

  GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
      .set("foo", 32L)
      .set("spamMap", map.build())
      .build();

  Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
  MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(7, row.getArity());

  assertEquals(32L, row.getField(0));
  Map<?, ?> result = (Map<?, ?>) row.getField(1);
  // Expected value first, as in every other assertion here, so failure messages read correctly.
  assertEquals("testValue", result.get("testKey").toString());
  assertTrue(rowReader.reachEnd());
}
Example #22
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0 | 5 votes |
/**
 * Writes one record whose {@code nestedMap} field maps a key to a nested record, reads it back
 * as a {@link Row} and checks the nested row's fields.
 */
@Test
public void testNestedMapGroup() throws IOException {
  Schema nestedMapSchema = unWrapSchema(NESTED_SCHEMA.getField("nestedMap").schema());
  Preconditions.checkState(nestedMapSchema.getType().equals(Schema.Type.MAP));

  Schema mapValueSchema = nestedMapSchema.getValueType();
  GenericRecord mapValue = new GenericRecordBuilder(mapValueSchema)
      .set("type", "nested")
      .set("value", "nested_value").build();

  ImmutableMap.Builder<String, GenericRecord> map = ImmutableMap.builder();
  map.put("testKey", mapValue);

  GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
      .set("nestedMap", map.build())
      .set("foo", 34L).build();

  Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
  MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(7, row.getArity());

  assertEquals(34L, row.getField(0));
  // Wildcard type instead of a raw Map, matching testMapGroup in the same test class.
  Map<?, ?> result = (Map<?, ?>) row.getField(5);

  Row nestedRow = (Row) result.get("testKey");
  assertEquals("nested", nestedRow.getField(0));
  assertEquals("nested_value", nestedRow.getField(1));
}
Example #23
Source File: ParquetStreamingFileSinkITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Reads every record of a Parquet file into a list using the given Avro data model.
 *
 * @param file the local Parquet file to read
 * @param dataModel the Avro data model handed to the reader builder
 * @param <T> the record type produced by the reader
 * @return all records in the order the reader returned them
 * @throws IOException if the file cannot be opened or read
 */
private static <T> List<T> readParquetFile(File file, GenericData dataModel) throws IOException {
  org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(file.toURI());
  InputFile inFile = HadoopInputFile.fromPath(hadoopPath, new Configuration());

  List<T> records = new ArrayList<>();
  try (ParquetReader<T> reader = AvroParquetReader.<T>builder(inFile).withDataModel(dataModel).build()) {
    // read() returning null ends the loop.
    for (T current = reader.read(); current != null; current = reader.read()) {
      records.add(current);
    }
  }
  return records;
}
Example #24
Source File: ParquetInputFormat.java From flink with Apache License 2.0 | 5 votes |
@Override public void open(FileInputSplit split) throws IOException { // reset the flag when open a new split this.skipThisSplit = false; org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration(); InputFile inputFile = HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(split.getPath().toUri()), configuration); ParquetReadOptions options = ParquetReadOptions.builder().build(); ParquetFileReader fileReader = new ParquetFileReader(inputFile, options); MessageType fileSchema = fileReader.getFileMetaData().getSchema(); MessageType readSchema = getReadSchema(fileSchema, split.getPath()); if (skipThisSplit) { LOG.warn(String.format( "Escaped the file split [%s] due to mismatch of file schema to expected result schema", split.getPath().toString())); } else { this.parquetRecordReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema, filterPredicate == null ? FilterCompat.NOOP : FilterCompat.get(filterPredicate)); this.parquetRecordReader.initialize(fileReader, configuration); this.parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord); if (this.recordConsumed == null) { this.recordConsumed = getRuntimeContext().getMetricGroup().counter("parquet-records-consumed"); } LOG.debug(String.format("Open ParquetInputFormat with FileInputSplit [%s]", split.getPath().toString())); } }
Example #25
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0 | 4 votes |
/**
 * Writes one record whose {@code nestedArray} field holds a single nested record, reads it back
 * as a {@link Row} and checks the array length and the nested row's fields.
 */
@Test
public void testNestedArrayGroup() throws IOException {
  Schema arraySchema = unWrapSchema(NESTED_SCHEMA.getField("nestedArray").schema());
  Preconditions.checkState(arraySchema.getType().equals(Schema.Type.ARRAY));

  Schema elementSchema = arraySchema.getElementType();
  GenericRecord element = new GenericRecordBuilder(elementSchema)
      .set("type", "nested")
      .set("value", 1L)
      .build();

  ImmutableList.Builder<GenericRecord> elements = ImmutableList.builder();
  elements.add(element);

  GenericRecord written = new GenericRecordBuilder(NESTED_SCHEMA)
      .set("nestedArray", elements.build())
      .set("foo", 34L)
      .build();

  Path parquetPath =
      createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(written));
  MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(parquetPath.toUri());
  InputFile inputFile = HadoopInputFile.fromPath(hadoopPath, testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(7, row.getArity());
  assertEquals(34L, row.getField(0));

  // Field 6 carries the nested array; it must hold exactly the one element written above.
  Object[] readElements = (Object[]) row.getField(6);
  assertEquals(1, readElements.length);

  Row nestedRow = (Row) readElements[0];
  assertEquals("nested", nestedRow.getField(0));
  assertEquals(1L, nestedRow.getField(1));
}
Example #26
Source File: AvroParquetReader.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Creates a builder that reads from the given {@link InputFile} by delegating to the superclass
 * constructor. The constructor is private, so it can only be invoked from within the enclosing
 * class.
 *
 * @param file the file to read, passed unchanged to the superclass
 */
private Builder(InputFile file) {
  super(file);
}
Example #27
Source File: AvroParquetReader.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Returns a new reader builder for the given {@link InputFile}.
 *
 * @param file the file the resulting reader will read
 * @param <T> the record type produced by the reader
 * @return a fresh builder bound to {@code file}
 */
public static <T> Builder<T> builder(InputFile file) {
  final Builder<T> created = new Builder<>(file);
  return created;
}
Example #28
Source File: ShowColumnIndexCommand.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Prints the column index and/or offset index of every selected row group of one Parquet file.
 *
 * <p>Exactly one file must be given. When neither {@code showColumnIndex} nor
 * {@code showOffsetIndex} is set, both are enabled. When {@code rowGroupIndexes} is null or
 * empty every row group is printed; otherwise only row groups whose index appears in it.
 *
 * @return always {@code 0}
 * @throws IOException if the file cannot be opened or an index cannot be read
 */
@Override
public int run() throws IOException {
  Preconditions.checkArgument(files != null && !files.isEmpty(),
      "A Parquet file is required.");
  Preconditions.checkArgument(files.size() == 1,
      "Cannot process multiple Parquet files.");

  InputFile in = HadoopInputFile.fromPath(qualifiedPath(files.get(0)), getConf());

  // With neither index requested explicitly, show both.
  final boolean nothingRequested = !(showColumnIndex || showOffsetIndex);
  if (nothingRequested) {
    showColumnIndex = true;
    showOffsetIndex = true;
  }

  Set<String> selectedRowGroups = new HashSet<>();
  if (rowGroupIndexes != null) {
    selectedRowGroups.addAll(rowGroupIndexes);
  }

  try (ParquetFileReader reader = ParquetFileReader.open(in)) {
    boolean printedRowGroup = false;
    int rowGroupIndex = 0;
    for (BlockMetaData block : reader.getFooter().getBlocks()) {
      // An empty selection means "all row groups".
      final boolean selected = selectedRowGroups.isEmpty()
          || selectedRowGroups.contains(Integer.toString(rowGroupIndex));
      if (selected) {
        // Blank line between consecutive row groups, but not before the first one printed.
        if (printedRowGroup) {
          console.info("");
        }
        printedRowGroup = true;

        console.info("row-group {}:", rowGroupIndex);
        for (ColumnChunkMetaData column : getColumns(block)) {
          String path = column.getPath().toDotString();
          if (showColumnIndex) {
            console.info("column index for column {}:", path);
            ColumnIndex columnIndex = reader.readColumnIndex(column);
            console.info(columnIndex == null ? "NONE" : columnIndex.toString());
          }
          if (showOffsetIndex) {
            console.info("offset index for column {}:", path);
            OffsetIndex offsetIndex = reader.readOffsetIndex(column);
            console.info(offsetIndex == null ? "NONE" : offsetIndex.toString());
          }
        }
      }
      ++rowGroupIndex;
    }
  }
  return 0;
}
Example #29
Source File: TestDataPageV1Checksums.java From parquet-mr with Apache License 2.0 | 4 votes |
/** * Test whether corruption in the page content is detected by checksum verification */ @Test public void testCorruptedPage() throws IOException { Configuration conf = new Configuration(); conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true); Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED); InputFile inputFile = HadoopInputFile.fromPath(path, conf); try (SeekableInputStream inputStream = inputFile.newStream()) { int fileLen = (int) inputFile.getLength(); byte[] fileBytes = new byte[fileLen]; inputStream.readFully(fileBytes); inputStream.close(); // There are 4 pages in total (2 per column), we corrupt the first page of the first column // and the second page of the second column. We do this by altering a byte roughly in the // middle of each page to be corrupted fileBytes[fileLen / 8]++; fileBytes[fileLen / 8 + ((fileLen / 4) * 3)]++; OutputFile outputFile = HadoopOutputFile.fromPath(path, conf); try (PositionOutputStream outputStream = outputFile.createOrOverwrite(1024 * 1024)) { outputStream.write(fileBytes); outputStream.close(); // First we disable checksum verification, the corruption will go undetected as it is in the // data section of the page conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false); try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) { PageReadStore pageReadStore = reader.readNextRowGroup(); DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore); assertFalse("Data in page was not corrupted", Arrays.equals(colAPage1.getBytes().toByteArray(), colAPage1Bytes)); readNextPage(colADesc, pageReadStore); readNextPage(colBDesc, pageReadStore); DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore); assertFalse("Data in page was not corrupted", Arrays.equals(colBPage2.getBytes().toByteArray(), colBPage2Bytes)); } // Now we enable checksum verification, the corruption should be detected 
conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true); try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) { // We expect an exception on the first encountered corrupt page (in readAllPages) assertVerificationFailed(reader); } } } }
Example #30
Source File: CompressionConverter.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Creates a reader for {@code file} by delegating directly to the superclass constructor.
 *
 * @param file the Parquet file to read, passed unchanged to the superclass
 * @param options the read options, passed unchanged to the superclass
 * @throws IOException if the superclass constructor fails
 */
public TransParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
  super(file, options);
}