org.apache.parquet.io.InputFile Java Examples

Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Opens a stream on {@code file} and reads the footer so that row groups can be read later.
 *
 * @param file the Parquet file to read
 * @param options read options; also used to build the metadata converter and to decide
 *                whether a {@link CRC32} instance is created for page checksum verification
 * @throws IOException if the stream cannot be opened or the footer cannot be read
 */
public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
  this.converter = new ParquetMetadataConverter(options);
  this.file = file;
  this.f = file.newStream();
  this.options = options;
  try {
    this.footer = readFooter(file, options, f, converter);
  } catch (Exception e) {
    // The caller never receives this instance when the constructor throws, so the stream
    // opened above must be closed here. A failure while closing must not replace the
    // original footer error, so it is recorded as a suppressed exception instead.
    try {
      f.close();
    } catch (Exception closeFailure) {
      e.addSuppressed(closeFailure);
    }
    throw e;
  }
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  // One slot per retained row group.
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  // Register every column of the file schema under its column path.
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}

Source File: ParquetReader.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Builds the reader from the configured input.
 *
 * <p>Without a path, the single configured {@code file} is read. With a path that is a
 * file, that file is read; otherwise every entry returned by listing the path with
 * {@code HiddenFileFilter.INSTANCE} is read.
 *
 * @return a new reader over the resolved input files
 * @throws IOException if the file system cannot be accessed
 */
public ParquetReader<T> build() throws IOException {
  ParquetReadOptions options = optionsBuilder.build();

  // No path configured: read the InputFile handed to the builder.
  if (path == null) {
    return new ParquetReader<>(Collections.singletonList(file), options, getReadSupport());
  }

  FileSystem fs = path.getFileSystem(conf);
  FileStatus stat = fs.getFileStatus(path);

  if (stat.isFile()) {
    InputFile single = HadoopInputFile.fromStatus(stat, conf);
    return new ParquetReader<>(Collections.singletonList(single), options, getReadSupport());
  }

  // Not a single file: collect the listed, non-filtered entries.
  List<InputFile> inputs = new ArrayList<>();
  for (FileStatus child : fs.listStatus(path, HiddenFileFilter.INSTANCE)) {
    inputs.add(HadoopInputFile.fromStatus(child, conf));
  }
  return new ParquetReader<>(inputs, options, getReadSupport());
}

Source File: ParquetReader.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Closes the current record reader, if any, and opens one for the next input file.
 *
 * <p>When no files remain, {@code reader} is left as {@code null}.
 *
 * @throws IOException if closing the previous reader or opening the next file fails
 */
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }

  if (!filesIterator.hasNext()) {
    return;
  }

  ParquetFileReader fileReader = ParquetFileReader.open(filesIterator.next(), options);
  reader = new InternalParquetRecordReader<>(readSupport, options.getRecordFilter());
  reader.initialize(fileReader, options);
}

Source File: ParquetInputFormat.java From flink with Apache License 2.0

5 votes

/**
 * Opens the Parquet file behind {@code split} and prepares a record reader for it.
 *
 * <p>When the split is flagged to be skipped, no record reader is created and the file
 * reader opened here is closed again, because nothing else holds a reference to it.
 *
 * @param split the file split to open
 * @throws IOException if the Parquet file cannot be opened or the reader cannot be initialized
 */
@Override
public void open(FileInputSplit split) throws IOException {
	// reset the flag when open a new split
	this.skipThisSplit = false;
	org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(split.getPath().toUri()), configuration);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);
	MessageType fileSchema = fileReader.getFileMetaData().getSchema();
	// NOTE(review): getReadSchema presumably sets skipThisSplit on a schema mismatch - confirm.
	MessageType readSchema = getReadSchema(fileSchema, split.getPath());
	if (skipThisSplit) {
		LOG.warn(String.format(
			"Escaped the file split [%s] due to mismatch of file schema to expected result schema",
			split.getPath().toString()));
		// The file reader is not handed to a record reader on this path, so release it here.
		fileReader.close();
	} else {
		this.parquetRecordReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema,
			filterPredicate == null ? FilterCompat.NOOP : FilterCompat.get(filterPredicate));
		this.parquetRecordReader.initialize(fileReader, configuration);
		this.parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord);

		// Register the metric counter only once per format instance.
		if (this.recordConsumed == null) {
			this.recordConsumed = getRuntimeContext().getMetricGroup().counter("parquet-records-consumed");
		}

		LOG.debug(String.format("Open ParquetInputFormat with FileInputSplit [%s]", split.getPath().toString()));
	}
}

Source File: TestParquetRecordSetWriter.java From nifi with Apache License 2.0

5 votes

/**
 * Reads every record from {@code parquetFile} and asserts the total matches the expectation.
 *
 * @param parquetFile the Parquet file to read back
 * @param expectedRecordCount the number of records the file is expected to contain
 * @throws IOException if the file cannot be read
 */
private void verifyParquetRecords(final File parquetFile, final int expectedRecordCount) throws IOException {
    final Configuration conf = new Configuration();
    final InputFile inputFile = HadoopInputFile.fromPath(new Path(parquetFile.getPath()), conf);

    try (final ParquetReader<GenericRecord> reader =
            AvroParquetReader.<GenericRecord>builder(inputFile).withConf(conf).build()) {

        int seen = 0;
        // read() returns null once the file is exhausted.
        for (GenericRecord current = reader.read(); current != null; current = reader.read()) {
            seen++;
        }
        assertEquals(expectedRecordCount, seen);
    }
}

Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads the footer metadata from the tail of a Parquet file.
 *
 * <p>The last bytes of the file are read as a 4-byte little-endian footer length followed by
 * the magic number; the footer itself sits immediately before that length field.
 *
 * @param file the file being read (used for its length and for error messages)
 * @param options read options supplying the metadata filter
 * @param f an open stream on {@code file}
 * @param converter converter used to parse the footer bytes
 * @return the parsed footer metadata
 * @throws IOException if reading from the stream fails
 * @throws RuntimeException if the file is too small, the tail magic number does not match,
 *         or the computed footer position lies outside the file
 */
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f, ParquetMetadataConverter converter) throws IOException {
  final int lengthFieldSize = 4;
  final long totalLength = file.getLength();
  LOG.debug("File length {}", totalLength);

  // Smallest possible layout: leading magic, footer length field, trailing magic.
  final int minimumLength = MAGIC.length + lengthFieldSize + MAGIC.length;
  if (totalLength < minimumLength) {
    throw new RuntimeException(file.toString() + " is not a Parquet file (too small length: " + totalLength + ")");
  }

  final long lengthFieldPosition = totalLength - lengthFieldSize - MAGIC.length;
  LOG.debug("reading footer index at {}", lengthFieldPosition);
  f.seek(lengthFieldPosition);
  final int footerLength = readIntLittleEndian(f);

  // The trailing magic number directly follows the length field.
  final byte[] tailMagic = new byte[MAGIC.length];
  f.readFully(tailMagic);
  if (!Arrays.equals(MAGIC, tailMagic)) {
    throw new RuntimeException(file.toString() + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(tailMagic));
  }

  final long footerStart = lengthFieldPosition - footerLength;
  LOG.debug("read footer length: {}, footer index: {}", footerLength, footerStart);
  final boolean beforeData = footerStart < MAGIC.length;
  final boolean pastLengthField = footerStart >= lengthFieldPosition;
  if (beforeData || pastLengthField) {
    throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerStart);
  }

  // Fetch the whole footer with a single read; individual reads can be slow on HDFS.
  f.seek(footerStart);
  final ByteBuffer footerBuffer = ByteBuffer.allocate(footerLength);
  f.readFully(footerBuffer);
  LOG.debug("Finished to read all footer bytes.");
  footerBuffer.flip();
  final InputStream footerStream = ByteBufferInputStream.wrap(footerBuffer);
  return converter.readParquetMetadata(footerStream, options.getMetadataFilter());
}

Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads the metadata block in the footer of the given file.
 *
 * <p>Read options are derived from the file's Hadoop configuration when {@code file} is a
 * {@link HadoopInputFile}, and from the default builder otherwise. The stream opened for
 * reading is closed before this method returns.
 *
 * @param file a {@link InputFile} to read
 * @param filter the filter to apply to row groups
 * @return the metadata blocks in the footer
 * @throws IOException if an error occurs while reading the file
 * @deprecated will be removed in 2.0.0;
 *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
 */
@Deprecated
public static final ParquetMetadata readFooter(InputFile file, MetadataFilter filter) throws IOException {
  final ParquetReadOptions options = (file instanceof HadoopInputFile)
      ? HadoopReadOptions.builder(((HadoopInputFile) file).getConfiguration())
          .withMetadataFilter(filter).build()
      : ParquetReadOptions.builder().withMetadataFilter(filter).build();

  try (SeekableInputStream in = file.newStream()) {
    return readFooter(file, options, in);
  }
}

Source File: ParquetReader.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates a builder that reads from an {@link InputFile} rather than a path.
 *
 * <p>The configuration is taken from the file when it is a {@link HadoopInputFile};
 * otherwise a fresh {@link Configuration} is used.
 *
 * @param file the file to read; must not be {@code null}
 * @throws NullPointerException if {@code file} is {@code null}
 */
protected Builder(InputFile file) {
  this.readSupport = null;
  this.path = null;
  this.file = Objects.requireNonNull(file, "file cannot be null");
  this.conf = (file instanceof HadoopInputFile)
      ? ((HadoopInputFile) file).getConfiguration()
      : new Configuration();
  optionsBuilder = HadoopReadOptions.builder(conf);
}

Source File: ParquetReader.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Stores the read support and options, and takes an iterator over {@code files}.
 *
 * @param files the input files to iterate over
 * @param options the read options
 * @param readSupport the read support used to materialize records
 * @throws IOException declared by the signature; nothing in this body throws it
 */
private ParquetReader(List<InputFile> files,
                      ParquetReadOptions options,
                      ReadSupport<T> readSupport) throws IOException {
  this.filesIterator = files.iterator();
  this.options = options;
  this.readSupport = readSupport;
}

Source File: ParquetReader.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates a reader for a single Hadoop path, delegating to the list-based constructor.
 *
 * @param conf the Hadoop configuration used to resolve the path and build read options
 * @param file the path of the file to read
 * @param readSupport the read support used to materialize records
 * @param filter the record filter; must not be {@code null}
 * @throws IOException if the input file cannot be created from the path
 * @throws NullPointerException if {@code filter} is {@code null}
 */
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      FilterCompat.Filter filter) throws IOException {
  this(
      Collections.<InputFile>singletonList(HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(Objects.requireNonNull(filter, "filter cannot be null"))
          .build(),
      readSupport);
}

Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0

5 votes

/**
 * Writes one record whose "nestedMap" field holds a nested record and checks that it is
 * read back as a map of rows.
 */
@Test
public void testNestedMapGroup() throws IOException {
	Schema nestedMapSchema = unWrapSchema(NESTED_SCHEMA.getField("nestedMap").schema());
	Preconditions.checkState(nestedMapSchema.getType().equals(Schema.Type.MAP));

	Schema mapValueSchema = nestedMapSchema.getValueType();
	GenericRecord mapValue = new GenericRecordBuilder(mapValueSchema)
		.set("type", "nested")
		.set("value", "nested_value").build();

	ImmutableMap.Builder<String, GenericRecord> map = ImmutableMap.builder();
	map.put("testKey", mapValue);

	GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("nestedMap", map.build())
		.set("foo", 34L).build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());

	assertEquals(34L, row.getField(0));
	// Wildcard map instead of a raw type; only lookups are needed here.
	Map<?, ?> result = (Map<?, ?>) row.getField(5);

	Row nestedRow = (Row) result.get("testKey");
	assertEquals("nested", nestedRow.getField(0));
	assertEquals("nested_value", nestedRow.getField(1));
}

Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0

5 votes

/**
 * Writes one record whose "spamMap" field holds a string map and checks that the map
 * entry is read back.
 */
@Test
public void testMapGroup() throws IOException {
	Preconditions.checkState(unWrapSchema(NESTED_SCHEMA.getField("spamMap").schema())
		.getType().equals(Schema.Type.MAP));
	ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
	map.put("testKey", "testValue");

	GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("foo", 32L)
		.set("spamMap", map.build())
		.build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());

	assertEquals(32L, row.getField(0));
	Map<?, ?> result = (Map<?, ?>) row.getField(1);
	// Expected value first, so a failure message reports the values the right way round.
	assertEquals("testValue", result.get("testKey").toString());
	assertTrue(rowReader.reachEnd());
}

Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0

5 votes

/**
 * Writes one record with a nested "bar" record and checks both the top-level and the
 * nested field after reading it back.
 */
@Test
public void testReadNestedGroup() throws IOException {
	Schema barSchema = unWrapSchema(NESTED_SCHEMA.getField("bar").schema());
	GenericData.Record bar = new GenericRecordBuilder(barSchema).set("spam", 31L).build();
	GenericData.Record written = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("foo", 32L)
		.set("bar", bar)
		.build();

	Path parquetPath =
		createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(written));
	MessageType readSchema = new AvroSchemaConverter().convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	ParquetFileReader fileReader = new ParquetFileReader(
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(parquetPath.toUri()), testConfig),
		ParquetReadOptions.builder().build());
	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row readBack = rowReader.nextRecord();
	assertEquals(7, readBack.getArity());
	assertEquals(32L, readBack.getField(0));
	Row nestedBar = (Row) readBack.getField(2);
	assertEquals(31L, nestedBar.getField(0));
	assertTrue(rowReader.reachEnd());
}

Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0

5 votes

/**
 * Writes 100 simple records and checks that they are read back in order, with the reader
 * reporting the end only after the last one.
 */
@Test
public void testReadMultipleSimpleGroup() throws IOException {
	Long[] array = {1L};

	List<IndexedRecord> records = new ArrayList<>();
	for (int i = 0; i < 100; i++) {
		GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
			.set("bar", "test")
			.set("foo", i)
			.set("arr", array).build();
		records.add(record);
	}

	Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, records);
	MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	// assertFalse states the intent directly and matches the check used inside the loop.
	assertFalse(rowReader.reachEnd());

	for (long i = 0; i < 100; i++) {
		assertFalse(rowReader.reachEnd());
		Row row = rowReader.nextRecord();
		assertEquals(3, row.getArity());
		assertEquals(i, row.getField(0));
		assertEquals("test", row.getField(1));
		assertArrayEquals(array, (Long[]) row.getField(2));
	}

	assertTrue(rowReader.reachEnd());
}

Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0

5 votes

/**
 * Writes one record with a nested "bar" record and checks both the top-level and the
 * nested field after reading it back.
 */
@Test
public void testReadNestedGroup() throws IOException {
	Schema barSchema = unWrapSchema(NESTED_SCHEMA.getField("bar").schema());
	GenericData.Record bar = new GenericRecordBuilder(barSchema).set("spam", 31L).build();
	GenericData.Record written = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("foo", 32L)
		.set("bar", bar)
		.build();

	Path parquetPath =
		createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(written));
	MessageType readSchema = new AvroSchemaConverter().convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	ParquetFileReader fileReader = new ParquetFileReader(
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(parquetPath.toUri()), testConfig),
		ParquetReadOptions.builder().build());
	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row readBack = rowReader.nextRecord();
	assertEquals(7, readBack.getArity());
	assertEquals(32L, readBack.getField(0));
	Row nestedBar = (Row) readBack.getField(2);
	assertEquals(31L, nestedBar.getField(0));
	assertTrue(rowReader.reachEnd());
}

Source File: ParquetStreamingFileSinkITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Reads every record of a local Parquet file into a list.
 *
 * @param file the Parquet file to read
 * @param dataModel the Avro data model passed to the reader builder
 * @return all records in the order the reader returned them
 * @throws IOException if the file cannot be read
 */
private static <T> List<T> readParquetFile(File file, GenericData dataModel) throws IOException {
	org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(file.toURI());
	InputFile inFile = HadoopInputFile.fromPath(hadoopPath, new Configuration());

	List<T> records = new ArrayList<>();
	try (ParquetReader<T> reader = AvroParquetReader.<T>builder(inFile).withDataModel(dataModel).build()) {
		// read() returns null once the file is exhausted.
		for (T current = reader.read(); current != null; current = reader.read()) {
			records.add(current);
		}
	}

	return records;
}

Source File: ParquetStreamingFileSinkITCase.java From flink with Apache License 2.0

5 votes

/**
 * Reads every record of a local Parquet file into a list.
 *
 * @param file the Parquet file to read
 * @param dataModel the Avro data model passed to the reader builder
 * @return all records in the order the reader returned them
 * @throws IOException if the file cannot be read
 */
private static <T> List<T> readParquetFile(File file, GenericData dataModel) throws IOException {
	org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(file.toURI());
	InputFile inFile = HadoopInputFile.fromPath(hadoopPath, new Configuration());

	List<T> records = new ArrayList<>();
	try (ParquetReader<T> reader = AvroParquetReader.<T>builder(inFile).withDataModel(dataModel).build()) {
		// read() returns null once the file is exhausted.
		for (T current = reader.read(); current != null; current = reader.read()) {
			records.add(current);
		}
	}

	return records;
}

Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0

5 votes

/**
 * Writes a single simple record and checks each of its three fields after reading it back.
 */
@Test
public void testReadSimpleGroup() throws IOException {
	Long[] expectedArr = {1L};
	GenericData.Record written = new GenericRecordBuilder(SIMPLE_SCHEMA)
		.set("bar", "test")
		.set("foo", 32L)
		.set("arr", expectedArr)
		.build();

	Path parquetPath =
		createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, Collections.singletonList(written));
	MessageType readSchema = new AvroSchemaConverter().convert(SIMPLE_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	ParquetFileReader fileReader = new ParquetFileReader(
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(parquetPath.toUri()), testConfig),
		ParquetReadOptions.builder().build());
	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row readBack = rowReader.nextRecord();
	assertEquals(3, readBack.getArity());
	assertEquals(32L, readBack.getField(0));
	assertEquals("test", readBack.getField(1));
	assertArrayEquals(expectedArr, (Long[]) readBack.getField(2));
	assertTrue(rowReader.reachEnd());
}

Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0

5 votes

/**
 * Writes 100 simple records and checks that they are read back in order, with the reader
 * reporting the end only after the last one.
 */
@Test
public void testReadMultipleSimpleGroup() throws IOException {
	Long[] array = {1L};

	List<IndexedRecord> records = new ArrayList<>();
	for (int i = 0; i < 100; i++) {
		GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
			.set("bar", "test")
			.set("foo", i)
			.set("arr", array).build();
		records.add(record);
	}

	Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, records);
	MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	// assertFalse states the intent directly and matches the check used inside the loop.
	assertFalse(rowReader.reachEnd());

	for (long i = 0; i < 100; i++) {
		assertFalse(rowReader.reachEnd());
		Row row = rowReader.nextRecord();
		assertEquals(3, row.getArity());
		assertEquals(i, row.getField(0));
		assertEquals("test", row.getField(1));
		assertArrayEquals(array, (Long[]) row.getField(2));
	}

	assertTrue(rowReader.reachEnd());
}

Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0

5 votes

/**
 * Writes a single simple record and checks each of its three fields after reading it back.
 */
@Test
public void testReadSimpleGroup() throws IOException {
	Long[] expectedArr = {1L};
	GenericData.Record written = new GenericRecordBuilder(SIMPLE_SCHEMA)
		.set("bar", "test")
		.set("foo", 32L)
		.set("arr", expectedArr)
		.build();

	Path parquetPath =
		createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, Collections.singletonList(written));
	MessageType readSchema = new AvroSchemaConverter().convert(SIMPLE_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	ParquetFileReader fileReader = new ParquetFileReader(
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(parquetPath.toUri()), testConfig),
		ParquetReadOptions.builder().build());
	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row readBack = rowReader.nextRecord();
	assertEquals(3, readBack.getArity());
	assertEquals(32L, readBack.getField(0));
	assertEquals("test", readBack.getField(1));
	assertArrayEquals(expectedArr, (Long[]) readBack.getField(2));
	assertTrue(rowReader.reachEnd());
}

Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0

5 votes

/**
 * Writes one record whose "spamMap" field holds a string map and checks that the map
 * entry is read back.
 */
@Test
public void testMapGroup() throws IOException {
	Preconditions.checkState(unWrapSchema(NESTED_SCHEMA.getField("spamMap").schema())
		.getType().equals(Schema.Type.MAP));
	ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
	map.put("testKey", "testValue");

	GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("foo", 32L)
		.set("spamMap", map.build())
		.build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());

	assertEquals(32L, row.getField(0));
	Map<?, ?> result = (Map<?, ?>) row.getField(1);
	// Expected value first, so a failure message reports the values the right way round.
	assertEquals("testValue", result.get("testKey").toString());
	assertTrue(rowReader.reachEnd());
}

Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0

5 votes

/**
 * Writes one record whose "nestedMap" field holds a nested record and checks that it is
 * read back as a map of rows.
 */
@Test
public void testNestedMapGroup() throws IOException {
	Schema nestedMapSchema = unWrapSchema(NESTED_SCHEMA.getField("nestedMap").schema());
	Preconditions.checkState(nestedMapSchema.getType().equals(Schema.Type.MAP));

	Schema mapValueSchema = nestedMapSchema.getValueType();
	GenericRecord mapValue = new GenericRecordBuilder(mapValueSchema)
		.set("type", "nested")
		.set("value", "nested_value").build();

	ImmutableMap.Builder<String, GenericRecord> map = ImmutableMap.builder();
	map.put("testKey", mapValue);

	GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("nestedMap", map.build())
		.set("foo", 34L).build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());

	assertEquals(34L, row.getField(0));
	// Wildcard map instead of a raw type; only lookups are needed here.
	Map<?, ?> result = (Map<?, ?>) row.getField(5);

	Row nestedRow = (Row) result.get("testKey");
	assertEquals("nested", nestedRow.getField(0));
	assertEquals("nested_value", nestedRow.getField(1));
}

Source File: ParquetStreamingFileSinkITCase.java From flink with Apache License 2.0

5 votes

/**
 * Reads every record of a local Parquet file into a list.
 *
 * @param file the Parquet file to read
 * @param dataModel the Avro data model passed to the reader builder
 * @return all records in the order the reader returned them
 * @throws IOException if the file cannot be read
 */
private static <T> List<T> readParquetFile(File file, GenericData dataModel) throws IOException {
	org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(file.toURI());
	InputFile inFile = HadoopInputFile.fromPath(hadoopPath, new Configuration());

	List<T> records = new ArrayList<>();
	try (ParquetReader<T> reader = AvroParquetReader.<T>builder(inFile).withDataModel(dataModel).build()) {
		// read() returns null once the file is exhausted.
		for (T current = reader.read(); current != null; current = reader.read()) {
			records.add(current);
		}
	}

	return records;
}

Source File: ParquetInputFormat.java From flink with Apache License 2.0

5 votes

/**
 * Opens the Parquet file behind {@code split} and prepares a record reader for it.
 *
 * <p>When the split is flagged to be skipped, no record reader is created and the file
 * reader opened here is closed again, because nothing else holds a reference to it.
 *
 * @param split the file split to open
 * @throws IOException if the Parquet file cannot be opened or the reader cannot be initialized
 */
@Override
public void open(FileInputSplit split) throws IOException {
	// reset the flag when open a new split
	this.skipThisSplit = false;
	org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(split.getPath().toUri()), configuration);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);
	MessageType fileSchema = fileReader.getFileMetaData().getSchema();
	// NOTE(review): getReadSchema presumably sets skipThisSplit on a schema mismatch - confirm.
	MessageType readSchema = getReadSchema(fileSchema, split.getPath());
	if (skipThisSplit) {
		LOG.warn(String.format(
			"Escaped the file split [%s] due to mismatch of file schema to expected result schema",
			split.getPath().toString()));
		// The file reader is not handed to a record reader on this path, so release it here.
		fileReader.close();
	} else {
		this.parquetRecordReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema,
			filterPredicate == null ? FilterCompat.NOOP : FilterCompat.get(filterPredicate));
		this.parquetRecordReader.initialize(fileReader, configuration);
		this.parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord);

		// Register the metric counter only once per format instance.
		if (this.recordConsumed == null) {
			this.recordConsumed = getRuntimeContext().getMetricGroup().counter("parquet-records-consumed");
		}

		LOG.debug(String.format("Open ParquetInputFormat with FileInputSplit [%s]", split.getPath().toString()));
	}
}

Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0

4 votes

/**
 * Writes one record whose "nestedArray" field holds a single nested record and checks
 * that it is read back as an array containing one row.
 */
@Test
public void testNestedArrayGroup() throws IOException {
	Schema arraySchema = unWrapSchema(NESTED_SCHEMA.getField("nestedArray").schema());
	Preconditions.checkState(arraySchema.getType().equals(Schema.Type.ARRAY));

	GenericRecord element = new GenericRecordBuilder(arraySchema.getElementType())
		.set("type", "nested")
		.set("value", 1L)
		.build();

	ImmutableList.Builder<GenericRecord> elements = ImmutableList.builder();
	elements.add(element);

	GenericRecord written = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("nestedArray", elements.build())
		.set("foo", 34L)
		.build();

	Path parquetPath =
		createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(written));
	MessageType readSchema = new AvroSchemaConverter().convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	ParquetFileReader fileReader = new ParquetFileReader(
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(parquetPath.toUri()), testConfig),
		ParquetReadOptions.builder().build());
	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row readBack = rowReader.nextRecord();
	assertEquals(7, readBack.getArity());
	assertEquals(34L, readBack.getField(0));

	Object[] nestedArray = (Object[]) readBack.getField(6);
	assertEquals(1, nestedArray.length);

	Row firstElement = (Row) nestedArray[0];
	assertEquals("nested", firstElement.getField(0));
	assertEquals(1L, firstElement.getField(1));
}

Source File: AvroParquetReader.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Creates a builder for the given input file by delegating to the superclass constructor.
 *
 * @param file the file to read
 */
private Builder(InputFile file) {
  super(file);
}

Source File: AvroParquetReader.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Creates a reader builder for the given input file.
 *
 * @param file the file to read
 * @param <T> the record type the resulting reader produces
 * @return a new builder bound to {@code file}
 */
public static <T> Builder<T> builder(InputFile file) {
  return new Builder<>(file);
}

Source File: ShowColumnIndexCommand.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Prints the column index and/or offset index of each selected row group of one Parquet file.
 *
 * <p>When neither index kind was requested, both are printed. When no row-group indexes
 * were given, every row group is printed.
 *
 * @return always 0
 * @throws IOException if the file cannot be opened or an index cannot be read
 * @throws IllegalArgumentException if not exactly one file was supplied
 */
@Override
public int run() throws IOException {
  Preconditions.checkArgument(files != null && files.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(files.size() == 1,
      "Cannot process multiple Parquet files.");

  InputFile in = HadoopInputFile.fromPath(qualifiedPath(files.get(0)), getConf());

  // Nothing requested explicitly means "show everything".
  if (!showColumnIndex && !showOffsetIndex) {
    showColumnIndex = true;
    showOffsetIndex = true;
  }

  Set<String> selectedRowGroups = new HashSet<>();
  if (rowGroupIndexes != null) {
    selectedRowGroups.addAll(rowGroupIndexes);
  }

  try (ParquetFileReader reader = ParquetFileReader.open(in)) {
    boolean printedAny = false;
    int rowGroupIndex = 0;
    for (BlockMetaData block : reader.getFooter().getBlocks()) {
      // An empty selection matches every row group; indexes are compared as strings.
      boolean selected = selectedRowGroups.isEmpty()
          || selectedRowGroups.contains(Integer.toString(rowGroupIndex));
      if (selected) {
        // Blank line between consecutive row groups, but not before the first one.
        if (printedAny) {
          console.info("");
        }
        printedAny = true;
        console.info("row-group {}:", rowGroupIndex);
        for (ColumnChunkMetaData column : getColumns(block)) {
          String path = column.getPath().toDotString();
          if (showColumnIndex) {
            console.info("column index for column {}:", path);
            ColumnIndex columnIndex = reader.readColumnIndex(column);
            console.info(columnIndex == null ? "NONE" : columnIndex.toString());
          }
          if (showOffsetIndex) {
            console.info("offset index for column {}:", path);
            OffsetIndex offsetIndex = reader.readOffsetIndex(column);
            console.info(offsetIndex == null ? "NONE" : offsetIndex.toString());
          }
        }
      }
      ++rowGroupIndex;
    }
  }
  return 0;
}

Source File: TestDataPageV1Checksums.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Test whether corruption in the page content is detected by checksum verification.
 *
 * <p>The file is read fully, two bytes are altered, and the file is rewritten. Each stream
 * is closed by its own try-with-resources block before the next step, so the rewritten
 * file is complete on disk before it is read back.
 */
@Test
public void testCorruptedPage() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  InputFile inputFile = HadoopInputFile.fromPath(path, conf);
  int fileLen = (int) inputFile.getLength();
  byte[] fileBytes = new byte[fileLen];
  try (SeekableInputStream inputStream = inputFile.newStream()) {
    inputStream.readFully(fileBytes);
  }

  // There are 4 pages in total (2 per column), we corrupt the first page of the first column
  // and the second page of the second column. We do this by altering a byte roughly in the
  // middle of each page to be corrupted
  fileBytes[fileLen / 8]++;
  fileBytes[fileLen / 8 + ((fileLen / 4) * 3)]++;

  OutputFile outputFile = HadoopOutputFile.fromPath(path, conf);
  try (PositionOutputStream outputStream = outputFile.createOrOverwrite(1024 * 1024)) {
    outputStream.write(fileBytes);
  }

  // First we disable checksum verification, the corruption will go undetected as it is in the
  // data section of the page
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
  try (ParquetFileReader reader = getParquetFileReader(path, conf,
    Arrays.asList(colADesc, colBDesc))) {
    PageReadStore pageReadStore = reader.readNextRowGroup();

    DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
    assertFalse("Data in page was not corrupted",
      Arrays.equals(colAPage1.getBytes().toByteArray(), colAPage1Bytes));
    readNextPage(colADesc, pageReadStore);
    readNextPage(colBDesc, pageReadStore);
    DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
    assertFalse("Data in page was not corrupted",
      Arrays.equals(colBPage2.getBytes().toByteArray(), colBPage2Bytes));
  }

  // Now we enable checksum verification, the corruption should be detected
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
  try (ParquetFileReader reader =
         getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
    // We expect an exception on the first encountered corrupt page (in readAllPages)
    assertVerificationFailed(reader);
  }
}

Source File: CompressionConverter.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Creates a reader for the given file by delegating to the superclass constructor.
 *
 * @param file the file to read
 * @param options the read options passed through to the superclass
 * @throws IOException if the superclass constructor fails
 */
public TransParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
  super(file, options);
}

org.apache.parquet.io.InputFile Java Examples