org.apache.parquet.avro.AvroParquetReader Java Examples
The following examples show how to use org.apache.parquet.avro.AvroParquetReader. Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
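Every example below follows the same basic pattern: build a ParquetReader (via AvroParquetReader's fluent builder or one of its older constructors), call read() in a loop until it returns null, and close the reader. For orientation, here is a minimal, self-contained sketch of that pattern. It is not taken from any of the projects below, and the file path is a placeholder.

import java.io.IOException;

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;

public class AvroParquetReadSketch {

    public static void main(final String[] args) throws IOException {
        // Placeholder path; point this at a real Parquet file.
        final Path path = new Path("/tmp/users.parquet");

        // The builder yields a ParquetReader that materializes rows as Avro GenericRecords.
        try (ParquetReader<GenericRecord> reader = AvroParquetReader
                .<GenericRecord>builder(path)
                .withConf(new Configuration())
                .build()) {
            GenericRecord record;
            // read() returns the next record, or null once the file is exhausted.
            while ((record = reader.read()) != null) {
                System.out.println(record);
            }
        }
    }
}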
Example #1
Source File: ParquetRecordReader.java From nifi with Apache License 2.0
public ParquetRecordReader(final InputStream inputStream, final long inputLength, final Configuration configuration) throws IOException {
    if (inputLength < 0) {
        throw new IllegalArgumentException("Invalid input length of '" + inputLength + "'. This record reader requires knowing "
                + "the length of the InputStream and cannot be used in some cases where the length may not be known.");
    }

    this.inputStream = inputStream;

    inputFile = new NifiParquetInputFile(inputStream, inputLength);
    parquetReader = AvroParquetReader.<GenericRecord>builder(inputFile).withConf(configuration).build();

    // Read the first record so that we can extract the schema
    lastParquetRecord = parquetReader.read();
    if (lastParquetRecord == null) {
        throw new EOFException("Unable to obtain schema because no records were available");
    }

    // Convert Avro schema to RecordSchema
    recordSchema = AvroTypeUtil.createSchema(lastParquetRecord.getSchema());
}
Example #2
Source File: ParquetReader.java From reef with Apache License 2.0
/**
 * Serialize Avro data to an in-memory ByteBuffer.
 * @return A ByteBuffer that contains Avro data.
 * @throws IOException if the parquet file couldn't be parsed correctly.
 */
public ByteBuffer serializeToByteBuffer() throws IOException {
  final ByteArrayOutputStream stream = new ByteArrayOutputStream();
  final Encoder encoder = EncoderFactory.get().binaryEncoder(stream, null);
  final DatumWriter writer = new GenericDatumWriter<GenericRecord>();
  writer.setSchema(createAvroSchema());
  final AvroParquetReader<GenericRecord> reader = createAvroReader();

  GenericRecord record = reader.read();
  while (record != null) {
    writer.write(record, encoder);
    record = reader.read();
  }

  try {
    reader.close();
  } catch (IOException ex) {
    LOG.log(Level.SEVERE, ex.getMessage());
    throw ex;
  }

  encoder.flush();
  final ByteBuffer buf = ByteBuffer.wrap(stream.toByteArray());
  buf.order(ByteOrder.LITTLE_ENDIAN);
  return buf;
}
Example #3
Source File: ParquetFileSystemDatasetReader.java From kite with Apache License 2.0
@Override
public void initialize() {
  Preconditions.checkState(state.equals(ReaderWriterState.NEW),
      "A reader may not be opened more than once - current state:%s", state);

  LOG.debug("Opening reader on path:{}", path);

  try {
    final Configuration conf = fileSystem.getConf();
    AvroReadSupport.setAvroReadSchema(conf, readerSchema);
    reader = new AvroParquetReader<E>(
        conf, fileSystem.makeQualified(path));
  } catch (IOException e) {
    throw new DatasetIOException("Unable to create reader path:" + path, e);
  }

  advance();

  state = ReaderWriterState.OPEN;
}
Example #4
Source File: PutParquetTest.java From nifi with Apache License 2.0
private void verifyAvroParquetUsers(final Path avroParquetUsers, final int numExpectedUsers) throws IOException {
    final ParquetReader.Builder<GenericRecord> readerBuilder = AvroParquetReader
            .<GenericRecord>builder(avroParquetUsers)
            .withConf(testConf);

    int currUser = 0;

    try (final ParquetReader<GenericRecord> reader = readerBuilder.build()) {
        GenericRecord nextRecord;
        while ((nextRecord = reader.read()) != null) {
            Assert.assertNotNull(nextRecord);
            Assert.assertEquals("name" + currUser, nextRecord.get("name").toString());
            Assert.assertEquals(currUser, nextRecord.get("favorite_number"));
            Assert.assertEquals("blue" + currUser, nextRecord.get("favorite_color").toString());
            currUser++;
        }
    }

    Assert.assertEquals(numExpectedUsers, currUser);
}
Example #5
Source File: ParquetHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0
private List<TestRecord> readParquetFilesAvro(File outputFile) throws IOException {
  ParquetReader<org.apache.gobblin.test.avro.TestRecord> reader = null;
  List<TestRecord> records = new ArrayList<>();
  try {
    reader = new AvroParquetReader<>(new Path(outputFile.toString()));
    for (org.apache.gobblin.test.avro.TestRecord value = reader.read(); value != null; value = reader.read()) {
      records.add(new TestRecord(value.getPartition(), value.getSequence(), value.getPayload()));
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records;
}
Example #6
Source File: ParquetIO.java From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext processContext) throws Exception {
  FileIO.ReadableFile file = processContext.element();

  if (!file.getMetadata().isReadSeekEfficient()) {
    ResourceId filename = file.getMetadata().resourceId();
    throw new RuntimeException(String.format("File has to be seekable: %s", filename));
  }

  SeekableByteChannel seekableByteChannel = file.openSeekable();

  AvroParquetReader.Builder builder =
      AvroParquetReader.<GenericRecord>builder(new BeamParquetInputFile(seekableByteChannel));
  if (modelClass != null) {
    // all GenericData implementations have a static get method
    builder = builder.withDataModel((GenericData) modelClass.getMethod("get").invoke(null));
  }

  try (ParquetReader<GenericRecord> reader = builder.build()) {
    GenericRecord read;
    while ((read = reader.read()) != null) {
      processContext.output(read);
    }
  }
}
Example #7
Source File: LargeInputFileIT.java From datacollector with Apache License 2.0
public void validateParquetFile(Path parquetFile, long recordCount) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
      .build();

  for (long i = 0; i < recordCount; i++) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + i, actualRow);

    Assert.assertEquals("Value different in row " + i + " for key b", actualRow.get("b"), i % 2 == 0);
    Assert.assertEquals("Value different in row " + i + " for key s", actualRow.get("s"), new Utf8(String.valueOf(i)));
    Assert.assertEquals("Value different in row " + i + " for key l", actualRow.get("l"), i);
    Assert.assertEquals("Value different in row " + i + " for key l100", actualRow.get("l100"), i % 100);
    Assert.assertEquals("Value different in row " + i + " for key s100", actualRow.get("s100"), new Utf8(String.valueOf(i % 100)));
  }

  Assert.assertNull("Parquet file contains more rows than expected", reader.read());
}
Example #8
Source File: BaseAvroParquetConvertIT.java From datacollector with Apache License 2.0
public void validateParquetFile(Path parquetFile, List<Map<String, Object>> data) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
      .build();

  int position = 0;
  for (Map<String, Object> expectedRow : data) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + position, actualRow);

    for (Map.Entry<String, Object> entry : expectedRow.entrySet()) {
      Object value = actualRow.get(entry.getKey());
      Assert.assertEquals("Different value on row " + position + " for key " + entry.getKey(), entry.getValue(), value);
    }
    position++;
  }

  Assert.assertNull("Parquet file contains more rows than expected", reader.read());
}
Example #9
Source File: MiniDfsResource.java From components with Apache License 2.0
/**
 * Tests that a file on the HDFS cluster contains the given Parquet records.
 *
 * @param path the name of the file on the HDFS cluster
 * @param expected the expected Avro records in the file.
 */
public static void assertReadParquetFile(FileSystem fs, String path, Set<IndexedRecord> expected, boolean part) throws IOException {
    Path p = new Path(path);
    if (fs.isFile(p)) {
        try (AvroParquetReader<GenericRecord> reader = new AvroParquetReader<GenericRecord>(fs.getConf(), new Path(path))) {
            IndexedRecord record = null;
            while (null != (record = reader.read())) {
                IndexedRecord eqRecord = null;
                for (IndexedRecord indexedRecord : expected) {
                    if (indexedRecord.equals(record)) {
                        eqRecord = indexedRecord;
                        break;
                    }
                }
                expected.remove(eqRecord);
            }
        }
        // Check before asserting for the message.
        if (!part && expected.size() != 0)
            assertThat("Not all avro records found: " + expected.iterator().next(), expected, hasSize(0));
    } else if (fs.isDirectory(p)) {
        for (FileStatus fstatus : FileSystemUtil.listSubFiles(fs, p)) {
            assertReadParquetFile(fs, fstatus.getPath().toString(), expected, true);
        }
        // Check before asserting for the message.
        if (expected.size() != 0)
            assertThat("Not all avro records found: " + expected.iterator().next(), expected, hasSize(0));
    } else {
        fail("No such path: " + path);
    }
}
Example #10
Source File: TestParquetRecordSetWriter.java From nifi with Apache License 2.0
private void verifyParquetRecords(final File parquetFile, final int expectedRecordCount) throws IOException {
    final Configuration conf = new Configuration();
    final Path path = new Path(parquetFile.getPath());
    final InputFile inputFile = HadoopInputFile.fromPath(path, conf);

    try (final ParquetReader<GenericRecord> reader =
            AvroParquetReader.<GenericRecord>builder(inputFile).withConf(conf).build()) {
        int recordCount = 0;
        while (reader.read() != null) {
            recordCount++;
        }
        assertEquals(expectedRecordCount, recordCount);
    }
}
Example #11
Source File: ParquetUtils.java From incubator-pinot with Apache License 2.0
/**
 * Returns a ParquetReader with the given path.
 */
public static ParquetReader<GenericRecord> getParquetReader(Path path) throws IOException {
  //noinspection unchecked
  return AvroParquetReader.<GenericRecord>builder(path).disableCompatibility().withDataModel(GenericData.get())
      .withConf(getConfiguration()).build();
}
Example #12
Source File: AvroParquetFileReaderWriterFactory.java From secor with Apache License 2.0
public AvroParquetFileReader(LogFilePath logFilePath, CompressionCodec codec) throws IOException {
    Path path = new Path(logFilePath.getLogFilePath());
    topic = logFilePath.getTopic();
    Schema schema = schemaRegistry.getSchema(topic);
    reader = AvroParquetReader.<GenericRecord>builder(path).build();
    writer = new SpecificDatumWriter(schema);
    offset = logFilePath.getOffset();
}
Example #13
Source File: ParquetStreamingFileSinkITCase.java From flink with Apache License 2.0
private static <T> List<T> readParquetFile(File file, GenericData dataModel) throws IOException {
    InputFile inFile = HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(file.toURI()), new Configuration());

    ArrayList<T> results = new ArrayList<>();
    try (ParquetReader<T> reader = AvroParquetReader.<T>builder(inFile).withDataModel(dataModel).build()) {
        T next;
        while ((next = reader.read()) != null) {
            results.add(next);
        }
    }

    return results;
}
Example #14
Source File: ConvertCsvToParquetFileExpressionProcessorTests.java From vividus with Apache License 2.0
private GenericRecord readActualRecord(String parquetPath) throws IOException {
    try (ParquetReader<GenericRecord> reader = AvroParquetReader
            .<GenericRecord>builder(
                HadoopInputFile.fromPath(new Path(new File(parquetPath).toURI()), new Configuration()))
            .build()) {
        return reader.read();
    }
}
Example #15
Source File: TestParquetInLining.java From hudi with Apache License 2.0
@Test
public void testSimpleInlineFileSystem() throws IOException {
  Path outerInMemFSPath = getRandomOuterInMemPath();
  Path outerPath = new Path(FILE_SCHEME + outerInMemFSPath.toString().substring(outerInMemFSPath.toString().indexOf(':')));
  generatedPath = outerPath;
  ParquetWriter inlineWriter = new AvroParquetWriter(outerInMemFSPath, HoodieTestDataGenerator.AVRO_SCHEMA,
      CompressionCodecName.GZIP, 100 * 1024 * 1024, 1024 * 1024, true, inMemoryConf);
  // write a few records
  List<GenericRecord> recordsToWrite = getParquetHoodieRecords();
  for (GenericRecord rec : recordsToWrite) {
    inlineWriter.write(rec);
  }
  inlineWriter.close();
  byte[] inlineBytes = getBytesToInline(outerInMemFSPath);
  long startOffset = generateOuterFile(outerPath, inlineBytes);

  long inlineLength = inlineBytes.length;

  // Generate phantom inline file
  Path inlinePath = getPhantomFile(outerPath, startOffset, inlineLength);

  // instantiate Parquet reader
  ParquetReader inLineReader = AvroParquetReader.builder(inlinePath).withConf(inlineConf).build();
  List<GenericRecord> records = readParquetGenericRecords(inLineReader);
  assertArrayEquals(recordsToWrite.toArray(), records.toArray());
  inLineReader.close();
}
Example #16
Source File: ParquetUtils.java From hudi with Apache License 2.0
/**
 * Fetch {@link HoodieKey}s from the given parquet file.
 *
 * @param filePath      The parquet file path.
 * @param configuration configuration to build fs object
 * @return {@link List} of {@link HoodieKey}s fetched from the parquet file
 */
public static List<HoodieKey> fetchRecordKeyPartitionPathFromParquet(Configuration configuration, Path filePath) {
  List<HoodieKey> hoodieKeys = new ArrayList<>();
  try {
    if (!filePath.getFileSystem(configuration).exists(filePath)) {
      return new ArrayList<>();
    }

    Configuration conf = new Configuration(configuration);
    conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
    Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema();
    AvroReadSupport.setAvroReadSchema(conf, readSchema);
    AvroReadSupport.setRequestedProjection(conf, readSchema);
    ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build();
    Object obj = reader.read();
    while (obj != null) {
      if (obj instanceof GenericRecord) {
        String recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
        String partitionPath = ((GenericRecord) obj).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
        hoodieKeys.add(new HoodieKey(recordKey, partitionPath));
        obj = reader.read();
      }
    }
  } catch (IOException e) {
    throw new HoodieIOException("Failed to read from Parquet file " + filePath, e);
  }
  return hoodieKeys;
}
Example #17
Source File: ParquetUtils.java From hudi with Apache License 2.0
/**
 * Read the rowKey list matching the given filter, from the given parquet file. If the filter is empty, then this will
 * return all the rowkeys.
 *
 * @param filePath      The parquet file path.
 * @param configuration configuration to build fs object
 * @param filter        record keys filter
 * @param readSchema    schema of columns to be read
 * @return Set of row keys matching candidateRecordKeys
 */
private static Set<String> filterParquetRowKeys(Configuration configuration, Path filePath, Set<String> filter,
                                                Schema readSchema) {
  Option<RecordKeysFilterFunction> filterFunction = Option.empty();
  if (filter != null && !filter.isEmpty()) {
    filterFunction = Option.of(new RecordKeysFilterFunction(filter));
  }
  Configuration conf = new Configuration(configuration);
  conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
  AvroReadSupport.setAvroReadSchema(conf, readSchema);
  AvroReadSupport.setRequestedProjection(conf, readSchema);
  Set<String> rowKeys = new HashSet<>();
  try (ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build()) {
    Object obj = reader.read();
    while (obj != null) {
      if (obj instanceof GenericRecord) {
        String recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
        if (!filterFunction.isPresent() || filterFunction.get().apply(recordKey)) {
          rowKeys.add(recordKey);
        }
      }
      obj = reader.read();
    }
  } catch (IOException e) {
    throw new HoodieIOException("Failed to read row keys from Parquet " + filePath, e);
  }
  // ignore
  return rowKeys;
}
Example #18
Source File: ParquetFileReader.java From kafka-connect-fs with Apache License 2.0
private ParquetReader<GenericRecord> initReader() throws IOException {
    Configuration configuration = getFs().getConf();
    if (this.schema != null) {
        AvroReadSupport.setAvroReadSchema(configuration, this.schema);
    }
    if (this.projection != null) {
        AvroReadSupport.setRequestedProjection(configuration, this.projection);
    }
    return AvroParquetReader
            .<GenericRecord>builder(HadoopInputFile.fromPath(getFilePath(), configuration))
            .build();
}
Example #19
Source File: DataLoad.java From arvo2parquet with MIT License
private static void readFromParquet(@Nonnull final Path filePathToRead) throws IOException {
  try (final ParquetReader<GenericData.Record> reader = AvroParquetReader
          .<GenericData.Record>builder(nioPathToInputFile(filePathToRead))
          .withConf(new Configuration())
          .build()) {
    GenericData.Record record;
    while ((record = reader.read()) != null) {
      System.out.println(record);
    }
  }
}
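The nioPathToInputFile(...) helper in this example is project-specific and not shown on this page. As a hypothetical sketch of how such an adapter could be written (an assumption, not the arvo2parquet project's actual code), a java.nio.file.Path can be wrapped in Parquet's InputFile interface using DelegatingSeekableInputStream from parquet-common:

import java.io.IOException;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.parquet.io.DelegatingSeekableInputStream;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.SeekableInputStream;

// Hypothetical adapter; illustrates one way a helper like nioPathToInputFile could work.
final class NioInputFile implements InputFile {

    private final Path path;

    NioInputFile(final Path path) {
        this.path = path;
    }

    @Override
    public long getLength() throws IOException {
        return Files.size(path);
    }

    @Override
    public SeekableInputStream newStream() throws IOException {
        final SeekableByteChannel channel = Files.newByteChannel(path);
        // Delegate sequential reads to the channel's InputStream view and
        // implement seeking by repositioning the underlying channel.
        return new DelegatingSeekableInputStream(Channels.newInputStream(channel)) {
            @Override
            public long getPos() throws IOException {
                return channel.position();
            }

            @Override
            public void seek(final long newPos) throws IOException {
                channel.position(newPos);
            }
        };
    }
}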
Example #20
Source File: ParquetStreamingFileSinkITCase.java From Flink-CEPplus with Apache License 2.0
private static <T> List<T> readParquetFile(File file, GenericData dataModel) throws IOException {
    InputFile inFile = HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(file.toURI()), new Configuration());

    ArrayList<T> results = new ArrayList<>();
    try (ParquetReader<T> reader = AvroParquetReader.<T>builder(inFile).withDataModel(dataModel).build()) {
        T next;
        while ((next = reader.read()) != null) {
            results.add(next);
        }
    }

    return results;
}
Example #21
Source File: HoodieParquetReader.java From hudi with Apache License 2.0
@Override
public Iterator<R> getRecordIterator(Schema schema) throws IOException {
  AvroReadSupport.setAvroReadSchema(conf, schema);
  ParquetReader<IndexedRecord> reader = AvroParquetReader.<IndexedRecord>builder(path).withConf(conf).build();
  return new ParquetReaderIterator(reader);
}
Example #22
Source File: FetchParquet.java From nifi with Apache License 2.0
@Override
public HDFSRecordReader createHDFSRecordReader(final ProcessContext context, final FlowFile flowFile, final Configuration conf, final Path path)
        throws IOException {
    final ParquetReader.Builder<GenericRecord> readerBuilder = AvroParquetReader.<GenericRecord>builder(path).withConf(conf);
    return new AvroParquetHDFSRecordReader(readerBuilder.build());
}
Example #23
Source File: ParquetReader.java From reef with Apache License 2.0
/**
 * Construct an Avro reader from a Parquet file.
 * @return Avro reader based on the provided Parquet file.
 * @throws IOException if the parquet file couldn't be parsed correctly.
 */
private AvroParquetReader<GenericRecord> createAvroReader() throws IOException {
  return new AvroParquetReader<GenericRecord>(parquetFilePath);
}
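The single-argument AvroParquetReader constructor used here has been deprecated in recent parquet-avro releases in favor of the builder API shown in the other examples. A builder-based sketch of the same method could look like the following; note that the declared return type widens to ParquetReader<GenericRecord>, since the builder does not return the AvroParquetReader subclass.

private ParquetReader<GenericRecord> createAvroReader() throws IOException {
  // Equivalent construction via the builder; parquetFilePath is the same Hadoop Path as above.
  return AvroParquetReader.<GenericRecord>builder(parquetFilePath).build();
}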