Java Code Examples for org.apache.parquet.hadoop.ParquetReader#read()

The following examples show how to use org.apache.parquet.hadoop.ParquetReader#read() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestThriftToParquetFileWriter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
@Test
public void testWriteFileMapOfList() throws IOException, InterruptedException, TException {
  Map<String, List<String>> map = new HashMap<String,List<String>>();
  map.put("key", Arrays.asList("val1","val2"));
  final TestListInMap mapList = new TestListInMap("maplist", map);
  final Path fileToCreate = createFile(mapList);

  ParquetReader<Group> reader = createRecordReader(fileToCreate);

  Group g = null;
  while((g = reader.read()) != null) {
    assertEquals("key",
        g.getGroup("names", 0).getGroup("map",0).getBinary("key", 0).toStringUsingUTF8());
    assertEquals(map.get("key").size(),
        g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getFieldRepetitionCount(0));
  }
}
 
Example 2
Source File: TestFiltersWithMissingColumns.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
public static long countFilteredRecords(Path path, FilterPredicate pred) throws IOException{
  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), path)
      .withFilter(FilterCompat.get(pred))
      .build();

  long count = 0;
  try {
    while (reader.read() != null) {
      count += 1;
    }
  } finally {
    reader.close();
  }
  return count;
}
 
Example 3
Source File: ReadBenchmarks.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
private void read(Path parquetFile, int nRows, Blackhole blackhole) throws IOException
{
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), parquetFile).withConf(configuration).build();
  for (int i = 0; i < nRows; i++) {
    Group group = reader.read();
    blackhole.consume(group.getBinary("binary_field", 0));
    blackhole.consume(group.getInteger("int32_field", 0));
    blackhole.consume(group.getLong("int64_field", 0));
    blackhole.consume(group.getBoolean("boolean_field", 0));
    blackhole.consume(group.getFloat("float_field", 0));
    blackhole.consume(group.getDouble("double_field", 0));
    blackhole.consume(group.getBinary("flba_field", 0));
    blackhole.consume(group.getInt96("int96_field", 0));
  }
  reader.close();
}
 
Example 4
Source File: TestThriftToParquetFileWriter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
@Test
public void testWriteFileListOfMap() throws IOException, InterruptedException, TException {
  Map<String, String> map1 = new HashMap<String,String>();
  map1.put("key11", "value11");
  map1.put("key12", "value12");
  Map<String, String> map2 = new HashMap<String,String>();
  map2.put("key21", "value21");
  final TestMapInList listMap = new TestMapInList("listmap",
      Arrays.asList(map1, map2));

  final Path fileToCreate = createFile(listMap);

  ParquetReader<Group> reader = createRecordReader(fileToCreate);

  Group g = null;
  while((g = reader.read()) != null) {
    assertEquals(listMap.names.size(),
        g.getGroup("names", 0).getFieldRepetitionCount("names_tuple"));
    assertEquals(listMap.names.get(0).size(),
        g.getGroup("names", 0).getGroup("names_tuple", 0).getFieldRepetitionCount("map"));
    assertEquals(listMap.names.get(1).size(),
        g.getGroup("names", 0).getGroup("names_tuple", 1).getFieldRepetitionCount("map"));
  }
}
 
Example 5
Source File: TestThriftToParquetFileWriter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
@Test
public void testWriteFileMapOfLists() throws IOException, InterruptedException, TException {
  Map<List<String>, List<String>> map = new HashMap<List<String>,List<String>>();
  map.put(Arrays.asList("key1","key2"), Arrays.asList("val1","val2"));
  final TestListsInMap mapList = new TestListsInMap("maplists", map);
  final Path fileToCreate = createFile(mapList);

  ParquetReader<Group> reader = createRecordReader(fileToCreate);

  Group g = null;
  while((g = reader.read()) != null) {
    assertEquals("key1",
        g.getGroup("names", 0).getGroup("map",0).getGroup("key", 0).getBinary("key_tuple", 0).toStringUsingUTF8());
    assertEquals("key2",
        g.getGroup("names", 0).getGroup("map",0).getGroup("key", 0).getBinary("key_tuple", 1).toStringUsingUTF8());
    assertEquals("val1",
        g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getBinary("value_tuple", 0).toStringUsingUTF8());
    assertEquals("val2",
        g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getBinary("value_tuple", 1).toStringUsingUTF8());
  }
}
 
Example 6
Source File: ParquetHdfsDataWriterTest.java    From incubator-gobblin with Apache License 2.0 6 votes vote down vote up
protected List<TestRecord> readParquetFilesGroup(File outputFile)
    throws IOException {
  ParquetReader<Group> reader = null;
  List<Group> records = new ArrayList<>();
  try {
    reader = new ParquetReader<>(new Path(outputFile.toString()), new SimpleReadSupport());
    for (Group value = reader.read(); value != null; value = reader.read()) {
      records.add(value);
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records.stream().map(value -> new TestRecord(
      value.getInteger(TestConstants.PARTITION_FIELD_NAME, 0),
      value.getLong(TestConstants.SEQUENCE_FIELD_NAME, 0),
      value.getString(TestConstants.PAYLOAD_FIELD_NAME, 0)
  )).collect(Collectors.toList());
}
 
Example 7
Source File: TestBackwardCompatibility.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
@Test
public void testCompatStringCompatibility() throws IOException {
  // some older versions of Parquet used avro.schema instead of
  // parquet.avro.schema and didn't annotate binary with UTF8 when the type
  // was converted from an Avro string. this validates that the old read
  // schema is recognized and used to read the file as expected.
  Path testFile = new Path(Resources.getResource("strings-2.parquet").getFile());
  Configuration conf = new Configuration();
  ParquetReader<GenericRecord> reader = AvroParquetReader
      .builder(new AvroReadSupport<GenericRecord>(), testFile)
      .withConf(conf)
      .build();
  GenericRecord r;
  while ((r = reader.read()) != null) {
    Assert.assertTrue("Should read value into a String",
        r.get("text") instanceof String);
  }
}
 
Example 8
Source File: ParquetHdfsDataWriterTest.java    From incubator-gobblin with Apache License 2.0 6 votes vote down vote up
private List<TestRecord> readParquetFilesAvro(File outputFile)
    throws IOException {
  ParquetReader<org.apache.gobblin.test.avro.TestRecord> reader = null;
  List<TestRecord> records = new ArrayList<>();
  try {
    reader = new AvroParquetReader<>(new Path(outputFile.toString()));
    for (org.apache.gobblin.test.avro.TestRecord value = reader.read(); value != null; value = reader.read()) {
      records.add(new TestRecord(value.getPartition(),
          value.getSequence(),
          value.getPayload()));
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records;

}
 
Example 9
Source File: BaseAvroParquetConvertIT.java    From datacollector with Apache License 2.0 6 votes vote down vote up
public void validateParquetFile(Path parquetFile, List<Map<String, Object>> data) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
    .build();

  int position = 0;
  for(Map<String, Object> expectedRow : data) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + position, actualRow);

    for(Map.Entry<String, Object> entry : expectedRow.entrySet()) {
      Object value = actualRow.get(entry.getKey());
      Assert.assertEquals("Different value on row " + position + " for key " + entry.getKey(), entry.getValue(), value);
    }
  }

  Assert.assertNull("Parquet file contains more then expected rows", reader.read());
}
 
Example 10
Source File: LargeInputFileIT.java    From datacollector with Apache License 2.0 6 votes vote down vote up
public void validateParquetFile(Path parquetFile, long recourdCount) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
    .build();

  for(long i = 0; i < recourdCount; i++) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + i, actualRow);

    Assert.assertEquals("Value different in row " + i + " for key b", actualRow.get("b"), i % 2 == 0);
    Assert.assertEquals("Value different in row " + i + " for key s", actualRow.get("s"), new Utf8(String.valueOf(i)));
    Assert.assertEquals("Value different in row " + i + " for key l", actualRow.get("l"), i);
    Assert.assertEquals("Value different in row " + i + " for key l100", actualRow.get("l100"), i%100);
    Assert.assertEquals("Value different in row " + i + " for key s100", actualRow.get("s100"), new Utf8(String.valueOf(i % 100)));
  }

  Assert.assertNull("Parquet file contains more then expected rows", reader.read());
}
 
Example 11
Source File: TestThriftToParquetFileWriter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
@Test
public void testWriteFile() throws IOException, InterruptedException, TException {
  final AddressBook a = new AddressBook(
      Arrays.asList(
          new Person(
              new Name("Bob", "Roberts"),
              0,
              "bob.roberts@example.com",
              Arrays.asList(new PhoneNumber("1234567890")))));

  final Path fileToCreate = createFile(a);

  ParquetReader<Group> reader = createRecordReader(fileToCreate);

  Group g = null;
  int i = 0;
  while((g = reader.read()) != null) {
    assertEquals(a.persons.size(), g.getFieldRepetitionCount("persons"));
    assertEquals(a.persons.get(0).email, g.getGroup("persons", 0).getGroup(0, 0).getString("email", 0));
    // just some sanity check, we're testing the various layers somewhere else
    ++i;
  }
  assertEquals("read 1 record", 1, i);

}
 
Example 12
Source File: TestParquetInLining.java    From hudi with Apache License 2.0 5 votes vote down vote up
static List<GenericRecord> readParquetGenericRecords(ParquetReader reader) throws IOException {
  List<GenericRecord> toReturn = new ArrayList<>();
  Object obj = reader.read();
  while (obj instanceof GenericRecord) {
    toReturn.add((GenericRecord) obj);
    obj = reader.read();
  }
  return toReturn;
}
 
Example 13
Source File: ParquetFileReader.java    From streamx with Apache License 2.0 5 votes vote down vote up
@Override
public Collection<Object> readData(Configuration conf, Path path) throws IOException {
  Collection<Object> result = new ArrayList<>();
  AvroReadSupport<GenericRecord> readSupport = new AvroReadSupport<>();
  ParquetReader.Builder<GenericRecord> builder = ParquetReader.builder(readSupport, path);
  ParquetReader<GenericRecord> parquetReader = builder.withConf(conf).build();
  GenericRecord record;
  while ((record = parquetReader.read()) != null) {
    result.add(record);
  }
  parquetReader.close();
  return result;
}
 
Example 14
Source File: ParquetFileReader.java    From streamx with Apache License 2.0 5 votes vote down vote up
@Override
public Schema getSchema(Configuration conf, Path path) throws IOException {
  AvroReadSupport<GenericRecord> readSupport = new AvroReadSupport<>();
  ParquetReader.Builder<GenericRecord> builder = ParquetReader.builder(readSupport, path);
  ParquetReader<GenericRecord> parquetReader = builder.withConf(conf).build();
  GenericRecord record;
  Schema schema = null;
  while ((record = parquetReader.read()) != null) {
    schema = avroData.toConnectSchema(record.getSchema());
  }
  parquetReader.close();
  return schema;
}
 
Example 15
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
private void validateColumns(String file, int numRecord, TestDocs testDocs) throws IOException {
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), new Path(file)).withConf(conf).build();
  for (int i = 0; i < numRecord; i++) {
    Group group = reader.read();
    assertTrue(group.getLong("DocId", 0) == testDocs.docId[i]);
    assertArrayEquals(group.getBinary("Name", 0).getBytes(), testDocs.name[i].getBytes());
    assertArrayEquals(group.getBinary("Gender", 0).getBytes(), testDocs.gender[i].getBytes());
    Group subGroup = group.getGroup("Links", 0);
    assertArrayEquals(subGroup.getBinary("Backward", 0).getBytes(), testDocs.linkBackward[i].getBytes());
    assertArrayEquals(subGroup.getBinary("Forward", 0).getBytes(), testDocs.linkForward[i].getBytes());
  }
  reader.close();
}
 
Example 16
Source File: PhoneBookWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
public static List<Group> readFile(File f, Filter filter) throws IOException {
  ParquetReader<Group> reader = createReader(new Path(f.getAbsolutePath()), filter);

  Group current;
  List<Group> users = new ArrayList<Group>();

  current = reader.read();
  while (current != null) {
    users.add(current);
    current = reader.read();
  }

  return users;
}
 
Example 17
Source File: PhoneBookWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
public static List<User> readUsers(ParquetReader.Builder<Group> builder) throws IOException {
  ParquetReader<Group> reader = builder.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString()).build();

  List<User> users = new ArrayList<>();
  for (Group group = reader.read(); group != null; group = reader.read()) {
    users.add(userFromGroup(group));
  }
  return users;
}
 
Example 18
Source File: ScroogeBinaryTest.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
@Test
public void testScroogeBinaryEncoding() throws Exception {
  StringAndBinary expected = new StringAndBinary.Immutable("test",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}));

  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  temp.delete();

  Path path = new Path(temp.getPath());

  ParquetWriter<StringAndBinary> writer = new ParquetWriter<StringAndBinary>(
      path, new Configuration(), new ScroogeWriteSupport<StringAndBinary>(StringAndBinary.class));
  writer.write(expected);
  writer.close();

  // read using the parquet-thrift version to isolate the write path
  ParquetReader<org.apache.parquet.thrift.test.binary.StringAndBinary> reader = ThriftParquetReader.<org.apache.parquet.thrift.test.binary.StringAndBinary>
      build(path)
      .withThriftClass(org.apache.parquet.thrift.test.binary.StringAndBinary.class)
      .build();
  org.apache.parquet.thrift.test.binary.StringAndBinary record = reader.read();
  reader.close();

  Assert.assertEquals("String should match after serialization round trip",
      "test", record.s);
  Assert.assertEquals("ByteBuffer should match after serialization round trip",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}), record.b);
}
 
Example 19
Source File: TestConvertAvroToParquet.java    From nifi with Apache License 2.0 4 votes vote down vote up
@Test
public void test_Data() throws Exception {


    FileInputStream fileInputStream = new FileInputStream(tmpAvro);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int readedBytes;
    byte[] buf = new byte[1024];
    while ((readedBytes = fileInputStream.read(buf)) > 0) {
        out.write(buf, 0, readedBytes);
    }
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test.avro");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToParquet.SUCCESS).get(0);

    // Save the flowfile
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream(tmpParquet);
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), new Path(tmpParquet.getAbsolutePath()))
                    .withConf(conf)
                    .build();

    List<Group> parquetRecords = new ArrayList<Group>();

    Group current;
    current = reader.read();
    while (current != null) {
        assertTrue(current instanceof Group);
        parquetRecords.add(current);
        current = reader.read();
    }

    Group firstRecord = parquetRecords.get(0);

    // Primitive
    assertEquals(firstRecord.getInteger("myint", 0), 1);
    assertEquals(firstRecord.getLong("mylong", 0), 2);
    assertEquals(firstRecord.getBoolean("myboolean", 0), true);
    assertEquals(firstRecord.getFloat("myfloat", 0), 3.1, 0.0001);
    assertEquals(firstRecord.getDouble("mydouble", 0), 4.1, 0.001);
    assertEquals(firstRecord.getString("mybytes", 0), "hello");
    assertEquals(firstRecord.getString("mystring", 0), "hello");

    // Nested
    assertEquals(firstRecord.getGroup("mynestedrecord",0).getInteger("mynestedint",0), 1);

    // Array
    assertEquals(firstRecord.getGroup("myarray",0).getGroup("list",0).getInteger("element", 0), 1);
    assertEquals(firstRecord.getGroup("myarray",0).getGroup("list",1).getInteger("element", 0), 2);

    // Map
    assertEquals(firstRecord.getGroup("mymap",0).getGroup("map",0).getInteger("value", 0), 1);
    assertEquals(firstRecord.getGroup("mymap",0).getGroup("map",1).getInteger("value", 0), 2);

    // Fixed
    assertEquals(firstRecord.getString("myfixed",0), "A");

}
 
Example 20
Source File: ProtoParquetWriterWithOffsetTest.java    From garmadon with Apache License 2.0 4 votes vote down vote up
@Test
public void finalFileAndTempFilesMergedWhenFinalSizeIsNotBigEnough() throws IOException {
    localFs.getConf().set("fs.local.block.size", String.valueOf(Long.MAX_VALUE));

    final HdfsOffsetComputer hdfsOffsetComputer = new HdfsOffsetComputer(localFs, rootPath, 2);

    //we already create and populate tmp file as we will mock the underlying writer
    final Path tmpFile = new Path(tmpPath, "tmp_file");
    final Path existingFinalFile = new Path(finalPath, hdfsOffsetComputer.computePath(TODAY, 0L, 1));
    createParquetFile(
        tmpFile,
        DataAccessEventProtos.FsEvent.class,
        () -> DataAccessEventProtos.FsEvent.newBuilder().build(),
        987654321,
        1
    );
    createParquetFile(
        existingFinalFile,
        DataAccessEventProtos.FsEvent.class,
        () -> DataAccessEventProtos.FsEvent.newBuilder().build(),
        123456789,
        2
    );

    final ProtoParquetWriter<Message> writerMock = mock(ProtoParquetWriter.class);
    final BiConsumer<String, String> protoMetadataWriter = mock(BiConsumer.class);

    ProtoParquetWriterWithOffset parquetWriter = new ProtoParquetWriterWithOffset<>(writerMock,
        tmpFile, finalPath, localFs, hdfsOffsetComputer, TODAY, "ignored",
        protoMetadataWriter, 1);
    //simul write action
    parquetWriter.write(999999999L, mock(MessageOrBuilder.class), new TopicPartitionOffset(TOPIC, 1, 0));
    parquetWriter.close();


    Set<LocatedFileStatus> finalFiles = listFiles(localFs, finalPath);
    Set<LocatedFileStatus> tmpFiles = listFiles(localFs, tmpPath);

    assertEquals(1, finalFiles.size());
    assertTrue(containsFile(finalFiles, existingFinalFile));
    assertEquals(0, tmpFiles.size());

    //Check that it can still be read after merge
    ParquetReader<DataAccessEventProtos.FsEvent.Builder> reader = ProtoParquetReader
        .<DataAccessEventProtos.FsEvent.Builder>builder(existingFinalFile).build();
    int count = 0;
    while (reader.read() != null) {
        count++;
    }
    assertEquals(3, count);

    //timestamp should be the one of the latest tmp file merged
    checkFileLatestCommittedTimestamp(existingFinalFile, 999999999);
}