Java Code Examples for org.apache.parquet.hadoop.ParquetReader#read()
The following examples show how to use org.apache.parquet.hadoop.ParquetReader#read().
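All of the examples below follow the same basic pattern: build a ParquetReader, call read() in a loop until it returns null, then close the reader. As a minimal, self-contained sketch of that pattern (the file path "data.parquet" is a hypothetical placeholder):

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ParquetReadLoop {
  public static void main(String[] args) throws Exception {
    ParquetReader<Group> reader = ParquetReader
        .builder(new GroupReadSupport(), new Path("data.parquet")) // hypothetical input file
        .build();
    try {
      Group record;
      // read() returns the next record, or null once the file is exhausted
      while ((record = reader.read()) != null) {
        System.out.println(record);
      }
    } finally {
      reader.close(); // release the underlying file handles
    }
  }
}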
Example 1
Source File: TestThriftToParquetFileWriter.java From parquet-mr with Apache License 2.0
@Test
public void testWriteFileMapOfList() throws IOException, InterruptedException, TException {
  Map<String, List<String>> map = new HashMap<String, List<String>>();
  map.put("key", Arrays.asList("val1", "val2"));
  final TestListInMap mapList = new TestListInMap("maplist", map);
  final Path fileToCreate = createFile(mapList);
  ParquetReader<Group> reader = createRecordReader(fileToCreate);
  Group g = null;
  while ((g = reader.read()) != null) {
    assertEquals("key", g.getGroup("names", 0).getGroup("map", 0)
        .getBinary("key", 0).toStringUsingUTF8());
    assertEquals(map.get("key").size(), g.getGroup("names", 0).getGroup("map", 0)
        .getGroup("value", 0).getFieldRepetitionCount(0));
  }
}
Example 2
Source File: TestFiltersWithMissingColumns.java From parquet-mr with Apache License 2.0
public static long countFilteredRecords(Path path, FilterPredicate pred) throws IOException {
  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), path)
      .withFilter(FilterCompat.get(pred))
      .build();
  long count = 0;
  try {
    while (reader.read() != null) {
      count += 1;
    }
  } finally {
    reader.close();
  }
  return count;
}
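As a hedged usage sketch for the helper above (the long column "id" and the value 7 are hypothetical), a predicate can be built with parquet-mr's FilterApi:

import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;

// Count the records whose hypothetical "id" column equals 7.
long matches = countFilteredRecords(path, eq(longColumn("id"), 7L));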
Example 3
Source File: ReadBenchmarks.java From parquet-mr with Apache License 2.0
private void read(Path parquetFile, int nRows, Blackhole blackhole) throws IOException {
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), parquetFile)
      .withConf(configuration)
      .build();
  for (int i = 0; i < nRows; i++) {
    Group group = reader.read();
    blackhole.consume(group.getBinary("binary_field", 0));
    blackhole.consume(group.getInteger("int32_field", 0));
    blackhole.consume(group.getLong("int64_field", 0));
    blackhole.consume(group.getBoolean("boolean_field", 0));
    blackhole.consume(group.getFloat("float_field", 0));
    blackhole.consume(group.getDouble("double_field", 0));
    blackhole.consume(group.getBinary("flba_field", 0));
    blackhole.consume(group.getInt96("int96_field", 0));
  }
  reader.close();
}
Example 4
Source File: TestThriftToParquetFileWriter.java From parquet-mr with Apache License 2.0
@Test
public void testWriteFileListOfMap() throws IOException, InterruptedException, TException {
  Map<String, String> map1 = new HashMap<String, String>();
  map1.put("key11", "value11");
  map1.put("key12", "value12");
  Map<String, String> map2 = new HashMap<String, String>();
  map2.put("key21", "value21");
  final TestMapInList listMap = new TestMapInList("listmap", Arrays.asList(map1, map2));
  final Path fileToCreate = createFile(listMap);
  ParquetReader<Group> reader = createRecordReader(fileToCreate);
  Group g = null;
  while ((g = reader.read()) != null) {
    assertEquals(listMap.names.size(),
        g.getGroup("names", 0).getFieldRepetitionCount("names_tuple"));
    assertEquals(listMap.names.get(0).size(),
        g.getGroup("names", 0).getGroup("names_tuple", 0).getFieldRepetitionCount("map"));
    assertEquals(listMap.names.get(1).size(),
        g.getGroup("names", 0).getGroup("names_tuple", 1).getFieldRepetitionCount("map"));
  }
}
Example 5
Source File: TestThriftToParquetFileWriter.java From parquet-mr with Apache License 2.0
@Test
public void testWriteFileMapOfLists() throws IOException, InterruptedException, TException {
  Map<List<String>, List<String>> map = new HashMap<List<String>, List<String>>();
  map.put(Arrays.asList("key1", "key2"), Arrays.asList("val1", "val2"));
  final TestListsInMap mapList = new TestListsInMap("maplists", map);
  final Path fileToCreate = createFile(mapList);
  ParquetReader<Group> reader = createRecordReader(fileToCreate);
  Group g = null;
  while ((g = reader.read()) != null) {
    assertEquals("key1", g.getGroup("names", 0).getGroup("map", 0)
        .getGroup("key", 0).getBinary("key_tuple", 0).toStringUsingUTF8());
    assertEquals("key2", g.getGroup("names", 0).getGroup("map", 0)
        .getGroup("key", 0).getBinary("key_tuple", 1).toStringUsingUTF8());
    assertEquals("val1", g.getGroup("names", 0).getGroup("map", 0)
        .getGroup("value", 0).getBinary("value_tuple", 0).toStringUsingUTF8());
    assertEquals("val2", g.getGroup("names", 0).getGroup("map", 0)
        .getGroup("value", 0).getBinary("value_tuple", 1).toStringUsingUTF8());
  }
}
Example 6
Source File: ParquetHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0
protected List<TestRecord> readParquetFilesGroup(File outputFile) throws IOException {
  ParquetReader<Group> reader = null;
  List<Group> records = new ArrayList<>();
  try {
    reader = new ParquetReader<>(new Path(outputFile.toString()), new SimpleReadSupport());
    for (Group value = reader.read(); value != null; value = reader.read()) {
      records.add(value);
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records.stream().map(value -> new TestRecord(
      value.getInteger(TestConstants.PARTITION_FIELD_NAME, 0),
      value.getLong(TestConstants.SEQUENCE_FIELD_NAME, 0),
      value.getString(TestConstants.PAYLOAD_FIELD_NAME, 0)
  )).collect(Collectors.toList());
}
Example 7
Source File: TestBackwardCompatibility.java From parquet-mr with Apache License 2.0
@Test
public void testCompatStringCompatibility() throws IOException {
  // some older versions of Parquet used avro.schema instead of
  // parquet.avro.schema and didn't annotate binary with UTF8 when the type
  // was converted from an Avro string. this validates that the old read
  // schema is recognized and used to read the file as expected.
  Path testFile = new Path(Resources.getResource("strings-2.parquet").getFile());
  Configuration conf = new Configuration();
  ParquetReader<GenericRecord> reader = AvroParquetReader
      .builder(new AvroReadSupport<GenericRecord>(), testFile)
      .withConf(conf)
      .build();
  GenericRecord r;
  while ((r = reader.read()) != null) {
    Assert.assertTrue("Should read value into a String",
        r.get("text") instanceof String);
  }
}
Example 8
Source File: ParquetHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0
private List<TestRecord> readParquetFilesAvro(File outputFile) throws IOException {
  ParquetReader<org.apache.gobblin.test.avro.TestRecord> reader = null;
  List<TestRecord> records = new ArrayList<>();
  try {
    reader = new AvroParquetReader<>(new Path(outputFile.toString()));
    for (org.apache.gobblin.test.avro.TestRecord value = reader.read();
        value != null; value = reader.read()) {
      records.add(new TestRecord(value.getPartition(), value.getSequence(), value.getPayload()));
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records;
}
Example 9
Source File: BaseAvroParquetConvertIT.java From datacollector with Apache License 2.0
public void validateParquetFile(Path parquetFile, List<Map<String, Object>> data) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
      .build();
  int position = 0;
  for (Map<String, Object> expectedRow : data) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + position, actualRow);
    for (Map.Entry<String, Object> entry : expectedRow.entrySet()) {
      Object value = actualRow.get(entry.getKey());
      Assert.assertEquals("Different value on row " + position + " for key " + entry.getKey(),
          entry.getValue(), value);
    }
    position++; // advance the counter so assertion messages point at the right row
  }
  Assert.assertNull("Parquet file contains more rows than expected", reader.read());
}
Example 10
Source File: LargeInputFileIT.java From datacollector with Apache License 2.0
public void validateParquetFile(Path parquetFile, long recordCount) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
      .build();
  for (long i = 0; i < recordCount; i++) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + i, actualRow);
    Assert.assertEquals("Value different in row " + i + " for key b", actualRow.get("b"), i % 2 == 0);
    Assert.assertEquals("Value different in row " + i + " for key s", actualRow.get("s"), new Utf8(String.valueOf(i)));
    Assert.assertEquals("Value different in row " + i + " for key l", actualRow.get("l"), i);
    Assert.assertEquals("Value different in row " + i + " for key l100", actualRow.get("l100"), i % 100);
    Assert.assertEquals("Value different in row " + i + " for key s100", actualRow.get("s100"), new Utf8(String.valueOf(i % 100)));
  }
  Assert.assertNull("Parquet file contains more rows than expected", reader.read());
}
Example 11
Source File: TestThriftToParquetFileWriter.java From parquet-mr with Apache License 2.0
@Test
public void testWriteFile() throws IOException, InterruptedException, TException {
  final AddressBook a = new AddressBook(
      Arrays.asList(
          new Person(
              new Name("Bob", "Roberts"),
              0,
              "bob@example.com", // placeholder; the address was redacted in the source listing
              Arrays.asList(new PhoneNumber("1234567890")))));
  final Path fileToCreate = createFile(a);
  ParquetReader<Group> reader = createRecordReader(fileToCreate);
  Group g = null;
  int i = 0;
  while ((g = reader.read()) != null) {
    assertEquals(a.persons.size(), g.getFieldRepetitionCount("persons"));
    assertEquals(a.persons.get(0).email,
        g.getGroup("persons", 0).getGroup(0, 0).getString("email", 0));
    // just some sanity check, we're testing the various layers somewhere else
    ++i;
  }
  assertEquals("read 1 record", 1, i);
}
Example 12
Source File: TestParquetInLining.java From hudi with Apache License 2.0
static List<GenericRecord> readParquetGenericRecords(ParquetReader reader) throws IOException {
  List<GenericRecord> toReturn = new ArrayList<>();
  Object obj = reader.read();
  while (obj instanceof GenericRecord) {
    toReturn.add((GenericRecord) obj);
    obj = reader.read();
  }
  return toReturn;
}
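A possible call site for this helper, sketched under the assumption that the file was written with Avro and lives at a hypothetical path:

ParquetReader<GenericRecord> reader = AvroParquetReader
    .<GenericRecord>builder(new Path("/tmp/records.parquet")) // hypothetical path
    .build();
try {
  List<GenericRecord> records = readParquetGenericRecords(reader);
  // use records ...
} finally {
  reader.close();
}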
Example 13
Source File: ParquetFileReader.java From streamx with Apache License 2.0
@Override
public Collection<Object> readData(Configuration conf, Path path) throws IOException {
  Collection<Object> result = new ArrayList<>();
  AvroReadSupport<GenericRecord> readSupport = new AvroReadSupport<>();
  ParquetReader.Builder<GenericRecord> builder = ParquetReader.builder(readSupport, path);
  ParquetReader<GenericRecord> parquetReader = builder.withConf(conf).build();
  GenericRecord record;
  while ((record = parquetReader.read()) != null) {
    result.add(record);
  }
  parquetReader.close();
  return result;
}
Example 14
Source File: ParquetFileReader.java From streamx with Apache License 2.0
@Override
public Schema getSchema(Configuration conf, Path path) throws IOException {
  AvroReadSupport<GenericRecord> readSupport = new AvroReadSupport<>();
  ParquetReader.Builder<GenericRecord> builder = ParquetReader.builder(readSupport, path);
  ParquetReader<GenericRecord> parquetReader = builder.withConf(conf).build();
  GenericRecord record;
  Schema schema = null;
  while ((record = parquetReader.read()) != null) {
    schema = avroData.toConnectSchema(record.getSchema());
  }
  parquetReader.close();
  return schema;
}
Example 15
Source File: CompressionConveterTest.java From parquet-mr with Apache License 2.0
private void validateColumns(String file, int numRecord, TestDocs testDocs) throws IOException {
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), new Path(file))
      .withConf(conf)
      .build();
  for (int i = 0; i < numRecord; i++) {
    Group group = reader.read();
    assertTrue(group.getLong("DocId", 0) == testDocs.docId[i]);
    assertArrayEquals(group.getBinary("Name", 0).getBytes(), testDocs.name[i].getBytes());
    assertArrayEquals(group.getBinary("Gender", 0).getBytes(), testDocs.gender[i].getBytes());
    Group subGroup = group.getGroup("Links", 0);
    assertArrayEquals(subGroup.getBinary("Backward", 0).getBytes(), testDocs.linkBackward[i].getBytes());
    assertArrayEquals(subGroup.getBinary("Forward", 0).getBytes(), testDocs.linkForward[i].getBytes());
  }
  reader.close();
}
Example 16
Source File: PhoneBookWriter.java From parquet-mr with Apache License 2.0
public static List<Group> readFile(File f, Filter filter) throws IOException {
  ParquetReader<Group> reader = createReader(new Path(f.getAbsolutePath()), filter);
  Group current;
  List<Group> users = new ArrayList<Group>();
  current = reader.read();
  while (current != null) {
    users.add(current);
    current = reader.read();
  }
  return users;
}
Example 17
Source File: PhoneBookWriter.java From parquet-mr with Apache License 2.0
public static List<User> readUsers(ParquetReader.Builder<Group> builder) throws IOException {
  ParquetReader<Group> reader = builder
      .set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString())
      .build();
  List<User> users = new ArrayList<>();
  for (Group group = reader.read(); group != null; group = reader.read()) {
    users.add(userFromGroup(group));
  }
  return users;
}
Example 18
Source File: ScroogeBinaryTest.java From parquet-mr with Apache License 2.0
@Test
public void testScroogeBinaryEncoding() throws Exception {
  StringAndBinary expected = new StringAndBinary.Immutable("test",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}));
  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  temp.delete();
  Path path = new Path(temp.getPath());
  ParquetWriter<StringAndBinary> writer = new ParquetWriter<StringAndBinary>(
      path, new Configuration(), new ScroogeWriteSupport<StringAndBinary>(StringAndBinary.class));
  writer.write(expected);
  writer.close();
  // read using the parquet-thrift version to isolate the write path
  ParquetReader<org.apache.parquet.thrift.test.binary.StringAndBinary> reader =
      ThriftParquetReader.<org.apache.parquet.thrift.test.binary.StringAndBinary>build(path)
          .withThriftClass(org.apache.parquet.thrift.test.binary.StringAndBinary.class)
          .build();
  org.apache.parquet.thrift.test.binary.StringAndBinary record = reader.read();
  reader.close();
  Assert.assertEquals("String should match after serialization round trip", "test", record.s);
  Assert.assertEquals("ByteBuffer should match after serialization round trip",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}), record.b);
}
Example 19
Source File: TestConvertAvroToParquet.java From nifi with Apache License 2.0
@Test
public void test_Data() throws Exception {
  FileInputStream fileInputStream = new FileInputStream(tmpAvro);
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  int bytesRead;
  byte[] buf = new byte[1024];
  while ((bytesRead = fileInputStream.read(buf)) > 0) {
    out.write(buf, 0, bytesRead);
  }
  out.close();

  Map<String, String> attributes = new HashMap<String, String>() {{
    put(CoreAttributes.FILENAME.key(), "test.avro");
  }};
  runner.enqueue(out.toByteArray(), attributes);
  runner.run();

  MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToParquet.SUCCESS).get(0);

  // Save the flowfile
  byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
  FileOutputStream fos = new FileOutputStream(tmpParquet);
  fos.write(resultContents);
  fos.flush();
  fos.close();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);
  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), new Path(tmpParquet.getAbsolutePath()))
      .withConf(conf)
      .build();

  List<Group> parquetRecords = new ArrayList<Group>();
  Group current;
  current = reader.read();
  while (current != null) {
    assertTrue(current instanceof Group);
    parquetRecords.add(current);
    current = reader.read();
  }

  Group firstRecord = parquetRecords.get(0);

  // Primitive
  assertEquals(firstRecord.getInteger("myint", 0), 1);
  assertEquals(firstRecord.getLong("mylong", 0), 2);
  assertEquals(firstRecord.getBoolean("myboolean", 0), true);
  assertEquals(firstRecord.getFloat("myfloat", 0), 3.1, 0.0001);
  assertEquals(firstRecord.getDouble("mydouble", 0), 4.1, 0.001);
  assertEquals(firstRecord.getString("mybytes", 0), "hello");
  assertEquals(firstRecord.getString("mystring", 0), "hello");

  // Nested
  assertEquals(firstRecord.getGroup("mynestedrecord", 0).getInteger("mynestedint", 0), 1);

  // Array
  assertEquals(firstRecord.getGroup("myarray", 0).getGroup("list", 0).getInteger("element", 0), 1);
  assertEquals(firstRecord.getGroup("myarray", 0).getGroup("list", 1).getInteger("element", 0), 2);

  // Map
  assertEquals(firstRecord.getGroup("mymap", 0).getGroup("map", 0).getInteger("value", 0), 1);
  assertEquals(firstRecord.getGroup("mymap", 0).getGroup("map", 1).getInteger("value", 0), 2);

  // Fixed
  assertEquals(firstRecord.getString("myfixed", 0), "A");
}
Example 20
Source File: ProtoParquetWriterWithOffsetTest.java From garmadon with Apache License 2.0
@Test
public void finalFileAndTempFilesMergedWhenFinalSizeIsNotBigEnough() throws IOException {
  localFs.getConf().set("fs.local.block.size", String.valueOf(Long.MAX_VALUE));
  final HdfsOffsetComputer hdfsOffsetComputer = new HdfsOffsetComputer(localFs, rootPath, 2);

  // We create and populate the tmp file up front because we mock the underlying writer.
  final Path tmpFile = new Path(tmpPath, "tmp_file");
  final Path existingFinalFile = new Path(finalPath, hdfsOffsetComputer.computePath(TODAY, 0L, 1));
  createParquetFile(tmpFile, DataAccessEventProtos.FsEvent.class,
      () -> DataAccessEventProtos.FsEvent.newBuilder().build(), 987654321, 1);
  createParquetFile(existingFinalFile, DataAccessEventProtos.FsEvent.class,
      () -> DataAccessEventProtos.FsEvent.newBuilder().build(), 123456789, 2);

  final ProtoParquetWriter<Message> writerMock = mock(ProtoParquetWriter.class);
  final BiConsumer<String, String> protoMetadataWriter = mock(BiConsumer.class);

  ProtoParquetWriterWithOffset parquetWriter = new ProtoParquetWriterWithOffset<>(writerMock,
      tmpFile, finalPath, localFs, hdfsOffsetComputer, TODAY, "ignored", protoMetadataWriter, 1);

  // Simulate a write action.
  parquetWriter.write(999999999L, mock(MessageOrBuilder.class), new TopicPartitionOffset(TOPIC, 1, 0));
  parquetWriter.close();

  Set<LocatedFileStatus> finalFiles = listFiles(localFs, finalPath);
  Set<LocatedFileStatus> tmpFiles = listFiles(localFs, tmpPath);
  assertEquals(1, finalFiles.size());
  assertTrue(containsFile(finalFiles, existingFinalFile));
  assertEquals(0, tmpFiles.size());

  // Check that the merged file can still be read.
  ParquetReader<DataAccessEventProtos.FsEvent.Builder> reader = ProtoParquetReader
      .<DataAccessEventProtos.FsEvent.Builder>builder(existingFinalFile).build();
  int count = 0;
  while (reader.read() != null) {
    count++;
  }
  assertEquals(3, count);

  // The timestamp should be the one of the latest tmp file merged.
  checkFileLatestCommittedTimestamp(existingFinalFile, 999999999);
}