Java Code Examples for org.apache.avro.file.DataFileReader#openReader()
The following examples show how to use org.apache.avro.file.DataFileReader#openReader(). They are drawn from open source projects; the original project, source file, and license are noted above each example.
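Before the project-specific examples, here is a minimal, self-contained sketch of the basic pattern. It is not taken from any of the projects below; the class name OpenReaderSketch and the file name users.avro are placeholders. openReader() reads the schema from the Avro container-file header, so no schema needs to be supplied, and it returns a FileReader that can be iterated and closed.

// Minimal sketch: open a local Avro container file, read every record, close the reader.
// "users.avro" is a placeholder path, not a file from any of the projects below.
import java.io.File;
import java.io.IOException;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.FileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;

public class OpenReaderSketch {
  public static void main(String[] args) throws IOException {
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    // openReader() inspects the file header and picks the matching reader implementation.
    try (FileReader<GenericRecord> fileReader =
             DataFileReader.openReader(new File("users.avro"), datumReader)) {
      while (fileReader.hasNext()) {
        GenericRecord record = fileReader.next();
        System.out.println(record);
      }
    }
  }
}

The same static method also accepts a SeekableInput (for example Hadoop's FsInput or a SeekableByteArrayInput), which is the variant most of the examples below use.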
Example 1
Source File: AvroToOrcRecordConverter.java From datacollector with Apache License 2.0 | 6 votes |
public void convert(SeekableInput avroInputFile, Path orcOutputFile) throws IOException {
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  try (FileReader<GenericRecord> fileReader = DataFileReader.openReader(avroInputFile, reader)) {
    Schema avroSchema = fileReader.getSchema();
    initializeWriter(avroSchema, orcOutputFile);
    while (fileReader.hasNext()) {
      GenericRecord record = fileReader.next();
      addAvroRecord(record);
    }
    closeWriter();
  }
}
Example 2
Source File: AvroInputFormat.java From stratosphere with Apache License 2.0 | 6 votes |
@Override
public void open(FileInputSplit split) throws IOException {
  super.open(split);

  DatumReader<E> datumReader;
  if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
    datumReader = new SpecificDatumReader<E>(avroValueType);
  } else {
    datumReader = new ReflectDatumReader<E>(avroValueType);
  }

  LOG.info("Opening split " + split);

  SeekableInput in = new FSDataInputStreamWrapper(stream, (int) split.getLength());
  dataFileReader = DataFileReader.openReader(in, datumReader);
  dataFileReader.sync(split.getStart());
}
Example 3
Source File: TestMerge.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 5 votes |
private boolean checkAvroFileForLine(FileSystem fs, Path p, List<Integer> record)
    throws IOException {
  SeekableInput in = new FsInput(p, new Configuration());
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
  FileReader<GenericRecord> reader = DataFileReader.openReader(in, datumReader);
  reader.sync(0);
  while (reader.hasNext()) {
    if (valueMatches(reader.next(), record)) {
      return true;
    }
  }
  return false;
}
Example 4
Source File: ClusterHdfsSource.java From datacollector with Apache License 2.0 | 5 votes |
private List<Map.Entry> previewAvroBatch(FileStatus fileStatus, int batchSize) throws IOException {
  int previewCount = previewBuffer.size();
  Path filePath = fileStatus.getPath();
  SeekableInput input = new FsInput(filePath, hadoopConf);
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  List<Map.Entry> batch = new ArrayList<>();
  try (FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader)) {
    int count = 0;
    while (fileReader.hasNext() && batch.size() < batchSize && previewCount < batchSize) {
      GenericRecord datum = fileReader.next();
      ByteArrayOutputStream out = new ByteArrayOutputStream();
      DataFileWriter<GenericRecord> dataFileWriter =
          new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(datum.getSchema()));
      try {
        dataFileWriter.create(datum.getSchema(), out);
        dataFileWriter.append(datum);
      } finally {
        dataFileWriter.close();
        out.close();
      }
      batch.add(new Pair(filePath.toUri().getPath() + "::" + count, out.toByteArray()));
      count++;
      previewCount++;
    }
  }
  return batch;
}
Example 5
Source File: TimelineMetadataUtils.java From hudi with Apache License 2.0 | 5 votes |
public static <T extends SpecificRecordBase> T deserializeAvroMetadata(byte[] bytes, Class<T> clazz)
    throws IOException {
  DatumReader<T> reader = new SpecificDatumReader<>(clazz);
  FileReader<T> fileReader = DataFileReader.openReader(new SeekableByteArrayInput(bytes), reader);
  ValidationUtils.checkArgument(fileReader.hasNext(), "Could not deserialize metadata of type " + clazz);
  return fileReader.next();
}
Example 6
Source File: AvroRecordInputFormat.java From stratosphere with Apache License 2.0 | 5 votes |
@Override
public void open(FileInputSplit split) throws IOException {
  super.open(split);

  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
  SeekableInput in = new FSDataInputStreamWrapper(stream, (int) split.getLength());

  LOG.info("Opening split " + split);

  dataFileReader = DataFileReader.openReader(in, datumReader);
  dataFileReader.sync(split.getStart());
}
Example 7
Source File: AvroFileReader.java From streamx with Apache License 2.0 | 5 votes |
@Override
public Schema getSchema(Configuration conf, Path path) throws IOException {
  SeekableInput input = new FsInput(path, conf);
  DatumReader<Object> reader = new GenericDatumReader<>();
  FileReader<Object> fileReader = DataFileReader.openReader(input, reader);
  org.apache.avro.Schema schema = fileReader.getSchema();
  fileReader.close();
  return avroData.toConnectSchema(schema);
}
Example 8
Source File: ServerSinkSourceConfigurationTest.java From divolte-collector with Apache License 2.0 | 5 votes |
private static Stream<GenericRecord> listRecords(final Path avroFile) {
  final GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  logger.debug("Reading records from new Avro file: {}", avroFile);
  try (final FileReader<GenericRecord> fileReader =
           DataFileReader.openReader(avroFile.toFile(), datumReader)) {
    final ImmutableList<GenericRecord> records = ImmutableList.copyOf(fileReader.iterator());
    logger.info("Read {} record(s) from new Avro file: {}", records.size(), avroFile);
    return records.stream();
  } catch (final IOException e) {
    throw new UncheckedIOException("Error reading records from file: " + avroFile, e);
  }
}
Example 9
Source File: AvroRecordReader.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 5 votes |
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration conf = context.getConfiguration();
  SeekableInput in = new FsInput(split.getPath(), conf);
  DatumReader<T> datumReader = new GenericDatumReader<T>();
  this.reader = DataFileReader.openReader(in, datumReader);
  reader.sync(split.getStart()); // sync to start
  this.start = reader.tell();
  this.end = split.getStart() + split.getLength();
}
Example 10
Source File: AvroUtil.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 5 votes |
/**
 * Get the schema of AVRO files stored in a directory.
 */
public static Schema getAvroSchema(Path path, Configuration conf) throws IOException {
  FileSystem fs = path.getFileSystem(conf);
  Path fileToTest;
  if (fs.isDirectory(path)) {
    FileStatus[] fileStatuses = fs.listStatus(path, new PathFilter() {
      @Override
      public boolean accept(Path p) {
        String name = p.getName();
        return !name.startsWith("_") && !name.startsWith(".");
      }
    });
    if (fileStatuses.length == 0) {
      return null;
    }
    fileToTest = fileStatuses[0].getPath();
  } else {
    fileToTest = path;
  }

  SeekableInput input = new FsInput(fileToTest, conf);
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);

  Schema result = fileReader.getSchema();
  fileReader.close();
  return result;
}
Example 11
Source File: AvroArrayReader.java From spork with Apache License 2.0 | 5 votes |
@Override
public void initialize(final InputSplit isplit, final TaskAttemptContext tc)
    throws IOException, InterruptedException {
  FileSplit fsplit = (FileSplit) isplit;
  start = fsplit.getStart();
  end = fsplit.getStart() + fsplit.getLength();
  DatumReader<GenericData.Array<Object>> datumReader =
      new GenericDatumReader<GenericData.Array<Object>>(schema);
  reader = DataFileReader.openReader(
      new FsInput(fsplit.getPath(), tc.getConfiguration()), datumReader);
  reader.sync(start);
}
Example 12
Source File: AvroIterable.java From iceberg with Apache License 2.0 | 5 votes |
private DataFileReader<D> newFileReader() {
  try {
    return (DataFileReader<D>) DataFileReader.openReader(
        AvroIO.stream(file.newStream(), file.getLength()), reader);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to open file: %s", file);
  }
}
Example 13
Source File: AvroToJsonConverter.java From celos with Apache License 2.0 | 5 votes |
@Override
public FixFile convert(TestRun testRun, FixFile ff) throws IOException {
  byte[] bytes = IOUtils.toByteArray(ff.getContent());
  if (bytes.length == 0) {
    return ff;
  }
  ByteArrayOutputStream os = new ByteArrayOutputStream();
  GenericDatumReader<Object> reader = new GenericDatumReader<>();
  FileReader<Object> fileReader = DataFileReader.openReader(new SeekableByteArrayInput(bytes), reader);
  try {
    Schema schema = fileReader.getSchema();
    DatumWriter<Object> writer = new GenericDatumWriter<>(schema);
    JsonEncoder encoder = EncoderFactory.get().jsonEncoder(schema, os);
    for (Object datum : fileReader) {
      writer.write(datum, encoder);
    }
    encoder.flush();
  } finally {
    fileReader.close();
  }
  return new FixFile(new ByteArrayInputStream(os.toByteArray()));
}
Example 14
Source File: AvroRecordInputFormatTest.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/**
 * This test validates proper serialization with specific (generated POJO) types.
 */
@Test
public void testDeserializeToSpecificType() throws IOException {
  DatumReader<User> datumReader = new SpecificDatumReader<>(userSchema);

  try (FileReader<User> dataFileReader = DataFileReader.openReader(testFile, datumReader)) {
    User rec = dataFileReader.next();

    // check if record has been read correctly
    assertNotNull(rec);
    assertEquals("name not equal", TEST_NAME, rec.get("name").toString());
    assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), rec.get("type_enum").toString());

    // now serialize it with our framework:
    ExecutionConfig ec = new ExecutionConfig();
    TypeInformation<User> te = TypeExtractor.createTypeInfo(User.class);

    assertEquals(AvroTypeInfo.class, te.getClass());
    TypeSerializer<User> tser = te.createSerializer(ec);

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try (DataOutputViewStreamWrapper outView = new DataOutputViewStreamWrapper(out)) {
      tser.serialize(rec, outView);
    }

    User newRec;
    try (DataInputViewStreamWrapper inView = new DataInputViewStreamWrapper(
        new ByteArrayInputStream(out.toByteArray()))) {
      newRec = tser.deserialize(inView);
    }

    // check if it is still the same
    assertNotNull(newRec);
    assertEquals("name not equal", TEST_NAME, newRec.getName().toString());
    assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), newRec.getTypeEnum().toString());
  }
}
Example 15
Source File: AvroAsTextInputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
AvroAsTextRecordReader(JobConf job, FileSplit split) throws IOException {
  this(DataFileReader.openReader(
      new FsInput(split.getPath(), job), new GenericDatumReader<T>()), split);
}
Example 16
Source File: RowLevelQualityCheckerTest.java From incubator-gobblin with Apache License 2.0 | 4 votes |
private FileReader<GenericRecord> openFile(State state) throws Exception {
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  FileReader<GenericRecord> fileReader =
      DataFileReader.openReader(new File(TestConstants.TEST_FILE_NAME), reader);
  return fileReader;
}
Example 17
Source File: AvroRecordInputFormatTest.java From flink with Apache License 2.0 | 4 votes |
/**
 * Test if the Flink serialization is able to properly process GenericData.Record types.
 * Usually users of Avro generate classes (POJOs) from Avro schemas.
 * However, if generated classes are not available, one can also use GenericData.Record.
 * It is an untyped key-value record which is using a schema to validate the correctness of the data.
 *
 * <p>It is not recommended to use GenericData.Record with Flink. Use generated POJOs instead.
 */
@Test
public void testDeserializeToGenericType() throws IOException {
  DatumReader<GenericData.Record> datumReader = new GenericDatumReader<>(userSchema);

  try (FileReader<GenericData.Record> dataFileReader = DataFileReader.openReader(testFile, datumReader)) {
    // initialize Record by reading it from disk (that's easier than creating it by hand)
    GenericData.Record rec = new GenericData.Record(userSchema);
    dataFileReader.next(rec);

    // check if record has been read correctly
    assertNotNull(rec);
    assertEquals("name not equal", TEST_NAME, rec.get("name").toString());
    assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), rec.get("type_enum").toString());
    assertEquals(null, rec.get("type_long_test")); // it is null for the first record.

    // now serialize it with our framework:
    TypeInformation<GenericData.Record> te = TypeExtractor.createTypeInfo(GenericData.Record.class);

    ExecutionConfig ec = new ExecutionConfig();
    assertEquals(GenericTypeInfo.class, te.getClass());

    Serializers.recursivelyRegisterType(te.getTypeClass(), ec, new HashSet<>());

    TypeSerializer<GenericData.Record> tser = te.createSerializer(ec);
    assertEquals(1, ec.getDefaultKryoSerializerClasses().size());
    assertTrue(
        ec.getDefaultKryoSerializerClasses().containsKey(Schema.class)
            && ec.getDefaultKryoSerializerClasses().get(Schema.class)
                .equals(AvroKryoSerializerUtils.AvroSchemaSerializer.class));

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try (DataOutputViewStreamWrapper outView = new DataOutputViewStreamWrapper(out)) {
      tser.serialize(rec, outView);
    }

    GenericData.Record newRec;
    try (DataInputViewStreamWrapper inView = new DataInputViewStreamWrapper(
        new ByteArrayInputStream(out.toByteArray()))) {
      newRec = tser.deserialize(inView);
    }

    // check if it is still the same
    assertNotNull(newRec);
    assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), newRec.get("type_enum").toString());
    assertEquals("name not equal", TEST_NAME, newRec.get("name").toString());
    assertEquals(null, newRec.get("type_long_test"));
  }
}
Example 18
Source File: AvroConversionBaseMapper.java From datacollector with Apache License 2.0 | 4 votes |
@Override
protected void map(String input, String output, Context context) throws IOException, InterruptedException {
  FileSystem fs = FileSystem.get(context.getConfiguration());
  Configuration conf = context.getConfiguration();

  LOG.info("Converting input file: {}", input);
  LOG.info("Output directory: {}", output);
  Path inputPath = new Path(input);
  Path outputDir = new Path(output);
  fs.mkdirs(outputDir);

  Path tempFile = new Path(outputDir, getTempFilePrefix() + inputPath.getName());
  if (fs.exists(tempFile)) {
    if (conf.getBoolean(AvroConversionCommonConstants.OVERWRITE_TMP_FILE, false)) {
      fs.delete(tempFile, true);
    } else {
      throw new IOException("Temporary file " + tempFile + " already exists.");
    }
  }
  LOG.info("Using temp file: {}", tempFile);

  // Output file is the same as the input except that any .avro extension is dropped
  // and .parquet or .orc is appended
  String outputFileName = inputPath.getName().replaceAll("\\.avro$", "") + getOutputFileSuffix();
  Path finalFile = new Path(outputDir, outputFileName);
  LOG.info("Final path will be: {}", finalFile);

  // Avro reader
  SeekableInput seekableInput = new FsInput(inputPath, conf);
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  FileReader<GenericRecord> fileReader = DataFileReader.openReader(seekableInput, reader);
  Schema avroSchema = fileReader.getSchema();

  initializeWriter(tempFile, avroSchema, conf, context);

  LOG.info("Started reading input file");
  long recordCount = 0;
  try {
    while (fileReader.hasNext()) {
      GenericRecord record = fileReader.next();
      handleAvroRecord(record);

      context.getCounter(Counters.PROCESSED_RECORDS).increment(1);
      recordCount++;
    }
  } catch (Exception e) {
    // Various random stuff can happen while converting, so we wrap the underlying exception with more details
    String message = String.format(
        "Exception at offset %d (record %d): %s",
        fileReader.tell(), recordCount, e.toString()
    );
    throw new IOException(message, e);
  }
  LOG.info("Done reading input file");
  closeWriter();

  LOG.info("Moving temporary file {} to final destination {}", tempFile, finalFile);
  fs.rename(tempFile, finalFile);

  if (!context.getConfiguration().getBoolean(AvroConversionCommonConstants.KEEP_INPUT_FILE, false)) {
    LOG.info("Removing input file {}", inputPath);
    fs.delete(inputPath, true);
  }

  LOG.info("Done converting input file into output directory {}", output);
}
Example 19
Source File: AvroRecordInputFormatTest.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
/**
 * Test if the Flink serialization is able to properly process GenericData.Record types.
 * Usually users of Avro generate classes (POJOs) from Avro schemas.
 * However, if generated classes are not available, one can also use GenericData.Record.
 * It is an untyped key-value record which is using a schema to validate the correctness of the data.
 *
 * <p>It is not recommended to use GenericData.Record with Flink. Use generated POJOs instead.
 */
@Test
public void testDeserializeToGenericType() throws IOException {
  DatumReader<GenericData.Record> datumReader = new GenericDatumReader<>(userSchema);

  try (FileReader<GenericData.Record> dataFileReader = DataFileReader.openReader(testFile, datumReader)) {
    // initialize Record by reading it from disk (that's easier than creating it by hand)
    GenericData.Record rec = new GenericData.Record(userSchema);
    dataFileReader.next(rec);

    // check if record has been read correctly
    assertNotNull(rec);
    assertEquals("name not equal", TEST_NAME, rec.get("name").toString());
    assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), rec.get("type_enum").toString());
    assertEquals(null, rec.get("type_long_test")); // it is null for the first record.

    // now serialize it with our framework:
    TypeInformation<GenericData.Record> te = TypeExtractor.createTypeInfo(GenericData.Record.class);

    ExecutionConfig ec = new ExecutionConfig();
    assertEquals(GenericTypeInfo.class, te.getClass());

    Serializers.recursivelyRegisterType(te.getTypeClass(), ec, new HashSet<>());

    TypeSerializer<GenericData.Record> tser = te.createSerializer(ec);
    assertEquals(1, ec.getDefaultKryoSerializerClasses().size());
    assertTrue(
        ec.getDefaultKryoSerializerClasses().containsKey(Schema.class)
            && ec.getDefaultKryoSerializerClasses().get(Schema.class)
                .equals(AvroKryoSerializerUtils.AvroSchemaSerializer.class));

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try (DataOutputViewStreamWrapper outView = new DataOutputViewStreamWrapper(out)) {
      tser.serialize(rec, outView);
    }

    GenericData.Record newRec;
    try (DataInputViewStreamWrapper inView = new DataInputViewStreamWrapper(
        new ByteArrayInputStream(out.toByteArray()))) {
      newRec = tser.deserialize(inView);
    }

    // check if it is still the same
    assertNotNull(newRec);
    assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), newRec.get("type_enum").toString());
    assertEquals("name not equal", TEST_NAME, newRec.get("name").toString());
    assertEquals(null, newRec.get("type_long_test"));
  }
}