Java Code Examples for org.apache.avro.file.DataFileReader#getSchema()
The following examples show how to use org.apache.avro.file.DataFileReader#getSchema().
The examples are taken from open source projects; the project and source file for each one are noted above it.
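All of the examples below share one core pattern, so here is a minimal, self-contained sketch of it first. This is an illustration, not code from any of the projects below; the file name records.avro and the class name SchemaPeek are placeholders. It opens an Avro container file with a GenericDatumReader and calls getSchema() to recover the writer schema embedded in the file header.

import java.io.File;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;

public class SchemaPeek {
  public static void main(String[] args) throws IOException {
    // Placeholder path; any Avro object container file works here.
    File avroFile = new File("records.avro");
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    // DataFileReader is Closeable, so try-with-resources closes it for us.
    try (DataFileReader<GenericRecord> fileReader =
             new DataFileReader<>(avroFile, datumReader)) {
      // The schema comes from the file header; the caller supplies none.
      Schema schema = fileReader.getSchema();
      System.out.println(schema.toString(true)); // pretty-printed JSON
    }
  }
}

Note that getSchema() returns the schema the file was written with; to decode records into a different but compatible reader schema, construct the GenericDatumReader with that schema instead.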
Example 1
Source File: AvroToDdlTool.java From DataflowTemplates with Apache License 2.0
public static void main(String[] args) throws IOException {
  if (args.length == 0) {
    System.out.println("Please specify the avro files");
    System.exit(1);
  }
  List<Schema> schemaList = new ArrayList<>();
  for (String filePath : args) {
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    DataFileReader<GenericRecord> dataFileReader =
        new DataFileReader<>(new File(filePath), datumReader);
    Schema schema = dataFileReader.getSchema();
    System.out.println(schema.toString(true));
    schemaList.add(schema);
  }
  Ddl ddl = new AvroSchemaToDdlConverter().toDdl(schemaList);
  ddl.prettyPrint(System.out);
}
Example 2
Source File: TestAvroEventSerializer.java From mt-flume with Apache License 2.0
public void validateAvroFile(File file) throws IOException {
  // read the events back using GenericRecord
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileReader<GenericRecord> fileReader =
      new DataFileReader<GenericRecord>(file, reader);
  GenericRecord record = new GenericData.Record(fileReader.getSchema());
  int numEvents = 0;
  while (fileReader.hasNext()) {
    fileReader.next(record);
    String bodyStr = record.get("message").toString();
    System.out.println(bodyStr);
    numEvents++;
  }
  fileReader.close();
  Assert.assertEquals("Should have found a total of 3 events", 3, numEvents);
}
Example 3
Source File: TestAvroImport.java From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testOverrideTypeMapping() throws IOException {
  String[] types = { "INT" };
  String[] vals = { "10" };
  createTableWithColTypes(types, vals);

  String[] extraArgs = { "--map-column-java", "DATA_COL0=String" };

  runImport(getOutputArgv(true, extraArgs));

  Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
  DataFileReader<GenericRecord> reader = read(outputFile);
  Schema schema = reader.getSchema();
  assertEquals(Schema.Type.RECORD, schema.getType());
  List<Field> fields = schema.getFields();
  assertEquals(types.length, fields.size());

  checkField(fields.get(0), "DATA_COL0", Schema.Type.STRING);

  GenericRecord record1 = reader.next();
  assertEquals("DATA_COL0", new Utf8("10"), record1.get("DATA_COL0"));
}
Example 4
Source File: TestAvroImport.java From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testFirstUnderscoreInColumnName() throws IOException {
  String[] names = { "_NAME" };
  String[] types = { "INT" };
  String[] vals = { "1987" };
  createTableWithColTypesAndNames(names, types, vals);

  runImport(getOutputArgv(true, null));

  Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
  DataFileReader<GenericRecord> reader = read(outputFile);
  Schema schema = reader.getSchema();
  assertEquals(Schema.Type.RECORD, schema.getType());
  List<Field> fields = schema.getFields();
  assertEquals(types.length, fields.size());

  checkField(fields.get(0), "__NAME", Type.INT);

  GenericRecord record1 = reader.next();
  assertEquals("__NAME", 1987, record1.get("__NAME"));
}
Example 5
Source File: TestAvroImport.java From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testNonstandardCharactersInColumnName() throws IOException {
  String[] names = { "avro\uC3A11" };
  String[] types = { "INT" };
  String[] vals = { "1987" };
  createTableWithColTypesAndNames(names, types, vals);

  runImport(getOutputArgv(true, null));

  Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
  DataFileReader<GenericRecord> reader = read(outputFile);
  Schema schema = reader.getSchema();
  assertEquals(Schema.Type.RECORD, schema.getType());
  List<Field> fields = schema.getFields();
  assertEquals(types.length, fields.size());

  checkField(fields.get(0), "AVRO1", Type.INT);

  GenericRecord record1 = reader.next();
  assertEquals("AVRO1", 1987, record1.get("AVRO1"));
}
Example 6
Source File: TestAvroImport.java From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testNonIdentCharactersInColumnName() throws IOException {
  String[] names = { "test_a-v+r/o" };
  String[] types = { "INT" };
  String[] vals = { "2015" };
  createTableWithColTypesAndNames(names, types, vals);

  runImport(getOutputArgv(true, null));

  Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
  DataFileReader<GenericRecord> reader = read(outputFile);
  Schema schema = reader.getSchema();
  assertEquals(Schema.Type.RECORD, schema.getType());
  List<Field> fields = schema.getFields();
  assertEquals(types.length, fields.size());

  checkField(fields.get(0), "TEST_A_V_R_O", Type.INT);

  GenericRecord record1 = reader.next();
  assertEquals("TEST_A_V_R_O", 2015, record1.get("TEST_A_V_R_O"));
}
Example 7
Source File: TestFlumeEventAvroEventSerializer.java From mt-flume with Apache License 2.0
public void validateAvroFile(File file) throws IOException {
  // read the events back using GenericRecord
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileReader<GenericRecord> fileReader =
      new DataFileReader<GenericRecord>(file, reader);
  GenericRecord record = new GenericData.Record(fileReader.getSchema());
  int numEvents = 0;
  while (fileReader.hasNext()) {
    fileReader.next(record);
    ByteBuffer body = (ByteBuffer) record.get("body");
    CharsetDecoder decoder = Charsets.UTF_8.newDecoder();
    String bodyStr = decoder.decode(body).toString();
    System.out.println(bodyStr);
    numEvents++;
  }
  fileReader.close();
  Assert.assertEquals("Should have found a total of 3 events", 3, numEvents);
}
Example 8
Source File: BinaryAvroSchemaFileReader.java From pxf with Apache License 2.0
@Override
public Schema readSchema(Configuration configuration, String schemaName,
    HcfsType hcfsType, AvroUtilities.FileSearcher fileSearcher) throws IOException {
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  DataFileReader<GenericRecord> fileReader = null;
  try {
    File file = fileSearcher.searchForFile(schemaName);
    if (file == null) {
      final Path path = new Path(hcfsType.getDataUri(configuration, schemaName));
      FsInput inStream = new FsInput(path, configuration);
      fileReader = new DataFileReader<>(inStream, datumReader);
    } else {
      fileReader = new DataFileReader<>(file, datumReader);
    }
    return fileReader.getSchema();
  } finally {
    if (fileReader != null) {
      fileReader.close();
    }
  }
}
Example 9
Source File: FileAwareInputStreamExtractorWithCheckSchema.java From incubator-gobblin with Apache License 2.0
/**
 * Use {@link AvroSchemaCheckStrategy} to make sure the real schema and the
 * expected schema have matching field names and types.
 *
 * @param fsFromFile
 * @return
 * @throws IOException
 */
protected boolean schemaChecking(FileSystem fsFromFile) throws IOException {
  if (!this.state.getPropAsBoolean(CopySource.SCHEMA_CHECK_ENABLED,
      CopySource.DEFAULT_SCHEMA_CHECK_ENABLED)) {
    return true;
  }
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(
      new FsInput(this.file.getFileStatus().getPath(), new Configuration()),
      datumReader);
  Schema schema = dataFileReader.getSchema();
  if (this.state.getProp(ConfigurationKeys.COPY_EXPECTED_SCHEMA) == null) {
    throw new IOException("Expected schema is not set properly");
  }
  Schema expectedSchema = new Schema.Parser()
      .parse(this.state.getProp(ConfigurationKeys.COPY_EXPECTED_SCHEMA));
  AvroSchemaCheckStrategy strategy =
      AvroSchemaCheckStrategy.AvroSchemaCheckStrategyFactory.create(this.state);
  if (strategy == null) {
    throw new IOException("schema check strategy cannot be initialized");
  }
  return strategy.compare(expectedSchema, schema);
}
Example 10
Source File: AvroUtils.java From Cubert with Apache License 2.0
public static Schema getSchema(SeekableInput input) throws IOException {
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
  DataFileReader<GenericRecord> dataFileReader =
      new DataFileReader<GenericRecord>(input, datumReader);
  Schema schema = dataFileReader.getSchema();

  if (PadDefaultNullsToSchema) {
    // a list of "cloned" fields, with optional default value set to null
    ArrayList<Field> paddedFields = new ArrayList<Field>();

    for (Field field : schema.getFields()) {
      // should this field be padded?
      boolean needsNullPadding =
          (field.schema() != null) // the field has nested schema
              && (field.schema().getType().equals(Type.UNION)) // the nested schema is UNION
              && (field.schema().getTypes().get(0).getType().equals(Type.NULL)); // the first element of union is NULL type

      JsonNode defValue = needsNullPadding ? NullNode.getInstance() : field.defaultValue();

      Field f = new Field(field.name(), field.schema(), field.doc(), defValue);
      paddedFields.add(f);
    }

    schema = Schema.createRecord(schema.getName(), schema.getDoc(),
        schema.getNamespace(), schema.isError());
    schema.setFields(paddedFields);
  }

  return schema;
}
Example 11
Source File: Purge.java From Cubert with Apache License 2.0
private DataFileWriter<GenericRecord> createDataFileWriter(
    DataFileReader<GenericRecord> dataFileReader)
    throws IllegalArgumentException, IOException {
  Schema schema = dataFileReader.getSchema();
  DatumWriter<GenericRecord> datumWriter =
      new GenericDatumWriter<GenericRecord>(schema);
  DataFileWriter<GenericRecord> writer =
      new DataFileWriter<GenericRecord>(datumWriter);

  // Get the codec of the reader
  String codecStr = dataFileReader.getMetaString(DataFileConstants.CODEC);
  int level = conf.getInt("avro.mapred.deflate.level", 1);
  String codecName = conf.get("avro.output.codec", codecStr);
  CodecFactory factory = codecName.equals("deflate")
      ? CodecFactory.deflateCodec(level)
      : CodecFactory.fromString(codecName);

  // Set the codec of the writer
  writer.setCodec(factory);

  writer.setSyncInterval(conf.getInt("avro.mapred.sync.interval",
      Math.max(conf.getInt("io.file.buffer.size", 16000), 16000)));

  writer.create(schema,
      new Path(tempFileName).getFileSystem(conf).create(new Path(tempFileName)));

  return writer;
}
Example 12
Source File: AvroHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0
@Test
public void testWrite() throws IOException {
  // Write all test records
  for (String record : TestConstants.JSON_RECORDS) {
    this.writer.write(convertRecord(record));
  }

  Assert.assertEquals(this.writer.recordsWritten(), 3);

  this.writer.close();
  this.writer.commit();

  File outputFile = new File(
      TestConstants.TEST_OUTPUT_DIR + Path.SEPARATOR + this.filePath,
      TestConstants.TEST_FILE_NAME);
  DataFileReader<GenericRecord> reader =
      new DataFileReader<>(outputFile, new GenericDatumReader<GenericRecord>());
  Schema fileSchema = reader.getSchema();
  Assert.assertEquals(fileSchema.getProp(TEST_PROPERTY_KEY), TEST_PROPERTY_VALUE);

  // Read the records back and assert they are identical to the ones written
  GenericRecord user1 = reader.next();
  // Strings are in UTF8, so we have to call toString() here and below
  Assert.assertEquals(user1.get("name").toString(), "Alyssa");
  Assert.assertEquals(user1.get("favorite_number"), 256);
  Assert.assertEquals(user1.get("favorite_color").toString(), "yellow");

  GenericRecord user2 = reader.next();
  Assert.assertEquals(user2.get("name").toString(), "Ben");
  Assert.assertEquals(user2.get("favorite_number"), 7);
  Assert.assertEquals(user2.get("favorite_color").toString(), "red");

  GenericRecord user3 = reader.next();
  Assert.assertEquals(user3.get("name").toString(), "Charlie");
  Assert.assertEquals(user3.get("favorite_number"), 68);
  Assert.assertEquals(user3.get("favorite_color").toString(), "blue");

  reader.close();

  FsWriterMetrics metrics = FsWriterMetrics.fromJson(
      properties.getProp(FsDataWriter.FS_WRITER_METRICS_KEY));
  Assert.assertEquals(metrics.fileInfos.size(), 1);
  FsWriterMetrics.FileInfo fileInfo = metrics.fileInfos.iterator().next();

  Assert.assertEquals(fileInfo.fileName, TestConstants.TEST_FILE_NAME);
  Assert.assertEquals(fileInfo.numRecords, 3);
  Assert.assertNull(metrics.partitionInfo.partitionKey);
  Assert.assertEquals(metrics.partitionInfo.branchId, 0);
}
Example 13
Source File: TestSyslogAvroEventSerializer.java From mt-flume with Apache License 2.0
@Test
public void test() throws FileNotFoundException, IOException {
  // Snappy currently broken on Mac in OpenJDK 7 per FLUME-2012
  Assume.assumeTrue(!"Mac OS X".equals(System.getProperty("os.name"))
      || !System.getProperty("java.version").startsWith("1.7."));

  //Schema schema = new Schema.Parser().parse(schemaFile);

  // create the file, write some data
  OutputStream out = new FileOutputStream(testFile);
  String builderName = SyslogAvroEventSerializer.Builder.class.getName();

  Context ctx = new Context();
  ctx.put("syncInterval", "4096");
  ctx.put("compressionCodec", "snappy");

  EventSerializer serializer =
      EventSerializerFactory.getInstance(builderName, ctx, out);
  serializer.afterCreate(); // must call this when a file is newly created

  List<Event> events = generateSyslogEvents();
  for (Event e : events) {
    serializer.write(e);
  }
  serializer.flush();
  serializer.beforeClose();
  out.flush();
  out.close();

  // now try to read the file back
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileReader<GenericRecord> fileReader =
      new DataFileReader<GenericRecord>(testFile, reader);

  GenericRecord record = new GenericData.Record(fileReader.getSchema());
  int numEvents = 0;
  while (fileReader.hasNext()) {
    fileReader.next(record);
    int facility = (Integer) record.get("facility");
    int severity = (Integer) record.get("severity");
    long timestamp = (Long) record.get("timestamp");
    String hostname = record.get("hostname").toString();
    String message = record.get("message").toString();

    Assert.assertEquals("Facility should be 1", 1, facility);
    System.out.println(timestamp + ": " + message);
    numEvents++;
  }

  fileReader.close();
  Assert.assertEquals("Should have found a total of 3 events", 3, numEvents);

  FileUtils.forceDelete(testFile);
}
Example 14
Source File: TestAvroImport.java From aliyun-maxcompute-data-collectors with Apache License 2.0
/**
 * Helper method that runs an import using Avro with optional command line
 * arguments and checks that the created file matches the expectations.
 * <p/>
 * This can be used to test various extra options that are implemented for
 * the Avro input.
 *
 * @param extraArgs extra command line arguments to pass to Sqoop in addition
 *                  to those that {@link #getOutputArgv(boolean, String[])}
 *                  returns
 */
private void avroImportTestHelper(String[] extraArgs, String codec) throws IOException {
  String[] types = { "BIT", "INTEGER", "BIGINT", "REAL", "DOUBLE",
      "VARCHAR(6)", "VARBINARY(2)", };
  String[] vals = { "true", "100", "200", "1.0", "2.0", "'s'", "'0102'", };
  createTableWithColTypes(types, vals);

  runImport(getOutputArgv(true, extraArgs));

  Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
  DataFileReader<GenericRecord> reader = read(outputFile);
  Schema schema = reader.getSchema();
  assertEquals(Schema.Type.RECORD, schema.getType());
  List<Field> fields = schema.getFields();
  assertEquals(types.length, fields.size());

  checkField(fields.get(0), "DATA_COL0", Schema.Type.BOOLEAN);
  checkField(fields.get(1), "DATA_COL1", Schema.Type.INT);
  checkField(fields.get(2), "DATA_COL2", Schema.Type.LONG);
  checkField(fields.get(3), "DATA_COL3", Schema.Type.FLOAT);
  checkField(fields.get(4), "DATA_COL4", Schema.Type.DOUBLE);
  checkField(fields.get(5), "DATA_COL5", Schema.Type.STRING);
  checkField(fields.get(6), "DATA_COL6", Schema.Type.BYTES);

  GenericRecord record1 = reader.next();
  assertEquals("DATA_COL0", true, record1.get("DATA_COL0"));
  assertEquals("DATA_COL1", 100, record1.get("DATA_COL1"));
  assertEquals("DATA_COL2", 200L, record1.get("DATA_COL2"));
  assertEquals("DATA_COL3", 1.0f, record1.get("DATA_COL3"));
  assertEquals("DATA_COL4", 2.0, record1.get("DATA_COL4"));
  assertEquals("DATA_COL5", new Utf8("s"), record1.get("DATA_COL5"));
  Object object = record1.get("DATA_COL6");
  assertTrue(object instanceof ByteBuffer);
  ByteBuffer b = ((ByteBuffer) object);
  assertEquals((byte) 1, b.get(0));
  assertEquals((byte) 2, b.get(1));

  if (codec != null) {
    assertEquals(codec, reader.getMetaString(DataFileConstants.CODEC));
  }

  checkSchemaFile(schema);
}
Example 15
Source File: TestJavaAvroEventSerializer.java From flume-plugins with MIT License
@Test
public void test() throws FileNotFoundException, IOException {
  // create the file, write some data
  OutputStream out = new FileOutputStream(testFile);
  String builderName = JavaLogAvroEventSerializer.Builder.class.getName();

  Context ctx = new Context();
  ctx.put("syncInterval", "4096");

  EventSerializer serializer =
      EventSerializerFactory.getInstance(builderName, ctx, out);
  serializer.afterCreate(); // must call this when a file is newly created

  List<Event> events = generateJavaEvents();
  for (Event e : events) {
    serializer.write(e);
  }
  serializer.flush();
  serializer.beforeClose();
  out.flush();
  out.close();

  // now try to read the file back
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileReader<GenericRecord> fileReader =
      new DataFileReader<GenericRecord>(testFile, reader);

  GenericRecord record = new GenericData.Record(fileReader.getSchema());
  int numEvents = 0;
  while (fileReader.hasNext()) {
    fileReader.next(record);
    long timestamp = (Long) record.get("timestamp");
    String datetime = record.get("datetime").toString();
    String classname = record.get("classname").toString();
    String message = record.get("message").toString();

    System.out.println(classname + ": " + message + " (at " + datetime + ")");
    numEvents++;
  }

  fileReader.close();
  Assert.assertEquals("Should have found a total of 4 events", 4, numEvents);

  FileUtils.forceDelete(testFile);
}
Example 16
Source File: TestSyslogAvroEventSerializer.java From flume-plugins with MIT License
@Test
public void test() throws FileNotFoundException, IOException {
  // create the file, write some data
  OutputStream out = new FileOutputStream(testFile);
  String builderName = SyslogAvroEventSerializer.Builder.class.getName();

  Context ctx = new Context();
  ctx.put("syncInterval", "4096");
  ctx.put("path", "src/test/resources/customerToHostsFile.txt");

  EventSerializer serializer =
      EventSerializerFactory.getInstance(builderName, ctx, out);
  serializer.afterCreate(); // must call this when a file is newly created

  List<Event> events = generateSyslogEvents();
  for (Event e : events) {
    serializer.write(e);
  }
  serializer.flush();
  serializer.beforeClose();
  out.flush();
  out.close();

  // now try to read the file back
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileReader<GenericRecord> fileReader =
      new DataFileReader<GenericRecord>(testFile, reader);

  GenericRecord record = new GenericData.Record(fileReader.getSchema());
  int numEvents = 0;
  while (fileReader.hasNext()) {
    fileReader.next(record);
    long timestamp = (Long) record.get("timestamp");
    String datetime = record.get("datetime").toString();
    String hostname = record.get("hostname").toString();
    Map<String, String> headers = (Map<String, String>) record.get("headers");
    String message = record.get("message").toString();

    System.out.println(hostname + " (" + headers + ")" + ": " + message);
    numEvents++;
  }

  fileReader.close();
  Assert.assertEquals("Should have found a total of 6 events", 6, numEvents);

  FileUtils.forceDelete(testFile);
}
Example 17
Source File: TestApacheAvroEventSerializer.java From flume-plugins with MIT License
@Test
public void test() throws FileNotFoundException, IOException {
  // create the file, write some data
  OutputStream out = new FileOutputStream(testFile);
  String builderName = ApacheLogAvroEventSerializer.Builder.class.getName();

  Context ctx = new Context();
  ctx.put("syncInterval", "4096");

  EventSerializer serializer =
      EventSerializerFactory.getInstance(builderName, ctx, out);
  serializer.afterCreate(); // must call this when a file is newly created

  List<Event> events = generateApacheEvents();
  for (Event e : events) {
    serializer.write(e);
  }
  serializer.flush();
  serializer.beforeClose();
  out.flush();
  out.close();

  // now try to read the file back
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileReader<GenericRecord> fileReader =
      new DataFileReader<GenericRecord>(testFile, reader);

  GenericRecord record = new GenericData.Record(fileReader.getSchema());
  int numEvents = 0;
  while (fileReader.hasNext()) {
    fileReader.next(record);
    String ip = record.get("ip").toString();
    String uri = record.get("uri").toString();
    Integer statuscode = (Integer) record.get("statuscode");
    String original = record.get("original").toString();
    String connectionstatus = record.get("connectionstatus").toString();

    Assert.assertEquals("Ip should be 80.79.194.3", "80.79.194.3", ip);
    System.out.println("IP " + ip + " requested: " + uri + " with status code "
        + statuscode + " and connectionstatus: " + connectionstatus);
    System.out.println("Original logline: " + original);
    numEvents++;
  }

  fileReader.close();
  Assert.assertEquals("Should have found a total of 2 events", 2, numEvents);

  FileUtils.forceDelete(testFile);
}