org.apache.avro.file.DataFileConstants Java Examples
The following examples show how to use
org.apache.avro.file.DataFileConstants.
You can go to the original project or source file by following the links above each example.
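Before the project examples, here is a minimal, self-contained sketch of the typical pattern: the codec-name constants in DataFileConstants are passed to CodecFactory.fromString when writing an Avro container file, and the chosen codec can be read back from the file metadata under DataFileConstants.CODEC. The schema, field name, and output path below are invented for illustration and are not taken from any of the projects listed.

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileConstants;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class DataFileConstantsSketch {

  public static void main(String[] args) throws IOException {
    // Hypothetical one-field record schema, only for illustration.
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"Line\",\"fields\":[{\"name\":\"text\",\"type\":\"string\"}]}");

    File file = new File("example.avro"); // made-up output path

    // DataFileConstants holds the canonical codec names ("null", "deflate", "snappy", ...),
    // which CodecFactory.fromString(...) understands. Deflate needs no extra dependencies.
    String codecName = DataFileConstants.DEFLATE_CODEC;

    try (DataFileWriter<GenericRecord> writer =
        new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
      writer.setCodec(CodecFactory.fromString(codecName));
      writer.create(schema, file);

      GenericRecord record = new GenericData.Record(schema);
      record.put("text", "hello");
      writer.append(record);
    }

    // The codec name is stored in the file metadata under DataFileConstants.CODEC ("avro.codec").
    try (DataFileReader<GenericRecord> reader =
        new DataFileReader<>(file, new GenericDatumReader<GenericRecord>(schema))) {
      System.out.println("codec = " + reader.getMetaString(DataFileConstants.CODEC));
    }
  }
}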
Example #1
Source File: AvroKeyValueSinkWriter.java From Flink-CEPplus with Apache License 2.0 | 6 votes |
private CodecFactory getCompressionCodec(Map<String, String> conf) {
  if (getBoolean(conf, CONF_COMPRESS, false)) {
    int deflateLevel = getInt(conf, CONF_DEFLATE_LEVEL, CodecFactory.DEFAULT_DEFLATE_LEVEL);
    int xzLevel = getInt(conf, CONF_XZ_LEVEL, CodecFactory.DEFAULT_XZ_LEVEL);

    String outputCodec = conf.get(CONF_COMPRESS_CODEC);

    if (DataFileConstants.DEFLATE_CODEC.equals(outputCodec)) {
      return CodecFactory.deflateCodec(deflateLevel);
    } else if (DataFileConstants.XZ_CODEC.equals(outputCodec)) {
      return CodecFactory.xzCodec(xzLevel);
    } else {
      return CodecFactory.fromString(outputCodec);
    }
  }
  return CodecFactory.nullCodec();
}
Example #2
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testCreateFromMetadata() throws Exception {
  List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
  String codec = DataFileConstants.NULL_CODEC;
  String filename =
      generateTestFile(
          codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);
  Metadata fileMeta = FileSystems.matchSingleFileSpec(filename);

  AvroSource<GenericRecord> source = AvroSource.from(fileMeta);
  AvroSource<Bird> sourceWithSchema = source.withSchema(Bird.class);
  AvroSource<Bird> sourceWithSchemaWithMinBundleSize = sourceWithSchema.withMinBundleSize(1234);
  assertEquals(FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, source.getMode());
  assertEquals(FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, sourceWithSchema.getMode());
  assertEquals(
      FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE,
      sourceWithSchemaWithMinBundleSize.getMode());
}
Example #3
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testReadMetadataWithCodecs() throws Exception {
  // Test reading files generated using all codecs.
  String[] codecs = {
    DataFileConstants.NULL_CODEC,
    DataFileConstants.BZIP2_CODEC,
    DataFileConstants.DEFLATE_CODEC,
    DataFileConstants.SNAPPY_CODEC,
    DataFileConstants.XZ_CODEC
  };
  List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
  for (String codec : codecs) {
    String filename =
        generateTestFile(
            codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);

    Metadata fileMeta = FileSystems.matchSingleFileSpec(filename);
    AvroMetadata metadata = AvroSource.readMetadataFromFile(fileMeta.resourceId());
    assertEquals(codec, metadata.getCodec());
  }
}
Example #4
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testParseFn() throws Exception {
  List<Bird> expected = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<Bird> source =
      AvroSource.from(filename)
          .withParseFn(
              input ->
                  new Bird(
                      (long) input.get("number"),
                      input.get("species").toString(),
                      input.get("quality").toString(),
                      (long) input.get("quantity")),
              AvroCoder.of(Bird.class));
  List<Bird> actual = SourceTestUtils.readFromSource(source, null);
  assertThat(actual, containsInAnyOrder(expected.toArray()));
}
Example #5
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testSchemaStringIsInterned() throws Exception {
  List<Bird> birds = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          birds,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
  Metadata fileMetadata = FileSystems.matchSingleFileSpec(filename);
  String schema = AvroSource.readMetadataFromFile(fileMetadata.resourceId()).getSchemaString();
  // Add "" to the schema to make sure it is not interned.
  AvroSource<GenericRecord> sourceA = AvroSource.from(filename).withSchema("" + schema);
  AvroSource<GenericRecord> sourceB = AvroSource.from(filename).withSchema("" + schema);
  assertSame(sourceA.getReaderSchemaString(), sourceB.getReaderSchemaString());

  // Ensure that deserialization still goes through interning
  AvroSource<GenericRecord> sourceC = SerializableUtils.clone(sourceB);
  assertSame(sourceA.getReaderSchemaString(), sourceC.getReaderSchemaString());
}
Example #6
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testSchemaUpdate() throws Exception {
  List<Bird> birds = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          birds,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
  AvroSource<FancyBird> source = AvroSource.from(filename).withSchema(FancyBird.class);
  List<FancyBird> actual = SourceTestUtils.readFromSource(source, null);

  List<FancyBird> expected = new ArrayList<>();
  for (Bird bird : birds) {
    expected.add(
        new FancyBird(
            bird.number, bird.species, bird.quality, bird.quantity, null, "MAXIMUM OVERDRIVE"));
  }

  assertThat(actual, containsInAnyOrder(expected.toArray()));
}
Example #7
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testCreationWithSchema() throws Exception {
  List<Bird> expected = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);

  // Create a source with a schema object
  Schema schema = ReflectData.get().getSchema(Bird.class);
  AvroSource<GenericRecord> source = AvroSource.from(filename).withSchema(schema);
  List<GenericRecord> records = SourceTestUtils.readFromSource(source, null);
  assertEqualsWithGeneric(expected, records);

  // Create a source with a JSON schema
  String schemaString = ReflectData.get().getSchema(Bird.class).toString();
  source = AvroSource.from(filename).withSchema(schemaString);
  records = SourceTestUtils.readFromSource(source, null);
  assertEqualsWithGeneric(expected, records);
}
Example #8
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testMultipleFiles() throws Exception {
  String baseName = "tmp-";
  List<Bird> expected = new ArrayList<>();
  for (int i = 0; i < 10; i++) {
    List<Bird> contents = createRandomRecords(DEFAULT_RECORD_COUNT / 10);
    expected.addAll(contents);
    generateTestFile(
        baseName + i,
        contents,
        SyncBehavior.SYNC_DEFAULT,
        0,
        AvroCoder.of(Bird.class),
        DataFileConstants.NULL_CODEC);
  }

  AvroSource<Bird> source =
      AvroSource.from(new File(tmpFolder.getRoot().toString(), baseName + "*").toString())
          .withSchema(Bird.class);
  List<Bird> actual = SourceTestUtils.readFromSource(source, null);
  assertThat(actual, containsInAnyOrder(expected.toArray()));
}
Example #9
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testSplitAtFractionExhaustive() throws Exception {
  // A small-sized input is sufficient, because the test verifies that splitting is non-vacuous.
  List<FixedRecord> expected = createFixedRecords(20);
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_REGULAR,
          5,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  SourceTestUtils.assertSplitAtFractionExhaustive(source, null);
}
Example #10
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testGetCurrentFromUnstartedReader() throws Exception {
  List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BlockBasedSource.BlockBasedReader<FixedRecord> reader =
      (BlockBasedSource.BlockBasedReader<FixedRecord>) source.createReader(null)) {
    assertEquals(null, reader.getCurrentBlock());

    expectedException.expect(NoSuchElementException.class);
    expectedException.expectMessage("No block has been successfully read from");
    reader.getCurrent();
  }
}
Example #11
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testGetProgressFromUnstartedReader() throws Exception {
  List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);
  File file = new File(filename);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BoundedSource.BoundedReader<FixedRecord> reader = source.createReader(null)) {
    assertEquals(Double.valueOf(0.0), reader.getFractionConsumed());
  }

  List<? extends BoundedSource<FixedRecord>> splits = source.split(file.length() / 3, null);
  for (BoundedSource<FixedRecord> subSource : splits) {
    try (BoundedSource.BoundedReader<FixedRecord> reader = subSource.createReader(null)) {
      assertEquals(Double.valueOf(0.0), reader.getFractionConsumed());
    }
  }
}
Example #12
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testReadWithDifferentCodecs() throws Exception {
  // Test reading files generated using all codecs.
  String[] codecs = {
    DataFileConstants.NULL_CODEC,
    DataFileConstants.BZIP2_CODEC,
    DataFileConstants.DEFLATE_CODEC,
    DataFileConstants.SNAPPY_CODEC,
    DataFileConstants.XZ_CODEC,
  };
  // As Avro's default block size is 64KB, write 64K records to ensure at least one full block.
  // We could make this smaller than 64KB assuming each record is at least B bytes, but then the
  // test could silently stop testing the failure condition from BEAM-422.
  List<Bird> expected = createRandomRecords(1 << 16);

  for (String codec : codecs) {
    String filename =
        generateTestFile(
            codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);

    AvroSource<Bird> source = AvroSource.from(filename).withSchema(Bird.class);
    List<Bird> actual = SourceTestUtils.readFromSource(source, null);
    assertThat(expected, containsInAnyOrder(actual.toArray()));
  }
}
Example #13
Source File: AvroSource.java From beam with Apache License 2.0 | 6 votes |
AvroBlock(byte[] data, long numRecords, Mode<T> mode, String writerSchemaString, String codec)
    throws IOException {
  this.mode = mode;
  this.numRecords = numRecords;
  checkNotNull(writerSchemaString, "writerSchemaString");
  Schema writerSchema = internOrParseSchemaString(writerSchemaString);
  Schema readerSchema =
      internOrParseSchemaString(
          MoreObjects.firstNonNull(mode.readerSchemaString, writerSchemaString));

  this.reader = mode.createReader(writerSchema, readerSchema);

  if (codec.equals(DataFileConstants.NULL_CODEC)) {
    // Avro can read from a byte[] using a more efficient implementation. If the input is not
    // compressed, pass the data in directly.
    this.decoder = DecoderFactory.get().binaryDecoder(data, null);
  } else {
    this.decoder = DecoderFactory.get().binaryDecoder(decodeAsInputStream(data, codec), null);
  }
}
Example #14
Source File: AvroSource.java From beam with Apache License 2.0 | 6 votes |
/**
 * Decodes a byte array as an InputStream. The byte array may be compressed using some codec.
 * Reads from the returned stream will result in decompressed bytes.
 *
 * <p>This supports the same codecs as Avro's {@link CodecFactory}, namely those defined in
 * {@link DataFileConstants}.
 *
 * <ul>
 *   <li>"snappy" : Google's Snappy compression
 *   <li>"deflate" : deflate compression
 *   <li>"bzip2" : Bzip2 compression
 *   <li>"xz" : xz compression
 *   <li>"null" (the string, not the value): Uncompressed data
 * </ul>
 */
private static InputStream decodeAsInputStream(byte[] data, String codec) throws IOException {
  ByteArrayInputStream byteStream = new ByteArrayInputStream(data);
  switch (codec) {
    case DataFileConstants.SNAPPY_CODEC:
      return new SnappyCompressorInputStream(byteStream, 1 << 16 /* Avro uses 64KB blocks */);
    case DataFileConstants.DEFLATE_CODEC:
      // nowrap == true: Do not expect ZLIB header or checksum, as Avro does not write them.
      Inflater inflater = new Inflater(true);
      return new InflaterInputStream(byteStream, inflater);
    case DataFileConstants.XZ_CODEC:
      return new XZCompressorInputStream(byteStream);
    case DataFileConstants.BZIP2_CODEC:
      return new BZip2CompressorInputStream(byteStream);
    case DataFileConstants.NULL_CODEC:
      return byteStream;
    default:
      throw new IllegalArgumentException("Unsupported codec: " + codec);
  }
}
Example #15
Source File: AvroKeyValueSinkWriter.java From flink with Apache License 2.0 | 6 votes |
private CodecFactory getCompressionCodec(Map<String, String> conf) {
  if (getBoolean(conf, CONF_COMPRESS, false)) {
    int deflateLevel = getInt(conf, CONF_DEFLATE_LEVEL, CodecFactory.DEFAULT_DEFLATE_LEVEL);
    int xzLevel = getInt(conf, CONF_XZ_LEVEL, CodecFactory.DEFAULT_XZ_LEVEL);

    String outputCodec = conf.get(CONF_COMPRESS_CODEC);

    if (DataFileConstants.DEFLATE_CODEC.equals(outputCodec)) {
      return CodecFactory.deflateCodec(deflateLevel);
    } else if (DataFileConstants.XZ_CODEC.equals(outputCodec)) {
      return CodecFactory.xzCodec(xzLevel);
    } else {
      return CodecFactory.fromString(outputCodec);
    }
  }
  return CodecFactory.nullCodec();
}
Example #16
Source File: LobAvroImportTestCase.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 6 votes |
/**
 * Import blob data that is smaller than inline lob limit and compress with
 * deflate codec. Blob data should be encoded and saved as Avro bytes.
 * @throws IOException
 * @throws SQLException
 */
public void testBlobCompressedAvroImportInline()
    throws IOException, SQLException {
  String[] types = { getBlobType() };
  String expectedVal = "This is short BLOB data";
  String[] vals = { getBlobInsertStr(expectedVal) };

  createTableWithColTypes(types, vals);

  runImport(getArgv("--compression-codec", CodecMap.DEFLATE));

  Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
  DataFileReader<GenericRecord> reader = read(outputFile);
  GenericRecord record = reader.next();

  // Verify that the data block of the Avro file is compressed with deflate
  // codec.
  assertEquals(CodecMap.DEFLATE,
      reader.getMetaString(DataFileConstants.CODEC));

  // Verify that all columns are imported correctly.
  ByteBuffer buf = (ByteBuffer) record.get(getColName(0));
  String returnVal = new String(buf.array());

  assertEquals(getColName(0), expectedVal, returnVal);
}
Example #17
Source File: AvroKeyValueSinkWriterTest.java From Flink-CEPplus with Apache License 2.0 | 6 votes |
@Test
public void testDuplicate() {
  Map<String, String> properties = new HashMap<>();
  Schema keySchema = Schema.create(Schema.Type.STRING);
  Schema valueSchema = Schema.create(Schema.Type.STRING);
  properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
  properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
  properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
  properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

  AvroKeyValueSinkWriter<String, String> writer = new AvroKeyValueSinkWriter(properties);
  writer.setSyncOnFlush(true);
  AvroKeyValueSinkWriter<String, String> other = writer.duplicate();

  assertTrue(StreamWriterBaseComparator.equals(writer, other));

  writer.setSyncOnFlush(false);
  assertFalse(StreamWriterBaseComparator.equals(writer, other));
}
Example #18
Source File: AvroKeyValueSinkWriter.java From flink with Apache License 2.0 | 6 votes |
private CodecFactory getCompressionCodec(Map<String, String> conf) {
  if (getBoolean(conf, CONF_COMPRESS, false)) {
    int deflateLevel = getInt(conf, CONF_DEFLATE_LEVEL, CodecFactory.DEFAULT_DEFLATE_LEVEL);
    int xzLevel = getInt(conf, CONF_XZ_LEVEL, CodecFactory.DEFAULT_XZ_LEVEL);

    String outputCodec = conf.get(CONF_COMPRESS_CODEC);

    if (DataFileConstants.DEFLATE_CODEC.equals(outputCodec)) {
      return CodecFactory.deflateCodec(deflateLevel);
    } else if (DataFileConstants.XZ_CODEC.equals(outputCodec)) {
      return CodecFactory.xzCodec(xzLevel);
    } else {
      return CodecFactory.fromString(outputCodec);
    }
  }
  return CodecFactory.nullCodec();
}
Example #19
Source File: AvroKeyValueSinkWriterTest.java From flink with Apache License 2.0 | 6 votes |
@Test
public void testDuplicate() {
  Map<String, String> properties = new HashMap<>();
  Schema keySchema = Schema.create(Schema.Type.STRING);
  Schema valueSchema = Schema.create(Schema.Type.STRING);
  properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
  properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
  properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
  properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

  AvroKeyValueSinkWriter<String, String> writer = new AvroKeyValueSinkWriter(properties);
  writer.setSyncOnFlush(true);
  AvroKeyValueSinkWriter<String, String> other = writer.duplicate();

  assertTrue(StreamWriterBaseComparator.equals(writer, other));

  writer.setSyncOnFlush(false);
  assertFalse(StreamWriterBaseComparator.equals(writer, other));
}
Example #20
Source File: AvroKeyValueSinkWriterTest.java From flink with Apache License 2.0 | 6 votes |
@Test
public void testDuplicate() {
  Map<String, String> properties = new HashMap<>();
  Schema keySchema = Schema.create(Schema.Type.STRING);
  Schema valueSchema = Schema.create(Schema.Type.STRING);
  properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
  properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
  properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
  properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

  AvroKeyValueSinkWriter<String, String> writer = new AvroKeyValueSinkWriter(properties);
  writer.setSyncOnFlush(true);
  AvroKeyValueSinkWriter<String, String> other = writer.duplicate();

  assertTrue(StreamWriterBaseComparator.equals(writer, other));

  writer.setSyncOnFlush(false);
  assertFalse(StreamWriterBaseComparator.equals(writer, other));
}
Example #21
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 5 votes |
@Test
public void testSplitAtFraction() throws Exception {
  // A reduced dataset is enough here.
  List<FixedRecord> expected = createFixedRecords(DEFAULT_RECORD_COUNT);
  // Create an AvroSource where each block is 1/10th of the total set of records.
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_REGULAR,
          DEFAULT_RECORD_COUNT / 10 /* max records per block */,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);
  File file = new File(filename);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  List<? extends BoundedSource<FixedRecord>> splits = source.split(file.length() / 3, null);
  for (BoundedSource<FixedRecord> subSource : splits) {
    int items = SourceTestUtils.readFromSource(subSource, null).size();
    // Shouldn't split while unstarted.
    SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.0, null);
    SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.7, null);
    SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, 1, 0.7, null);
    SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(
        subSource, DEFAULT_RECORD_COUNT / 100, 0.7, null);
    SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(
        subSource, DEFAULT_RECORD_COUNT / 10, 0.1, null);
    SourceTestUtils.assertSplitAtFractionFails(
        subSource, DEFAULT_RECORD_COUNT / 10 + 1, 0.1, null);
    SourceTestUtils.assertSplitAtFractionFails(subSource, DEFAULT_RECORD_COUNT / 3, 0.3, null);
    SourceTestUtils.assertSplitAtFractionFails(subSource, items, 0.9, null);
    SourceTestUtils.assertSplitAtFractionFails(subSource, items, 1.0, null);
    SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, items, 0.999, null);
  }
}
Example #22
Source File: Purge.java From Cubert with Apache License 2.0 | 5 votes |
private DataFileWriter<GenericRecord> createDataFileWriter(DataFileReader<GenericRecord> dataFileReader)
    throws IllegalArgumentException, IOException {
  Schema schema = dataFileReader.getSchema();
  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
  DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(datumWriter);

  // Get the codec of the reader
  String codecStr = dataFileReader.getMetaString(DataFileConstants.CODEC);
  int level = conf.getInt("avro.mapred.deflate.level", 1);
  String codecName = conf.get("avro.output.codec", codecStr);
  CodecFactory factory =
      codecName.equals("deflate") ? CodecFactory.deflateCodec(level)
                                  : CodecFactory.fromString(codecName);

  // Set the codec of the writer
  writer.setCodec(factory);

  writer.setSyncInterval(conf.getInt("avro.mapred.sync.interval",
      Math.max(conf.getInt("io.file.buffer.size", 16000), 16000)));

  writer.create(schema,
      new Path(tempFileName).getFileSystem(conf).create(new Path(tempFileName)));

  return writer;
}
Example #23
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 5 votes |
@Test
public void testProgressEmptySource() throws Exception {
  // 0 records, 20 per block.
  List<FixedRecord> records = Collections.emptyList();
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_REGULAR,
          2,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BoundedSource.BoundedReader<FixedRecord> readerOrig = source.createReader(null)) {
    assertThat(readerOrig, Matchers.instanceOf(BlockBasedReader.class));
    BlockBasedReader<FixedRecord> reader = (BlockBasedReader<FixedRecord>) readerOrig;

    // before starting
    assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
    assertEquals(0, reader.getSplitPointsConsumed());
    assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

    // confirm empty
    assertFalse(reader.start());

    // after reading empty source
    assertEquals(0, reader.getSplitPointsConsumed());
    assertEquals(0, reader.getSplitPointsRemaining());
    assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
  }
}
Example #24
Source File: WriterUtils.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Creates a {@link CodecFactory} based on the specified codec name and deflate level. If codecName is absent, then
 * a {@link CodecFactory#deflateCodec(int)} is returned. Otherwise the codecName is converted into a
 * {@link CodecFactory} via the {@link CodecFactory#fromString(String)} method.
 *
 * @param codecName the name of the codec to use (e.g. deflate, snappy, xz, etc.).
 * @param deflateLevel must be an integer from [0-9], and is only applicable if the codecName is "deflate".
 * @return a {@link CodecFactory}.
 */
public static CodecFactory getCodecFactory(Optional<String> codecName, Optional<String> deflateLevel) {
  if (!codecName.isPresent()) {
    return CodecFactory.deflateCodec(ConfigurationKeys.DEFAULT_DEFLATE_LEVEL);
  } else if (codecName.get().equalsIgnoreCase(DataFileConstants.DEFLATE_CODEC)) {
    if (!deflateLevel.isPresent()) {
      return CodecFactory.deflateCodec(ConfigurationKeys.DEFAULT_DEFLATE_LEVEL);
    }
    return CodecFactory.deflateCodec(Integer.parseInt(deflateLevel.get()));
  } else {
    return CodecFactory.fromString(codecName.get().toLowerCase());
  }
}
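As a usage sketch (not part of the Gobblin source shown above), calling this helper might look like the lines below. This assumes the Optional type is Guava's com.google.common.base.Optional, as used by this utility class, and the WriterUtils import path is written here only as a hypothetical example.

import com.google.common.base.Optional;
import org.apache.avro.file.CodecFactory;
import org.apache.gobblin.util.WriterUtils; // hypothetical package path, for illustration only

public class CodecFactorySketch {
  public static void main(String[] args) {
    // "deflate" honors the explicit level; any other codec name falls through to CodecFactory.fromString.
    CodecFactory deflate = WriterUtils.getCodecFactory(Optional.of("deflate"), Optional.of("7"));
    CodecFactory snappy = WriterUtils.getCodecFactory(Optional.of("snappy"), Optional.<String>absent());
    // Absent codec name: the helper falls back to deflate at the configured default level.
    CodecFactory fallback = WriterUtils.getCodecFactory(Optional.<String>absent(), Optional.<String>absent());
    System.out.println(deflate + " " + snappy + " " + fallback);
  }
}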
Example #25
Source File: ImportTransform.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@ProcessElement
public void processElement(ProcessContext c) {
  KV<String, String> kv = c.element();
  String schema = null;
  ResourceId resourceId = FileSystems.matchNewResource(kv.getValue(), false);
  try (InputStream stream = Channels.newInputStream(FileSystems.open(resourceId))) {
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(stream, null);
    byte[] magic = new byte[DataFileConstants.MAGIC.length];
    decoder.readFixed(magic);
    if (!Arrays.equals(magic, DataFileConstants.MAGIC)) {
      throw new IOException("Missing Avro file signature: " + kv.getValue());
    }

    // Read the metadata to find the codec and schema.
    ByteBuffer valueBuffer = ByteBuffer.allocate(512);
    long numRecords = decoder.readMapStart();
    while (numRecords > 0 && schema == null) {
      for (long recordIndex = 0; recordIndex < numRecords; recordIndex++) {
        String key = decoder.readString();
        // readBytes() clears the buffer and returns a buffer where:
        // - position is the start of the bytes read
        // - limit is the end of the bytes read
        valueBuffer = decoder.readBytes(valueBuffer);
        byte[] bytes = new byte[valueBuffer.remaining()];
        valueBuffer.get(bytes);
        if (key.equals(DataFileConstants.SCHEMA)) {
          schema = new String(bytes, "UTF-8");
          break;
        }
      }
      numRecords = decoder.mapNext();
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  c.output(KV.of(kv.getKey(), schema));
}
Example #26
Source File: TestExecuteSQL.java From nifi with Apache License 2.0 | 5 votes |
@Test
public void testCompression() throws SQLException, CompressorException, IOException {
  // remove previous test database, if any
  final File dbLocation = new File(DB_LOCATION);
  dbLocation.delete();

  // load test data to database
  final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
  Statement stmt = con.createStatement();

  try {
    stmt.execute("drop table TEST_NULL_INT");
  } catch (final SQLException sqle) {
  }

  stmt.execute("create table TEST_NULL_INT (id integer not null, val1 integer, val2 integer, constraint my_pk primary key (id))");

  stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (0, NULL, 1)");
  stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (1, 1, 1)");

  runner.setIncomingConnection(false);
  runner.setProperty(ExecuteSQL.COMPRESSION_FORMAT, AvroUtil.CodecType.BZIP2.name());
  runner.setProperty(ExecuteSQL.SQL_SELECT_QUERY, "SELECT * FROM TEST_NULL_INT");
  runner.run();

  runner.assertAllFlowFilesTransferred(ExecuteSQL.REL_SUCCESS, 1);

  MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExecuteSQL.REL_SUCCESS).get(0);

  try (DataFileStream<GenericRecord> dfs =
      new DataFileStream<>(new ByteArrayInputStream(flowFile.toByteArray()),
          new GenericDatumReader<GenericRecord>())) {
    assertEquals(AvroUtil.CodecType.BZIP2.name().toLowerCase(),
        dfs.getMetaString(DataFileConstants.CODEC).toLowerCase());
  }
}
Example #27
Source File: AvroHdfsFileSink.java From components with Apache License 2.0 | 5 votes |
@Override
protected void mergeOutput(FileSystem fs, String sourceFolder, String targetFile) throws IOException {
  try (DataFileWriter<GenericRecord> writer =
      new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>())) {
    FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
    Schema schema = null;
    String inputCodec = null;
    OutputStream output = new BufferedOutputStream(fs.create(new Path(targetFile)));
    for (FileStatus sourceStatus : sourceStatuses) {
      try (DataFileStream<GenericRecord> reader =
          new DataFileStream<GenericRecord>(
              new BufferedInputStream(fs.open(sourceStatus.getPath())),
              new GenericDatumReader<GenericRecord>())) {

        if (schema == null) {
          schema = reader.getSchema();
          for (String key : reader.getMetaKeys()) {
            if (!DataFileWriter.isReservedMeta(key)) {
              writer.setMeta(key, reader.getMeta(key));
            }
          }
          inputCodec = reader.getMetaString(DataFileConstants.CODEC);
          if (inputCodec == null) {
            inputCodec = DataFileConstants.NULL_CODEC;
          }
          writer.setCodec(CodecFactory.fromString(inputCodec));
          writer.create(schema, output);
        }
        writer.appendAllFrom(reader, false);
      }
    }
  }
}
Example #28
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 5 votes |
@Test
public void testReadSchemaString() throws Exception {
  List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
  String codec = DataFileConstants.NULL_CODEC;
  String filename =
      generateTestFile(
          codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);
  Metadata fileMeta = FileSystems.matchSingleFileSpec(filename);
  AvroMetadata metadata = AvroSource.readMetadataFromFile(fileMeta.resourceId());
  // By default, parse validates the schema, which is what we want.
  Schema schema = new Schema.Parser().parse(metadata.getSchemaString());
  assertEquals(4, schema.getFields().size());
}
Example #29
Source File: AvroHdfsFileSink.java From components with Apache License 2.0 | 5 votes |
@Override
protected void configure(Job job, KV<AvroKey<IndexedRecord>, NullWritable> sample) {
  super.configure(job, sample);
  AvroKey<IndexedRecord> k = sample.getKey();
  AvroJob.setOutputKeySchema(job, k.datum().getSchema());
  FileOutputFormat.setCompressOutput(job, true);
  job.getConfiguration().set(AvroJob.CONF_OUTPUT_CODEC, DataFileConstants.SNAPPY_CODEC);
}
Example #30
Source File: BucketingSinkTest.java From flink with Apache License 2.0 | 4 votes |
/**
 * This tests {@link AvroKeyValueSinkWriter}
 * with non-rolling output and with compression.
 */
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
  final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";

  final int numElements = 20;

  Map<String, String> properties = new HashMap<>();
  Schema keySchema = Schema.create(Schema.Type.INT);
  Schema valueSchema = Schema.create(Schema.Type.STRING);
  properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
  properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
  properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
  properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

  BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
      .setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties))
      .setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
      .setPartPrefix(PART_PREFIX)
      .setPendingPrefix("")
      .setPendingSuffix("");

  OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness =
      createTestSink(sink, 1, 0);

  testHarness.setProcessingTime(0L);

  testHarness.setup();
  testHarness.open();

  for (int i = 0; i < numElements; i++) {
    testHarness.processElement(new StreamRecord<>(Tuple2.of(
        i, "message #" + Integer.toString(i)
    )));
  }

  testHarness.close();

  GenericData.setStringType(valueSchema, GenericData.StringType.String);
  Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

  FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

  SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);

  DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);

  for (int i = 0; i < numElements; i++) {
    AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry =
        new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());

    int key = wrappedEntry.getKey();
    Assert.assertEquals(i, key);
    String value = wrappedEntry.getValue();
    Assert.assertEquals("message #" + i, value);
  }

  dataFileStream.close();
  inStream.close();
}