Java Code Examples for org.apache.avro.file.DataFileStream#close()
The following examples show how to use org.apache.avro.file.DataFileStream#close(). Each example notes the open-source project and source file it comes from.
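DataFileStream implements java.io.Closeable, so the explicit close() calls in the examples below can also be handled with try-with-resources. A minimal sketch, assuming a local Avro file named data.avro (the file name is hypothetical):

import java.io.FileInputStream;
import java.io.IOException;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class CloseSketch {
    public static void main(String[] args) throws IOException {
        // try-with-resources invokes DataFileStream#close() automatically,
        // which also releases the underlying input stream
        try (FileInputStream in = new FileInputStream("data.avro");
             DataFileStream<GenericRecord> stream =
                 new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
            while (stream.hasNext()) {
                System.out.println(stream.next());
            }
        }
    }
}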
Example 1
Source File: Examples.java From datafu with Apache License 2.0
private Long loadMemberCount(Path path, String timestamp) throws IOException {
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(path, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(path, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
        try {
            GenericRecord r = dataFileStream.next();
            Long count = (Long) ((GenericRecord) r.get("value")).get("count");
            Assert.assertNotNull(count);
            System.out.println("found count: " + count);
            return count;
        } finally {
            dataFileStream.close();
        }
    }
    throw new RuntimeException("found no data");
}
Example 2
Source File: PartitionPreservingCollapsingIntegrationTests.java From datafu with Apache License 2.0
private HashMap<Long, Long> loadIntermediateCounts(Path path, String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    String nestedPath = getNestedPathFromTimestamp(timestamp);
    Assert.assertTrue(fs.exists(new Path(_intermediatePath, nestedPath)));
    for (FileStatus stat : fs.globStatus(new Path(_intermediatePath, nestedPath + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
        try {
            while (dataFileStream.hasNext()) {
                GenericRecord r = dataFileStream.next();
                Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                Long count = (Long) ((GenericRecord) r.get("value")).get("count");
                Assert.assertFalse(counts.containsKey(memberId));
                counts.put(memberId, count);
            }
        } finally {
            dataFileStream.close();
        }
    }
    return counts;
}
Example 3
Source File: PartitionPreservingTests.java From datafu with Apache License 2.0
private HashMap<Long, Long> loadOutputCounts(String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    String nestedPath = getNestedPathFromTimestamp(timestamp);
    Assert.assertTrue(fs.exists(new Path(_outputPath, nestedPath)));
    for (FileStatus stat : fs.globStatus(new Path(_outputPath, nestedPath + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
        try {
            while (dataFileStream.hasNext()) {
                GenericRecord r = dataFileStream.next();
                Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                Long count = (Long) ((GenericRecord) r.get("value")).get("count");
                Assert.assertFalse(counts.containsKey(memberId));
                counts.put(memberId, count);
            }
        } finally {
            dataFileStream.close();
        }
    }
    return counts;
}
Example 4
Source File: TestAvroStorage.java From spork with Apache License 2.0
private Set<Object> getExpected(String pathstr) throws IOException {
    Set<Object> ret = new HashSet<Object>();
    FileSystem fs = FileSystem.getLocal(new Configuration());
    /* read in output results and compare */
    Path output = new Path(pathstr);
    assertTrue("Expected output does not exists!", fs.exists(output));
    Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
    assertTrue("Split field dirs not found!", paths != null);
    for (Path path : paths) {
        Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
        assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
        for (Path filePath : files) {
            assertTrue("This shouldn't be a directory", fs.isFile(filePath));
            GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
            DataFileStream<Object> in = new DataFileStream<Object>(fs.open(filePath), reader);
            while (in.hasNext()) {
                Object obj = in.next();
                ret.add(obj);
            }
            in.close();
        }
    }
    return ret;
}
Example 5
Source File: TestAvroStorage.java From spork with Apache License 2.0
private void verifyResults(String outPath, String expectedOutpath, String expectedCodec) throws IOException {
    FileSystem fs = FileSystem.getLocal(new Configuration());
    /* read in expected results */
    Set<Object> expected = getExpected(expectedOutpath);
    /* read in output results and compare */
    Path output = new Path(outPath);
    assertTrue("Output dir does not exists!", fs.exists(output) && fs.getFileStatus(output).isDir());
    Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
    assertTrue("Split field dirs not found!", paths != null);
    for (Path path : paths) {
        Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
        assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
        for (Path filePath : files) {
            assertTrue("This shouldn't be a directory", fs.isFile(filePath));
            GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
            DataFileStream<Object> in = new DataFileStream<Object>(fs.open(filePath), reader);
            assertEquals("codec", expectedCodec, in.getMetaString("avro.codec"));
            int count = 0;
            while (in.hasNext()) {
                Object obj = in.next();
                //System.out.println("obj = " + (GenericData.Array<Float>)obj);
                assertTrue("Avro result object found that's not expected: " + obj, expected.contains(obj));
                count++;
            }
            in.close();
            assertEquals(expected.size(), count);
        }
    }
}
Example 6
Source File: PartitionCollapsingTests.java From datafu with Apache License 2.0
private HashMap<Long, Long> loadOutputCounts(String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(_outputPath, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
        try {
            while (dataFileStream.hasNext()) {
                GenericRecord r = dataFileStream.next();
                Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                Long count = (Long) ((GenericRecord) r.get("value")).get("count");
                Assert.assertFalse(counts.containsKey(memberId));
                counts.put(memberId, count);
            }
        } finally {
            dataFileStream.close();
        }
    }
    return counts;
}
Example 7
Source File: TestAvroJob.java From datafu with Apache License 2.0
private HashMap<Long, Long> loadOutputCounts(String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(_outputPath, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
        try {
            while (dataFileStream.hasNext()) {
                GenericRecord r = dataFileStream.next();
                Long memberId = (Long) r.get("id");
                Long count = (Long) r.get("count");
                Assert.assertFalse(counts.containsKey(memberId));
                counts.put(memberId, count);
            }
        } finally {
            dataFileStream.close();
        }
    }
    return counts;
}
Example 8
Source File: AvroFileReader.java From ml-ease with Apache License 2.0
public <T> void build(String filePath, AvroConsumer<T> builder) throws IOException {
    List<Path> paths = getPaths(filePath);
    for (Path path : paths) {
        DataFileStream<Object> stream = null;
        try {
            stream = getAvroDataStream(path);
            while (stream.hasNext()) {
                builder.consume(stream.next());
            }
        } finally {
            if (stream != null) {
                stream.close();
            }
        }
    }
    builder.done();
}
Example 9
Source File: TestAvroStorage.java From spork with Apache License 2.0
private void verifyResults(String outPath, String expectedOutpath, String expectedCodec) throws IOException {
    FileSystem fs = FileSystem.getLocal(new Configuration());
    /* read in expected results */
    Set<GenericData.Record> expected = getExpected(expectedOutpath);
    /* read in output results and compare */
    Path output = new Path(outPath);
    assertTrue("Output dir does not exists!", fs.exists(output) && fs.getFileStatus(output).isDir());
    Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
    assertTrue("Split field dirs not found!", paths != null);
    for (Path path : paths) {
        Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
        assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
        for (Path filePath : files) {
            assertTrue("This shouldn't be a directory", fs.isFile(filePath));
            GenericDatumReader<GenericData.Record> reader = new GenericDatumReader<GenericData.Record>();
            DataFileStream<GenericData.Record> in = new DataFileStream<GenericData.Record>(fs.open(filePath), reader);
            assertEquals("codec", expectedCodec, in.getMetaString("avro.codec"));
            int count = 0;
            while (in.hasNext()) {
                GenericData.Record obj = in.next();
                assertTrue("Avro result object found that's not expected: Found "
                        + (obj != null ? obj.getSchema() : "null") + ", " + obj.toString()
                        + "\nExpected " + (expected != null ? expected.toString() : "null") + "\n",
                    expected.contains(obj));
                count++;
            }
            in.close();
            assertEquals(expected.size(), count);
        }
    }
}
Example 10
Source File: AvroRowDecoder.java From presto with Apache License 2.0
private void closeQuietly(DataFileStream<GenericRecord> stream) {
    try {
        if (stream != null) {
            stream.close();
        }
    } catch (IOException ignored) {
    }
}
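A close helper like this is typically called from a finally block so that an IOException thrown by close() cannot mask an earlier failure. A hedged usage sketch (input and readRecords are hypothetical):

DataFileStream<GenericRecord> stream = null;
try {
    stream = new DataFileStream<>(input, new GenericDatumReader<GenericRecord>());
    readRecords(stream); // hypothetical processing that may throw
} finally {
    closeQuietly(stream); // never throws, so any earlier exception propagates intact
}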
Example 11
Source File: AvroStorage.java From spork with Apache License 2.0
/**
 * Reads the avro schemas at the specified location.
 * @param p Location of file
 * @param job Hadoop job object
 * @return an Avro Schema object derived from the specified file
 * @throws IOException
 */
public Schema getAvroSchema(final Path[] p, final Job job) throws IOException {
    GenericDatumReader<Object> avroReader = new GenericDatumReader<Object>();
    ArrayList<FileStatus> statusList = new ArrayList<FileStatus>();
    FileSystem fs = FileSystem.get(p[0].toUri(), job.getConfiguration());
    for (Path temp : p) {
        for (FileStatus tempf : fs.globStatus(temp)) {
            statusList.add(tempf);
        }
    }
    FileStatus[] statusArray = (FileStatus[]) statusList.toArray(new FileStatus[statusList.size()]);
    if (statusArray == null) {
        throw new IOException("Path " + p.toString() + " does not exist.");
    }
    if (statusArray.length == 0) {
        throw new IOException("No path matches pattern " + p.toString());
    }
    Path filePath = Utils.depthFirstSearchForFile(statusArray, fs);
    if (filePath == null) {
        throw new IOException("No path matches pattern " + p.toString());
    }
    InputStream hdfsInputStream = fs.open(filePath);
    DataFileStream<Object> avroDataStream = new DataFileStream<Object>(hdfsInputStream, avroReader);
    Schema s = avroDataStream.getSchema();
    avroDataStream.close();
    return s;
}
Example 12
Source File: BucketingSinkTest.java From flink with Apache License 2.0
/**
 * This tests user defined hdfs configuration.
 * @throws Exception
 */
@Test
public void testUserDefinedConfiguration() throws Exception {
    final String outPath = hdfsURI + "/string-non-rolling-with-config";
    final int numElements = 20;

    Map<String, String> properties = new HashMap<>();
    Schema keySchema = Schema.create(Schema.Type.INT);
    Schema valueSchema = Schema.create(Schema.Type.STRING);
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

    Configuration conf = new Configuration();
    conf.set("io.file.buffer.size", "40960");

    BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
        .setFSConfig(conf)
        .setWriter(new StreamWriterWithConfigCheck<Integer, String>(properties, "io.file.buffer.size", "40960"))
        .setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
        .setPartPrefix(PART_PREFIX)
        .setPendingPrefix("")
        .setPendingSuffix("");

    OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness =
        createTestSink(sink, 1, 0);

    testHarness.setProcessingTime(0L);

    testHarness.setup();
    testHarness.open();

    for (int i = 0; i < numElements; i++) {
        testHarness.processElement(new StreamRecord<>(Tuple2.of(i, "message #" + Integer.toString(i))));
    }

    testHarness.close();

    GenericData.setStringType(valueSchema, GenericData.StringType.String);
    Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

    FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));
    SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
    for (int i = 0; i < numElements; i++) {
        AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry =
            new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
        int key = wrappedEntry.getKey();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
}
Example 13
Source File: BucketingSinkTest.java From flink with Apache License 2.0
/**
 * This tests {@link AvroKeyValueSinkWriter}
 * with non-rolling output and with compression.
 */
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
    final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";
    final int numElements = 20;

    Map<String, String> properties = new HashMap<>();
    Schema keySchema = Schema.create(Schema.Type.INT);
    Schema valueSchema = Schema.create(Schema.Type.STRING);
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

    BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
        .setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties))
        .setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
        .setPartPrefix(PART_PREFIX)
        .setPendingPrefix("")
        .setPendingSuffix("");

    OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness =
        createTestSink(sink, 1, 0);

    testHarness.setProcessingTime(0L);

    testHarness.setup();
    testHarness.open();

    for (int i = 0; i < numElements; i++) {
        testHarness.processElement(new StreamRecord<>(Tuple2.of(i, "message #" + Integer.toString(i))));
    }

    testHarness.close();

    GenericData.setStringType(valueSchema, GenericData.StringType.String);
    Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

    FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));
    SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
    for (int i = 0; i < numElements; i++) {
        AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry =
            new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
        int key = wrappedEntry.getKey();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
}
Example 14
Source File: ProtoToJsonTool.java From gcs-tools with Apache License 2.0
@Override
public int run(InputStream in, PrintStream out, PrintStream err, List<String> args) throws Exception {
    OptionParser optionParser = new OptionParser();
    OptionSpec<Void> prettyOption = optionParser
        .accepts("pretty", "Turns on pretty printing.");
    OptionSet optionSet = optionParser.parse(args.toArray(new String[0]));
    Boolean pretty = optionSet.has(prettyOption);
    List<String> nargs = (List<String>) optionSet.nonOptionArguments();
    if (nargs.size() != 1) {
        printHelp(err);
        err.println();
        optionParser.printHelpOn(err);
        return 1;
    }
    BufferedInputStream inStream = Util.fileOrStdin(nargs.get(0), in);
    GenericDatumReader<Object> reader = new GenericDatumReader<>();
    DataFileStream<Object> streamReader = new DataFileStream<>(inStream, reader);
    ObjectMapper mapper = new ObjectMapper();
    try {
        String schema = streamReader.getMetaString("protobuf.generic.schema");
        checkNotNull(schema, "Missing metadata key protobuf.generic.schema");
        ProtobufReader protoReader = new ProtobufReader(schema);
        for (Object datum : streamReader) {
            ByteBuffer byteBuffer = (ByteBuffer) ((GenericRecord) datum).get("bytes");
            String json = protoReader.toJson(byteBuffer);
            if (pretty) {
                String prettyJson = mapper
                    .writerWithDefaultPrettyPrinter()
                    .writeValueAsString(mapper.readValue(json, Object.class));
                out.println(prettyJson);
            } else {
                out.println(json);
            }
        }
        out.println();
        out.flush();
    } finally {
        streamReader.close();
    }
    return 0;
}
Example 15
Source File: TestConvertAvroSchema.java From localization_nifi with Apache License 2.0
@Test
public void testNestedConversion() throws IOException {
    TestRunner runner = TestRunners.newTestRunner(ConvertAvroSchema.class);
    runner.assertNotValid();
    runner.setProperty(ConvertAvroSchema.INPUT_SCHEMA,
        TestAvroRecordConverter.NESTED_RECORD_SCHEMA.toString());
    runner.setProperty(ConvertAvroSchema.OUTPUT_SCHEMA,
        TestAvroRecordConverter.UNNESTED_OUTPUT_SCHEMA.toString());
    runner.setProperty("parent.id", "parentId");
    runner.assertValid();

    // Two valid rows
    Record goodRecord1 = dataNested(1L, "200", null, null);
    Record goodRecord2 = dataNested(2L, "300", 5L, "ParentCompany");
    List<Record> input = Lists.newArrayList(goodRecord1, goodRecord2);

    runner.enqueue(streamFor(input));
    runner.run();

    long converted = runner.getCounterValue("Converted records");
    long errors = runner.getCounterValue("Conversion errors");
    Assert.assertEquals("Should convert 2 rows", 2, converted);
    Assert.assertEquals("Should reject 0 rows", 0, errors);

    runner.assertTransferCount("success", 1);
    runner.assertTransferCount("failure", 0);

    GenericDatumReader<Record> successReader = new GenericDatumReader<Record>(
        TestAvroRecordConverter.UNNESTED_OUTPUT_SCHEMA);
    DataFileStream<Record> successStream = new DataFileStream<Record>(
        new ByteArrayInputStream(runner.getContentAsByteArray(
            runner.getFlowFilesForRelationship("success").get(0))),
        successReader);
    int count = 0;
    for (Record r : successStream) {
        if (count == 0) {
            Assert.assertEquals(convertNested(goodRecord1), r);
        } else {
            Assert.assertEquals(convertNested(goodRecord2), r);
        }
        count++;
    }
    successStream.close();
    Assert.assertEquals(2, count);
}
Example 16
Source File: ThirdeyeAvroUtils.java From incubator-pinot with Apache License 2.0
/**
 * Extracts the avro schema from an avro file.
 * @param avroFile
 * @return the Avro schema of the file
 * @throws IOException
 */
public static Schema extractSchemaFromAvro(Path avroFile) throws IOException {
    DataFileStream<GenericRecord> dataStreamReader = getAvroReader(avroFile);
    Schema avroSchema = dataStreamReader.getSchema();
    dataStreamReader.close();
    return avroSchema;
}
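As written, an exception between getAvroReader(avroFile) and close() would leak the stream. A try-with-resources variant of the same idea, reusing the class's getAvroReader helper (a sketch, not the project's code):

public static Schema extractSchemaFromAvroSafely(Path avroFile) throws IOException {
    // try-with-resources guarantees DataFileStream#close() even on failure
    try (DataFileStream<GenericRecord> dataStreamReader = getAvroReader(avroFile)) {
        return dataStreamReader.getSchema();
    }
}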