Java Code Examples for org.apache.avro.file.DataFileStream#hasNext()
The following examples show how to use org.apache.avro.file.DataFileStream#hasNext().
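Before the project-specific examples, here is a minimal sketch of the basic idiom, assuming a local file named data.avro (a placeholder): hasNext() reports whether more records remain in the stream, and next() deserializes the next one.

import java.io.FileInputStream;
import java.io.IOException;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class DataFileStreamExample {
  public static void main(String[] args) throws IOException {
    // DataFileStream implements Closeable, so try-with-resources closes it for us
    try (DataFileStream<GenericRecord> stream = new DataFileStream<GenericRecord>(
        new FileInputStream("data.avro"), new GenericDatumReader<GenericRecord>())) {
      while (stream.hasNext()) {  // true while more records remain
        GenericRecord record = stream.next();
        System.out.println(record);
      }
    }
  }
}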
Example 1
Source File: SegmentCreationPhaseMapReduceJob.java From incubator-pinot with Apache License 2.0
private LongColumnPreIndexStatsCollector getTimeColumnStatsCollector(Schema schema, File localAvroFile)
    throws FileNotFoundException, IOException {
  String timeColumnName = schema.getTimeColumnName();
  FieldSpec spec = schema.getTimeFieldSpec();
  LOGGER.info("Spec for " + timeColumnName + " is " + spec);
  LongColumnPreIndexStatsCollector timeColumnStatisticsCollector =
      new LongColumnPreIndexStatsCollector(spec.getName(), new StatsCollectorConfig(schema, null));
  LOGGER.info("StatsCollector :" + timeColumnStatisticsCollector);
  DataFileStream<GenericRecord> dataStream = new DataFileStream<GenericRecord>(
      new FileInputStream(localAvroFile), new GenericDatumReader<GenericRecord>());
  while (dataStream.hasNext()) {
    GenericRecord next = dataStream.next();
    timeColumnStatisticsCollector.collect(next.get(timeColumnName));
  }
  dataStream.close();
  timeColumnStatisticsCollector.seal();
  return timeColumnStatisticsCollector;
}
Example 2
Source File: Examples.java From datafu with Apache License 2.0
private HashMap<Long,Integer> loadOutputCounts(Path path, String timestamp) throws IOException {
  HashMap<Long,Integer> counts = new HashMap<Long,Integer>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(path, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(path, timestamp + "/*.avro"))) {
    _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    try {
      while (dataFileStream.hasNext()) {
        GenericRecord r = dataFileStream.next();
        _log.info("found: " + r.toString());
        Long memberId = (Long)((GenericRecord)r.get("key")).get("member_id");
        Assert.assertNotNull(memberId);
        Integer count = (Integer)((GenericRecord)r.get("value")).get("count");
        Assert.assertNotNull(count);
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    } finally {
      dataFileStream.close();
    }
  }
  return counts;
}
Example 3
Source File: TestHDFSCompressedDataStream.java From mt-flume with Apache License 2.0
@Test
public void testGzipDurabilityWithSerializer() throws Exception {
  Context context = new Context();
  context.put("serializer", "AVRO_EVENT");
  HDFSCompressedDataStream writer = new HDFSCompressedDataStream();
  writer.configure(context);
  writer.open(fileURI, factory.getCodec(new Path(fileURI)),
      SequenceFile.CompressionType.BLOCK);
  String[] bodies = { "yarf!", "yarfing!" };
  writeBodies(writer, bodies);
  int found = 0;
  int expected = bodies.length;
  List<String> expectedBodies = Lists.newArrayList(bodies);
  GZIPInputStream cmpIn = new GZIPInputStream(new FileInputStream(file));
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileStream<GenericRecord> avroStream = new DataFileStream<GenericRecord>(cmpIn, reader);
  GenericRecord record = new GenericData.Record(avroStream.getSchema());
  while (avroStream.hasNext()) {
    avroStream.next(record);
    CharsetDecoder decoder = Charsets.UTF_8.newDecoder();
    String bodyStr = decoder.decode((ByteBuffer) record.get("body")).toString();
    expectedBodies.remove(bodyStr);
    found++;
  }
  avroStream.close();
  cmpIn.close();
  Assert.assertTrue("Found = " + found + ", Expected = " + expected
      + ", Left = " + expectedBodies.size() + " " + expectedBodies,
      expectedBodies.size() == 0);
}
Example 4
Source File: TestHDFSEventSink.java From mt-flume with Apache License 2.0
private void verifyOutputAvroFiles(FileSystem fs, Configuration conf, String dir, String prefix,
    List<String> bodies) throws IOException {
  int found = 0;
  int expected = bodies.size();
  for (String outputFile : getAllFiles(dir)) {
    String name = (new File(outputFile)).getName();
    if (name.startsWith(prefix)) {
      FSDataInputStream input = fs.open(new Path(outputFile));
      DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
      DataFileStream<GenericRecord> avroStream = new DataFileStream<GenericRecord>(input, reader);
      GenericRecord record = new GenericData.Record(avroStream.getSchema());
      while (avroStream.hasNext()) {
        avroStream.next(record);
        ByteBuffer body = (ByteBuffer) record.get("body");
        CharsetDecoder decoder = Charsets.UTF_8.newDecoder();
        String bodyStr = decoder.decode(body).toString();
        LOG.debug("Removing event: {}", bodyStr);
        bodies.remove(bodyStr);
        found++;
      }
      avroStream.close();
      input.close();
    }
  }
  Assert.assertTrue("Found = " + found + ", Expected = " + expected
      + ", Left = " + bodies.size() + " " + bodies, bodies.size() == 0);
}
Example 5
Source File: PartitionPreservingCollapsingIntegrationTests.java From datafu with Apache License 2.0
private HashMap<Long,Long> loadOutputCounts(Path path, String timestamp) throws IOException {
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(path, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(path, timestamp + "/*.avro"))) {
    _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    try {
      while (dataFileStream.hasNext()) {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Long count = (Long)((GenericRecord)r.get("value")).get("count");
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    } finally {
      dataFileStream.close();
    }
  }
  return counts;
}
Example 6
Source File: PartitionPreservingCollapsingIntegrationTests.java From datafu with Apache License 2.0
private HashMap<Long,Long> loadIntermediateCounts(Path path, String timestamp) throws IOException {
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  String nestedPath = getNestedPathFromTimestamp(timestamp);
  Assert.assertTrue(fs.exists(new Path(_intermediatePath, nestedPath)));
  for (FileStatus stat : fs.globStatus(new Path(_intermediatePath, nestedPath + "/*.avro"))) {
    _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    try {
      while (dataFileStream.hasNext()) {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Long count = (Long)((GenericRecord)r.get("value")).get("count");
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    } finally {
      dataFileStream.close();
    }
  }
  return counts;
}
Example 7
Source File: PartitionPreservingJoinTests.java From datafu with Apache License 2.0
private HashMap<Long,ImpressionClick> loadOutputCounts(String timestamp) throws IOException {
  HashMap<Long,ImpressionClick> counts = new HashMap<Long,ImpressionClick>();
  FileSystem fs = getFileSystem();
  String nestedPath = getNestedPathFromTimestamp(timestamp);
  Assert.assertTrue(fs.exists(new Path(_outputPath, nestedPath)));
  for (FileStatus stat : fs.globStatus(new Path(_outputPath, nestedPath + "/*.avro"))) {
    _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    try {
      while (dataFileStream.hasNext()) {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Integer impressions = (Integer)((GenericRecord)r.get("value")).get("impressions");
        Integer clicks = (Integer)((GenericRecord)r.get("value")).get("clicks");
        Assert.assertFalse(counts.containsKey(memberId));
        ImpressionClick data = new ImpressionClick();
        data.clicks = clicks;
        data.impressions = impressions;
        counts.put(memberId, data);
      }
    } finally {
      dataFileStream.close();
    }
  }
  return counts;
}
Example 8
Source File: PartitionCollapsingJoinTest.java From datafu with Apache License 2.0
private HashMap<Long,ImpressionClick> loadOutputCounts(String timestamp) throws IOException {
  HashMap<Long,ImpressionClick> counts = new HashMap<Long,ImpressionClick>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(_outputPath, timestamp + "/*.avro"))) {
    _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    try {
      while (dataFileStream.hasNext()) {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Integer impressions = (Integer)((GenericRecord)r.get("value")).get("impressions");
        Integer clicks = (Integer)((GenericRecord)r.get("value")).get("clicks");
        Assert.assertFalse(counts.containsKey(memberId));
        ImpressionClick data = new ImpressionClick();
        data.clicks = clicks;
        data.impressions = impressions;
        counts.put(memberId, data);
      }
    } finally {
      dataFileStream.close();
    }
  }
  return counts;
}
Example 9
Source File: PartitionCollapsingTests.java From datafu with Apache License 2.0
private HashMap<Long,Long> loadOutputCounts(String timestamp) throws IOException {
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(_outputPath, timestamp + "/*.avro"))) {
    _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    try {
      while (dataFileStream.hasNext()) {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Long count = (Long)((GenericRecord)r.get("value")).get("count");
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    } finally {
      dataFileStream.close();
    }
  }
  return counts;
}
Example 10
Source File: TestAvroJob.java From datafu with Apache License 2.0
private HashMap<Long,Long> loadOutputCounts(String timestamp) throws IOException {
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(_outputPath, timestamp + "/*.avro"))) {
    _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    try {
      while (dataFileStream.hasNext()) {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)r.get("id");
        Long count = (Long)r.get("count");
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    } finally {
      dataFileStream.close();
    }
  }
  return counts;
}
Example 11
Source File: PartitionPreservingTests.java From datafu with Apache License 2.0
private HashMap<Long,Long> loadOutputCounts(String timestamp) throws IOException {
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  String nestedPath = getNestedPathFromTimestamp(timestamp);
  Assert.assertTrue(fs.exists(new Path(_outputPath, nestedPath)));
  for (FileStatus stat : fs.globStatus(new Path(_outputPath, nestedPath + "/*.avro"))) {
    _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    try {
      while (dataFileStream.hasNext()) {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Long count = (Long)((GenericRecord)r.get("value")).get("count");
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    } finally {
      dataFileStream.close();
    }
  }
  return counts;
}
Example 12
Source File: AvroRowDecoder.java From presto with Apache License 2.0
@Override
public Optional<Map<DecoderColumnHandle, FieldValueProvider>> decodeRow(byte[] data, Map<String, String> dataMap) {
  GenericRecord avroRecord;
  DataFileStream<GenericRecord> dataFileReader = null;
  try {
    // Assumes producer uses DataFileWriter or data comes in this particular format.
    // TODO: Support other forms for producers
    dataFileReader = new DataFileStream<>(new ByteArrayInputStream(data), avroRecordReader);
    if (!dataFileReader.hasNext()) {
      throw new PrestoException(GENERIC_INTERNAL_ERROR, "No avro record found");
    }
    avroRecord = dataFileReader.next();
    if (dataFileReader.hasNext()) {
      throw new PrestoException(GENERIC_INTERNAL_ERROR, "Unexpected extra record found");
    }
  }
  catch (Exception e) {
    throw new PrestoException(GENERIC_INTERNAL_ERROR, "Decoding Avro record failed.", e);
  }
  finally {
    closeQuietly(dataFileReader);
  }
  return Optional.of(columnDecoders.entrySet().stream()
      .collect(toImmutableMap(
          Map.Entry::getKey,
          entry -> entry.getValue().decodeField(avroRecord))));
}
Example 13
Source File: TestAvroStorage.java From spork with Apache License 2.0
private Set<Object> getExpected(String pathstr) throws IOException {
  Set<Object> ret = new HashSet<Object>();
  FileSystem fs = FileSystem.getLocal(new Configuration());
  /* read in output results and compare */
  Path output = new Path(pathstr);
  assertTrue("Expected output does not exists!", fs.exists(output));
  Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
  assertTrue("Split field dirs not found!", paths != null);
  for (Path path : paths) {
    Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
    assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
    for (Path filePath : files) {
      assertTrue("This shouldn't be a directory", fs.isFile(filePath));
      GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
      DataFileStream<Object> in = new DataFileStream<Object>(fs.open(filePath), reader);
      while (in.hasNext()) {
        Object obj = in.next();
        ret.add(obj);
      }
      in.close();
    }
  }
  return ret;
}
Example 14
Source File: TestAvroStorage.java From spork with Apache License 2.0
private Set<GenericData.Record> getExpected(String pathstr) throws IOException {
  Set<GenericData.Record> ret = new TreeSet<GenericData.Record>(
      new Comparator<GenericData.Record>() {
        @Override
        public int compare(Record o1, Record o2) {
          return o1.toString().compareTo(o2.toString());
        }
      });
  FileSystem fs = FileSystem.getLocal(new Configuration());
  /* read in output results and compare */
  Path output = new Path(pathstr);
  assertTrue("Expected output does not exists!", fs.exists(output));
  Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
  assertTrue("Split field dirs not found!", paths != null);
  for (Path path : paths) {
    Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
    assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
    for (Path filePath : files) {
      assertTrue("This shouldn't be a directory", fs.isFile(filePath));
      GenericDatumReader<GenericData.Record> reader = new GenericDatumReader<GenericData.Record>();
      DataFileStream<GenericData.Record> in = new DataFileStream<GenericData.Record>(fs.open(filePath), reader);
      while (in.hasNext()) {
        GenericData.Record obj = in.next();
        ret.add(obj);
      }
      in.close();
    }
  }
  return ret;
}
Example 15
Source File: TestAvroStorage.java From spork with Apache License 2.0
private void verifyResults(String outPath, String expectedOutpath, String expectedCodec) throws IOException {
  FileSystem fs = FileSystem.getLocal(new Configuration());
  /* read in expected results */
  Set<GenericData.Record> expected = getExpected(expectedOutpath);
  /* read in output results and compare */
  Path output = new Path(outPath);
  assertTrue("Output dir does not exists!",
      fs.exists(output) && fs.getFileStatus(output).isDir());
  Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
  assertTrue("Split field dirs not found!", paths != null);
  for (Path path : paths) {
    Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
    assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
    for (Path filePath : files) {
      assertTrue("This shouldn't be a directory", fs.isFile(filePath));
      GenericDatumReader<GenericData.Record> reader = new GenericDatumReader<GenericData.Record>();
      DataFileStream<GenericData.Record> in = new DataFileStream<GenericData.Record>(
          fs.open(filePath), reader);
      assertEquals("codec", expectedCodec, in.getMetaString("avro.codec"));
      int count = 0;
      while (in.hasNext()) {
        GenericData.Record obj = in.next();
        assertTrue("Avro result object found that's not expected: Found "
            + (obj != null ? obj.getSchema() : "null") + ", " + obj.toString()
            + "\nExpected " + (expected != null ? expected.toString() : "null") + "\n",
            expected.contains(obj));
        count++;
      }
      in.close();
      assertEquals(expected.size(), count);
    }
  }
}
Example 16
Source File: BloomFilterCreator.java From hiped2 with Apache License 2.0
public static BloomFilter readFromAvro(InputStream is) throws IOException {
  DataFileStream<Object> reader = new DataFileStream<Object>(
      is, new GenericDatumReader<Object>());
  reader.hasNext();  // note: the boolean result is ignored; the stream is assumed to hold a record
  BloomFilter filter = new BloomFilter();
  AvroBytesRecord
      .fromGenericRecord((GenericRecord) reader.next(), filter);
  IOUtils.closeQuietly(is);
  IOUtils.closeQuietly(reader);
  return filter;
}
Example 17
Source File: BloomFilterDumper.java From hiped2 with Apache License 2.0
public static BloomFilter readFromAvro(InputStream is) throws IOException {
  DataFileStream<Object> reader = new DataFileStream<Object>(
      is, new GenericDatumReader<Object>());
  reader.hasNext();  // note: the boolean result is ignored; the stream is assumed to hold a record
  BloomFilter filter = new BloomFilter();
  AvroBytesRecord
      .fromGenericRecord((GenericRecord) reader.next(), filter);
  IOUtils.closeQuietly(is);
  IOUtils.closeQuietly(reader);
  return filter;
}
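Both hiped2 examples call hasNext() without checking its result, relying on the file to contain at least one record. A more defensive variant (a sketch using the same helper classes, not code from the original project) would guard the call:

public static BloomFilter readFromAvro(InputStream is) throws IOException {
  DataFileStream<Object> reader = new DataFileStream<Object>(
      is, new GenericDatumReader<Object>());
  try {
    if (!reader.hasNext()) {  // guard against an empty stream
      throw new IOException("no Avro records found in stream");
    }
    BloomFilter filter = new BloomFilter();
    AvroBytesRecord.fromGenericRecord((GenericRecord) reader.next(), filter);
    return filter;
  } finally {
    IOUtils.closeQuietly(reader);
    IOUtils.closeQuietly(is);
  }
}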
Example 18
Source File: RegressionAdmmTrain.java From ml-ease with Apache License 2.0
private void computeU(JobConf conf, String uPath, String uplusxPath, Map<String, LinearModel> z)
    throws IOException {
  AvroHdfsFileWriter<GenericRecord> writer =
      new AvroHdfsFileWriter<GenericRecord>(conf, uPath, LinearModelAvro.SCHEMA$);
  DataFileWriter<GenericRecord> recordwriter = writer.get();
  // read u+x
  for (Path path : Util.findPartFiles(conf, new Path(uplusxPath))) {
    DataFileStream<Object> stream = AvroUtils.getAvroDataStream(conf, path);
    while (stream.hasNext()) {
      GenericData.Record record = (GenericData.Record) stream.next();
      String partitionID = Util.getStringAvro(record, "key", false);
      if (record.get("uplusx") != null) {
        String lambda = Util.getLambda(partitionID);
        LinearModel newu = new LinearModel(LibLinearDataset.INTERCEPT_NAME,
            (List<?>) record.get("uplusx"));
        newu.linearCombine(1.0, -1.0, z.get(lambda));
        GenericData.Record newvaluemap = new GenericData.Record(LinearModelAvro.SCHEMA$);
        List modellist = newu.toAvro(LibLinearDataset.INTERCEPT_NAME);
        newvaluemap.put("key", partitionID);
        newvaluemap.put("model", modellist);
        recordwriter.append(newvaluemap);
      }
    }
  }
  recordwriter.close();
}
Example 19
Source File: AvroFileReader.java From ml-ease with Apache License 2.0
public <T> void build(String filePath, AvroConsumer<T> builder) throws IOException {
  List<Path> paths = getPaths(filePath);
  for (Path path : paths) {
    DataFileStream<Object> stream = null;
    try {
      stream = getAvroDataStream(path);
      while (stream.hasNext()) {
        builder.consume(stream.next());
      }
    } finally {
      if (stream != null) {
        stream.close();
      }
    }
  }
  builder.done();
}
Example 20
Source File: WholeFileTransformerProcessor.java From datacollector with Apache License 2.0
/**
 * Convert Avro record to Parquet
 * @param sourceFileName the source Avro file name
 * @param fileReader the {@link org.apache.avro.file.DataFileStream} Avro file reader
 * @param tempParquetFile the {@link java.nio.file.Path} temporary parquet file path
 */
private void writeParquet(String sourceFileName, DataFileStream<GenericRecord> fileReader, Path tempParquetFile)
    throws StageException {
  long recordCount = 0;
  GenericRecord avroRecord;
  Schema schema = fileReader.getSchema();
  LOG.debug("Start reading input file : {}", sourceFileName);
  try {
    // initialize parquet writer
    Configuration jobConfiguration = new Configuration();
    String compressionCodecName = compressionElEval.eval(variables,
        jobConfig.avroParquetConfig.compressionCodec, String.class);
    jobConfiguration.set(AvroParquetConstants.COMPRESSION_CODEC_NAME, compressionCodecName);
    jobConfiguration.setInt(AvroParquetConstants.ROW_GROUP_SIZE, jobConfig.avroParquetConfig.rowGroupSize);
    jobConfiguration.setInt(AvroParquetConstants.PAGE_SIZE, jobConfig.avroParquetConfig.pageSize);
    jobConfiguration.setInt(AvroParquetConstants.DICTIONARY_PAGE_SIZE,
        jobConfig.avroParquetConfig.dictionaryPageSize);
    jobConfiguration.setInt(AvroParquetConstants.MAX_PADDING_SIZE, jobConfig.avroParquetConfig.maxPaddingSize);
    // Parquet writer
    ParquetWriter.Builder builder = AvroToParquetConverterUtil.initializeWriter(
        new org.apache.hadoop.fs.Path(tempParquetFile.toString()),
        schema,
        jobConfiguration);
    parquetWriter = builder.build();
    while (fileReader.hasNext()) {
      avroRecord = fileReader.next();
      parquetWriter.write(avroRecord);
      recordCount++;
    }
    parquetWriter.close();
  } catch (IOException ex) {
    throw new TransformerStageCheckedException(Errors.CONVERT_08, sourceFileName, recordCount, ex);
  }
  LOG.debug("Finished writing {} records to {}", recordCount, tempParquetFile.getFileName());
}