org.apache.avro.file.DataFileStream Java Examples
The following examples show how to use
org.apache.avro.file.DataFileStream.
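Most of the examples below share the same basic pattern: wrap an InputStream in a DataFileStream together with a DatumReader, then read the schema, metadata, or records from the stream. As a quick orientation, here is a minimal, self-contained sketch of that pattern (the file name users.avro is a placeholder):

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class DataFileStreamExample {
    public static void main(String[] args) throws IOException {
        // "users.avro" is a placeholder path; any Avro container file works.
        try (InputStream in = new FileInputStream("users.avro");
             DataFileStream<GenericRecord> stream =
                 new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {

            // The writer's schema is stored in the container header.
            Schema schema = stream.getSchema();
            System.out.println("schema: " + schema);

            // Iterate records, reusing one GenericRecord to limit allocations.
            GenericRecord record = null;
            while (stream.hasNext()) {
                record = stream.next(record);
                System.out.println(record);
            }
        }
    }
}

Passing the previous record back into next(record) reuses the object rather than allocating a new one per record, a pattern several of the test examples below rely on.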
Example #1
Source File: AvroStorageUtils.java From Cubert with Apache License 2.0

/**
 * This method is called by {@link #getAvroSchema}. The default implementation
 * returns the schema of an avro file, or the schema of the last file in a
 * first-level directory (one that does not contain sub-directories).
 *
 * @param path path of a file or first-level directory
 * @param fs file system
 * @return avro schema
 * @throws IOException
 */
public static Schema getSchema(Path path, FileSystem fs) throws IOException {
    /* get path of the last file */
    Path lastFile = AvroStorageUtils.getLast(path, fs);
    if (lastFile == null) {
        return null;
    }

    /* read in file and obtain schema */
    GenericDatumReader<Object> avroReader = new GenericDatumReader<Object>();
    InputStream hdfsInputStream = fs.open(lastFile);
    DataFileStream<Object> avroDataStream = new DataFileStream<Object>(hdfsInputStream, avroReader);
    Schema ret = avroDataStream.getSchema();
    avroDataStream.close();

    return ret;
}
Example #2
Source File: PutHiveStreaming.java From localization_nifi with Apache License 2.0

private void appendRecordsToFlowFile(ProcessSession session,
                                     List<HiveStreamingRecord> records,
                                     AtomicReference<FlowFile> appendFlowFile,
                                     DataFileWriter<GenericRecord> avroWriter,
                                     DataFileStream<GenericRecord> reader) throws IOException {

    appendFlowFile.set(session.append(appendFlowFile.get(), (out) -> {

        try (DataFileWriter<GenericRecord> writer = avroWriter.create(reader.getSchema(), out)) {
            for (HiveStreamingRecord sRecord : records) {
                writer.append(sRecord.getRecord());
            }
            writer.flush();
        }
    }));
}
Example #3
Source File: TestSelectHive3QL.java From nifi with Apache License 2.0

private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }
        return recordsFromStream;
    }
}
Example #4
Source File: TestSplitAvro.java From localization_nifi with Apache License 2.0

@Test
public void testRecordSplitDatafileOutputWithoutMetadata() throws IOException {
    final TestRunner runner = TestRunners.newTestRunner(new SplitAvro());
    runner.setProperty(SplitAvro.TRANSFER_METADATA, "false");

    runner.enqueue(users.toByteArray());
    runner.run();

    runner.assertTransferCount(SplitAvro.REL_SPLIT, 100);
    runner.assertTransferCount(SplitAvro.REL_ORIGINAL, 1);
    runner.assertTransferCount(SplitAvro.REL_FAILURE, 0);
    runner.getFlowFilesForRelationship(SplitAvro.REL_ORIGINAL).get(0)
          .assertAttributeEquals(FRAGMENT_COUNT.key(), "100");

    final List<MockFlowFile> flowFiles = runner.getFlowFilesForRelationship(SplitAvro.REL_SPLIT);

    checkDataFileSplitSize(flowFiles, 1, false);

    for (final MockFlowFile flowFile : flowFiles) {
        try (final ByteArrayInputStream in = new ByteArrayInputStream(flowFile.toByteArray());
             final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY1));
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY2));
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY3));
        }
    }
}
Example #5
Source File: TestSelectHive_1_1QL.java From nifi with Apache License 2.0

private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }
        return recordsFromStream;
    }
}
Example #6
Source File: TestSelectHiveQL.java From nifi with Apache License 2.0

private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }
        return recordsFromStream;
    }
}
Example #7
Source File: TestJdbcCommon.java From nifi with Apache License 2.0

@Test
public void testConvertToAvroStreamForShort() throws SQLException, IOException {
    final ResultSetMetaData metadata = mock(ResultSetMetaData.class);
    when(metadata.getColumnCount()).thenReturn(1);
    when(metadata.getColumnType(1)).thenReturn(Types.TINYINT);
    when(metadata.getColumnName(1)).thenReturn("t_int");
    when(metadata.getTableName(1)).thenReturn("table");

    final ResultSet rs = JdbcCommonTestUtils.resultSetReturningMetadata(metadata);

    final short s = 25;
    when(rs.getObject(Mockito.anyInt())).thenReturn(s);

    final InputStream instream = JdbcCommonTestUtils.convertResultSetToAvroInputStream(rs);

    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(instream, datumReader)) {
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next(record);
            assertEquals(Short.toString(s), record.get("t_int").toString());
        }
    }
}
Example #8
Source File: PutHiveStreaming.java From nifi with Apache License 2.0

private byte[] initAvroWriter(ProcessSession session, String codec, DataFileStream<GenericRecord> reader,
                              DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef) {

    writer.setCodec(CodecFactory.fromString(codec));
    // Transfer metadata (this is a subset of the incoming file)
    for (String metaKey : reader.getMetaKeys()) {
        if (!RESERVED_METADATA.contains(metaKey)) {
            writer.setMeta(metaKey, reader.getMeta(metaKey));
        }
    }

    final ByteArrayOutputStream avroHeader = new ByteArrayOutputStream();
    flowFileRef.set(session.append(flowFileRef.get(), (out) -> {
        // Create writer so that records can be appended later.
        writer.create(reader.getSchema(), avroHeader);
        writer.close();

        final byte[] header = avroHeader.toByteArray();
        out.write(header);
    }));

    // Capture the Avro header byte array that is just written to the FlowFile.
    // This is needed when Avro records are appended to the same FlowFile.
    return avroHeader.toByteArray();
}
Example #9
Source File: SegmentCreationPhaseMapReduceJob.java From incubator-pinot with Apache License 2.0

private LongColumnPreIndexStatsCollector getTimeColumnStatsCollector(Schema schema, File localAvroFile)
    throws FileNotFoundException, IOException {
    String timeColumnName = schema.getTimeColumnName();
    FieldSpec spec = schema.getTimeFieldSpec();
    LOGGER.info("Spec for " + timeColumnName + " is " + spec);
    LongColumnPreIndexStatsCollector timeColumnStatisticsCollector =
        new LongColumnPreIndexStatsCollector(spec.getName(), new StatsCollectorConfig(schema, null));
    LOGGER.info("StatsCollector :" + timeColumnStatisticsCollector);
    DataFileStream<GenericRecord> dataStream =
        new DataFileStream<GenericRecord>(new FileInputStream(localAvroFile), new GenericDatumReader<GenericRecord>());
    while (dataStream.hasNext()) {
        GenericRecord next = dataStream.next();
        timeColumnStatisticsCollector.collect(next.get(timeColumnName));
    }
    dataStream.close();
    timeColumnStatisticsCollector.seal();
    return timeColumnStatisticsCollector;
}
Example #10
Source File: AvroDateRangeMetadata.java From datafu with Apache License 2.0

/**
 * Reads the date range from the metadata stored in an Avro file.
 *
 * @param fs file system to access path
 * @param path path to get date range for
 * @return date range
 * @throws IOException IOException
 */
public static DateRange getOutputFileDateRange(FileSystem fs, Path path) throws IOException {
    path = fs.listStatus(path, PathUtils.nonHiddenPathFilter)[0].getPath();
    FSDataInputStream dataInputStream = fs.open(path);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);

    try {
        return new DateRange(new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_START))),
                             new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_END))));
    } finally {
        dataFileStream.close();
        dataInputStream.close();
    }
}
Example #11
Source File: Examples.java From datafu with Apache License 2.0

private Long loadMemberCount(Path path, String timestamp) throws IOException {
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(path, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(path, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
        try {
            GenericRecord r = dataFileStream.next();
            Long count = (Long) ((GenericRecord) r.get("value")).get("count");
            Assert.assertNotNull(count);
            System.out.println("found count: " + count);
            return count;
        } finally {
            dataFileStream.close();
        }
    }
    throw new RuntimeException("found no data");
}
Example #12
Source File: HadoopSegmentPreprocessingJob.java From incubator-pinot with Apache License 2.0

/**
 * Finds the avro file in the input folder, and returns its avro schema.
 *
 * @param inputPathDir Path to input directory
 * @return Input schema
 * @throws IOException if the directory cannot be listed or the file cannot be read
 */
private Schema getSchema(Path inputPathDir) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    Schema avroSchema = null;
    for (FileStatus fileStatus : fs.listStatus(inputPathDir)) {
        if (fileStatus.isFile() && fileStatus.getPath().getName().endsWith(".avro")) {
            _logger.info("Extracting schema from " + fileStatus.getPath());
            // Open the reader on the avro file itself (the listing passed the
            // directory path here, which would fail to parse as an Avro container).
            try (DataFileStream<GenericRecord> dataStreamReader = getAvroReader(fileStatus.getPath())) {
                avroSchema = dataStreamReader.getSchema();
            }
            break;
        }
    }
    return avroSchema;
}
Example #13
Source File: PentahoAvroInputFormat.java From pentaho-hadoop-shims with Apache License 2.0

private DataFileStream<Object> createNestedDataFileStream() throws Exception {
    DatumReader<Object> datumReader;
    if ( useFieldAsInputStream ) {
        datumReader = new GenericDatumReader<Object>();
        inputStream.reset();
        return new DataFileStream<Object>( inputStream, datumReader );
    }
    if ( schemaFileName != null && schemaFileName.length() > 0 ) {
        Schema schema = new Schema.Parser().parse( KettleVFS.getInputStream( schemaFileName, variableSpace ) );
        datumReader = new GenericDatumReader<Object>( schema );
    } else {
        datumReader = new GenericDatumReader<Object>();
    }
    FileObject fileObject = KettleVFS.getFileObject( fileName, variableSpace );
    if ( fileObject.isFile() ) {
        this.inputStream = fileObject.getContent().getInputStream();
        return new DataFileStream<>( inputStream, datumReader );
    } else {
        FileObject[] avroFiles = fileObject.findFiles( new FileExtensionSelector( "avro" ) );
        if ( !Utils.isEmpty( avroFiles ) ) {
            this.inputStream = avroFiles[ 0 ].getContent().getInputStream();
            return new DataFileStream<>( inputStream, datumReader );
        }
        return null;
    }
}
Example #14
Source File: TestJdbcCommonConvertToAvro.java From nifi with Apache License 2.0

@Test
public void testConvertToAvroStreamForNumbers() throws SQLException, IOException {
    final ResultSetMetaData metadata = mock(ResultSetMetaData.class);
    when(metadata.getColumnCount()).thenReturn(1);
    when(metadata.getColumnType(1)).thenReturn(testParams.sqlType);
    when(metadata.isSigned(1)).thenReturn(testParams.signed);
    when(metadata.getPrecision(1)).thenReturn(testParams.precision);
    when(metadata.getColumnName(1)).thenReturn("t_int");
    when(metadata.getTableName(1)).thenReturn("table");

    final ResultSet rs = JdbcCommonTestUtils.resultSetReturningMetadata(metadata);

    final int ret = 0;
    when(rs.getObject(Mockito.anyInt())).thenReturn(ret);

    final InputStream instream = JdbcCommonTestUtils.convertResultSetToAvroInputStream(rs);

    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(instream, datumReader)) {
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next(record);
            assertEquals(Integer.toString(ret), record.get("t_int").toString());
        }
    }
}
Example #15
Source File: QueryDatabaseTableTest.java From nifi with Apache License 2.0

private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }
        return recordsFromStream;
    }
}
Example #16
Source File: TestHDFSCompressedDataStream.java From mt-flume with Apache License 2.0

@Test
public void testGzipDurabilityWithSerializer() throws Exception {
    Context context = new Context();
    context.put("serializer", "AVRO_EVENT");

    HDFSCompressedDataStream writer = new HDFSCompressedDataStream();
    writer.configure(context);

    writer.open(fileURI, factory.getCodec(new Path(fileURI)),
                SequenceFile.CompressionType.BLOCK);

    String[] bodies = { "yarf!", "yarfing!" };
    writeBodies(writer, bodies);

    int found = 0;
    int expected = bodies.length;
    List<String> expectedBodies = Lists.newArrayList(bodies);

    GZIPInputStream cmpIn = new GZIPInputStream(new FileInputStream(file));
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> avroStream =
        new DataFileStream<GenericRecord>(cmpIn, reader);
    GenericRecord record = new GenericData.Record(avroStream.getSchema());
    while (avroStream.hasNext()) {
        avroStream.next(record);
        CharsetDecoder decoder = Charsets.UTF_8.newDecoder();
        String bodyStr = decoder.decode((ByteBuffer) record.get("body")).toString();
        expectedBodies.remove(bodyStr);
        found++;
    }
    avroStream.close();
    cmpIn.close();

    Assert.assertTrue("Found = " + found + ", Expected = " + expected
        + ", Left = " + expectedBodies.size() + " " + expectedBodies,
        expectedBodies.size() == 0);
}
Example #17
Source File: AvroStockFileRead.java From hiped2 with Apache License 2.0

public static void dumpStream(InputStream is) throws IOException {
    DataFileStream<Stock> reader =
        new DataFileStream<Stock>(is, new SpecificDatumReader<Stock>(Stock.class));

    for (Stock a : reader) {
        System.out.println(ToStringBuilder.reflectionToString(a, ToStringStyle.SIMPLE_STYLE));
    }

    IOUtils.closeStream(is);
    IOUtils.closeStream(reader);
}
Example #18
Source File: AvroUtils.java From incubator-pinot with Apache License 2.0

/**
 * Given an Avro data file, map from column to field type and time unit, return the equivalent Pinot schema.
 *
 * @param avroDataFile Avro data file
 * @param fieldTypeMap Map from column to field type
 * @param timeUnit Time unit
 * @return Pinot schema
 */
public static Schema getPinotSchemaFromAvroDataFile(File avroDataFile,
    @Nullable Map<String, FieldSpec.FieldType> fieldTypeMap, @Nullable TimeUnit timeUnit)
    throws IOException {
    try (DataFileStream<GenericRecord> reader = getAvroReader(avroDataFile)) {
        org.apache.avro.Schema avroSchema = reader.getSchema();
        return getPinotSchemaFromAvroSchema(avroSchema, fieldTypeMap, timeUnit);
    }
}
Example #19
Source File: PartitionPreservingJoinTests.java From datafu with Apache License 2.0

private HashMap<Long, ImpressionClick> loadOutputCounts(String timestamp) throws IOException {
    HashMap<Long, ImpressionClick> counts = new HashMap<Long, ImpressionClick>();
    FileSystem fs = getFileSystem();
    String nestedPath = getNestedPathFromTimestamp(timestamp);
    Assert.assertTrue(fs.exists(new Path(_outputPath, nestedPath)));
    for (FileStatus stat : fs.globStatus(new Path(_outputPath, nestedPath + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
        try {
            while (dataFileStream.hasNext()) {
                GenericRecord r = dataFileStream.next();
                Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                Integer impressions = (Integer) ((GenericRecord) r.get("value")).get("impressions");
                Integer clicks = (Integer) ((GenericRecord) r.get("value")).get("clicks");
                Assert.assertFalse(counts.containsKey(memberId));
                ImpressionClick data = new ImpressionClick();
                data.clicks = clicks;
                data.impressions = impressions;
                counts.put(memberId, data);
            }
        } finally {
            dataFileStream.close();
        }
    }
    return counts;
}
Example #20
Source File: ParquetUtils.java From incubator-pinot with Apache License 2.0

/**
 * Get the Avro file reader for the given file.
 */
public static DataFileStream<GenericRecord> getAvroReader(File avroFile) throws IOException {
    if (avroFile.getName().endsWith(".gz")) {
        return new DataFileStream<>(new GZIPInputStream(new FileInputStream(avroFile)), new GenericDatumReader<>());
    } else {
        return new DataFileStream<>(new FileInputStream(avroFile), new GenericDatumReader<>());
    }
}
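Since the returned stream holds an open file handle, a caller would typically consume it in try-with-resources; a hypothetical call site (the file name is a placeholder):

try (DataFileStream<GenericRecord> reader = ParquetUtils.getAvroReader(new File("data.avro.gz"))) {
    while (reader.hasNext()) {
        GenericRecord record = reader.next();
        // process the record
    }
}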
Example #21
Source File: PartitionPreservingCollapsingIntegrationTests.java From datafu with Apache License 2.0

private HashMap<Long, Long> loadOutputCounts(Path path, String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(path, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(path, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
        try {
            while (dataFileStream.hasNext()) {
                GenericRecord r = dataFileStream.next();
                Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                Long count = (Long) ((GenericRecord) r.get("value")).get("count");
                Assert.assertFalse(counts.containsKey(memberId));
                counts.put(memberId, count);
            }
        } finally {
            dataFileStream.close();
        }
    }
    return counts;
}
Example #22
Source File: Schemas.java From parquet-mr with Apache License 2.0

public static Schema fromAvro(InputStream in) throws IOException {
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> stream = null;
    boolean threw = true;
    try {
        stream = new DataFileStream<>(in, datumReader);
        Schema schema = stream.getSchema();
        threw = false;
        return schema;
    } finally {
        Closeables.close(stream, threw);
    }
}
Example #23
Source File: AvroUtils.java From ml-ease with Apache License 2.0

/**
 * Loads the schema from an Avro data file.
 *
 * @param conf The JobConf.
 * @param path The path to the data file.
 * @return The schema read from the data file's metadata.
 * @throws IOException
 */
public static Schema getSchemaFromFile(JobConf conf, Path path) throws IOException {
    FileSystem fs = path.getFileSystem(new Configuration());
    FSDataInputStream dataInputStream = fs.open(path);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);
    return dataFileStream.getSchema();
}
Example #24
Source File: AvroDump.java From hiped2 with Apache License 2.0

public static void readFromAvro(InputStream is) throws IOException {
    DataFileStream<Object> reader =
        new DataFileStream<Object>(is, new GenericDatumReader<Object>());

    for (Object o : reader) {
        System.out.println(o);
    }

    IOUtils.closeStream(is);
    IOUtils.closeStream(reader);
}
Example #25
Source File: TestJdbcCommon.java From nifi with Apache License 2.0

@Test
public void testConvertToAvroStreamForUnsignedIntegerWithPrecision10() throws SQLException, IOException {
    final String mockColumnName = "t_int";
    final ResultSetMetaData metadata = mock(ResultSetMetaData.class);
    when(metadata.getColumnCount()).thenReturn(1);
    when(metadata.getColumnType(1)).thenReturn(Types.INTEGER);
    when(metadata.isSigned(1)).thenReturn(false);
    when(metadata.getPrecision(1)).thenReturn(10);
    when(metadata.getColumnName(1)).thenReturn(mockColumnName);
    when(metadata.getTableName(1)).thenReturn("table");

    final ResultSet rs = JdbcCommonTestUtils.resultSetReturningMetadata(metadata);

    final Long ret = 0L;
    when(rs.getObject(Mockito.anyInt())).thenReturn(ret);

    final InputStream instream = JdbcCommonTestUtils.convertResultSetToAvroInputStream(rs);

    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(instream, datumReader)) {
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next(record);
            assertEquals(Long.toString(ret), record.get(mockColumnName).toString());
        }
    }
}
Example #26
Source File: GeoWaveAvroIngestTest.java From geowave with Apache License 2.0

private boolean validate(final URL file) {
    try (DataFileStream<AvroSimpleFeatureCollection> ds = new DataFileStream<>(
            file.openStream(),
            new SpecificDatumReader<AvroSimpleFeatureCollection>(
                AvroSimpleFeatureCollection.getClassSchema()))) {
        if (ds.getHeader() != null) {
            return true;
        }
    } catch (final IOException e) {
        // Do nothing for now
    }
    return false;
}
Example #27
Source File: AvroFileReader.java From ml-ease with Apache License 2.0

public <T> void build(String filePath, AvroConsumer<T> builder) throws IOException {
    List<Path> paths = getPaths(filePath);
    for (Path path : paths) {
        DataFileStream<Object> stream = null;
        try {
            stream = getAvroDataStream(path);
            while (stream.hasNext()) {
                builder.consume(stream.next());
            }
        } finally {
            if (stream != null) {
                stream.close();
            }
        }
    }
    builder.done();
}
Example #28
Source File: RegressionAdmmTrain.java From ml-ease with Apache License 2.0

private void computeU(JobConf conf, String uPath, String uplusxPath, Map<String, LinearModel> z)
    throws IOException {
    AvroHdfsFileWriter<GenericRecord> writer =
        new AvroHdfsFileWriter<GenericRecord>(conf, uPath, LinearModelAvro.SCHEMA$);
    DataFileWriter<GenericRecord> recordwriter = writer.get();

    // read u+x
    for (Path path : Util.findPartFiles(conf, new Path(uplusxPath))) {
        DataFileStream<Object> stream = AvroUtils.getAvroDataStream(conf, path);
        while (stream.hasNext()) {
            GenericData.Record record = (GenericData.Record) stream.next();
            String partitionID = Util.getStringAvro(record, "key", false);
            if (record.get("uplusx") != null) {
                String lambda = Util.getLambda(partitionID);
                LinearModel newu =
                    new LinearModel(LibLinearDataset.INTERCEPT_NAME, (List<?>) record.get("uplusx"));
                newu.linearCombine(1.0, -1.0, z.get(lambda));
                GenericData.Record newvaluemap = new GenericData.Record(LinearModelAvro.SCHEMA$);
                List modellist = newu.toAvro(LibLinearDataset.INTERCEPT_NAME);
                newvaluemap.put("key", partitionID);
                newvaluemap.put("model", modellist);
                recordwriter.append(newvaluemap);
            }
        }
    }
    recordwriter.close();
}
Example #29
Source File: AvroStockAvgFileRead.java From hiped2 with Apache License 2.0

public static void readFromAvro(InputStream is) throws IOException {
    DataFileStream<StockAvg> reader =
        new DataFileStream<StockAvg>(is, new SpecificDatumReader<StockAvg>(StockAvg.class));

    for (StockAvg a : reader) {
        System.out.println(ToStringBuilder.reflectionToString(a, ToStringStyle.SHORT_PREFIX_STYLE));
    }

    IOUtils.closeStream(is);
    IOUtils.closeStream(reader);
}