org.apache.avro.file.DataFileWriter Java Examples
The following examples show how to use
org.apache.avro.file.DataFileWriter.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DailyTrackingWriter.java From datafu with Apache License 2.0 | 7 votes |
/**
 * Opens the Avro output file for the given day.
 *
 * <p>The file is created as {@code yyyy/MM/dd/part-00000.avro} beneath the configured output
 * path, and a new data file writer is created on it using the configured schema.
 *
 * @param year  year component of the output directory
 * @param month month component of the output directory
 * @param day   day component of the output directory
 * @throws RuntimeException if a data writer is already open
 * @throws IOException if the output file or the Avro container cannot be created
 */
public void open(int year, int month, int day) throws IOException {
    if (_dataWriter != null) {
        throw new RuntimeException("Already have data writer");
    }

    // Format with Locale.ROOT: the default locale may substitute non-ASCII digits for %d,
    // which would make the directory layout depend on the JVM's locale settings.
    String daySubdirectory =
        String.format(java.util.Locale.ROOT, "%04d/%02d/%02d", year, month, day);
    Path dayDirectory = new Path(_outputPath, daySubdirectory);

    _outputStream = _fs.create(new Path(dayDirectory, "part-00000.avro"));

    _dataWriter = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>());
    _dataWriter.create(_schema, _outputStream);
}
Example #2
Source File: AvroAppender.java From tajo with Apache License 2.0 | 6 votes |
/**
 * Initializes the Appender.
 *
 * <p>Creates the target file (passing {@code false} as the second argument of
 * {@code FileSystem.create}), derives the Avro schema from the table meta, opens an Avro
 * container on the new stream, and optionally sets up table statistics before delegating
 * to the superclass.
 *
 * @throws IOException if the file or the Avro container cannot be created
 */
public void init() throws IOException {
    FSDataOutputStream target = path.getFileSystem(conf).create(path, false);

    avroSchema = AvroUtil.getAvroSchema(meta, conf);
    avroFields = avroSchema.getFields();

    DatumWriter<GenericRecord> recordWriter = new GenericDatumWriter<>(avroSchema);
    dataFileWriter = new DataFileWriter<>(recordWriter);
    dataFileWriter.create(avroSchema, target);

    if (tableStatsEnabled) {
        this.stats = new TableStatistics(schema, columnStatsEnabled);
    }

    super.init();
}
Example #3
Source File: TestExtractAvroMetadata.java From localization_nifi with Apache License 2.0 | 6 votes |
@Test public void testExtractionWithCodec() throws IOException { final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata()); runner.setProperty(ExtractAvroMetadata.METADATA_KEYS, AVRO_CODEC_ATTR); // test dynamic attribute avro.codec final Schema schema = new Schema.Parser().parse(new File("src/test/resources/array.avsc")); final GenericData.Array<String> data = new GenericData.Array<>(schema, Arrays.asList("one", "two", "three")); final DatumWriter<GenericData.Array<String>> datumWriter = new GenericDatumWriter<>(schema); final ByteArrayOutputStream out = new ByteArrayOutputStream(); final DataFileWriter<GenericData.Array<String>> dataFileWriter = new DataFileWriter<>(datumWriter); dataFileWriter.setCodec(CodecFactory.deflateCodec(1)); dataFileWriter.create(schema, out); dataFileWriter.append(data); dataFileWriter.close(); runner.enqueue(out.toByteArray()); runner.run(); runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1); final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0); flowFile.assertAttributeEquals("avro.codec", "deflate"); }
Example #4
Source File: TestConvertAvroToORC.java From nifi with Apache License 2.0 | 6 votes |
/**
 * Feeds an Avro record built by {@code TestNiFiOrcUtils.buildAvroRecordWithNull} through the
 * processor and checks the resulting filename and generated Hive DDL.
 */
@Test
public void test_onTrigger_routing_to_failure_null_type() throws Exception {
    final String testString = "Hello World";
    final GenericData.Record avroRecord = TestNiFiOrcUtils.buildAvroRecordWithNull(testString);

    // Serialize the single record into an in-memory Avro container.
    final ByteArrayOutputStream avroBytes = new ByteArrayOutputStream();
    final DatumWriter<GenericData.Record> datumWriter = new GenericDatumWriter<>(avroRecord.getSchema());
    final DataFileWriter<GenericData.Record> avroWriter = new DataFileWriter<>(datumWriter);
    avroWriter.create(avroRecord.getSchema(), avroBytes);
    avroWriter.append(avroRecord);
    avroWriter.flush();
    avroWriter.close();
    avroBytes.close();

    final Map<String, String> attributes = new HashMap<>();
    attributes.put(CoreAttributes.FILENAME.key(), "test.avro");

    runner.enqueue(avroBytes.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);

    final MockFlowFile converted =
        runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("test.orc", converted.getAttribute(CoreAttributes.FILENAME.key()));
    assertEquals(
        "CREATE EXTERNAL TABLE IF NOT EXISTS test_record (string STRING, null BOOLEAN) STORED AS ORC",
        converted.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
}
Example #5
Source File: AvroUtils.java From Cubert with Apache License 2.0 | 6 votes |
/**
 * Creates an empty Avro container file at {@code path} unless a file already exists there.
 *
 * <p>The Avro schema is derived from {@code fileSchema} via {@code convertFromBlockSchema}
 * using the record name {@code "CUBERT_MV_RECORD"}. The new file holds only the Avro header
 * and no records.
 *
 * @param fileSchema block schema to convert into the file's Avro schema
 * @param path       location of the file to create
 * @throws IOException if the file system cannot be accessed or the file cannot be written
 */
public static void createFileIfNotExists(BlockSchema fileSchema, String path) throws IOException {
    Configuration conf = new JobConf();
    FileSystem fs = FileSystem.get(conf);

    Path target = new Path(path);
    if (fs.exists(target)) {
        return;
    }

    Schema avroSchema = convertFromBlockSchema("CUBERT_MV_RECORD", fileSchema);
    System.out.println("Creating avro file with schema = " + avroSchema);

    GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(avroSchema);
    DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(datumWriter);

    FSDataOutputStream fout =
        FileSystem.create(fs,
                          target,
                          new FsPermission(FsAction.ALL, FsAction.READ_EXECUTE, FsAction.READ_EXECUTE));

    boolean closedByWriter = false;
    try {
        writer.create(avroSchema, fout);
        writer.flush();
        // Closing the writer also closes the underlying stream.
        writer.close();
        closedByWriter = true;
    } finally {
        // If writing the header failed, the writer never closed the stream: release it here
        // so the file handle does not leak.
        if (!closedByWriter) {
            fout.close();
        }
    }
}
Example #6
Source File: PutHiveStreaming.java From localization_nifi with Apache License 2.0 | 6 votes |
/**
 * Appends the given records, as an Avro container, to the flow file held in
 * {@code appendFlowFile} and stores the updated flow file back into the reference.
 *
 * @param session        session used to append to the flow file
 * @param records        records to write
 * @param appendFlowFile holder of the flow file being appended to; updated in place
 * @param avroWriter     writer on which {@code create} is invoked for the append stream
 * @param reader         source of the schema passed to {@code create}
 * @throws IOException declared by the signature
 */
private void appendRecordsToFlowFile(ProcessSession session,
                                     List<HiveStreamingRecord> records,
                                     AtomicReference<FlowFile> appendFlowFile,
                                     DataFileWriter<GenericRecord> avroWriter,
                                     DataFileStream<GenericRecord> reader) throws IOException {
    final FlowFile current = appendFlowFile.get();

    final FlowFile appended = session.append(current, (flowFileOut) -> {
        // The writer returned by create() is closed automatically when the block exits.
        try (DataFileWriter<GenericRecord> openWriter =
                 avroWriter.create(reader.getSchema(), flowFileOut)) {
            for (HiveStreamingRecord streamingRecord : records) {
                openWriter.append(streamingRecord.getRecord());
            }
            openWriter.flush();
        }
    });

    appendFlowFile.set(appended);
}
Example #7
Source File: TestAzureBlobAvroWriter.java From samza with Apache License 2.0 | 6 votes |
@Before public void setup() throws Exception { threadPool = new ThreadPoolExecutor(1, 1, 60, TimeUnit.SECONDS, new LinkedBlockingDeque<>()); ome = createOME("Topic1"); encodedRecord = new byte[100]; BlobContainerAsyncClient mockContainerAsyncClient = PowerMockito.mock(BlobContainerAsyncClient.class); mockDataFileWriter = mock(DataFileWriter.class); mockAzureBlobOutputStream = mock(AzureBlobOutputStream.class); mockBlockBlobAsyncClient = PowerMockito.mock(BlockBlobAsyncClient.class); when(mockBlockBlobAsyncClient.getBlobUrl()).thenReturn("https://samza.blob.core.windows.net/fake-blob-url"); mockCompression = CompressionFactory.getInstance().getCompression(CompressionType.GZIP); azureBlobAvroWriter = spy(new AzureBlobAvroWriter(mockContainerAsyncClient, mock(AzureBlobWriterMetrics.class), threadPool, THRESHOLD, 60000, "test", mockDataFileWriter, mockAzureBlobOutputStream, mockBlockBlobAsyncClient, blobMetadataGeneratorFactory, blobMetadataGeneratorConfig, STREAM_NAME, Long.MAX_VALUE, Long.MAX_VALUE, mockCompression, false)); // keeping blob size and number of records unlimited doReturn(encodedRecord).when(azureBlobAvroWriter).encodeRecord((IndexedRecord) ome.getMessage()); }
Example #8
Source File: FsSpecProducer.java From incubator-gobblin with Apache License 2.0 | 6 votes |
private void writeAvroJobSpec(AvroJobSpec jobSpec) throws IOException { DatumWriter<AvroJobSpec> datumWriter = new SpecificDatumWriter<>(AvroJobSpec.SCHEMA$); DataFileWriter<AvroJobSpec> dataFileWriter = new DataFileWriter<>(datumWriter); Path jobSpecPath = new Path(this.specConsumerPath, jobSpec.getUri()); //Write the new JobSpec to a temporary path first. Path tmpDir = new Path(this.specConsumerPath, "_tmp"); if (!fs.exists(tmpDir)) { fs.mkdirs(tmpDir); } Path tmpJobSpecPath = new Path(tmpDir, jobSpec.getUri()); OutputStream out = fs.create(tmpJobSpecPath); dataFileWriter.create(AvroJobSpec.SCHEMA$, out); dataFileWriter.append(jobSpec); dataFileWriter.close(); //Rename the JobSpec from temporary to final location. HadoopUtils.renamePath(fs, tmpJobSpecPath, jobSpecPath, true); }
Example #9
Source File: RedshiftIT.java From digdag with Apache License 2.0 | 6 votes |
/**
 * Encodes the given rows as an in-memory Avro container file.
 *
 * @param fields  fields of the {@code testdata} record schema
 * @param records rows to encode; each map entry is stored under its key in the Avro record
 * @return the bytes of the Avro container
 * @throws IOException if writing fails
 */
private byte[] avroTestData(List<Schema.Field> fields, List<Map<String, Object>> records)
        throws IOException {
    Schema recordSchema = Schema.createRecord("testdata", null, null, false);
    recordSchema.setFields(fields);

    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    DataFileWriter<GenericData.Record> avroWriter =
        new DataFileWriter<>(new GenericDatumWriter<GenericData.Record>(recordSchema));
    avroWriter.create(recordSchema, buffer);

    for (Map<String, Object> row : records) {
        GenericData.Record avroRow = new GenericData.Record(recordSchema);
        for (Map.Entry<String, Object> column : row.entrySet()) {
            avroRow.put(column.getKey(), column.getValue());
        }
        avroWriter.append(avroRow);
    }

    avroWriter.close();
    return buffer.toByteArray();
}
Example #10
Source File: PentahoAvroOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
/**
 * Creates a record writer that writes Avro data to the configured output file.
 *
 * <p>After validation, the schema is obtained from {@code getSchema()}, written out via
 * {@code writeAvroSchemaToFile}, and an Avro container is opened on the output file with the
 * configured codec.
 *
 * @return a writer wrapping the opened Avro data file writer
 * @throws Exception if validation fails, a required setting is missing, or the output cannot
 *                   be opened
 */
@Override
public IPentahoRecordWriter createRecordWriter() throws Exception {
  validate();

  boolean missingRequiredSetting = fields == null
    || StringUtils.isEmpty( nameSpace )
    || StringUtils.isEmpty( recordName )
    || StringUtils.isEmpty( outputFilename );
  if ( missingRequiredSetting ) {
    // The message lists exactly the settings checked above.
    throw new Exception(
      "Invalid state. One of the following required fields is null or empty: "
        + "'fields', 'nameSpace', 'recordName', or 'outputFilename'" );
  }

  Schema schema = getSchema();
  writeAvroSchemaToFile( schemaFilename );

  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>( schema );
  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>( datumWriter );
  dataFileWriter.setCodec( codecFactory );
  dataFileWriter.create( schema, KettleVFS.getOutputStream( outputFilename, variableSpace, false ) );

  return new PentahoAvroRecordWriter( dataFileWriter, schema, fields );
}
Example #11
Source File: TestAvroEventDeserializer.java From mt-flume with Apache License 2.0 | 6 votes |
/**
 * Creates a temporary Avro file containing two records whose {@code foo} field is
 * {@code "bar"} and {@code "baz"} respectively, with a sync marker after each record.
 *
 * @param deleteOnExit whether to register the file for deletion when the JVM exits
 * @return the newly written file
 * @throws IOException if the file cannot be created or written
 */
private File newTestFile(boolean deleteOnExit) throws IOException {
    File avroFile = File.createTempFile("testDirectFile", "tmp");
    if (deleteOnExit) {
        avroFile.deleteOnExit();
    }

    DataFileWriter<GenericRecord> fileWriter =
        new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(schema));
    fileWriter.create(schema, avroFile);

    GenericRecord first = new GenericRecordBuilder(schema).set("foo", "bar").build();
    fileWriter.append(first);
    fileWriter.sync();

    GenericRecord second = new GenericRecordBuilder(schema).set("foo", "baz").build();
    fileWriter.append(second);
    fileWriter.sync();

    fileWriter.flush();
    fileWriter.close();

    return avroFile;
}
Example #12
Source File: PutHiveStreaming.java From nifi with Apache License 2.0 | 6 votes |
private byte[] initAvroWriter(ProcessSession session, String codec, DataFileStream<GenericRecord> reader, DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef) { writer.setCodec(CodecFactory.fromString(codec)); // Transfer metadata (this is a subset of the incoming file) for (String metaKey : reader.getMetaKeys()) { if (!RESERVED_METADATA.contains(metaKey)) { writer.setMeta(metaKey, reader.getMeta(metaKey)); } } final ByteArrayOutputStream avroHeader = new ByteArrayOutputStream(); flowFileRef.set(session.append(flowFileRef.get(), (out) -> { // Create writer so that records can be appended later. writer.create(reader.getSchema(), avroHeader); writer.close(); final byte[] header = avroHeader.toByteArray(); out.write(header); })); // Capture the Avro header byte array that is just written to the FlowFile. // This is needed when Avro records are appended to the same FlowFile. return avroHeader.toByteArray(); }
Example #13
Source File: PutHiveStreaming.java From nifi with Apache License 2.0 | 6 votes |
private void appendAvroRecords(ProcessSession session, byte[] avroHeader, DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef, List<HiveStreamingRecord> hRecords) { flowFileRef.set(session.append(flowFileRef.get(), (out) -> { if (hRecords != null) { // Initialize the writer again as append mode, so that Avro header is written only once. writer.appendTo(new SeekableByteArrayInput(avroHeader), out); try { for (HiveStreamingRecord hRecord : hRecords) { writer.append(hRecord.getRecord()); } } catch (IOException ioe) { // The records were put to Hive Streaming successfully, but there was an error while writing the // Avro records to the flow file. Log as an error and move on. logger.error("Error writing Avro records (which were sent successfully to Hive Streaming) to the flow file, " + ioe, ioe); } } writer.close(); })); }
Example #14
Source File: StageRunData.java From geowave with Apache License 2.0 | 6 votes |
private synchronized DataFileWriter getDataWriterCreateIfNull( final String typeName, final GeoWaveAvroFormatPlugin plugin) { if (!cachedWriters.containsKey(typeName)) { FSDataOutputStream out = null; final DataFileWriter dfw = new DataFileWriter(new GenericDatumWriter()); cachedWriters.put(typeName, dfw); dfw.setCodec(CodecFactory.snappyCodec()); try { // TODO: we should probably clean up the type name to make it // HDFS path safe in case there are invalid characters // also, if a file already exists do we want to delete it or // append to it? out = fs.create(new Path(hdfsBaseDirectory, typeName)); dfw.create(plugin.getAvroSchema(), out); } catch (final IOException e) { LOGGER.error("Unable to create output stream", e); // cache a null value so we don't continually try to recreate cachedWriters.put(typeName, null); return null; } } return cachedWriters.get(typeName); }
Example #15
Source File: AbstractAvroEventSerializer.java From mt-flume with Apache License 2.0 | 6 votes |
/**
 * Builds the datum writer and data file writer from the serializer context.
 *
 * <p>Reads the sync interval and compression codec name from {@code context}. If the codec
 * cannot be instantiated, a warning is logged and no codec is set on the writer.
 *
 * @param context configuration source for the sync interval and codec name
 */
@Override
public void configure(Context context) {
    int syncInterval = context.getInteger(SYNC_INTERVAL_BYTES, DEFAULT_SYNC_INTERVAL_BYTES);
    String codecName = context.getString(COMPRESSION_CODEC, DEFAULT_COMPRESSION_CODEC);

    writer = new ReflectDatumWriter<T>(getSchema());
    dataFileWriter = new DataFileWriter<T>(writer);
    dataFileWriter.setSyncInterval(syncInterval);

    try {
        dataFileWriter.setCodec(CodecFactory.fromString(codecName));
    } catch (AvroRuntimeException e) {
        logger.warn("Unable to instantiate avro codec with name (" + codecName + "). Compression disabled. Exception follows.", e);
    }
}
Example #16
Source File: JdbcAvroIO.java From dbeam with Apache License 2.0 | 6 votes |
@SuppressWarnings("deprecation") // uses internal test functionality. @Override protected void prepareWrite(WritableByteChannel channel) throws Exception { logger.info("jdbcavroio : Preparing write..."); connection = jdbcAvroArgs.jdbcConnectionConfiguration().createConnection(); Void destination = getDestination(); Schema schema = dynamicDestinations.getSchema(destination); dataFileWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema)) .setCodec(jdbcAvroArgs.getCodecFactory()) .setSyncInterval(syncInterval); dataFileWriter.setMeta("created_by", this.getClass().getCanonicalName()); this.countingOutputStream = new CountingOutputStream(Channels.newOutputStream(channel)); dataFileWriter.create(schema, this.countingOutputStream); logger.info("jdbcavroio : Write prepared"); }
Example #17
Source File: AvroRecordWriter.java From presto with Apache License 2.0 | 6 votes |
/**
 * Creates a record writer that writes Avro generic records to {@code path}.
 *
 * @param path         output file location
 * @param jobConf      job configuration; supplies the schema and, when compressing, the codec
 * @param isCompressed whether a compression codec should be configured on the writer
 * @param properties   properties used together with {@code jobConf} to determine the schema
 * @throws IOException if the schema cannot be determined or the file cannot be created
 */
public AvroRecordWriter(Path path, JobConf jobConf, boolean isCompressed, Properties properties)
        throws IOException {
    final Schema schema;
    try {
        schema = AvroSerdeUtils.determineSchemaOrThrowException(jobConf, properties);
    } catch (AvroSerdeException e) {
        throw new IOException(e);
    }

    DataFileWriter<GenericRecord> avroWriter =
        new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema));

    if (isCompressed) {
        int deflateLevel = jobConf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = jobConf.get(OUTPUT_CODEC, DEFLATE_CODEC);

        // Deflate takes the configured level; every other codec is looked up by name.
        CodecFactory codec;
        if (codecName.equals(DEFLATE_CODEC)) {
            codec = CodecFactory.deflateCodec(deflateLevel);
        } else {
            codec = CodecFactory.fromString(codecName);
        }
        avroWriter.setCodec(codec);
    }

    outputStream = path.getFileSystem(jobConf).create(path);
    avroWriter.create(schema, outputStream);
    delegate = new AvroGenericRecordWriter(avroWriter);
}
Example #18
Source File: OsmPbfParser.java From geowave with Apache License 2.0 | 5 votes |
/**
 * Stores the three writers on this parser.
 *
 * @param nodeWriter     writer assigned to the {@code nodeWriter} field
 * @param wayWriter      writer assigned to the {@code wayWriter} field
 * @param relationWriter writer assigned to the {@code relationWriter} field
 */
public void setupWriter(
    final DataFileWriter nodeWriter,
    final DataFileWriter wayWriter,
    final DataFileWriter relationWriter) {
  this.relationWriter = relationWriter;
  this.wayWriter = wayWriter;
  this.nodeWriter = nodeWriter;
}
Example #19
Source File: AvroKeyValueWithMetadataRecordWriter.java From datafu with Apache License 2.0 | 5 votes |
public AvroKeyValueWithMetadataRecordWriter(AvroDatumConverter<K, ?> keyConverter, AvroDatumConverter<V, ?> valueConverter, CodecFactory compressionCodec, OutputStream outputStream, Configuration conf) throws IOException { // Create the generic record schema for the key/value pair. mKeyValuePairSchema = AvroKeyValue.getSchema( keyConverter.getWriterSchema(), valueConverter.getWriterSchema()); // Create an Avro container file and a writer to it. mAvroFileWriter = new DataFileWriter<GenericRecord>( new ReflectDatumWriter<GenericRecord>(mKeyValuePairSchema)); mAvroFileWriter.setCodec(compressionCodec); for (Entry<String,String> e : conf) { if (e.getKey().startsWith(TEXT_PREFIX)) mAvroFileWriter.setMeta(e.getKey().substring(TEXT_PREFIX.length()), e.getValue()); } mAvroFileWriter.create(mKeyValuePairSchema, outputStream); // Keep a reference to the converters. mKeyConverter = keyConverter; mValueConverter = valueConverter; // Create a reusable output record. mOutputRecord = new AvroKeyValue<Object, Object>(new GenericData.Record(mKeyValuePairSchema)); }
Example #20
Source File: AvroSpoolDirSourceTestUtil.java From datacollector with Apache License 2.0 | 5 votes |
/**
 * Writes a three-record Avro file named {@code file-0.avro} into a fresh test directory.
 *
 * <p>The records {@code a}, {@code b} and {@code c} are appended in that order; each refers to
 * the same {@code boss} record, whose own {@code boss} field is {@code null}.
 *
 * @return the written Avro file
 * @throws Exception if the schema cannot be parsed or the file cannot be written
 */
public static File createAvroDataFile() throws Exception {
    File f = new File(createTestDir(), "file-0.avro");
    Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);

    GenericRecord boss = newPerson(schema, "boss", 60, null);
    GenericRecord e3 = newPerson(schema, "c", 50, boss);
    GenericRecord e2 = newPerson(schema, "b", 40, boss);
    GenericRecord e1 = newPerson(schema, "a", 30, boss);

    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    // try-with-resources closes the writer, and with it the file handle, even if an
    // append fails part-way through.
    try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, f);
        dataFileWriter.append(e1);
        dataFileWriter.append(e2);
        dataFileWriter.append(e3);
        dataFileWriter.flush();
    }
    return f;
}

/** Builds one record with emails {@code <name>@company.com} and {@code <name>2@company.com}. */
private static GenericRecord newPerson(Schema schema, String name, int age, GenericRecord boss) {
    GenericRecord person = new GenericData.Record(schema);
    person.put("name", name);
    person.put("age", age);
    person.put("emails", ImmutableList.of(name + "@company.com", name + "2@company.com"));
    person.put("boss", boss);
    return person;
}
Example #21
Source File: AvroFileGenerator.java From flink-perf with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception { // generate only avro file if (args.length == 2) { ordersPath = args[0]; outputOrderAvroPath = args[1]; // Generate file for avro test DatumWriter<Order> orderDatumWriter = new SpecificDatumWriter<Order>(Order.class); DataFileWriter<Order> dataFileWriter = new DataFileWriter<Order>(orderDatumWriter); dataFileWriter.create(Order.getClassSchema(), new File(outputOrderAvroPath)); Scanner s = new Scanner(new File(ordersPath)); while (s.hasNextLine()) { @SuppressWarnings("resource") Scanner lineScanner = new Scanner(s.nextLine()).useDelimiter("\\|"); Order o = new Order(); o.setOOrderkey(lineScanner.nextInt()); o.setOCustkey(lineScanner.nextInt()); o.setOOrderstatus(lineScanner.next()); o.setOTotalprice(lineScanner.nextFloat()); o.setOOrderdate(lineScanner.next()); o.setOOrderpriority(lineScanner.next()); o.setOClerk(lineScanner.next()); o.setOShipproprity(lineScanner.nextInt()); o.setOComment(lineScanner.next()); dataFileWriter.append(o); lineScanner.close(); } dataFileWriter.flush(); s.close(); dataFileWriter.close(); return; } else { System.err.println("Usage: <inputFilePath> <outputAvroPath>"); System.exit(1); } }
Example #22
Source File: SdcAvroTestUtil.java From datacollector with Apache License 2.0 | 5 votes |
/**
 * Writes a three-record Avro file named {@code file-0.avro} into a fresh test directory.
 *
 * <p>The records {@code a}, {@code b} and {@code c} are appended in that order; each refers to
 * the same {@code boss} record, whose own {@code boss} field is {@code null}.
 *
 * @return the written Avro file
 * @throws Exception if the schema cannot be parsed or the file cannot be written
 */
public static File createAvroDataFile() throws Exception {
    File f = new File(createTestDir(), "file-0.avro");
    Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);

    GenericRecord boss = newEmployee(schema, "boss", 60, null);
    GenericRecord e3 = newEmployee(schema, "c", 50, boss);
    GenericRecord e2 = newEmployee(schema, "b", 40, boss);
    GenericRecord e1 = newEmployee(schema, "a", 30, boss);

    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    // try-with-resources closes the writer, and with it the file handle, even if an
    // append fails part-way through.
    try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, f);
        dataFileWriter.append(e1);
        dataFileWriter.append(e2);
        dataFileWriter.append(e3);
        dataFileWriter.flush();
    }
    return f;
}

/** Builds one record with emails {@code <name>@company.com} and {@code <name>2@company.com}. */
private static GenericRecord newEmployee(Schema schema, String name, int age, GenericRecord boss) {
    GenericRecord employee = new GenericData.Record(schema);
    employee.put("name", name);
    employee.put("age", age);
    employee.put("emails", ImmutableList.of(name + "@company.com", name + "2@company.com"));
    employee.put("boss", boss);
    return employee;
}
Example #23
Source File: WriteAvroResultWithSchema.java From nifi with Apache License 2.0 | 5 votes |
/**
 * Creates a result writer that emits an Avro container to {@code out}.
 *
 * @param schema schema of the records to be written
 * @param out    destination stream; also passed to the superclass constructor
 * @param codec  codec set on the data file writer
 * @throws IOException if the Avro container cannot be created on the stream
 */
public WriteAvroResultWithSchema(final Schema schema, final OutputStream out, final CodecFactory codec)
        throws IOException {
    super(out);
    this.schema = schema;

    dataFileWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema));
    dataFileWriter.setCodec(codec);
    dataFileWriter.create(schema, out);
}
Example #24
Source File: AzureBlobAvroWriter.java From samza with Apache License 2.0 | 5 votes |
/**
 * Bundles the three collaborators needed to write one Azure blob.
 *
 * <p>Each argument is checked with {@code Preconditions.checkNotNull} before being stored.
 *
 * @param dataFileWriter        Avro writer for the blob
 * @param azureBlobOutputStream output stream for the blob
 * @param blockBlobAsyncClient  client for the blob
 */
public BlobWriterComponents(DataFileWriter dataFileWriter,
                            AzureBlobOutputStream azureBlobOutputStream,
                            BlockBlobAsyncClient blockBlobAsyncClient) {
    Preconditions.checkNotNull(dataFileWriter,
        "DataFileWriter can not be null when creating WriterComponents for an Azure Blob.");
    this.dataFileWriter = dataFileWriter;

    Preconditions.checkNotNull(azureBlobOutputStream,
        "AzureBlobOutputStream can not be null when creating WriterComponents for an Azure Blob.");
    this.azureBlobOutputStream = azureBlobOutputStream;

    Preconditions.checkNotNull(blockBlobAsyncClient,
        "BlockBlobAsyncClient can not be null when creating WriterComponents for an Azure Blob.");
    this.blockBlobAsyncClient = blockBlobAsyncClient;
}
Example #25
Source File: AvroTestTools.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Writes all records from {@code input} to an Avro container file at {@code outputPath}.
 *
 * @param input      records to write, consumed until exhausted
 * @param schema     schema used to create the Avro container
 * @param fs         file system on which the output file is created
 * @param outputPath location of the output file; {@code true} is passed as the second
 *                   argument of {@code fs.create}
 * @throws IOException if the file cannot be created or a record cannot be written
 */
private void writeAsAvroBinary(Iterator<GenericRecord> input, Schema schema, FileSystem fs,
    Path outputPath) throws IOException {
  // create() returns the writer itself, so the opened writer can be managed by
  // try-with-resources and is closed even when an append fails.
  try (DataFileWriter<GenericRecord> writer =
      new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>())
          .create(schema, fs.create(outputPath, true))) {
    while (input.hasNext()) {
      writer.append(input.next());
    }
  }

  log.info("Successfully wrote avro file to path " + outputPath);
}
Example #26
Source File: AvroRecordWriter.java From spork with Apache License 2.0 | 5 votes |
static void configureDataFileWriter(DataFileWriter<GenericData.Record> writer, JobConf job) throws UnsupportedEncodingException { if (FileOutputFormat.getCompressOutput(job)) { int level = job.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL); String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC); CodecFactory factory = codecName.equals(DEFLATE_CODEC) ? CodecFactory.deflateCodec(level) : CodecFactory.fromString(codecName); writer.setCodec(factory); } // Do max as core-default.xml has io.file.buffer.size as 4K writer.setSyncInterval(job.getInt(SYNC_INTERVAL_KEY, Math.max( job.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL))); // copy metadata from job for (Map.Entry<String,String> e : job) { if (e.getKey().startsWith(AvroJob.TEXT_PREFIX)) writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()), e.getValue()); if (e.getKey().startsWith(AvroJob.BINARY_PREFIX)) writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()), URLDecoder.decode(e.getValue(), "ISO-8859-1") .getBytes("ISO-8859-1")); } }
Example #27
Source File: AvroRowWriter.java From beam with Apache License 2.0 | 5 votes |
AvroRowWriter( String basename, Schema schema, SerializableFunction<AvroWriteRequest<T>, AvroT> toAvroRecord, SerializableFunction<Schema, DatumWriter<AvroT>> writerFactory) throws Exception { super(basename, MimeTypes.BINARY); this.schema = schema; this.toAvroRecord = toAvroRecord; this.writer = new DataFileWriter<>(writerFactory.apply(schema)).create(schema, getOutputStream()); }
Example #28
Source File: AvroSink.java From beam with Apache License 2.0 | 5 votes |
@SuppressWarnings("deprecation") // uses internal test functionality. @Override protected void prepareWrite(WritableByteChannel channel) throws Exception { DestinationT destination = getDestination(); CodecFactory codec = dynamicDestinations.getCodec(destination); Schema schema = dynamicDestinations.getSchema(destination); Map<String, Object> metadata = dynamicDestinations.getMetadata(destination); DatumWriter<OutputT> datumWriter = genericRecords ? new GenericDatumWriter<>(schema) : new ReflectDatumWriter<>(schema); dataFileWriter = new DataFileWriter<>(datumWriter).setCodec(codec); for (Map.Entry<String, Object> entry : metadata.entrySet()) { Object v = entry.getValue(); if (v instanceof String) { dataFileWriter.setMeta(entry.getKey(), (String) v); } else if (v instanceof Long) { dataFileWriter.setMeta(entry.getKey(), (Long) v); } else if (v instanceof byte[]) { dataFileWriter.setMeta(entry.getKey(), (byte[]) v); } else { throw new IllegalStateException( "Metadata value type must be one of String, Long, or byte[]. Found " + v.getClass().getSimpleName()); } } dataFileWriter.create(schema, Channels.newOutputStream(channel)); }
Example #29
Source File: Purge.java From Cubert with Apache License 2.0 | 5 votes |
private void purge(String src, String dst) throws IOException { DataFileReader<GenericRecord> dataFileReader = createDataFileReader(src, false); DataFileWriter<GenericRecord> writer = createDataFileWriter(dataFileReader); numRecords = 0; recordsPurged = 0; remainingRecords = 0; // Copy while (dataFileReader.hasNext()) { numRecords++; GenericRecord record = dataFileReader.next(); if (record == null) { continue; } Number column = (Number) record.get(columnName); if ((column == null) || (!membersToPurge.contains(column.intValue()))) { remainingRecords++; writer.append(record); } } recordsPurged = numRecords - remainingRecords; writer.close(); dataFileReader.close(); }
Example #30
Source File: SparkVerifierTest.java From tablasco with Apache License 2.0 | 5 votes |
/**
 * Writes the given records to {@code avroFile}, creating its parent directory if needed.
 *
 * <p>The schema is taken from the first record, so {@code data} must not be empty.
 *
 * @param data     records to write; the first element supplies the schema
 * @param avroFile destination file
 * @throws IOException if the directory or file cannot be created or written
 */
private static void writeAvroData(List<GenericRecord> data, File avroFile) throws IOException {
    FileUtils.forceMkdir(avroFile.getParentFile());

    Schema schema = data.get(0).getSchema();
    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);

    // try-with-resources closes the writer, and with it the file handle, even if an
    // append fails part-way through.
    try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, avroFile);
        for (GenericRecord genericRecord : data) {
            dataFileWriter.append(genericRecord);
        }
    }
}