org.apache.avro.file.DataFileWriter Java Examples
The following examples show how to use org.apache.avro.file.DataFileWriter.
Example #1
Source File: From datafu with Apache License 2.0 | 7 votes |
/**
 * Opens an Avro data writer for the given date.
 *
 * <p>The output file is {@code part-00000.avro} inside a {@code yyyy/MM/dd}
 * sub-directory of the configured output path.
 *
 * @param year  four-digit year used for the first path segment
 * @param month month used for the second path segment (zero-padded to 2 digits)
 * @param day   day used for the third path segment (zero-padded to 2 digits)
 * @throws IOException if the output file or the Avro container cannot be created
 * @throws RuntimeException if a data writer is already open
 */
public void open(int year, int month, int day) throws IOException {
    if (_dataWriter != null) {
        throw new RuntimeException("Already have data writer");
    }

    String datePartition = String.format("%04d/%02d/%02d", year, month, day);
    Path partFile = new Path(new Path(_outputPath, datePartition), "part-00000.avro");
    _outputStream = _fs.create(partFile);

    _dataWriter = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>());
    _dataWriter.create(_schema, _outputStream);
}
Example #2
Source File: From tajo with Apache License 2.0 | 6 votes |
/**
 * Initializes the Appender: creates the output file (overwrite flag is
 * {@code false}), derives the Avro schema and its fields, opens the Avro
 * container writer on the new stream, and optionally sets up table statistics.
 *
 * @throws IOException if the output stream or the Avro container cannot be created
 */
public void init() throws IOException {
    FSDataOutputStream target = path.getFileSystem(conf).create(path, false);

    avroSchema = AvroUtil.getAvroSchema(meta, conf);
    avroFields = avroSchema.getFields();

    dataFileWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(avroSchema));
    dataFileWriter.create(avroSchema, target);

    if (tableStatsEnabled) {
        this.stats = new TableStatistics(schema, columnStatsEnabled);
    }
    super.init();
}
Example #3
Source File: From localization_nifi with Apache License 2.0 | 6 votes |
@Test public void testExtractionWithCodec() throws IOException { final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata()); runner.setProperty(ExtractAvroMetadata.METADATA_KEYS, AVRO_CODEC_ATTR); // test dynamic attribute avro.codec final Schema schema = new Schema.Parser().parse(new File("src/test/resources/array.avsc")); final GenericData.Array<String> data = new GenericData.Array<>(schema, Arrays.asList("one", "two", "three")); final DatumWriter<GenericData.Array<String>> datumWriter = new GenericDatumWriter<>(schema); final ByteArrayOutputStream out = new ByteArrayOutputStream(); final DataFileWriter<GenericData.Array<String>> dataFileWriter = new DataFileWriter<>(datumWriter); dataFileWriter.setCodec(CodecFactory.deflateCodec(1)); dataFileWriter.create(schema, out); dataFileWriter.append(data); dataFileWriter.close(); runner.enqueue(out.toByteArray());; runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1); final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0); flowFile.assertAttributeEquals("avro.codec", "deflate"); }
Example #4
Source File: From nifi with Apache License 2.0 | 6 votes |
/**
 * Serializes a record containing a null-typed field to Avro, feeds it to the
 * processor under test, and checks the resulting file name and Hive DDL
 * attribute on the flow file routed to {@code REL_SUCCESS}.
 *
 * @throws Exception if the Avro test data cannot be built or written
 */
@Test
public void test_onTrigger_routing_to_failure_null_type() throws Exception {
    String testString = "Hello World";
    GenericData.Record record = TestNiFiOrcUtils.buildAvroRecordWithNull(testString);

    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    // Closing the writer flushes the container into the byte buffer.
    try (DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer)) {
        fileWriter.create(record.getSchema(), out);
        fileWriter.append(record);
    }

    // Plain map instead of double-brace initialization (which creates an
    // anonymous HashMap subclass capturing the enclosing test instance).
    Map<String, String> attributes = new HashMap<>();
    attributes.put(CoreAttributes.FILENAME.key(), "test.avro");

    runner.enqueue(out.toByteArray(), attributes);
    // Trigger the processor; without this nothing is transferred.
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
    MockFlowFile resultFlowFile =
            runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    assertEquals(
            "CREATE EXTERNAL TABLE IF NOT EXISTS test_record (string STRING, null BOOLEAN) STORED AS ORC",
            resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
}
Example #5
Source File: From Cubert with Apache License 2.0 | 6 votes |
/**
 * Creates an empty Avro container file at {@code path} unless something
 * already exists there.
 *
 * <p>The Avro schema is derived from {@code fileSchema} under the record name
 * {@code CUBERT_MV_RECORD}; the file is created with owner-all,
 * group/other read-execute permissions.
 *
 * @param fileSchema block schema converted into the Avro schema of the new file
 * @param path       location of the file to create
 * @throws IOException if the file system cannot be accessed or the file cannot be written
 */
public static void createFileIfNotExists(BlockSchema fileSchema, String path) throws IOException {
    FileSystem fs = FileSystem.get(new JobConf());
    Path target = new Path(path);
    if (fs.exists(target)) {
        return;
    }

    Schema avroSchema = convertFromBlockSchema("CUBERT_MV_RECORD", fileSchema);
    System.out.println("Creating avro file with schema = " + avroSchema);

    DataFileWriter<GenericRecord> avroWriter =
            new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(avroSchema));

    FsPermission permission =
            new FsPermission(FsAction.ALL, FsAction.READ_EXECUTE, FsAction.READ_EXECUTE);
    FSDataOutputStream stream = FileSystem.create(fs, target, permission);

    // Writing only the header yields a valid, record-less Avro file.
    avroWriter.create(avroSchema, stream);
    avroWriter.flush();
    avroWriter.close();
}
Example #6
Source File: From localization_nifi with Apache License 2.0 | 6 votes |
/**
 * Appends the Avro form of every given record to the flow file held in
 * {@code appendFlowFile} and stores the updated flow file back in the reference.
 *
 * @param session        session used to append to the flow file
 * @param records        records whose Avro payloads are written
 * @param appendFlowFile holder of the flow file being appended to; updated in place
 * @param avroWriter     writer that is (re)created on the flow file's output stream
 * @param reader         source of the schema used to create the writer
 * @throws IOException declared for callers; write errors surface from the append callback
 */
private void appendRecordsToFlowFile(ProcessSession session,
                                     List<HiveStreamingRecord> records,
                                     AtomicReference<FlowFile> appendFlowFile,
                                     DataFileWriter<GenericRecord> avroWriter,
                                     DataFileStream<GenericRecord> reader) throws IOException {
    final FlowFile current = appendFlowFile.get();
    final FlowFile updated = session.append(current, (out) -> {
        // The writer is closed when this block exits, successfully or not.
        try (DataFileWriter<GenericRecord> writer = avroWriter.create(reader.getSchema(), out)) {
            for (HiveStreamingRecord streamingRecord : records) {
                writer.append(streamingRecord.getRecord());
            }
            writer.flush();
        }
    });
    appendFlowFile.set(updated);
}
Example #7
Source File: From samza with Apache License 2.0 | 6 votes |
@Before public void setup() throws Exception { threadPool = new ThreadPoolExecutor(1, 1, 60, TimeUnit.SECONDS, new LinkedBlockingDeque<>()); ome = createOME("Topic1"); encodedRecord = new byte[100]; BlobContainerAsyncClient mockContainerAsyncClient = PowerMockito.mock(BlobContainerAsyncClient.class); mockDataFileWriter = mock(DataFileWriter.class); mockAzureBlobOutputStream = mock(AzureBlobOutputStream.class); mockBlockBlobAsyncClient = PowerMockito.mock(BlockBlobAsyncClient.class); when(mockBlockBlobAsyncClient.getBlobUrl()).thenReturn(""); mockCompression = CompressionFactory.getInstance().getCompression(CompressionType.GZIP); azureBlobAvroWriter = spy(new AzureBlobAvroWriter(mockContainerAsyncClient, mock(AzureBlobWriterMetrics.class), threadPool, THRESHOLD, 60000, "test", mockDataFileWriter, mockAzureBlobOutputStream, mockBlockBlobAsyncClient, blobMetadataGeneratorFactory, blobMetadataGeneratorConfig, STREAM_NAME, Long.MAX_VALUE, Long.MAX_VALUE, mockCompression, false)); // keeping blob size and number of records unlimited doReturn(encodedRecord).when(azureBlobAvroWriter).encodeRecord((IndexedRecord) ome.getMessage()); }
Example #8
Source File: From incubator-gobblin with Apache License 2.0 | 6 votes |
private void writeAvroJobSpec(AvroJobSpec jobSpec) throws IOException { DatumWriter<AvroJobSpec> datumWriter = new SpecificDatumWriter<>(AvroJobSpec.SCHEMA$); DataFileWriter<AvroJobSpec> dataFileWriter = new DataFileWriter<>(datumWriter); Path jobSpecPath = new Path(this.specConsumerPath, jobSpec.getUri()); //Write the new JobSpec to a temporary path first. Path tmpDir = new Path(this.specConsumerPath, "_tmp"); if (!fs.exists(tmpDir)) { fs.mkdirs(tmpDir); } Path tmpJobSpecPath = new Path(tmpDir, jobSpec.getUri()); OutputStream out = fs.create(tmpJobSpecPath); dataFileWriter.create(AvroJobSpec.SCHEMA$, out); dataFileWriter.append(jobSpec); dataFileWriter.close(); //Rename the JobSpec from temporary to final location. HadoopUtils.renamePath(fs, tmpJobSpecPath, jobSpecPath, true); }
Example #9
Source File: From digdag with Apache License 2.0 | 6 votes |
/**
 * Serializes the given rows into an in-memory Avro container file.
 *
 * @param fields  fields of the {@code testdata} record schema
 * @param records rows to write; each map entry is stored under its key as a record field
 * @return the bytes of the complete Avro file
 * @throws IOException if writing the Avro data fails
 */
private byte[] avroTestData(List<Schema.Field> fields, List<Map<String, Object>> records)
        throws IOException {
    Schema schema = Schema.createRecord("testdata", null, null, false);
    schema.setFields(fields);

    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    DataFileWriter<GenericData.Record> avroWriter =
            new DataFileWriter<>(new GenericDatumWriter<GenericData.Record>(schema));
    avroWriter.create(schema, buffer);

    for (Map<String, Object> row : records) {
        GenericData.Record avroRow = new GenericData.Record(schema);
        for (Map.Entry<String, Object> column : row.entrySet()) {
            avroRow.put(column.getKey(), column.getValue());
        }
        avroWriter.append(avroRow);
    }

    // Closing flushes the final block into the buffer.
    avroWriter.close();
    return buffer.toByteArray();
}
Example #10
Source File: From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
/**
 * Creates a record writer that emits Avro records to the configured output file.
 *
 * <p>After validation the schema is written to the schema file, and an Avro
 * container is opened on the output file using the configured codec.
 *
 * @return a writer bound to the newly created Avro container
 * @throws Exception if a required setting is missing, or the schema or output
 *                   file cannot be written
 */
@Override
public IPentahoRecordWriter createRecordWriter() throws Exception {
    validate();
    if ( fields == null || StringUtils.isEmpty( nameSpace ) || StringUtils.isEmpty( recordName )
        || StringUtils.isEmpty( outputFilename ) ) {
        // The message now names exactly the settings that are checked above
        // (it previously said 'recordNum', omitted 'fields', and had an
        // unterminated quote).
        throw new Exception(
            "Invalid state. One of the following required fields is null or empty: "
                + "'fields', 'nameSpace', 'recordName', or 'outputFilename'" );
    }

    Schema schema = getSchema();
    writeAvroSchemaToFile( schemaFilename );

    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>( schema );
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>( datumWriter );
    dataFileWriter.setCodec( codecFactory );
    dataFileWriter.create( schema, KettleVFS.getOutputStream( outputFilename, variableSpace, false ) );
    return new PentahoAvroRecordWriter( dataFileWriter, schema, fields );
}
Example #11
Source File: From mt-flume with Apache License 2.0 | 6 votes |
private File newTestFile(boolean deleteOnExit) throws IOException { File tempFile = File.createTempFile("testDirectFile", "tmp"); if (deleteOnExit) { tempFile.deleteOnExit(); } DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>( new GenericDatumWriter<GenericRecord>(schema)); writer.create(schema, tempFile); GenericRecordBuilder recordBuilder; recordBuilder = new GenericRecordBuilder(schema); recordBuilder.set("foo", "bar"); GenericRecord record =; writer.append(record); writer.sync(); recordBuilder = new GenericRecordBuilder(schema); recordBuilder.set("foo", "baz"); record =; writer.append(record); writer.sync(); writer.flush(); writer.close(); return tempFile; }
Example #12
Source File: From nifi with Apache License 2.0 | 6 votes |
private byte[] initAvroWriter(ProcessSession session, String codec, DataFileStream<GenericRecord> reader, DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef) { writer.setCodec(CodecFactory.fromString(codec)); // Transfer metadata (this is a subset of the incoming file) for (String metaKey : reader.getMetaKeys()) { if (!RESERVED_METADATA.contains(metaKey)) { writer.setMeta(metaKey, reader.getMeta(metaKey)); } } final ByteArrayOutputStream avroHeader = new ByteArrayOutputStream(); flowFileRef.set(session.append(flowFileRef.get(), (out) -> { // Create writer so that records can be appended later. writer.create(reader.getSchema(), avroHeader); writer.close(); final byte[] header = avroHeader.toByteArray(); out.write(header); })); // Capture the Avro header byte array that is just written to the FlowFile. // This is needed when Avro records are appended to the same FlowFile. return avroHeader.toByteArray(); }
Example #13
Source File: From nifi with Apache License 2.0 | 6 votes |
private void appendAvroRecords(ProcessSession session, byte[] avroHeader, DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef, List<HiveStreamingRecord> hRecords) { flowFileRef.set(session.append(flowFileRef.get(), (out) -> { if (hRecords != null) { // Initialize the writer again as append mode, so that Avro header is written only once. writer.appendTo(new SeekableByteArrayInput(avroHeader), out); try { for (HiveStreamingRecord hRecord : hRecords) { writer.append(hRecord.getRecord()); } } catch (IOException ioe) { // The records were put to Hive Streaming successfully, but there was an error while writing the // Avro records to the flow file. Log as an error and move on. logger.error("Error writing Avro records (which were sent successfully to Hive Streaming) to the flow file, " + ioe, ioe); } } writer.close(); })); }
Example #14
Source File: From geowave with Apache License 2.0 | 6 votes |
private synchronized DataFileWriter getDataWriterCreateIfNull( final String typeName, final GeoWaveAvroFormatPlugin plugin) { if (!cachedWriters.containsKey(typeName)) { FSDataOutputStream out = null; final DataFileWriter dfw = new DataFileWriter(new GenericDatumWriter()); cachedWriters.put(typeName, dfw); dfw.setCodec(CodecFactory.snappyCodec()); try { // TODO: we should probably clean up the type name to make it // HDFS path safe in case there are invalid characters // also, if a file already exists do we want to delete it or // append to it? out = fs.create(new Path(hdfsBaseDirectory, typeName)); dfw.create(plugin.getAvroSchema(), out); } catch (final IOException e) { LOGGER.error("Unable to create output stream", e); // cache a null value so we don't continually try to recreate cachedWriters.put(typeName, null); return null; } } return cachedWriters.get(typeName); }
Example #15
Source File: From mt-flume with Apache License 2.0 | 6 votes |
/**
 * Reads the sync interval and compression codec from the context and builds
 * the datum writer and data file writer accordingly.
 *
 * <p>If the codec name cannot be resolved, a warning is logged and no codec
 * is set on the writer.
 *
 * @param context configuration source
 */
@Override
public void configure(Context context) {
    int syncInterval = context.getInteger(SYNC_INTERVAL_BYTES, DEFAULT_SYNC_INTERVAL_BYTES);
    String compressionCodec = context.getString(COMPRESSION_CODEC, DEFAULT_COMPRESSION_CODEC);

    writer = new ReflectDatumWriter<T>(getSchema());
    dataFileWriter = new DataFileWriter<T>(writer);
    dataFileWriter.setSyncInterval(syncInterval);

    try {
        dataFileWriter.setCodec(CodecFactory.fromString(compressionCodec));
    } catch (AvroRuntimeException e) {
        logger.warn("Unable to instantiate avro codec with name (" + compressionCodec
            + "). Compression disabled. Exception follows.", e);
    }
}
Example #16
Source File: From dbeam with Apache License 2.0 | 6 votes |
@SuppressWarnings("deprecation") // uses internal test functionality. @Override protected void prepareWrite(WritableByteChannel channel) throws Exception {"jdbcavroio : Preparing write..."); connection = jdbcAvroArgs.jdbcConnectionConfiguration().createConnection(); Void destination = getDestination(); Schema schema = dynamicDestinations.getSchema(destination); dataFileWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema)) .setCodec(jdbcAvroArgs.getCodecFactory()) .setSyncInterval(syncInterval); dataFileWriter.setMeta("created_by", this.getClass().getCanonicalName()); this.countingOutputStream = new CountingOutputStream(Channels.newOutputStream(channel)); dataFileWriter.create(schema, this.countingOutputStream);"jdbcavroio : Write prepared"); }
Example #17
Source File: From presto with Apache License 2.0 | 6 votes |
/**
 * Creates a record writer that writes generic Avro records to {@code path}.
 *
 * <p>When compression is requested, the codec is taken from the job
 * configuration (defaulting to deflate at the configured level).
 *
 * @param path         file to create
 * @param jobConf      job configuration supplying schema and codec settings
 * @param isCompressed whether a compression codec should be applied
 * @param properties   table properties used to determine the schema
 * @throws IOException if the schema cannot be determined or the file cannot be created
 */
public AvroRecordWriter(Path path, JobConf jobConf, boolean isCompressed, Properties properties)
        throws IOException {
    Schema schema;
    try {
        schema = AvroSerdeUtils.determineSchemaOrThrowException(jobConf, properties);
    } catch (AvroSerdeException e) {
        throw new IOException(e);
    }

    DataFileWriter<GenericRecord> avroWriter =
            new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema));

    if (isCompressed) {
        int deflateLevel = jobConf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = jobConf.get(OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory codec;
        if (codecName.equals(DEFLATE_CODEC)) {
            codec = CodecFactory.deflateCodec(deflateLevel);
        } else {
            codec = CodecFactory.fromString(codecName);
        }
        avroWriter.setCodec(codec);
    }

    outputStream = path.getFileSystem(jobConf).create(path);
    avroWriter.create(schema, outputStream);
    delegate = new AvroGenericRecordWriter(avroWriter);
}
Example #18
Source File: From geowave with Apache License 2.0 | 5 votes |
/**
 * Stores the three writers used by this instance.
 *
 * @param nodeWriter     writer kept in the {@code nodeWriter} field
 * @param wayWriter      writer kept in the {@code wayWriter} field
 * @param relationWriter writer kept in the {@code relationWriter} field
 */
public void setupWriter(
    final DataFileWriter nodeWriter,
    final DataFileWriter wayWriter,
    final DataFileWriter relationWriter) {
    this.relationWriter = relationWriter;
    this.wayWriter = wayWriter;
    this.nodeWriter = nodeWriter;
}
Example #19
Source File: From datafu with Apache License 2.0 | 5 votes |
/**
 * Creates a key/value record writer that also copies text metadata from the
 * configuration into the Avro file header.
 *
 * <p>Every configuration entry whose key starts with {@code TEXT_PREFIX} is
 * stored as file metadata under the key with the prefix removed.
 *
 * @param keyConverter     converter supplying the key's writer schema
 * @param valueConverter   converter supplying the value's writer schema
 * @param compressionCodec codec applied to the Avro container
 * @param outputStream     stream the Avro container is written to
 * @param conf             configuration scanned for metadata entries
 * @throws IOException if the Avro container cannot be created
 */
public AvroKeyValueWithMetadataRecordWriter(AvroDatumConverter<K, ?> keyConverter,
    AvroDatumConverter<V, ?> valueConverter, CodecFactory compressionCodec,
    OutputStream outputStream, Configuration conf) throws IOException {
    // Create the generic record schema for the key/value pair.
    mKeyValuePairSchema = AvroKeyValue.getSchema(
        keyConverter.getWriterSchema(), valueConverter.getWriterSchema());

    // Create an Avro container file and a writer to it.
    mAvroFileWriter = new DataFileWriter<GenericRecord>(
        new ReflectDatumWriter<GenericRecord>(mKeyValuePairSchema));
    mAvroFileWriter.setCodec(compressionCodec);

    // Metadata has to be set before the container is created.
    for (Entry<String, String> setting : conf) {
        if (!setting.getKey().startsWith(TEXT_PREFIX)) {
            continue;
        }
        mAvroFileWriter.setMeta(
            setting.getKey().substring(TEXT_PREFIX.length()), setting.getValue());
    }

    mAvroFileWriter.create(mKeyValuePairSchema, outputStream);

    // Keep a reference to the converters.
    mKeyConverter = keyConverter;
    mValueConverter = valueConverter;

    // Create a reusable output record.
    mOutputRecord =
        new AvroKeyValue<Object, Object>(new GenericData.Record(mKeyValuePairSchema));
}
Example #20
Source File: From datacollector with Apache License 2.0 | 5 votes |
/**
 * Writes {@code file-0.avro} into a fresh test directory.
 *
 * <p>The file holds three employee records (a, b, c, in that order), each
 * referencing the same boss record, whose own {@code boss} field is null.
 *
 * @return the Avro file that was written
 * @throws Exception if the test directory or the file cannot be created
 */
public static File createAvroDataFile() throws Exception {
    File avroFile = new File(createTestDir(), "file-0.avro");
    Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);

    GenericRecord boss = new GenericData.Record(schema);
    boss.put("name", "boss");
    boss.put("age", 60);
    boss.put("emails", ImmutableList.of("", ""));
    boss.put("boss", null);

    GenericRecord employeeA = new GenericData.Record(schema);
    employeeA.put("name", "a");
    employeeA.put("age", 30);
    employeeA.put("emails", ImmutableList.of("", ""));
    employeeA.put("boss", boss);

    GenericRecord employeeB = new GenericData.Record(schema);
    employeeB.put("name", "b");
    employeeB.put("age", 40);
    employeeB.put("emails", ImmutableList.of("", ""));
    employeeB.put("boss", boss);

    GenericRecord employeeC = new GenericData.Record(schema);
    employeeC.put("name", "c");
    employeeC.put("age", 50);
    employeeC.put("emails", ImmutableList.of("", ""));
    employeeC.put("boss", boss);

    DataFileWriter<GenericRecord> avroWriter =
        new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema));
    avroWriter.create(schema, avroFile);
    avroWriter.append(employeeA);
    avroWriter.append(employeeB);
    avroWriter.append(employeeC);
    avroWriter.flush();
    avroWriter.close();
    return avroFile;
}
Example #21
Source File: From flink-perf with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception { // generate only avro file if (args.length == 2) { ordersPath = args[0]; outputOrderAvroPath = args[1]; // Generate file for avro test DatumWriter<Order> orderDatumWriter = new SpecificDatumWriter<Order>(Order.class); DataFileWriter<Order> dataFileWriter = new DataFileWriter<Order>(orderDatumWriter); dataFileWriter.create(Order.getClassSchema(), new File(outputOrderAvroPath)); Scanner s = new Scanner(new File(ordersPath)); while (s.hasNextLine()) { @SuppressWarnings("resource") Scanner lineScanner = new Scanner(s.nextLine()).useDelimiter("\\|"); Order o = new Order(); o.setOOrderkey(lineScanner.nextInt()); o.setOCustkey(lineScanner.nextInt()); o.setOOrderstatus(; o.setOTotalprice(lineScanner.nextFloat()); o.setOOrderdate(; o.setOOrderpriority(; o.setOClerk(; o.setOShipproprity(lineScanner.nextInt()); o.setOComment(; dataFileWriter.append(o); lineScanner.close(); } dataFileWriter.flush(); s.close(); dataFileWriter.close(); return; } else { System.err.println("Usage: <inputFilePath> <outputAvroPath>"); System.exit(1); } }
Example #22
Source File: From datacollector with Apache License 2.0 | 5 votes |
/**
 * Writes {@code file-0.avro} into a fresh test directory.
 *
 * <p>The file holds three employee records (a, b, c, in that order), each
 * referencing the same boss record, whose own {@code boss} field is null.
 *
 * @return the Avro file that was written
 * @throws Exception if the test directory or the file cannot be created
 */
public static File createAvroDataFile() throws Exception {
    File avroFile = new File(createTestDir(), "file-0.avro");
    Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);

    GenericRecord boss = new GenericData.Record(schema);
    boss.put("name", "boss");
    boss.put("age", 60);
    boss.put("emails", ImmutableList.of("", ""));
    boss.put("boss", null);

    GenericRecord employeeA = new GenericData.Record(schema);
    employeeA.put("name", "a");
    employeeA.put("age", 30);
    employeeA.put("emails", ImmutableList.of("", ""));
    employeeA.put("boss", boss);

    GenericRecord employeeB = new GenericData.Record(schema);
    employeeB.put("name", "b");
    employeeB.put("age", 40);
    employeeB.put("emails", ImmutableList.of("", ""));
    employeeB.put("boss", boss);

    GenericRecord employeeC = new GenericData.Record(schema);
    employeeC.put("name", "c");
    employeeC.put("age", 50);
    employeeC.put("emails", ImmutableList.of("", ""));
    employeeC.put("boss", boss);

    DataFileWriter<GenericRecord> avroWriter =
        new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema));
    avroWriter.create(schema, avroFile);
    avroWriter.append(employeeA);
    avroWriter.append(employeeB);
    avroWriter.append(employeeC);
    avroWriter.flush();
    avroWriter.close();
    return avroFile;
}
Example #23
Source File: From nifi with Apache License 2.0 | 5 votes |
/**
 * Creates a result writer that embeds the schema in the output by opening an
 * Avro container on the given stream.
 *
 * @param schema Avro schema of the records to be written
 * @param out    stream that receives the Avro container
 * @param codec  compression codec applied to the container
 * @throws IOException if the Avro container cannot be created
 */
public WriteAvroResultWithSchema(final Schema schema, final OutputStream out, final CodecFactory codec)
        throws IOException {
    super(out);
    this.schema = schema;

    dataFileWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema));
    // The codec has to be set before the container is created.
    dataFileWriter.setCodec(codec);
    dataFileWriter.create(schema, out);
}
Example #24
Source File: From samza with Apache License 2.0 | 5 votes |
/**
 * Bundles the three collaborators needed to write one Azure blob.
 *
 * <p>All arguments are validated as non-null, in parameter order, before any
 * of them is stored.
 *
 * @param dataFileWriter        Avro writer for the blob
 * @param azureBlobOutputStream stream backing the blob
 * @param blockBlobAsyncClient  client for the blob
 */
public BlobWriterComponents(DataFileWriter dataFileWriter,
                            AzureBlobOutputStream azureBlobOutputStream,
                            BlockBlobAsyncClient blockBlobAsyncClient) {
    Preconditions.checkNotNull(
        dataFileWriter,
        "DataFileWriter can not be null when creating WriterComponents for an Azure Blob.");
    Preconditions.checkNotNull(
        azureBlobOutputStream,
        "AzureBlobOutputStream can not be null when creating WriterComponents for an Azure Blob.");
    Preconditions.checkNotNull(
        blockBlobAsyncClient,
        "BlockBlobAsyncClient can not be null when creating WriterComponents for an Azure Blob.");

    this.blockBlobAsyncClient = blockBlobAsyncClient;
    this.azureBlobOutputStream = azureBlobOutputStream;
    this.dataFileWriter = dataFileWriter;
}
Example #25
Source File: From incubator-gobblin with Apache License 2.0 | 5 votes |
private void writeAsAvroBinary(Iterator<GenericRecord> input, Schema schema, FileSystem fs, Path outputPath) throws IOException { DataFileWriter writer = new DataFileWriter(new GenericDatumWriter()); writer.create(schema, fs.create(outputPath, true)); while (input.hasNext()) { writer.append(; } writer.close();"Successfully wrote avro file to path " + outputPath); }
Example #26
Source File: From spork with Apache License 2.0 | 5 votes |
static void configureDataFileWriter(DataFileWriter<GenericData.Record> writer, JobConf job) throws UnsupportedEncodingException { if (FileOutputFormat.getCompressOutput(job)) { int level = job.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL); String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC); CodecFactory factory = codecName.equals(DEFLATE_CODEC) ? CodecFactory.deflateCodec(level) : CodecFactory.fromString(codecName); writer.setCodec(factory); } // Do max as core-default.xml has io.file.buffer.size as 4K writer.setSyncInterval(job.getInt(SYNC_INTERVAL_KEY, Math.max( job.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL))); // copy metadata from job for (Map.Entry<String,String> e : job) { if (e.getKey().startsWith(AvroJob.TEXT_PREFIX)) writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()), e.getValue()); if (e.getKey().startsWith(AvroJob.BINARY_PREFIX)) writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()), URLDecoder.decode(e.getValue(), "ISO-8859-1") .getBytes("ISO-8859-1")); } }
Example #27
Source File: From beam with Apache License 2.0 | 5 votes |
AvroRowWriter( String basename, Schema schema, SerializableFunction<AvroWriteRequest<T>, AvroT> toAvroRecord, SerializableFunction<Schema, DatumWriter<AvroT>> writerFactory) throws Exception { super(basename, MimeTypes.BINARY); this.schema = schema; this.toAvroRecord = toAvroRecord; this.writer = new DataFileWriter<>(writerFactory.apply(schema)).create(schema, getOutputStream()); }
Example #28
Source File: From beam with Apache License 2.0 | 5 votes |
@SuppressWarnings("deprecation") // uses internal test functionality. @Override protected void prepareWrite(WritableByteChannel channel) throws Exception { DestinationT destination = getDestination(); CodecFactory codec = dynamicDestinations.getCodec(destination); Schema schema = dynamicDestinations.getSchema(destination); Map<String, Object> metadata = dynamicDestinations.getMetadata(destination); DatumWriter<OutputT> datumWriter = genericRecords ? new GenericDatumWriter<>(schema) : new ReflectDatumWriter<>(schema); dataFileWriter = new DataFileWriter<>(datumWriter).setCodec(codec); for (Map.Entry<String, Object> entry : metadata.entrySet()) { Object v = entry.getValue(); if (v instanceof String) { dataFileWriter.setMeta(entry.getKey(), (String) v); } else if (v instanceof Long) { dataFileWriter.setMeta(entry.getKey(), (Long) v); } else if (v instanceof byte[]) { dataFileWriter.setMeta(entry.getKey(), (byte[]) v); } else { throw new IllegalStateException( "Metadata value type must be one of String, Long, or byte[]. Found " + v.getClass().getSimpleName()); } } dataFileWriter.create(schema, Channels.newOutputStream(channel)); }
Example #29
Source File: From Cubert with Apache License 2.0 | 5 votes |
private void purge(String src, String dst) throws IOException { DataFileReader<GenericRecord> dataFileReader = createDataFileReader(src, false); DataFileWriter<GenericRecord> writer = createDataFileWriter(dataFileReader); numRecords = 0; recordsPurged = 0; remainingRecords = 0; // Copy while (dataFileReader.hasNext()) { numRecords++; GenericRecord record =; if (record == null) { continue; } Number column = (Number) record.get(columnName); if ((column == null) || (!membersToPurge.contains(column.intValue()))) { remainingRecords++; writer.append(record); } } recordsPurged = numRecords - remainingRecords; writer.close(); dataFileReader.close(); }
Example #30
Source File: From tablasco with Apache License 2.0 | 5 votes |
/**
 * Writes the given records to an Avro file, creating the parent directory
 * first. The schema is taken from the first record.
 *
 * @param data     records to write; the first element supplies the schema
 * @param avroFile destination file
 * @throws IOException if the directory or the file cannot be created or written
 */
private static void writeAvroData(List<GenericRecord> data, File avroFile) throws IOException {
    FileUtils.forceMkdir(avroFile.getParentFile());

    Schema schema = data.get(0).getSchema();
    DataFileWriter<GenericRecord> avroWriter =
        new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema));
    avroWriter.create(schema, avroFile);

    for (GenericRecord row : data) {
        avroWriter.append(row);
    }
    avroWriter.close();
}