org.apache.avro.file.DataFileWriter Java Examples

The following examples show how to use org.apache.avro.file.DataFileWriter. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DailyTrackingWriter.java    From datafu with Apache License 2.0 7 votes vote down vote up
/**
 * Opens the Avro output file for the given date.
 *
 * <p>The file is created as {@code part-00000.avro} inside a {@code yyyy/MM/dd}
 * sub-directory (zero padded) of the configured output path, and the data writer is
 * initialised on it with {@code _schema}.
 *
 * @param year  year used for the first path component
 * @param month month used for the second path component
 * @param day   day used for the third path component
 * @throws IOException if the file cannot be created or the writer cannot be initialised
 * @throws RuntimeException if a data writer is already open
 */
public void open(int year, int month, int day) throws IOException
{
  if (_dataWriter != null)
  {
    throw new RuntimeException("Already have data writer");
  }

  String datePart = String.format("%04d/%02d/%02d",year,month,day);
  Path partFile = new Path(new Path(_outputPath, datePart), "part-00000.avro");

  _outputStream = _fs.create(partFile);

  _dataWriter = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>());
  _dataWriter.create(_schema, _outputStream);
}
 
Example #2
Source File: AvroAppender.java    From tajo with Apache License 2.0 6 votes vote down vote up
/**
 * Initializes the Appender.
 *
 * <p>Creates the output file (the {@code false} argument asks the file system not to
 * overwrite an existing file), resolves the Avro schema from the table meta, opens the
 * Avro data file writer on the new stream and, when enabled, sets up table statistics.
 *
 * @throws IOException if the file cannot be created or the Avro writer cannot be opened
 */
public void init() throws IOException {
  final FSDataOutputStream avroOut = path.getFileSystem(conf).create(path, false);

  avroSchema = AvroUtil.getAvroSchema(meta, conf);
  avroFields = avroSchema.getFields();

  final DatumWriter<GenericRecord> recordWriter = new GenericDatumWriter<>(avroSchema);
  dataFileWriter = new DataFileWriter<>(recordWriter);
  dataFileWriter.create(avroSchema, avroOut);

  if (tableStatsEnabled) {
    this.stats = new TableStatistics(schema, columnStatsEnabled);
  }
  super.init();
}
 
Example #3
Source File: TestExtractAvroMetadata.java    From localization_nifi with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that the codec of a deflate-compressed Avro file is exposed as the
 * {@code avro.codec} attribute when it is requested through the metadata-keys property.
 */
@Test
public void testExtractionWithCodec() throws IOException {
    final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
    // test dynamic attribute avro.codec
    runner.setProperty(ExtractAvroMetadata.METADATA_KEYS, AVRO_CODEC_ATTR);

    final Schema arraySchema = new Schema.Parser().parse(new File("src/test/resources/array.avsc"));
    final GenericData.Array<String> payload =
            new GenericData.Array<>(arraySchema, Arrays.asList("one", "two", "three"));

    // Serialise the single array datum into an in-memory Avro file using deflate level 1.
    final ByteArrayOutputStream avroBytes = new ByteArrayOutputStream();
    final DatumWriter<GenericData.Array<String>> arrayWriter = new GenericDatumWriter<>(arraySchema);
    final DataFileWriter<GenericData.Array<String>> avroFile = new DataFileWriter<>(arrayWriter);
    avroFile.setCodec(CodecFactory.deflateCodec(1));
    avroFile.create(arraySchema, avroBytes);
    avroFile.append(payload);
    avroFile.close();

    runner.enqueue(avroBytes.toByteArray());
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);

    final MockFlowFile result = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
    result.assertAttributeEquals("avro.codec", "deflate");
}
 
Example #4
Source File: TestConvertAvroToORC.java    From nifi with Apache License 2.0 6 votes vote down vote up
/**
 * Feeds an Avro file containing a record built by
 * {@code TestNiFiOrcUtils.buildAvroRecordWithNull} through the processor and checks the
 * resulting flow file's name and generated Hive DDL.
 */
@Test
public void test_onTrigger_routing_to_failure_null_type() throws Exception {
    String testString = "Hello World";
    GenericData.Record record = TestNiFiOrcUtils.buildAvroRecordWithNull(testString);

    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    // try-with-resources closes (and therefore flushes) the writer even if append fails,
    // so no explicit flush()/close() calls are needed.
    try (DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer)) {
        fileWriter.create(record.getSchema(), out);
        fileWriter.append(record);
    }

    // A plain HashMap avoids the anonymous subclass (and captured test instance) that
    // double-brace initialisation would create.
    Map<String, String> attributes = new HashMap<>();
    attributes.put(CoreAttributes.FILENAME.key(), "test.avro");
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS test_record (string STRING, null BOOLEAN) STORED AS ORC",
            resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
}
 
Example #5
Source File: AvroUtils.java    From Cubert with Apache License 2.0 6 votes vote down vote up
/**
 * Creates an Avro data file containing only a header at {@code path}, unless something
 * already exists there.
 *
 * <p>The Avro schema is derived from {@code fileSchema} under the record name
 * {@code CUBERT_MV_RECORD}; the file is created with permissions
 * owner=ALL, group=READ_EXECUTE, other=READ_EXECUTE.
 *
 * @param fileSchema block schema to convert into the Avro schema of the new file
 * @param path       location of the file to create
 * @throws IOException if the existence check, file creation or header write fails
 */
public static void createFileIfNotExists(BlockSchema fileSchema, String path) throws IOException
{
    Configuration conf = new JobConf();
    FileSystem fs = FileSystem.get(conf);
    Path target = new Path(path);
    if (fs.exists(target))
        return;

    Schema avroSchema = convertFromBlockSchema("CUBERT_MV_RECORD", fileSchema);
    System.out.println("Creating avro file with schema = " + avroSchema);
    GenericDatumWriter<GenericRecord> datumWriter =
            new GenericDatumWriter<GenericRecord>(avroSchema);
    DataFileWriter<GenericRecord> writer =
            new DataFileWriter<GenericRecord>(datumWriter);

    FSDataOutputStream fout =
            FileSystem.create(fs,
                              target,
                              new FsPermission(FsAction.ALL,
                                               FsAction.READ_EXECUTE,
                                               FsAction.READ_EXECUTE));
    boolean closed = false;
    try
    {
        writer.create(avroSchema, fout);
        // close() flushes the header and closes the underlying stream.
        writer.close();
        closed = true;
    }
    finally
    {
        if (!closed)
        {
            // The writer failed before it could close the stream; release it here.
            // A secondary close failure is ignored so the original exception propagates.
            try
            {
                fout.close();
            }
            catch (IOException ignored)
            {
                // intentionally ignored, see above
            }
        }
    }
}
 
Example #6
Source File: PutHiveStreaming.java    From localization_nifi with Apache License 2.0 6 votes vote down vote up
/**
 * Appends the given records, Avro-encoded, to the flow file held in {@code appendFlowFile}
 * and stores the updated flow file back into the reference.
 *
 * @param session        session used to append to the flow file
 * @param records        records whose Avro payloads are written
 * @param appendFlowFile holder of the flow file being appended to; updated in place
 * @param avroWriter     writer that is (re)created on the append output stream
 * @param reader         source of the schema passed to the writer
 * @throws IOException declared for callers; see the session append contract
 */
private void appendRecordsToFlowFile(ProcessSession session,
                                     List<HiveStreamingRecord> records,
                                     AtomicReference<FlowFile> appendFlowFile,
                                     DataFileWriter<GenericRecord> avroWriter,
                                     DataFileStream<GenericRecord> reader) throws IOException {

    final FlowFile current = appendFlowFile.get();
    final FlowFile appended = session.append(current, (out) -> {
        // The writer is closed automatically once every record has been appended.
        try (DataFileWriter<GenericRecord> openWriter = avroWriter.create(reader.getSchema(), out)) {
            for (HiveStreamingRecord streamingRecord : records) {
                openWriter.append(streamingRecord.getRecord());
            }
            openWriter.flush();
        }
    });
    appendFlowFile.set(appended);
}
 
Example #7
Source File: TestAzureBlobAvroWriter.java    From samza with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the mocked collaborators and the spied {@code AzureBlobAvroWriter} shared by the tests.
 */
@Before
public void setup() throws Exception {
  // Executor limited to a single thread (core = max = 1) with a 60 second keep-alive.
  threadPool = new ThreadPoolExecutor(1, 1, 60,  TimeUnit.SECONDS, new LinkedBlockingDeque<>());
  ome = createOME("Topic1");

  // Fixed 100-byte array returned by the stubbed encodeRecord() below.
  encodedRecord = new byte[100];
  BlobContainerAsyncClient mockContainerAsyncClient = PowerMockito.mock(BlobContainerAsyncClient.class);
  mockDataFileWriter = mock(DataFileWriter.class);
  mockAzureBlobOutputStream = mock(AzureBlobOutputStream.class);
  mockBlockBlobAsyncClient = PowerMockito.mock(BlockBlobAsyncClient.class);
  when(mockBlockBlobAsyncClient.getBlobUrl()).thenReturn("https://samza.blob.core.windows.net/fake-blob-url");

  // Despite its name this is obtained from CompressionFactory, not created as a mock.
  mockCompression = CompressionFactory.getInstance().getCompression(CompressionType.GZIP);
  // Wrapped in spy() so that encodeRecord() can be stubbed on the writer under test.
  azureBlobAvroWriter =
      spy(new AzureBlobAvroWriter(mockContainerAsyncClient, mock(AzureBlobWriterMetrics.class), threadPool, THRESHOLD,
          60000, "test", mockDataFileWriter, mockAzureBlobOutputStream, mockBlockBlobAsyncClient,
          blobMetadataGeneratorFactory, blobMetadataGeneratorConfig, STREAM_NAME,
          Long.MAX_VALUE, Long.MAX_VALUE, mockCompression, false)); // keeping blob size and number of records unlimited
  // Encoding the test envelope's message always yields the fixed byte array above.
  doReturn(encodedRecord).when(azureBlobAvroWriter).encodeRecord((IndexedRecord) ome.getMessage());
}
 
Example #8
Source File: FsSpecProducer.java    From incubator-gobblin with Apache License 2.0 6 votes vote down vote up
/**
 * Serialises {@code jobSpec} as an Avro data file and publishes it under the spec consumer path.
 *
 * <p>The file is first written to a {@code _tmp} sub-directory and then renamed to its
 * final location, so the final path only appears once the file has been fully written.
 *
 * @param jobSpec the job spec to write; its URI is used as the file name
 * @throws IOException if the temporary file cannot be written or the rename fails
 */
private void writeAvroJobSpec(AvroJobSpec jobSpec) throws IOException {
  DatumWriter<AvroJobSpec> datumWriter = new SpecificDatumWriter<>(AvroJobSpec.SCHEMA$);

  Path jobSpecPath = new Path(this.specConsumerPath, jobSpec.getUri());

  //Write the new JobSpec to a temporary path first.
  Path tmpDir = new Path(this.specConsumerPath, "_tmp");
  if (!fs.exists(tmpDir)) {
    fs.mkdirs(tmpDir);
  }

  Path tmpJobSpecPath = new Path(tmpDir, jobSpec.getUri());

  // try-with-resources closes the writer (and the stream it wraps) even when create or
  // append throws, so a failed write no longer leaks the temporary file's stream.
  try (DataFileWriter<AvroJobSpec> dataFileWriter = new DataFileWriter<>(datumWriter)) {
    dataFileWriter.create(AvroJobSpec.SCHEMA$, fs.create(tmpJobSpecPath));
    dataFileWriter.append(jobSpec);
  }

  //Rename the JobSpec from temporary to final location.
  HadoopUtils.renamePath(fs, tmpJobSpecPath, jobSpecPath, true);
}
 
Example #9
Source File: RedshiftIT.java    From digdag with Apache License 2.0 6 votes vote down vote up
/**
 * Encodes the given rows as an in-memory Avro data file.
 *
 * @param fields  fields of the {@code testdata} record schema
 * @param records rows to encode; each map entry is stored under its key as a record field
 * @return the bytes of the resulting Avro data file
 * @throws IOException if writing to the Avro container fails
 */
private byte[] avroTestData(List<Schema.Field> fields, List<Map<String, Object>> records)
        throws IOException
{
    Schema recordSchema = Schema.createRecord("testdata", null, null, false);
    recordSchema.setFields(fields);

    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    GenericDatumWriter<GenericData.Record> recordWriter = new GenericDatumWriter<>(recordSchema);
    DataFileWriter<GenericData.Record> avroFile = new DataFileWriter<>(recordWriter);
    avroFile.create(recordSchema, buffer);

    for (Map<String, Object> row : records) {
        GenericData.Record avroRecord = new GenericData.Record(recordSchema);
        for (Map.Entry<String, Object> column : row.entrySet()) {
            avroRecord.put(column.getKey(), column.getValue());
        }
        avroFile.append(avroRecord);
    }
    avroFile.close();

    return buffer.toByteArray();
}
 
Example #10
Source File: PentahoAvroOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a record writer backed by a new Avro data file at {@code outputFilename}.
 *
 * <p>The schema is also written out via {@code writeAvroSchemaToFile} before the data
 * file is created.
 *
 * @return a writer that appends records to the newly created Avro file
 * @throws Exception if validation fails, a required setting is missing, or the schema or
 *         data file cannot be written
 */
@Override
public IPentahoRecordWriter createRecordWriter() throws Exception {
  validate();
  if ( fields == null || StringUtils.isEmpty( nameSpace ) || StringUtils.isEmpty( recordName ) || StringUtils
    .isEmpty( outputFilename ) ) {
    // The message lists exactly the settings checked above.
    throw new Exception(
      "Invalid state.  One of the following required fields is null or empty:  'fields', 'nameSpace', "
        + "'recordName', or 'outputFilename'" );
  }
  Schema schema = getSchema();
  writeAvroSchemaToFile( schemaFilename );
  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>( schema );
  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>( datumWriter );
  dataFileWriter.setCodec( codecFactory );
  dataFileWriter.create( schema, KettleVFS.getOutputStream( outputFilename, variableSpace, false ) );
  return new PentahoAvroRecordWriter( dataFileWriter, schema, fields );
}
 
Example #11
Source File: TestAvroEventDeserializer.java    From mt-flume with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a temporary Avro file holding two records whose {@code foo} field is
 * {@code "bar"} and {@code "baz"} respectively, with a sync marker after each record.
 *
 * @param deleteOnExit whether the file should be registered for deletion on JVM exit
 * @return the newly written temporary file
 * @throws IOException if the file cannot be created or written
 */
private File newTestFile(boolean deleteOnExit) throws IOException {
  File avroFile = File.createTempFile("testDirectFile", "tmp");
  if (deleteOnExit) {
    avroFile.deleteOnExit();
  }

  DataFileWriter<GenericRecord> fileWriter =
      new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(schema));
  fileWriter.create(schema, avroFile);

  // One record per value, each followed by a sync marker.
  for (String fooValue : new String[] {"bar", "baz"}) {
    GenericRecordBuilder builder = new GenericRecordBuilder(schema);
    builder.set("foo", fooValue);
    GenericRecord datum = builder.build();
    fileWriter.append(datum);
    fileWriter.sync();
  }

  fileWriter.flush();
  fileWriter.close();

  return avroFile;
}
 
Example #12
Source File: PutHiveStreaming.java    From nifi with Apache License 2.0 6 votes vote down vote up
/**
 * Configures {@code writer}, writes an Avro header to the flow file and returns that header.
 *
 * <p>The codec is set from {@code codec} and every metadata entry of {@code reader} whose
 * key is not in {@code RESERVED_METADATA} is copied to the writer.
 *
 * @return the Avro header bytes that were appended to the flow file
 */
private byte[] initAvroWriter(ProcessSession session, String codec, DataFileStream<GenericRecord> reader,
                                     DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef) {

    writer.setCodec(CodecFactory.fromString(codec));
    // Transfer metadata (this is a subset of the incoming file)
    for (String metaKey : reader.getMetaKeys()) {
        if (RESERVED_METADATA.contains(metaKey)) {
            continue;
        }
        writer.setMeta(metaKey, reader.getMeta(metaKey));
    }

    final ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream();
    final FlowFile withHeader = session.append(flowFileRef.get(), (out) -> {
        // Create writer so that records can be appended later; the header lands in the
        // side buffer first and is then copied to the flow file.
        writer.create(reader.getSchema(), headerBuffer);
        writer.close();

        out.write(headerBuffer.toByteArray());
    });
    flowFileRef.set(withHeader);

    // Capture the Avro header byte array that is just written to the FlowFile.
    // This is needed when Avro records are appended to the same FlowFile.
    return headerBuffer.toByteArray();
}
 
Example #13
Source File: PutHiveStreaming.java    From nifi with Apache License 2.0 6 votes vote down vote up
/**
 * Appends the Avro form of {@code hRecords} to the flow file in {@code flowFileRef}.
 *
 * <p>The writer is re-opened in append mode against the previously captured
 * {@code avroHeader}. An {@code IOException} raised while appending records is logged
 * and not rethrown.
 */
private void appendAvroRecords(ProcessSession session, byte[] avroHeader, DataFileWriter<GenericRecord> writer,
                               AtomicReference<FlowFile> flowFileRef, List<HiveStreamingRecord> hRecords) {

    flowFileRef.set(session.append(flowFileRef.get(), (out) -> {
        if (hRecords == null) {
            // Nothing to append; just close the writer.
            writer.close();
            return;
        }

        // Initialize the writer again as append mode, so that Avro header is written only once.
        writer.appendTo(new SeekableByteArrayInput(avroHeader), out);
        try {
            for (HiveStreamingRecord streamed : hRecords) {
                writer.append(streamed.getRecord());
            }
        } catch (IOException ioe) {
            // The records were put to Hive Streaming successfully, but there was an error while writing the
            // Avro records to the flow file. Log as an error and move on.
            logger.error("Error writing Avro records (which were sent successfully to Hive Streaming) to the flow file, " + ioe, ioe);
        }
        writer.close();
    }));
}
 
Example #14
Source File: StageRunData.java    From geowave with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the cached Avro writer for {@code typeName}, creating it on first use.
 *
 * <p>A new writer uses the snappy codec and writes to a file named {@code typeName}
 * under {@code hdfsBaseDirectory}. If creation fails with an {@code IOException}, the
 * stream opened for it is closed and {@code null} is cached so creation is not retried.
 *
 * @param typeName key of the writer and name of its output file
 * @param plugin   supplies the Avro schema for a newly created writer
 * @return the writer, or {@code null} if it could not be created
 */
private synchronized DataFileWriter getDataWriterCreateIfNull(
    final String typeName,
    final GeoWaveAvroFormatPlugin plugin) {
  if (!cachedWriters.containsKey(typeName)) {
    FSDataOutputStream out = null;
    final DataFileWriter dfw = new DataFileWriter(new GenericDatumWriter());
    dfw.setCodec(CodecFactory.snappyCodec());
    try {
      // TODO: we should probably clean up the type name to make it
      // HDFS path safe in case there are invalid characters
      // also, if a file already exists do we want to delete it or
      // append to it?
      out = fs.create(new Path(hdfsBaseDirectory, typeName));
      dfw.create(plugin.getAvroSchema(), out);
      // Cache only after the writer is open, so an unexpected runtime failure above
      // cannot leave an unopened writer in the cache.
      cachedWriters.put(typeName, dfw);
    } catch (final IOException e) {
      LOGGER.error("Unable to create output stream", e);
      if (out != null) {
        // The stream was opened but the writer could not be created on it; release it.
        try {
          out.close();
        } catch (final IOException closeFailure) {
          LOGGER.error("Unable to close output stream after failed writer creation", closeFailure);
        }
      }
      // cache a null value so we don't continually try to recreate
      cachedWriters.put(typeName, null);
      return null;
    }
  }
  return cachedWriters.get(typeName);
}
 
Example #15
Source File: AbstractAvroEventSerializer.java    From mt-flume with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the datum writer and data file writer from the given context.
 *
 * <p>Reads the sync interval and compression codec name (falling back to the class
 * defaults). If the codec cannot be instantiated a warning is logged and no codec is set.
 *
 * @param context source of the serializer settings
 */
@Override
public void configure(Context context) {

  final int syncInterval =
      context.getInteger(SYNC_INTERVAL_BYTES, DEFAULT_SYNC_INTERVAL_BYTES);
  final String codecName =
      context.getString(COMPRESSION_CODEC, DEFAULT_COMPRESSION_CODEC);

  writer = new ReflectDatumWriter<T>(getSchema());
  dataFileWriter = new DataFileWriter<T>(writer);
  dataFileWriter.setSyncInterval(syncInterval);

  try {
    dataFileWriter.setCodec(CodecFactory.fromString(codecName));
  } catch (AvroRuntimeException e) {
    logger.warn("Unable to instantiate avro codec with name (" +
        codecName + "). Compression disabled. Exception follows.", e);
  }
}
 
Example #16
Source File: JdbcAvroIO.java    From dbeam with Apache License 2.0 6 votes vote down vote up
/**
 * Opens the JDBC connection and the Avro data file writer for the given channel.
 *
 * <p>The writer is configured with the codec from {@code jdbcAvroArgs}, the configured
 * sync interval and a {@code created_by} metadata entry holding this class's canonical
 * name. Output goes through a {@code CountingOutputStream} wrapping the channel.
 *
 * @param channel destination of the Avro bytes
 * @throws Exception if the connection or the writer cannot be created
 */
@SuppressWarnings("deprecation") // uses internal test functionality.
@Override
protected void prepareWrite(WritableByteChannel channel) throws Exception {
  logger.info("jdbcavroio : Preparing write...");
  connection = jdbcAvroArgs.jdbcConnectionConfiguration().createConnection();

  Void destination = getDestination();
  Schema schema = dynamicDestinations.getSchema(destination);

  // Configure a local writer first and publish it to the field once fully configured.
  DataFileWriter<GenericRecord> avroWriter =
      new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema));
  avroWriter.setCodec(jdbcAvroArgs.getCodecFactory());
  avroWriter.setSyncInterval(syncInterval);
  dataFileWriter = avroWriter;

  dataFileWriter.setMeta("created_by", this.getClass().getCanonicalName());
  this.countingOutputStream = new CountingOutputStream(Channels.newOutputStream(channel));
  dataFileWriter.create(schema, this.countingOutputStream);
  logger.info("jdbcavroio : Write prepared");
}
 
Example #17
Source File: AvroRecordWriter.java    From presto with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a record writer that writes generic Avro records to {@code path}.
 *
 * <p>The schema is resolved from the job configuration and table properties. When
 * {@code isCompressed} is set, the output codec is read from the job configuration
 * (deflate with the configured level by default).
 *
 * @param path         output file location
 * @param jobConf      job configuration used for schema, codec and file system lookup
 * @param isCompressed whether a compression codec should be applied
 * @param properties   table properties used for schema resolution
 * @throws IOException if the schema cannot be determined or the file cannot be created
 */
public AvroRecordWriter(Path path, JobConf jobConf, boolean isCompressed, Properties properties)
        throws IOException
{
    Schema avroSchema;
    try {
        avroSchema = AvroSerdeUtils.determineSchemaOrThrowException(jobConf, properties);
    }
    catch (AvroSerdeException e) {
        throw new IOException(e);
    }

    DataFileWriter<GenericRecord> avroFileWriter =
            new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(avroSchema));

    if (isCompressed) {
        int deflateLevel = jobConf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = jobConf.get(OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory codec;
        if (codecName.equals(DEFLATE_CODEC)) {
            // Only deflate takes the configured compression level.
            codec = CodecFactory.deflateCodec(deflateLevel);
        }
        else {
            codec = CodecFactory.fromString(codecName);
        }
        avroFileWriter.setCodec(codec);
    }

    outputStream = path.getFileSystem(jobConf).create(path);
    avroFileWriter.create(avroSchema, outputStream);
    delegate = new AvroGenericRecordWriter(avroFileWriter);
}
 
Example #18
Source File: OsmPbfParser.java    From geowave with Apache License 2.0 5 votes vote down vote up
/**
 * Stores the three Avro writers on this parser.
 *
 * @param nodeWriter     writer assigned to the {@code nodeWriter} field
 * @param wayWriter      writer assigned to the {@code wayWriter} field
 * @param relationWriter writer assigned to the {@code relationWriter} field
 */
public void setupWriter(
    final DataFileWriter nodeWriter,
    final DataFileWriter wayWriter,
    final DataFileWriter relationWriter) {
  this.relationWriter = relationWriter;
  this.wayWriter = wayWriter;
  this.nodeWriter = nodeWriter;
}
 
Example #19
Source File: AvroKeyValueWithMetadataRecordWriter.java    From datafu with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a key/value record writer that also copies text metadata from the configuration.
 *
 * <p>Every configuration entry whose key starts with {@code TEXT_PREFIX} is stored as
 * file metadata under the key with that prefix removed.
 *
 * @param keyConverter     converter providing the key writer schema
 * @param valueConverter   converter providing the value writer schema
 * @param compressionCodec codec applied to the Avro container file
 * @param outputStream     destination of the Avro container file
 * @param conf             configuration scanned for metadata entries
 * @throws IOException if the Avro container cannot be created on the stream
 */
public AvroKeyValueWithMetadataRecordWriter(AvroDatumConverter<K, ?> keyConverter,
    AvroDatumConverter<V, ?> valueConverter, CodecFactory compressionCodec,
    OutputStream outputStream, Configuration conf) throws IOException {
  // Keep a reference to the converters.
  mKeyConverter = keyConverter;
  mValueConverter = valueConverter;

  // Create the generic record schema for the key/value pair.
  mKeyValuePairSchema = AvroKeyValue.getSchema(
      keyConverter.getWriterSchema(), valueConverter.getWriterSchema());

  // Create an Avro container file and a writer to it.
  mAvroFileWriter = new DataFileWriter<GenericRecord>(
      new ReflectDatumWriter<GenericRecord>(mKeyValuePairSchema));
  mAvroFileWriter.setCodec(compressionCodec);

  // Metadata has to be set before the container is created.
  for (Entry<String,String> property : conf)
  {
    String key = property.getKey();
    if (!key.startsWith(TEXT_PREFIX))
    {
      continue;
    }
    mAvroFileWriter.setMeta(key.substring(TEXT_PREFIX.length()), property.getValue());
  }

  mAvroFileWriter.create(mKeyValuePairSchema, outputStream);

  // Create a reusable output record.
  mOutputRecord = new AvroKeyValue<Object, Object>(new GenericData.Record(mKeyValuePairSchema));
}
 
Example #20
Source File: AvroSpoolDirSourceTestUtil.java    From datacollector with Apache License 2.0 5 votes vote down vote up
/**
 * Writes an Avro data file named {@code file-0.avro} into a fresh test directory.
 *
 * <p>The file holds three employee records ({@code a}, {@code b}, {@code c}, in that
 * order), each referencing the same {@code boss} record.
 *
 * @return the written file
 * @throws Exception if the directory or file cannot be created or written
 */
public static File createAvroDataFile() throws Exception {
  File avroFile = new File(createTestDir(), "file-0.avro");
  Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);

  GenericRecord boss = newPerson(schema, "boss", 60, "boss@company.com", "boss2@company.com", null);
  GenericRecord e3 = newPerson(schema, "c", 50, "c@company.com", "c2@company.com", boss);
  GenericRecord e2 = newPerson(schema, "b", 40, "b@company.com", "b2@company.com", boss);
  GenericRecord e1 = newPerson(schema, "a", 30, "a@company.com", "a2@company.com", boss);

  DatumWriter<GenericRecord> recordWriter = new GenericDatumWriter<>(schema);
  DataFileWriter<GenericRecord> avroWriter = new DataFileWriter<>(recordWriter);
  avroWriter.create(schema, avroFile);
  avroWriter.append(e1);
  avroWriter.append(e2);
  avroWriter.append(e3);

  avroWriter.flush();
  avroWriter.close();

  return avroFile;
}

/** Builds one person record with the given name, age, two e-mail addresses and boss. */
private static GenericRecord newPerson(Schema schema, String name, int age,
    String firstEmail, String secondEmail, GenericRecord boss) {
  GenericRecord person = new GenericData.Record(schema);
  person.put("name", name);
  person.put("age", age);
  person.put("emails", ImmutableList.of(firstEmail, secondEmail));
  person.put("boss", boss);
  return person;
}
 
Example #21
Source File: AvroFileGenerator.java    From flink-perf with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a pipe-delimited orders text file into an Avro data file of {@code Order} records.
 *
 * <p>Expects exactly two arguments: the input orders file and the output Avro file. With
 * any other argument count a usage message is printed and the JVM exits with status 1.
 *
 * @param args {@code [inputFilePath, outputAvroPath]}
 * @throws Exception if the input cannot be read or parsed, or the Avro file cannot be written
 */
public static void main(String[] args) throws Exception {
	if (args.length != 2) {
		System.err.println("Usage: <inputFilePath> <outputAvroPath>");
		System.exit(1);
		return;
	}

	// generate only avro file
	ordersPath = args[0];
	outputOrderAvroPath = args[1];
	// Generate file for avro test
	DatumWriter<Order> orderDatumWriter = new SpecificDatumWriter<Order>(Order.class);
	DataFileWriter<Order> dataFileWriter = new DataFileWriter<Order>(orderDatumWriter);
	// The finally blocks release the writer and scanners even when a line fails to
	// parse or an append fails; close() also flushes pending data.
	try {
		dataFileWriter.create(Order.getClassSchema(), new File(outputOrderAvroPath));
		Scanner s = new Scanner(new File(ordersPath));
		try {
			while (s.hasNextLine()) {
				@SuppressWarnings("resource")
				Scanner lineScanner = new Scanner(s.nextLine()).useDelimiter("\\|");
				try {
					// Columns are read in the order they appear on the line.
					Order o = new Order();
					o.setOOrderkey(lineScanner.nextInt());
					o.setOCustkey(lineScanner.nextInt());
					o.setOOrderstatus(lineScanner.next());
					o.setOTotalprice(lineScanner.nextFloat());
					o.setOOrderdate(lineScanner.next());
					o.setOOrderpriority(lineScanner.next());
					o.setOClerk(lineScanner.next());
					o.setOShipproprity(lineScanner.nextInt());
					o.setOComment(lineScanner.next());
					dataFileWriter.append(o);
				} finally {
					lineScanner.close();
				}
			}
		} finally {
			s.close();
		}
	} finally {
		dataFileWriter.close();
	}
}
 
Example #22
Source File: SdcAvroTestUtil.java    From datacollector with Apache License 2.0 5 votes vote down vote up
/**
 * Writes an Avro data file named {@code file-0.avro} into a fresh test directory.
 *
 * <p>The file holds the records {@code a}, {@code b} and {@code c} (in that order); all
 * three point at a shared {@code boss} record whose own boss is {@code null}.
 *
 * @return the written file
 * @throws Exception if the directory or file cannot be created or written
 */
public static File createAvroDataFile() throws Exception {
  final File target = new File(createTestDir(), "file-0.avro");
  final Schema personSchema = new Schema.Parser().parse(AVRO_SCHEMA);

  // The manager every employee below refers to.
  final GenericRecord manager = new GenericData.Record(personSchema);
  manager.put("name", "boss");
  manager.put("age", 60);
  manager.put("emails", ImmutableList.of("boss@company.com", "boss2@company.com"));
  manager.put("boss", null);

  final GenericRecord third = new GenericData.Record(personSchema);
  third.put("name", "c");
  third.put("age", 50);
  third.put("emails", ImmutableList.of("c@company.com", "c2@company.com"));
  third.put("boss", manager);

  final GenericRecord second = new GenericData.Record(personSchema);
  second.put("name", "b");
  second.put("age", 40);
  second.put("emails", ImmutableList.of("b@company.com", "b2@company.com"));
  second.put("boss", manager);

  final GenericRecord first = new GenericData.Record(personSchema);
  first.put("name", "a");
  first.put("age", 30);
  first.put("emails", ImmutableList.of("a@company.com", "a2@company.com"));
  first.put("boss", manager);

  final DataFileWriter<GenericRecord> avroWriter =
      new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(personSchema));
  avroWriter.create(personSchema, target);
  for (GenericRecord employee : new GenericRecord[] {first, second, third}) {
    avroWriter.append(employee);
  }

  avroWriter.flush();
  avroWriter.close();

  return target;
}
 
Example #23
Source File: WriteAvroResultWithSchema.java    From nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a result writer that emits an Avro container (with the given schema) to {@code out}.
 *
 * @param schema schema used for the datum writer and the container
 * @param out    destination stream, also passed to the superclass
 * @param codec  codec set on the data file writer
 * @throws IOException if the container cannot be created on the stream
 */
public WriteAvroResultWithSchema(final Schema schema, final OutputStream out, final CodecFactory codec) throws IOException {
    super(out);
    this.schema = schema;

    final GenericDatumWriter<GenericRecord> recordWriter = new GenericDatumWriter<>(schema);
    this.dataFileWriter = new DataFileWriter<>(recordWriter);
    // The codec must be configured before the container is created.
    this.dataFileWriter.setCodec(codec);
    this.dataFileWriter.create(schema, out);
}
 
Example #24
Source File: AzureBlobAvroWriter.java    From samza with Apache License 2.0 5 votes vote down vote up
/**
 * Bundles the three components used to write one Azure blob; none of them may be null.
 *
 * @param dataFileWriter        Avro writer for the blob
 * @param azureBlobOutputStream output stream of the blob
 * @param blockBlobAsyncClient  async client of the blob
 */
public BlobWriterComponents(DataFileWriter dataFileWriter, AzureBlobOutputStream azureBlobOutputStream,
    BlockBlobAsyncClient blockBlobAsyncClient) {
  // checkNotNull hands back its argument, so each field is validated as it is assigned.
  this.dataFileWriter = Preconditions.checkNotNull(dataFileWriter,
      "DataFileWriter can not be null when creating WriterComponents for an Azure Blob.");
  this.azureBlobOutputStream = Preconditions.checkNotNull(azureBlobOutputStream,
      "AzureBlobOutputStream can not be null when creating WriterComponents for an Azure Blob.");
  this.blockBlobAsyncClient = Preconditions.checkNotNull(blockBlobAsyncClient,
      "BlockBlobAsyncClient can not be null when creating WriterComponents for an Azure Blob.");
}
 
Example #25
Source File: AvroTestTools.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Writes all records from {@code input} to an Avro data file at {@code outputPath},
 * overwriting any existing file.
 *
 * @param input      records to write
 * @param schema     schema of the Avro container
 * @param fs         file system on which the file is created
 * @param outputPath location of the file
 * @throws IOException if the file cannot be created or a record cannot be written
 */
private void writeAsAvroBinary(Iterator<GenericRecord> input, Schema schema, FileSystem fs,
    Path outputPath) throws IOException {

  // Typed writer instead of raw types; try-with-resources closes it (and the stream it
  // wraps) even if appending a record fails.
  try (DataFileWriter<GenericRecord> writer =
      new DataFileWriter<>(new GenericDatumWriter<GenericRecord>())) {
    writer.create(schema, fs.create(outputPath, true));
    while (input.hasNext()) {
      writer.append(input.next());
    }
  }

  log.info("Successfully wrote avro file to path " + outputPath);
}
 
Example #26
Source File: AvroRecordWriter.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Applies compression, sync interval and metadata settings from the job to the writer.
 *
 * <p>Job entries prefixed with {@code AvroJob.TEXT_PREFIX} become string metadata;
 * entries prefixed with {@code AvroJob.BINARY_PREFIX} are URL-decoded and stored as
 * ISO-8859-1 bytes. In both cases the prefix is stripped from the metadata key.
 *
 * @param writer writer to configure
 * @param job    job configuration to read from
 * @throws UnsupportedEncodingException if ISO-8859-1 is not supported
 */
static void configureDataFileWriter(DataFileWriter<GenericData.Record> writer,
    JobConf job) throws UnsupportedEncodingException {
  if (FileOutputFormat.getCompressOutput(job)) {
    final int deflateLevel = job.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
    final String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
    final CodecFactory codec;
    if (codecName.equals(DEFLATE_CODEC)) {
      codec = CodecFactory.deflateCodec(deflateLevel);
    } else {
      codec = CodecFactory.fromString(codecName);
    }
    writer.setCodec(codec);
  }

  // Do max as core-default.xml has io.file.buffer.size as 4K
  final int bufferSize = job.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL);
  final int fallbackInterval = Math.max(bufferSize, DEFAULT_SYNC_INTERVAL);
  writer.setSyncInterval(job.getInt(SYNC_INTERVAL_KEY, fallbackInterval));

  // copy metadata from job
  for (Map.Entry<String,String> property : job) {
    final String key = property.getKey();
    if (key.startsWith(AvroJob.TEXT_PREFIX)) {
      writer.setMeta(key.substring(AvroJob.TEXT_PREFIX.length()), property.getValue());
    }
    // Deliberately a second independent check rather than an else-branch.
    if (key.startsWith(AvroJob.BINARY_PREFIX)) {
      final byte[] decoded =
          URLDecoder.decode(property.getValue(), "ISO-8859-1").getBytes("ISO-8859-1");
      writer.setMeta(key.substring(AvroJob.BINARY_PREFIX.length()), decoded);
    }
  }
}
 
Example #27
Source File: AvroRowWriter.java    From beam with Apache License 2.0 5 votes vote down vote up
AvroRowWriter(
    String basename,
    Schema schema,
    SerializableFunction<AvroWriteRequest<T>, AvroT> toAvroRecord,
    SerializableFunction<Schema, DatumWriter<AvroT>> writerFactory)
    throws Exception {
  super(basename, MimeTypes.BINARY);

  this.schema = schema;
  this.toAvroRecord = toAvroRecord;
  this.writer =
      new DataFileWriter<>(writerFactory.apply(schema)).create(schema, getOutputStream());
}
 
Example #28
Source File: AvroSink.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Opens the Avro data file writer for the current destination on the given channel.
 *
 * <p>Codec, schema and metadata come from {@code dynamicDestinations}. Metadata values
 * must be {@code String}, {@code Long} or {@code byte[]}.
 *
 * @param channel destination of the Avro bytes
 * @throws IllegalStateException if a metadata value has an unsupported type
 * @throws Exception if the writer cannot be created
 */
@SuppressWarnings("deprecation") // uses internal test functionality.
@Override
protected void prepareWrite(WritableByteChannel channel) throws Exception {
  DestinationT destination = getDestination();
  CodecFactory codec = dynamicDestinations.getCodec(destination);
  Schema schema = dynamicDestinations.getSchema(destination);
  Map<String, Object> metadata = dynamicDestinations.getMetadata(destination);

  DatumWriter<OutputT> datumWriter;
  if (genericRecords) {
    datumWriter = new GenericDatumWriter<>(schema);
  } else {
    datumWriter = new ReflectDatumWriter<>(schema);
  }
  dataFileWriter = new DataFileWriter<>(datumWriter).setCodec(codec);

  for (Map.Entry<String, Object> entry : metadata.entrySet()) {
    String key = entry.getKey();
    Object value = entry.getValue();
    // Dispatch on the runtime type to select the matching setMeta overload.
    if (value instanceof String) {
      dataFileWriter.setMeta(key, (String) value);
    } else if (value instanceof Long) {
      dataFileWriter.setMeta(key, (Long) value);
    } else if (value instanceof byte[]) {
      dataFileWriter.setMeta(key, (byte[]) value);
    } else {
      throw new IllegalStateException(
          "Metadata value type must be one of String, Long, or byte[]. Found "
              + value.getClass().getSimpleName());
    }
  }

  dataFileWriter.create(schema, Channels.newOutputStream(channel));
}
 
Example #29
Source File: Purge.java    From Cubert with Apache License 2.0 5 votes vote down vote up
/**
 * Copies the records of the Avro file {@code src} to a new writer, dropping every record
 * whose {@code columnName} value is contained in {@code membersToPurge}.
 *
 * <p>Updates {@code numRecords}, {@code remainingRecords} and {@code recordsPurged}.
 * Records with a null column value are kept; null records are counted but not copied.
 *
 * @param src path of the Avro file to read
 * @param dst not used by this method itself
 * @throws IOException if reading or writing fails
 */
private void purge(String src, String dst) throws IOException
{
    DataFileReader<GenericRecord> dataFileReader = createDataFileReader(src, false);
    // The finally blocks close the writer first and then the reader, even when the copy
    // loop fails part-way through.
    try
    {
        DataFileWriter<GenericRecord> writer = createDataFileWriter(dataFileReader);
        try
        {
            numRecords = 0;
            recordsPurged = 0;
            remainingRecords = 0;

            // Copy
            while (dataFileReader.hasNext())
            {
                numRecords++;
                GenericRecord record = dataFileReader.next();
                if (record == null)
                {
                    continue;
                }

                Number column = (Number) record.get(columnName);
                if ((column == null) || (!membersToPurge.contains(column.intValue())))
                {
                    remainingRecords++;
                    writer.append(record);
                }
            }

            recordsPurged = numRecords - remainingRecords;
        }
        finally
        {
            writer.close();
        }
    }
    finally
    {
        dataFileReader.close();
    }
}
 
Example #30
Source File: SparkVerifierTest.java    From tablasco with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the given records to {@code avroFile}, creating its parent directory if needed.
 *
 * <p>The schema of the first record is used for the whole file.
 *
 * @param data     records to write; must contain at least one record
 * @param avroFile destination file
 * @throws IOException if the directory or file cannot be created or written
 * @throws IndexOutOfBoundsException if {@code data} is empty
 */
private static void writeAvroData(List<GenericRecord> data, File avroFile) throws IOException
{
    FileUtils.forceMkdir(avroFile.getParentFile());
    Schema schema = data.get(0).getSchema();
    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    // try-with-resources closes the file-backed writer even if appending a record fails.
    try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter))
    {
        dataFileWriter.create(schema, avroFile);
        for (GenericRecord genericRecord : data)
        {
            dataFileWriter.append(genericRecord);
        }
    }
}