org.apache.avro.file.DataFileWriter Java Examples
The following examples show how to use
org.apache.avro.file.DataFileWriter.
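Before the project-specific examples, here is a minimal, self-contained sketch of the typical DataFileWriter life cycle: parse a schema, wrap a GenericDatumWriter in a DataFileWriter, optionally set a compression codec, create the container file, append records, and close. The schema, field names, and file path below are illustrative only and are not taken from any of the projects that follow.

import java.io.File;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class DataFileWriterQuickstart {
    public static void main(String[] args) throws IOException {
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
                + "{\"name\":\"name\",\"type\":\"string\"},"
                + "{\"name\":\"age\",\"type\":\"int\"}]}");

        GenericRecord user = new GenericData.Record(schema);
        user.put("name", "alice");
        user.put("age", 30);

        File file = new File("users.avro");

        // The codec must be set before create(); close() flushes the final block.
        try (DataFileWriter<GenericRecord> writer =
                new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
            writer.setCodec(CodecFactory.deflateCodec(5));
            writer.create(schema, file);
            writer.append(user);
        }

        // Read the container file back; the schema is stored in the file header.
        try (DataFileReader<GenericRecord> reader =
                new DataFileReader<>(file, new GenericDatumReader<GenericRecord>())) {
            for (GenericRecord record : reader) {
                System.out.println(record);
            }
        }
    }
}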
Example #1
Source File: DailyTrackingWriter.java From datafu with Apache License 2.0 | 7 votes |
public void open(int year, int month, int day) throws IOException {
    if (_dataWriter != null) {
        throw new RuntimeException("Already have data writer");
    }

    Path dailyPath = _outputPath;
    Path path = new Path(dailyPath, String.format("%04d/%02d/%02d", year, month, day));

    _outputStream = _fs.create(new Path(path, "part-00000.avro"));

    GenericDatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>();
    _dataWriter = new DataFileWriter<GenericRecord>(writer);
    _dataWriter.create(_schema, _outputStream);
}
Example #2
Source File: AvroAppender.java From tajo with Apache License 2.0 | 6 votes |
/**
 * Initializes the Appender.
 */
public void init() throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    FSDataOutputStream outputStream = fs.create(path, false);

    avroSchema = AvroUtil.getAvroSchema(meta, conf);
    avroFields = avroSchema.getFields();

    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(avroSchema);
    dataFileWriter = new DataFileWriter<>(datumWriter);
    dataFileWriter.create(avroSchema, outputStream);

    if (tableStatsEnabled) {
        this.stats = new TableStatistics(schema, columnStatsEnabled);
    }
    super.init();
}
Example #3
Source File: TestExtractAvroMetadata.java From localization_nifi with Apache License 2.0 | 6 votes |
@Test
public void testExtractionWithCodec() throws IOException {
    final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
    runner.setProperty(ExtractAvroMetadata.METADATA_KEYS, AVRO_CODEC_ATTR); // test dynamic attribute avro.codec

    final Schema schema = new Schema.Parser().parse(new File("src/test/resources/array.avsc"));

    final GenericData.Array<String> data = new GenericData.Array<>(schema, Arrays.asList("one", "two", "three"));
    final DatumWriter<GenericData.Array<String>> datumWriter = new GenericDatumWriter<>(schema);

    final ByteArrayOutputStream out = new ByteArrayOutputStream();
    final DataFileWriter<GenericData.Array<String>> dataFileWriter = new DataFileWriter<>(datumWriter);
    dataFileWriter.setCodec(CodecFactory.deflateCodec(1));
    dataFileWriter.create(schema, out);
    dataFileWriter.append(data);
    dataFileWriter.close();

    runner.enqueue(out.toByteArray());
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);

    final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
    flowFile.assertAttributeEquals("avro.codec", "deflate");
}
Example #4
Source File: TestConvertAvroToORC.java From nifi with Apache License 2.0 | 6 votes |
@Test
public void test_onTrigger_routing_to_failure_null_type() throws Exception {
    String testString = "Hello World";
    GenericData.Record record = TestNiFiOrcUtils.buildAvroRecordWithNull(testString);

    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    fileWriter.create(record.getSchema(), out);
    fileWriter.append(record);
    fileWriter.flush();
    fileWriter.close();
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test.avro");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);

    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS test_record (string STRING, null BOOLEAN) STORED AS ORC",
            resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
}
Example #5
Source File: AvroUtils.java From Cubert with Apache License 2.0 | 6 votes |
public static void createFileIfNotExists(BlockSchema fileSchema, String path) throws IOException {
    Configuration conf = new JobConf();
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(new Path(path)))
        return;

    Schema avroSchema = convertFromBlockSchema("CUBERT_MV_RECORD", fileSchema);
    System.out.println("Creating avro file with schema = " + avroSchema);

    GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(avroSchema);
    DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(datumWriter);

    FSDataOutputStream fout = FileSystem.create(fs,
            new Path(path),
            new FsPermission(FsAction.ALL, FsAction.READ_EXECUTE, FsAction.READ_EXECUTE));
    writer.create(avroSchema, fout);
    writer.flush();
    writer.close();
}
Example #6
Source File: PutHiveStreaming.java From localization_nifi with Apache License 2.0 | 6 votes |
private void appendRecordsToFlowFile(ProcessSession session,
                                     List<HiveStreamingRecord> records,
                                     AtomicReference<FlowFile> appendFlowFile,
                                     DataFileWriter<GenericRecord> avroWriter,
                                     DataFileStream<GenericRecord> reader) throws IOException {
    appendFlowFile.set(session.append(appendFlowFile.get(), (out) -> {
        try (DataFileWriter<GenericRecord> writer = avroWriter.create(reader.getSchema(), out)) {
            for (HiveStreamingRecord sRecord : records) {
                writer.append(sRecord.getRecord());
            }
            writer.flush();
        }
    }));
}
Example #7
Source File: TestAzureBlobAvroWriter.java From samza with Apache License 2.0 | 6 votes |
@Before
public void setup() throws Exception {
    threadPool = new ThreadPoolExecutor(1, 1, 60, TimeUnit.SECONDS, new LinkedBlockingDeque<>());

    ome = createOME("Topic1");
    encodedRecord = new byte[100];

    BlobContainerAsyncClient mockContainerAsyncClient = PowerMockito.mock(BlobContainerAsyncClient.class);
    mockDataFileWriter = mock(DataFileWriter.class);
    mockAzureBlobOutputStream = mock(AzureBlobOutputStream.class);
    mockBlockBlobAsyncClient = PowerMockito.mock(BlockBlobAsyncClient.class);
    when(mockBlockBlobAsyncClient.getBlobUrl()).thenReturn("https://samza.blob.core.windows.net/fake-blob-url");

    mockCompression = CompressionFactory.getInstance().getCompression(CompressionType.GZIP);

    azureBlobAvroWriter = spy(new AzureBlobAvroWriter(
            mockContainerAsyncClient, mock(AzureBlobWriterMetrics.class), threadPool, THRESHOLD, 60000, "test",
            mockDataFileWriter, mockAzureBlobOutputStream, mockBlockBlobAsyncClient,
            blobMetadataGeneratorFactory, blobMetadataGeneratorConfig, STREAM_NAME,
            Long.MAX_VALUE, Long.MAX_VALUE, mockCompression, false)); // keeping blob size and number of records unlimited

    doReturn(encodedRecord).when(azureBlobAvroWriter).encodeRecord((IndexedRecord) ome.getMessage());
}
Example #8
Source File: FsSpecProducer.java From incubator-gobblin with Apache License 2.0 | 6 votes |
private void writeAvroJobSpec(AvroJobSpec jobSpec) throws IOException {
    DatumWriter<AvroJobSpec> datumWriter = new SpecificDatumWriter<>(AvroJobSpec.SCHEMA$);
    DataFileWriter<AvroJobSpec> dataFileWriter = new DataFileWriter<>(datumWriter);

    Path jobSpecPath = new Path(this.specConsumerPath, jobSpec.getUri());

    // Write the new JobSpec to a temporary path first.
    Path tmpDir = new Path(this.specConsumerPath, "_tmp");
    if (!fs.exists(tmpDir)) {
        fs.mkdirs(tmpDir);
    }

    Path tmpJobSpecPath = new Path(tmpDir, jobSpec.getUri());

    OutputStream out = fs.create(tmpJobSpecPath);

    dataFileWriter.create(AvroJobSpec.SCHEMA$, out);
    dataFileWriter.append(jobSpec);
    dataFileWriter.close();

    // Rename the JobSpec from temporary to final location.
    HadoopUtils.renamePath(fs, tmpJobSpecPath, jobSpecPath, true);
}
Example #9
Source File: RedshiftIT.java From digdag with Apache License 2.0 | 6 votes |
private byte[] avroTestData(List<Schema.Field> fields, List<Map<String, Object>> records) throws IOException {
    Schema schema = Schema.createRecord("testdata", null, null, false);
    schema.setFields(fields);

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    GenericDatumWriter<GenericData.Record> datum = new GenericDatumWriter<>(schema);
    DataFileWriter<GenericData.Record> writer = new DataFileWriter<>(datum);
    writer.create(schema, out);
    for (Map<String, Object> record : records) {
        GenericData.Record r = new GenericData.Record(schema);
        for (Map.Entry<String, Object> item : record.entrySet()) {
            r.put(item.getKey(), item.getValue());
        }
        writer.append(r);
    }
    writer.close();

    return out.toByteArray();
}
Example #10
Source File: PentahoAvroOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
@Override
public IPentahoRecordWriter createRecordWriter() throws Exception {
    validate();
    if ( fields == null || StringUtils.isEmpty( nameSpace ) || StringUtils.isEmpty( recordName )
            || StringUtils.isEmpty( outputFilename ) ) {
        throw new Exception( "Invalid state. One of the following required fields is null: 'nameSpace', 'recordNum', or 'outputFileName" );
    }
    Schema schema = getSchema();
    writeAvroSchemaToFile( schemaFilename );

    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>( schema );
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>( datumWriter );
    dataFileWriter.setCodec( codecFactory );
    dataFileWriter.create( schema, KettleVFS.getOutputStream( outputFilename, variableSpace, false ) );
    return new PentahoAvroRecordWriter( dataFileWriter, schema, fields );
}
Example #11
Source File: TestAvroEventDeserializer.java From mt-flume with Apache License 2.0 | 6 votes |
private File newTestFile(boolean deleteOnExit) throws IOException {
    File tempFile = File.createTempFile("testDirectFile", "tmp");
    if (deleteOnExit) {
        tempFile.deleteOnExit();
    }

    DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(
            new GenericDatumWriter<GenericRecord>(schema));
    writer.create(schema, tempFile);
    GenericRecordBuilder recordBuilder;
    recordBuilder = new GenericRecordBuilder(schema);
    recordBuilder.set("foo", "bar");
    GenericRecord record = recordBuilder.build();
    writer.append(record);
    writer.sync();
    recordBuilder = new GenericRecordBuilder(schema);
    recordBuilder.set("foo", "baz");
    record = recordBuilder.build();
    writer.append(record);
    writer.sync();
    writer.flush();
    writer.close();

    return tempFile;
}
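In the Flume test above, each writer.sync() call ends the current block and writes a sync marker; DataFileWriter.sync() also returns a position that can later be passed to DataFileReader.seek(long) to start reading at the records written after that marker. The following is a minimal sketch of that pairing; the file name, method name, and records are placeholders and are not part of the Flume test.

static void writeWithSyncPoint(Schema schema, GenericRecord firstRecord, GenericRecord secondRecord)
        throws IOException {
    File file = new File("events.avro");
    long syncPos;

    try (DataFileWriter<GenericRecord> writer =
            new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
        writer.create(schema, file);
        writer.append(firstRecord);
        syncPos = writer.sync();   // ends the current block; value is usable with DataFileReader.seek()
        writer.append(secondRecord);
    }

    try (DataFileReader<GenericRecord> reader =
            new DataFileReader<>(file, new GenericDatumReader<GenericRecord>(schema))) {
        reader.seek(syncPos);      // jump past the first block
        while (reader.hasNext()) {
            System.out.println(reader.next());   // prints only secondRecord
        }
    }
}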
Example #12
Source File: PutHiveStreaming.java From nifi with Apache License 2.0 | 6 votes |
private byte[] initAvroWriter(ProcessSession session, String codec, DataFileStream<GenericRecord> reader,
                              DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef) {

    writer.setCodec(CodecFactory.fromString(codec));
    // Transfer metadata (this is a subset of the incoming file)
    for (String metaKey : reader.getMetaKeys()) {
        if (!RESERVED_METADATA.contains(metaKey)) {
            writer.setMeta(metaKey, reader.getMeta(metaKey));
        }
    }

    final ByteArrayOutputStream avroHeader = new ByteArrayOutputStream();
    flowFileRef.set(session.append(flowFileRef.get(), (out) -> {
        // Create writer so that records can be appended later.
        writer.create(reader.getSchema(), avroHeader);
        writer.close();

        final byte[] header = avroHeader.toByteArray();
        out.write(header);
    }));

    // Capture the Avro header byte array that is just written to the FlowFile.
    // This is needed when Avro records are appended to the same FlowFile.
    return avroHeader.toByteArray();
}
Example #13
Source File: PutHiveStreaming.java From nifi with Apache License 2.0 | 6 votes |
private void appendAvroRecords(ProcessSession session, byte[] avroHeader, DataFileWriter<GenericRecord> writer,
                               AtomicReference<FlowFile> flowFileRef, List<HiveStreamingRecord> hRecords) {

    flowFileRef.set(session.append(flowFileRef.get(), (out) -> {
        if (hRecords != null) {
            // Initialize the writer again as append mode, so that Avro header is written only once.
            writer.appendTo(new SeekableByteArrayInput(avroHeader), out);
            try {
                for (HiveStreamingRecord hRecord : hRecords) {
                    writer.append(hRecord.getRecord());
                }
            } catch (IOException ioe) {
                // The records were put to Hive Streaming successfully, but there was an error while writing the
                // Avro records to the flow file. Log as an error and move on.
                logger.error("Error writing Avro records (which were sent successfully to Hive Streaming) to the flow file, " + ioe, ioe);
            }
        }
        writer.close();
    }));
}
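Examples #12 and #13 split one technique across two methods: write the Avro container header once, capture its bytes, and later re-open a DataFileWriter in append mode with appendTo(SeekableByteArrayInput, OutputStream) so that only new data blocks are emitted. Below is a condensed sketch of the same pattern outside NiFi; the method name and the two-part return value are illustrative, not part of the NiFi processor.

static byte[][] writeHeaderThenAppend(Schema schema, List<GenericRecord> records) throws IOException {
    // 1) Emit only the container header (schema, codec, sync marker) and capture its bytes.
    ByteArrayOutputStream header = new ByteArrayOutputStream();
    try (DataFileWriter<GenericRecord> writer =
            new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
        writer.create(schema, header);
    }
    byte[] headerBytes = header.toByteArray();

    // 2) Re-open in append mode against the captured header; only data blocks are written to "blocks".
    ByteArrayOutputStream blocks = new ByteArrayOutputStream();
    try (DataFileWriter<GenericRecord> writer =
            new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
        writer.appendTo(new SeekableByteArrayInput(headerBytes), blocks);
        for (GenericRecord record : records) {
            writer.append(record);
        }
    }

    // Concatenating headerBytes followed by blocks.toByteArray() yields one valid Avro data file.
    return new byte[][] {headerBytes, blocks.toByteArray()};
}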
Example #14
Source File: StageRunData.java From geowave with Apache License 2.0 | 6 votes |
private synchronized DataFileWriter getDataWriterCreateIfNull(
        final String typeName,
        final GeoWaveAvroFormatPlugin plugin) {
    if (!cachedWriters.containsKey(typeName)) {
        FSDataOutputStream out = null;
        final DataFileWriter dfw = new DataFileWriter(new GenericDatumWriter());
        cachedWriters.put(typeName, dfw);
        dfw.setCodec(CodecFactory.snappyCodec());
        try {
            // TODO: we should probably clean up the type name to make it
            // HDFS path safe in case there are invalid characters
            // also, if a file already exists do we want to delete it or
            // append to it?
            out = fs.create(new Path(hdfsBaseDirectory, typeName));
            dfw.create(plugin.getAvroSchema(), out);
        } catch (final IOException e) {
            LOGGER.error("Unable to create output stream", e);
            // cache a null value so we don't continually try to recreate
            cachedWriters.put(typeName, null);
            return null;
        }
    }

    return cachedWriters.get(typeName);
}
Example #15
Source File: AbstractAvroEventSerializer.java From mt-flume with Apache License 2.0 | 6 votes |
@Override
public void configure(Context context) {
    int syncIntervalBytes = context.getInteger(SYNC_INTERVAL_BYTES, DEFAULT_SYNC_INTERVAL_BYTES);
    String compressionCodec = context.getString(COMPRESSION_CODEC, DEFAULT_COMPRESSION_CODEC);

    writer = new ReflectDatumWriter<T>(getSchema());
    dataFileWriter = new DataFileWriter<T>(writer);

    dataFileWriter.setSyncInterval(syncIntervalBytes);

    try {
        CodecFactory codecFactory = CodecFactory.fromString(compressionCodec);
        dataFileWriter.setCodec(codecFactory);
    } catch (AvroRuntimeException e) {
        logger.warn("Unable to instantiate avro codec with name (" + compressionCodec +
                "). Compression disabled. Exception follows.", e);
    }
}
Example #16
Source File: JdbcAvroIO.java From dbeam with Apache License 2.0 | 6 votes |
@SuppressWarnings("deprecation") // uses internal test functionality. @Override protected void prepareWrite(WritableByteChannel channel) throws Exception { logger.info("jdbcavroio : Preparing write..."); connection = jdbcAvroArgs.jdbcConnectionConfiguration().createConnection(); Void destination = getDestination(); Schema schema = dynamicDestinations.getSchema(destination); dataFileWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema)) .setCodec(jdbcAvroArgs.getCodecFactory()) .setSyncInterval(syncInterval); dataFileWriter.setMeta("created_by", this.getClass().getCanonicalName()); this.countingOutputStream = new CountingOutputStream(Channels.newOutputStream(channel)); dataFileWriter.create(schema, this.countingOutputStream); logger.info("jdbcavroio : Write prepared"); }
Example #17
Source File: AvroRecordWriter.java From presto with Apache License 2.0 | 6 votes |
public AvroRecordWriter(Path path, JobConf jobConf, boolean isCompressed, Properties properties)
        throws IOException {
    Schema schema;
    try {
        schema = AvroSerdeUtils.determineSchemaOrThrowException(jobConf, properties);
    } catch (AvroSerdeException e) {
        throw new IOException(e);
    }
    GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(genericDatumWriter);

    if (isCompressed) {
        int level = jobConf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = jobConf.get(OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC)
                ? CodecFactory.deflateCodec(level)
                : CodecFactory.fromString(codecName);
        dataFileWriter.setCodec(factory);
    }

    outputStream = path.getFileSystem(jobConf).create(path);
    dataFileWriter.create(schema, outputStream);
    delegate = new AvroGenericRecordWriter(dataFileWriter);
}
Example #18
Source File: OsmPbfParser.java From geowave with Apache License 2.0 | 5 votes |
public void setupWriter(
        final DataFileWriter nodeWriter,
        final DataFileWriter wayWriter,
        final DataFileWriter relationWriter) {
    this.nodeWriter = nodeWriter;
    this.wayWriter = wayWriter;
    this.relationWriter = relationWriter;
}
Example #19
Source File: AvroKeyValueWithMetadataRecordWriter.java From datafu with Apache License 2.0 | 5 votes |
public AvroKeyValueWithMetadataRecordWriter(AvroDatumConverter<K, ?> keyConverter,
        AvroDatumConverter<V, ?> valueConverter, CodecFactory compressionCodec,
        OutputStream outputStream, Configuration conf) throws IOException {
    // Create the generic record schema for the key/value pair.
    mKeyValuePairSchema = AvroKeyValue.getSchema(
            keyConverter.getWriterSchema(), valueConverter.getWriterSchema());

    // Create an Avro container file and a writer to it.
    mAvroFileWriter = new DataFileWriter<GenericRecord>(
            new ReflectDatumWriter<GenericRecord>(mKeyValuePairSchema));
    mAvroFileWriter.setCodec(compressionCodec);

    for (Entry<String, String> e : conf) {
        if (e.getKey().startsWith(TEXT_PREFIX))
            mAvroFileWriter.setMeta(e.getKey().substring(TEXT_PREFIX.length()), e.getValue());
    }

    mAvroFileWriter.create(mKeyValuePairSchema, outputStream);

    // Keep a reference to the converters.
    mKeyConverter = keyConverter;
    mValueConverter = valueConverter;

    // Create a reusable output record.
    mOutputRecord = new AvroKeyValue<Object, Object>(new GenericData.Record(mKeyValuePairSchema));
}
Example #20
Source File: AvroSpoolDirSourceTestUtil.java From datacollector with Apache License 2.0 | 5 votes |
public static File createAvroDataFile() throws Exception {
    File f = new File(createTestDir(), "file-0.avro");
    Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);
    GenericRecord boss = new GenericData.Record(schema);
    boss.put("name", "boss");
    boss.put("age", 60);
    boss.put("emails", ImmutableList.of("[email protected]", "[email protected]"));
    boss.put("boss", null);

    GenericRecord e3 = new GenericData.Record(schema);
    e3.put("name", "c");
    e3.put("age", 50);
    e3.put("emails", ImmutableList.of("[email protected]", "[email protected]"));
    e3.put("boss", boss);

    GenericRecord e2 = new GenericData.Record(schema);
    e2.put("name", "b");
    e2.put("age", 40);
    e2.put("emails", ImmutableList.of("[email protected]", "[email protected]"));
    e2.put("boss", boss);

    GenericRecord e1 = new GenericData.Record(schema);
    e1.put("name", "a");
    e1.put("age", 30);
    e1.put("emails", ImmutableList.of("[email protected]", "[email protected]"));
    e1.put("boss", boss);

    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);
    dataFileWriter.create(schema, f);
    dataFileWriter.append(e1);
    dataFileWriter.append(e2);
    dataFileWriter.append(e3);
    dataFileWriter.flush();
    dataFileWriter.close();

    return f;
}
Example #21
Source File: AvroFileGenerator.java From flink-perf with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    // generate only avro file
    if (args.length == 2) {
        ordersPath = args[0];
        outputOrderAvroPath = args[1];

        // Generate file for avro test
        DatumWriter<Order> orderDatumWriter = new SpecificDatumWriter<Order>(Order.class);
        DataFileWriter<Order> dataFileWriter = new DataFileWriter<Order>(orderDatumWriter);
        dataFileWriter.create(Order.getClassSchema(), new File(outputOrderAvroPath));

        Scanner s = new Scanner(new File(ordersPath));
        while (s.hasNextLine()) {
            @SuppressWarnings("resource")
            Scanner lineScanner = new Scanner(s.nextLine()).useDelimiter("\\|");

            Order o = new Order();
            o.setOOrderkey(lineScanner.nextInt());
            o.setOCustkey(lineScanner.nextInt());
            o.setOOrderstatus(lineScanner.next());
            o.setOTotalprice(lineScanner.nextFloat());
            o.setOOrderdate(lineScanner.next());
            o.setOOrderpriority(lineScanner.next());
            o.setOClerk(lineScanner.next());
            o.setOShipproprity(lineScanner.nextInt());
            o.setOComment(lineScanner.next());
            dataFileWriter.append(o);
            lineScanner.close();
        }
        dataFileWriter.flush();
        s.close();
        dataFileWriter.close();
        return;
    } else {
        System.err.println("Usage: <inputFilePath> <outputAvroPath>");
        System.exit(1);
    }
}
Example #22
Source File: SdcAvroTestUtil.java From datacollector with Apache License 2.0 | 5 votes |
public static File createAvroDataFile() throws Exception {
    File f = new File(createTestDir(), "file-0.avro");
    Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);
    GenericRecord boss = new GenericData.Record(schema);
    boss.put("name", "boss");
    boss.put("age", 60);
    boss.put("emails", ImmutableList.of("[email protected]", "[email protected]"));
    boss.put("boss", null);

    GenericRecord e3 = new GenericData.Record(schema);
    e3.put("name", "c");
    e3.put("age", 50);
    e3.put("emails", ImmutableList.of("[email protected]", "[email protected]"));
    e3.put("boss", boss);

    GenericRecord e2 = new GenericData.Record(schema);
    e2.put("name", "b");
    e2.put("age", 40);
    e2.put("emails", ImmutableList.of("[email protected]", "[email protected]"));
    e2.put("boss", boss);

    GenericRecord e1 = new GenericData.Record(schema);
    e1.put("name", "a");
    e1.put("age", 30);
    e1.put("emails", ImmutableList.of("[email protected]", "[email protected]"));
    e1.put("boss", boss);

    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);
    dataFileWriter.create(schema, f);
    dataFileWriter.append(e1);
    dataFileWriter.append(e2);
    dataFileWriter.append(e3);
    dataFileWriter.flush();
    dataFileWriter.close();

    return f;
}
Example #23
Source File: WriteAvroResultWithSchema.java From nifi with Apache License 2.0 | 5 votes |
public WriteAvroResultWithSchema(final Schema schema, final OutputStream out, final CodecFactory codec) throws IOException {
    super(out);
    this.schema = schema;

    final GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    dataFileWriter = new DataFileWriter<>(datumWriter);
    dataFileWriter.setCodec(codec);
    dataFileWriter.create(schema, out);
}
Example #24
Source File: AzureBlobAvroWriter.java From samza with Apache License 2.0 | 5 votes |
public BlobWriterComponents(DataFileWriter dataFileWriter, AzureBlobOutputStream azureBlobOutputStream,
        BlockBlobAsyncClient blockBlobAsyncClient) {
    Preconditions.checkNotNull(dataFileWriter, "DataFileWriter can not be null when creating WriterComponents for an Azure Blob.");
    Preconditions.checkNotNull(azureBlobOutputStream, "AzureBlobOutputStream can not be null when creating WriterComponents for an Azure Blob.");
    Preconditions.checkNotNull(blockBlobAsyncClient, "BlockBlobAsyncClient can not be null when creating WriterComponents for an Azure Blob.");
    this.dataFileWriter = dataFileWriter;
    this.azureBlobOutputStream = azureBlobOutputStream;
    this.blockBlobAsyncClient = blockBlobAsyncClient;
}
Example #25
Source File: AvroTestTools.java From incubator-gobblin with Apache License 2.0 | 5 votes |
private void writeAsAvroBinary(Iterator<GenericRecord> input, Schema schema, FileSystem fs,
        Path outputPath) throws IOException {
    DataFileWriter writer = new DataFileWriter(new GenericDatumWriter());
    writer.create(schema, fs.create(outputPath, true));

    while (input.hasNext()) {
        writer.append(input.next());
    }
    writer.close();

    log.info("Successfully wrote avro file to path " + outputPath);
}
Example #26
Source File: AvroRecordWriter.java From spork with Apache License 2.0 | 5 votes |
static void configureDataFileWriter(DataFileWriter<GenericData.Record> writer,
        JobConf job) throws UnsupportedEncodingException {
    if (FileOutputFormat.getCompressOutput(job)) {
        int level = job.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC)
                ? CodecFactory.deflateCodec(level)
                : CodecFactory.fromString(codecName);
        writer.setCodec(factory);
    }

    // Do max as core-default.xml has io.file.buffer.size as 4K
    writer.setSyncInterval(job.getInt(SYNC_INTERVAL_KEY,
            Math.max(job.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL)));

    // copy metadata from job
    for (Map.Entry<String, String> e : job) {
        if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
            writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()), e.getValue());
        if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
            writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
                    URLDecoder.decode(e.getValue(), "ISO-8859-1").getBytes("ISO-8859-1"));
    }
}
Example #27
Source File: AvroRowWriter.java From beam with Apache License 2.0 | 5 votes |
AvroRowWriter(
        String basename,
        Schema schema,
        SerializableFunction<AvroWriteRequest<T>, AvroT> toAvroRecord,
        SerializableFunction<Schema, DatumWriter<AvroT>> writerFactory)
        throws Exception {
    super(basename, MimeTypes.BINARY);

    this.schema = schema;
    this.toAvroRecord = toAvroRecord;
    this.writer =
        new DataFileWriter<>(writerFactory.apply(schema)).create(schema, getOutputStream());
}
Example #28
Source File: AvroSink.java From beam with Apache License 2.0 | 5 votes |
@SuppressWarnings("deprecation") // uses internal test functionality. @Override protected void prepareWrite(WritableByteChannel channel) throws Exception { DestinationT destination = getDestination(); CodecFactory codec = dynamicDestinations.getCodec(destination); Schema schema = dynamicDestinations.getSchema(destination); Map<String, Object> metadata = dynamicDestinations.getMetadata(destination); DatumWriter<OutputT> datumWriter = genericRecords ? new GenericDatumWriter<>(schema) : new ReflectDatumWriter<>(schema); dataFileWriter = new DataFileWriter<>(datumWriter).setCodec(codec); for (Map.Entry<String, Object> entry : metadata.entrySet()) { Object v = entry.getValue(); if (v instanceof String) { dataFileWriter.setMeta(entry.getKey(), (String) v); } else if (v instanceof Long) { dataFileWriter.setMeta(entry.getKey(), (Long) v); } else if (v instanceof byte[]) { dataFileWriter.setMeta(entry.getKey(), (byte[]) v); } else { throw new IllegalStateException( "Metadata value type must be one of String, Long, or byte[]. Found " + v.getClass().getSimpleName()); } } dataFileWriter.create(schema, Channels.newOutputStream(channel)); }
Example #29
Source File: Purge.java From Cubert with Apache License 2.0 | 5 votes |
private void purge(String src, String dst) throws IOException {
    DataFileReader<GenericRecord> dataFileReader = createDataFileReader(src, false);
    DataFileWriter<GenericRecord> writer = createDataFileWriter(dataFileReader);

    numRecords = 0;
    recordsPurged = 0;
    remainingRecords = 0;

    // Copy
    while (dataFileReader.hasNext()) {
        numRecords++;
        GenericRecord record = dataFileReader.next();
        if (record == null) {
            continue;
        }

        Number column = (Number) record.get(columnName);
        if ((column == null) || (!membersToPurge.contains(column.intValue()))) {
            remainingRecords++;
            writer.append(record);
        }
    }

    recordsPurged = numRecords - remainingRecords;
    writer.close();
    dataFileReader.close();
}
Example #30
Source File: SparkVerifierTest.java From tablasco with Apache License 2.0 | 5 votes |
private static void writeAvroData(List<GenericRecord> data, File avroFile) throws IOException {
    FileUtils.forceMkdir(avroFile.getParentFile());
    Schema schema = data.get(0).getSchema();
    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);
    dataFileWriter.create(schema, avroFile);
    for (GenericRecord genericRecord : data) {
        dataFileWriter.append(genericRecord);
    }
    dataFileWriter.close();
}