org.apache.orc.Writer Java Examples
The following examples show how to use
org.apache.orc.Writer.
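Before the project-specific examples below, here is a minimal sketch of the typical Writer lifecycle: create the writer from a TypeDescription schema, fill a VectorizedRowBatch, flush full batches with addRowBatch, flush the remainder, and close. The schema, output path, and row values in this sketch are illustrative assumptions only, not taken from any of the projects below.

// Minimal sketch of the org.apache.orc.Writer lifecycle (illustrative schema and values).
public static void writeSampleFile(Configuration conf, Path path, int rowCount) throws IOException {
  TypeDescription schema = TypeDescription.fromString("struct<id:bigint,name:string>");
  Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf).setSchema(schema));

  VectorizedRowBatch batch = schema.createRowBatch();
  LongColumnVector idCol = (LongColumnVector) batch.cols[0];
  BytesColumnVector nameCol = (BytesColumnVector) batch.cols[1];

  for (int i = 0; i < rowCount; i++) {
    int row = batch.size++;
    idCol.vector[row] = i;
    nameCol.setVal(row, ("name-" + i).getBytes(StandardCharsets.UTF_8));
    if (batch.size == batch.getMaxSize()) {   // flush a full batch and reuse it
      writer.addRowBatch(batch);
      batch.reset();
    }
  }
  if (batch.size != 0) {                      // flush any remaining rows
    writer.addRowBatch(batch);
    batch.reset();
  }
  writer.close();                             // writes the footer and finalizes the file
}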
Example #1
Source File: AvroToOrcRecordConverter.java From datacollector with Apache License 2.0 | 6 votes |
public static void addAvroRecord(
    VectorizedRowBatch batch,
    GenericRecord record,
    TypeDescription orcSchema,
    int orcBatchSize,
    Writer writer
) throws IOException {

  for (int c = 0; c < batch.numCols; c++) {
    ColumnVector colVector = batch.cols[c];
    final String thisField = orcSchema.getFieldNames().get(c);
    final TypeDescription type = orcSchema.getChildren().get(c);

    Object fieldValue = record.get(thisField);
    Schema.Field avroField = record.getSchema().getField(thisField);
    addToVector(type, colVector, avroField.schema(), fieldValue, batch.size);
  }

  batch.size++;

  if (batch.size % orcBatchSize == 0 || batch.size == batch.getMaxSize()) {
    writer.addRowBatch(batch);
    batch.reset();
    batch.size = 0;
  }
}
Example #2
Source File: OrcMetrics.java From iceberg with Apache License 2.0 | 5 votes |
static Metrics fromWriter(Writer writer) {
  try {
    return buildOrcMetrics(writer.getNumberOfRows(), writer.getSchema(), writer.getStatistics());
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to get statistics from writer");
  }
}
Example #3
Source File: OrcFileAppender.java From iceberg with Apache License 2.0 | 5 votes |
private static Writer newOrcWriter(OutputFile file,
                                   OrcFile.WriterOptions options,
                                   Map<String, byte[]> metadata) {
  final Path locPath = new Path(file.location());
  final Writer writer;

  try {
    writer = OrcFile.createWriter(locPath, options);
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Can't create file " + locPath);
  }

  metadata.forEach((key, value) -> writer.addUserMetadata(key, ByteBuffer.wrap(value)));

  return writer;
}
Example #4
Source File: SqlInterpreterTest.java From zeppelin with Apache License 2.0 | 5 votes |
public File createORCFile(int[] values) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".orc");
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();
  conf.set("orc.compress", "snappy");
  TypeDescription schema = TypeDescription.fromString("struct<msg:int>");
  Writer writer = OrcFile.createWriter(path,
      OrcFile.writerOptions(conf).setSchema(schema));

  VectorizedRowBatch batch = schema.createRowBatch();
  LongColumnVector x = (LongColumnVector) batch.cols[0];
  for (int i = 0; i < values.length; ++i) {
    int row = batch.size++;
    x.vector[row] = values[i];
    // If the batch is full, write it out and start over.
    if (batch.size == batch.getMaxSize()) {
      writer.addRowBatch(batch);
      batch.reset();
    }
  }
  if (batch.size != 0) {
    writer.addRowBatch(batch);
    batch.reset();
  }
  writer.close();
  return file;
}
Example #5
Source File: AvroToOrcRecordConverter.java From datacollector with Apache License 2.0 | 5 votes |
public static Writer createOrcWriter(Properties orcWriterProperties,
                                     Configuration configuration,
                                     Path orcOutputFile,
                                     TypeDescription orcSchema) throws IOException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("Creating ORC writer at: {}", orcOutputFile.toString());
  }

  return OrcFile.createWriter(
      orcOutputFile,
      OrcFile.writerOptions(orcWriterProperties, configuration).setSchema(orcSchema)
  );
}
Example #6
Source File: TestAvroToOrcRecordConverter.java From datacollector with Apache License 2.0 | 5 votes |
@Test
public void recordConversion() throws IOException {
  Path outputFilePath = new Path(createTempFile());

  Schema.Parser schemaParser = new Schema.Parser();
  Schema schema = schemaParser.parse(
      "{\"type\": \"record\", \"name\": \"MyRecord\", \"fields\": [{\"name\": \"first\", \"type\": \"int\"},{"
          + "\"name\": \"second\", \"type\": {\"type\": \"record\", \"name\": \"MySubRecord\", \"fields\":"
          + " [{\"name\": \"sub1\", \"type\": \"string\"}, {\"name\": \"sub2\", \"type\": \"int\"}] } }, {\"name\":"
          + " \"somedate\", \"type\": { \"type\" : \"int\", \"logicalType\": \"date\"} } ]}"
  );

  TypeDescription orcSchema = AvroToOrcSchemaConverter.getOrcSchema(schema);

  Writer orcWriter = AvroToOrcRecordConverter.createOrcWriter(
      new Properties(),
      new Configuration(),
      outputFilePath,
      orcSchema
  );

  GenericRecord avroRecord = new GenericData.Record(schema);
  avroRecord.put("first", 1);
  avroRecord.put("somedate", 17535);

  GenericData.Record subRecord = new GenericData.Record(schema.getField("second").schema());
  subRecord.put("sub1", new Utf8("value1"));
  subRecord.put("sub2", 42);
  avroRecord.put("second", subRecord);

  VectorizedRowBatch batch = orcSchema.createRowBatch();
  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord, orcSchema, 1000, orcWriter);
  orcWriter.addRowBatch(batch);
  batch.reset();
  orcWriter.close();

  // TODO: add code to read the ORC file and validate the contents
}
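The TODO above leaves read-back validation unimplemented. As a hedged sketch that is not part of the original test, the file could be read back with the core ORC reader API roughly as follows; it only walks the first column ("first", an int in the schema above) and leaves the actual assertions out.

// Sketch only: read the file written above and iterate the "first" column.
Reader reader = OrcFile.createReader(outputFilePath, OrcFile.readerOptions(new Configuration()));
try (RecordReader rows = reader.rows()) {
  VectorizedRowBatch readBatch = reader.getSchema().createRowBatch();
  while (rows.nextBatch(readBatch)) {
    LongColumnVector first = (LongColumnVector) readBatch.cols[0];
    for (int r = 0; r < readBatch.size; r++) {
      long firstValue = first.vector[r];   // compare against the value put into avroRecord
    }
  }
}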
Example #7
Source File: OrcBulkWriter.java From flink with Apache License 2.0 | 5 votes |
OrcBulkWriter(Vectorizer<T> vectorizer, Writer writer) {
  this.vectorizer = checkNotNull(vectorizer);
  this.writer = checkNotNull(writer);
  this.rowBatch = vectorizer.getSchema().createRowBatch();

  // Configure the vectorizer with the writer so that users can add
  // metadata on the fly through the Vectorizer#vectorize(...) method.
  this.vectorizer.setWriter(this.writer);
}
Example #8
Source File: OrcKeyCompactorOutputFormat.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Required for extension since the super method hard-codes the file extension as ".orc". To keep the
 * extension name flexible, we made it configuration driven.
 * @param taskAttemptContext The source of configuration that determines the file extension
 * @return The {@link RecordWriter} that writes out Orc objects.
 * @throws IOException
 */
@Override
public RecordWriter getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException {
  Configuration conf = taskAttemptContext.getConfiguration();
  String extension = "." + conf.get(COMPACTION_OUTPUT_EXTENSION, "orc");
  Path filename = getDefaultWorkFile(taskAttemptContext, extension);
  Writer writer = OrcFile.createWriter(filename,
      org.apache.orc.mapred.OrcOutputFormat.buildOptions(conf));
  return new OrcMapreduceRecordWriter(writer);
}
Example #9
Source File: OrcCompactionTaskTest.java From incubator-gobblin with Apache License 2.0 | 5 votes |
public void writeOrcRecordsInFile(Path path, TypeDescription schema, List<OrcStruct> orcStructs) throws Exception {
  Configuration configuration = new Configuration();
  OrcFile.WriterOptions options = OrcFile.writerOptions(configuration).setSchema(schema);

  Writer writer = OrcFile.createWriter(path, options);
  OrcMapreduceRecordWriter recordWriter = new OrcMapreduceRecordWriter(writer);
  for (OrcStruct orcRecord : orcStructs) {
    recordWriter.write(NullWritable.get(), orcRecord);
  }
  recordWriter.close(new TaskAttemptContextImpl(configuration, new TaskAttemptID()));
}
Example #10
Source File: OrcWriter.java From osm2orc with ISC License | 4 votes |
OrcEntityProcessor(Writer writer, VectorizedRowBatch batch) {
  this.writer = writer;
  this.batch = batch;
}
Example #11
Source File: TestAvroToOrcRecordConverter.java From datacollector with Apache License 2.0 | 4 votes |
@Test
public void unionTypeConversions() throws IOException {
  final Path outputFilePath = new Path(createTempFile());

  final Schema.Parser schemaParser = new Schema.Parser();
  final Schema schema = schemaParser.parse(
      TestAvroToOrcRecordConverter.class.getResourceAsStream("avro_union_types.json"));

  final TypeDescription orcSchema = AvroToOrcSchemaConverter.getOrcSchema(schema);

  final Writer orcWriter = AvroToOrcRecordConverter.createOrcWriter(
      new Properties(),
      new Configuration(),
      outputFilePath,
      orcSchema
  );

  final GenericRecord avroRecord1 = new GenericData.Record(schema);
  avroRecord1.put("nullableInteger", 87);
  avroRecord1.put("integerOrString", "someString");
  avroRecord1.put("nullableStringOrInteger", "nonNullString");
  avroRecord1.put("justLong", 57844942331L);

  final GenericRecord avroRecord2 = new GenericData.Record(schema);
  avroRecord2.put("nullableInteger", null);
  avroRecord2.put("integerOrString", 16);
  avroRecord2.put("nullableStringOrInteger", null);
  avroRecord2.put("justLong", 758934L);

  final VectorizedRowBatch batch = orcSchema.createRowBatch();
  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord1, orcSchema, 1000, orcWriter);
  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord2, orcSchema, 1000, orcWriter);
  orcWriter.addRowBatch(batch);
  batch.reset();
  orcWriter.close();

  try (OrcToSdcRecordConverter sdcRecordConverter = new OrcToSdcRecordConverter(outputFilePath)) {
    final Record record1 = RecordCreator.create();
    boolean populated = sdcRecordConverter.populateRecord(record1);
    assertThat(populated, equalTo(true));
    assertSdcRecordMatchesAvro(record1, avroRecord1, null);

    final Record record2 = RecordCreator.create();
    populated = sdcRecordConverter.populateRecord(record2);
    assertThat(populated, equalTo(true));
    assertSdcRecordMatchesAvro(
        record2,
        avroRecord2,
        ImmutableMap.<String, Matcher<Field>>builder()
            .put("nullableInteger", Matchers.intFieldWithNullValue())
            .put("nullableStringOrInteger", Matchers.stringFieldWithNullValue())
            .build()
    );
  }
}
Example #12
Source File: OrcColumnarRowSplitReaderNoHiveTest.java From flink with Apache License 2.0 | 4 votes |
@Override
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
  // NOTE: orc has field name information, so name should be same as orc
  TypeDescription schema = TypeDescription.fromString(
      "struct<" +
          "f0:float," +
          "f1:double," +
          "f2:timestamp," +
          "f3:tinyint," +
          "f4:smallint" +
          ">");

  org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file);
  Configuration conf = new Configuration();

  Writer writer = OrcFile.createWriter(filePath,
      OrcFile.writerOptions(conf).setSchema(schema));

  VectorizedRowBatch batch = schema.createRowBatch(rowSize);
  DoubleColumnVector col0 = (DoubleColumnVector) batch.cols[0];
  DoubleColumnVector col1 = (DoubleColumnVector) batch.cols[1];
  TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[2];
  LongColumnVector col3 = (LongColumnVector) batch.cols[3];
  LongColumnVector col4 = (LongColumnVector) batch.cols[4];

  col0.noNulls = false;
  col1.noNulls = false;
  col2.noNulls = false;
  col3.noNulls = false;
  col4.noNulls = false;

  for (int i = 0; i < rowSize - 1; i++) {
    col0.vector[i] = i;
    col1.vector[i] = i;
    Timestamp timestamp = toTimestamp(i);
    col2.time[i] = timestamp.getTime();
    col2.nanos[i] = timestamp.getNanos();
    col3.vector[i] = i;
    col4.vector[i] = i;
  }

  col0.isNull[rowSize - 1] = true;
  col1.isNull[rowSize - 1] = true;
  col2.isNull[rowSize - 1] = true;
  col3.isNull[rowSize - 1] = true;
  col4.isNull[rowSize - 1] = true;

  batch.size = rowSize;
  writer.addRowBatch(batch);
  batch.reset();
  writer.close();
}
Example #13
Source File: OrcColumnarRowSplitReaderTest.java From flink with Apache License 2.0 | 4 votes |
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
  // NOTE: orc has field name information, so name should be same as orc
  TypeDescription schema = TypeDescription.fromString(
      "struct<" +
          "f0:float," +
          "f1:double," +
          "f2:timestamp," +
          "f3:tinyint," +
          "f4:smallint" +
          ">");

  org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file);
  Configuration conf = new Configuration();

  Writer writer = OrcFile.createWriter(filePath,
      OrcFile.writerOptions(conf).setSchema(schema));

  VectorizedRowBatch batch = schema.createRowBatch(rowSize);
  DoubleColumnVector col0 = (DoubleColumnVector) batch.cols[0];
  DoubleColumnVector col1 = (DoubleColumnVector) batch.cols[1];
  TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[2];
  LongColumnVector col3 = (LongColumnVector) batch.cols[3];
  LongColumnVector col4 = (LongColumnVector) batch.cols[4];

  col0.noNulls = false;
  col1.noNulls = false;
  col2.noNulls = false;
  col3.noNulls = false;
  col4.noNulls = false;

  for (int i = 0; i < rowSize - 1; i++) {
    col0.vector[i] = i;
    col1.vector[i] = i;
    Timestamp timestamp = toTimestamp(i);
    col2.time[i] = timestamp.getTime();
    col2.nanos[i] = timestamp.getNanos();
    col3.vector[i] = i;
    col4.vector[i] = i;
  }

  col0.isNull[rowSize - 1] = true;
  col1.isNull[rowSize - 1] = true;
  col2.isNull[rowSize - 1] = true;
  col3.isNull[rowSize - 1] = true;
  col4.isNull[rowSize - 1] = true;

  batch.size = rowSize;
  writer.addRowBatch(batch);
  batch.reset();
  writer.close();
}
Example #14
Source File: ORCRecordExtractorTest.java From incubator-pinot with Apache License 2.0 | 4 votes |
/**
 * Create an ORC input file using the input records
 */
@Override
protected void createInputFile() throws IOException {
  TypeDescription schema = TypeDescription.fromString(
      "struct<user_id:int,firstName:string,lastName:string,bids:array<int>,campaignInfo:string,cost:double,timestamp:bigint>");
  Writer writer = OrcFile.createWriter(new Path(_dataFile.getAbsolutePath()),
      OrcFile.writerOptions(new Configuration()).setSchema(schema));

  int numRecords = _inputRecords.size();
  VectorizedRowBatch rowBatch = schema.createRowBatch(numRecords);
  LongColumnVector userIdVector = (LongColumnVector) rowBatch.cols[0];
  userIdVector.noNulls = false;
  BytesColumnVector firstNameVector = (BytesColumnVector) rowBatch.cols[1];
  firstNameVector.noNulls = false;
  BytesColumnVector lastNameVector = (BytesColumnVector) rowBatch.cols[2];
  ListColumnVector bidsVector = (ListColumnVector) rowBatch.cols[3];
  bidsVector.noNulls = false;
  LongColumnVector bidsElementVector = (LongColumnVector) bidsVector.child;
  bidsElementVector.ensureSize(6, false);
  BytesColumnVector campaignInfoVector = (BytesColumnVector) rowBatch.cols[4];
  DoubleColumnVector costVector = (DoubleColumnVector) rowBatch.cols[5];
  LongColumnVector timestampVector = (LongColumnVector) rowBatch.cols[6];

  for (int i = 0; i < numRecords; i++) {
    Map<String, Object> record = _inputRecords.get(i);
    Integer userId = (Integer) record.get("user_id");
    if (userId != null) {
      userIdVector.vector[i] = userId;
    } else {
      userIdVector.isNull[i] = true;
    }
    String firstName = (String) record.get("firstName");
    if (firstName != null) {
      firstNameVector.setVal(i, StringUtils.encodeUtf8(firstName));
    } else {
      firstNameVector.isNull[i] = true;
    }
    lastNameVector.setVal(i, StringUtils.encodeUtf8((String) record.get("lastName")));
    List<Integer> bids = (List<Integer>) record.get("bids");
    if (bids != null) {
      bidsVector.offsets[i] = bidsVector.childCount;
      bidsVector.lengths[i] = bids.size();
      for (int bid : bids) {
        bidsElementVector.vector[bidsVector.childCount++] = bid;
      }
    } else {
      bidsVector.isNull[i] = true;
    }
    campaignInfoVector.setVal(i, StringUtils.encodeUtf8((String) record.get("campaignInfo")));
    costVector.vector[i] = (double) record.get("cost");
    timestampVector.vector[i] = (long) record.get("timestamp");

    rowBatch.size++;
  }
  writer.addRowBatch(rowBatch);
  rowBatch.reset();
  writer.close();
}
Example #15
Source File: OrcMetaDataWriter.java From pentaho-hadoop-shims with Apache License 2.0 | 4 votes |
public OrcMetaDataWriter( Writer writer ) {
  this.writer = writer;
}
Example #16
Source File: Vectorizer.java From flink with Apache License 2.0 | 2 votes |
/**
 * Users are not supposed to use this method since this is intended to be used only by the {@link OrcBulkWriter}.
 *
 * @param writer the underlying ORC Writer.
 */
public void setWriter(Writer writer) {
  this.writer = writer;
}
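For context, a hypothetical Vectorizer implementation that relies on the writer configured via setWriter(...) might look roughly like the following. The Person type, field layout, schema string, and metadata key are assumptions for illustration, and the addUserMetadata call is the "metadata on the fly" hook mentioned in Example #7; verify the exact signatures against your Flink version.

// Hypothetical Vectorizer for a simple Person(name, age) type; schema and metadata key are illustrative.
public class PersonVectorizer extends Vectorizer<Person> implements Serializable {

  public PersonVectorizer(String schema) {
    super(schema);   // e.g. "struct<name:string,age:int>"
  }

  @Override
  public void vectorize(Person element, VectorizedRowBatch batch) throws IOException {
    BytesColumnVector nameCol = (BytesColumnVector) batch.cols[0];
    LongColumnVector ageCol = (LongColumnVector) batch.cols[1];
    int row = batch.size++;
    nameCol.setVal(row, element.getName().getBytes(StandardCharsets.UTF_8));
    ageCol.vector[row] = element.getAge();
    // Delegates to the Writer set through setWriter(...) (assumed API; check your Flink version).
    addUserMetadata("written.by", ByteBuffer.wrap("example".getBytes(StandardCharsets.UTF_8)));
  }
}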