org.apache.orc.OrcFile#createWriter

Source File: OrcFileAppender.java From iceberg with Apache License 2.0

6 votes

/**
 * Creates an appender that writes ORC data to the location of {@code file}.
 *
 * <p>The ORC schema is derived from {@code schema} via {@code TypeConversion.toOrc}
 * and installed on {@code options} before the writer is opened. After the writer
 * is created, the serialized column ids and every entry of {@code metadata} are
 * stored as user metadata on the writer.
 *
 * @param schema   schema to convert to an ORC schema
 * @param file     output file whose location is used as the writer path
 * @param options  ORC writer options; this constructor sets the schema on them
 * @param metadata extra key/value pairs to record as ORC user metadata
 * @throws RuntimeException if the ORC writer cannot be created
 */
OrcFileAppender(Schema schema,
                OutputFile file,
                OrcFile.WriterOptions options,
                Map<String,byte[]> metadata) {
  this.orcSchema = TypeConversion.toOrc(schema, columnIds);
  options.setSchema(this.orcSchema);
  this.path = new Path(file.location());
  try {
    this.writer = OrcFile.createWriter(this.path, options);
  } catch (IOException e) {
    throw new RuntimeException("Can't create file " + this.path, e);
  }
  // The column-number attribute is written first, followed by the caller-supplied entries.
  this.writer.addUserMetadata(COLUMN_NUMBERS_ATTRIBUTE, columnIds.serialize());
  for (Map.Entry<String, byte[]> entry : metadata.entrySet()) {
    this.writer.addUserMetadata(entry.getKey(), ByteBuffer.wrap(entry.getValue()));
  }
}

Source File: JsonORCFileReaderWriterFactory.java From secor with Apache License 2.0

6 votes

/**
 * Opens an ORC writer for the file named by {@code logFilePath}.
 *
 * <p>The schema is looked up from {@code schemaProvider} by topic; one
 * {@code JsonConverter} is created per child type of that schema, and a row
 * batch is allocated from the schema.
 *
 * @param logFilePath identifies the output file and the topic used for schema lookup
 * @param codec       compression codec, translated through {@code resolveCompression}
 * @throws IllegalArgumentException if no schema is available for the topic
 * @throws IOException              if the ORC writer cannot be created
 */
public JsonORCFileWriter(LogFilePath logFilePath, CompressionCodec codec)
        throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path(logFilePath.getLogFilePath());

    final String topic = logFilePath.getTopic();
    schema = schemaProvider.getSchema(topic, logFilePath);
    if (schema == null) {
        throw new IllegalArgumentException(
            String.format("No schema is provided for topic '%s'", topic));
    }

    // One converter per top-level field of the schema, in field order.
    List<TypeDescription> fieldTypes = schema.getChildren();
    final int fieldCount = fieldTypes.size();
    converters = new JsonConverter[fieldCount];
    for (int i = 0; i < fieldCount; i++) {
        converters[i] = VectorColumnFiller.createConverter(fieldTypes.get(i));
    }

    OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf)
            .compress(resolveCompression(codec))
            .setSchema(schema);
    writer = OrcFile.createWriter(path, writerOptions);
    batch = schema.createRowBatch();
}

Source File: PentahoOrcRecordWriter.java From pentaho-hadoop-shims with Apache License 2.0

6 votes

/**
 * Creates a record writer that writes rows with the given ORC {@code schema}
 * to {@code filePath}.
 *
 * <p>{@code setOutputMeta} is invoked once per field with a shared counter, and
 * the counter's final value sizes the row data array. The ORC writer and row
 * batch are then created. If the writer cannot be created the failure is logged
 * and rethrown, so callers never receive an instance without a writer.
 *
 * @param fields   output field definitions
 * @param schema   ORC schema used for the writer and the row batch
 * @param filePath destination path; passed through {@code S3NCredentialUtils} before use
 * @param conf     Hadoop configuration used for the writer options
 * @throws IllegalStateException if the underlying ORC writer cannot be created
 */
public PentahoOrcRecordWriter( List<? extends IOrcOutputField> fields, TypeDescription schema, String filePath,
                               Configuration conf ) {
  this.fields = fields;
  this.schema = schema;
  final AtomicInteger fieldNumber = new AtomicInteger();  //Mutable field count
  fields.forEach( field -> setOutputMeta( fieldNumber, field ) );
  outputRowMetaAndData = new RowMetaAndData( outputRowMeta, new Object[ fieldNumber.get() ] );

  try {
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( filePath, conf );
    Path outputFile = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( filePath ) );
    writer = OrcFile.createWriter( outputFile,
      OrcFile.writerOptions( conf )
        .setSchema( schema ) );
    batch = schema.createRowBatch();
  } catch ( IOException e ) {
    logger.error( e );
    // Fail fast: swallowing this would leave writer and batch unset on a half-built object.
    // The raw filePath is deliberately kept out of the message because it is scrubbed above,
    // which suggests it may carry credentials.
    throw new IllegalStateException( "Unable to create ORC writer", e );
  }

  //Write the additional metadata for the fields
  // new OrcMetaDataWriter( writer ).write( fields );
}

Source File: OrcFileAppender.java From iceberg with Apache License 2.0

5 votes

/**
 * Opens an ORC writer at the location of {@code file} and attaches every
 * entry of {@code metadata} to it as user metadata.
 *
 * @param file     output file whose location becomes the writer path
 * @param options  ORC writer options passed to {@code OrcFile.createWriter}
 * @param metadata key/value pairs to record as ORC user metadata
 * @return the newly created writer
 * @throws RuntimeIOException if the writer cannot be created
 */
private static Writer newOrcWriter(OutputFile file,
                                   OrcFile.WriterOptions options, Map<String, byte[]> metadata) {
  final Path target = new Path(file.location());

  final Writer orcWriter;
  try {
    orcWriter = OrcFile.createWriter(target, options);
  } catch (IOException cause) {
    throw new RuntimeIOException(cause, "Can't create file " + target);
  }

  for (Map.Entry<String, byte[]> entry : metadata.entrySet()) {
    orcWriter.addUserMetadata(entry.getKey(), ByteBuffer.wrap(entry.getValue()));
  }
  return orcWriter;
}

Source File: OrcWriter.java From osm2orc with ISC License

5 votes

/**
 * Creates the ORC writer for {@code filename} using {@code SCHEMA} and wraps it,
 * together with a fresh row batch, in the {@code OrcEntityProcessor} stored in
 * {@code processor}.
 *
 * @param metaData not read by this method
 * @throws OsmosisRuntimeException if the ORC writer cannot be created
 */
@Override
public void initialize(Map<String, Object> metaData) {
    try {
        Configuration hadoopConf = new Configuration();
        // conf.set(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(), "tags");
        Path target = new Path(filename);
        OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(hadoopConf).setSchema(SCHEMA);
        processor = new OrcEntityProcessor(
                OrcFile.createWriter(target, writerOptions),
                SCHEMA.createRowBatch());
    } catch (IOException ioe) {
        throw new OsmosisRuntimeException(ioe);
    }
}

Source File: SqlInterpreterTest.java From zeppelin with Apache License 2.0

5 votes

/**
 * Writes {@code values} into a new ORC file with schema {@code struct<msg:int>}.
 *
 * <p>A temp file name is reserved and the file itself is deleted before the
 * ORC writer is opened on that path. Values are added to a row batch which is
 * written out whenever it is full and once more at the end if rows remain.
 *
 * @param values the integers to write, one per row
 * @return the file the ORC data was written to
 * @throws IOException if the temp file or the ORC file cannot be written
 */
public File createORCFile(int[] values) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".orc");
  file.delete();

  Path orcPath = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();
  conf.set("orc.compress", "snappy");

  TypeDescription schema = TypeDescription.fromString("struct<msg:int>");
  OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf).setSchema(schema);
  Writer writer = OrcFile.createWriter(orcPath, writerOptions);

  VectorizedRowBatch batch = schema.createRowBatch();
  LongColumnVector msgColumn = (LongColumnVector) batch.cols[0];
  for (int value : values) {
    msgColumn.vector[batch.size++] = value;
    // Flush and start a new batch as soon as the current one is full.
    if (batch.size == batch.getMaxSize()) {
      writer.addRowBatch(batch);
      batch.reset();
    }
  }
  // Flush whatever is left in a partially filled batch.
  if (batch.size != 0) {
    writer.addRowBatch(batch);
    batch.reset();
  }
  writer.close();
  return file;
}

Source File: AvroToOrcRecordConverter.java From datacollector with Apache License 2.0

5 votes

/**
 * Creates an ORC writer for {@code orcOutputFile} using {@code orcSchema}.
 *
 * @param orcWriterProperties properties passed to {@code OrcFile.writerOptions}
 * @param configuration       Hadoop configuration passed to {@code OrcFile.writerOptions}
 * @param orcOutputFile       path of the ORC file to create
 * @param orcSchema           schema set on the writer options
 * @return the newly created writer
 * @throws IOException if the writer cannot be created
 */
public static Writer createOrcWriter(Properties orcWriterProperties, Configuration configuration, Path orcOutputFile, TypeDescription orcSchema) throws IOException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("Creating ORC writer at: {}", orcOutputFile.toString());
  }
  final OrcFile.WriterOptions writerOptions =
      OrcFile.writerOptions(orcWriterProperties, configuration).setSchema(orcSchema);
  return OrcFile.createWriter(orcOutputFile, writerOptions);
}

Source File: OrcKeyCompactorOutputFormat.java From incubator-gobblin with Apache License 2.0

5 votes

/**
 * Overridden because the super method hard-codes the file extension as ".orc".
 * Here the extension is read from configuration ({@code COMPACTION_OUTPUT_EXTENSION},
 * defaulting to "orc") so that it stays configurable.
 *
 * @param taskAttemptContext the source of the configuration that determines the file extension
 * @return the {@link RecordWriter} that writes out ORC objects
 * @throws IOException if the work file or the ORC writer cannot be created
 */
@Override
public RecordWriter getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException {
  final Configuration jobConf = taskAttemptContext.getConfiguration();
  final Path workFile =
      getDefaultWorkFile(taskAttemptContext, "." + jobConf.get(COMPACTION_OUTPUT_EXTENSION, "orc" ));
  return new OrcMapreduceRecordWriter(
      OrcFile.createWriter(workFile, org.apache.orc.mapred.OrcOutputFormat.buildOptions(jobConf)));
}

Source File: OrcCompactionTaskTest.java From incubator-gobblin with Apache License 2.0

5 votes

/**
 * Writes every struct in {@code orcStructs} to a new ORC file at {@code path}.
 *
 * @param path       location of the ORC file to create
 * @param schema     schema set on the ORC writer options
 * @param orcStructs records to write, in iteration order
 * @throws Exception if creating the writer, writing a record, or closing fails
 */
public void writeOrcRecordsInFile(Path path, TypeDescription schema, List<OrcStruct> orcStructs) throws Exception {
  final Configuration conf = new Configuration();
  final Writer orcWriter =
      OrcFile.createWriter(path, OrcFile.writerOptions(conf).setSchema(schema));
  final OrcMapreduceRecordWriter sink = new OrcMapreduceRecordWriter(orcWriter);

  for (OrcStruct struct : orcStructs) {
    sink.write(NullWritable.get(), struct);
  }
  sink.close(new TaskAttemptContextImpl(conf, new TaskAttemptID()));
}

Source File: OrcColumnarRowSplitReaderNoHiveTest.java From flink with Apache License 2.0

4 votes

/**
 * Writes an ORC file at {@code file} containing {@code rowSize} rows with the
 * columns f0:float, f1:double, f2:timestamp, f3:tinyint and f4:smallint.
 *
 * <p>Rows {@code 0 .. rowSize - 2} hold the row index in every numeric column
 * and {@code toTimestamp(index)} in the timestamp column; the last row is
 * marked null in every column.
 *
 * @param file    path of the ORC file to create
 * @param rowSize number of rows to write, including the trailing null row
 * @throws IOException if the file cannot be written
 */
@Override
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
	// NOTE: orc has field name information, so name should be same as orc
	TypeDescription schema = TypeDescription.fromString(
			"struct<f0:float,f1:double,f2:timestamp,f3:tinyint,f4:smallint>");

	org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file);
	Configuration conf = new Configuration();
	Writer writer = OrcFile.createWriter(filePath, OrcFile.writerOptions(conf).setSchema(schema));

	VectorizedRowBatch batch = schema.createRowBatch(rowSize);
	DoubleColumnVector floatCol = (DoubleColumnVector) batch.cols[0];
	DoubleColumnVector doubleCol = (DoubleColumnVector) batch.cols[1];
	TimestampColumnVector timestampCol = (TimestampColumnVector) batch.cols[2];
	LongColumnVector tinyintCol = (LongColumnVector) batch.cols[3];
	LongColumnVector smallintCol = (LongColumnVector) batch.cols[4];

	// Every column carries a null in the last row, so none may claim noNulls.
	floatCol.noNulls = false;
	doubleCol.noNulls = false;
	timestampCol.noNulls = false;
	tinyintCol.noNulls = false;
	smallintCol.noNulls = false;

	final int lastRow = rowSize - 1;
	for (int row = 0; row < lastRow; row++) {
		floatCol.vector[row] = row;
		doubleCol.vector[row] = row;

		Timestamp ts = toTimestamp(row);
		timestampCol.time[row] = ts.getTime();
		timestampCol.nanos[row] = ts.getNanos();

		tinyintCol.vector[row] = row;
		smallintCol.vector[row] = row;
	}

	// The final row is null in all columns.
	floatCol.isNull[lastRow] = true;
	doubleCol.isNull[lastRow] = true;
	timestampCol.isNull[lastRow] = true;
	tinyintCol.isNull[lastRow] = true;
	smallintCol.isNull[lastRow] = true;

	batch.size = rowSize;
	writer.addRowBatch(batch);
	batch.reset();
	writer.close();
}

Source File: OrcColumnarRowSplitReaderTest.java From flink with Apache License 2.0

4 votes

/**
 * Creates an ORC test file at {@code file} with five columns (float, double,
 * timestamp, tinyint, smallint) and {@code rowSize} rows.
 *
 * <p>All rows but the last are filled from their row index (the timestamp via
 * {@code toTimestamp}); the last row is flagged null in each column.
 *
 * @param file    path of the ORC file to create
 * @param rowSize total number of rows in the written batch
 * @throws IOException if the file cannot be written
 */
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
	// NOTE: orc has field name information, so name should be same as orc
	final String schemaText = "struct<" +
			"f0:float," +
			"f1:double," +
			"f2:timestamp," +
			"f3:tinyint," +
			"f4:smallint" +
			">";
	TypeDescription schema = TypeDescription.fromString(schemaText);

	org.apache.hadoop.fs.Path orcPath = new org.apache.hadoop.fs.Path(file);
	Configuration conf = new Configuration();
	Writer orcWriter =
			OrcFile.createWriter(orcPath, OrcFile.writerOptions(conf).setSchema(schema));

	VectorizedRowBatch rows = schema.createRowBatch(rowSize);
	DoubleColumnVector f0 = (DoubleColumnVector) rows.cols[0];
	DoubleColumnVector f1 = (DoubleColumnVector) rows.cols[1];
	TimestampColumnVector f2 = (TimestampColumnVector) rows.cols[2];
	LongColumnVector f3 = (LongColumnVector) rows.cols[3];
	LongColumnVector f4 = (LongColumnVector) rows.cols[4];

	// Each column will contain one null (the last row).
	f0.noNulls = false;
	f1.noNulls = false;
	f2.noNulls = false;
	f3.noNulls = false;
	f4.noNulls = false;

	final int nullRow = rowSize - 1;
	for (int r = 0; r < nullRow; r++) {
		Timestamp stamp = toTimestamp(r);

		f0.vector[r] = r;
		f1.vector[r] = r;
		f2.time[r] = stamp.getTime();
		f2.nanos[r] = stamp.getNanos();
		f3.vector[r] = r;
		f4.vector[r] = r;
	}

	f0.isNull[nullRow] = true;
	f1.isNull[nullRow] = true;
	f2.isNull[nullRow] = true;
	f3.isNull[nullRow] = true;
	f4.isNull[nullRow] = true;

	rows.size = rowSize;
	orcWriter.addRowBatch(rows);
	rows.reset();
	orcWriter.close();
}

Source File: ORCRecordExtractorTest.java From incubator-pinot with Apache License 2.0

4 votes

/**
 * Creates the ORC input file at {@code _dataFile} from {@code _inputRecords}.
 *
 * <p>All records are placed in a single row batch sized to the record count.
 * {@code user_id}, {@code firstName} and {@code bids} may be null in a record
 * and are then flagged null in their column; the remaining fields are read
 * without a null check.
 *
 * @throws IOException if the ORC file cannot be written
 */
@Override
protected void createInputFile()
    throws IOException {
  TypeDescription schema = TypeDescription.fromString(
      "struct<user_id:int,firstName:string,lastName:string,bids:array<int>,campaignInfo:string,cost:double,timestamp:bigint>");
  Writer orcWriter = OrcFile.createWriter(new Path(_dataFile.getAbsolutePath()),
      OrcFile.writerOptions(new Configuration()).setSchema(schema));

  int recordCount = _inputRecords.size();
  VectorizedRowBatch batch = schema.createRowBatch(recordCount);

  // Column vectors, in schema order. Nullable columns have noNulls cleared.
  LongColumnVector userIds = (LongColumnVector) batch.cols[0];
  userIds.noNulls = false;
  BytesColumnVector firstNames = (BytesColumnVector) batch.cols[1];
  firstNames.noNulls = false;
  BytesColumnVector lastNames = (BytesColumnVector) batch.cols[2];
  ListColumnVector bidLists = (ListColumnVector) batch.cols[3];
  bidLists.noNulls = false;
  LongColumnVector bidElements = (LongColumnVector) bidLists.child;
  bidElements.ensureSize(6, false);
  BytesColumnVector campaignInfos = (BytesColumnVector) batch.cols[4];
  DoubleColumnVector costs = (DoubleColumnVector) batch.cols[5];
  LongColumnVector timestamps = (LongColumnVector) batch.cols[6];

  for (int row = 0; row < recordCount; row++) {
    Map<String, Object> record = _inputRecords.get(row);

    // user_id: nullable int
    Integer userId = (Integer) record.get("user_id");
    if (userId == null) {
      userIds.isNull[row] = true;
    } else {
      userIds.vector[row] = userId;
    }

    // firstName: nullable string
    String firstName = (String) record.get("firstName");
    if (firstName == null) {
      firstNames.isNull[row] = true;
    } else {
      firstNames.setVal(row, StringUtils.encodeUtf8(firstName));
    }

    // lastName: written without a null check
    String lastName = (String) record.get("lastName");
    lastNames.setVal(row, StringUtils.encodeUtf8(lastName));

    // bids: nullable list; elements are appended to the shared child vector
    List<Integer> bids = (List<Integer>) record.get("bids");
    if (bids == null) {
      bidLists.isNull[row] = true;
    } else {
      bidLists.offsets[row] = bidLists.childCount;
      bidLists.lengths[row] = bids.size();
      for (int bid : bids) {
        bidElements.vector[bidLists.childCount++] = bid;
      }
    }

    // campaignInfo, cost, timestamp: written without a null check
    String campaignInfo = (String) record.get("campaignInfo");
    campaignInfos.setVal(row, StringUtils.encodeUtf8(campaignInfo));
    costs.vector[row] = (double) record.get("cost");
    timestamps.vector[row] = (long) record.get("timestamp");

    batch.size++;
  }

  orcWriter.addRowBatch(batch);
  batch.reset();
  orcWriter.close();
}

Java Code Examples for org.apache.orc.OrcFile#createWriter()