org.apache.orc.TypeDescription#createRowBatch

Source File: OrcFileAppender.java From iceberg with Apache License 2.0

6 votes

/**
 * Builds an appender that writes to {@code file} using an ORC schema derived from {@code schema}.
 *
 * <p>The constructor converts {@code schema} with {@code ORCSchemaUtil.convert}, allocates a row
 * batch of {@code batchSize} rows from the converted schema, and opens the ORC writer with UTC
 * timestamps enabled. When {@code file} is a {@code HadoopOutputFile}, its file system is handed
 * to the writer options explicitly.
 *
 * @param schema           schema that is converted to the ORC type description
 * @param file             destination of the ORC data
 * @param createWriterFunc factory passed to {@code newOrcValueWriter} together with the ORC schema
 * @param conf             configuration used to build the ORC writer options
 * @param metadata         key/value pairs forwarded to {@code newOrcWriter}
 * @param batchSize        size argument used when creating the row batch
 */
OrcFileAppender(Schema schema, OutputFile file,
                Function<TypeDescription, OrcValueWriter<?>> createWriterFunc,
                Configuration conf, Map<String, byte[]> metadata,
                int batchSize) {
  this.schema = schema;
  this.file = file;
  this.conf = conf;
  this.batchSize = batchSize;

  final TypeDescription orcType = ORCSchemaUtil.convert(schema);
  this.batch = orcType.createRowBatch(batchSize);

  final OrcFile.WriterOptions writerOptions =
      OrcFile.writerOptions(conf).useUTCTimestamp(true);
  if (file instanceof HadoopOutputFile) {
    // Reuse the file system the output file already carries.
    final HadoopOutputFile hadoopFile = (HadoopOutputFile) file;
    writerOptions.fileSystem(hadoopFile.getFileSystem());
  }
  writerOptions.setSchema(orcType);

  this.writer = newOrcWriter(file, writerOptions, metadata);
  this.valueWriter = newOrcValueWriter(orcType, createWriterFunc);
}

Source File: PentahoOrcRecordWriter.java From pentaho-hadoop-shims with Apache License 2.0

6 votes

/**
 * Creates a record writer for the given output fields and ORC schema, opening the target file.
 *
 * <p>Every field is passed to {@code setOutputMeta} along with a shared counter; the counter's
 * final value sizes the row data array. The ORC writer and its row batch are then created for
 * {@code filePath}.
 *
 * <p>NOTE(review): an {@link IOException} raised while opening the writer is only logged, so
 * {@code writer} and {@code batch} are not assigned by this constructor in that case.
 *
 * @param fields   output field definitions
 * @param schema   ORC schema used for the writer and the row batch
 * @param filePath destination path; passed through {@code S3NCredentialUtils} before use
 * @param conf     Hadoop configuration used for the writer options
 */
public PentahoOrcRecordWriter( List<? extends IOrcOutputField> fields, TypeDescription schema, String filePath,
                               Configuration conf ) {
  this.fields = fields;
  this.schema = schema;

  // Shared mutable counter handed to setOutputMeta for each field.
  final AtomicInteger fieldCounter = new AtomicInteger();
  for ( IOrcOutputField outputField : fields ) {
    setOutputMeta( fieldCounter, outputField );
  }
  final Object[] rowData = new Object[ fieldCounter.get() ];
  outputRowMetaAndData = new RowMetaAndData( outputRowMeta, rowData );

  try {
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( filePath, conf );
    final Path target = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( filePath ) );
    final OrcFile.WriterOptions writerOptions = OrcFile.writerOptions( conf ).setSchema( schema );
    writer = OrcFile.createWriter( target, writerOptions );
    batch = schema.createRowBatch();
  } catch ( IOException e ) {
    logger.error( e );
  }

  // Writing the additional per-field metadata is currently disabled:
  // new OrcMetaDataWriter( writer ).write( fields );
}

Source File: SqlInterpreterTest.java From zeppelin with Apache License 2.0

5 votes

/**
 * Writes {@code values} into a new snappy-compressed ORC file with schema
 * {@code struct<msg:int>} and returns that file.
 *
 * <p>The ORC writer is always closed, even when writing a batch fails, so the underlying
 * stream is not leaked on error.
 *
 * @param values the integers to store, one per row, in order
 * @return the temporary file that now holds the ORC data
 * @throws IOException if the temporary file cannot be created or the ORC data cannot be written
 */
public File createORCFile(int[] values) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".orc");
  // Only the unique name is needed; the placeholder is removed before the writer creates the
  // file itself (presumably because the ORC writer will not overwrite an existing file).
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();
  conf.set("orc.compress", "snappy");
  TypeDescription schema = TypeDescription.fromString("struct<msg:int>");
  Writer writer = OrcFile.createWriter(path,
          OrcFile.writerOptions(conf)
                  .setSchema(schema));
  try {
    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector x = (LongColumnVector) batch.cols[0];
    for (int value : values) {
      int row = batch.size++;
      x.vector[row] = value;
      // If the batch is full, write it out and start over.
      if (batch.size == batch.getMaxSize()) {
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    // Write whatever is left in the last, partially filled batch.
    if (batch.size != 0) {
      writer.addRowBatch(batch);
      batch.reset();
    }
  } finally {
    writer.close();
  }
  return file;
}

Source File: TestAvroToOrcRecordConverter.java From datacollector with Apache License 2.0

5 votes

/**
 * Converts a single Avro record (an int, a nested record, and a date-typed int) to ORC and
 * writes it to a temporary file. The written file is not read back yet.
 */
@Test
public void recordConversion() throws IOException {
  Path orcOutputPath = new Path(createTempFile());

  Schema avroSchema = new Schema.Parser().parse(
      "{\"type\": \"record\", \"name\": \"MyRecord\", \"fields\": [{\"name\": \"first\", \"type\": \"int\"},{" +
          "\"name\": \"second\", \"type\": {\"type\": \"record\", \"name\": \"MySubRecord\", \"fields\":" +
          " [{\"name\": \"sub1\", \"type\": \"string\"}, {\"name\": \"sub2\", \"type\": \"int\"}] } }, {\"name\":" +
          " \"somedate\", \"type\": { \"type\" : \"int\", \"logicalType\": \"date\"} } ]}"
  );

  TypeDescription orcType = AvroToOrcSchemaConverter.getOrcSchema(avroSchema);
  Writer writer = AvroToOrcRecordConverter.createOrcWriter(
      new Properties(), new Configuration(), orcOutputPath, orcType);

  // Build the nested "second" record first, then the enclosing record.
  GenericData.Record nested = new GenericData.Record(avroSchema.getField("second").schema());
  nested.put("sub1", new Utf8("value1"));
  nested.put("sub2", 42);

  GenericRecord topLevel = new GenericData.Record(avroSchema);
  topLevel.put("first", 1);
  topLevel.put("second", nested);
  topLevel.put("somedate", 17535);

  VectorizedRowBatch rowBatch = orcType.createRowBatch();
  AvroToOrcRecordConverter.addAvroRecord(rowBatch, topLevel, orcType, 1000, writer);

  // Flush the batch explicitly before closing the writer.
  writer.addRowBatch(rowBatch);
  rowBatch.reset();
  writer.close();

  // TODO: add code to read the ORC file and validate the contents
}

Source File: OrcNoHiveBulkWriterFactory.java From flink with Apache License 2.0

5 votes

/**
 * Creates a bulk writer that buffers rows in an ORC row batch and writes them to {@code out}.
 *
 * <p>Rows are copied column by column into the batch; the batch is handed to the ORC writer
 * whenever it reaches its maximum size, on {@code flush()}, and before the writer is closed in
 * {@code finish()}.
 *
 * @param out stream the ORC physical writer is built on
 * @return a writer accepting {@code RowData} elements
 * @throws IOException if the ORC writer cannot be created
 */
@Override
public BulkWriter<RowData> create(FSDataOutputStream out) throws IOException {
	final OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(new Properties(), conf);
	final TypeDescription orcType = TypeDescription.fromString(schema);
	writerOptions.setSchema(orcType);
	writerOptions.physicalWriter(new PhysicalWriterImpl(out, writerOptions));
	final WriterImpl orcWriter = new WriterImpl(null, new Path("."), writerOptions);

	final VectorizedRowBatch buffer = orcType.createRowBatch();
	return new BulkWriter<RowData>() {
		/** Hands the buffered rows to the ORC writer and empties the buffer. */
		private void writeBuffer() throws IOException {
			orcWriter.addRowBatch(buffer);
			buffer.reset();
		}

		@Override
		public void addElement(RowData element) throws IOException {
			final int targetRow = buffer.size++;
			for (int col = 0; col < element.getArity(); ++col) {
				setColumn(targetRow, buffer.cols[col], fieldTypes[col], element, col);
			}
			// A full buffer is written out immediately.
			if (buffer.size == buffer.getMaxSize()) {
				writeBuffer();
			}
		}

		@Override
		public void flush() throws IOException {
			if (buffer.size == 0) {
				return;
			}
			writeBuffer();
		}

		@Override
		public void finish() throws IOException {
			flush();
			orcWriter.close();
		}
	};
}

Source File: VectorizedRowBatchIterator.java From iceberg with Apache License 2.0

4 votes

/**
 * Creates an iterator over {@code rows}, allocating the row batch from {@code schema}.
 *
 * @param fileLocation location of the file being read, stored as given
 * @param schema       ORC type description used to create the row batch
 * @param rows         record reader stored for later use
 */
VectorizedRowBatchIterator(String fileLocation, TypeDescription schema, RecordReader rows) {
  this.batch = schema.createRowBatch();
  this.rows = rows;
  this.fileLocation = fileLocation;
}

Source File: OrcIterator.java From iceberg with Apache License 2.0

4 votes

/**
 * Creates an iterator over {@code rows}, allocating the row batch from {@code schema}.
 *
 * @param filename path of the file being read, stored as given
 * @param schema   ORC type description used to create the row batch
 * @param rows     record reader stored for later use
 */
OrcIterator(Path filename, TypeDescription schema, RecordReader rows) {
  this.batch = schema.createRowBatch();
  this.rows = rows;
  this.filename = filename;
}

Source File: SparkOrcWriter.java From iceberg with Apache License 2.0

4 votes

public SparkOrcWriter(OrcFileAppender writer) {
  TypeDescription schema = writer.getSchema();
  batch = schema.createRowBatch(BATCH_SIZE);
  this.writer = writer;
  converters = buildConverters(schema);
}

Source File: TestAvroToOrcRecordConverter.java From datacollector with Apache License 2.0

4 votes

/**
 * Writes two Avro records that exercise union-typed fields to an ORC file, then reads the file
 * back and checks that each resulting record matches the Avro record it came from.
 *
 * <p>The schema resource stream is opened in a try-with-resources block so it is closed once
 * the schema has been parsed.
 */
@Test
public void unionTypeConversions() throws IOException {
  final Path outputFilePath = new Path(createTempFile());

  final Schema.Parser schemaParser = new Schema.Parser();
  final Schema schema;
  // Fully qualified so this block does not depend on an extra import.
  try (java.io.InputStream schemaStream =
           TestAvroToOrcRecordConverter.class.getResourceAsStream("avro_union_types.json")) {
    schema = schemaParser.parse(schemaStream);
  }

  final TypeDescription orcSchema = AvroToOrcSchemaConverter.getOrcSchema(schema);

  final Writer orcWriter = AvroToOrcRecordConverter.createOrcWriter(
      new Properties(),
      new Configuration(),
      outputFilePath,
      orcSchema
  );

  // First record: every union field carries a non-null value.
  final GenericRecord avroRecord1 = new GenericData.Record(schema);
  avroRecord1.put("nullableInteger", 87);
  avroRecord1.put("integerOrString", "someString");
  avroRecord1.put("nullableStringOrInteger", "nonNullString");
  avroRecord1.put("justLong", 57844942331L);

  // Second record: the nullable unions are null and integerOrString holds an integer.
  final GenericRecord avroRecord2 = new GenericData.Record(schema);
  avroRecord2.put("nullableInteger", null);
  avroRecord2.put("integerOrString", 16);
  avroRecord2.put("nullableStringOrInteger", null);
  avroRecord2.put("justLong", 758934L);

  final VectorizedRowBatch batch = orcSchema.createRowBatch();

  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord1, orcSchema, 1000, orcWriter);
  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord2, orcSchema, 1000, orcWriter);
  orcWriter.addRowBatch(batch);
  batch.reset();
  orcWriter.close();

  try (OrcToSdcRecordConverter sdcRecordConverter = new OrcToSdcRecordConverter(outputFilePath)) {

    final Record record1 = RecordCreator.create();
    boolean populated = sdcRecordConverter.populateRecord(record1);
    assertThat(populated, equalTo(true));
    assertSdcRecordMatchesAvro(record1, avroRecord1, null);

    final Record record2 = RecordCreator.create();
    populated = sdcRecordConverter.populateRecord(record2);
    assertThat(populated, equalTo(true));
    // The null Avro values are checked with explicit null-value matchers.
    assertSdcRecordMatchesAvro(
        record2,
        avroRecord2,
        ImmutableMap.<String, Matcher<Field>>builder()
            .put("nullableInteger", Matchers.intFieldWithNullValue())
            .put("nullableStringOrInteger", Matchers.stringFieldWithNullValue())
            .build()
    );
  }
}

Source File: OrcNoHiveShim.java From flink with Apache License 2.0

4 votes

/**
 * Allocates a row batch from {@code schema} and wraps it in an {@code OrcNoHiveBatchWrapper}.
 *
 * @param schema    ORC type description the batch is created from
 * @param batchSize value passed to {@code schema.createRowBatch(int)}; presumably the maximum
 *                  number of rows per batch (confirm against the ORC API)
 * @return a new wrapper around the freshly created batch
 */
@Override
public OrcNoHiveBatchWrapper createBatchWrapper(TypeDescription schema, int batchSize) {
	return new OrcNoHiveBatchWrapper(schema.createRowBatch(batchSize));
}

Source File: OrcColumnarRowSplitReaderNoHiveTest.java From flink with Apache License 2.0

4 votes

/**
 * Writes an ORC file at {@code file} with five columns (float, double, timestamp, tinyint,
 * smallint) and {@code rowSize} rows: rows {@code 0 .. rowSize - 2} carry the row index as
 * their value, and the last row is marked null in every column.
 *
 * @param file    path of the ORC file to create
 * @param rowSize number of rows to write, which is also the size of the row batch
 * @throws IOException if the ORC writer fails
 */
@Override
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
	// NOTE: orc has field name information, so name should be same as orc
	TypeDescription schema = TypeDescription.fromString(
			"struct<f0:float,f1:double,f2:timestamp,f3:tinyint,f4:smallint>");

	org.apache.hadoop.fs.Path targetPath = new org.apache.hadoop.fs.Path(file);
	Configuration hadoopConf = new Configuration();
	Writer orcWriter = OrcFile.createWriter(
			targetPath, OrcFile.writerOptions(hadoopConf).setSchema(schema));

	VectorizedRowBatch rowBatch = schema.createRowBatch(rowSize);
	DoubleColumnVector floatCol = (DoubleColumnVector) rowBatch.cols[0];
	DoubleColumnVector doubleCol = (DoubleColumnVector) rowBatch.cols[1];
	TimestampColumnVector timestampCol = (TimestampColumnVector) rowBatch.cols[2];
	LongColumnVector tinyintCol = (LongColumnVector) rowBatch.cols[3];
	LongColumnVector smallintCol = (LongColumnVector) rowBatch.cols[4];

	// Every column contains a null (the last row), so none of them may claim noNulls.
	floatCol.noNulls = false;
	doubleCol.noNulls = false;
	timestampCol.noNulls = false;
	tinyintCol.noNulls = false;
	smallintCol.noNulls = false;

	final int nullRow = rowSize - 1;
	for (int row = 0; row < nullRow; row++) {
		floatCol.vector[row] = row;
		doubleCol.vector[row] = row;

		Timestamp ts = toTimestamp(row);
		timestampCol.time[row] = ts.getTime();
		timestampCol.nanos[row] = ts.getNanos();

		tinyintCol.vector[row] = row;
		smallintCol.vector[row] = row;
	}

	// The final row is null in all columns.
	floatCol.isNull[nullRow] = true;
	doubleCol.isNull[nullRow] = true;
	timestampCol.isNull[nullRow] = true;
	tinyintCol.isNull[nullRow] = true;
	smallintCol.isNull[nullRow] = true;

	rowBatch.size = rowSize;
	orcWriter.addRowBatch(rowBatch);
	rowBatch.reset();
	orcWriter.close();
}

Source File: OrcShimV200.java From flink with Apache License 2.0

4 votes

/**
 * Allocates a row batch from {@code schema} and wraps it in a {@code HiveOrcBatchWrapper}.
 *
 * @param schema    ORC type description the batch is created from
 * @param batchSize value passed to {@code schema.createRowBatch(int)}; presumably the maximum
 *                  number of rows per batch (confirm against the ORC API)
 * @return a new wrapper around the freshly created batch
 */
@Override
public HiveOrcBatchWrapper createBatchWrapper(TypeDescription schema, int batchSize) {
	return new HiveOrcBatchWrapper(schema.createRowBatch(batchSize));
}

Source File: OrcColumnarRowSplitReaderTest.java From flink with Apache License 2.0

4 votes

/**
 * Writes an ORC file at {@code file} with five columns (float, double, timestamp, tinyint,
 * smallint) and {@code rowSize} rows: rows {@code 0 .. rowSize - 2} carry the row index as
 * their value, and the last row is marked null in every column.
 *
 * @param file    path of the ORC file to create
 * @param rowSize number of rows to write, which is also the size of the row batch
 * @throws IOException if the ORC writer fails
 */
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
	// NOTE: orc has field name information, so name should be same as orc
	TypeDescription schema = TypeDescription.fromString(
			"struct<f0:float,f1:double,f2:timestamp,f3:tinyint,f4:smallint>");

	org.apache.hadoop.fs.Path targetPath = new org.apache.hadoop.fs.Path(file);
	Configuration hadoopConf = new Configuration();
	Writer orcWriter = OrcFile.createWriter(
			targetPath, OrcFile.writerOptions(hadoopConf).setSchema(schema));

	VectorizedRowBatch rowBatch = schema.createRowBatch(rowSize);
	DoubleColumnVector floatCol = (DoubleColumnVector) rowBatch.cols[0];
	DoubleColumnVector doubleCol = (DoubleColumnVector) rowBatch.cols[1];
	TimestampColumnVector timestampCol = (TimestampColumnVector) rowBatch.cols[2];
	LongColumnVector tinyintCol = (LongColumnVector) rowBatch.cols[3];
	LongColumnVector smallintCol = (LongColumnVector) rowBatch.cols[4];

	// Every column contains a null (the last row), so none of them may claim noNulls.
	floatCol.noNulls = false;
	doubleCol.noNulls = false;
	timestampCol.noNulls = false;
	tinyintCol.noNulls = false;
	smallintCol.noNulls = false;

	final int nullRow = rowSize - 1;
	for (int row = 0; row < nullRow; row++) {
		floatCol.vector[row] = row;
		doubleCol.vector[row] = row;

		Timestamp ts = toTimestamp(row);
		timestampCol.time[row] = ts.getTime();
		timestampCol.nanos[row] = ts.getNanos();

		tinyintCol.vector[row] = row;
		smallintCol.vector[row] = row;
	}

	// The final row is null in all columns.
	floatCol.isNull[nullRow] = true;
	doubleCol.isNull[nullRow] = true;
	timestampCol.isNull[nullRow] = true;
	tinyintCol.isNull[nullRow] = true;
	smallintCol.isNull[nullRow] = true;

	rowBatch.size = rowSize;
	orcWriter.addRowBatch(rowBatch);
	rowBatch.reset();
	orcWriter.close();
}

Source File: ORCRecordExtractorTest.java From incubator-pinot with Apache License 2.0

4 votes

/**
 * Create an ORC input file using the input records.
 *
 * <p>All records are written as a single row batch sized to the number of input records. A null
 * {@code user_id}, {@code firstName} or {@code bids} value is stored as an ORC null; the other
 * fields are written unconditionally. The element vector behind the {@code bids} list column is
 * grown on demand, so the total number of bid values across all records is not limited by a
 * fixed pre-allocation.
 *
 * @throws IOException if the ORC writer fails
 */
@Override
protected void createInputFile()
    throws IOException {
  TypeDescription schema = TypeDescription.fromString(
      "struct<user_id:int,firstName:string,lastName:string,bids:array<int>,campaignInfo:string,cost:double,timestamp:bigint>");
  Writer writer = OrcFile.createWriter(new Path(_dataFile.getAbsolutePath()),
      OrcFile.writerOptions(new Configuration()).setSchema(schema));

  int numRecords = _inputRecords.size();
  VectorizedRowBatch rowBatch = schema.createRowBatch(numRecords);
  LongColumnVector userIdVector = (LongColumnVector) rowBatch.cols[0];
  userIdVector.noNulls = false;
  BytesColumnVector firstNameVector = (BytesColumnVector) rowBatch.cols[1];
  firstNameVector.noNulls = false;
  BytesColumnVector lastNameVector = (BytesColumnVector) rowBatch.cols[2];
  ListColumnVector bidsVector = (ListColumnVector) rowBatch.cols[3];
  bidsVector.noNulls = false;
  // Flattened storage for the elements of every row's bids list.
  LongColumnVector bidsElementVector = (LongColumnVector) bidsVector.child;
  BytesColumnVector campaignInfoVector = (BytesColumnVector) rowBatch.cols[4];
  DoubleColumnVector costVector = (DoubleColumnVector) rowBatch.cols[5];
  LongColumnVector timestampVector = (LongColumnVector) rowBatch.cols[6];

  for (int i = 0; i < numRecords; i++) {
    Map<String, Object> record = _inputRecords.get(i);

    Integer userId = (Integer) record.get("user_id");
    if (userId != null) {
      userIdVector.vector[i] = userId;
    } else {
      userIdVector.isNull[i] = true;
    }
    String firstName = (String) record.get("firstName");
    if (firstName != null) {
      firstNameVector.setVal(i, StringUtils.encodeUtf8(firstName));
    } else {
      firstNameVector.isNull[i] = true;
    }
    lastNameVector.setVal(i, StringUtils.encodeUtf8((String) record.get("lastName")));
    List<Integer> bids = (List<Integer>) record.get("bids");
    if (bids != null) {
      // Make room for this row's elements while keeping the ones already written.
      bidsElementVector.ensureSize(bidsVector.childCount + bids.size(), true);
      bidsVector.offsets[i] = bidsVector.childCount;
      bidsVector.lengths[i] = bids.size();
      for (int bid : bids) {
        bidsElementVector.vector[bidsVector.childCount++] = bid;
      }
    } else {
      bidsVector.isNull[i] = true;
    }
    campaignInfoVector.setVal(i, StringUtils.encodeUtf8((String) record.get("campaignInfo")));
    costVector.vector[i] = (double) record.get("cost");
    timestampVector.vector[i] = (long) record.get("timestamp");

    rowBatch.size++;
  }

  writer.addRowBatch(rowBatch);
  rowBatch.reset();
  writer.close();
}

Java Code Examples for org.apache.orc.TypeDescription#createRowBatch()