Java Code Examples for org.apache.orc.OrcFile#createWriter()
The following examples show how to use
org.apache.orc.OrcFile#createWriter().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: OrcFileAppender.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Opens an ORC writer at the location of {@code file} and records user metadata on it.
 *
 * <p>The schema is converted with {@code TypeConversion.toOrc(schema, columnIds)} and set on
 * {@code options} before the writer is created. Afterwards the serialized {@code columnIds}
 * and every entry of {@code metadata} are added to the writer as user metadata.
 *
 * @param schema   schema to convert to its ORC form
 * @param file     output file whose location is used as the ORC path
 * @param options  ORC writer options; {@code setSchema} is called on this instance
 * @param metadata key/value pairs to store as ORC user metadata
 * @throws RuntimeException if creating the ORC writer fails with an {@link IOException}
 */
OrcFileAppender(Schema schema, OutputFile file, OrcFile.WriterOptions options, Map<String, byte[]> metadata) {
  orcSchema = TypeConversion.toOrc(schema, columnIds);
  options.setSchema(orcSchema);
  path = new Path(file.location());

  try {
    writer = OrcFile.createWriter(path, options);
  } catch (IOException e) {
    throw new RuntimeException("Can't create file " + path, e);
  }

  writer.addUserMetadata(COLUMN_NUMBERS_ATTRIBUTE, columnIds.serialize());
  for (Map.Entry<String, byte[]> entry : metadata.entrySet()) {
    writer.addUserMetadata(entry.getKey(), ByteBuffer.wrap(entry.getValue()));
  }
}
Example 2
Source File: JsonORCFileReaderWriterFactory.java From secor with Apache License 2.0 | 6 votes |
/**
 * Creates an ORC writer for the file named by {@code logFilePath}.
 *
 * <p>The schema is looked up from {@code schemaProvider} by topic; one JSON converter is
 * created per top-level field of that schema, and a row batch is allocated from it.
 *
 * @param logFilePath identifies the output file and the topic used for the schema lookup
 * @param codec       compression codec, mapped through {@code resolveCompression}
 * @throws IllegalArgumentException if no schema is available for the topic
 * @throws IOException              if the ORC writer cannot be created
 */
public JsonORCFileWriter(LogFilePath logFilePath, CompressionCodec codec) throws IOException {
  Configuration conf = new Configuration();
  Path path = new Path(logFilePath.getLogFilePath());

  schema = schemaProvider.getSchema(logFilePath.getTopic(), logFilePath);
  if (schema == null) {
    throw new IllegalArgumentException(
        String.format("No schema is provided for topic '%s'", logFilePath.getTopic()));
  }

  // One converter per top-level field, in schema order.
  List<TypeDescription> fieldTypes = schema.getChildren();
  converters = new JsonConverter[fieldTypes.size()];
  int slot = 0;
  for (TypeDescription fieldType : fieldTypes) {
    converters[slot++] = VectorColumnFiller.createConverter(fieldType);
  }

  OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf)
      .compress(resolveCompression(codec))
      .setSchema(schema);
  writer = OrcFile.createWriter(path, writerOptions);
  batch = schema.createRowBatch();
}
Example 3
Source File: PentahoOrcRecordWriter.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
/**
 * Creates a record writer that writes rows to an ORC file at {@code filePath}.
 *
 * <p>Each output field is registered through {@code setOutputMeta}, which receives a shared
 * counter; the counter's final value sizes the row data array. The ORC writer and row batch
 * are then created from {@code schema}.
 *
 * @param fields   output field definitions
 * @param schema   ORC schema of the file to write
 * @param filePath destination path; it is passed through {@code S3NCredentialUtils} before use
 * @param conf     Hadoop configuration used for the writer options
 * @throws IllegalStateException if the ORC writer cannot be created; previously the failure
 *                               was only logged, leaving {@code writer} and {@code batch}
 *                               null so the first use failed with a NullPointerException
 */
public PentahoOrcRecordWriter( List<? extends IOrcOutputField> fields, TypeDescription schema, String filePath,
                               Configuration conf ) {
  this.fields = fields;
  this.schema = schema;

  // AtomicInteger serves as a mutable counter that the lambda below can update.
  final AtomicInteger fieldNumber = new AtomicInteger();
  fields.forEach( field -> setOutputMeta( fieldNumber, field ) );
  outputRowMetaAndData = new RowMetaAndData( outputRowMeta, new Object[ fieldNumber.get() ] );

  try {
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( filePath, conf );
    Path outputFile = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( filePath ) );
    writer = OrcFile.createWriter( outputFile, OrcFile.writerOptions( conf ).setSchema( schema ) );
    batch = schema.createRowBatch();
  } catch ( IOException e ) {
    logger.error( e );
    // Fail fast instead of continuing with a half-initialized writer. The path is left out
    // of the message on purpose: it is scrubbed above, so it may carry credentials.
    throw new IllegalStateException( "Unable to create ORC writer", e );
  }

  //Write the addition metadata for the fields
  // new OrcMetaDataWriter( writer ).write( fields );
}
Example 4
Source File: OrcFileAppender.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Creates an ORC writer at the location of {@code file} and attaches user metadata to it.
 *
 * @param file     output file whose location is used as the ORC path
 * @param options  ORC writer options passed to {@code OrcFile.createWriter}
 * @param metadata key/value pairs added to the writer as user metadata
 * @return the newly created writer
 * @throws RuntimeIOException if the writer cannot be created
 */
private static Writer newOrcWriter(OutputFile file, OrcFile.WriterOptions options, Map<String, byte[]> metadata) {
  final Path locPath = new Path(file.location());

  final Writer writer;
  try {
    writer = OrcFile.createWriter(locPath, options);
  } catch (IOException ioe) {
    // Pass the path as a format argument rather than concatenating it into the format
    // string, so a '%' in the path cannot break message formatting and mask the real error.
    throw new RuntimeIOException(ioe, "Can't create file %s", locPath);
  }

  metadata.forEach((key, value) -> writer.addUserMetadata(key, ByteBuffer.wrap(value)));
  return writer;
}
Example 5
Source File: OrcWriter.java From osm2orc with ISC License | 5 votes |
@Override public void initialize(Map<String, Object> metaData) { try { Configuration conf = new Configuration(); // conf.set(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(), "tags"); processor = new OrcEntityProcessor(OrcFile.createWriter(new Path(filename), OrcFile.writerOptions(conf).setSchema(SCHEMA)), SCHEMA.createRowBatch()); } catch (IOException e) { throw new OsmosisRuntimeException(e); } }
Example 6
Source File: SqlInterpreterTest.java From zeppelin with Apache License 2.0 | 5 votes |
public File createORCFile(int[] values) throws IOException { File file = File.createTempFile("zeppelin-flink-input", ".orc"); file.delete(); Path path = new Path(file.getAbsolutePath()); Configuration conf = new Configuration(); conf.set("orc.compress", "snappy"); TypeDescription schema = TypeDescription.fromString("struct<msg:int>"); Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf) .setSchema(schema)); VectorizedRowBatch batch = schema.createRowBatch(); LongColumnVector x = (LongColumnVector) batch.cols[0]; for (int i = 0; i < values.length; ++i) { int row = batch.size++; x.vector[row] = values[i]; // If the batch is full, write it out and start over. if (batch.size == batch.getMaxSize()) { writer.addRowBatch(batch); batch.reset(); } } if (batch.size != 0) { writer.addRowBatch(batch); batch.reset(); } writer.close(); return file; }
Example 7
Source File: AvroToOrcRecordConverter.java From datacollector with Apache License 2.0 | 5 votes |
/**
 * Creates an ORC writer for {@code orcOutputFile} using the given schema.
 *
 * @param orcWriterProperties properties passed to {@code OrcFile.writerOptions}
 * @param configuration       Hadoop configuration passed to {@code OrcFile.writerOptions}
 * @param orcOutputFile       path of the ORC file to create
 * @param orcSchema           schema set on the writer options
 * @return the newly created writer
 * @throws IOException if the writer cannot be created
 */
public static Writer createOrcWriter(Properties orcWriterProperties, Configuration configuration,
    Path orcOutputFile, TypeDescription orcSchema) throws IOException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("Creating ORC writer at: {}", orcOutputFile.toString());
  }
  final OrcFile.WriterOptions writerOptions =
      OrcFile.writerOptions(orcWriterProperties, configuration).setSchema(orcSchema);
  return OrcFile.createWriter(orcOutputFile, writerOptions);
}
Example 8
Source File: OrcKeyCompactorOutputFormat.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Overridden because the super method hard-codes the file extension as ".orc". Here the
 * extension is read from the configuration key {@code COMPACTION_OUTPUT_EXTENSION},
 * defaulting to "orc".
 *
 * @param taskAttemptContext The source of configuration that determines the file extension
 * @return The {@link RecordWriter} that writes out Orc objects.
 * @throws IOException if the ORC writer cannot be created
 */
@Override
public RecordWriter getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException {
  final Configuration jobConf = taskAttemptContext.getConfiguration();
  final Path workFile =
      getDefaultWorkFile(taskAttemptContext, "." + jobConf.get(COMPACTION_OUTPUT_EXTENSION, "orc"));
  final Writer orcWriter =
      OrcFile.createWriter(workFile, org.apache.orc.mapred.OrcOutputFormat.buildOptions(jobConf));
  return new OrcMapreduceRecordWriter(orcWriter);
}
Example 9
Source File: OrcCompactionTaskTest.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Writes the given ORC structs to a new ORC file at {@code path}.
 *
 * @param path       destination of the ORC file
 * @param schema     schema set on the writer options
 * @param orcStructs records to write, in iteration order
 * @throws Exception if creating the writer, writing a record, or closing fails
 */
public void writeOrcRecordsInFile(Path path, TypeDescription schema, List<OrcStruct> orcStructs) throws Exception {
  final Configuration conf = new Configuration();
  final Writer orcWriter = OrcFile.createWriter(path, OrcFile.writerOptions(conf).setSchema(schema));
  final OrcMapreduceRecordWriter sink = new OrcMapreduceRecordWriter(orcWriter);

  // Every record is written under the same NullWritable key.
  final NullWritable key = NullWritable.get();
  for (OrcStruct struct : orcStructs) {
    sink.write(key, struct);
  }

  sink.close(new TaskAttemptContextImpl(conf, new TaskAttemptID()));
}
Example 10
Source File: OrcColumnarRowSplitReaderNoHiveTest.java From flink with Apache License 2.0 | 4 votes |
@Override protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException { // NOTE: orc has field name information, so name should be same as orc TypeDescription schema = TypeDescription.fromString( "struct<" + "f0:float," + "f1:double," + "f2:timestamp," + "f3:tinyint," + "f4:smallint" + ">"); org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file); Configuration conf = new Configuration(); Writer writer = OrcFile.createWriter(filePath, OrcFile.writerOptions(conf).setSchema(schema)); VectorizedRowBatch batch = schema.createRowBatch(rowSize); DoubleColumnVector col0 = (DoubleColumnVector) batch.cols[0]; DoubleColumnVector col1 = (DoubleColumnVector) batch.cols[1]; TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[2]; LongColumnVector col3 = (LongColumnVector) batch.cols[3]; LongColumnVector col4 = (LongColumnVector) batch.cols[4]; col0.noNulls = false; col1.noNulls = false; col2.noNulls = false; col3.noNulls = false; col4.noNulls = false; for (int i = 0; i < rowSize - 1; i++) { col0.vector[i] = i; col1.vector[i] = i; Timestamp timestamp = toTimestamp(i); col2.time[i] = timestamp.getTime(); col2.nanos[i] = timestamp.getNanos(); col3.vector[i] = i; col4.vector[i] = i; } col0.isNull[rowSize - 1] = true; col1.isNull[rowSize - 1] = true; col2.isNull[rowSize - 1] = true; col3.isNull[rowSize - 1] = true; col4.isNull[rowSize - 1] = true; batch.size = rowSize; writer.addRowBatch(batch); batch.reset(); writer.close(); }
Example 11
Source File: OrcColumnarRowSplitReaderTest.java From flink with Apache License 2.0 | 4 votes |
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException { // NOTE: orc has field name information, so name should be same as orc TypeDescription schema = TypeDescription.fromString( "struct<" + "f0:float," + "f1:double," + "f2:timestamp," + "f3:tinyint," + "f4:smallint" + ">"); org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file); Configuration conf = new Configuration(); Writer writer = OrcFile.createWriter(filePath, OrcFile.writerOptions(conf).setSchema(schema)); VectorizedRowBatch batch = schema.createRowBatch(rowSize); DoubleColumnVector col0 = (DoubleColumnVector) batch.cols[0]; DoubleColumnVector col1 = (DoubleColumnVector) batch.cols[1]; TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[2]; LongColumnVector col3 = (LongColumnVector) batch.cols[3]; LongColumnVector col4 = (LongColumnVector) batch.cols[4]; col0.noNulls = false; col1.noNulls = false; col2.noNulls = false; col3.noNulls = false; col4.noNulls = false; for (int i = 0; i < rowSize - 1; i++) { col0.vector[i] = i; col1.vector[i] = i; Timestamp timestamp = toTimestamp(i); col2.time[i] = timestamp.getTime(); col2.nanos[i] = timestamp.getNanos(); col3.vector[i] = i; col4.vector[i] = i; } col0.isNull[rowSize - 1] = true; col1.isNull[rowSize - 1] = true; col2.isNull[rowSize - 1] = true; col3.isNull[rowSize - 1] = true; col4.isNull[rowSize - 1] = true; batch.size = rowSize; writer.addRowBatch(batch); batch.reset(); writer.close(); }
Example 12
Source File: ORCRecordExtractorTest.java From incubator-pinot with Apache License 2.0 | 4 votes |
/**
 * Create an ORC input file using the input records.
 *
 * <p>All records in {@code _inputRecords} are written as a single row batch. The
 * {@code user_id}, {@code firstName} and {@code bids} columns may be null; the remaining
 * columns are read from each record without a null check.
 *
 * @throws IOException if the ORC file cannot be written
 */
@Override
protected void createInputFile() throws IOException {
  TypeDescription schema = TypeDescription.fromString(
      "struct<user_id:int,firstName:string,lastName:string,bids:array<int>,campaignInfo:string,cost:double,timestamp:bigint>");
  Writer writer = OrcFile.createWriter(new Path(_dataFile.getAbsolutePath()),
      OrcFile.writerOptions(new Configuration()).setSchema(schema));

  int numRecords = _inputRecords.size();
  VectorizedRowBatch rowBatch = schema.createRowBatch(numRecords);

  LongColumnVector userIdVector = (LongColumnVector) rowBatch.cols[0];
  userIdVector.noNulls = false;
  BytesColumnVector firstNameVector = (BytesColumnVector) rowBatch.cols[1];
  firstNameVector.noNulls = false;
  BytesColumnVector lastNameVector = (BytesColumnVector) rowBatch.cols[2];
  ListColumnVector bidsVector = (ListColumnVector) rowBatch.cols[3];
  bidsVector.noNulls = false;
  LongColumnVector bidsElementVector = (LongColumnVector) bidsVector.child;
  BytesColumnVector campaignInfoVector = (BytesColumnVector) rowBatch.cols[4];
  DoubleColumnVector costVector = (DoubleColumnVector) rowBatch.cols[5];
  LongColumnVector timestampVector = (LongColumnVector) rowBatch.cols[6];

  for (int i = 0; i < numRecords; i++) {
    Map<String, Object> record = _inputRecords.get(i);

    Integer userId = (Integer) record.get("user_id");
    if (userId != null) {
      userIdVector.vector[i] = userId;
    } else {
      userIdVector.isNull[i] = true;
    }

    String firstName = (String) record.get("firstName");
    if (firstName != null) {
      firstNameVector.setVal(i, StringUtils.encodeUtf8(firstName));
    } else {
      firstNameVector.isNull[i] = true;
    }

    lastNameVector.setVal(i, StringUtils.encodeUtf8((String) record.get("lastName")));

    @SuppressWarnings("unchecked")
    List<Integer> bids = (List<Integer>) record.get("bids");
    if (bids != null) {
      bidsVector.offsets[i] = bidsVector.childCount;
      bidsVector.lengths[i] = bids.size();
      // Grow the child vector on demand, keeping the elements already written, instead of
      // relying on a fixed capacity that overflows once the records hold more bids.
      bidsElementVector.ensureSize(bidsVector.childCount + bids.size(), true);
      for (int bid : bids) {
        bidsElementVector.vector[bidsVector.childCount++] = bid;
      }
    } else {
      bidsVector.isNull[i] = true;
    }

    campaignInfoVector.setVal(i, StringUtils.encodeUtf8((String) record.get("campaignInfo")));
    costVector.vector[i] = (double) record.get("cost");
    timestampVector.vector[i] = (long) record.get("timestamp");

    rowBatch.size++;
  }

  writer.addRowBatch(rowBatch);
  rowBatch.reset();
  writer.close();
}