org.apache.orc.OrcFile Java Examples

Source File: PentahoOrcRecordWriter.java From pentaho-hadoop-shims with Apache License 2.0

6 votes

/**
 * Builds a record writer for the given output fields and ORC schema.
 *
 * <p>Each field is registered through {@code setOutputMeta}, the reusable output row is sized
 * from the resulting field counter, and an ORC writer plus row batch are created for
 * {@code filePath}. If creating the writer raises an {@link IOException}, the error is only
 * logged and construction completes; {@code writer} and {@code batch} are not assigned in
 * that case.
 *
 * @param fields   output field definitions
 * @param schema   ORC schema used for the writer and the row batch
 * @param filePath destination path; passed through the S3N credential/scrub helpers first
 * @param conf     Hadoop configuration handed to the S3N helper and the ORC writer options
 */
public PentahoOrcRecordWriter( List<? extends IOrcOutputField> fields, TypeDescription schema, String filePath,
                               Configuration conf ) {
  this.fields = fields;
  this.schema = schema;

  // Mutable counter shared across all setOutputMeta calls.
  final AtomicInteger fieldNumber = new AtomicInteger();
  for ( IOrcOutputField outputField : fields ) {
    setOutputMeta( fieldNumber, outputField );
  }
  outputRowMetaAndData = new RowMetaAndData( outputRowMeta, new Object[ fieldNumber.get() ] );

  try {
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( filePath, conf );
    String scrubbedPath = S3NCredentialUtils.scrubFilePathIfNecessary( filePath );
    Path outputFile = new Path( scrubbedPath );
    OrcFile.WriterOptions writerOptions = OrcFile.writerOptions( conf ).setSchema( schema );
    writer = OrcFile.createWriter( outputFile, writerOptions );
    batch = schema.createRowBatch();
  } catch ( IOException e ) {
    // Failure is logged only; the constructor still returns normally.
    logger.error( e );
  }

  //Write the addition metadata for the fields
  // new OrcMetaDataWriter( writer ).write( fields );
}

Source File: PhysicalWriterImpl.java From flink with Apache License 2.0

6 votes

/**
 * Creates a physical writer that emits to the given output stream using the supplied
 * ORC writer options.
 *
 * @param out  stream that all data is written to
 * @param opts writer options providing buffer, block, padding and compression settings
 * @throws IOException declared by the constructor signature
 */
public PhysicalWriterImpl(FSDataOutputStream out, OrcFile.WriterOptions opts) throws IOException {
	// Use the configured buffer size verbatim when enforcement is requested; otherwise
	// estimate it from the stripe size, the column count (max id + 1) and the configured size.
	this.bufferSize = opts.isEnforceBufferSize()
		? opts.getBufferSize()
		: getEstimatedBufferSize(
			opts.getStripeSize(), opts.getSchema().getMaximumId() + 1, opts.getBufferSize());

	// Output target and block layout.
	this.out = out;
	this.blockOffset = 0;
	this.blockSize = opts.getBlockSize();
	this.addBlockPadding = opts.getBlockPadding();
	this.writeVariableLengthBlocks = opts.getWriteVariableLengthBlocks();
	this.maxPadding = (int) (opts.getPaddingTolerance() * (double) opts.getBufferSize());
	this.shims = opts.getHadoopShims();

	// Compression: the codec is looked up from the pool for the configured kind.
	this.compress = opts.getCompress();
	this.codec = OrcCodecPool.getCodec(this.compress);

	// Stream bookkeeping; the "metadata" stream writes directly to the output and backs
	// the protobuf writer.
	this.streams  = new TreeMap<>();
	this.writer = new OutStream("metadata", this.bufferSize, this.codec, new DirectStream(this.out));
	this.protobufWriter = CodedOutputStream.newInstance(this.writer);
}

Source File: OrcFileSystemITCase.java From flink with Apache License 2.0

6 votes

/**
 * Runs the inherited non-partition test, then opens the first visible result file with an
 * ORC reader and checks its compression kind: "SNAPPY" when {@code configure} is set,
 * "ZLIB" otherwise.
 */
@Override
public void testNonPartition() {
	super.testNonPartition();

	// test configure success
	File resultDir = new File(URI.create(resultPath()).getPath());
	// Ignore entries whose names start with '.' or '_'.
	File[] dataFiles = resultDir.listFiles(
			(dir, name) -> !(name.startsWith(".") || name.startsWith("_")));
	Assert.assertNotNull(dataFiles);
	Path firstFile = new Path(URI.create(dataFiles[0].getAbsolutePath()));

	try {
		Reader orcReader = OrcFile.createReader(firstFile, OrcFile.readerOptions(new Configuration()));
		String expectedCompression = configure ? "SNAPPY" : "ZLIB";
		Assert.assertEquals(expectedCompression, orcReader.getCompressionKind().toString());
	} catch (IOException e) {
		throw new RuntimeException(e);
	}
}

Source File: OrcBulkWriterTestUtil.java From flink with Apache License 2.0

6 votes

/**
 * Validates the ORC part files written below {@code files}.
 *
 * <p>Exactly one bucket directory is expected. Every part file inside it must be non-empty,
 * contain 3 rows with 2 top-level fields, be LZ4-compressed, carry the
 * {@code USER_METADATA_KEY} user metadata entry, and decode to {@code expected}.
 *
 * @param files    directory containing the bucket directory
 * @param expected records each part file is expected to contain
 * @throws IOException if a part file cannot be opened or read
 */
public static void validate(File files, List<Record> expected) throws IOException {
	final File[] buckets = files.listFiles();
	assertNotNull(buckets);
	assertEquals(1, buckets.length);

	final File[] partFiles = buckets[0].listFiles();
	assertNotNull(partFiles);

	for (File partFile : partFiles) {
		assertTrue(partFile.length() > 0);

		OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(new Configuration());
		Reader reader = OrcFile.createReader(new org.apache.hadoop.fs.Path(partFile.toURI()), readerOptions);

		assertEquals(3, reader.getNumberOfRows());
		assertEquals(2, reader.getSchema().getFieldNames().size());
		// Expected value goes first so a failure message labels the two sides correctly.
		assertSame(CompressionKind.LZ4, reader.getCompressionKind());
		assertTrue(reader.hasMetadataValue(USER_METADATA_KEY));
		assertTrue(reader.getMetadataKeys().contains(USER_METADATA_KEY));

		List<Record> results = getResults(reader);

		assertEquals(3, results.size());
		// Expected value goes first, matching the other assertions in this method.
		assertEquals(expected, results);
	}
}

Source File: ORC.java From iceberg with Apache License 2.0

6 votes

/**
 * Opens the configured file with an ORC reader and returns an iterator over its rows.
 *
 * <p>The schema is converted to an ORC type description, the optional
 * {@code start}/{@code length} range is applied when {@code start} is set, and the
 * converted schema is used as the read schema.
 *
 * @return an iterator over the rows selected by the configured options
 * @throws RuntimeException wrapping the {@link IOException} if the file cannot be opened
 */
public OrcIterator build() {
  Preconditions.checkNotNull(schema, "Schema is required");
  try {
    final Path orcPath = new Path(file.location());
    final Reader orcReader = OrcFile.createReader(orcPath, OrcFile.readerOptions(conf));

    // Column ids produced by the conversion are collected in a fresh map.
    final ColumnIdMap idMap = new ColumnIdMap();
    final TypeDescription readSchema = TypeConversion.toOrc(schema, idMap);

    final Reader.Options rowOptions = orcReader.options();
    if (start != null) {
      rowOptions.range(start, length);
    }
    rowOptions.schema(readSchema);

    return new OrcIterator(orcPath, readSchema, orcReader.rows(rowOptions));
  } catch (IOException e) {
    throw new RuntimeException("Can't open " + file.location(), e);
  }
}

Source File: OrcFileAppender.java From iceberg with Apache License 2.0

6 votes

/**
 * Creates an appender that writes to {@code file} with the given ORC writer options.
 *
 * <p>The schema is converted to an ORC schema (populating {@code columnIds}) and set on
 * {@code options}; the serialized column ids and every entry of {@code metadata} are then
 * stored as user metadata on the writer.
 *
 * @param schema   schema to convert to an ORC schema
 * @param file     output file whose location becomes the ORC path
 * @param options  ORC writer options; the converted schema is set on this instance
 * @param metadata extra user metadata, keyed by name
 * @throws RuntimeException wrapping the {@link IOException} if the writer cannot be created
 */
OrcFileAppender(Schema schema,
                OutputFile file,
                OrcFile.WriterOptions options,
                Map<String,byte[]> metadata) {
  orcSchema = TypeConversion.toOrc(schema, columnIds);
  options.setSchema(orcSchema);
  path = new Path(file.location());
  try {
    writer = OrcFile.createWriter(path, options);
  } catch (IOException e) {
    throw new RuntimeException("Can't create file " + path, e);
  }

  writer.addUserMetadata(COLUMN_NUMBERS_ATTRIBUTE, columnIds.serialize());
  for (Map.Entry<String, byte[]> attribute : metadata.entrySet()) {
    writer.addUserMetadata(attribute.getKey(), ByteBuffer.wrap(attribute.getValue()));
  }
}

Source File: OrcCompactionTaskTest.java From incubator-gobblin with Apache License 2.0

6 votes

/**
 * Reads a compacted ORC output file into memory.
 * This only works if the fields are int values.
 *
 * <p>Each record is converted into a freshly created container struct via
 * {@code OrcUtils.upConvertOrcStruct}. The record reader is closed before returning,
 * including when reading fails.
 *
 * @param orcFilePath path of the ORC file to read
 * @return all records of the file, in read order
 * @throws IOException if the file cannot be opened or read
 * @throws InterruptedException if reading is interrupted
 */
public List<OrcStruct> readOrcFile(Path orcFilePath)
    throws IOException, InterruptedException {
  ReaderImpl orcReader = new ReaderImpl(orcFilePath, new OrcFile.ReaderOptions(new Configuration()));

  Reader.Options options = new Reader.Options().schema(orcReader.getSchema());
  List<OrcStruct> result = new ArrayList<>();

  // try-with-resources releases the underlying row reader even if a read fails.
  try (OrcMapreduceRecordReader recordReader = new OrcMapreduceRecordReader(orcReader, options)) {
    while (recordReader.nextKeyValue()) {
      // A new container per record, so stored results do not share one struct instance.
      OrcStruct recordContainer = (OrcStruct) OrcUtils.createValueRecursively(orcReader.getSchema());
      OrcUtils.upConvertOrcStruct((OrcStruct) recordReader.getCurrentValue(), recordContainer, orcReader.getSchema());
      result.add(recordContainer);
    }
  }

  return result;
}

Source File: TestMetricsRowGroupFilterTypes.java From iceberg with Apache License 2.0

6 votes

/**
 * Writes {@code records} to {@code ORC_FILE} and asserts the result has exactly one stripe.
 *
 * <p>An existing file is deleted first, and the file is registered for deletion on JVM exit.
 *
 * @param records records to append to the ORC file
 * @throws IOException if writing or re-reading the file fails
 */
public void createOrcInputFile(List<Record> records) throws IOException {
  // Remove any existing file before writing.
  if (ORC_FILE.exists()) {
    Assert.assertTrue(ORC_FILE.delete());
  }

  OutputFile target = Files.localOutput(ORC_FILE);
  try (FileAppender<Record> orcAppender = ORC.write(target)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    orcAppender.addAll(records);
  }

  // Re-open the written file to check its stripe layout.
  InputFile written = Files.localInput(ORC_FILE);
  Path writtenPath = new Path(written.location());
  try (Reader orcReader = OrcFile.createReader(writtenPath, OrcFile.readerOptions(new Configuration()))) {
    int stripeCount = orcReader.getStripes().size();
    Assert.assertEquals("Should create only one stripe", 1, stripeCount);
  }

  ORC_FILE.deleteOnExit();
}

Source File: JsonORCFileReaderWriterFactory.java From secor with Apache License 2.0

6 votes

/**
 * Creates an ORC writer for the given log file path.
 *
 * <p>The schema is looked up from {@code schemaProvider} by topic; one JSON converter is
 * created per top-level field, and the writer is opened with the compression resolved
 * from {@code codec}.
 *
 * @param logFilePath log file location and topic
 * @param codec       compression codec, translated through {@code resolveCompression}
 * @throws IOException if the ORC writer cannot be created
 * @throws IllegalArgumentException if no schema is available for the topic
 */
public JsonORCFileWriter(LogFilePath logFilePath, CompressionCodec codec)
        throws IOException {
    Configuration hadoopConf = new Configuration();
    Path orcPath = new Path(logFilePath.getLogFilePath());

    schema = schemaProvider.getSchema(logFilePath.getTopic(),
            logFilePath);
    if (schema == null) {
        throw new IllegalArgumentException(
            String.format("No schema is provided for topic '%s'", logFilePath.getTopic()));
    }

    // One converter per top-level field, in schema order.
    List<TypeDescription> fieldTypes = schema.getChildren();
    converters = new JsonConverter[fieldTypes.size()];
    int slot = 0;
    while (slot < converters.length) {
        converters[slot] = VectorColumnFiller.createConverter(fieldTypes.get(slot));
        slot++;
    }

    OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(hadoopConf)
            .compress(resolveCompression(codec))
            .setSchema(schema);
    writer = OrcFile.createWriter(orcPath, writerOptions);
    batch = schema.createRowBatch();
}

Source File: PentahoOrcRecordReader.java From pentaho-hadoop-shims with Apache License 2.0

6 votes

/**
 * Opens an ORC reader for {@code fileName}.
 *
 * <p>If the path is a directory, the first entry whose name ends with ".orc" is read instead.
 * Any {@link IOException} raised inside the method is rethrown as an
 * {@link IllegalArgumentException}.
 * NOTE(review): if {@code NoSuchFileException} is the java.nio.file type (an IOException
 * subclass), the "not found" cases are wrapped the same way; confirm against the imports.
 *
 * @param fileName file or directory to read; passed through the S3N credential/scrub helpers
 * @param conf     Hadoop configuration used for the file system and the reader options
 * @return an ORC reader for the resolved file
 */
static Reader getReader( String fileName, Configuration conf ) {
  try {
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( fileName, conf );
    Path target = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( fileName ) );
    FileSystem fileSystem = FileSystem.get( target.toUri(), conf );

    if ( !fileSystem.exists( target ) ) {
      throw new NoSuchFileException( fileName );
    }

    if ( fileSystem.getFileStatus( target ).isDirectory() ) {
      // Only consider entries named *.orc and take the first one listed.
      PathFilter orcOnly = candidate -> candidate.getName().endsWith( ".orc" );
      FileStatus[] orcFiles = fileSystem.listStatus( target, orcOnly );
      if ( orcFiles.length == 0 ) {
        throw new NoSuchFileException( fileName );
      }
      target = orcFiles[ 0 ].getPath();
    }

    return OrcFile.createReader( target, OrcFile.readerOptions( conf ).filesystem( fileSystem ) );
  } catch ( IOException e ) {
    throw new IllegalArgumentException( "Unable to read data from file " + fileName, e );
  }
}

Source File: OrcFileAppender.java From iceberg with Apache License 2.0

6 votes

/**
 * Creates an appender that writes rows of {@code schema} to {@code file} as ORC.
 *
 * @param schema           schema of the rows; converted to an ORC schema
 * @param file             destination; when it is a {@code HadoopOutputFile} its file system
 *                         is set on the writer options
 * @param createWriterFunc factory for the value writer, given the ORC schema
 * @param conf             Hadoop configuration for the writer options
 * @param metadata         user metadata passed to {@code newOrcWriter}
 * @param batchSize        size used for the row batch
 */
OrcFileAppender(Schema schema, OutputFile file,
                Function<TypeDescription, OrcValueWriter<?>> createWriterFunc,
                Configuration conf, Map<String, byte[]> metadata,
                int batchSize) {
  this.conf = conf;
  this.file = file;
  this.batchSize = batchSize;
  this.schema = schema;

  final TypeDescription fileSchema = ORCSchemaUtil.convert(schema);
  this.batch = fileSchema.createRowBatch(batchSize);

  // Writer options use UTC timestamps.
  final OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf).useUTCTimestamp(true);
  if (file instanceof HadoopOutputFile) {
    final HadoopOutputFile hadoopFile = (HadoopOutputFile) file;
    writerOptions.fileSystem(hadoopFile.getFileSystem());
  }
  writerOptions.setSchema(fileSchema);

  this.writer = newOrcWriter(file, writerOptions, metadata);
  this.valueWriter = newOrcValueWriter(fileSchema, createWriterFunc);
}

Source File: OrcCompactionTaskTest.java From incubator-gobblin with Apache License 2.0

5 votes

/**
 * Writes the given ORC structs to a new ORC file at {@code path}.
 *
 * @param path       destination of the ORC file
 * @param schema     schema set on the writer options
 * @param orcStructs records to write, in order
 * @throws Exception if creating the writer, writing a record, or closing fails
 */
public void writeOrcRecordsInFile(Path path, TypeDescription schema, List<OrcStruct> orcStructs) throws Exception {
  Configuration hadoopConf = new Configuration();
  Writer orcWriter = OrcFile.createWriter(path, OrcFile.writerOptions(hadoopConf).setSchema(schema));
  OrcMapreduceRecordWriter recordWriter = new OrcMapreduceRecordWriter(orcWriter);

  // Every record is written with a NullWritable key.
  for (OrcStruct struct : orcStructs) {
    recordWriter.write(NullWritable.get(), struct);
  }

  TaskAttemptContextImpl closeContext = new TaskAttemptContextImpl(hadoopConf, new TaskAttemptID());
  recordWriter.close(closeContext);
}

Source File: JsonORCFileReaderWriterFactory.java From secor with Apache License 2.0

5 votes

/**
 * Opens the ORC file referenced by {@code logFilePath} and reads its first row batch.
 *
 * @param logFilePath log file location, topic and starting offset
 * @param codec       not used by this constructor
 * @throws IOException if the file cannot be opened or the first batch cannot be read
 */
@SuppressWarnings("deprecation")
public JsonORCFileReader(LogFilePath logFilePath, CompressionCodec codec)
        throws IOException {
    schema = schemaProvider.getSchema(logFilePath.getTopic(),
            logFilePath);

    Path orcPath = new Path(logFilePath.getLogFilePath());
    Configuration hadoopConf = new Configuration(true);
    Reader orcReader = OrcFile.createReader(orcPath, OrcFile.readerOptions(hadoopConf));

    offset = logFilePath.getOffset();
    rows = orcReader.rows();
    // The batch is created from the schema stored in the file, then filled with the first rows.
    batch = orcReader.getSchema().createRowBatch();
    rows.nextBatch(batch);
}

Source File: OrcKeyCompactorOutputFormat.java From incubator-gobblin with Apache License 2.0

5 votes

/**
 * Creates the ORC record writer with a configurable file extension. The super method
 * hard-codes ".orc", so the extension is read from {@code COMPACTION_OUTPUT_EXTENSION}
 * in the task configuration, defaulting to "orc".
 *
 * @param taskAttemptContext the source of configuration that determines the file extension
 * @return the {@link RecordWriter} that writes out ORC objects
 * @throws IOException if the work file or the ORC writer cannot be created
 */
@Override
public RecordWriter getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException {
  final Configuration jobConf = taskAttemptContext.getConfiguration();
  final String suffix = "." + jobConf.get(COMPACTION_OUTPUT_EXTENSION, "orc" );
  final Path workFile = getDefaultWorkFile(taskAttemptContext, suffix);

  final OrcFile.WriterOptions writerOptions = org.apache.orc.mapred.OrcOutputFormat.buildOptions(jobConf);
  return new OrcMapreduceRecordWriter(OrcFile.createWriter(workFile, writerOptions));
}

Source File: OrcNoHiveShim.java From flink with Apache License 2.0

5 votes

/**
 * Opens the ORC file at {@code path} and creates a row reader restricted to the stripes
 * that start inside the given split and to the selected fields.
 *
 * @param conf               Hadoop configuration for the reader and the ORC options
 * @param schema             read schema
 * @param selectedFields     indexes used to compute the projection mask
 * @param conjunctPredicates not applied by this implementation (see TODO below)
 * @param path               file to read
 * @param splitStart         start offset of the split
 * @param splitLength        length of the split
 * @return the ORC row reader
 * @throws IOException if the file cannot be opened
 */
@Override
public RecordReader createRecordReader(
		Configuration conf,
		TypeDescription schema,
		int[] selectedFields,
		List<OrcSplitReader.Predicate> conjunctPredicates,
		org.apache.flink.core.fs.Path path,
		long splitStart,
		long splitLength) throws IOException {
	// open ORC file and create reader
	org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(path.toUri());
	Reader fileReader = OrcFile.createReader(hadoopPath, OrcFile.readerOptions(conf));

	// get offset and length for the stripes that start in the split
	Tuple2<Long, Long> stripeRange = getOffsetAndLengthForSplit(
			splitStart, splitLength, fileReader.getStripes());

	// create ORC row reader configuration
	Reader.Options rowOptions = new Reader.Options();
	rowOptions.schema(schema);
	rowOptions.range(stripeRange.f0, stripeRange.f1);
	rowOptions.useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf));
	rowOptions.skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf));
	rowOptions.tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

	// TODO configure filters

	// configure selected fields
	rowOptions.include(computeProjectionMask(schema, selectedFields));

	// create ORC row reader
	RecordReader rowReader = fileReader.rows(rowOptions);

	// assign ids
	schema.getId();

	return rowReader;
}

Source File: OrcBulkWriterFactory.java From flink with Apache License 2.0

5 votes

/**
 * Returns the ORC writer options, creating and caching them on first use.
 *
 * <p>On first call a Hadoop configuration is populated from {@code confMap}, the options
 * are built from {@code writerProperties} and that configuration, and the vectorizer's
 * schema is set on them.
 *
 * @return the cached writer options
 */
private OrcFile.WriterOptions getWriterOptions() {
	if (null != writerOptions) {
		return writerOptions;
	}

	Configuration hadoopConf = new Configuration();
	confMap.forEach(hadoopConf::set);

	writerOptions = OrcFile.writerOptions(writerProperties, hadoopConf);
	writerOptions.setSchema(this.vectorizer.getSchema());
	return writerOptions;
}

Source File: OrcBulkWriterFactory.java From flink with Apache License 2.0

5 votes

/**
 * Creates a bulk writer that emits ORC data to the given stream.
 *
 * <p>A {@code PhysicalWriterImpl} wrapping {@code out} is installed on the writer options,
 * which are then used to build the ORC {@code WriterImpl} with {@code FIXED_PATH}.
 *
 * @param out stream the ORC data is written to
 * @return a bulk writer backed by the vectorizer and the ORC writer
 * @throws IOException if the physical or ORC writer cannot be created
 */
@Override
public BulkWriter<T> create(FSDataOutputStream out) throws IOException {
	final OrcFile.WriterOptions writerOpts = getWriterOptions();
	final PhysicalWriterImpl physicalWriter = new PhysicalWriterImpl(out, writerOpts);
	writerOpts.physicalWriter(physicalWriter);

	final WriterImpl orcWriter = new WriterImpl(null, FIXED_PATH, writerOpts);
	return new OrcBulkWriter<>(vectorizer, orcWriter);
}

Source File: OrcFileAppender.java From iceberg with Apache License 2.0

5 votes

/**
 * Creates an ORC writer at the file's location and attaches the given user metadata.
 *
 * @param file     output file whose location becomes the ORC path
 * @param options  ORC writer options
 * @param metadata user metadata entries; each value is wrapped in a {@link ByteBuffer}
 * @return the new writer with all metadata entries added
 * @throws RuntimeIOException wrapping the {@link IOException} if the writer cannot be created
 */
private static Writer newOrcWriter(OutputFile file,
                                   OrcFile.WriterOptions options, Map<String, byte[]> metadata) {
  final Path locPath = new Path(file.location());

  final Writer orcWriter;
  try {
    orcWriter = OrcFile.createWriter(locPath, options);
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Can't create file " + locPath);
  }

  for (Map.Entry<String, byte[]> attribute : metadata.entrySet()) {
    orcWriter.addUserMetadata(attribute.getKey(), ByteBuffer.wrap(attribute.getValue()));
  }
  return orcWriter;
}

Source File: AvroToOrcRecordConverter.java From datacollector with Apache License 2.0

5 votes

/**
 * Creates an ORC writer for the given output file and schema.
 *
 * @param orcWriterProperties writer properties passed to the ORC writer options
 * @param configuration       Hadoop configuration passed to the ORC writer options
 * @param orcOutputFile       destination path; logged at debug level
 * @param orcSchema           schema set on the writer options
 * @return the new ORC writer
 * @throws IOException if the writer cannot be created
 */
public static Writer createOrcWriter(Properties orcWriterProperties, Configuration configuration, Path orcOutputFile, TypeDescription orcSchema) throws IOException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("Creating ORC writer at: {}", orcOutputFile.toString());
  }

  final OrcFile.WriterOptions writerOptions =
      OrcFile.writerOptions(orcWriterProperties, configuration).setSchema(orcSchema);
  return OrcFile.createWriter(orcOutputFile, writerOptions);
}

Source File: OrcToSdcRecordConverter.java From datacollector with Apache License 2.0

5 votes

/**
 * Opens the ORC file at {@code orcFilePath} with a default configuration and prepares a
 * row reader and a row batch based on the file's schema.
 *
 * @param orcFilePath path of the ORC file to convert
 * @throws IOException if the file cannot be opened
 */
public OrcToSdcRecordConverter(Path orcFilePath) throws IOException {
  final OrcFile.ReaderOptions fileReaderOptions = OrcFile.readerOptions(new Configuration());
  this.orcFilePath = orcFilePath;
  reader = OrcFile.createReader(orcFilePath, fileReaderOptions);

  // TODO: support various parameters to Reader.Options via options passed into constructor?
  // final Reader.Options rowReaderOptions = new Reader.Options();

  // for now, just use default options
  rows = reader.rows();
  readerBatch = reader.getSchema().createRowBatch();
}

Source File: SqlInterpreterTest.java From zeppelin with Apache License 2.0

5 votes

/**
 * Writes {@code values} into a snappy-compressed ORC file with schema
 * {@code struct<msg:int>} and returns that file.
 *
 * <p>A temp file is created to obtain a name and deleted again before the ORC writer is
 * opened at the same path.
 *
 * @param values values for the single {@code msg} column, one per row
 * @return the written ORC file
 * @throws IOException if the temp file or the ORC file cannot be written
 */
public File createORCFile(int[] values) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".orc");
  file.delete();

  Configuration hadoopConf = new Configuration();
  hadoopConf.set("orc.compress", "snappy");
  TypeDescription schema = TypeDescription.fromString("struct<msg:int>");
  Writer orcWriter = OrcFile.createWriter(
          new Path(file.getAbsolutePath()),
          OrcFile.writerOptions(hadoopConf).setSchema(schema));

  VectorizedRowBatch batch = schema.createRowBatch();
  LongColumnVector msgColumn = (LongColumnVector) batch.cols[0];
  for (int value : values) {
    msgColumn.vector[batch.size++] = value;
    // If the batch is full, write it out and start over.
    if (batch.size == batch.getMaxSize()) {
      orcWriter.addRowBatch(batch);
      batch.reset();
    }
  }

  // Write out the remaining rows of a partially filled batch.
  if (batch.size != 0) {
    orcWriter.addRowBatch(batch);
    batch.reset();
  }
  orcWriter.close();
  return file;
}

Source File: OrcWriter.java From osm2orc with ISC License

5 votes

/**
 * Creates the ORC entity processor, backed by a new ORC writer at {@code filename} and a
 * row batch created from {@code SCHEMA}.
 *
 * @param metaData not used by this implementation
 * @throws OsmosisRuntimeException wrapping the {@link IOException} if the writer cannot be created
 */
@Override
public void initialize(Map<String, Object> metaData) {
    try {
        Configuration hadoopConf = new Configuration();
        // conf.set(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(), "tags");
        Path target = new Path(filename);
        OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(hadoopConf).setSchema(SCHEMA);
        processor = new OrcEntityProcessor(
                OrcFile.createWriter(target, writerOptions),
                SCHEMA.createRowBatch());
    } catch (IOException e) {
        throw new OsmosisRuntimeException(e);
    }
}

Source File: DremioORCRecordUtils.java From dremio-oss with Apache License 2.0

5 votes

/**
 * Tells whether bloom filters for a column of the given category, written by the given
 * writer version, are treated as bad.
 *
 * <ul>
 *   <li>STRING, CHAR, VARCHAR: bad unless the version includes HIVE_12055</li>
 *   <li>DECIMAL: always bad</li>
 *   <li>TIMESTAMP: bad unless the version includes ORC_135</li>
 *   <li>anything else: not bad</li>
 * </ul>
 *
 * @param category column type category
 * @param version  writer version of the file
 * @return {@code true} if the bloom filters are treated as bad
 */
static boolean hadBadBloomFilters(TypeDescription.Category category,
                                  OrcFile.WriterVersion version) {
  final boolean bad;
  switch(category) {
    case STRING:
    case CHAR:
    case VARCHAR:
      bad = !version.includes(OrcFile.WriterVersion.HIVE_12055);
      break;
    case DECIMAL:
      bad = true;
      break;
    case TIMESTAMP:
      bad = !version.includes(OrcFile.WriterVersion.ORC_135);
      break;
    default:
      bad = false;
      break;
  }
  return bad;
}

Source File: OrcNoHiveBulkWriterFactory.java From flink with Apache License 2.0

5 votes

/**
 * Creates a bulk writer that converts {@code RowData} elements into an ORC row batch and
 * writes them to the given stream.
 *
 * <p>Rows are buffered in a single batch; the batch is written when it reaches its maximum
 * size, on {@code flush()}, and on {@code finish()}, which also closes the ORC writer.
 *
 * @param out stream the ORC data is written to
 * @return the bulk writer
 * @throws IOException if the physical or ORC writer cannot be created
 */
@Override
public BulkWriter<RowData> create(FSDataOutputStream out) throws IOException {
	OrcFile.WriterOptions writerOpts = OrcFile.writerOptions(new Properties(), conf);
	TypeDescription orcType = TypeDescription.fromString(schema);
	writerOpts.setSchema(orcType);
	writerOpts.physicalWriter(new PhysicalWriterImpl(out, writerOpts));
	WriterImpl orcWriter = new WriterImpl(null, new Path("."), writerOpts);

	VectorizedRowBatch pending = orcType.createRowBatch();
	return new BulkWriter<RowData>() {
		@Override
		public void addElement(RowData row) throws IOException {
			// Claim the next free slot in the batch, then fill one column per field.
			int slot = pending.size++;
			for (int field = 0; field < row.getArity(); ++field) {
				setColumn(slot, pending.cols[field], fieldTypes[field], row, field);
			}
			// A full batch is written out immediately.
			if (pending.size == pending.getMaxSize()) {
				orcWriter.addRowBatch(pending);
				pending.reset();
			}
		}

		@Override
		public void flush() throws IOException {
			if (pending.size != 0) {
				orcWriter.addRowBatch(pending);
				pending.reset();
			}
		}

		@Override
		public void finish() throws IOException {
			flush();
			orcWriter.close();
		}
	};
}

Source File: ORC.java From iceberg with Apache License 2.0

5 votes

/**
 * Opens an ORC reader for the file at {@code location}.
 *
 * @param location      location of the ORC file
 * @param readerOptions options passed to the ORC reader
 * @return the ORC reader
 * @throws RuntimeIOException wrapping the {@link IOException} if the file cannot be opened
 */
static Reader newFileReader(String location, ReaderOptions readerOptions) {
  final Path orcPath = new Path(location);
  try {
    return OrcFile.createReader(orcPath, readerOptions);
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to open file: %s", location);
  }
}

Source File: ORC.java From iceberg with Apache License 2.0

5 votes

/**
 * Opens an ORC reader for {@code file} with UTC timestamps enabled.
 *
 * <p>When the file is a {@code HadoopInputFile}, its file system is set on the reader options.
 *
 * @param file   input file to read
 * @param config Hadoop configuration for the reader options
 * @return the ORC reader
 */
static Reader newFileReader(InputFile file, Configuration config) {
  final ReaderOptions options = OrcFile.readerOptions(config).useUTCTimestamp(true);
  if (file instanceof HadoopInputFile) {
    final HadoopInputFile hadoopFile = (HadoopInputFile) file;
    options.filesystem(hadoopFile.getFileSystem());
  }
  return newFileReader(file.location(), options);
}

Source File: TestOrcMetadata.java From rainbow with Apache License 2.0

5 votes

/**
 * Prints metadata of the ORC files found under a fixed HDFS directory: file listing,
 * footer details of the first file, content/raw sizes of every file, and the field names
 * of the first file's schema.
 */
@Test
public void test () throws IOException, Descriptors.DescriptorValidationException
{
    Configuration conf = new Configuration();
    System.setProperty("hadoop.home.dir", "/");
    FileSystem hdfs = FileSystem.get(URI.create("hdfs://presto00:9000"), conf);
    Path orcDir = new Path("/rainbow2/orc_new_compress");
    System.out.println(hdfs.isFile(orcDir));

    // List the directory and print each entry with its length.
    FileStatus[] listing = hdfs.listStatus(orcDir);
    System.out.println(listing.length);
    for (FileStatus entry : listing)
    {
        entry.getPath();
        System.out.println(entry.getPath() + ", " + entry.getLen());
    }

    // Footer and size details of the first listed file.
    Reader firstReader = OrcFile.createReader(listing[0].getPath(),
            OrcFile.readerOptions(conf));
    System.out.println("file length:" + firstReader.getFileTail().getFileLength());
    List<String> sizedColumns = new ArrayList<>();
    sizedColumns.add("samplepercent");
    System.out.println(firstReader.getRawDataSizeOfColumns(sizedColumns));
    System.out.println(firstReader.getFileTail().getFooter().getTypes(0).getFieldNames(0));
    System.out.println(firstReader.getTypes().get(0).getSerializedSize());

    // Open every file and print its content and raw data sizes.
    List<Reader> openedReaders = new ArrayList<>();
    for (FileStatus entry : listing)
    {
        Reader entryReader = OrcFile.createReader(entry.getPath(),
                OrcFile.readerOptions(conf));
        openedReaders.add(entryReader);
        System.out.println("content size: " + entryReader.getContentLength() + ", raw size: "
        + entryReader.getRawDataSize());
    }

    for (String fieldName : firstReader.getSchema().getFieldNames())
    {
        System.out.println(fieldName);
    }
}

Source File: TestMetricsRowGroupFilter.java From iceberg with Apache License 2.0

5 votes

/**
 * Writes the generated test records to {@code orcFile} and asserts the result has exactly
 * one stripe.
 *
 * <p>One record is written per id from {@code INT_MIN_VALUE} to {@code INT_MAX_VALUE}
 * inclusive. An existing file is deleted first, and the file is registered for deletion on
 * JVM exit.
 *
 * @throws IOException if writing or re-reading the file fails
 */
public void createOrcInputFile() throws IOException {
  if (orcFile.exists()) {
    Assert.assertTrue(orcFile.delete());
  }

  OutputFile target = Files.localOutput(orcFile);
  try (FileAppender<GenericRecord> orcAppender = ORC.write(target)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    // A single record instance is reused and refilled for every row.
    GenericRecord row = GenericRecord.create(FILE_SCHEMA);
    // create 50 records
    final int rowCount = INT_MAX_VALUE - INT_MIN_VALUE + 1;
    for (int i = 0; i < rowCount; i++) {
      final int id = INT_MIN_VALUE + i;
      row.setField("_id", id); // min=30, max=79, num-nulls=0
      row.setField("_no_stats_parquet", TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats
                                                                     // in Parquet, but will produce stats for ORC
      row.setField("_required", "req"); // required, always non-null
      row.setField("_all_nulls", null); // never non-null
      row.setField("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
      row.setField("_no_nulls", ""); // optional, but always non-null
      row.setField("_str", i + "str" + i);

      GenericRecord nestedStruct = GenericRecord.create(_structFieldType);
      nestedStruct.setField("_int_field", id);
      row.setField("_struct_not_null", nestedStruct); // struct with int

      orcAppender.add(row);
    }
  }

  // Re-open the written file to check its stripe layout.
  InputFile written = Files.localInput(orcFile);
  Path writtenPath = new Path(written.location());
  try (Reader orcReader = OrcFile.createReader(writtenPath, OrcFile.readerOptions(new Configuration()))) {
    int stripeCount = orcReader.getStripes().size();
    Assert.assertEquals("Should create only one stripe", 1, stripeCount);
  }

  orcFile.deleteOnExit();
}

Source File: OrcColumnarRowSplitReaderTest.java From flink with Apache License 2.0

4 votes

/**
 * Writes an ORC file with columns f0..f4 (float, double, timestamp, tinyint, smallint)
 * containing {@code rowSize} rows in a single batch.
 *
 * <p>Row {@code i} holds the value {@code i} in every numeric column and
 * {@code toTimestamp(i)} in the timestamp column; the last row is marked null in every column.
 *
 * @param file    path of the ORC file to create
 * @param rowSize number of rows, including the trailing all-null row
 * @throws IOException if the file cannot be written
 */
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
	// NOTE: orc has field name information, so name should be same as orc
	TypeDescription schema =
			TypeDescription.fromString(
					"struct<" +
							"f0:float," +
							"f1:double," +
							"f2:timestamp," +
							"f3:tinyint," +
							"f4:smallint" +
							">");

	org.apache.hadoop.fs.Path orcPath = new org.apache.hadoop.fs.Path(file);
	Configuration hadoopConf = new Configuration();
	Writer orcWriter = OrcFile.createWriter(orcPath, OrcFile.writerOptions(hadoopConf).setSchema(schema));

	VectorizedRowBatch batch = schema.createRowBatch(rowSize);
	DoubleColumnVector floatCol = (DoubleColumnVector) batch.cols[0];
	DoubleColumnVector doubleCol = (DoubleColumnVector) batch.cols[1];
	TimestampColumnVector timestampCol = (TimestampColumnVector) batch.cols[2];
	LongColumnVector tinyintCol = (LongColumnVector) batch.cols[3];
	LongColumnVector smallintCol = (LongColumnVector) batch.cols[4];

	// Every column carries a null (the last row), so none may claim to be null-free.
	floatCol.noNulls = false;
	doubleCol.noNulls = false;
	timestampCol.noNulls = false;
	tinyintCol.noNulls = false;
	smallintCol.noNulls = false;

	final int lastRow = rowSize - 1;
	for (int row = 0; row < lastRow; row++) {
		floatCol.vector[row] = row;
		doubleCol.vector[row] = row;

		Timestamp ts = toTimestamp(row);
		timestampCol.time[row] = ts.getTime();
		timestampCol.nanos[row] = ts.getNanos();

		tinyintCol.vector[row] = row;
		smallintCol.vector[row] = row;
	}

	// The final row is null in every column.
	floatCol.isNull[lastRow] = true;
	doubleCol.isNull[lastRow] = true;
	timestampCol.isNull[lastRow] = true;
	tinyintCol.isNull[lastRow] = true;
	smallintCol.isNull[lastRow] = true;

	batch.size = rowSize;
	orcWriter.addRowBatch(batch);
	batch.reset();
	orcWriter.close();
}

Source File: OrcRowInputFormat.java From flink with Apache License 2.0

4 votes

/**
 * Opens the ORC file of the given split and prepares the row reader and row batch.
 *
 * <p>The reader is limited to the stripes that start inside the split, gets a search
 * argument built from {@code conjunctPredicates} when there are any, and only includes the
 * projected fields.
 *
 * @param fileSplit split to read
 * @throws IOException if the file cannot be opened
 */
@Override
public void open(FileInputSplit fileSplit) throws IOException {

	LOG.debug("Opening ORC file {}", fileSplit.getPath());

	// open ORC file and create reader
	org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(fileSplit.getPath().getPath());
	Reader fileReader = OrcFile.createReader(hadoopPath, OrcFile.readerOptions(conf));

	// get offset and length for the stripes that start in the split
	Tuple2<Long, Long> stripeRange = getOffsetAndLengthForSplit(fileSplit, getStripes(fileReader));

	// create ORC row reader configuration
	Reader.Options rowOptions = getOptions(fileReader)
		.schema(schema)
		.range(stripeRange.f0, stripeRange.f1)
		.useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
		.skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
		.tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

	// configure filters: all predicates are combined in one AND group
	if (!conjunctPredicates.isEmpty()) {
		SearchArgument.Builder sargBuilder = SearchArgumentFactory.newBuilder().startAnd();
		for (Predicate conjunct : conjunctPredicates) {
			conjunct.add(sargBuilder);
		}
		sargBuilder = sargBuilder.end();
		rowOptions.searchArgument(sargBuilder.build(), new String[]{});
	}

	// configure selected fields
	rowOptions.include(computeProjectionMask());

	// create ORC row reader
	this.orcRowsReader = fileReader.rows(rowOptions);

	// assign ids
	this.schema.getId();
	// create row batch and reset the read position
	this.rowBatch = schema.createRowBatch(batchSize);
	rowsInBatch = 0;
	nextRow = 0;
}

org.apache.orc.OrcFile Java Examples