Java Code Examples for org.apache.orc.OrcFile#createReader()
The following examples show how to use org.apache.orc.OrcFile#createReader().
The source file, project, and license are noted above each example.
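Before the project-specific examples, here is a minimal, self-contained sketch of the pattern most of them share: build OrcFile.ReaderOptions from a Hadoop Configuration, open the file with OrcFile.createReader(), inspect the footer metadata, and scan the rows in vectorized batches. The file path and class name are placeholders, and the try-with-resources on Reader assumes a recent ORC release where Reader is Closeable (as in Examples 1 and 7 below).

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

public class OrcReaderSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/data.orc"); // placeholder path

        // createReader reads the file footer and exposes file-level metadata
        try (Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf))) {
            System.out.println("rows: " + reader.getNumberOfRows());
            System.out.println("schema: " + reader.getSchema());
            System.out.println("compression: " + reader.getCompressionKind());

            // scan the rows in vectorized batches
            VectorizedRowBatch batch = reader.getSchema().createRowBatch();
            RecordReader rows = reader.rows();
            while (rows.nextBatch(batch)) {
                System.out.println("read a batch of " + batch.size + " rows");
            }
            rows.close();
        }
    }
}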
Example 1
Source File: TestMetricsRowGroupFilterTypes.java From iceberg with Apache License 2.0
public void createOrcInputFile(List<Record> records) throws IOException {
    if (ORC_FILE.exists()) {
        Assert.assertTrue(ORC_FILE.delete());
    }

    OutputFile outFile = Files.localOutput(ORC_FILE);
    try (FileAppender<Record> appender = ORC.write(outFile)
            .schema(FILE_SCHEMA)
            .createWriterFunc(GenericOrcWriter::buildWriter)
            .build()) {
        appender.addAll(records);
    }

    InputFile inFile = Files.localInput(ORC_FILE);
    try (Reader reader = OrcFile.createReader(new Path(inFile.location()),
            OrcFile.readerOptions(new Configuration()))) {
        Assert.assertEquals("Should create only one stripe", 1, reader.getStripes().size());
    }

    ORC_FILE.deleteOnExit();
}
Example 2
Source File: ORC.java From iceberg with Apache License 2.0
public OrcIterator build() {
    Preconditions.checkNotNull(schema, "Schema is required");
    try {
        Path path = new Path(file.location());
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));

        ColumnIdMap columnIds = new ColumnIdMap();
        TypeDescription orcSchema = TypeConversion.toOrc(schema, columnIds);

        Reader.Options options = reader.options();
        if (start != null) {
            options.range(start, length);
        }
        options.schema(orcSchema);

        return new OrcIterator(path, orcSchema, reader.rows(options));
    } catch (IOException e) {
        throw new RuntimeException("Can't open " + file.location(), e);
    }
}
Example 3
Source File: OrcFileSystemITCase.java From flink with Apache License 2.0
@Override
public void testNonPartition() {
    super.testNonPartition();

    // test configure success
    File directory = new File(URI.create(resultPath()).getPath());
    File[] files = directory.listFiles((dir, name) ->
            !name.startsWith(".") && !name.startsWith("_"));
    Assert.assertNotNull(files);
    Path path = new Path(URI.create(files[0].getAbsolutePath()));

    try {
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(new Configuration()));
        if (configure) {
            Assert.assertEquals("SNAPPY", reader.getCompressionKind().toString());
        } else {
            Assert.assertEquals("ZLIB", reader.getCompressionKind().toString());
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Example 4
Source File: OrcBulkWriterTestUtil.java From flink with Apache License 2.0
public static void validate(File files, List<Record> expected) throws IOException {
    final File[] buckets = files.listFiles();
    assertNotNull(buckets);
    assertEquals(1, buckets.length);

    final File[] partFiles = buckets[0].listFiles();
    assertNotNull(partFiles);

    for (File partFile : partFiles) {
        assertTrue(partFile.length() > 0);

        OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(new Configuration());
        Reader reader = OrcFile.createReader(new org.apache.hadoop.fs.Path(partFile.toURI()), readerOptions);

        assertEquals(3, reader.getNumberOfRows());
        assertEquals(2, reader.getSchema().getFieldNames().size());
        assertSame(reader.getCompressionKind(), CompressionKind.LZ4);
        assertTrue(reader.hasMetadataValue(USER_METADATA_KEY));
        assertTrue(reader.getMetadataKeys().contains(USER_METADATA_KEY));

        List<Record> results = getResults(reader);

        assertEquals(3, results.size());
        assertEquals(results, expected);
    }
}
Example 5
Source File: PentahoOrcRecordReader.java From pentaho-hadoop-shims with Apache License 2.0
static Reader getReader( String fileName, Configuration conf ) {
    try {
        S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( fileName, conf );
        Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( fileName ) );
        FileSystem fs = FileSystem.get( filePath.toUri(), conf );
        if ( !fs.exists( filePath ) ) {
            throw new NoSuchFileException( fileName );
        }
        if ( fs.getFileStatus( filePath ).isDirectory() ) {
            PathFilter pathFilter = file -> file.getName().endsWith( ".orc" );
            FileStatus[] fileStatuses = fs.listStatus( filePath, pathFilter );
            if ( fileStatuses.length == 0 ) {
                throw new NoSuchFileException( fileName );
            }
            filePath = fileStatuses[ 0 ].getPath();
        }
        return OrcFile.createReader( filePath, OrcFile.readerOptions( conf ).filesystem( fs ) );
    } catch ( IOException e ) {
        throw new IllegalArgumentException( "Unable to read data from file " + fileName, e );
    }
}
Example 6
Source File: ORC.java From iceberg with Apache License 2.0
static Reader newFileReader(String location, ReaderOptions readerOptions) {
    try {
        return OrcFile.createReader(new Path(location), readerOptions);
    } catch (IOException ioe) {
        throw new RuntimeIOException(ioe, "Failed to open file: %s", location);
    }
}
Example 7
Source File: TestMetricsRowGroupFilter.java From iceberg with Apache License 2.0
public void createOrcInputFile() throws IOException {
    if (orcFile.exists()) {
        Assert.assertTrue(orcFile.delete());
    }

    OutputFile outFile = Files.localOutput(orcFile);
    try (FileAppender<GenericRecord> appender = ORC.write(outFile)
            .schema(FILE_SCHEMA)
            .createWriterFunc(GenericOrcWriter::buildWriter)
            .build()) {
        GenericRecord record = GenericRecord.create(FILE_SCHEMA);
        // create 50 records
        for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) {
            record.setField("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0
            record.setField("_no_stats_parquet", TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats in Parquet, but will produce stats for ORC
            record.setField("_required", "req"); // required, always non-null
            record.setField("_all_nulls", null); // never non-null
            record.setField("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
            record.setField("_no_nulls", ""); // optional, but always non-null
            record.setField("_str", i + "str" + i);

            GenericRecord structNotNull = GenericRecord.create(_structFieldType);
            structNotNull.setField("_int_field", INT_MIN_VALUE + i);
            record.setField("_struct_not_null", structNotNull); // struct with int

            appender.add(record);
        }
    }

    InputFile inFile = Files.localInput(orcFile);
    try (Reader reader = OrcFile.createReader(new Path(inFile.location()),
            OrcFile.readerOptions(new Configuration()))) {
        Assert.assertEquals("Should create only one stripe", 1, reader.getStripes().size());
    }

    orcFile.deleteOnExit();
}
Example 8
Source File: TestOrcMetadata.java From rainbow with Apache License 2.0
@Test
public void test() throws IOException, Descriptors.DescriptorValidationException {
    Configuration conf = new Configuration();
    System.setProperty("hadoop.home.dir", "/");
    FileSystem fileSystem = FileSystem.get(URI.create("hdfs://presto00:9000"), conf);
    Path hdfsDirPath = new Path("/rainbow2/orc_new_compress");
    System.out.println(fileSystem.isFile(hdfsDirPath));

    FileStatus[] fileStatuses = fileSystem.listStatus(hdfsDirPath);
    System.out.println(fileStatuses.length);
    for (FileStatus status : fileStatuses) {
        status.getPath();
        System.out.println(status.getPath() + ", " + status.getLen());
    }

    Reader reader = OrcFile.createReader(fileStatuses[0].getPath(), OrcFile.readerOptions(conf));
    System.out.println("file length:" + reader.getFileTail().getFileLength());

    List<String> columnNames = new ArrayList<>();
    columnNames.add("samplepercent");
    System.out.println(reader.getRawDataSizeOfColumns(columnNames));
    System.out.println(reader.getFileTail().getFooter().getTypes(0).getFieldNames(0));
    System.out.println(reader.getTypes().get(0).getSerializedSize());

    List<Reader> readers = new ArrayList<>();
    for (FileStatus fileStatus : fileStatuses) {
        Reader reader1 = OrcFile.createReader(fileStatus.getPath(), OrcFile.readerOptions(conf));
        readers.add(reader1);
        System.out.println("content size: " + reader1.getContentLength() + ", raw size: " + reader1.getRawDataSize());
    }

    for (String columnName : reader.getSchema().getFieldNames()) {
        System.out.println(columnName);
    }
}
Example 9
Source File: OrcToSdcRecordConverter.java From datacollector with Apache License 2.0
public OrcToSdcRecordConverter(Path orcFilePath) throws IOException {
    final Configuration readerConf = new Configuration();
    final OrcFile.ReaderOptions fileReaderOptions = OrcFile.readerOptions(readerConf);
    this.orcFilePath = orcFilePath;
    reader = OrcFile.createReader(this.orcFilePath, fileReaderOptions);

    // TODO: support various parameters to Reader.Options via options passed into constructor?
    // final Reader.Options rowReaderOptions = new Reader.Options();

    // for now, just use default options
    rows = reader.rows();
    readerBatch = reader.getSchema().createRowBatch();
}
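The constructor above only prepares the row iterator and an empty batch. As an illustration of how a reader opened this way is typically drained afterwards (this helper is not part of the datacollector source), a batch-counting loop could look like this:

// Hypothetical helper, not from OrcToSdcRecordConverter: drains a Reader batch by batch.
static long countRows(org.apache.orc.Reader reader) throws java.io.IOException {
    org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch batch =
            reader.getSchema().createRowBatch();
    org.apache.orc.RecordReader rows = reader.rows();
    long count = 0;
    while (rows.nextBatch(batch)) {
        count += batch.size; // batch.size is the number of valid rows in the current batch
    }
    rows.close();
    return count;
}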
Example 10
Source File: OrcNoHiveShim.java From flink with Apache License 2.0
@Override
public RecordReader createRecordReader(
        Configuration conf,
        TypeDescription schema,
        int[] selectedFields,
        List<OrcSplitReader.Predicate> conjunctPredicates,
        org.apache.flink.core.fs.Path path,
        long splitStart,
        long splitLength) throws IOException {
    // open ORC file and create reader
    org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(path.toUri());
    Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

    // get offset and length for the stripes that start in the split
    Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(
            splitStart, splitLength, orcReader.getStripes());

    // create ORC row reader configuration
    Reader.Options options = new Reader.Options()
            .schema(schema)
            .range(offsetAndLength.f0, offsetAndLength.f1)
            .useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
            .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
            .tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

    // TODO configure filters

    // configure selected fields
    options.include(computeProjectionMask(schema, selectedFields));

    // create ORC row reader
    RecordReader orcRowsReader = orcReader.rows(options);

    // assign ids
    schema.getId();

    return orcRowsReader;
}
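Example 10 leaves filter configuration as a TODO. Examples 12 and 13 below show how the Flink row input format pushes predicates down via an ORC SearchArgument; the following is a condensed sketch of that pattern, with the helper name and placement being illustrative rather than part of OrcNoHiveShim:

import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;

// Illustrative helper: builds a conjunctive SearchArgument from Flink's predicate list
// and attaches it to the reader options, mirroring Examples 12 and 13.
static void addFilters(Reader.Options options, List<OrcSplitReader.Predicate> conjunctPredicates) {
    if (conjunctPredicates.isEmpty()) {
        return;
    }
    SearchArgument.Builder builder = SearchArgumentFactory.newBuilder().startAnd();
    for (OrcSplitReader.Predicate predicate : conjunctPredicates) {
        predicate.add(builder);
    }
    options.searchArgument(builder.end().build(), new String[]{});
}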
Example 11
Source File: JsonORCFileReaderWriterFactory.java From secor with Apache License 2.0
@SuppressWarnings("deprecation") public JsonORCFileReader(LogFilePath logFilePath, CompressionCodec codec) throws IOException { schema = schemaProvider.getSchema(logFilePath.getTopic(), logFilePath); Path path = new Path(logFilePath.getLogFilePath()); Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(new Configuration(true))); offset = logFilePath.getOffset(); rows = reader.rows(); batch = reader.getSchema().createRowBatch(); rows.nextBatch(batch); }
Example 12
Source File: OrcRowInputFormat.java From Flink-CEPplus with Apache License 2.0
@Override
public void open(FileInputSplit fileSplit) throws IOException {
    LOG.debug("Opening ORC file {}", fileSplit.getPath());

    // open ORC file and create reader
    org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(fileSplit.getPath().getPath());
    Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

    // get offset and length for the stripes that start in the split
    Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(fileSplit, getStripes(orcReader));

    // create ORC row reader configuration
    Reader.Options options = getOptions(orcReader)
            .schema(schema)
            .range(offsetAndLength.f0, offsetAndLength.f1)
            .useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
            .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
            .tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

    // configure filters
    if (!conjunctPredicates.isEmpty()) {
        SearchArgument.Builder b = SearchArgumentFactory.newBuilder();
        b = b.startAnd();
        for (Predicate predicate : conjunctPredicates) {
            predicate.add(b);
        }
        b = b.end();
        options.searchArgument(b.build(), new String[]{});
    }

    // configure selected fields
    options.include(computeProjectionMask());

    // create ORC row reader
    this.orcRowsReader = orcReader.rows(options);

    // assign ids
    this.schema.getId();

    // create row batch
    this.rowBatch = schema.createRowBatch(batchSize);
    rowsInBatch = 0;
    nextRow = 0;
}
Example 13
Source File: OrcRowInputFormat.java From flink with Apache License 2.0
@Override
public void open(FileInputSplit fileSplit) throws IOException {
    LOG.debug("Opening ORC file {}", fileSplit.getPath());

    // open ORC file and create reader
    org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(fileSplit.getPath().getPath());
    Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

    // get offset and length for the stripes that start in the split
    Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(fileSplit, getStripes(orcReader));

    // create ORC row reader configuration
    Reader.Options options = getOptions(orcReader)
            .schema(schema)
            .range(offsetAndLength.f0, offsetAndLength.f1)
            .useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
            .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
            .tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

    // configure filters
    if (!conjunctPredicates.isEmpty()) {
        SearchArgument.Builder b = SearchArgumentFactory.newBuilder();
        b = b.startAnd();
        for (Predicate predicate : conjunctPredicates) {
            predicate.add(b);
        }
        b = b.end();
        options.searchArgument(b.build(), new String[]{});
    }

    // configure selected fields
    options.include(computeProjectionMask());

    // create ORC row reader
    this.orcRowsReader = orcReader.rows(options);

    // assign ids
    this.schema.getId();

    // create row batch
    this.rowBatch = schema.createRowBatch(batchSize);
    rowsInBatch = 0;
    nextRow = 0;
}
Example 14
Source File: OrcShimV230.java From flink with Apache License 2.0
@Override
protected Reader createReader(Path path, Configuration conf) throws IOException {
    return OrcFile.createReader(path, OrcFile.readerOptions(conf));
}
Example 15
Source File: OrcUtils.java From incubator-gobblin with Apache License 2.0
public static Reader getRecordReaderFromFile(Configuration conf, Path orcFilePath) throws IOException {
    return OrcFile.createReader(orcFilePath, new OrcFile.ReaderOptions(conf));
}