org.apache.orc.OrcProto Java Examples
The following examples show how to use
org.apache.orc.OrcProto.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 6 votes |
@Override void startStripe(Map<org.apache.orc.impl.StreamName, InStream> streams, OrcProto.StripeFooter stripeFooter ) throws IOException { // For each stripe, checks the encoding and initializes the appropriate // reader switch (stripeFooter.getColumnsList().get(columnId).getKind()) { case DIRECT: case DIRECT_V2: reader = new StringDirectTreeReader(columnId); break; case DICTIONARY: case DICTIONARY_V2: reader = new StringDictionaryTreeReader(columnId); break; default: throw new IllegalArgumentException("Unsupported encoding " + stripeFooter.getColumnsList().get(columnId).getKind()); } reader.startStripe(streams, stripeFooter); }
Example #2
Source File: DremioORCRecordUtils.java From dremio-oss with Apache License 2.0 | 6 votes |
/**
 * This function is a copy of original implementation from hive-private repository
 */
@Override
public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe) throws IOException {
  // Lazily open the underlying file on first use.
  if (file == null) {
    open();
  }
  // The stripe footer sits directly after the index and data sections.
  long offset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
  int tailLength = (int) stripe.getFooterLength();
  // read the footer
  ByteBuffer tailBuf = ByteBuffer.allocate(tailLength);
  file.readFully(offset, tailBuf.array(), tailBuf.arrayOffset(), tailLength);
  // Parse the protobuf footer through an InStream that handles (optional)
  // codec decompression of the raw bytes.
  return OrcProto.StripeFooter.parseFrom(InStream.createCodedInputStream("footer",
      Lists.<DiskRange>newArrayList(new BufferChunk(tailBuf, 0)), tailLength, codec, bufferSize));
}
Example #3
Source File: PhysicalWriterImpl.java From flink with Apache License 2.0 | 6 votes |
@Override public void appendRawStripe(ByteBuffer buffer, OrcProto.StripeInformation.Builder dirEntry) throws IOException { long start = out.getPos(); int length = buffer.remaining(); long availBlockSpace = blockSize - (start % blockSize); // see if stripe can fit in the current hdfs block, else pad the remaining // space in the block if (length < blockSize && length > availBlockSpace && addBlockPadding) { byte[] pad = new byte[(int) Math.min(HDFS_BUFFER_SIZE, availBlockSpace)]; LOG.info(String.format("Padding ORC by %d bytes while merging..", availBlockSpace)); start += availBlockSpace; while (availBlockSpace > 0) { int writeLen = (int) Math.min(availBlockSpace, pad.length); out.write(pad, 0, writeLen); availBlockSpace -= writeLen; } } out.write(buffer.array(), buffer.arrayOffset() + buffer.position(), length); dirEntry.setOffset(start); }
Example #4
Source File: PhysicalWriterImpl.java From flink with Apache License 2.0 | 6 votes |
@Override public long writePostScript(OrcProto.PostScript.Builder builder) throws IOException { builder.setFooterLength(footerLength); builder.setMetadataLength(metadataLength); OrcProto.PostScript ps = builder.build(); // need to write this uncompressed long startPosition = out.getPos(); ps.writeTo(out); long length = out.getPos() - startPosition; if (length > 255) { throw new IllegalArgumentException("PostScript too large at " + length); } out.write((int) length); return out.getPos(); }
Example #5
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 6 votes |
/**
 * Reader for timestamp columns; wires up the seconds (DATA) and nanoseconds
 * streams and captures reader/writer time-zone state.
 */
protected TimestampTreeReader(TimeZone timeZone, int columnId, InStream presentStream,
    InStream dataStream, InStream nanosStream, OrcProto.ColumnEncoding encoding,
    boolean skipCorrupt) throws IOException {
  super(columnId, presentStream);
  this.skipCorrupt = skipCorrupt;
  // Presumably caches base timestamps per writer time-zone id — see
  // getBaseTimestamp below; TODO confirm key semantics.
  this.baseTimestampMap = new HashMap<>();
  this.readerTimeZone = timeZone;
  // Writer zone starts as the JVM default until stripe metadata overrides it.
  this.writerTimeZone = TimeZone.getDefault();
  this.hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
  this.base_timestamp = getBaseTimestamp(readerTimeZone.getID());
  if (encoding != null) {
    checkEncoding(encoding);
    if (dataStream != null) {
      // NOTE(review): true/false flags presumably select signed vs. unsigned
      // decoding — confirm against createIntegerReader.
      this.data = createIntegerReader(encoding.getKind(), dataStream, true, skipCorrupt);
    }
    if (nanosStream != null) {
      this.nanos = createIntegerReader(encoding.getKind(), nanosStream, false, skipCorrupt);
    }
  }
}
Example #6
Source File: RecordReaderUtils.java From tajo with Apache License 2.0 | 6 votes |
/**
 * Adds one disk range per selected row group of the given stream to the
 * range list, optionally merging adjacent ranges.
 */
public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
    boolean[] includedRowGroups, boolean isCompressed, OrcProto.RowIndex index,
    OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize,
    boolean hasNull, long offset, long length, DiskRangeList.CreateHelper list,
    boolean doMergeBuffers) {
  for (int group = 0; group < includedRowGroups.length; ++group) {
    if (!includedRowGroups[group]) continue;
    // Which slot of a row-index entry holds this stream's position depends on
    // the encoding, type, stream kind, compression and null layout.
    int posn = getIndexPosition(
        encoding.getKind(), type.getKind(), stream.getKind(), isCompressed, hasNull);
    long start = index.getEntry(group).getPositions(posn);
    final long nextGroupOffset;
    boolean isLast = group == (includedRowGroups.length - 1);
    // The last group runs to the end of the stream; otherwise to the start of
    // the next group.
    nextGroupOffset = isLast ? length : index.getEntry(group + 1).getPositions(posn);
    start += offset;
    long end = offset + estimateRgEndOffset(
        isCompressed, isLast, nextGroupOffset, length, compressionSize);
    list.addOrMerge(start, end, doMergeBuffers, true);
  }
}
Example #7
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 6 votes |
/**
 * Dispatching string reader: picks the direct or dictionary implementation
 * based on the column encoding.
 */
protected StringTreeReader(int columnId, InStream present, InStream data, InStream length,
                           InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
  super(columnId, present);
  if (encoding == null) {
    return;
  }
  OrcProto.ColumnEncoding.Kind kind = encoding.getKind();
  switch (kind) {
    case DIRECT:
    case DIRECT_V2:
      reader = new StringDirectTreeReader(columnId, present, data, length, kind);
      break;
    case DICTIONARY:
    case DICTIONARY_V2:
      reader = new StringDictionaryTreeReader(columnId, present, data, length, dictionary, encoding);
      break;
    default:
      throw new IllegalArgumentException("Unsupported encoding " + kind);
  }
}
Example #8
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 6 votes |
/**
 * Dictionary-encoded string reader: decodes the dictionary blob, entry
 * lengths, and the per-row index stream.
 */
protected StringDictionaryTreeReader(int columnId, InStream present, InStream data,
                                     InStream length, InStream dictionary,
                                     OrcProto.ColumnEncoding encoding) throws IOException {
  super(columnId, present);
  scratchlcv = new LongColumnVector();
  // Without an encoding there is nothing further to wire up.
  if (encoding != null) {
    if (data != null) {
      this.reader = createIntegerReader(encoding.getKind(), data, false, false);
    }
    if (dictionary != null) {
      readDictionaryStream(dictionary);
    }
    if (length != null) {
      readDictionaryLengthStream(length, encoding);
    }
  }
}
Example #9
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 6 votes |
@Override void startStripe(Map<StreamName, InStream> streams, OrcProto.StripeFooter stripeFooter ) throws IOException { // For each stripe, checks the encoding and initializes the appropriate // reader switch (stripeFooter.getColumnsList().get(columnId).getKind()) { case DIRECT: case DIRECT_V2: reader = new StringDirectTreeReader(columnId); break; case DICTIONARY: case DICTIONARY_V2: reader = new StringDictionaryTreeReader(columnId); break; default: throw new IllegalArgumentException("Unsupported encoding " + stripeFooter.getColumnsList().get(columnId).getKind()); } reader.startStripe(streams, stripeFooter); }
Example #10
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 6 votes |
/**
 * CHAR column reader: same dispatch as the plain string reader, plus the
 * declared maximum length.
 */
protected CharTreeReader(int columnId, InStream present, InStream data, InStream length,
                         InStream dictionary, OrcProto.ColumnEncoding encoding,
                         int maxLength) throws IOException {
  super(columnId, present);
  this.maxLength = maxLength;
  if (encoding == null) {
    return;
  }
  OrcProto.ColumnEncoding.Kind kind = encoding.getKind();
  switch (kind) {
    case DIRECT:
    case DIRECT_V2:
      reader = new StringDirectTreeReader(columnId, present, data, length, kind);
      break;
    case DICTIONARY:
    case DICTIONARY_V2:
      reader = new StringDictionaryTreeReader(columnId, present, data, length, dictionary, encoding);
      break;
    default:
      throw new IllegalArgumentException("Unsupported encoding " + kind);
  }
}
Example #11
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 6 votes |
/**
 * Reads the dictionary LENGTH stream and converts the per-entry lengths into
 * cumulative offsets stored in {@code dictionaryOffsets}.
 */
private void readDictionaryLengthStream(InStream in, OrcProto.ColumnEncoding encoding) throws IOException {
  int dictionarySize = encoding.getDictionarySize();
  if (in != null) { // Guard against empty LENGTH stream.
    IntegerReader lenReader = createIntegerReader(encoding.getKind(), in, false, false);
    int offset = 0;
    // Reallocate only when the buffer is too small; offsets need size+1 slots.
    if (dictionaryOffsets == null || dictionaryOffsets.length < dictionarySize + 1) {
      dictionaryOffsets = new int[dictionarySize + 1];
    }
    for (int i = 0; i < dictionarySize; ++i) {
      dictionaryOffsets[i] = offset;
      offset += (int) lenReader.next();
    }
    // Trailing sentinel marks the end of the last entry.
    dictionaryOffsets[dictionarySize] = offset;
    in.close();
  }
}
Example #12
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 6 votes |
@Override
void startStripe(Map<org.apache.orc.impl.StreamName, InStream> streams,
                 OrcProto.StripeFooter stripeFooter) throws IOException {
  super.startStripe(streams, stripeFooter);

  // read the dictionary blob
  org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
      OrcProto.Stream.Kind.DICTIONARY_DATA);
  InStream in = streams.get(name);
  readDictionaryStream(in);

  // read the lengths
  name = new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.LENGTH);
  in = streams.get(name);
  readDictionaryLengthStream(in, stripeFooter.getColumnsList().get(columnId));

  // set up the row reader
  name = new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.DATA);
  reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
      streams.get(name), false, false);
}
Example #13
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 5 votes |
protected DateTreeReader(int columnId, InStream present, InStream data, OrcProto.ColumnEncoding encoding) throws IOException { super(columnId, present); if (data != null && encoding != null) { checkEncoding(encoding); reader = createIntegerReader(encoding.getKind(), data, true, false); } }
Example #14
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 5 votes |
@Override
void startStripe(Map<org.apache.orc.impl.StreamName, InStream> streams,
                 OrcProto.StripeFooter stripeFooter) throws IOException {
  super.startStripe(streams, stripeFooter);
  // Re-create the integer reader from this stripe's DATA stream, using the
  // stripe-specific column encoding.
  InStream dataStream = streams.get(
      new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.DATA));
  reader = createIntegerReader(
      stripeFooter.getColumnsList().get(columnId).getKind(), dataStream, true, false);
}
Example #15
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 5 votes |
@Override
void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
  // Only the two DIRECT variants are legal for this reader.
  OrcProto.ColumnEncoding.Kind kind = encoding.getKind();
  boolean direct = kind == OrcProto.ColumnEncoding.Kind.DIRECT
      || kind == OrcProto.ColumnEncoding.Kind.DIRECT_V2;
  if (!direct) {
    throw new IOException("Unknown encoding " + encoding + " in column " + columnId);
  }
}
Example #16
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 5 votes |
@Override
void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
  switch (encoding.getKind()) {
    case DICTIONARY:
    case DICTIONARY_V2:
      break; // the only encodings this reader supports
    default:
      throw new IOException("Unknown encoding " + encoding + " in column " + columnId);
  }
}
Example #17
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 5 votes |
@Override
void startStripe(Map<org.apache.orc.impl.StreamName, InStream> streams,
                 OrcProto.StripeFooter stripeFooter) throws IOException {
  super.startStripe(streams, stripeFooter);
  // Seconds come from DATA, sub-second values from SECONDARY.
  // NOTE(review): the true/false flags presumably select signed vs. unsigned
  // decoding — confirm against createIntegerReader.
  data = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
      streams.get(new org.apache.orc.impl.StreamName(columnId,
          OrcProto.Stream.Kind.DATA)), true, skipCorrupt);
  nanos = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
      streams.get(new org.apache.orc.impl.StreamName(columnId,
          OrcProto.Stream.Kind.SECONDARY)), false, skipCorrupt);
  // Each stripe may carry its own writer time zone.
  base_timestamp = getBaseTimestamp(stripeFooter.getWriterTimezone());
}
Example #18
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 5 votes |
@Override
void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
  switch (encoding.getKind()) {
    case DIRECT:
    case DIRECT_V2:
      break; // the only encodings this reader supports
    default:
      throw new IOException("Unknown encoding " + encoding + " in column " + columnId);
  }
}
Example #19
Source File: HiveORCVectorizedReader.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Maps each selected top-level column ordinal to the id of its type node in
 * the flattened ORC {@code types} list.
 */
private int[] getOrdinalIdsOfSelectedColumns(List< OrcProto.Type > types, List<Integer> selectedColumns, boolean isOriginal) {
  // Non-original files skip past the transaction row wrapper — presumably the
  // ACID layout; TODO confirm.
  int rootColumn = isOriginal ? 0 : TRANS_ROW_COLUMN_INDEX + 1;
  // NOTE(review): sized to types.size() although only direct children of the
  // root are filled in; unselected slots stay 0 — confirm callers expect that.
  int[] ids = new int[types.size()];
  OrcProto.Type root = types.get(rootColumn);
  // iterating over only direct children
  for(int i = 0; i < root.getSubtypesCount(); ++i) {
    if (selectedColumns.contains(i)) {
      // find the position of this column in the types list
      ids[i] = root.getSubtypes(i);
    }
  }
  return ids;
}
Example #20
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 5 votes |
/**
 * Directly-encoded string reader: raw bytes come from the DATA stream and
 * per-value sizes from the LENGTH stream.
 */
protected StringDirectTreeReader(int columnId, InStream present, InStream data, InStream length,
                                 OrcProto.ColumnEncoding.Kind encoding) throws IOException {
  super(columnId, present);
  this.scratchlcv = new LongColumnVector();
  this.stream = data;
  // The length reader needs both the stream itself and an encoding kind.
  if (encoding != null && length != null) {
    this.lengths = createIntegerReader(encoding, length, false, false);
  }
}
Example #21
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 5 votes |
@Override
void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
  // Reject anything other than DIRECT / DIRECT_V2.
  OrcProto.ColumnEncoding.Kind kind = encoding.getKind();
  if (kind != OrcProto.ColumnEncoding.Kind.DIRECT
      && kind != OrcProto.ColumnEncoding.Kind.DIRECT_V2) {
    throw new IOException("Unknown encoding " + encoding + " in column " + columnId);
  }
}
Example #22
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 5 votes |
@Override
void startStripe(Map<org.apache.orc.impl.StreamName, InStream> streams,
                 OrcProto.StripeFooter stripeFooter) throws IOException {
  super.startStripe(streams, stripeFooter);
  // Raw DATA stream, wrapped in a text shim for reading the string bytes.
  org.apache.orc.impl.StreamName name = new org.apache.orc.impl.StreamName(columnId,
      OrcProto.Stream.Kind.DATA);
  stream = streams.get(name);
  data = new BasicTextReaderShim(stream);
  // LENGTH stream decoded with this stripe's column encoding; the false flags
  // mirror the other length readers.
  lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
      streams.get(new org.apache.orc.impl.StreamName(columnId,
          OrcProto.Stream.Kind.LENGTH)), false, false);
}
Example #23
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 5 votes |
@Override
void startStripe(Map<org.apache.orc.impl.StreamName, InStream> streams,
                 OrcProto.StripeFooter stripeFooter) throws IOException {
  super.startStripe(streams, stripeFooter);
  // Grab the raw DATA stream for this column.
  stream = streams.get(
      new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.DATA));
  // The LENGTH stream is decoded with the stripe's column encoding.
  InStream lengthStream = streams.get(
      new org.apache.orc.impl.StreamName(columnId, OrcProto.Stream.Kind.LENGTH));
  lengths = createIntegerReader(
      stripeFooter.getColumnsList().get(columnId).getKind(), lengthStream, false, false);
}
Example #24
Source File: PhysicalWriterImpl.java From flink with Apache License 2.0 | 5 votes |
/**
 * Serializes a row index for the given stream through a compressing OutStream.
 */
@Override
public void writeIndex(StreamName name, OrcProto.RowIndex.Builder index, CompressionCodec codec) throws IOException {
  OutputStream target = new OutStream(this.toString(), bufferSize, codec, createDataStream(name));
  index.build().writeTo(target);
  target.flush();
}
Example #25
Source File: PhysicalWriterImpl.java From flink with Apache License 2.0 | 5 votes |
/**
 * Serializes a bloom-filter index for the given stream through a compressing
 * OutStream.
 */
@Override
public void writeBloomFilter(StreamName name, OrcProto.BloomFilterIndex.Builder bloom, CompressionCodec codec) throws IOException {
  OutputStream sink = new OutStream(this.toString(), bufferSize, codec, createDataStream(name));
  bloom.build().writeTo(sink);
  sink.flush();
}
Example #26
Source File: PhysicalWriterImpl.java From flink with Apache License 2.0 | 5 votes |
/**
 * Writes the file metadata section and records its serialized size for the
 * postscript.
 */
@Override
public void writeFileMetadata(OrcProto.Metadata.Builder builder) throws IOException {
  long metadataStart = out.getPos();
  builder.build().writeTo(protobufWriter);
  protobufWriter.flush();
  writer.flush();
  this.metadataLength = (int) (out.getPos() - metadataStart);
}
Example #27
Source File: PhysicalWriterImpl.java From flink with Apache License 2.0 | 5 votes |
/**
 * Writes the file footer and records its serialized size for the postscript.
 */
@Override
public void writeFileFooter(OrcProto.Footer.Builder builder) throws IOException {
  // Body length = everything written so far minus the metadata section.
  builder.setContentLength(out.getPos() - metadataLength);
  builder.setHeaderLength(headerLength);
  long footerStart = out.getPos();
  builder.build().writeTo(protobufWriter);
  protobufWriter.flush();
  writer.flush();
  this.footerLength = (int) (out.getPos() - footerStart);
}
Example #28
Source File: PhysicalWriterImpl.java From flink with Apache License 2.0 | 5 votes |
/**
 * Writes a stripe footer and fills in the stripe's directory entry (offset
 * and footer length).
 */
private void writeStripeFooter(OrcProto.StripeFooter footer, long dataSize, long indexSize,
                               OrcProto.StripeInformation.Builder dirEntry) throws IOException {
  footer.writeTo(protobufWriter);
  protobufWriter.flush();
  writer.flush();
  // Footer length is whatever was written beyond the index and data sections.
  long footerBytes = out.getPos() - stripeStart - dataSize - indexSize;
  dirEntry.setOffset(stripeStart);
  dirEntry.setFooterLength(footerBytes);
}
Example #29
Source File: TreeReaderFactory.java From tajo with Apache License 2.0 | 5 votes |
/**
 * Binary column reader: raw bytes from the DATA stream, per-value sizes from
 * the LENGTH stream.
 */
protected BinaryTreeReader(int columnId, InStream present, InStream data, InStream length,
                           OrcProto.ColumnEncoding encoding) throws IOException {
  super(columnId, present);
  scratchlcv = new LongColumnVector();
  this.stream = data;
  // Only wire up the length reader when both pieces are available.
  if (encoding != null && length != null) {
    checkEncoding(encoding);
    this.lengths = createIntegerReader(encoding.getKind(), length, false, false);
  }
}
Example #30
Source File: HiveORCVectorizedReader.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Maps each selected top-level column ordinal to the id of its type node in
 * the flattened ORC {@code types} list.
 */
private int[] getOrdinalIdsOfSelectedColumns(List< OrcProto.Type > types, List<Integer> selectedColumns, boolean isOriginal) {
  // Non-original files skip past the transaction row wrapper — presumably the
  // ACID layout; TODO confirm.
  int rootColumn = isOriginal ? 0 : TRANS_ROW_COLUMN_INDEX + 1;
  // NOTE(review): sized to types.size() although only direct children of the
  // root are filled in; unselected slots stay 0 — confirm callers expect that.
  int[] ids = new int[types.size()];
  OrcProto.Type root = types.get(rootColumn);
  // iterating over only direct children
  for(int i = 0; i < root.getSubtypesCount(); ++i) {
    if (selectedColumns.contains(i)) {
      // find the position of this column in the types list
      ids[i] = root.getSubtypes(i);
    }
  }
  return ids;
}