Java Code Examples for org.apache.parquet.io.api.Binary#length()
The following examples show how to use
org.apache.parquet.io.api.Binary#length().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: VarLenEntryDictionaryReader.java From Bats with Apache License 2.0 | 6 votes |
private final VarLenColumnBulkEntry getEntrySingle(int valsToReadWithinPage) { final DictionaryReaderWrapper valueReader = pageInfo.dictionaryValueReader; final int[] valueLengths = entry.getValuesLength(); final Binary currEntry = valueReader.getEntry(); final int dataLen = currEntry.length(); // Is there enough memory to handle this large value? if (batchMemoryConstraintsReached(0, 4, dataLen)) { entry.set(0, 0, 0, 0); // no data to be consumed return entry; } // Set the value length valueLengths[0] = dataLen; // Now set the bulk entry entry.set(0, dataLen, 1, 1, currEntry.getBytes()); return entry; }
Example 2
Source File: DeltaByteArrayReader.java From parquet-mr with Apache License 2.0 | 6 votes |
@Override public Binary readBytes() { int prefixLength = prefixLengthReader.readInteger(); // This does not copy bytes Binary suffix = suffixReader.readBytes(); int length = prefixLength + suffix.length(); // NOTE: due to PARQUET-246, it is important that we // respect prefixLength which was read from prefixLengthReader, // even for the *first* value of a page. Even though the first // value of the page should have an empty prefix, it may not // because of PARQUET-246. // We have to do this to materialize the output if(prefixLength != 0) { byte[] out = new byte[length]; System.arraycopy(previous.getBytesUnsafe(), 0, out, 0, prefixLength); System.arraycopy(suffix.getBytesUnsafe(), 0, out, prefixLength, suffix.length()); previous = Binary.fromConstantByteArray(out); } else { previous = suffix; } return previous; }
Example 3
Source File: TestBinaryTruncator.java From parquet-mr with Apache License 2.0 | 6 votes |
private void checkContract(BinaryTruncator truncator, Comparator<Binary> comparator, Binary value, boolean strictMin, boolean strictMax) { int length = value.length(); // Edge cases: returning the original value if no truncation is required assertSame(value, truncator.truncateMin(value, length)); assertSame(value, truncator.truncateMax(value, length)); assertSame(value, truncator.truncateMin(value, random(length + 1, length * 2 + 1))); assertSame(value, truncator.truncateMax(value, random(length + 1, length * 2 + 1))); if (length > 1) { checkMinContract(truncator, comparator, value, length - 1, strictMin); checkMaxContract(truncator, comparator, value, length - 1, strictMax); checkMinContract(truncator, comparator, value, random(1, length - 1), strictMin); checkMaxContract(truncator, comparator, value, random(1, length - 1), strictMax); } // Edge case: possible to truncate min value to 0 length if original value is not empty checkMinContract(truncator, comparator, value, 0, strictMin); // Edge case: impossible to truncate max value to 0 length -> returning the original value assertSame(value, truncator.truncateMax(value, 0)); }
Example 4
Source File: BinaryColumnReader.java From presto with Apache License 2.0 | 6 votes |
/**
 * Reads one value from the column and appends it (or a null) to the block builder.
 *
 * @param blockBuilder destination for the value
 * @param type         type used to post-process and write the slice
 */
@Override
protected void readValue(BlockBuilder blockBuilder, Type type) {
  if (definitionLevel != columnDescriptor.getMaxDefinitionLevel()) {
    // Not at the maximum definition level: append a null only when isValueNull() says so.
    if (isValueNull()) {
      blockBuilder.appendNull();
    }
    return;
  }

  Binary raw = valuesReader.readBytes();
  // Zero-length binaries map to the shared empty slice instead of wrapping a new buffer.
  Slice slice = (raw.length() == 0) ? EMPTY_SLICE : wrappedBuffer(raw.getBytes());

  if (isVarcharType(type)) {
    slice = truncateToLength(slice, type);
  }
  if (isCharType(type)) {
    slice = truncateToLengthAndTrimSpaces(slice, type);
  }
  type.writeSlice(blockBuilder, slice);
}
Example 5
Source File: TestDictionary.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test public void testBinaryDictionaryFallBack() throws IOException { int slabSize = 100; int maxDictionaryByteSize = 50; final ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(maxDictionaryByteSize, slabSize); int fallBackThreshold = maxDictionaryByteSize; int dataSize=0; for (long i = 0; i < 100; i++) { Binary binary = Binary.fromString("str" + i); cw.writeBytes(binary); dataSize += (binary.length() + 4); if (dataSize < fallBackThreshold) { assertEquals(PLAIN_DICTIONARY, cw.getEncoding()); } else { assertEquals(PLAIN, cw.getEncoding()); } } //Fallbacked to Plain encoding, therefore use PlainValuesReader to read it back ValuesReader reader = new BinaryPlainValuesReader(); reader.initFromPage(100, cw.getBytes().toInputStream()); for (long i = 0; i < 100; i++) { assertEquals(Binary.fromString("str" + i), reader.readBytes()); } //simulate cutting the page cw.reset(); assertEquals(0, cw.getBufferedSize()); }
Example 6
Source File: PrimitiveStringifier.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Renders a 12-byte binary as an interval made of three little-endian 32-bit fields.
 *
 * @param value binary to render
 * @return {@code BINARY_INVALID} when the value is not exactly 12 bytes long, otherwise the
 *         textual interval
 */
@Override
String stringifyNotNull(Binary value) {
  if (value.length() == 12) {
    ByteBuffer littleEndian = value.toByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
    // Absolute reads relative to the buffer's current position; the position is not moved.
    int base = littleEndian.position();
    String months = UNSIGNED_STRINGIFIER.stringify(littleEndian.getInt(base));
    String days = UNSIGNED_STRINGIFIER.stringify(littleEndian.getInt(base + 4));
    String millis = UNSIGNED_STRINGIFIER.stringify(littleEndian.getInt(base + 8));
    return new StringBuilder()
        .append("interval(")
        .append(months)
        .append(" months, ")
        .append(days)
        .append(" days, ")
        .append(millis)
        .append(" millis)")
        .toString();
  }
  return BINARY_INVALID;
}
Example 7
Source File: FixedLenByteArrayPlainValuesWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Writes a fixed-length binary to the output.
 *
 * @param v value to write
 * @throws IllegalArgumentException if the value's length differs from {@code length}
 * @throws ParquetEncodingException if writing to the output fails with an I/O error
 */
@Override
public final void writeBytes(Binary v) {
  final int actualSize = v.length();
  if (actualSize != length) {
    throw new IllegalArgumentException(
        "Fixed Binary size " + actualSize + " does not match field type length " + length);
  }

  try {
    v.writeTo(out);
  } catch (IOException e) {
    // Surface the I/O failure as an encoding error, keeping the cause.
    throw new ParquetEncodingException("could not write fixed bytes", e);
  }
}
Example 8
Source File: FallbackValuesWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
@Override public void writeBytes(Binary v) { //for rawdata, length(4 bytes int) is stored, followed by the binary content itself rawDataByteSize += v.length() + 4; currentWriter.writeBytes(v); checkFallback(); }
Example 9
Source File: ParquetGroupConverter.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Forwards the bytes of a decimal value to the decimal writer and marks the value as written.
 *
 * @param value binary holding the decimal's bytes
 */
@Override
public void addBinary(Binary value) {
  final byte[] bytes = value.getBytes();
  // The bytes are passed on unchanged together with the holder's precision and scale.
  // NOTE(review): the writer method name indicates big-endian input, and the earlier comment
  // here said the bytes are swapped while being written into the vector - confirm.
  writer.writeBigEndianBytesToDecimal(bytes, new ArrowType.Decimal(holder.precision, holder.scale));
  setWritten();
}
Example 10
Source File: ParquetGroupConverter.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Copies a binary value into the reusable buffer and writes it as a VarChar.
 *
 * @param value binary to write; values longer than {@code varValueSizeLimit} are rejected
 *              with the exception built by {@code createFieldSizeLimitException}
 */
@Override
public void addBinary(Binary value) {
  final int size = value.length();
  if (size > this.varValueSizeLimit) {
    throw createFieldSizeLimitException(size, this.varValueSizeLimit);
  }

  // Keep both the field and the holder pointing at whatever buffer reallocIfNeeded returns.
  buf = buf.reallocIfNeeded(size);
  holder.buffer = buf;
  buf.setBytes(0, value.toByteBuffer());

  holder.start = 0;
  holder.end = size;
  writer.writeVarChar(holder.start, holder.end, holder.buffer);
  setWritten();
}
Example 11
Source File: ParquetGroupConverter.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Copies a binary value into the reusable buffer and writes it as a VarBinary.
 *
 * @param value binary to write; values longer than {@code varValueSizeLimit} are rejected
 *              with the exception built by {@code createFieldSizeLimitException}
 */
@Override
public void addBinary(Binary value) {
  final int size = value.length();
  if (size > this.varValueSizeLimit) {
    throw createFieldSizeLimitException(size, this.varValueSizeLimit);
  }

  // Keep both the field and the holder pointing at whatever buffer reallocIfNeeded returns.
  buf = buf.reallocIfNeeded(size);
  holder.buffer = buf;
  buf.setBytes(0, value.toByteBuffer());

  holder.start = 0;
  holder.end = size;
  writer.writeVarBinary(holder.start, holder.end, holder.buffer);
  setWritten();
}
Example 12
Source File: ParquetTimestampUtils.java From flink with Apache License 2.0 | 5 votes |
/** * Returns GMT timestamp from binary encoded parquet timestamp (12 bytes - julian date + time of day nanos). * * @param timestampBinary INT96 parquet timestamp * @return timestamp in millis, GMT timezone */ public static long getTimestampMillis(Binary timestampBinary) { if (timestampBinary.length() != 12) { throw new IllegalArgumentException("Parquet timestamp must be 12 bytes, actual " + timestampBinary.length()); } byte[] bytes = timestampBinary.getBytes(); // little endian encoding - need to invert byte order long timeOfDayNanos = ByteBuffer.wrap(new byte[] {bytes[7], bytes[6], bytes[5], bytes[4], bytes[3], bytes[2], bytes[1], bytes[0]}).getLong(); int julianDay = ByteBuffer.wrap(new byte[] {bytes[11], bytes[10], bytes[9], bytes[8]}).getInt(); return julianDayToMillis(julianDay) + (timeOfDayNanos / NANOS_PER_MILLISECOND); }
Example 13
Source File: ParquetTimestampUtils.java From presto with Apache License 2.0 | 5 votes |
/** * Returns GMT timestamp from binary encoded parquet timestamp (12 bytes - julian date + time of day nanos). * * @param timestampBinary INT96 parquet timestamp * @return timestamp in millis, GMT timezone */ public static long getTimestampMillis(Binary timestampBinary) { if (timestampBinary.length() != 12) { throw new PrestoException(NOT_SUPPORTED, "Parquet timestamp must be 12 bytes, actual " + timestampBinary.length()); } byte[] bytes = timestampBinary.getBytes(); // little endian encoding - need to invert byte order long timeOfDayNanos = Longs.fromBytes(bytes[7], bytes[6], bytes[5], bytes[4], bytes[3], bytes[2], bytes[1], bytes[0]); int julianDay = Ints.fromBytes(bytes[11], bytes[10], bytes[9], bytes[8]); return julianDayToMillis(julianDay) + (timeOfDayNanos / NANOS_PER_MILLISECOND); }
Example 14
Source File: DrillParquetGroupConverter.java From Bats with Apache License 2.0 | 5 votes |
/**
 * Copies a binary value into a buffer attached to the holder and writes the holder.
 *
 * @param value binary to write
 */
@Override
public void addBinary(Binary value) {
  final int size = value.length();

  // NOTE(review): the result of reallocIfNeeded is stored only in the holder, not back into
  // buf - confirm that buf stays valid when a reallocation actually happens.
  holder.buffer = buf.reallocIfNeeded(size);
  holder.buffer.setBytes(0, value.toByteBuffer());

  holder.start = 0;
  holder.end = size;
  writer.write(holder);
}
Example 15
Source File: DrillParquetGroupConverter.java From Bats with Apache License 2.0 | 5 votes |
/**
 * Copies a binary value into the reusable buffer and writes it through the holder.
 *
 * @param value binary to write
 */
@Override
public void addBinary(Binary value) {
  final int size = value.length();

  // Keep both the field and the holder pointing at whatever buffer reallocIfNeeded returns.
  buf = buf.reallocIfNeeded(size);
  holder.buffer = buf;
  buf.setBytes(0, value.toByteBuffer());

  holder.start = 0;
  holder.end = size;
  writer.write(holder);
}
Example 16
Source File: DrillParquetGroupConverter.java From Bats with Apache License 2.0 | 5 votes |
/**
 * Stages the bytes of a binary value in the reusable buffer, then emits the holder
 * describing the range {@code [0, value.length())}.
 *
 * @param value binary to write
 */
@Override
public void addBinary(Binary value) {
  final int byteCount = value.length();

  // reallocIfNeeded's result replaces the field first and is then shared with the holder.
  buf = buf.reallocIfNeeded(byteCount);
  holder.buffer = buf;
  buf.setBytes(0, value.toByteBuffer());

  holder.start = 0;
  holder.end = byteCount;
  writer.write(holder);
}
Example 17
Source File: VarLenNullableDictionaryReader.java From Bats with Apache License 2.0 | 5 votes |
private final VarLenColumnBulkEntry getEntrySingle(int valsToReadWithinPage) { final int[] valueLengths = entry.getValuesLength(); final DictionaryReaderWrapper valueReader = pageInfo.dictionaryValueReader; // Initialize the reader if needed pageInfo.definitionLevels.readFirstIntegerIfNeeded(); if (pageInfo.definitionLevels.readCurrInteger() == 1) { final Binary currEntry = valueReader.getEntry(); final int dataLen = currEntry.length(); // Is there enough memory to handle this large value? if (batchMemoryConstraintsReached(1, 4, dataLen)) { entry.set(0, 0, 0, 0); // no data to be consumed return entry; } // Set the value length valueLengths[0] = dataLen; // Now set the bulk entry entry.set(0, dataLen, 1, 1, currEntry.getBytes()); } else { valueLengths[0] = -1; // Now set the bulk entry entry.set(0, 0, 1, 0); } // read the next definition-level value since we know the current entry has been processed pageInfo.definitionLevels.nextIntegerIfNotEOF(); return entry; }
Example 18
Source File: TestColumnIndexBuilder.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Keeps only non-null, non-empty values whose first byte is {@code 'B'}.
 *
 * @param value candidate value, may be null
 * @return {@code true} if the value should be kept
 */
@Override
public boolean keep(Binary value) {
  if (value == null || value.length() <= 0) {
    return false;
  }
  return value.getBytesUnsafe()[0] == 'B';
}
Example 19
Source File: VarLenNullableDictionaryReader.java From Bats with Apache License 2.0 | 4 votes |
private final VarLenColumnBulkEntry getEntryBulk(int valuesToRead) { final DictionaryReaderWrapper valueReader = pageInfo.dictionaryValueReader; final int[] valueLengths = entry.getValuesLength(); final int readBatch = Math.min(entry.getMaxEntries(), valuesToRead); Preconditions.checkState(readBatch > 0, "Read batch count [%s] should be greater than zero", readBatch); final byte[] tgtBuff = entry.getInternalDataArray(); final int tgtLen = tgtBuff.length; // Counters int numValues = 0; int numNulls = 0; int tgtPos = 0; // Initialize the reader if needed pageInfo.definitionLevels.readFirstIntegerIfNeeded(); for (int idx = 0; idx < readBatch; ++idx ) { if (pageInfo.definitionLevels.readCurrInteger() == 1) { final Binary currEntry = valueReader.getEntry(); final int dataLen = currEntry.length(); if (tgtLen < (tgtPos + dataLen)) { valueReader.pushBack(currEntry); // push back this value since we're exiting from the loop break; } valueLengths[numValues++] = dataLen; if (dataLen > 0) { vlCopyNoPadding(currEntry.getBytes(), 0, tgtBuff, tgtPos, dataLen); // Update the counters tgtPos += dataLen; } } else { valueLengths[numValues++] = -1; ++numNulls; } // read the next definition-level value since we know the current entry has been processed pageInfo.definitionLevels.nextIntegerIfNotEOF(); } // We're here either because a) the Parquet metadata is wrong (advertises more values than the real count) // or the first value being processed ended up to be too long for the buffer. if (numValues == 0) { return getEntrySingle(valuesToRead); } entry.set(0, tgtPos, numValues, numValues - numNulls); return entry; }
Example 20
Source File: VarLenEntryDictionaryReader.java From Bats with Apache License 2.0 | 4 votes |
private final VarLenColumnBulkEntry getEntryBulk(int valuesToRead) { final DictionaryReaderWrapper valueReader = pageInfo.dictionaryValueReader; final int[] valueLengths = entry.getValuesLength(); final int readBatch = Math.min(entry.getMaxEntries(), valuesToRead); Preconditions.checkState(readBatch > 0, "Read batch count [%s] should be greater than zero", readBatch); final byte[] tgtBuff = entry.getInternalDataArray(); final int tgtLen = tgtBuff.length; // Counters int numValues = 0; int tgtPos = 0; for (int idx = 0; idx < readBatch; ++idx ) { final Binary currEntry = valueReader.getEntry(); final int dataLen = currEntry.length(); if (tgtLen < (tgtPos + dataLen)) { valueReader.pushBack(currEntry); // push back this value since we're exiting from the loop break; } valueLengths[numValues++] = dataLen; if (dataLen > 0) { vlCopyNoPadding(currEntry.getBytes(), 0, tgtBuff, tgtPos, dataLen); // Update the counters tgtPos += dataLen; } } // We're here either because a) the Parquet metadata is wrong (advertises more values than the real count) // or the first value being processed ended up to be too long for the buffer. if (numValues == 0) { return getEntrySingle(valuesToRead); } // Now set the bulk entry entry.set(0, tgtPos, numValues, numValues); return entry; }