Java Code Examples for org.apache.parquet.column.Dictionary#getMaxId()
The following examples show how to use
org.apache.parquet.column.Dictionary#getMaxId().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ArrowVectorAccessors.java From iceberg with Apache License 2.0 | 5 votes |
DictionaryFloatAccessor(IntVector vector, Dictionary dictionary) { super(vector); this.offsetVector = vector; this.decodedDictionary = new float[dictionary.getMaxId() + 1]; for (int i = 0; i <= dictionary.getMaxId(); i++) { decodedDictionary[i] = dictionary.decodeToFloat(i); } }
Example 2
Source File: GlobalDictionaryBuilder.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Merges the int values of all given Parquet dictionaries, plus the values of an optional
 * existing dictionary container, into one vector of distinct values in ascending order.
 *
 * @param dictionaries     dictionaries whose entries (ids 0..getMaxId()) are decoded as ints
 * @param existingDict     previously built dictionary container, or {@code null} if there is none
 * @param columnDescriptor column the dictionary belongs to; its path names the output field
 * @param bufferAllocator  allocator backing the returned container
 * @return a container holding a single {@code IntVector} with the merged values
 */
private static VectorContainer buildIntegerGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.Int(32, true), null);
  final VectorContainer input = new VectorContainer(bufferAllocator);
  final IntVector intVector = input.addOrGet(field);
  intVector.allocateNew();

  // A sorted set both removes duplicates and fixes the output order.
  final SortedSet<Integer> merged = Sets.newTreeSet();
  for (Dictionary dictionary : dictionaries) {
    for (int id = 0; id <= dictionary.getMaxId(); ++id) {
      merged.add(dictionary.decodeToInt(id));
    }
  }

  if (existingDict != null) {
    final IntVector previous = existingDict.getValueAccessorById(IntVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.get(row));
    }
  }

  int recordCount = 0;
  for (int value : merged) {
    intVector.setSafe(recordCount, value);
    recordCount++;
  }

  intVector.setValueCount(recordCount);
  input.setRecordCount(recordCount);
  input.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return input;
}
Example 3
Source File: GlobalDictionaryBuilder.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Merges the long values of all given Parquet dictionaries, plus the values of an optional
 * existing dictionary container, into one vector of distinct values in ascending order.
 *
 * @param dictionaries     dictionaries whose entries (ids 0..getMaxId()) are decoded as longs
 * @param existingDict     previously built dictionary container, or {@code null} if there is none
 * @param columnDescriptor column the dictionary belongs to; its path names the output field
 * @param bufferAllocator  allocator backing the returned container
 * @return a container holding a single {@code BigIntVector} with the merged values
 */
private static VectorContainer buildLongGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.Int(64, true), null);
  final VectorContainer input = new VectorContainer(bufferAllocator);
  final BigIntVector longVector = input.addOrGet(field);
  longVector.allocateNew();

  // A sorted set both removes duplicates and fixes the output order.
  final SortedSet<Long> merged = Sets.newTreeSet();
  for (Dictionary dictionary : dictionaries) {
    for (int id = 0; id <= dictionary.getMaxId(); ++id) {
      merged.add(dictionary.decodeToLong(id));
    }
  }

  if (existingDict != null) {
    final BigIntVector previous = existingDict.getValueAccessorById(BigIntVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.get(row));
    }
  }

  int recordCount = 0;
  for (long value : merged) {
    longVector.setSafe(recordCount, value);
    recordCount++;
  }

  longVector.setValueCount(recordCount);
  input.setRecordCount(recordCount);
  input.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return input;
}
Example 4
Source File: GlobalDictionaryBuilder.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Merges the double values of all given Parquet dictionaries, plus the values of an optional
 * existing dictionary container, into one vector of distinct values in ascending order.
 *
 * @param dictionaries     dictionaries whose entries (ids 0..getMaxId()) are decoded as doubles
 * @param existingDict     previously built dictionary container, or {@code null} if there is none
 * @param columnDescriptor column the dictionary belongs to; its path names the output field
 * @param bufferAllocator  allocator backing the returned container
 * @return a container holding a single {@code Float8Vector} with the merged values
 */
private static VectorContainer buildDoubleGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null);
  final VectorContainer input = new VectorContainer(bufferAllocator);
  final Float8Vector doubleVector = input.addOrGet(field);
  doubleVector.allocateNew();

  // A sorted set both removes duplicates and fixes the output order.
  final SortedSet<Double> merged = Sets.newTreeSet();
  for (Dictionary dictionary : dictionaries) {
    for (int id = 0; id <= dictionary.getMaxId(); ++id) {
      merged.add(dictionary.decodeToDouble(id));
    }
  }

  if (existingDict != null) {
    final Float8Vector previous = existingDict.getValueAccessorById(Float8Vector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.get(row));
    }
  }

  int recordCount = 0;
  for (double value : merged) {
    doubleVector.setSafe(recordCount, value);
    recordCount++;
  }

  doubleVector.setValueCount(recordCount);
  input.setRecordCount(recordCount);
  input.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return input;
}
Example 5
Source File: GlobalDictionaryBuilder.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Merges the float values of all given Parquet dictionaries, plus the values of an optional
 * existing dictionary container, into one vector of distinct values in ascending order.
 *
 * @param dictionaries     dictionaries whose entries (ids 0..getMaxId()) are decoded as floats
 * @param existingDict     previously built dictionary container, or {@code null} if there is none
 * @param columnDescriptor column the dictionary belongs to; its path names the output field
 * @param bufferAllocator  allocator backing the returned container
 * @return a container holding a single {@code Float4Vector} with the merged values
 */
private static VectorContainer buildFloatGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null);
  final VectorContainer input = new VectorContainer(bufferAllocator);
  final Float4Vector floatVector = input.addOrGet(field);
  floatVector.allocateNew();

  // A sorted set both removes duplicates and fixes the output order.
  final SortedSet<Float> merged = Sets.newTreeSet();
  for (Dictionary dictionary : dictionaries) {
    for (int id = 0; id <= dictionary.getMaxId(); ++id) {
      merged.add(dictionary.decodeToFloat(id));
    }
  }

  if (existingDict != null) {
    final Float4Vector previous = existingDict.getValueAccessorById(Float4Vector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.get(row));
    }
  }

  int recordCount = 0;
  for (float value : merged) {
    floatVector.setSafe(recordCount, value);
    recordCount++;
  }

  floatVector.setValueCount(recordCount);
  input.setRecordCount(recordCount);
  input.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return input;
}
Example 6
Source File: GlobalDictionaryBuilder.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Merges the binary values of all given Parquet dictionaries, plus the values of an optional
 * existing dictionary container, into one vector of distinct values ordered by the
 * {@code Binary} natural ordering.
 *
 * @param dictionaries     dictionaries whose entries (ids 0..getMaxId()) are decoded as binaries
 * @param existingDict     previously built dictionary container, or {@code null} if there is none
 * @param columnDescriptor column the dictionary belongs to; its path names the output field
 * @param bufferAllocator  allocator backing the returned container
 * @return a container holding a single {@code VarBinaryVector} with the merged values
 */
private static VectorContainer buildBinaryGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.Binary(), null);
  final VectorContainer input = new VectorContainer(bufferAllocator);
  final VarBinaryVector binaryVector = input.addOrGet(field);
  binaryVector.allocateNew();

  // A sorted set both removes duplicates and fixes the output order.
  final SortedSet<Binary> merged = new TreeSet<>();
  for (Dictionary dictionary : dictionaries) {
    for (int id = 0; id <= dictionary.getMaxId(); ++id) {
      merged.add(dictionary.decodeToBinary(id));
    }
  }

  if (existingDict != null) {
    final VarBinaryVector previous = existingDict.getValueAccessorById(VarBinaryVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      final byte[] existingBytes = previous.get(row);
      merged.add(Binary.fromConstantByteArray(existingBytes));
    }
  }

  int recordCount = 0;
  for (Binary value : merged) {
    final byte[] data = value.getBytes();
    binaryVector.setSafe(recordCount, data, 0, data.length);
    recordCount++;
  }

  binaryVector.setValueCount(recordCount);
  input.setRecordCount(recordCount);
  input.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return input;
}
Example 7
Source File: LocalDictionariesReader.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Prints every entry of a column's local dictionary to standard output, one
 * {@code "<id>: <value>"} line per entry, after a header line naming the column.
 *
 * <p>Entries are decoded according to the column's primitive type; types without a matching
 * case are silently skipped.
 *
 * @param columnDescriptor column the dictionary belongs to; selects the decode method
 * @param localDictionary  dictionary whose entries are printed
 */
public static void printDictionary(ColumnDescriptor columnDescriptor, Dictionary localDictionary) {
  System.out.println("Dictionary for column " + columnDescriptor.toString());
  // getMaxId() is the highest valid id, so the bound must be inclusive; the previous
  // exclusive bound dropped the last dictionary entry.
  for (int i = 0; i <= localDictionary.getMaxId(); ++i) {
    switch (columnDescriptor.getType()) {
      case INT32:
        System.out.println(format("%d: %d", i, localDictionary.decodeToInt(i)));
        break;
      case INT64:
        System.out.println(format("%d: %d", i, localDictionary.decodeToLong(i)));
        break;
      case INT96:
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
        // NOTE(review): bytes are converted with the platform default charset — confirm intended.
        System.out.println(format("%d: %s", i, new String(localDictionary.decodeToBinary(i).getBytesUnsafe())));
        break;
      case FLOAT:
        System.out.println(format("%d: %f", i, localDictionary.decodeToFloat(i)));
        break;
      case DOUBLE:
        System.out.println(format("%d: %f", i, localDictionary.decodeToDouble(i)));
        break;
      case BOOLEAN:
        System.out.println(format("%d: %b", i, localDictionary.decodeToBoolean(i)));
        break;
      default:
        break;
    }
  }
}
Example 8
Source File: ProtoMessageConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Pre-translates every binary dictionary entry into its enum value descriptor so later
 * lookups can go through the {@code dict} array by dictionary id.
 *
 * @param dictionary Parquet dictionary whose entries (ids 0..getMaxId()) are translated
 */
@Override
public void setDictionary(Dictionary dictionary) {
  final int entryCount = dictionary.getMaxId() + 1;
  dict = new Descriptors.EnumValueDescriptor[entryCount];
  for (int id = 0; id < entryCount; id++) {
    dict[id] = translateEnumValue(dictionary.decodeToBinary(id));
  }
}
Example 9
Source File: TupleConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Pre-decodes every binary dictionary entry into a UTF-8 string stored in the {@code dict}
 * array, indexed by dictionary id.
 *
 * @param dictionary Parquet dictionary whose entries (ids 0..getMaxId()) are decoded
 */
@Override
public void setDictionary(Dictionary dictionary) {
  final int entryCount = dictionary.getMaxId() + 1;
  dict = new String[entryCount];
  for (int id = 0; id < entryCount; id++) {
    final Binary entry = dictionary.decodeToBinary(id);
    dict[id] = entry.toStringUsingUTF8();
  }
}
Example 10
Source File: AvroConverters.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Pre-converts every binary dictionary entry with {@code convert} and stores the results in
 * the {@code dict} array, indexed by dictionary id.
 *
 * @param dictionary Parquet dictionary whose entries (ids 0..getMaxId()) are converted
 */
@Override
@SuppressWarnings("unchecked")
public void setDictionary(Dictionary dictionary) {
  final int entryCount = dictionary.getMaxId() + 1;
  // Generic arrays cannot be created directly, so an Object[] is allocated and cast.
  dict = (T[]) new Object[entryCount];
  for (int id = 0; id < entryCount; id++) {
    dict[id] = convert(dictionary.decodeToBinary(id));
  }
}
Example 11
Source File: ArrowVectorAccessors.java From iceberg with Apache License 2.0 | 4 votes |
private DictionaryDecimalAccessor(IntVector vector, Dictionary dictionary) { super(vector); this.offsetVector = vector; this.parquetDictionary = dictionary; this.cache = new Decimal[dictionary.getMaxId() + 1]; }
Example 12
Source File: DictionaryFilter.java From parquet-mr with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked") private <T extends Comparable<T>> Set<T> expandDictionary(ColumnChunkMetaData meta) throws IOException { ColumnDescriptor col = new ColumnDescriptor(meta.getPath().toArray(), meta.getPrimitiveType(), -1, -1); DictionaryPage page = dictionaries.readDictionaryPage(col); // the chunk may not be dictionary-encoded if (page == null) { return null; } Dictionary dict = page.getEncoding().initDictionary(col, page); IntFunction<Object> dictValueProvider; PrimitiveTypeName type = meta.getPrimitiveType().getPrimitiveTypeName(); switch (type) { case FIXED_LEN_BYTE_ARRAY: // Same as BINARY case BINARY: dictValueProvider = dict::decodeToBinary; break; case INT32: dictValueProvider = dict::decodeToInt; break; case INT64: dictValueProvider = dict::decodeToLong; break; case FLOAT: dictValueProvider = dict::decodeToFloat; break; case DOUBLE: dictValueProvider = dict::decodeToDouble; break; default: LOG.warn("Unsupported dictionary type: {}", type); return null; } Set<T> dictSet = new HashSet<>(); for (int i = 0; i <= dict.getMaxId(); i++) { dictSet.add((T) dictValueProvider.apply(i)); } return dictSet; }
Example 13
Source File: ShowDictionaryCommand.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Prints the dictionary of the configured column for every row group of a single Parquet file.
 *
 * <p>Row groups whose column chunk has no dictionary page are reported as such instead of
 * failing. The file reader is closed when the command finishes, whether normally or with an
 * exception.
 *
 * @return {@code 0} on success
 * @throws IOException              if the Parquet file cannot be opened or read
 * @throws IllegalArgumentException if not exactly one target file was given, or the column has
 *                                  a primitive type with no supported dictionary decoding
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);

  // try-with-resources: the reader holds an open file and was previously never closed.
  try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
    MessageType schema = reader.getFileMetaData().getSchema();
    ColumnDescriptor descriptor = Util.descriptor(column, schema);
    PrimitiveType type = Util.primitive(column, schema);
    Preconditions.checkNotNull(type);

    DictionaryPageReadStore dictionaryReader;
    int rowGroup = 0;
    while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
      DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);

      if (page == null) {
        // The column chunk may not be dictionary-encoded; report it rather than hit an NPE.
        console.info("\nRow group {} has no dictionary for \"{}\"", rowGroup, column);
      } else {
        Dictionary dict = page.getEncoding().initDictionary(descriptor, page);

        console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column);
        for (int i = 0; i <= dict.getMaxId(); i += 1) {
          switch (type.getPrimitiveTypeName()) {
            case BINARY:
              if (type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation) {
                console.info("{}: {}", String.format("%6d", i),
                    Util.humanReadable(dict.decodeToBinary(i).toStringUsingUTF8(), 70));
              } else {
                console.info("{}: {}", String.format("%6d", i),
                    Util.humanReadable(dict.decodeToBinary(i).getBytesUnsafe(), 70));
              }
              break;
            case INT32:
              console.info("{}: {}", String.format("%6d", i), dict.decodeToInt(i));
              break;
            case INT64:
              console.info("{}: {}", String.format("%6d", i), dict.decodeToLong(i));
              break;
            case FLOAT:
              console.info("{}: {}", String.format("%6d", i), dict.decodeToFloat(i));
              break;
            case DOUBLE:
              console.info("{}: {}", String.format("%6d", i), dict.decodeToDouble(i));
              break;
            default:
              throw new IllegalArgumentException(
                  "Unknown dictionary type: " + type.getPrimitiveTypeName());
          }
        }
      }

      reader.skipNextRowGroup();
      rowGroup += 1;
    }
  }

  console.info("");

  return 0;
}