org.apache.parquet.schema.PrimitiveType Java Examples
The following examples show how to use
org.apache.parquet.schema.PrimitiveType.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Metadata_V2.java From Bats with Apache License 2.0 | 6 votes |
/**
 * Serializes one column's metadata as a JSON object.
 *
 * <p>The object contains a {@code "name"} array holding every element of
 * {@code value.name}, an optional {@code "mxValue"} field and an optional
 * {@code "nulls"} field; each optional field is written only when the
 * corresponding member is non-null.
 *
 * @param value the column metadata to write
 * @param jgen the generator receiving the JSON tokens
 * @param provider unused by this implementation
 * @throws IOException if the generator fails to write
 */
@Override
public void serialize(ColumnMetadata_v2 value, JsonGenerator jgen, SerializerProvider provider)
    throws IOException, JsonProcessingException {
  jgen.writeStartObject();

  jgen.writeArrayFieldStart("name");
  for (String segment : value.name) {
    jgen.writeString(segment);
  }
  jgen.writeEndArray();

  if (value.mxValue != null) {
    boolean isBinary = value.primitiveType == PrimitiveType.PrimitiveTypeName.BINARY;
    // BINARY values are written as text rather than as a Binary object.
    // NOTE(review): new String(byte[]) decodes with the platform default charset -- confirm
    // that this matches whatever reads the value back.
    Object rendered = isBinary
        ? new String(((Binary) value.mxValue).getBytes())
        : value.mxValue;
    jgen.writeObjectField("mxValue", rendered);
  }

  if (value.nulls != null) {
    jgen.writeObjectField("nulls", value.nulls);
  }

  jgen.writeEndObject();
}
Example #2
Source File: ParquetResolverTest.java From pxf with Apache License 2.0 | 6 votes |
private MessageType getParquetSchemaForPrimitiveTypes(Type.Repetition repetition, boolean readCase) { List<Type> fields = new ArrayList<>(); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "s1", OriginalType.UTF8)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "s2", OriginalType.UTF8)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT32, "n1", null)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.DOUBLE, "d1", null)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, 16, "dc1", OriginalType.DECIMAL, new DecimalMetadata(38, 18), null)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT96, "tm", null)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.FLOAT, "f", null)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT64, "bg", null)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BOOLEAN, "b", null)); // GPDB only has int16 and not int8 type, so for write tiny numbers int8 are still treated as shorts in16 OriginalType tinyType = readCase ? OriginalType.INT_8 : OriginalType.INT_16; fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT32, "tn", tinyType)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT32, "sml", OriginalType.INT_16)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "vc1", OriginalType.UTF8)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "c1", OriginalType.UTF8)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "bin", null)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT96, "tmtz", null)); fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT96, "tmtz2", null)); return new MessageType("hive_schema", fields); }
Example #3
Source File: AvroSchemaConverter190Int96Avro18.java From datacollector with Apache License 2.0 | 6 votes |
/**
 * Attaches the logical type derived from {@code annotation} to {@code schema} when applicable.
 *
 * <p>The schema is returned untouched when no logical type is produced for the annotation, or
 * when the annotation is {@code DECIMAL} but the Parquet primitive is neither {@code BINARY}
 * nor {@code FIXED_LEN_BYTE_ARRAY}.
 *
 * @param schema schema to decorate
 * @param annotation Parquet original-type annotation
 * @param asPrimitive primitive type supplying the decimal metadata
 * @param parquetPrimitiveTypeName physical type of the column
 * @return the decorated schema, or {@code schema} itself when nothing was added
 */
private Schema addLogicalTypeToSchema(
    Schema schema,
    OriginalType annotation,
    PrimitiveType asPrimitive,
    PrimitiveType.PrimitiveTypeName parquetPrimitiveTypeName) {
  LogicalType logicalType =
      convertOriginalTypeToLogicalType(annotation, asPrimitive.getDecimalMetadata());
  if (logicalType == null) {
    return schema;
  }

  boolean decimalOnOtherPrimitive = annotation == DECIMAL
      && parquetPrimitiveTypeName != BINARY
      && parquetPrimitiveTypeName != FIXED_LEN_BYTE_ARRAY;
  if (decimalOnOtherPrimitive) {
    return schema;
  }

  return logicalType.addToSchema(schema);
}
Example #4
Source File: MessageTypeConverter.java From presto with Apache License 2.0 | 6 votes |
/**
 * Maps a schema-level primitive type name to its {@code org.apache.parquet.format.Type}
 * counterpart.
 *
 * @param type the primitive type name to translate
 * @return the matching format type ({@code BINARY} maps to {@code BYTE_ARRAY})
 * @throws RuntimeException if {@code type} is not one of the handled constants
 */
private static org.apache.parquet.format.Type getType(PrimitiveType.PrimitiveTypeName type) {
  final org.apache.parquet.format.Type formatType;
  switch (type) {
    case INT64:
      formatType = Type.INT64;
      break;
    case INT32:
      formatType = Type.INT32;
      break;
    case BOOLEAN:
      formatType = Type.BOOLEAN;
      break;
    case BINARY:
      formatType = Type.BYTE_ARRAY;
      break;
    case FLOAT:
      formatType = Type.FLOAT;
      break;
    case DOUBLE:
      formatType = Type.DOUBLE;
      break;
    case INT96:
      formatType = Type.INT96;
      break;
    case FIXED_LEN_BYTE_ARRAY:
      formatType = Type.FIXED_LEN_BYTE_ARRAY;
      break;
    default:
      throw new RuntimeException("Unknown primitive type " + type);
  }
  return formatType;
}
Example #5
Source File: TestParquetPredicateUtils.java From presto with Apache License 2.0 | 6 votes |
/**
 * A not-null domain on a map column must yield an "all" Parquet tuple domain.
 */
@Test
public void testParquetTupleDomainMap() {
  MapType mapType = new MapType(
      INTEGER,
      INTEGER,
      methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"),
      methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"),
      methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"),
      methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"));

  HiveColumnHandle columnHandle = createBaseColumn(
      "my_map", 0, HiveType.valueOf("map<int,int>"), mapType, REGULAR, Optional.empty());
  TupleDomain<HiveColumnHandle> domain =
      withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(mapType)));

  // File schema: optional group my_map { repeated group map { required key; optional value } }
  PrimitiveType keyField = new PrimitiveType(REQUIRED, INT32, "key");
  PrimitiveType valueField = new PrimitiveType(OPTIONAL, INT32, "value");
  GroupType entries = new GroupType(REPEATED, "map", keyField, valueField);
  GroupType mapColumn = new GroupType(OPTIONAL, "my_map", entries);
  MessageType fileSchema = new MessageType("hive_schema", mapColumn);

  Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
  TupleDomain<ColumnDescriptor> tupleDomain =
      getParquetTupleDomain(descriptorsByPath, domain, fileSchema, true);

  assertTrue(tupleDomain.isAll());
}
Example #6
Source File: Metadata.java From Bats with Apache License 2.0 | 6 votes |
/**
 * Resolves the column type information for the leaf reached by following {@code path} from
 * {@code type}.
 *
 * <p>Starting at {@code depth}, each non-primitive type is treated as a group and descended
 * into using the next path element until a primitive type is reached.
 *
 * @param schema schema used to look up the max repetition and definition levels of {@code path}
 * @param type type at which the walk starts
 * @param path full column path
 * @param depth index into {@code path} of the next element to descend into
 * @return the original type, decimal precision and scale (both 0 when the leaf has no decimal
 *     metadata) and the repetition/definition levels of the column
 */
private ColTypeInfo getColTypeInfo(MessageType schema, Type type, String[] path, int depth) {
  Type current = type;
  int level = depth;
  while (!current.isPrimitive()) {
    current = ((GroupType) current).getType(path[level]);
    level++;
  }

  PrimitiveType leaf = (PrimitiveType) current;
  int precision = 0;
  int scale = 0;
  if (leaf.getDecimalMetadata() != null) {
    precision = leaf.getDecimalMetadata().getPrecision();
    scale = leaf.getDecimalMetadata().getScale();
  }

  int repetitionLevel = schema.getMaxRepetitionLevel(path);
  int definitionLevel = schema.getMaxDefinitionLevel(path);
  return new ColTypeInfo(current.getOriginalType(), precision, scale, repetitionLevel, definitionLevel);
}
Example #7
Source File: TestMetadataReader.java From presto with Apache License 2.0 | 6 votes |
/**
 * Reads INT64 statistics with a null count of 13 and hex-encoded min/max, and checks that they
 * come back as {@code LongStatistics} with min -10 and max 42042.
 */
@Test(dataProvider = "allCreatedBy")
public void testReadStatsInt64(Optional<String> fileCreatedBy) {
  Statistics statistics = new Statistics();
  statistics.setNull_count(13);
  statistics.setMin(fromHex("F6FFFFFFFFFFFFFF"));
  statistics.setMax(fromHex("3AA4000000000000"));

  PrimitiveType column = new PrimitiveType(OPTIONAL, INT64, "Test column");

  assertThat(MetadataReader.readStats(fileCreatedBy, Optional.of(statistics), column))
      .isInstanceOfSatisfying(LongStatistics.class, longStats -> {
        assertEquals(longStats.getNumNulls(), 13);
        assertEquals(longStats.getMin(), -10);
        assertEquals(longStats.getMax(), 42042);
        assertEquals(longStats.genericGetMin(), (Long) (long) -10L);
        assertEquals(longStats.genericGetMax(), (Long) 42042L);
      });
}
Example #8
Source File: TestParquetPredicateUtils.java From presto with Apache License 2.0 | 6 votes |
/**
 * A not-null domain on a struct column must yield an "all" Parquet tuple domain.
 */
@Test
public void testParquetTupleDomainStruct() {
  RowType rowType = rowType(RowType.field("a", INTEGER), RowType.field("b", INTEGER));

  HiveColumnHandle columnHandle = createBaseColumn(
      "my_struct", 0, HiveType.valueOf("struct<a:int,b:int>"), rowType, REGULAR, Optional.empty());
  TupleDomain<HiveColumnHandle> domain =
      withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(rowType)));

  // File schema: optional group my_struct { optional int32 a; optional int32 b }
  PrimitiveType fieldA = new PrimitiveType(OPTIONAL, INT32, "a");
  PrimitiveType fieldB = new PrimitiveType(OPTIONAL, INT32, "b");
  GroupType structColumn = new GroupType(OPTIONAL, "my_struct", fieldA, fieldB);
  MessageType fileSchema = new MessageType("hive_schema", structColumn);

  Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
  TupleDomain<ColumnDescriptor> tupleDomain =
      getParquetTupleDomain(descriptorsByPath, domain, fileSchema, true);

  assertTrue(tupleDomain.isAll());
}
Example #9
Source File: TestParquetPredicateUtils.java From presto with Apache License 2.0 | 6 votes |
/**
 * A single-value domain on a primitive column must carry over to the Parquet tuple domain as
 * exactly one entry, keyed by a descriptor whose path is {@code ["my_primitive"]}.
 */
@Test
public void testParquetTupleDomainPrimitive() {
  HiveColumnHandle columnHandle = createBaseColumn(
      "my_primitive", 0, HiveType.valueOf("bigint"), BIGINT, REGULAR, Optional.empty());
  Domain singleValueDomain = Domain.singleValue(BIGINT, 123L);
  TupleDomain<HiveColumnHandle> domain =
      withColumnDomains(ImmutableMap.of(columnHandle, singleValueDomain));

  MessageType fileSchema =
      new MessageType("hive_schema", new PrimitiveType(OPTIONAL, INT64, "my_primitive"));
  Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);

  TupleDomain<ColumnDescriptor> tupleDomain =
      getParquetTupleDomain(descriptorsByPath, domain, fileSchema, true);

  assertEquals(tupleDomain.getDomains().get().size(), 1);

  ColumnDescriptor descriptor = tupleDomain.getDomains().get().keySet().iterator().next();
  String[] descriptorPath = descriptor.getPath();
  assertEquals(descriptorPath.length, 1);
  assertEquals(descriptor.getPath()[0], "my_primitive");

  Domain predicateDomain = Iterables.getOnlyElement(tupleDomain.getDomains().get().values());
  assertEquals(predicateDomain, singleValueDomain);
}
Example #10
Source File: TestColumnIO.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test public void testReadUsingSchemaWithRequiredFieldThatWasOptional(){ MessageType originalSchema = new MessageType("schema", new PrimitiveType(OPTIONAL, INT32, "e")); MemPageStore store = new MemPageStore(1); SimpleGroupFactory groupFactory = new SimpleGroupFactory(originalSchema); writeGroups(originalSchema, store, groupFactory.newGroup().append("e", 4)); try { MessageType schemaWithRequiredFieldThatWasOptional = new MessageType("schema", new PrimitiveType(REQUIRED, INT32, "e")); // Incompatible schema: required when it was optional readGroups(store, originalSchema, schemaWithRequiredFieldThatWasOptional, 1); fail("should have thrown an incompatible schema exception"); } catch (ParquetDecodingException e) { assertEquals("The requested schema is not compatible with the file schema. incompatible types: required int32 e != optional int32 e", e.getMessage()); } }
Example #11
Source File: HiveClientTest.java From garmadon with Apache License 2.0 | 6 votes |
/**
 * Creates the {@code fs} table from a one-column schema and checks the resulting table
 * description: location, table type, and the {@code day} and {@code app_id} column types.
 */
@Test
public void createTableWithoutIssue() throws SQLException {
  String table = "fs";
  String location = "file:" + hdfsTemp + "/garmadon_database/fs";

  PrimitiveType appId = new PrimitiveType(
      Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.BINARY, "app_id");
  MessageType schema = new MessageType("fs", appId);

  HiveClient hiveClient = new HiveClient(
      driverName,
      "jdbc:hive2://localhost:" + port,
      "garmadon",
      hdfsTemp + "/garmadon_database");
  hiveClient.createTableIfNotExist(table, schema, location);

  HashMap<String, String> description = getResultHashTableDesc(hiveClient, table);
  assertEquals(location, description.get("Location"));
  assertEquals("EXTERNAL_TABLE", description.get("Table Type").trim());
  assertEquals("string", description.get("day"));
  assertEquals("string", description.get("app_id"));
}
Example #12
Source File: HiveClientTest.java From garmadon with Apache License 2.0 | 6 votes |
/**
 * Checks the Hive type inferred for each supported Parquet primitive, including a repeated
 * BINARY column which is expected to map to {@code array<string>}.
 */
@Test
public void shouldProvideHiveTypeFromParquetType() throws Exception {
  HiveClient hiveClient = new HiveClient(
      driverName,
      "jdbc:hive2://localhost:" + port,
      "garmadon",
      hdfsTemp + "/garmadon_database");

  assertEquals("string", hiveClient.inferHiveType(
      new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.BINARY, "name")));

  assertEquals("array<string>", hiveClient.inferHiveType(
      new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.BINARY, "name")));

  assertEquals("int", hiveClient.inferHiveType(
      new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.INT32, "name")));

  assertEquals("bigint", hiveClient.inferHiveType(
      new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.INT64, "name")));

  assertEquals("float", hiveClient.inferHiveType(
      new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.FLOAT, "name")));

  assertEquals("double", hiveClient.inferHiveType(
      new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.DOUBLE, "name")));

  assertEquals("boolean", hiveClient.inferHiveType(
      new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.BOOLEAN, "name")));
}
Example #13
Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Evaluates the filter expression against one row group's column metadata.
 *
 * <p>Row groups with a non-positive row count short-circuit to {@code ROWS_CANNOT_MATCH}.
 * Otherwise the statistics, value counts and value converters are re-indexed by field id;
 * columns whose type has no id are skipped. The expression is then visited with this
 * instance as the evaluator.
 *
 * @param fileSchema schema used to resolve each column chunk's primitive type
 * @param rowGroup row-group metadata to test
 * @return the result of visiting the expression, or {@code ROWS_CANNOT_MATCH} for an empty
 *     row group
 */
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();

  for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
    PrimitiveType chunkType = fileSchema.getType(chunk.getPath().toArray()).asPrimitiveType();
    if (chunkType.getId() == null) {
      continue;
    }

    int fieldId = chunkType.getId().intValue();
    stats.put(fieldId, chunk.getStatistics());
    valueCounts.put(fieldId, chunk.getValueCount());
    conversions.put(fieldId, ParquetConversions.converterFromParquet(chunkType));
  }

  return ExpressionVisitors.visitEvaluator(expr, this);
}
Example #14
Source File: ParquetRecordWriter.java From Bats with Apache License 2.0 | 6 votes |
/**
 * Builds the Parquet primitive type used to write {@code field}.
 *
 * <p>For decimal minor types the physical type starts as {@code logicalTypeForDecimals}; when
 * {@code usePrimitiveTypesForDecimals} is set it is narrowed to INT32 or INT64 if the field's
 * precision fits the maximum precision of that type. The length of a decimal is the maximum
 * byte size for its precision.
 *
 * @param field the field being written
 * @return a primitive type carrying the field's name, repetition, original type and decimal
 *     metadata
 */
protected PrimitiveType getPrimitiveType(MaterializedField field) {
  MinorType minorType = field.getType().getMinorType();
  String name = field.getName();
  int length = ParquetTypeHelper.getLengthForMinorType(minorType);
  PrimitiveTypeName primitiveTypeName = ParquetTypeHelper.getPrimitiveTypeNameForMinorType(minorType);

  if (Types.isDecimalType(minorType)) {
    primitiveTypeName = logicalTypeForDecimals;
    boolean fitsInt32 = usePrimitiveTypesForDecimals
        && field.getPrecision() <= ParquetTypeHelper.getMaxPrecisionForPrimitiveType(PrimitiveTypeName.INT32);
    if (fitsInt32) {
      primitiveTypeName = PrimitiveTypeName.INT32;
    } else if (usePrimitiveTypesForDecimals
        && field.getPrecision() <= ParquetTypeHelper.getMaxPrecisionForPrimitiveType(PrimitiveTypeName.INT64)) {
      primitiveTypeName = PrimitiveTypeName.INT64;
    }
    length = DecimalUtility.getMaxBytesSizeForPrecision(field.getPrecision());
  }

  Repetition repetition = ParquetTypeHelper.getRepetitionForDataMode(field.getDataMode());
  OriginalType originalType = ParquetTypeHelper.getOriginalTypeForMinorType(minorType);
  DecimalMetadata decimalMetadata = ParquetTypeHelper.getDecimalMetadataForField(field);

  return new PrimitiveType(
      repetition, primitiveTypeName, length, name, originalType, decimalMetadata, null);
}
Example #15
Source File: FixedLenBytesColumnReader.java From flink with Apache License 2.0 | 5 votes |
/**
 * Creates a reader for a column whose type name is checked to be
 * {@code FIXED_LEN_BYTE_ARRAY}.
 *
 * @param descriptor descriptor of the column, forwarded to the superclass
 * @param pageReader page source, forwarded to the superclass
 * @param precision precision stored on this reader
 * @throws IOException declared by the constructor; presumably raised by the superclass
 *     constructor -- confirm
 */
public FixedLenBytesColumnReader(ColumnDescriptor descriptor, PageReader pageReader, int precision)
    throws IOException {
  super(descriptor, pageReader);
  checkTypeName(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY);
  this.precision = precision;
}
Example #16
Source File: MetadataUtils.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Dispatches to the group or primitive overload of {@code showDetails} according to the
 * runtime class of {@code type}; any other kind of type is silently ignored.
 *
 * @param out writer receiving the details
 * @param type type to describe
 * @param depth nesting depth, passed through unchanged
 * @param container enclosing message type, passed through unchanged
 * @param cpath column path accumulated so far, passed through unchanged
 */
private static void showDetails(
    PrettyPrintWriter out, Type type, int depth, MessageType container, List<String> cpath) {
  if (type instanceof GroupType) {
    showDetails(out, type.asGroupType(), depth, container, cpath);
  } else if (type instanceof PrimitiveType) {
    showDetails(out, type.asPrimitiveType(), depth, container, cpath);
  }
}
Example #17
Source File: ParquetColumnMetadata.java From Bats with Apache License 2.0 | 5 votes |
/**
 * Returns the data type length for this column.
 *
 * <p>Both the column descriptor and the schema element are consulted: per the original
 * documentation, the descriptor carries the max repetition level (indicating an array type)
 * while the length of a fixed-width field is stored at the schema level.
 *
 * <p>NOTE(review): the {@code * 8} on the schema type length and the helper name
 * {@code getTypeLengthInBits} suggest the result is expressed in bits -- confirm.
 *
 * @return {@code UNDEFINED_LENGTH} when the column is not fixed length or is repeated;
 *     otherwise the length derived from the schema element (for
 *     {@code FIXED_LEN_BYTE_ARRAY}) or from the column type
 */
public int getDataTypeLength() {
  if (!isFixedLength() || isRepeated()) {
    return UNDEFINED_LENGTH;
  }
  if (column.getType() == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    return se.getType_length() * 8;
  }
  return getTypeLengthInBits(column.getType());
}
Example #18
Source File: TupleConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Starts a new record by allocating a fresh tuple of {@code schemaSize} fields.
 *
 * <p>When {@code elephantBirdCompatible} is set, every optional primitive field of type INT32,
 * INT64, FLOAT, DOUBLE or BOOLEAN is pre-populated with the matching zero constant (BOOLEAN
 * uses {@code I32_ZERO}, like INT32). Other fields are left as allocated.
 *
 * @throws RuntimeException wrapping any {@code ExecException} raised while setting a field
 */
@Override
public final void start() {
  currentTuple = TF.newTuple(schemaSize);
  if (!elephantBirdCompatible) {
    return;
  }

  try {
    int position = 0;
    for (Type field : parquetSchema.getFields()) {
      if (field.isPrimitive() && field.isRepetition(Repetition.OPTIONAL)) {
        PrimitiveType primitiveType = field.asPrimitiveType();
        switch (primitiveType.getPrimitiveTypeName()) {
          case INT32:
          case BOOLEAN:
            currentTuple.set(position, I32_ZERO);
            break;
          case INT64:
            currentTuple.set(position, I64_ZERO);
            break;
          case FLOAT:
            currentTuple.set(position, FLOAT_ZERO);
            break;
          case DOUBLE:
            currentTuple.set(position, DOUBLE_ZERO);
            break;
          default:
            // No default value is assigned for the remaining primitive types.
            break;
        }
      }
      position++;
    }
  } catch (ExecException e) {
    throw new RuntimeException(e);
  }
}
Example #19
Source File: FixedBinaryTestUtils.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Converts {@code bigInt} to a {@code Binary} suited to the given primitive type.
 *
 * <ul>
 *   <li>{@code FIXED_LEN_BYTE_ARRAY}: fixed binary of the type's declared length</li>
 *   <li>{@code INT96}: fixed binary of 12 bytes</li>
 *   <li>{@code BINARY}: the bytes of {@code bigInt.toByteArray()} as-is</li>
 * </ul>
 *
 * @param type target primitive type
 * @param bigInt value to encode
 * @return the binary representation
 * @throws IllegalArgumentException if the type is none of the three above
 */
public static Binary getFixedBinary(PrimitiveType type, BigInteger bigInt) {
  final Binary encoded;
  switch (type.getPrimitiveTypeName()) {
    case FIXED_LEN_BYTE_ARRAY:
      encoded = getFixedBinary(type.getTypeLength(), bigInt);
      break;
    case INT96:
      encoded = getFixedBinary(12, bigInt);
      break;
    case BINARY:
      encoded = Binary.fromConstantByteArray(bigInt.toByteArray());
      break;
    default:
      throw new IllegalArgumentException("Type " + type + " cannot be represented by a Binary");
  }
  return encoded;
}
Example #20
Source File: TestTupleDomainParquetPredicate.java From presto with Apache License 2.0 | 5 votes |
/**
 * A varchar predicate on the value {@code "Test"} must match statistics whose min and max are
 * both that value and which report one null.
 */
@Test
public void testVarcharMatchesWithStatistics() throws ParquetCorruptionException {
  String value = "Test";

  ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] {"path"}, BINARY, 0, 0);
  PrimitiveType columnType = new PrimitiveType(OPTIONAL, BINARY, "Test column");
  RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, columnType);

  TupleDomain<ColumnDescriptor> effectivePredicate =
      getEffectivePredicate(column, createVarcharType(255), utf8Slice(value));
  TupleDomainParquetPredicate parquetPredicate =
      new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));

  Statistics<?> stats = getStatsBasedOnType(column.getPrimitiveType().getPrimitiveTypeName());
  stats.setNumNulls(1L);
  byte[] minBytes = value.getBytes(UTF_8);
  byte[] maxBytes = value.getBytes(UTF_8);
  stats.setMinMaxFromBytes(minBytes, maxBytes);

  assertTrue(parquetPredicate.matches(2, ImmutableMap.of(column, stats), ID, true));
}
Example #21
Source File: TestBinaryTruncator.java From parquet-mr with Apache License 2.0 | 5 votes |
private void testTruncator(PrimitiveType type, boolean strict) { BinaryTruncator truncator = BinaryTruncator.getTruncator(type); Comparator<Binary> comparator = type.comparator(); checkContract(truncator, comparator, Binary.fromString("aaaaaaaaaa"), strict, strict); checkContract(truncator, comparator, Binary.fromString("árvÃztűrÅ‘ tükörfúrógép"), strict, strict); checkContract(truncator, comparator, Binary.fromString("aaaaaaaaaa" + UTF8_3BYTES_MAX_CHAR), strict, strict); checkContract(truncator, comparator, Binary.fromString("a" + UTF8_3BYTES_MAX_CHAR + UTF8_1BYTE_MAX_CHAR), strict, strict); checkContract(truncator, comparator, Binary.fromConstantByteArray(new byte[] { (byte) 0xFE, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, (byte) 0xFF }), strict, strict); // Edge case: zero length -> unable to truncate checkContract(truncator, comparator, Binary.fromString(""), false, false); // Edge case: containing only UTF-8 max characters -> unable to truncate for max checkContract(truncator, comparator, Binary.fromString( UTF8_1BYTE_MAX_CHAR + UTF8_4BYTES_MAX_CHAR + UTF8_3BYTES_MAX_CHAR + UTF8_4BYTES_MAX_CHAR + UTF8_2BYTES_MAX_CHAR + UTF8_3BYTES_MAX_CHAR + UTF8_3BYTES_MAX_CHAR + UTF8_1BYTE_MAX_CHAR + UTF8_2BYTES_MAX_CHAR + UTF8_3BYTES_MAX_CHAR + UTF8_4BYTES_MAX_CHAR), strict, false); // Edge case: non-UTF-8; max bytes -> unable to truncate for max checkContract( truncator, comparator, binary(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF), strict, false); }
Example #22
Source File: TestTupleDomainParquetPredicate.java From presto with Apache License 2.0 | 5 votes |
/**
 * A varchar predicate on the empty slice must match a dictionary descriptor built from a
 * one-entry {@code PLAIN_DICTIONARY} page backed by four zero bytes.
 */
@Test
public void testVarcharMatchesWithDictionaryDescriptor() {
  ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] {"path"}, BINARY, 0, 0);
  PrimitiveType columnType = new PrimitiveType(OPTIONAL, BINARY, "Test column");
  RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, columnType);

  TupleDomain<ColumnDescriptor> effectivePredicate =
      getEffectivePredicate(column, createVarcharType(255), EMPTY_SLICE);
  TupleDomainParquetPredicate parquetPredicate =
      new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));

  DictionaryPage page =
      new DictionaryPage(Slices.wrappedBuffer(new byte[] {0, 0, 0, 0}), 1, PLAIN_DICTIONARY);
  DictionaryDescriptor dictionary = new DictionaryDescriptor(column, Optional.of(page));

  assertTrue(parquetPredicate.matches(dictionary));
}
Example #23
Source File: HiveClientTest.java From garmadon with Apache License 2.0 | 5 votes |
/**
 * Inferring a Hive type for an INT96 column is expected to throw.
 */
@Test(expected = Exception.class)
public void shouldThrowExceptionForUnknownParquetType() throws Exception {
  HiveClient hiveClient = new HiveClient(
      driverName,
      "jdbc:hive2://localhost:" + port,
      "garmadon",
      hdfsTemp + "/garmadon_database");

  PrimitiveType unsupported = new PrimitiveType(
      Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.INT96, "unsupported");

  hiveClient.inferHiveType(unsupported);
}
Example #24
Source File: ParquetUtil.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Tells whether a Parquet primitive should be treated as an int.
 *
 * <p>When the type has an original-type annotation, only that annotation decides:
 * {@code INT_8}, {@code INT_16}, {@code INT_32} and {@code DATE} count as int, anything else
 * does not. Without an annotation the physical type must be {@code INT32}.
 *
 * @param primitiveType type to inspect
 * @return {@code true} if the type is considered an int type
 */
public static boolean isIntType(PrimitiveType primitiveType) {
  if (primitiveType.getOriginalType() == null) {
    return primitiveType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT32;
  }

  switch (primitiveType.getOriginalType()) {
    case INT_8:
    case INT_16:
    case INT_32:
    case DATE:
      return true;
    default:
      return false;
  }
}
Example #25
Source File: TestMetadataReader.java From presto with Apache License 2.0 | 5 votes |
@Test(dataProvider = "allCreatedBy") public void testReadNullStats(Optional<String> fileCreatedBy) { // integer assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, INT32, "Test column"))) .isInstanceOfSatisfying( IntStatistics.class, columnStatistics -> assertTrue(columnStatistics.isEmpty())); // bigint assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, INT64, "Test column"))) .isInstanceOfSatisfying( LongStatistics.class, columnStatistics -> assertTrue(columnStatistics.isEmpty())); // varchar assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, BINARY, "Test column", OriginalType.UTF8))) .isInstanceOfSatisfying( BinaryStatistics.class, columnStatistics -> assertTrue(columnStatistics.isEmpty())); // varbinary assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, BINARY, "Test column"))) .isInstanceOfSatisfying( BinaryStatistics.class, columnStatistics -> assertTrue(columnStatistics.isEmpty())); }
Example #26
Source File: FloatColumnIndexBuilder.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Builds the float column index from the collected min and max values.
 *
 * @param type primitive type passed to the new index
 * @return the populated index, or {@code null} when this builder is flagged {@code invalid}
 */
@Override
ColumnIndexBase<Float> createColumnIndex(PrimitiveType type) {
  if (!invalid) {
    FloatColumnIndex index = new FloatColumnIndex(type);
    index.minValues = minValues.toFloatArray();
    index.maxValues = maxValues.toFloatArray();
    return index;
  }
  return null;
}
Example #27
Source File: ThriftSchemaConvertVisitor.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Converts a primitive field, honouring the field projection filter.
 *
 * <p>A builder is created for {@code type} with the state's repetition, and annotated with
 * {@code orig} when one is given. The field is then kept (built under the state's name) or
 * dropped depending on whether the filter keeps the state's path.
 *
 * @param type physical type of the field
 * @param orig logical type annotation, may be {@code null}
 * @param state current conversion state (path, name, repetition)
 * @return a {@code Keep} holding the built type, or a {@code Drop} for a filtered-out path
 */
private ConvertedField visitPrimitiveType(PrimitiveTypeName type, LogicalTypeAnnotation orig, State state) {
  PrimitiveBuilder<PrimitiveType> builder = primitive(type, state.repetition);
  if (orig != null) {
    builder = builder.as(orig);
  }

  if (!fieldProjectionFilter.keep(state.path)) {
    return new Drop(state.path);
  }
  return new Keep(state.path, builder.named(state.name));
}
Example #28
Source File: DataWritableReadSupport.java From parquet-mr with Apache License 2.0 | 5 votes |
/** * * It creates the readContext for Parquet side with the requested schema during the init phase. * * @param configuration needed to get the wanted columns * @param keyValueMetaData // unused * @param fileSchema parquet file schema * @return the parquet ReadContext */ @Override public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration, final Map<String, String> keyValueMetaData, final MessageType fileSchema) { final String columns = configuration.get(IOConstants.COLUMNS); final Map<String, String> contextMetadata = new HashMap<String, String>(); if (columns != null) { final List<String> listColumns = getColumns(columns); final List<Type> typeListTable = new ArrayList<Type>(); for (final String col : listColumns) { // listColumns contains partition columns which are metadata only if (fileSchema.containsField(col)) { typeListTable.add(fileSchema.getType(col)); } else { // below allows schema evolution typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col)); } } MessageType tableSchema = new MessageType(TABLE_SCHEMA, typeListTable); contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString()); MessageType requestedSchemaByUser = tableSchema; final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration); final List<Type> typeListWanted = new ArrayList<Type>(); for (final Integer idx : indexColumnsWanted) { typeListWanted.add(tableSchema.getType(listColumns.get(idx))); } requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(), typeListWanted), fileSchema, configuration); return new ReadContext(requestedSchemaByUser, contextMetadata); } else { contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString()); return new ReadContext(fileSchema, contextMetadata); } }
Example #29
Source File: Metadata_V3.java From Bats with Apache License 2.0 | 5 votes |
/**
 * Creates column metadata from its individual parts; each argument is stored as-is in the
 * field of the same name.
 *
 * @param name column name
 * @param primitiveType Parquet primitive type of the column
 * @param minValue value stored in {@code minValue}
 * @param maxValue value stored in {@code maxValue}
 * @param nulls value stored in {@code nulls}
 */
public ColumnMetadata_v3(
    String[] name,
    PrimitiveType.PrimitiveTypeName primitiveType,
    Object minValue,
    Object maxValue,
    Long nulls) {
  this.name = name;
  this.primitiveType = primitiveType;
  this.minValue = minValue;
  this.maxValue = maxValue;
  this.nulls = nulls;
}
Example #30
Source File: ColumnIOFactory.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Visits a primitive type from the file schema and registers a leaf column IO for it.
 *
 * <p>The requested type is reported through {@code incompatibleSchema} when it is not
 * primitive, or -- under strict type checking -- when its primitive type name differs from
 * the file's. A new {@code PrimitiveColumnIO} is then added to the current group and to the
 * list of leaves.
 *
 * @param primitiveType the primitive type being visited
 */
@Override
public void visit(PrimitiveType primitiveType) {
  boolean incompatible = !currentRequestedType.isPrimitive();
  // Only inspect the primitive type name once the requested type is known to be primitive.
  if (!incompatible && this.strictTypeChecking) {
    incompatible = currentRequestedType.asPrimitiveType().getPrimitiveTypeName()
        != primitiveType.getPrimitiveTypeName();
  }
  if (incompatible) {
    incompatibleSchema(primitiveType, currentRequestedType);
  }

  PrimitiveColumnIO leaf =
      new PrimitiveColumnIO(primitiveType, current, currentRequestedIndex, leaves.size());
  current.add(leaf);
  leaves.add(leaf);
}