org.apache.iceberg.Schema Java Examples
The following examples show how to use org.apache.iceberg.Schema.
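Before the examples, here is a minimal, self-contained sketch of constructing and inspecting a Schema; the class name, field IDs, names, and types are illustrative and not taken from any example below.

import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

public class SchemaSketch {
  public static void main(String[] args) {
    // Each field carries a unique ID, a name, a type, and a required/optional flag.
    Schema schema = new Schema(
        required(1, "id", Types.LongType.get()),
        optional(2, "data", Types.StringType.get()));

    // Columns are exposed as Types.NestedField instances.
    for (Types.NestedField field : schema.columns()) {
      System.out.println(field.fieldId() + ": " + field.name() + " -> " + field.type());
    }
  }
}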
Example #1
Source File: RowDataReader.java From iceberg with Apache License 2.0 | 7 votes |
private CloseableIterable<InternalRow> newAvroIterable(
    InputFile location,
    FileScanTask task,
    Schema projection,
    Map<Integer, ?> idToConstant) {
  Avro.ReadBuilder builder = Avro.read(location)
      .reuseContainers()
      .project(projection)
      .split(task.start(), task.length())
      .createReaderFunc(readSchema -> new SparkAvroReader(projection, readSchema, idToConstant));

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
Example #2
Source File: ParquetSchemaUtil.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Prunes columns from a Parquet file schema that was written without field ids.
 * <p>
 * Files that were written without field ids are read assuming that schema evolution preserved
 * column order. Deleting columns was not allowed.
 * <p>
 * The order of columns in the resulting Parquet schema matches the Parquet file.
 *
 * @param fileSchema schema from a Parquet file that does not have field ids.
 * @param expectedSchema expected schema
 * @return a parquet schema pruned using the expected schema
 */
public static MessageType pruneColumnsFallback(MessageType fileSchema, Schema expectedSchema) {
  Set<Integer> selectedIds = Sets.newHashSet();

  for (Types.NestedField field : expectedSchema.columns()) {
    selectedIds.add(field.fieldId());
  }

  MessageTypeBuilder builder = org.apache.parquet.schema.Types.buildMessage();

  int ordinal = 1;
  for (Type type : fileSchema.getFields()) {
    if (selectedIds.contains(ordinal)) {
      builder.addField(type.withId(ordinal));
    }
    ordinal += 1;
  }

  return builder.named(fileSchema.getName());
}
Example #3
Source File: ORCSchemaUtil.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Convert an ORC schema to an Iceberg schema. This method handles the conversion from the
 * original Iceberg column mapping IDs if present in the ORC column attributes; otherwise, ORC
 * column IDs will be assigned following ORC's pre-order ID assignment.
 *
 * @param orcSchema the ORC schema to convert
 * @return the Iceberg schema
 */
public static Schema convert(TypeDescription orcSchema) {
  List<TypeDescription> children = orcSchema.getChildren();
  List<String> childrenNames = orcSchema.getFieldNames();
  Preconditions.checkState(children.size() == childrenNames.size(),
      "Error in ORC file, children fields and names do not match.");

  List<Types.NestedField> icebergFields = Lists.newArrayListWithExpectedSize(children.size());
  AtomicInteger lastColumnId = new AtomicInteger(getMaxIcebergId(orcSchema));
  for (int i = 0; i < children.size(); i++) {
    icebergFields.add(convertOrcToIceberg(children.get(i), childrenNames.get(i),
        lastColumnId::incrementAndGet));
  }

  return new Schema(icebergFields);
}
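A small usage sketch of this conversion, assuming an ORC schema parsed from a type string (the struct layout and class name are hypothetical):

import org.apache.iceberg.Schema;
import org.apache.iceberg.orc.ORCSchemaUtil;
import org.apache.orc.TypeDescription;

public class OrcConvertSketch {
  public static void main(String[] args) {
    // No Iceberg ID attributes here, so IDs are assigned in ORC pre-order.
    TypeDescription orcSchema = TypeDescription.fromString("struct<id:bigint,data:string>");
    Schema icebergSchema = ORCSchemaUtil.convert(orcSchema);
    System.out.println(icebergSchema);
  }
}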
Example #4
Source File: TestIcebergSerDe.java From dremio-oss with Apache License 2.0 | 6 votes |
@Before
public void setUp() {
  schema = new Schema(
      required(0, "id", Types.LongType.get()),
      required(1, "data", Types.StringType.get()),
      required(2, "b", Types.BooleanType.get()),
      required(3, "i", Types.IntegerType.get()),
      required(4, "l", Types.LongType.get()),
      required(5, "f", Types.FloatType.get()),
      required(6, "d", Types.DoubleType.get()),
      required(7, "date", Types.DateType.get()),
      required(8, "ts", Types.TimestampType.withZone()),
      required(9, "s", Types.StringType.get()),
      required(10, "bytes", Types.BinaryType.get()),
      required(11, "dec_9_0", Types.DecimalType.of(9, 0)),
      required(12, "dec_11_2", Types.DecimalType.of(11, 2)),
      required(13, "dec_38_10", Types.DecimalType.of(38, 10)));
}
Example #5
Source File: IcebergStorage.java From iceberg with Apache License 2.0 | 6 votes |
@Override
public List<String> getPredicateFields(String location, Job job) throws IOException {
  LOG.info("[{}]: getPredicateFields() -> {}", signature, location);

  Schema schema = load(location, job).schema();

  List<String> result = Lists.newArrayList();

  for (Types.NestedField nf : schema.columns()) {
    switch (nf.type().typeId()) {
      case MAP:
      case LIST:
      case STRUCT:
        continue;
      default:
        result.add(nf.name());
    }
  }

  return result;
}
Example #6
Source File: RandomData.java From iceberg with Apache License 2.0 | 6 votes |
private static Iterable<Record> newIterable(Supplier<RandomDataGenerator> newGenerator,
                                            Schema schema, int numRecords) {
  return () -> new Iterator<Record>() {
    private int count = 0;
    private RandomDataGenerator generator = newGenerator.get();

    @Override
    public boolean hasNext() {
      return count < numRecords;
    }

    @Override
    public Record next() {
      if (count >= numRecords) {
        throw new NoSuchElementException();
      }
      count += 1;
      return (Record) TypeUtil.visit(schema, generator);
    }
  };
}
Example #7
Source File: TestTruncatesProjection.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testIntegerInclusiveLowerBound() {
  Integer value = 100;
  Schema schema = new Schema(optional(1, "value", Types.IntegerType.get()));
  PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("value", 10).build();

  assertProjectionInclusive(spec, lessThan("value", value), Expression.Operation.LT_EQ, "90");
  assertProjectionInclusive(spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "100");
  assertProjectionInclusive(spec, greaterThan("value", value), Expression.Operation.GT_EQ, "100");
  assertProjectionInclusive(spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "100");
  assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "100");
  assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE);

  assertProjectionInclusive(spec, in("value", value - 1, value, value + 1),
      Expression.Operation.IN, "[90, 100, 100]");
  assertProjectionInclusiveValue(spec, notIn("value", value, value + 1), Expression.Operation.TRUE);
}
Example #8
Source File: TestBuildOrcProjection.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testProjectionNestedNoOp() {
  Types.StructType nestedStructType = Types.StructType.of(
      optional(2, "b", Types.StringType.get()),
      optional(3, "c", Types.DateType.get())
  );
  Schema originalSchema = new Schema(
      optional(1, "a", nestedStructType)
  );

  // Original mapping (stored in ORC)
  TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema);

  TypeDescription newOrcSchema = ORCSchemaUtil.buildOrcProjection(originalSchema, orcSchema);
  assertEquals(1, newOrcSchema.getChildren().size());
  assertEquals(TypeDescription.Category.STRUCT, newOrcSchema.findSubtype("a").getCategory());
  TypeDescription nestedCol = newOrcSchema.findSubtype("a");
  assertEquals(2, nestedCol.findSubtype("b").getId());
  assertEquals(TypeDescription.Category.STRING, nestedCol.findSubtype("b").getCategory());
  assertEquals(3, nestedCol.findSubtype("c").getId());
  assertEquals(TypeDescription.Category.DATE, nestedCol.findSubtype("c").getCategory());
}
Example #9
Source File: TestReadProjection.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testReorderedProjection() throws Exception {
  Schema schema = new Schema(
      Types.NestedField.required(0, "id", Types.LongType.get()),
      Types.NestedField.optional(1, "data", Types.StringType.get())
  );

  Record record = GenericRecord.create(schema.asStruct());
  record.setField("id", 34L);
  record.setField("data", "test");

  Schema reordered = new Schema(
      Types.NestedField.optional(2, "missing_1", Types.StringType.get()),
      Types.NestedField.optional(1, "data", Types.StringType.get()),
      Types.NestedField.optional(3, "missing_2", Types.LongType.get())
  );

  Record projected = writeAndRead("full_projection", schema, reordered, record);

  Assert.assertNull("Should contain the correct 0 value", projected.get(0));
  Assert.assertEquals("Should contain the correct 1 value", "test", projected.get(1).toString());
  Assert.assertNull("Should contain the correct 2 value", projected.get(2));
}
Example #10
Source File: TestORCSchemaUtil.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testRoundtripConversionPrimitive() {
  Schema expectedSchema = new Schema(
      optional(1, "intCol", Types.IntegerType.get()),
      optional(3, "longCol", Types.LongType.get()),
      optional(6, "intCol2", Types.IntegerType.get()),
      optional(20, "intCol3", Types.IntegerType.get()),
      required(9, "doubleCol", Types.DoubleType.get()),
      required(10, "uuidCol", Types.UUIDType.get()),
      optional(2, "booleanCol", Types.BooleanType.get()),
      optional(21, "fixedCol", Types.FixedType.ofLength(4096)),
      required(22, "binaryCol", Types.BinaryType.get()),
      required(23, "stringCol", Types.StringType.get()),
      required(24, "decimalCol", Types.DecimalType.of(15, 3)),
      required(25, "floatCol", Types.FloatType.get()),
      optional(30, "dateCol", Types.DateType.get()),
      required(32, "timeCol", Types.TimeType.get()),
      required(34, "timestampCol", Types.TimestampType.withZone()));
  TypeDescription orcSchema = ORCSchemaUtil.convert(expectedSchema);
  assertEquals(expectedSchema.asStruct(), ORCSchemaUtil.convert(orcSchema).asStruct());
}
Example #11
Source File: TestBucketingProjection.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testBucketIntegerStrict() {
  Integer value = 100;
  Schema schema = new Schema(optional(1, "value", Types.IntegerType.get()));
  PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("value", 10).build();

  // the bucket number of the value (i.e. 100) is 6
  assertProjectionStrict(spec, notEqual("value", value), Expression.Operation.NOT_EQ, "6");
  assertProjectionStrictValue(spec, equal("value", value), Expression.Operation.FALSE);
  assertProjectionStrictValue(spec, lessThan("value", value), Expression.Operation.FALSE);
  assertProjectionStrictValue(spec, lessThanOrEqual("value", value), Expression.Operation.FALSE);
  assertProjectionStrictValue(spec, greaterThan("value", value), Expression.Operation.FALSE);
  assertProjectionStrictValue(spec, greaterThanOrEqual("value", value), Expression.Operation.FALSE);

  assertProjectionStrict(spec, notIn("value", value - 1, value, value + 1),
      Expression.Operation.NOT_IN, "[6, 7, 8]");
  assertProjectionStrictValue(spec, in("value", value, value + 1), Expression.Operation.FALSE);
}
Example #12
Source File: Writer.java From iceberg with Apache License 2.0 | 6 votes |
Writer(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
       DataSourceOptions options, boolean replacePartitions, String applicationId, String wapId,
       Schema writeSchema, StructType dsSchema) {
  this.table = table;
  this.format = getFileFormat(table.properties(), options);
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.replacePartitions = replacePartitions;
  this.applicationId = applicationId;
  this.wapId = wapId;
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;

  long tableTargetFileSize = PropertyUtil.propertyAsLong(
      table.properties(), WRITE_TARGET_FILE_SIZE_BYTES, WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetFileSize = options.getLong("target-file-size-bytes", tableTargetFileSize);
}
Example #13
Source File: HadoopTables.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Create a table using the FileSystem implementation resolved from location.
 *
 * @param schema iceberg schema used to create the table
 * @param spec partitioning spec, if null the table will be unpartitioned
 * @param properties a string map of table properties, initialized to empty if null
 * @param location a path URI (e.g. hdfs:///warehouse/my_table)
 * @return newly created table implementation
 */
@Override
public Table create(Schema schema, PartitionSpec spec, Map<String, String> properties,
                    String location) {
  Preconditions.checkNotNull(schema, "A table schema is required");

  TableOperations ops = newTableOps(location);
  if (ops.current() != null) {
    throw new AlreadyExistsException("Table already exists at location: " + location);
  }

  Map<String, String> tableProps = properties == null ? ImmutableMap.of() : properties;
  PartitionSpec partitionSpec = spec == null ? PartitionSpec.unpartitioned() : spec;
  TableMetadata metadata = TableMetadata.newTableMetadata(schema, partitionSpec, location, tableProps);
  ops.commit(null, metadata);
  return new BaseTable(ops, location);
}
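For context, a minimal caller-side sketch of this create API, assuming a local warehouse path (the path, class name, and schema are illustrative); Example #21 below creates a benchmark table through the same call.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

import static org.apache.iceberg.types.Types.NestedField.required;

public class HadoopTablesSketch {
  public static void main(String[] args) {
    Schema schema = new Schema(required(1, "id", Types.LongType.get()));

    // null spec -> unpartitioned, null properties -> empty, per the javadoc above.
    HadoopTables tables = new HadoopTables(new Configuration());
    Table table = tables.create(schema, null, null, "file:///tmp/warehouse/my_table");
    System.out.println(table.location());
  }
}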
Example #14
Source File: TestTruncatesProjection.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testStringInclusive() {
  String value = "abcdefg";
  Schema schema = new Schema(optional(1, "value", Types.StringType.get()));
  PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("value", 5).build();

  assertProjectionInclusive(spec, lessThan("value", value), Expression.Operation.LT_EQ, "abcde");
  assertProjectionInclusive(spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "abcde");
  assertProjectionInclusive(spec, greaterThan("value", value), Expression.Operation.GT_EQ, "abcde");
  assertProjectionInclusive(spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "abcde");
  assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "abcde");
  assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE);

  assertProjectionInclusive(spec, in("value", value, value + "abc"),
      Expression.Operation.IN, "[abcde, abcde]");
  assertProjectionInclusiveValue(spec, notIn("value", value, value + "abc"), Expression.Operation.TRUE);
}
Example #15
Source File: TestTruncatesProjection.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testBinaryInclusive() throws Exception {
  ByteBuffer value = ByteBuffer.wrap("abcdefg".getBytes("UTF-8"));
  Schema schema = new Schema(optional(1, "value", Types.BinaryType.get()));
  PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("value", 5).build();
  String expectedValue = TransformUtil.base64encode(ByteBuffer.wrap("abcde".getBytes("UTF-8")));

  assertProjectionInclusive(spec, lessThan("value", value), Expression.Operation.LT_EQ, expectedValue);
  assertProjectionInclusive(spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, expectedValue);
  assertProjectionInclusive(spec, greaterThan("value", value), Expression.Operation.GT_EQ, expectedValue);
  assertProjectionInclusive(spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, expectedValue);
  assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, expectedValue);
  assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE);

  ByteBuffer anotherValue = ByteBuffer.wrap("abcdehij".getBytes("UTF-8"));
  assertProjectionInclusive(spec, in("value", value, anotherValue),
      Expression.Operation.IN, String.format("[%s, %s]", expectedValue, expectedValue));
  assertProjectionInclusiveValue(spec, notIn("value", value, anotherValue), Expression.Operation.TRUE);
}
Example #16
Source File: IcebergSource.java From iceberg with Apache License 2.0 | 6 votes |
@Override
public StreamWriter createStreamWriter(String runId, StructType dsStruct,
                                       OutputMode mode, DataSourceOptions options) {
  Preconditions.checkArgument(
      mode == OutputMode.Append() || mode == OutputMode.Complete(),
      "Output mode %s is not supported", mode);

  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);

  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());

  // Spark 2.4.x passes runId to createStreamWriter instead of real queryId,
  // so we fetch it directly from sparkContext to make writes idempotent
  String queryId = lazySparkSession().sparkContext().getLocalProperty(StreamExecution.QUERY_ID_KEY());
  String appId = lazySparkSession().sparkContext().applicationId();

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new StreamingWriter(table, io, encryptionManager, options, queryId, mode, appId, writeSchema, dsStruct);
}
Example #17
Source File: TestResiduals.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testIn() {
  Schema schema = new Schema(
      Types.NestedField.optional(50, "dateint", Types.IntegerType.get()),
      Types.NestedField.optional(51, "hour", Types.IntegerType.get())
  );

  PartitionSpec spec = PartitionSpec.builderFor(schema)
      .identity("dateint")
      .build();

  ResidualEvaluator resEval = ResidualEvaluator.of(spec,
      in("dateint", 20170815, 20170816, 20170817), true);

  Expression residual = resEval.residualFor(Row.of(20170815));
  Assert.assertEquals("Residual should be alwaysTrue", alwaysTrue(), residual);

  residual = resEval.residualFor(Row.of(20180815));
  Assert.assertEquals("Residual should be alwaysFalse", alwaysFalse(), residual);
}
Example #18
Source File: TestParquetReadProjection.java From iceberg with Apache License 2.0 | 6 votes |
@Override
protected GenericData.Record writeAndRead(String desc, Schema writeSchema, Schema readSchema,
                                          GenericData.Record record) throws IOException {
  File file = temp.newFile(desc + ".parquet");
  file.delete();

  try (FileAppender<GenericData.Record> appender = Parquet.write(Files.localOutput(file))
      .schema(writeSchema)
      .build()) {
    appender.add(record);
  }

  Iterable<GenericData.Record> records = Parquet.read(Files.localInput(file))
      .project(readSchema)
      .callInit()
      .build();

  return Iterables.getOnlyElement(records);
}
Example #19
Source File: TestTruncatesProjection.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testDecimalInclusiveUpperBound() {
  Types.DecimalType type = Types.DecimalType.of(9, 2);
  BigDecimal value = (BigDecimal) Literal.of("99.99").to(type).value();
  Schema schema = new Schema(optional(1, "value", type));
  PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("value", 10).build();

  assertProjectionInclusive(spec, lessThan("value", value), Expression.Operation.LT_EQ, "99.90");
  assertProjectionInclusive(spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "99.90");
  assertProjectionInclusive(spec, greaterThan("value", value), Expression.Operation.GT_EQ, "100.00");
  assertProjectionInclusive(spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "99.90");
  assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "99.90");
  assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE);

  BigDecimal delta = new BigDecimal(1);
  assertProjectionInclusive(spec, in("value", value.add(delta), value, value.subtract(delta)),
      Expression.Operation.IN, "[98.90, 99.90, 100.90]");
  assertProjectionInclusiveValue(spec, notIn("value", value, value.subtract(delta)),
      Expression.Operation.TRUE);
}
Example #20
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testProjection() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Schema projectedSchema = TypeUtil.select(SCHEMA, ImmutableSet.of(1));
  Table table = tables.create(SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> inputRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, inputRecords);
  table.newAppend()
      .appendFile(dataFile)
      .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .readFrom(location.toString())
      .project(projectedSchema);
  List<Record> outputRecords = readRecords(job.getConfiguration());
  Assert.assertEquals(inputRecords.size(), outputRecords.size());
  Assert.assertEquals(projectedSchema.asStruct(), outputRecords.get(0).struct());
}
Example #21
Source File: VectorizedReadFlatParquetDataBenchmark.java From iceberg with Apache License 2.0 | 6 votes |
@Override
protected Table initTable() {
  Schema schema = new Schema(
      optional(1, "longCol", Types.LongType.get()),
      optional(2, "intCol", Types.IntegerType.get()),
      optional(3, "floatCol", Types.FloatType.get()),
      optional(4, "doubleCol", Types.DoubleType.get()),
      optional(5, "decimalCol", Types.DecimalType.of(20, 5)),
      optional(6, "dateCol", Types.DateType.get()),
      optional(7, "timestampCol", Types.TimestampType.withZone()),
      optional(8, "stringCol", Types.StringType.get()));
  PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
  HadoopTables tables = new HadoopTables(hadoopConf());
  Map<String, String> properties = parquetWriteProps();
  return tables.create(schema, partitionSpec, properties, newTableLocation());
}
Example #22
Source File: TestAvroReadProjection.java From iceberg with Apache License 2.0 | 6 votes |
@Override
protected GenericData.Record writeAndRead(String desc, Schema writeSchema, Schema readSchema,
                                          GenericData.Record record) throws IOException {
  File file = temp.newFile(desc + ".avro");
  file.delete();

  try (FileAppender<GenericData.Record> appender = Avro.write(Files.localOutput(file))
      .schema(writeSchema)
      .build()) {
    appender.add(record);
  }

  Iterable<GenericData.Record> records = Avro.read(Files.localInput(file))
      .project(readSchema)
      .build();

  return Iterables.getOnlyElement(records);
}
Example #23
Source File: TestReadabilityChecks.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testCheckNullabilityRequiredSchemaField() {
  Schema write = new Schema(optional(1, "from_field", Types.IntegerType.get()));
  Schema read = new Schema(required(1, "to_field", Types.IntegerType.get()));

  List<String> errors = CheckCompatibility.typeCompatibilityErrors(read, write);
  Assert.assertEquals("Should produce no error messages", 0, errors.size());
}
Example #24
Source File: SchemaUtilTest.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testLongInBag() throws IOException {
  Schema icebergSchema = new Schema(
      optional(1, "nested_list",
          MapType.ofOptional(2, 3,
              StringType.get(),
              ListType.ofRequired(5, LongType.get()))));
  SchemaUtil.convert(icebergSchema);
}
Example #25
Source File: TestGenericRecord.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testGetNullValue() {
  Types.LongType type = Types.LongType.get();
  Schema schema = new Schema(optional(1, "id", type));
  GenericRecord record = GenericRecord.create(schema);
  record.set(0, null);
  Assert.assertNull(record.get(0, type.typeId().javaClass()));
}
Example #26
Source File: TestReadabilityChecks.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testIncompatibleMapAndPrimitive() {
  Schema write = new Schema(required(0, "map_field", Types.MapType.ofOptional(
      1, 2, Types.StringType.get(), Types.IntegerType.get()
  )));
  Schema read = new Schema(required(0, "map_field", Types.StringType.get()));

  List<String> errors = CheckCompatibility.writeCompatibilityErrors(read, write);
  Assert.assertEquals("Should produce 1 error message", 1, errors.size());
  Assert.assertTrue("Should complain about incompatible types",
      errors.get(0).contains("map cannot be read as a string"));
}
Example #27
Source File: GenericParquetReaders.java From iceberg with Apache License 2.0 | 5 votes |
@SuppressWarnings("unchecked") public static ParquetValueReader<Record> buildReader(Schema expectedSchema, MessageType fileSchema, Map<Integer, ?> idToConstant) { if (ParquetSchemaUtil.hasIds(fileSchema)) { return (ParquetValueReader<Record>) TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); } else { return (ParquetValueReader<Record>) TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, new FallbackReadBuilder(fileSchema, idToConstant)); } }
Example #28
Source File: ArrowSchemaUtilTest.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void convertComplex() {
  Schema iceberg = new Schema(
      Types.NestedField.optional(0, "m", MapType.ofOptional(
          1, 2, StringType.get(), LongType.get())
      ),
      Types.NestedField.required(3, "m2", MapType.ofOptional(
          4, 5, StringType.get(), ListType.ofOptional(6, TimestampType.withoutZone()))
      )
  );

  org.apache.arrow.vector.types.pojo.Schema arrow = ArrowSchemaUtil.convert(iceberg);
  Assert.assertEquals(iceberg.columns().size(), arrow.getFields().size());
}
Example #29
Source File: TestExpressionSerialization.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testExpressions() throws Exception {
  Schema schema = new Schema(
      Types.NestedField.optional(34, "a", Types.IntegerType.get()),
      Types.NestedField.required(35, "s", Types.StringType.get())
  );

  Expression[] expressions = new Expression[] {
      Expressions.alwaysFalse(),
      Expressions.alwaysTrue(),
      Expressions.lessThan("x", 5),
      Expressions.lessThanOrEqual("y", -3),
      Expressions.greaterThan("z", 0),
      Expressions.greaterThanOrEqual("t", 129),
      Expressions.equal("col", "data"),
      Expressions.in("col", "a", "b"),
      Expressions.notIn("col", 1, 2, 3),
      Expressions.notEqual("col", "abc"),
      Expressions.notNull("maybeNull"),
      Expressions.isNull("maybeNull2"),
      Expressions.not(Expressions.greaterThan("a", 10)),
      Expressions.and(Expressions.greaterThanOrEqual("a", 0), Expressions.lessThan("a", 3)),
      Expressions.or(Expressions.lessThan("a", 0), Expressions.greaterThan("a", 10)),
      Expressions.equal("a", 5).bind(schema.asStruct()),
      Expressions.in("a", 5, 6, 7).bind(schema.asStruct()),
      Expressions.notIn("s", "abc", "xyz").bind(schema.asStruct()),
      Expressions.isNull("a").bind(schema.asStruct()),
  };

  for (Expression expression : expressions) {
    Expression copy = TestHelpers.roundTripSerialize(expression);
    Assert.assertTrue(
        "Expression should equal the deserialized copy: " + expression + " != " + copy,
        equals(expression, copy));
  }
}
Example #30
Source File: IcebergFileWriterFactory.java From presto with Apache License 2.0 | 5 votes |
private IcebergFileWriter createParquetWriter(
    Path outputPath,
    Schema icebergSchema,
    List<IcebergColumnHandle> columns,
    JobConf jobConf,
    ConnectorSession session) {
  Properties properties = new Properties();
  properties.setProperty(IOConstants.COLUMNS, columns.stream()
      .map(IcebergColumnHandle::getName)
      .collect(joining(",")));
  properties.setProperty(IOConstants.COLUMNS_TYPES, columns.stream()
      .map(column -> toHiveType(column.getType()).getHiveTypeName().toString())
      .collect(joining(":")));

  setParquetSchema(jobConf, convert(icebergSchema, "table"));
  jobConf.set(ParquetOutputFormat.COMPRESSION, getCompressionCodec(session).getParquetCompressionCodec().name());

  return new IcebergRecordFileWriter(
      outputPath,
      columns.stream()
          .map(IcebergColumnHandle::getName)
          .collect(toImmutableList()),
      fromHiveStorageFormat(HiveStorageFormat.PARQUET),
      properties,
      HiveStorageFormat.PARQUET.getEstimatedWriterSystemMemoryUsage(),
      jobConf,
      typeManager,
      session);
}