org.apache.iceberg.types.TypeUtil#select

Source File: Schema.java From iceberg with Apache License 2.0

6 votes

/**
 * Builds a projection of this schema containing the fields named in {@code names}.
 *
 * <p>If {@code names} contains {@code ALL_COLUMNS}, this schema is returned unchanged. Names that do
 * not resolve to a field id are skipped.
 *
 * @param names column names to select
 * @param caseSensitive whether names are looked up exactly, or after lower-casing with
 *     {@link Locale#ROOT}
 * @return this schema, or the result of {@code TypeUtil.select} over the resolved field ids
 */
private Schema internalSelect(Collection<String> names, boolean caseSensitive) {
  if (names.contains(ALL_COLUMNS)) {
    return this;
  }

  Set<Integer> resolvedIds = Sets.newHashSet();
  for (String columnName : names) {
    // The lookup stays inside the loop so the lazy indexes are only touched when there is a name.
    Integer fieldId = caseSensitive
        ? lazyNameToId().get(columnName)
        : lazyLowerCaseNameToId().get(columnName.toLowerCase(Locale.ROOT));

    if (fieldId == null) {
      // unknown column name: skipped
      continue;
    }
    resolvedIds.add(fieldId);
  }

  return TypeUtil.select(this, resolvedIds);
}

Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0

6 votes

/**
 * Writes one random record to a fresh table, reads it back through {@code IcebergInputFormat} with a
 * projection selecting field id 1, and checks the record count and the struct of the first record.
 */
@Test
public void testProjection() throws Exception {
  // Obtain a temp folder path and delete the folder itself before the table is created at that path.
  File tableDir = temp.newFolder(format.name());
  Assert.assertTrue(tableDir.delete());
  String tablePath = tableDir.toString();

  Table table = tables.create(
      SCHEMA,
      SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      tablePath);

  List<Record> written = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile appended = writeFile(table, Row.of("2020-03-20", 0), format, written);
  table.newAppend().appendFile(appended).commit();

  // Projection under test: only the field with id 1.
  Schema projection = TypeUtil.select(SCHEMA, ImmutableSet.of(1));

  Job job = Job.getInstance(conf);
  IcebergInputFormat.configure(job)
      .readFrom(tablePath)
      .project(projection);

  List<Record> read = readRecords(job.getConfiguration());
  Assert.assertEquals(written.size(), read.size());
  Assert.assertEquals(projection.asStruct(), read.get(0).struct());
}

Source File: RowDataReader.java From iceberg with Apache License 2.0

6 votes

/**
 * Opens an iterator over the rows of {@code task}.
 *
 * <p>When the expected schema includes at least one identity partition source column of the task's
 * spec, the rows are opened with the constants map built by {@code PartitionUtil.constantsMap};
 * otherwise an empty map is passed.
 *
 * @param task the file scan task to read
 * @return an iterator over the task's rows
 */
@Override
CloseableIterator<InternalRow> open(FileScanTask task) {
  DataFile dataFile = task.file();

  // update the current file for Spark's filename() function
  InputFileBlockHolder.set(dataFile.path().toString(), task.start(), task.length());

  // the part of the expected schema that is backed by identity partition source columns
  Schema identityProjection = TypeUtil.select(expectedSchema, task.spec().identitySourceIds());

  if (identityProjection.columns().isEmpty()) {
    // no identity partition columns are projected: return the base iterator
    return open(task, expectedSchema, ImmutableMap.of()).iterator();
  }

  return open(task, expectedSchema, PartitionUtil.constantsMap(task, RowDataReader::convertConstant))
      .iterator();
}

Source File: BaseTableScan.java From iceberg with Apache License 2.0

6 votes

/**
 * Resolves the schema to project at the time it is needed, so that the refinements
 * {@link #select(Collection)} and {@link #caseSensitive(boolean)} can be applied in any order.
 *
 * <p>When columns were selected, the result contains every field referenced by the row filter plus
 * every field of the selected columns. When nothing was selected, the scan's {@code schema} is
 * returned as is.
 *
 * @return the Schema to project
 */
private Schema lazyColumnProjection() {
  Collection<String> selectedColumns = context.selectedColumns();
  if (selectedColumns == null) {
    return schema;
  }

  Set<Integer> requiredFieldIds = Sets.newHashSet();

  // every column referenced by the row filter must be read
  requiredFieldIds.addAll(
      Binder.boundReferences(
          table.schema().asStruct(),
          Collections.singletonList(context.rowFilter()),
          context.caseSensitive()));

  // every explicitly selected column must be read as well
  Schema selection = context.caseSensitive()
      ? table.schema().select(selectedColumns)
      : table.schema().caseInsensitiveSelect(selectedColumns);
  requiredFieldIds.addAll(TypeUtil.getProjectedIds(selection));

  return TypeUtil.select(table.schema(), requiredFieldIds);
}

Source File: IcebergInputFormat.java From iceberg with Apache License 2.0

5 votes

/**
 * Returns the constants map for {@code task}, or an empty map when the expected schema contains
 * none of the identity partition source columns of the task's spec.
 *
 * @param task the file scan task being read
 * @param converter passed through to {@code PartitionUtil.constantsMap}
 * @return the map produced by {@code PartitionUtil.constantsMap}, or an empty map
 */
private Map<Integer, ?> constantsMap(FileScanTask task, BiFunction<Type, Object, Object> converter) {
  Set<Integer> identityIds = task.spec().identitySourceIds();

  // nothing to supply when no identity partition column is part of the expected schema
  if (TypeUtil.select(expectedSchema, identityIds).columns().isEmpty()) {
    return Collections.emptyMap();
  }

  return PartitionUtil.constantsMap(task, converter);
}

Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0

5 votes

/**
 * Projects {@code LOG_SCHEMA} down to the named columns.
 *
 * <p>Each name is looked up in the name index of {@code LOG_SCHEMA}; the lookup result is added to
 * the id set as is, so a name missing from the index contributes {@code null}.
 *
 * @param names column names to keep
 * @return the result of {@code TypeUtil.select} over the looked-up ids
 */
private static Schema withColumns(String... names) {
  Map<String, Integer> idsByName = TypeUtil.indexByName(LOG_SCHEMA.asStruct());

  Set<Integer> wantedIds = Sets.newHashSet();
  for (int i = 0; i < names.length; i++) {
    // kept boxed on purpose: a missing name yields null, which must not be unboxed
    Integer fieldId = idsByName.get(names[i]);
    wantedIds.add(fieldId);
  }

  return TypeUtil.select(LOG_SCHEMA, wantedIds);
}

Source File: IcebergPigInputFormat.java From iceberg with Apache License 2.0

4 votes

/**
 * Closes the current reader, if any, and opens a reader for the next scan task.
 *
 * <p>The table schema and the optional list of projected fields are deserialized from the job
 * configuration. For Parquet files, identity partition source columns are excluded from the schema
 * read from the file; their values are taken from the file's partition data and handed to the
 * record reader through a field-id-to-value map.
 *
 * @return {@code false} when there are no more tasks, {@code true} once a reader for the next task
 *     has been opened
 * @throws IOException if closing the previous reader fails
 * @throws UnsupportedOperationException if the task's file is not in Parquet format
 */
@SuppressWarnings("unchecked")
private boolean advance() throws IOException {
  // release the reader of the previous task before moving on
  if (reader != null) {
    reader.close();
  }

  if (!tasks.hasNext()) {
    return false;
  }

  FileScanTask task = tasks.next();

  Schema tableSchema = (Schema) ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_SCHEMA)));
  LOG.debug("[{}]: Task table schema: {}", signature, tableSchema);

  List<String> projectedFields =
      (List<String>) ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_PROJECTED_FIELDS)));
  LOG.debug("[{}]: Task projected fields: {}", signature, projectedFields);

  // without an explicit projection the whole table schema is read
  Schema projectedSchema = projectedFields == null ? tableSchema : SchemaUtil.project(tableSchema, projectedFields);

  PartitionSpec spec = task.asFileScanTask().spec();
  DataFile file = task.file();
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());

  Set<Integer> idColumns = spec.identitySourceIds();

  switch (file.format()) {
    case PARQUET:
      // field id -> converted partition value, for projected identity partition columns
      Map<Integer, Object> partitionValueMap = Maps.newHashMap();

      // schema actually requested from the Parquet file
      Schema readSchema;

      if (idColumns.isEmpty()) {
        readSchema = projectedSchema;
      } else {
        // identity partition columns are not read from the file
        readSchema = TypeUtil.selectNot(projectedSchema, idColumns);
        Schema projectedPartitionSchema = TypeUtil.select(projectedSchema, idColumns);

        // partition field name -> position of that field in the partition spec
        Map<String, Integer> positionByFieldName = Maps.newHashMap();
        for (int pos = 0; pos < spec.fields().size(); pos++) {
          positionByFieldName.put(spec.fields().get(pos).name(), pos);
        }

        for (Types.NestedField column : projectedPartitionSchema.columns()) {
          // NOTE(review): unboxing throws NullPointerException if no partition field has this name
          int partitionPos = positionByFieldName.get(column.name());

          Object rawValue = file.partition().get(partitionPos, Object.class);
          partitionValueMap.put(column.fieldId(), convertPartitionValue(column.type(), rawValue));
        }
      }

      reader = Parquet.read(inputFile)
          .project(readSchema)
          .split(task.start(), task.length())
          .filter(task.residual())
          .createReaderFunc(
              fileSchema -> PigParquetReader.buildReader(fileSchema, projectedSchema, partitionValueMap))
          .build();

      recordIterator = reader.iterator();

      break;
    default:
      throw new UnsupportedOperationException("Unsupported file format: " + file.format());
  }

  return true;
}

Java Code Examples for org.apache.iceberg.types.TypeUtil#select()