Java Code Examples for org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple#empty()

The following examples show how to use org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple#empty(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.

Example 1

Source File: TestExpansionService.java From beam with Apache License 2.0

5 votes

/**
 * Assembles the supplied named inputs into a single {@link KeyedPCollectionTuple}.
 *
 * <p>Starts from an empty tuple created on pipeline {@code p}, then adds one element per entry
 * of {@code inputs}: a {@link TupleTag} constructed from the entry's key, paired with the
 * entry's {@link PCollection}.
 *
 * @param p the pipeline the empty tuple is created on
 * @param inputs the collections to add, keyed by the string used to construct each tag
 * @return the tuple containing every entry of {@code inputs}
 */
@Override
public KeyedPCollectionTuple<Long> createInput(
    Pipeline p, Map<String, PCollection<?>> inputs) {
  // Raw types are intentional here: the element types of the incoming collections are unknown.
  KeyedPCollectionTuple tuple = KeyedPCollectionTuple.empty(p);
  for (Map.Entry<String, PCollection<?>> namedInput : inputs.entrySet()) {
    TupleTag tag = new TupleTag(namedInput.getKey());
    PCollection<?> collection = namedInput.getValue();
    tuple = tuple.and(tag, collection);
  }
  return tuple;
}

Example 2

Source File: CoGroup.java From beam with Apache License 2.0

4 votes

/**
 * Builds the {@link JoinInformation} for the tagged PCollections contained in {@code input}.
 *
 * <p>For every input PCollection this method records its schema (by tag id) and its to-row
 * function (by the tag's index in the sorted tag list), resolves the user-supplied field access
 * descriptor against the schema, folds the resulting key schema into a running merged key
 * schema, and produces a keyed PCollection via {@code extractKey}. Inputs flagged as side inputs
 * are turned into multimap views; all others are added to the {@link KeyedPCollectionTuple}.
 *
 * @param input the tagged PCollections to join
 * @param getFieldAccessDescriptor maps a tag id to the descriptor of the fields to join on; a
 *     {@code null} result causes an {@link IllegalStateException}
 * @param getIsSideInput maps a tag id to whether that input is a side input; the result is
 *     unboxed, so it must not be {@code null}
 * @return the collected join information
 * @throws IllegalStateException if no field access descriptor is available for some input tag
 */
private static JoinInformation from(
    PCollectionTuple input,
    Function<String, FieldAccessDescriptor> getFieldAccessDescriptor,
    Function<String, Boolean> getIsSideInput) {
  KeyedPCollectionTuple<Row> keyedPCollectionTuple =
      KeyedPCollectionTuple.empty(input.getPipeline());

  // Tag ids in natural String order; an input's position in this list is its integer index.
  List<String> sortedTags =
      input.getAll().keySet().stream()
          .map(TupleTag::getId)
          .sorted()
          .collect(Collectors.toList());

  // Keep this in a TreeMap so that it's sorted. This way we get a deterministic output
  // schema.
  TreeMap<String, Schema> componentSchemas = Maps.newTreeMap();
  // Sorted-tag index -> function converting that input's elements to Row.
  Map<Integer, SerializableFunction<Object, Row>> toRows = Maps.newHashMap();

  // Keyed tag -> multimap view, populated only for inputs flagged as side inputs.
  Map<String, PCollectionView<Map<Row, Iterable<Row>>>> sideInputs = Maps.newHashMap();
  // Sorted-tag index -> the keyed tag generated for that input below.
  Map<Integer, String> tagToKeyedTag = Maps.newHashMap();
  // Running merge of the key schemas seen so far; null until the first input is processed.
  Schema keySchema = null;
  for (Map.Entry<TupleTag<?>, PCollection<?>> entry : input.getAll().entrySet()) {
    String tag = entry.getKey().getId();
    int tagIndex = sortedTags.indexOf(tag);
    PCollection<?> pc = entry.getValue();
    Schema schema = pc.getSchema();
    componentSchemas.put(tag, schema);
    toRows.put(tagIndex, (SerializableFunction<Object, Row>) pc.getToRowFunction());
    FieldAccessDescriptor fieldAccessDescriptor = getFieldAccessDescriptor.apply(tag);
    if (fieldAccessDescriptor == null) {
      throw new IllegalStateException("No fields were set for input " + tag);
    }
    // Resolve the key schema, keeping the fields in the order specified by the user.
    // Otherwise, if different field names are specified for different PCollections, they
    // might not match up.
    // The key schema contains the field names from the first PCollection specified.
    FieldAccessDescriptor resolved = fieldAccessDescriptor.resolve(schema);
    Schema currentKeySchema = SelectHelpers.getOutputSchema(schema, resolved);
    if (keySchema == null) {
      keySchema = currentKeySchema;
    } else {
      keySchema = SchemaUtils.mergeWideningNullable(keySchema, currentKeySchema);
    }

    // Create a new tag for the output.
    // The suffix is the string form of a freshly constructed TupleTag; presumably this keeps
    // keyed tags unique -- confirm against TupleTag's no-arg constructor and toString.
    TupleTag randomTag = new TupleTag<>();
    String keyedTag = tag + "_" + randomTag;
    tagToKeyedTag.put(tagIndex, keyedTag);
    // NOTE(review): keySchema here is the schema merged over the inputs processed so far, not
    // the final merged schema returned below -- confirm that extractKey is fine with that.
    PCollection<KV<Row, Row>> keyedPCollection =
        extractKey(pc, schema, keySchema, resolved, tag);
    if (getIsSideInput.apply(tag)) {
      // Side inputs are exposed as a multimap view instead of joining the keyed tuple.
      sideInputs.put(
          keyedTag, keyedPCollection.apply("computeSideInputView" + tag, View.asMultimap()));
    } else {
      keyedPCollectionTuple = keyedPCollectionTuple.and(keyedTag, keyedPCollection);
    }
  }
  return new JoinInformation(
      keyedPCollectionTuple,
      sideInputs,
      keySchema,
      componentSchemas,
      toRows,
      sortedTags,
      tagToKeyedTag);
}