org.apache.beam.sdk.values.PCollectionList Java Examples
The following examples show how to use
org.apache.beam.sdk.values.PCollectionList.
Each example is drawn from an open-source project; the source file and license are noted above each snippet.
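Before the examples, here is a minimal, self-contained sketch of the core PCollectionList operations they all rely on: building an immutable list with of() and and(), then applying a transform that consumes the whole list, such as Flatten. The pipeline and element values here are hypothetical, chosen only to illustrate the API shape.

import java.util.Arrays;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;

public class PCollectionListSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    PCollection<String> first = p.apply("first", Create.of(Arrays.asList("a", "b")));
    PCollection<String> second = p.apply("second", Create.of(Arrays.asList("c", "d")));

    // PCollectionList is immutable: of() and and() each return a new list.
    PCollectionList<String> list = PCollectionList.of(first).and(second);

    // Transforms such as Flatten consume the whole list at once.
    PCollection<String> merged = list.apply(Flatten.pCollections());

    p.run().waitUntilFinish();
  }
}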
Example #1
Source File: SetsTest.java (Apache Beam, Apache License 2.0)
@Test
@Category(NeedsRunner.class)
public void testIntersectionCollectionList() {
  PCollection<String> third = p.apply("third", Create.of(Arrays.asList("b", "b", "c", "f")));
  PCollection<Row> thirdRows = p.apply("thirdRows", Create.of(toRows("b", "b", "c", "f")));

  PAssert.that(
          PCollectionList.of(first)
              .and(second)
              .and(third)
              .apply("stringsCols", Sets.intersectDistinct()))
      .containsInAnyOrder("b", "c");

  PCollection<Row> results =
      PCollectionList.of(firstRows)
          .and(secondRows)
          .and(thirdRows)
          .apply("rowCols", Sets.intersectDistinct());
  PAssert.that(results).containsInAnyOrder(toRows("b", "c"));

  assertEquals(schema, results.getSchema());

  p.run();
}
Example #2
Source File: RepublishPerNamespace.java (gcp-ingestion, Mozilla Public License 2.0)
@Override
public PDone expand(PCollection<PubsubMessage> input) {
  List<Destination> destinations = baseOptions.getPerNamespaceDestinations().entrySet().stream()
      .map(entry -> new Destination(entry.getKey(), entry.getValue()))
      .collect(Collectors.toList());
  int numDestinations = destinations.size();
  int numPartitions = numDestinations + 1;
  PCollectionList<PubsubMessage> partitioned = input.apply("PartitionByNamespace",
      Partition.of(numPartitions, new PartitionFn(destinations)));

  for (int i = 0; i < numDestinations; i++) {
    Destination destination = destinations.get(i);
    RepublisherOptions.Parsed opts = baseOptions.as(RepublisherOptions.Parsed.class);
    opts.setOutput(StaticValueProvider.of(destination.dest));
    String name = String.join("_", "republish", destination.namespace);
    partitioned.get(i).apply(name, opts.getOutputType().write(opts));
  }

  return PDone.in(input.getPipeline());
}
Example #3
Source File: BeamSqlRelUtils.java (Apache Beam, Apache License 2.0)
/**
 * A {@link BeamRelNode} is a recursive structure, the {@code BeamQueryPlanner} visits it with a
 * DFS (Depth-First-Search) algorithm.
 */
static PCollection<Row> toPCollection(
    Pipeline pipeline, BeamRelNode node, Map<Integer, PCollection<Row>> cache) {
  PCollection<Row> output = cache.get(node.getId());
  if (output != null) {
    return output;
  }

  String name = node.getClass().getSimpleName() + "_" + node.getId();
  PCollectionList<Row> input = buildPCollectionList(node.getPCollectionInputs(), pipeline, cache);
  PTransform<PCollectionList<Row>, PCollection<Row>> transform = node.buildPTransform();
  output = Pipeline.applyTransform(name, input, transform);

  cache.put(node.getId(), output);
  return output;
}
Example #4
Source File: FlattenTest.java (Apache Beam, Apache License 2.0)
@Test
@Category({ValidatesRunner.class, UsesSideInputs.class})
public void testEmptyFlattenAsSideInput() {
  final PCollectionView<Iterable<String>> view =
      PCollectionList.<String>empty(p)
          .apply(Flatten.pCollections())
          .setCoder(StringUtf8Coder.of())
          .apply(View.asIterable());

  PCollection<String> output =
      p.apply(Create.of((Void) null).withCoder(VoidCoder.of()))
          .apply(
              ParDo.of(
                      new DoFn<Void, String>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
                          for (String side : c.sideInput(view)) {
                            c.output(side);
                          }
                        }
                      })
                  .withSideInputs(view));

  PAssert.that(output).empty();
  p.run();
}
Example #5
Source File: BeamTableFunctionScanRel.java (Apache Beam, Apache License 2.0)
@Override
public PCollection<Row> expand(PCollectionList<Row> input) {
  checkArgument(
      input.size() == 1,
      "Wrong number of inputs for %s, expected 1 input but received: %s",
      BeamTableFunctionScanRel.class.getSimpleName(),
      input);
  String operatorName = ((RexCall) getCall()).getOperator().getName();
  checkArgument(
      tvfToPTransformMap.keySet().contains(operatorName),
      "Only support %s table-valued functions. Current operator: %s",
      tvfToPTransformMap.keySet(),
      operatorName);
  return tvfToPTransformMap.get(operatorName).toPTransform(((RexCall) getCall()), input.get(0));
}
Example #6
Source File: BeamUncollectRel.java (Apache Beam, Apache License 2.0)
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  checkArgument(
      pinput.size() == 1,
      "Wrong number of inputs for %s: %s",
      BeamUncollectRel.class.getSimpleName(),
      pinput);
  PCollection<Row> upstream = pinput.get(0);

  // Each row of the input contains a single array of things to be emitted; Calcite knows
  // what the row looks like
  Schema outputSchema = CalciteUtils.toSchema(getRowType());

  PCollection<Row> uncollected =
      upstream.apply(ParDo.of(new UncollectDoFn(outputSchema))).setRowSchema(outputSchema);

  return uncollected;
}
Example #7
Source File: Sink.java (gcp-ingestion, Mozilla Public License 2.0)
/**
 * Execute an Apache Beam pipeline and return the {@code PipelineResult}.
 */
public static PipelineResult run(SinkOptions.Parsed options) {
  final Pipeline pipeline = Pipeline.create(options);
  final List<PCollection<PubsubMessage>> failureCollections = new ArrayList<>();

  pipeline //
      .apply(options.getInputType().read(options)) //
      .apply(DecompressPayload.enabled(options.getDecompressInputPayloads())) //
      .apply(options.getOutputType().write(options)).failuresTo(failureCollections);

  PCollectionList.of(failureCollections) //
      .apply("FlattenFailureCollections", Flatten.pCollections()) //
      .apply("WriteErrorOutput", options.getErrorOutputType().write(options)) //
      .output();

  return pipeline.run();
}
Example #8
Source File: Partition.java (Apache Beam, Apache License 2.0)
@Override
public PCollectionList<T> expand(PCollection<T> in) {
  final TupleTagList outputTags = partitionDoFn.getOutputTags();

  PCollectionTuple outputs =
      in.apply(
          ParDo.of(partitionDoFn)
              .withOutputTags(new TupleTag<Void>() {}, outputTags)
              .withSideInputs(partitionDoFn.getSideInputs()));

  PCollectionList<T> pcs = PCollectionList.empty(in.getPipeline());
  Coder<T> coder = in.getCoder();

  for (TupleTag<?> outputTag : outputTags.getAll()) {
    // All the tuple tags are actually TupleTag<T>
    // And all the collections are actually PCollection<T>
    @SuppressWarnings("unchecked")
    TupleTag<T> typedOutputTag = (TupleTag<T>) outputTag;
    pcs = pcs.and(outputs.get(typedOutputTag).setCoder(coder));
  }
  return pcs;
}
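For context, here is a minimal usage sketch of the Partition transform whose expansion is shown above. The input values and the modulo-based PartitionFn are hypothetical, chosen only to show how the resulting PCollectionList is consumed:

PCollection<Integer> input = p.apply(Create.of(1, 2, 3, 4, 5, 6));

// Route each element to one of three output PCollections by its remainder mod 3.
PCollectionList<Integer> parts =
    input.apply(Partition.of(3, (Integer elem, int numPartitions) -> elem % numPartitions));

PCollection<Integer> remainderZero = parts.get(0); // 3, 6
PCollection<Integer> remainderOne = parts.get(1);  // 1, 4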
Example #9
Source File: Window.java (Apache Beam, Apache License 2.0)
@Override
public PCollection<T> expand(PCollection<T> input) {
  applicableTo(input);

  WindowingStrategy<?, ?> outputStrategy =
      getOutputStrategyInternal(input.getWindowingStrategy());

  if (getWindowFn() == null) {
    // A new PCollection must be created in case input is reused in a different location as the
    // two PCollections will, in general, have a different windowing strategy.
    return PCollectionList.of(input)
        .apply(Flatten.pCollections())
        .setWindowingStrategyInternal(outputStrategy);
  } else {
    // This is the AssignWindows primitive
    return input.apply(new Assign<>(this, outputStrategy));
  }
}
Example #10
Source File: EmptyFlattenAsCreateFactoryTest.java (Apache Beam, Apache License 2.0)
@Test
@Category(NeedsRunner.class)
public void testOverride() {
  PCollectionList<Long> empty = PCollectionList.empty(pipeline);
  PCollection<Long> emptyFlattened =
      empty.apply(
          factory
              .getReplacementTransform(
                  AppliedPTransform.of(
                      "nonEmptyInput",
                      Collections.emptyMap(),
                      Collections.emptyMap(),
                      Flatten.pCollections(),
                      pipeline))
              .getTransform());
  PAssert.that(emptyFlattened).empty();
  pipeline.run();
}
Example #11
Source File: FlattenTest.java (Apache Beam, Apache License 2.0)
@Test
@Category(ValidatesRunner.class)
public void testFlattenWithDifferentInputAndOutputCoders2() {
  // This test exists to prevent a regression in Dataflow. It tests a
  // GroupByKey followed by a Flatten with an SDK-specific output coder.
  PCollection<KV<String, Iterable<String>>> flattenInput =
      p.apply(Create.of(LINES))
          .apply(WithKeys.of("a"))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))
          .apply(GroupByKey.create());
  PCollection<String> output =
      PCollectionList.of(flattenInput)
          .apply(Flatten.pCollections())
          .setCoder(SerializableCoder.of(new TypeDescriptor<KV<String, Iterable<String>>>() {}))
          .apply(Values.create())
          .setCoder(IterableCoder.of(StringUtf8Coder.of()))
          .apply(
              FlatMapElements.into(TypeDescriptors.strings())
                  .via((Iterable<String> values) -> values));
  PAssert.that(output).containsInAnyOrder(LINES);
  p.run();
}
Example #12
Source File: FlattenTranslatorBatch.java (Apache Beam, Apache License 2.0)
@Override
public void translateTransform(
    PTransform<PCollectionList<T>, PCollection<T>> transform, TranslationContext context) {
  Collection<PValue> pcollectionList = context.getInputs().values();
  Dataset<WindowedValue<T>> result = null;
  if (pcollectionList.isEmpty()) {
    result = context.emptyDataset();
  } else {
    for (PValue pValue : pcollectionList) {
      checkArgument(
          pValue instanceof PCollection,
          "Got non-PCollection input to flatten: %s of type %s",
          pValue,
          pValue.getClass().getSimpleName());
      @SuppressWarnings("unchecked")
      PCollection<T> pCollection = (PCollection<T>) pValue;
      Dataset<WindowedValue<T>> current = context.getDataset(pCollection);
      if (result == null) {
        result = current;
      } else {
        result = result.union(current);
      }
    }
  }
  context.putDataset(context.getOutput(), result);
}
Example #13
Source File: TaskTest.java (Apache Beam, Apache License 2.0)
@Test
public void groupByKey() {
  PCollection<Integer> numbers =
      testPipeline.apply(
          Create.of(1, 2, 3, 4, 5, 100, 110, 150, 250));

  PCollectionList<Integer> results = Task.applyTransform(numbers);

  PAssert.that(results.get(0))
      .containsInAnyOrder(110, 150, 250);

  PAssert.that(results.get(1))
      .containsInAnyOrder(1, 2, 3, 4, 5, 100);

  testPipeline.run().waitUntilFinish();
}
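The Task class exercised here is not shown on this page. Judging purely from the assertions (values greater than 100 land in partition 0, the rest in partition 1), a plausible, entirely hypothetical implementation of Task.applyTransform might look like:

static PCollectionList<Integer> applyTransform(PCollection<Integer> input) {
  // Hypothetical reconstruction: partition 0 holds values above 100, partition 1 the rest.
  return input.apply(
      Partition.of(2, (Integer number, int numPartitions) -> number > 100 ? 0 : 1));
}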
Example #14
Source File: ExpansionService.java (Apache Beam, Apache License 2.0)
default Map<String, PCollection<?>> extractOutputs(OutputT output) {
  if (output instanceof PDone) {
    return Collections.emptyMap();
  } else if (output instanceof PCollection) {
    return ImmutableMap.of("output", (PCollection<?>) output);
  } else if (output instanceof PCollectionTuple) {
    return ((PCollectionTuple) output)
        .getAll().entrySet().stream()
        .collect(Collectors.toMap(entry -> entry.getKey().getId(), Map.Entry::getValue));
  } else if (output instanceof PCollectionList<?>) {
    PCollectionList<?> listOutput = (PCollectionList<?>) output;
    return IntStream.range(0, listOutput.size())
        .boxed()
        .collect(Collectors.toMap(Object::toString, listOutput::get));
  } else {
    throw new UnsupportedOperationException("Unknown output type: " + output.getClass());
  }
}
Example #15
Source File: Combine.java (Apache Beam, Apache License 2.0)
private PCollection<OutputT> insertDefaultValueIfEmpty(PCollection<OutputT> maybeEmpty) {
  final PCollectionView<Iterable<OutputT>> maybeEmptyView = maybeEmpty.apply(View.asIterable());
  final OutputT defaultValue = fn.defaultValue();
  PCollection<OutputT> defaultIfEmpty =
      maybeEmpty
          .getPipeline()
          .apply("CreateVoid", Create.of((Void) null).withCoder(VoidCoder.of()))
          .apply(
              "ProduceDefault",
              ParDo.of(
                      new DoFn<Void, OutputT>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
                          Iterator<OutputT> combined = c.sideInput(maybeEmptyView).iterator();
                          if (!combined.hasNext()) {
                            c.output(defaultValue);
                          }
                        }
                      })
                  .withSideInputs(maybeEmptyView))
          .setCoder(maybeEmpty.getCoder())
          .setWindowingStrategyInternal(maybeEmpty.getWindowingStrategy());

  return PCollectionList.of(maybeEmpty).and(defaultIfEmpty).apply(Flatten.pCollections());
}
Example #16
Source File: EmptyFlattenAsCreateFactoryTest.java (Apache Beam, Apache License 2.0)
@Test
public void getInputNonEmptyThrows() {
  PCollectionList<Long> nonEmpty =
      PCollectionList.of(pipeline.apply("unbounded", GenerateSequence.from(0)))
          .and(pipeline.apply("bounded", GenerateSequence.from(0).to(100)));
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(nonEmpty.expand().toString());
  thrown.expectMessage(EmptyFlattenAsCreateFactory.class.getSimpleName());
  factory.getReplacementTransform(
      AppliedPTransform.of(
          "nonEmptyInput",
          nonEmpty.expand(),
          Collections.emptyMap(),
          Flatten.pCollections(),
          pipeline));
}
Example #17
Source File: BeamPushDownIOSourceRel.java (Apache Beam, Apache License 2.0)
@Override
public PCollection<Row> expand(PCollectionList<Row> input) {
  checkArgument(
      input.size() == 0,
      "Should not have received input for %s: %s",
      BeamIOSourceRel.class.getSimpleName(),
      input);

  final PBegin begin = input.getPipeline().begin();
  final BeamSqlTable beamSqlTable = BeamPushDownIOSourceRel.this.getBeamSqlTable();

  if (usedFields.isEmpty() && tableFilters instanceof DefaultTableFilter) {
    return beamSqlTable.buildIOReader(begin);
  }

  final Schema newBeamSchema = CalciteUtils.toSchema(getRowType());
  return beamSqlTable
      .buildIOReader(begin, tableFilters, usedFields)
      .setRowSchema(newBeamSchema);
}
Example #18
Source File: SetsTest.java (Apache Beam, Apache License 2.0)
@Test
@Category(NeedsRunner.class)
public void testExceptCollectionList() {
  PCollection<String> third =
      p.apply("third", Create.of(Arrays.asList("a", "b", "b", "g", "g")));
  PCollection<Row> thirdRows =
      p.apply("thirdRows", Create.of(toRows("a", "b", "b", "g", "g")));

  PAssert.that(
          PCollectionList.of(first)
              .and(second)
              .and(third)
              .apply("stringsCols", Sets.exceptDistinct()))
      .containsInAnyOrder("h");

  PCollection<Row> results =
      PCollectionList.of(firstRows)
          .and(secondRows)
          .and(thirdRows)
          .apply("rowCols", Sets.exceptDistinct());
  PAssert.that(results).containsInAnyOrder(toRows("h"));

  assertEquals(schema, results.getSchema());

  p.run();
}
Example #19
Source File: UnconsumedReadsTest.java (Apache Beam, Apache License 2.0)
@Test
public void doesNotConsumeAlreadyConsumedRead() {
  Unbounded<Long> transform = Read.from(CountingSource.unbounded());
  final PCollection<Long> output = pipeline.apply(transform);
  final Flatten.PCollections<Long> consumer = Flatten.pCollections();
  PCollectionList.of(output).apply(consumer);
  UnconsumedReads.ensureAllReadsConsumed(pipeline);
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          // The output should only be consumed by a single consumer
          if (node.getInputs().values().contains(output)) {
            assertThat(node.getTransform(), Matchers.is(consumer));
          }
        }
      });
}
Example #20
Source File: SetsTest.java (Apache Beam, Apache License 2.0)
@Test
@Category(NeedsRunner.class)
public void testUnionAllCollections() {
  PCollection<String> third =
      p.apply("third", Create.of(Arrays.asList("a", "b", "b", "k", "k")));
  PCollection<Row> thirdRows =
      p.apply("thirdRows", Create.of(toRows("a", "b", "b", "k", "k")));

  PAssert.that(
          PCollectionList.of(first).and(second).and(third).apply("stringsCols", Sets.unionAll()))
      .containsInAnyOrder(
          "a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b", "b", "c", "c", "d", "d",
          "d", "d", "e", "e", "f", "f", "g", "g", "h", "h", "k", "k");

  PCollection<Row> results =
      PCollectionList.of(firstRows)
          .and(secondRows)
          .and(thirdRows)
          .apply("rowCols", Sets.unionAll());
  PAssert.that(results)
      .containsInAnyOrder(
          toRows(
              "a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b", "b", "c", "c", "d",
              "d", "d", "d", "e", "e", "f", "f", "g", "g", "h", "h", "k", "k"));

  assertEquals(schema, results.getSchema());

  p.run();
}
Example #21
Source File: BeamSideInputLookupJoinRel.java (Apache Beam, Apache License 2.0)
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  Schema schema = CalciteUtils.toSchema(getRowType());

  BeamRelNode seekableRel =
      BeamSqlRelUtils.getBeamRelInput(getInput(seekableInputIndex().get()));
  BeamRelNode nonSeekableRel =
      BeamSqlRelUtils.getBeamRelInput(getInput(nonSeekableInputIndex().get()));

  // Offset field references according to which table is on the left
  int factColOffset =
      nonSeekableInputIndex().get() == 0
          ? 0
          : CalciteUtils.toSchema(seekableRel.getRowType()).getFieldCount();
  int lkpColOffset =
      seekableInputIndex().get() == 0
          ? 0
          : CalciteUtils.toSchema(nonSeekableRel.getRowType()).getFieldCount();

  // HACK: if the input is an immediate instance of a seekable IO, we can do lookups
  // so we ignore the PCollection
  BeamIOSourceRel seekableInput = (BeamIOSourceRel) seekableRel;
  BeamSqlSeekableTable seekableTable = (BeamSqlSeekableTable) seekableInput.getBeamSqlTable();

  // getPCollectionInputs() ensures that there is only one and it is the non-seekable input
  PCollection<Row> nonSeekableInput = pinput.get(0);

  return nonSeekableInput
      .apply(
          "join_as_lookup",
          new BeamJoinTransforms.JoinAsLookup(
              condition,
              seekableTable,
              CalciteUtils.toSchema(seekableInput.getRowType()),
              schema,
              factColOffset,
              lkpColOffset))
      .setRowSchema(schema);
}
Example #22
Source File: BeamValuesRel.java (Apache Beam, Apache License 2.0)
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  checkArgument(
      pinput.size() == 0,
      "Should not have received input for %s: %s",
      BeamValuesRel.class.getSimpleName(),
      pinput);

  Schema schema = CalciteUtils.toSchema(getRowType());
  List<Row> rows = tuples.stream().map(tuple -> tupleToRow(schema, tuple)).collect(toList());

  return pinput.getPipeline().begin().apply(Create.of(rows).withRowSchema(schema));
}
Example #23
Source File: BatchLoads.java (Apache Beam, Apache License 2.0)
PCollection<WriteBundlesToFiles.Result<DestinationT>> writeDynamicallyShardedFiles(
    PCollection<KV<DestinationT, ElementT>> input, PCollectionView<String> tempFilePrefix) {
  TupleTag<WriteBundlesToFiles.Result<DestinationT>> writtenFilesTag =
      new TupleTag<WriteBundlesToFiles.Result<DestinationT>>("writtenFiles") {};
  TupleTag<KV<ShardedKey<DestinationT>, ElementT>> unwrittedRecordsTag =
      new TupleTag<KV<ShardedKey<DestinationT>, ElementT>>("unwrittenRecords") {};
  PCollectionTuple writeBundlesTuple =
      input.apply(
          "WriteBundlesToFiles",
          ParDo.of(
                  new WriteBundlesToFiles<>(
                      tempFilePrefix,
                      unwrittedRecordsTag,
                      maxNumWritersPerBundle,
                      maxFileSize,
                      rowWriterFactory))
              .withSideInputs(tempFilePrefix)
              .withOutputTags(writtenFilesTag, TupleTagList.of(unwrittedRecordsTag)));
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFiles =
      writeBundlesTuple
          .get(writtenFilesTag)
          .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
  PCollection<KV<ShardedKey<DestinationT>, ElementT>> unwrittenRecords =
      writeBundlesTuple
          .get(unwrittedRecordsTag)
          .setCoder(KvCoder.of(ShardedKeyCoder.of(destinationCoder), elementCoder));

  // If the bundles contain too many output tables to be written inline to files (due to memory
  // limits), any unwritten records will be spilled to the unwrittenRecordsTag PCollection.
  // Group these records by key, and write the files after grouping. Since the record is grouped
  // by key, we can ensure that only one file is open at a time in each bundle.
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFilesGrouped =
      writeShardedRecords(unwrittenRecords, tempFilePrefix);

  // PCollection of filename, file byte size, and table destination.
  return PCollectionList.of(writtenFiles)
      .and(writtenFilesGrouped)
      .apply("FlattenFiles", Flatten.pCollections())
      .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
}
Example #24
Source File: AssignEventTime.java (Apache Beam, Apache License 2.0)
@Override
public PCollection<InputT> expand(PCollectionList<InputT> inputs) {
  final PCollection<InputT> input = PCollectionLists.getOnlyElement(inputs);
  return FlatMap.named(getName().orElse(null))
      .of(input)
      .using(
          (InputT element, Collector<InputT> coll) -> coll.collect(element),
          input.getTypeDescriptor())
      .eventTimeBy(getEventTimeExtractor(), allowedTimestampSkew)
      .output();
}
Example #25
Source File: EmptyFlattenAsCreateFactoryTest.java (Apache Beam, Apache License 2.0)
@Test
public void getInputEmptySucceeds() {
  PTransformReplacement<PCollectionList<Long>, PCollection<Long>> replacement =
      factory.getReplacementTransform(
          AppliedPTransform.of(
              "nonEmptyInput",
              Collections.emptyMap(),
              Collections.emptyMap(),
              Flatten.pCollections(),
              pipeline));
  assertThat(replacement.getInput().getAll(), emptyIterable());
}
Example #26
Source File: MapElements.java (Apache Beam, Apache License 2.0)
@Override
public PCollection<OutputT> expand(PCollectionList<InputT> inputs) {
  return FlatMap.named(getName().orElse(null))
      .of(PCollectionLists.getOnlyElement(inputs))
      .using(
          (InputT elem, Collector<OutputT> coll) ->
              coll.collect(getMapper().apply(elem, coll.asContext())),
          getOutputType().orElse(null))
      .output();
}
Example #27
Source File: BeamSetOperatorRelBase.java (Apache Beam, Apache License 2.0)
@Override
public PCollection<Row> expand(PCollectionList<Row> inputs) {
  checkArgument(
      inputs.size() == 2,
      "Wrong number of arguments to %s: %s",
      beamRelNode.getClass().getSimpleName(),
      inputs);
  PCollection<Row> leftRows = inputs.get(0);
  PCollection<Row> rightRows = inputs.get(1);

  WindowFn leftWindow = leftRows.getWindowingStrategy().getWindowFn();
  WindowFn rightWindow = rightRows.getWindowingStrategy().getWindowFn();
  if (!leftWindow.isCompatible(rightWindow)) {
    throw new IllegalArgumentException(
        "inputs of "
            + opType
            + " have different window strategy: "
            + leftWindow
            + " VS "
            + rightWindow);
  }

  // TODO: We may want to preaggregate the counts first using Group instead of calling CoGroup
  // and measuring the iterable size. If on average there are duplicates in the input, this
  // will be faster.
  final String lhsTag = "lhs";
  final String rhsTag = "rhs";
  PCollection<Row> joined =
      PCollectionTuple.of(lhsTag, leftRows, rhsTag, rightRows)
          .apply("CoGroup", CoGroup.join(By.fieldNames("*")));
  return joined
      .apply(
          "FilterResults",
          ParDo.of(
              new BeamSetOperatorsTransforms.SetOperatorFilteringDoFn(
                  lhsTag, rhsTag, opType, all)))
      .setRowSchema(joined.getSchema().getField("key").getType().getRowSchema());
}
Example #28
Source File: SingleInputOutputOverrideFactoryTest.java (Apache Beam, Apache License 2.0)
@Test
public void testMapOutputsMultipleOriginalOutputsFails() {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  PCollection<Integer> output = input.apply("Map", MapElements.via(fn));
  PCollection<Integer> reappliedOutput = input.apply("ReMap", MapElements.via(fn));
  thrown.expect(IllegalArgumentException.class);
  factory.mapOutputs(
      PCollectionList.of(output).and(input).and(reappliedOutput).expand(), reappliedOutput);
}
Example #29
Source File: FlattenTest.java (Apache Beam, Apache License 2.0)
@Test
@Category(ValidatesRunner.class)
public void testFlattenInputMultipleCopies() {
  int count = 5;
  PCollection<Long> longs = p.apply("mkLines", GenerateSequence.from(0).to(count));
  PCollection<Long> biggerLongs =
      p.apply("mkOtherLines", GenerateSequence.from(0).to(count))
          .apply(
              MapElements.via(
                  new SimpleFunction<Long, Long>() {
                    @Override
                    public Long apply(Long input) {
                      return input + 10L;
                    }
                  }));

  PCollection<Long> flattened =
      PCollectionList.of(longs).and(longs).and(biggerLongs).apply(Flatten.pCollections());

  List<Long> expectedLongs = new ArrayList<>();
  for (int i = 0; i < count; i++) {
    // The duplicated input
    expectedLongs.add((long) i);
    expectedLongs.add((long) i);
    // The bigger longs
    expectedLongs.add(i + 10L);
  }
  PAssert.that(flattened).containsInAnyOrder(expectedLongs);

  p.run();
}
Example #30
Source File: Distinct.java (Apache Beam, Apache License 2.0)
@Override
public PCollection<InputT> expand(PCollectionList<InputT> inputs) {
  PCollection<InputT> tmp = PCollectionLists.getOnlyElement(inputs);
  PCollection<InputT> input =
      getWindow()
          .map(
              w -> {
                PCollection<InputT> ret = tmp.apply(w);
                ret.setTypeDescriptor(tmp.getTypeDescriptor());
                return ret;
              })
          .orElse(tmp);
  if (!projected) {
    PCollection<KV<InputT, Void>> distinct =
        ReduceByKey.named(getName().orElse(null))
            .of(input)
            .keyBy(e -> e, input.getTypeDescriptor())
            .valueBy(e -> null, TypeDescriptors.nulls())
            .combineBy(e -> null, TypeDescriptors.nulls())
            .output();
    return MapElements.named(getName().orElse("") + "::extract-keys")
        .of(distinct)
        .using(KV::getKey, input.getTypeDescriptor())
        .output();
  }
  UnaryFunction<PCollection<InputT>, PCollection<InputT>> transformFn = getTransformFn();
  return transformFn.apply(input);
}