org.apache.beam.sdk.values.TupleTagList Java Examples
The following examples show how to use
org.apache.beam.sdk.values.TupleTagList.
The examples are drawn from open-source projects; the original project, source file, and license are noted above each example.
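A quick orientation before the project examples: TupleTagList is an immutable list of TupleTags, built with empty(), of(), and and(), and most often passed as the second argument of ParDo.withOutputTags to declare a ParDo's additional outputs. The sketch below illustrates only the API shape; the tag names and element types are made up for illustration and do not come from any example on this page.

TupleTag<String> mainTag = new TupleTag<String>("main") {};    // anonymous subclasses
TupleTag<String> errorTag = new TupleTag<String>("errors") {}; // preserve element types
TupleTag<String> auditTag = new TupleTag<String>("audit") {};

// TupleTagList is immutable: and() returns a new, extended list.
TupleTagList additionalTags = TupleTagList.of(errorTag).and(auditTag);
TupleTagList none = TupleTagList.empty(); // for a multi-output ParDo with no extra outputs

// The recurring pattern in the examples below: the main output tag is passed
// first, and every additional output tag goes into the TupleTagList.
// PCollectionTuple result =
//     input.apply(ParDo.of(fn).withOutputTags(mainTag, additionalTags));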
Example #1
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void parDoRequiresStableInput() {
  DoFn<Object, Object> doFnRSI =
      new DoFn<Object, Object>() {
        @RequiresStableInput
        @ProcessElement
        public void process(ProcessContext ctxt) {}
      };

  AppliedPTransform<?, ?, ?> single = getAppliedTransform(ParDo.of(doFn));
  AppliedPTransform<?, ?, ?> singleRSI = getAppliedTransform(ParDo.of(doFnRSI));
  AppliedPTransform<?, ?, ?> multi =
      getAppliedTransform(
          ParDo.of(doFn).withOutputTags(new TupleTag<>(), TupleTagList.empty()));
  AppliedPTransform<?, ?, ?> multiRSI =
      getAppliedTransform(
          ParDo.of(doFnRSI).withOutputTags(new TupleTag<>(), TupleTagList.empty()));

  assertThat(PTransformMatchers.requiresStableInputParDoSingle().matches(single), is(false));
  assertThat(PTransformMatchers.requiresStableInputParDoSingle().matches(singleRSI), is(true));
  assertThat(PTransformMatchers.requiresStableInputParDoSingle().matches(multi), is(false));
  assertThat(PTransformMatchers.requiresStableInputParDoSingle().matches(multiRSI), is(false));
  assertThat(PTransformMatchers.requiresStableInputParDoMulti().matches(single), is(false));
  assertThat(PTransformMatchers.requiresStableInputParDoMulti().matches(singleRSI), is(false));
  assertThat(PTransformMatchers.requiresStableInputParDoMulti().matches(multi), is(false));
  assertThat(PTransformMatchers.requiresStableInputParDoMulti().matches(multiRSI), is(true));
}
Example #2
Source File: SplittableDoFnTest.java From beam with Apache License 2.0
private void testAdditionalOutput(IsBounded bounded) {
  TupleTag<String> mainOutputTag = new TupleTag<String>("main") {};
  TupleTag<String> additionalOutputTag = new TupleTag<String>("additional") {};

  PCollectionTuple res =
      p.apply("input", Create.of(0, 1, 2))
          .apply(
              ParDo.of(sdfWithAdditionalOutput(bounded, additionalOutputTag))
                  .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));

  PAssert.that(res.get(mainOutputTag))
      .containsInAnyOrder(Arrays.asList("main:0", "main:1", "main:2"));
  PAssert.that(res.get(additionalOutputTag))
      .containsInAnyOrder(Arrays.asList("additional:0", "additional:1", "additional:2"));

  p.run();
}
Example #3
Source File: PubSubToElasticsearch.java From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<PubsubMessage> input) {

  // Map the incoming messages into FailsafeElements so we can recover from failures
  // across multiple transforms.
  PCollection<FailsafeElement<PubsubMessage, String>> failsafeElements =
      input.apply("MapToRecord", ParDo.of(new PubsubMessageToFailsafeElementFn()));

  // If a Udf is supplied then use it to parse the PubSubMessages.
  if (javascriptTextTransformGcsPath() != null) {
    return failsafeElements.apply(
        "InvokeUDF",
        FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
            .setFileSystemPath(javascriptTextTransformGcsPath())
            .setFunctionName(javascriptTextTransformFunctionName())
            .setSuccessTag(TRANSFORM_OUT)
            .setFailureTag(TRANSFORM_DEADLETTER_OUT)
            .build());
  } else {
    return failsafeElements.apply(
        "ProcessPubSubMessages",
        ParDo.of(new ProcessFailsafePubSubFn())
            .withOutputTags(TRANSFORM_OUT, TupleTagList.of(TRANSFORM_DEADLETTER_OUT)));
  }
}
Example #4
Source File: PubSubToMongoDB.java From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<PubsubMessage> input) {

  // Map the incoming messages into FailsafeElements so we can recover from failures
  // across multiple transforms.
  PCollection<FailsafeElement<PubsubMessage, String>> failsafeElements =
      input.apply("MapToRecord", ParDo.of(new PubsubMessageToFailsafeElementFn()));

  // If a Udf is supplied then use it to parse the PubSubMessages.
  if (javascriptTextTransformGcsPath() != null) {
    return failsafeElements.apply(
        "InvokeUDF",
        JavascriptTextTransformer.FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
            .setFileSystemPath(javascriptTextTransformGcsPath())
            .setFunctionName(javascriptTextTransformFunctionName())
            .setSuccessTag(TRANSFORM_OUT)
            .setFailureTag(TRANSFORM_DEADLETTER_OUT)
            .build());
  } else {
    return failsafeElements.apply(
        "ProcessPubSubMessages",
        ParDo.of(new ProcessFailsafePubSubFn())
            .withOutputTags(TRANSFORM_OUT, TupleTagList.of(TRANSFORM_DEADLETTER_OUT)));
  }
}
Example #5
Source File: BigQueryConverters.java From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<FailsafeElement<T, String>> failsafeElements) {
  return failsafeElements.apply(
      "JsonToTableRow",
      ParDo.of(
              new DoFn<FailsafeElement<T, String>, TableRow>() {
                @ProcessElement
                public void processElement(ProcessContext context) {
                  FailsafeElement<T, String> element = context.element();
                  String json = element.getPayload();

                  try {
                    TableRow row = convertJsonToTableRow(json);
                    context.output(row);
                  } catch (Exception e) {
                    context.output(
                        failureTag(),
                        FailsafeElement.of(element)
                            .setErrorMessage(e.getMessage())
                            .setStacktrace(Throwables.getStackTraceAsString(e)));
                  }
                }
              })
          .withOutputTags(successTag(), TupleTagList.of(failureTag())));
}
Example #6
Source File: BatchStatefulParDoOverridesTest.java From beam with Apache License 2.0
@Test
public void testMultiOutputOverrideNonCrashing() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  options.setRunner(DataflowRunner.class);
  Pipeline pipeline = Pipeline.create(options);

  TupleTag<Integer> mainOutputTag = new TupleTag<Integer>() {};
  TupleTag<Integer> sideOutputTag = new TupleTag<Integer>() {};

  DummyStatefulDoFn fn = new DummyStatefulDoFn();
  pipeline
      .apply(Create.of(KV.of(1, 2)))
      .apply(ParDo.of(fn).withOutputTags(mainOutputTag, TupleTagList.of(sideOutputTag)));

  DataflowRunner runner = DataflowRunner.fromOptions(options);
  runner.replaceTransforms(pipeline);
  assertThat(findBatchStatefulDoFn(pipeline), equalTo((DoFn) fn));
}
Example #7
Source File: ParDoTranslationTest.java From beam with Apache License 2.0
@Parameters(name = "{index}: {0}")
public static Iterable<ParDo.MultiOutput<?, ?>> data() {
  return ImmutableList.of(
      ParDo.of(new DropElementsFn()).withOutputTags(new TupleTag<>(), TupleTagList.empty()),
      ParDo.of(new DropElementsFn())
          .withOutputTags(new TupleTag<>(), TupleTagList.empty())
          .withSideInputs(singletonSideInput, multimapSideInput),
      ParDo.of(new DropElementsFn())
          .withOutputTags(
              new TupleTag<>(),
              TupleTagList.of(new TupleTag<byte[]>() {}).and(new TupleTag<Integer>() {}))
          .withSideInputs(singletonSideInput, multimapSideInput),
      ParDo.of(new DropElementsFn())
          .withOutputTags(
              new TupleTag<>(),
              TupleTagList.of(new TupleTag<byte[]>() {}).and(new TupleTag<Integer>() {})),
      ParDo.of(new SplittableDropElementsFn())
          .withOutputTags(new TupleTag<>(), TupleTagList.empty()),
      ParDo.of(new StateTimerDropElementsFn())
          .withOutputTags(new TupleTag<>(), TupleTagList.empty()));
}
Example #8
Source File: CsvConverters.java From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PBegin input) {
  if (hasHeaders()) {
    return input
        .apply("MatchFilePattern", FileIO.match().filepattern(inputFileSpec()))
        .apply("ReadMatches", FileIO.readMatches())
        .apply(
            "ReadCsvWithHeaders",
            ParDo.of(new GetCsvHeadersFn(headerTag(), lineTag(), csvFormat(), delimiter()))
                .withOutputTags(headerTag(), TupleTagList.of(lineTag())));
  }

  return PCollectionTuple.of(
      lineTag(), input.apply("ReadCsvWithoutHeaders", TextIO.read().from(inputFileSpec())));
}
Example #9
Source File: ParDoTest.java From beam with Apache License 2.0
@Test
public void testTaggedOutputUnregisteredExplicitCoder() throws Exception {
  pipeline.enableAbandonedNodeEnforcement(false);

  PCollection<Integer> input = pipeline.apply(Create.of(Arrays.asList(1, 2, 3)));

  final TupleTag<Integer> mainOutputTag = new TupleTag<>("main");
  final TupleTag<TestDummy> additionalOutputTag = new TupleTag<>("unregisteredSide");
  ParDo.MultiOutput<Integer, Integer> pardo =
      ParDo.of(new TaggedOutputDummyFn(mainOutputTag, additionalOutputTag))
          .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag));
  PCollectionTuple outputTuple = input.apply(pardo);

  outputTuple.get(additionalOutputTag).setCoder(new TestDummyCoder());
  outputTuple.get(additionalOutputTag).apply(View.asSingleton());

  assertEquals(new TestDummyCoder(), outputTuple.get(additionalOutputTag).getCoder());
  outputTuple
      .get(additionalOutputTag)
      .finishSpecifyingOutput("ParDo", input, pardo); // Check for crashes
  assertEquals(
      new TestDummyCoder(),
      outputTuple.get(additionalOutputTag).getCoder()); // Check for corruption
}
Example #10
Source File: IndexerPipeline.java From dataflow-opinion-analysis with Apache License 2.0
/**
 * @param filteredIndexes
 * @return
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
    PCollection<ContentIndexSummary> filteredIndexes, Float ratio) {

  PCollectionTuple splitAB =
      filteredIndexes.apply(
          ParDo.of(new SplitAB(ratio))
              .withOutputTags(PipelineTags.BranchA, TupleTagList.of(PipelineTags.BranchB)));

  PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
  PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);

  PCollection<ContentIndexSummary> enrichedBCol =
      branchBCol.apply(ParDo.of(new EnrichWithCNLPEntities()));

  // Merge all collections with WebResource table records
  PCollectionList<ContentIndexSummary> contentIndexSummariesList =
      PCollectionList.of(branchACol).and(enrichedBCol);
  PCollection<ContentIndexSummary> allIndexSummaries =
      contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

  filteredIndexes = allIndexSummaries;
  return filteredIndexes;
}
Example #11
Source File: ParDoTranslationTest.java From beam with Apache License 2.0
@Test
public void testFinishBundle() throws Exception {
  Pipeline p = Pipeline.create();
  SdkComponents sdkComponents = SdkComponents.create();
  sdkComponents.registerEnvironment(Environments.createDockerEnvironment("java"));
  ParDoPayload payload =
      ParDoTranslation.translateParDo(
          ParDo.of(new FinishBundleDoFn())
              .withOutputTags(new TupleTag<>(), TupleTagList.empty()),
          PCollection.createPrimitiveOutputInternal(
              p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, StringUtf8Coder.of()),
          DoFnSchemaInformation.create(),
          TestPipeline.create(),
          sdkComponents);

  assertTrue(payload.getRequestsFinalization());
}
Example #12
Source File: Task.java From beam with Apache License 2.0
static PCollectionTuple applyTransform(
    PCollection<Integer> numbers,
    TupleTag<Integer> numBelow100Tag,
    TupleTag<Integer> numAbove100Tag) {

  return numbers.apply(
      ParDo.of(new DoFn<Integer, Integer>() {

        @ProcessElement
        public void processElement(@Element Integer number, MultiOutputReceiver out) {
          if (number <= 100) {
            out.get(numBelow100Tag).output(number);
          } else {
            out.get(numAbove100Tag).output(number);
          }
        }

      }).withOutputTags(numBelow100Tag, TupleTagList.of(numAbove100Tag)));
}
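For context, a caller might exercise Example #12's applyTransform like this (a hypothetical usage sketch; the input values and pipeline setup are illustrative, not part of the original task):

TupleTag<Integer> numBelow100Tag = new TupleTag<Integer>() {};
TupleTag<Integer> numAbove100Tag = new TupleTag<Integer>() {};

PCollection<Integer> numbers = pipeline.apply(Create.of(10, 50, 120, 200));
PCollectionTuple outputs = applyTransform(numbers, numBelow100Tag, numAbove100Tag);

// Each tagged output is an independent PCollection.
PAssert.that(outputs.get(numBelow100Tag)).containsInAnyOrder(10, 50);
PAssert.that(outputs.get(numAbove100Tag)).containsInAnyOrder(120, 200);
pipeline.run();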
Example #13
Source File: JsonToRow.java From beam with Apache License 2.0
@Override
public ParseResult expand(PCollection<String> jsonStrings) {

  PCollectionTuple result =
      jsonStrings.apply(
          ParDo.of(ParseWithError.create(this))
              .withOutputTags(PARSED_LINE, TupleTagList.of(PARSE_ERROR)));

  PCollection<Row> failures;

  if (getExtendedErrorInfo()) {
    failures =
        result.get(PARSE_ERROR).setRowSchema(JsonToRowWithErrFn.ERROR_ROW_WITH_ERR_MSG_SCHEMA);
  } else {
    failures = result.get(PARSE_ERROR).setRowSchema(JsonToRowWithErrFn.ERROR_ROW_SCHEMA);
  }

  return ParseResult.resultBuilder()
      .setCallingPipeline(jsonStrings.getPipeline())
      .setJsonToRowWithErrFn(this)
      .setParsedLine(result.get(PARSED_LINE).setRowSchema(this.getSchema()))
      .setFailedParse(failures)
      .build();
}
Example #14
Source File: PTransformTranslationTest.java From beam with Apache License 2.0
private static AppliedPTransform<?, ?, ?> multiMultiParDo(Pipeline pipeline) {
  PCollectionView<String> view = pipeline.apply(Create.of("foo")).apply(View.asSingleton());
  PCollection<Long> input = pipeline.apply(GenerateSequence.from(0));
  ParDo.MultiOutput<Long, KV<Long, String>> parDo =
      ParDo.of(new TestDoFn())
          .withSideInputs(view)
          .withOutputTags(
              new TupleTag<KV<Long, String>>() {},
              TupleTagList.of(new TupleTag<KV<String, Long>>() {}));
  PCollectionTuple output = input.apply(parDo);

  Map<TupleTag<?>, PValue> inputs = new HashMap<>();
  inputs.putAll(parDo.getAdditionalInputs());
  inputs.putAll(input.expand());

  return AppliedPTransform
      .<PCollection<Long>, PCollectionTuple, ParDo.MultiOutput<Long, KV<Long, String>>>of(
          "MultiParDoInAndOut", inputs, output.expand(), parDo, pipeline);
}
Example #15
Source File: BatchStatefulParDoOverridesTest.java From beam with Apache License 2.0
@Test
@Ignore(
    "TODO: BEAM-2902 Add support for user state in a ParDo.Multi once PTransformMatcher "
        + "exposes a way to know when the replacement is not required by checking that the "
        + "preceding ParDos to a GBK are key preserving.")
public void testFnApiMultiOutputOverrideNonCrashing() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions("--experiments=beam_fn_api");
  options.setRunner(DataflowRunner.class);
  Pipeline pipeline = Pipeline.create(options);

  TupleTag<Integer> mainOutputTag = new TupleTag<Integer>() {};
  TupleTag<Integer> sideOutputTag = new TupleTag<Integer>() {};

  DummyStatefulDoFn fn = new DummyStatefulDoFn();
  pipeline
      .apply(Create.of(KV.of(1, 2)))
      .apply(ParDo.of(fn).withOutputTags(mainOutputTag, TupleTagList.of(sideOutputTag)));

  DataflowRunner runner = DataflowRunner.fromOptions(options);
  runner.replaceTransforms(pipeline);
  assertThat(findBatchStatefulDoFn(pipeline), equalTo((DoFn) fn));
}
Example #16
Source File: Partition.java From beam with Apache License 2.0
@Override
public PCollectionList<T> expand(PCollection<T> in) {
  final TupleTagList outputTags = partitionDoFn.getOutputTags();

  PCollectionTuple outputs =
      in.apply(
          ParDo.of(partitionDoFn)
              .withOutputTags(new TupleTag<Void>() {}, outputTags)
              .withSideInputs(partitionDoFn.getSideInputs()));

  PCollectionList<T> pcs = PCollectionList.empty(in.getPipeline());
  Coder<T> coder = in.getCoder();

  for (TupleTag<?> outputTag : outputTags.getAll()) {
    // All the tuple tags are actually TupleTag<T>
    // And all the collections are actually PCollection<T>
    @SuppressWarnings("unchecked")
    TupleTag<T> typedOutputTag = (TupleTag<T>) outputTag;
    pcs = pcs.and(outputs.get(typedOutputTag).setCoder(coder));
  }
  return pcs;
}
Example #17
Source File: Partition.java From beam with Apache License 2.0
/**
 * Constructs a PartitionDoFn.
 *
 * @throws IllegalArgumentException if {@code numPartitions <= 0}
 */
private PartitionDoFn(
    int numPartitions,
    Contextful<Contextful.Fn<X, Integer>> ctxFn,
    Object originalFnClassForDisplayData) {
  this.ctxFn = ctxFn;
  this.originalFnClassForDisplayData = originalFnClassForDisplayData;

  if (numPartitions <= 0) {
    throw new IllegalArgumentException("numPartitions must be > 0");
  }
  this.numPartitions = numPartitions;

  TupleTagList buildOutputTags = TupleTagList.empty();
  for (int partition = 0; partition < numPartitions; partition++) {
    buildOutputTags = buildOutputTags.and(new TupleTag<X>());
  }
  outputTags = buildOutputTags;
}
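The two Partition.java examples above (#16 and #17) show the internals: the constructor allocates one TupleTag per partition, and expand unpacks the resulting PCollectionTuple into a PCollectionList. User code never touches those tags directly; a typical call looks like this (a minimal sketch with an illustrative partitioning function and input):

// Route each number to one of 3 partitions by modulo. Partition builds the
// TupleTagList internally, as in the constructor above.
PCollectionList<Integer> parts =
    numbers.apply(
        Partition.of(3, (Integer number, int numPartitions) -> number % numPartitions));
PCollection<Integer> remainderZero = parts.get(0);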
Example #18
Source File: PassThroughThenCleanup.java From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<T> input) {
  TupleTag<T> mainOutput = new TupleTag<>();
  TupleTag<Void> cleanupSignal = new TupleTag<>();
  PCollectionTuple outputs =
      input.apply(
          ParDo.of(new IdentityFn<T>())
              .withOutputTags(mainOutput, TupleTagList.of(cleanupSignal)));

  PCollectionView<Iterable<Void>> cleanupSignalView =
      outputs.get(cleanupSignal).setCoder(VoidCoder.of()).apply(View.asIterable());

  input
      .getPipeline()
      .apply("Create(CleanupOperation)", Create.of(cleanupOperation))
      .apply(
          "Cleanup",
          ParDo.of(
                  new DoFn<CleanupOperation, Void>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) throws Exception {
                      c.element().cleanup(new ContextContainer(c, jobIdSideInput));
                    }
                  })
              .withSideInputs(jobIdSideInput, cleanupSignalView));

  return outputs.get(mainOutput).setCoder(input.getCoder());
}
Example #19
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void parDoSplittable() {
  AppliedPTransform<?, ?, ?> parDoApplication =
      getAppliedTransform(
          ParDo.of(splittableDoFn).withOutputTags(new TupleTag<>(), TupleTagList.empty()));

  assertThat(PTransformMatchers.splittableParDo().matches(parDoApplication), is(true));
  assertThat(PTransformMatchers.stateOrTimerParDoMulti().matches(parDoApplication), is(false));
  assertThat(PTransformMatchers.splittableParDoSingle().matches(parDoApplication), is(false));
  assertThat(PTransformMatchers.stateOrTimerParDoSingle().matches(parDoApplication), is(false));
}
Example #20
Source File: CoGbkResultTest.java From beam with Apache License 2.0
private CoGbkResultSchema createSchema(int size) {
  List<TupleTag<?>> tags = new ArrayList<>();
  for (int i = 0; i < size; i++) {
    tags.add(new TupleTag<Integer>("tag" + i));
  }
  return new CoGbkResultSchema(TupleTagList.of(tags));
}
Example #21
Source File: KeyedPCollectionTuple.java From beam with Apache License 2.0
KeyedPCollectionTuple(
    Pipeline pipeline,
    List<TaggedKeyedPCollection<K, ?>> keyedCollections,
    TupleTagList tupleTagList,
    @Nullable Coder<K> keyCoder) {
  this.pipeline = pipeline;
  this.keyedCollections = keyedCollections;
  this.schema = new CoGbkResultSchema(tupleTagList);
  this.keyCoder = keyCoder;
}
Example #22
Source File: FlattenTest.java From beam with Apache License 2.0
@Test
@Category(ValidatesRunner.class)
public void testFlattenMultiplePCollectionsHavingMultipleConsumers() {
  PCollection<String> input = p.apply(Create.of("AA", "BBB", "CC"));
  final TupleTag<String> outputEvenLengthTag = new TupleTag<String>() {};
  final TupleTag<String> outputOddLengthTag = new TupleTag<String>() {};

  PCollectionTuple tuple =
      input.apply(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                      if (c.element().length() % 2 == 0) {
                        c.output(c.element());
                      } else {
                        c.output(outputOddLengthTag, c.element());
                      }
                    }
                  })
              .withOutputTags(outputEvenLengthTag, TupleTagList.of(outputOddLengthTag)));

  PCollection<String> outputEvenLength = tuple.get(outputEvenLengthTag);
  PCollection<String> outputOddLength = tuple.get(outputOddLengthTag);

  PCollection<String> outputMerged =
      PCollectionList.of(outputEvenLength).and(outputOddLength).apply(Flatten.pCollections());

  PAssert.that(outputMerged).containsInAnyOrder("AA", "BBB", "CC");
  PAssert.that(outputEvenLength).containsInAnyOrder("AA", "CC");
  PAssert.that(outputOddLength).containsInAnyOrder("BBB");
  p.run();
}
Example #23
Source File: FlinkRequiresStableInputTest.java From beam with Apache License 2.0
private static Pipeline createPipeline(
    PipelineOptions options, String singleOutputPrefix, String multiOutputPrefix) {
  Pipeline p = Pipeline.create(options);

  SerializableFunction<Void, Void> firstTime =
      (SerializableFunction<Void, Void>)
          value -> {
            latch.countDown();
            return null;
          };

  PCollection<String> impulse = p.apply("CreatePCollectionOfOneValue", Create.of(VALUE));
  impulse
      .apply(
          "Single-PairWithRandomKey",
          MapElements.via(new RequiresStableInputIT.PairWithRandomKeyFn()))
      .apply(
          "Single-MakeSideEffectAndThenFail",
          ParDo.of(
              new RequiresStableInputIT.MakeSideEffectAndThenFailFn(
                  singleOutputPrefix, firstTime)));
  impulse
      .apply(
          "Multi-PairWithRandomKey",
          MapElements.via(new RequiresStableInputIT.PairWithRandomKeyFn()))
      .apply(
          "Multi-MakeSideEffectAndThenFail",
          ParDo.of(
                  new RequiresStableInputIT.MakeSideEffectAndThenFailFn(
                      multiOutputPrefix, firstTime))
              .withOutputTags(new TupleTag<>(), TupleTagList.empty()));
  return p;
}
Example #24
Source File: TextTableProvider.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<String> input) {
  PCollectionTuple rows =
      input.apply(
          ParDo.of(
                  new DoFn<String, Row>() {
                    @ProcessElement
                    public void processElement(ProcessContext context) {
                      try {
                        context.output(jsonToRow(getObjectMapper(), context.element()));
                      } catch (UnsupportedRowJsonException jsonException) {
                        if (deadLetterFile() != null) {
                          context.output(DLF_TAG, context.element());
                        } else {
                          throw new RuntimeException("Error parsing JSON", jsonException);
                        }
                      }
                    }
                  })
              .withOutputTags(
                  MAIN_TAG,
                  deadLetterFile() != null ? TupleTagList.of(DLF_TAG) : TupleTagList.empty()));

  if (deadLetterFile() != null) {
    rows.get(DLF_TAG).setCoder(StringUtf8Coder.of()).apply(writeJsonToDlf());
  }
  return rows.get(MAIN_TAG).setRowSchema(schema());
}
Example #25
Source File: PubsubMessageToRow.java From beam with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<PubsubMessage> input) {
  PCollectionTuple rows =
      input.apply(
          ParDo.of(
                  useFlatSchema()
                      ? new FlatSchemaPubsubMessageToRoW(messageSchema(), useDlq())
                      : new NestedSchemaPubsubMessageToRow(messageSchema(), useDlq()))
              .withOutputTags(
                  MAIN_TAG, useDlq() ? TupleTagList.of(DLQ_TAG) : TupleTagList.empty()));

  rows.get(MAIN_TAG).setRowSchema(messageSchema());

  return rows;
}
Example #26
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void parDoMulti() {
  AppliedPTransform<?, ?, ?> parDoApplication =
      getAppliedTransform(
          ParDo.of(doFn).withOutputTags(new TupleTag<>(), TupleTagList.empty()));

  assertThat(PTransformMatchers.splittableParDoMulti().matches(parDoApplication), is(false));
  assertThat(PTransformMatchers.stateOrTimerParDoMulti().matches(parDoApplication), is(false));
  assertThat(PTransformMatchers.splittableParDoSingle().matches(parDoApplication), is(false));
  assertThat(PTransformMatchers.stateOrTimerParDoSingle().matches(parDoApplication), is(false));
}
Example #27
Source File: ParDoLifecycleTest.java From beam with Apache License 2.0
@Test
@Category({ValidatesRunner.class, UsesStatefulParDo.class, UsesParDoLifecycle.class})
public void testFnCallSequenceStateful() {
  PCollectionList.of(
          p.apply("Impolite", Create.of(KV.of("a", 1), KV.of("b", 2), KV.of("a", 4))))
      .and(
          p.apply(
              "Polite", Create.of(KV.of("b", 3), KV.of("a", 5), KV.of("c", 6), KV.of("c", 7))))
      .apply(Flatten.pCollections())
      .apply(
          ParDo.of(new CallSequenceEnforcingStatefulFn<String, Integer>())
              .withOutputTags(new TupleTag<KV<String, Integer>>() {}, TupleTagList.empty()));
  p.run();
}
Example #28
Source File: BigQueryConverters.java From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<TableRow> input) {

  PCollectionTuple udfOut;

  PCollectionTuple failsafeTableRows =
      input.apply(
          "TableRowToFailsafeElement",
          ParDo.of(new TableRowToFailsafeElementFn(transformDeadletterOutTag()))
              .withOutputTags(transformOutTag(), TupleTagList.of(transformDeadletterOutTag())));

  // Use Udf to parse table rows if supplied.
  if (options().getJavascriptTextTransformGcsPath() != null) {
    udfOut =
        failsafeTableRows
            .get(transformOutTag())
            .apply(
                "ProcessFailsafeRowsUdf",
                JavascriptTextTransformer.FailsafeJavascriptUdf.<TableRow>newBuilder()
                    .setFileSystemPath(options().getJavascriptTextTransformGcsPath())
                    .setFunctionName(options().getJavascriptTextTransformFunctionName())
                    .setSuccessTag(udfOutTag())
                    .setFailureTag(udfDeadletterOutTag())
                    .build());

    PCollection<FailsafeElement<TableRow, String>> failedOut =
        PCollectionList.of(udfOut.get(udfDeadletterOutTag()))
            .and(failsafeTableRows.get(transformDeadletterOutTag()))
            .apply("FlattenFailedOut", Flatten.pCollections());

    return PCollectionTuple.of(transformOutTag(), udfOut.get(udfOutTag()))
        .and(transformDeadletterOutTag(), failedOut);
  } else {
    return failsafeTableRows;
  }
}
Example #29
Source File: QueryablePipelineTest.java From beam with Apache License 2.0
@Test
public void retainOnlyPrimitivesWithOnlyPrimitivesUnchanged() {
  Pipeline p = Pipeline.create();
  p.apply("Read", Read.from(CountingSource.unbounded()))
      .apply(
          "multi-do",
          ParDo.of(new TestFn()).withOutputTags(new TupleTag<>(), TupleTagList.empty()));

  Components originalComponents = PipelineTranslation.toProto(p).getComponents();
  Collection<String> primitiveComponents =
      QueryablePipeline.getPrimitiveTransformIds(originalComponents);

  assertThat(primitiveComponents, equalTo(originalComponents.getTransformsMap().keySet()));
}