Java Code Examples for org.apache.beam.sdk.values.PCollectionTuple#of()
The following examples show how to use org.apache.beam.sdk.values.PCollectionTuple#of().
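Before the project examples, here is a minimal standalone sketch of how PCollectionTuple.of() is typically combined with and() and get(). It is not taken from any of the projects below; the class name, tag names, and element values are illustrative assumptions.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;

public class PCollectionTupleOfSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create();

    // Anonymous subclasses preserve the element type information of each tag.
    TupleTag<String> wordsTag = new TupleTag<String>() {};
    TupleTag<Integer> countsTag = new TupleTag<Integer>() {};

    PCollection<String> words = pipeline.apply("Words", Create.of("a", "b"));
    PCollection<Integer> counts = pipeline.apply("Counts", Create.of(1, 2));

    // of() starts the tuple with one tagged PCollection; and() adds further branches.
    PCollectionTuple tuple = PCollectionTuple.of(wordsTag, words).and(countsTag, counts);

    // get() retrieves a branch by its tag, keeping the element type.
    PCollection<String> wordsAgain = tuple.get(wordsTag);

    pipeline.run().waitUntilFinish();
  }
}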
Example 1
Source File: CsvConverters.java from DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PBegin input) {
  if (hasHeaders()) {
    return input
        .apply("MatchFilePattern", FileIO.match().filepattern(inputFileSpec()))
        .apply("ReadMatches", FileIO.readMatches())
        .apply(
            "ReadCsvWithHeaders",
            ParDo.of(new GetCsvHeadersFn(headerTag(), lineTag(), csvFormat(), delimiter()))
                .withOutputTags(headerTag(), TupleTagList.of(lineTag())));
  }

  return PCollectionTuple.of(
      lineTag(), input.apply("ReadCsvWithoutHeaders", TextIO.read().from(inputFileSpec())));
}
Example 2
Source File: BeamSqlDslArrayTest.java from beam with Apache License 2.0
@Test
public void testUnnestLiteral() {
  PCollection<Row> input =
      pipeline.apply(
          "boundedInput1",
          Create.empty(TypeDescriptor.of(Row.class)).withRowSchema(INPUT_SCHEMA));

  // Because we have a multi-part FROM the DSL considers it multi-input
  TupleTag<Row> mainTag = new TupleTag<Row>("main") {};
  PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input);

  Schema resultType = Schema.builder().addStringField("f_string").build();

  PCollection<Row> result =
      inputTuple.apply(
          "sqlQuery", SqlTransform.query("SELECT * FROM UNNEST (ARRAY ['a', 'b', 'c'])"));

  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues("a").build(),
          Row.withSchema(resultType).addValues("b").build(),
          Row.withSchema(resultType).addValues("c").build());

  pipeline.run();
}
Example 3
Source File: BeamSqlDslArrayTest.java from beam with Apache License 2.0
@Test
public void testUnnestNamedLiteral() {
  PCollection<Row> input =
      pipeline.apply(
          "boundedInput1",
          Create.empty(TypeDescriptor.of(Row.class)).withRowSchema(INPUT_SCHEMA));

  // Because we have a multi-part FROM the DSL considers it multi-input
  TupleTag<Row> mainTag = new TupleTag<Row>("main") {};
  PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input);

  Schema resultType = Schema.builder().addStringField("f_string").build();

  PCollection<Row> result =
      inputTuple.apply(
          "sqlQuery",
          SqlTransform.query("SELECT * FROM UNNEST (ARRAY ['a', 'b', 'c']) AS t(f_string)"));

  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues("a").build(),
          Row.withSchema(resultType).addValues("b").build(),
          Row.withSchema(resultType).addValues("c").build());

  pipeline.run();
}
Example 4
Source File: CsvConvertersTest.java from DataflowTemplates with Apache License 2.0
/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a JSON schema.
 */
@Test
public void testLineToFailsafeJsonNoHeadersJsonSchema() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(RECORD_STRING).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJson",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(",")
              .setUdfFileSystemPath(null)
              .setUdfFunctionName(null)
              .setJsonSchemaPath(TEST_JSON_SCHEMA__PATH)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<String, String> result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
Example 5
Source File: BeamSqlDslArrayTest.java from beam with Apache License 2.0
@Test
public void testUnnestCrossJoin() {
  Row row1 =
      Row.withSchema(INPUT_SCHEMA)
          .addValues(42)
          .addArray(Arrays.asList("111", "222", "333"))
          .build();
  Row row2 =
      Row.withSchema(INPUT_SCHEMA).addValues(13).addArray(Arrays.asList("444", "555")).build();

  PCollection<Row> input =
      pipeline.apply("boundedInput1", Create.of(row1, row2).withRowSchema(INPUT_SCHEMA));

  // Because we have a multi-part FROM the DSL considers it multi-input
  TupleTag<Row> mainTag = new TupleTag<Row>("main") {};
  PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input);

  Schema resultType =
      Schema.builder().addInt32Field("f_int").addStringField("f_string").build();

  PCollection<Row> result =
      inputTuple.apply(
          "sqlQuery",
          SqlTransform.query(
              "SELECT f_int, arrElems.f_string FROM main "
                  + " CROSS JOIN UNNEST (main.f_stringArr) AS arrElems(f_string)"));

  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues(42, "111").build(),
          Row.withSchema(resultType).addValues(42, "222").build(),
          Row.withSchema(resultType).addValues(42, "333").build(),
          Row.withSchema(resultType).addValues(13, "444").build(),
          Row.withSchema(resultType).addValues(13, "555").build());

  pipeline.run();
}
Example 6
Source File: PipelineTest.java from beam with Apache License 2.0
/** Tests that Pipeline supports pulling an element out of a tuple as a transform. */
@Test
@Category(ValidatesRunner.class)
public void testTupleProjectionTransform() throws Exception {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3, 4));

  TupleTag<Integer> tag = new TupleTag<>();
  PCollectionTuple tuple = PCollectionTuple.of(tag, input);

  PCollection<Integer> output = tuple.apply("ProjectTag", new TupleProjectionTransform<>(tag));

  PAssert.that(output).containsInAnyOrder(1, 2, 3, 4);
  pipeline.run();
}
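The TupleProjectionTransform used above is a test-local helper whose body is not shown on this page. A minimal sketch of such a projection transform is below; the field and constructor names are assumptions for illustration and may differ from the class in PipelineTest.

// Hypothetical sketch: pulls one tagged branch back out of a PCollectionTuple.
// Field and constructor names are assumptions, not the test's exact code.
static class TupleProjectionTransform<T> extends PTransform<PCollectionTuple, PCollection<T>> {
  private final TupleTag<T> tag;

  TupleProjectionTransform(TupleTag<T> tag) {
    this.tag = tag;
  }

  @Override
  public PCollection<T> expand(PCollectionTuple input) {
    // get() looks up the branch registered under this tag.
    return input.get(tag);
  }
}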
Example 7
Source File: CsvConvertersTest.java from DataflowTemplates with Apache License 2.0
/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a Javascript Udf. Udf processing is handled by {@link
 * JavascriptTextTransformer}.
 */
@Test
public void testLineToFailsafeJsonNoHeadersUdf() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(RECORD_STRING).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  CsvConverters.CsvPipelineOptions options =
      PipelineOptionsFactory.create().as(CsvConverters.CsvPipelineOptions.class);

  options.setDelimiter(",");
  options.setJavascriptTextTransformGcsPath(TRANSFORM_FILE_PATH);
  options.setJavascriptTextTransformFunctionName("transform");

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJsonNoHeadersUdf",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(options.getDelimiter())
              .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
              .setJsonSchemaPath(options.getJsonSchemaPath())
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<String, String> result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(JSON_STRING_RECORD)));
            return null;
          });

  PAssert.that(failsafe.get(PROCESSING_DEADLETTER_OUT)).empty();

  pipeline.run();
}
Example 8
Source File: CsvConvertersTest.java from DataflowTemplates with Apache License 2.0
/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a Javascript Udf. Udf processing is handled by {@link
 * JavascriptTextTransformer}. Should output record to deadletter table tag.
 */
@Test
public void testLineToFailsafeJsonNoHeadersUdfDeadletter() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(BAD_JSON_STRING_RECORD).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  CsvConverters.CsvPipelineOptions options =
      PipelineOptionsFactory.create().as(CsvConverters.CsvPipelineOptions.class);

  options.setDelimiter(",");
  options.setJavascriptTextTransformGcsPath(SCRIPT_PARSE_EXCEPTION_FILE_PATH);
  options.setJavascriptTextTransformFunctionName("transform");

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJsonNoHeadersUdfBad",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(options.getDelimiter())
              .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
              .setJsonSchemaPath(options.getJsonSchemaPath())
              .setJsonSchemaPath(null)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT)).empty();

  PAssert.that(failsafe.get(PROCESSING_DEADLETTER_OUT))
      .satisfies(
          collection -> {
            FailsafeElement result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(BAD_JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
Example 9
Source File: PipelineTest.java from beam with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<T> input) {
  return PCollectionTuple.of(tag, input);
}
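This fragment is the expand() method of a small test transform that wraps a single PCollection into a one-entry PCollectionTuple. A sketch of a complete enclosing class is shown below; the class and field names are assumptions for illustration and are not necessarily those used in PipelineTest.

// Hypothetical enclosing class for the expand() fragment above: wraps one PCollection
// into a single-entry PCollectionTuple under a caller-supplied tag. Names are assumptions.
static class TupleInjectionTransform<T> extends PTransform<PCollection<T>, PCollectionTuple> {
  private final TupleTag<T> tag;

  TupleInjectionTransform(TupleTag<T> tag) {
    this.tag = tag;
  }

  @Override
  public PCollectionTuple expand(PCollection<T> input) {
    return PCollectionTuple.of(tag, input);
  }
}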
Example 10
Source File: TestUtils.java from beam with Apache License 2.0
public static <T> PCollectionTuple tuple(String tag, PCollection<T> pCollection) {
  return PCollectionTuple.of(new TupleTag<>(tag), pCollection);
}
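A hypothetical way this helper might be used in a SQL test is sketched below; the schema, values, and query are illustrative and not part of TestUtils. When a tuple built this way is applied to SqlTransform, the tag string becomes the table name the query can reference, which is also why the SQL DSL tests above tag their input as "main".

// Hypothetical usage sketch; the tag string "MAIN" is the table name visible to the query.
Schema schema = Schema.builder().addInt32Field("f_int").build();
PCollection<Row> rows =
    pipeline.apply(
        Create.of(Row.withSchema(schema).addValues(1).build()).withRowSchema(schema));

PCollectionTuple inputs = TestUtils.tuple("MAIN", rows);
PCollection<Row> result = inputs.apply(SqlTransform.query("SELECT f_int FROM MAIN"));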