Java Code Examples for org.apache.beam.sdk.coders.CoderRegistry#registerCoderForType()
The following examples show how to use
org.apache.beam.sdk.coders.CoderRegistry#registerCoderForType().
The original project and source file for each example are noted above it.
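Before the examples, here is a minimal, self-contained sketch of the pattern they all share: fetch the pipeline's CoderRegistry and map a TypeDescriptor to a concrete Coder, so that downstream transforms can infer the coder rather than each needing an explicit .setCoder(...) call. The KV<String, String> mapping below is illustrative only and does not come from any of the projects that follow.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.CoderRegistry;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.TypeDescriptors;

public class RegisterCoderSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Map the KV<String, String> type to a concrete coder once, for the whole pipeline.
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(
        TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.strings()),
        KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));

    // ... apply transforms that produce KV<String, String> without calling .setCoder() ...

    pipeline.run();
  }
}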
Example 1
Source File: CsvConvertersTest.java From DataflowTemplates with Apache License 2.0

/** Tests that if different headers are found an exception is thrown. */
@Test(expected = RuntimeException.class)
public void testDifferentHeaders() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> headers =
      pipeline.apply("CreateInput", Create.of(HEADER_STRING, "wrong,header,thing\n"));
  PCollection<String> lines = pipeline.apply("Create lines", Create.of(RECORD_STRING));

  PCollectionTuple readCsvHeadersOut =
      PCollectionTuple.of(CSV_HEADERS, headers).and(CSV_LINES, lines);

  PCollectionTuple test =
      readCsvHeadersOut.apply(
          "TestDifferentHeaders",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(",")
              .setUdfFileSystemPath(null)
              .setUdfFunctionName(null)
              .setJsonSchemaPath(null)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .setUdfOutputTag(PROCESSING_OUT)
              .build());

  pipeline.run();
}
Example 2
Source File: CsvConvertersTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a JSON schema.
 */
@Test
public void testLineToFailsafeJsonNoHeadersJsonSchema() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(RECORD_STRING).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJson",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(",")
              .setUdfFileSystemPath(null)
              .setUdfFunctionName(null)
              .setJsonSchemaPath(TEST_JSON_SCHEMA__PATH)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<String, String> result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
Example 3
Source File: PubSubToElasticsearchTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests the {@link PubSubToElasticsearch} pipeline end-to-end with an empty message payload but
 * attributes populated.
 */
@Test
public void testPubSubToElasticsearchOnlyAttributesE2E() {
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(),
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER);
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.CODER.getEncodedTypeDescriptor(), PubSubToElasticsearch.CODER);

  PubSubToElasticsearch.PubSubToElasticsearchOptions options =
      TestPipeline.testingPipelineOptions()
          .as(PubSubToElasticsearch.PubSubToElasticsearchOptions.class);

  options.setDeadletterTable("test:dataset.table");

  PCollectionTuple pc =
      pipeline
          .apply(Create.of(goodTestMessages.get(goodTestMessages.size() - 1)))
          .apply(
              PubSubToElasticsearch.PubSubMessageToJsonDocument.newBuilder()
                  .setJavascriptTextTransformFunctionName(
                      options.getJavascriptTextTransformFunctionName())
                  .setJavascriptTextTransformGcsPath(options.getJavascriptTextTransformGcsPath())
                  .build());

  PAssert.that(pc.get(PubSubToElasticsearch.TRANSFORM_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> element = collection.iterator().next();
            assertThat(
                new Gson().fromJson(element.getPayload(), HashMap.class),
                is(equalTo(element.getOriginalPayload().getAttributeMap())));
            return null;
          });

  // Execute pipeline
  pipeline.run(options);
}
Example 4
Source File: ErrorConvertersTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests that {@link ErrorConverters.FailedPubsubMessageToTableRowFn} properly formats failed
 * {@link PubsubMessage} objects into {@link TableRow} objects to save to BigQuery.
 */
@Test
public void testFailedPubsubMessageToTableRowFn() {
  // Test input
  final String payload = "Super secret";
  final String errorMessage = "Failed to parse input JSON";
  final String stacktrace = "Error at com.google.cloud.teleport.PubsubToBigQuery";

  final PubsubMessage message =
      new PubsubMessage(payload.getBytes(), ImmutableMap.of("id", "123", "type", "custom_event"));

  final FailsafeElement<PubsubMessage, String> input =
      FailsafeElement.of(message, payload)
          .setErrorMessage(errorMessage)
          .setStacktrace(stacktrace);

  final Instant timestamp =
      new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build pipeline
  PCollection<TableRow> output =
      pipeline
          .apply(
              "CreateInput",
              Create.timestamped(TimestampedValue.of(input, timestamp)).withCoder(coder))
          .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()));

  // Assert
  PAssert.that(output)
      .satisfies(
          collection -> {
            final TableRow result = collection.iterator().next();
            assertThat(result.get("timestamp"), is(equalTo("2022-02-22 22:22:22.222000")));
            assertThat(result.get("attributes"), is(notNullValue()));
            assertThat(result.get("payloadString"), is(equalTo(payload)));
            assertThat(result.get("payloadBytes"), is(notNullValue()));
            assertThat(result.get("errorMessage"), is(equalTo(errorMessage)));
            assertThat(result.get("stacktrace"), is(equalTo(stacktrace)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
Example 5
Source File: CsvToElasticsearchTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link CsvToElasticsearch} pipeline using the headers of the Csv to parse it. */
@Test
public void testCsvToElasticsearchHeadersE2E() {
  final String header = "id,state,price";
  final String record = "007,CA,26.23";
  final String stringJsonRecord = "{\"id\":\"007\",\"state\":\"CA\",\"price\":\"26.23\"}";

  final FailsafeElementCoder<String, String> coder =
      FailsafeElementCoder.of(
          NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of()));

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  CsvToElasticsearch.CsvToElasticsearchOptions options =
      PipelineOptionsFactory.create().as(CsvToElasticsearch.CsvToElasticsearchOptions.class);

  options.setContainsHeaders(true);
  options.setInputFileSpec(HEADER_CSV_FILE_PATH);

  // Build pipeline with headers.
  PCollectionTuple readCsvOut =
      pipeline
          .apply(
              "ReadCsv",
              CsvConverters.ReadCsv.newBuilder()
                  .setCsvFormat(options.getCsvFormat())
                  .setDelimiter(options.getDelimiter())
                  .setHasHeaders(options.getContainsHeaders())
                  .setInputFileSpec(options.getInputFileSpec())
                  .setHeaderTag(CsvToElasticsearch.CSV_HEADERS)
                  .setLineTag(CsvToElasticsearch.CSV_LINES)
                  .build())
          .apply(
              "ConvertLine",
              CsvConverters.LineToFailsafeJson.newBuilder()
                  .setDelimiter(options.getDelimiter())
                  .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setJsonSchemaPath(options.getJsonSchemaPath())
                  .setHeaderTag(CsvToElasticsearch.CSV_HEADERS)
                  .setLineTag(CsvToElasticsearch.CSV_LINES)
                  .setUdfOutputTag(CsvToElasticsearch.PROCESSING_OUT)
                  .setUdfDeadletterTag(CsvToElasticsearch.PROCESSING_DEADLETTER_OUT)
                  .build());

  // Assert
  PAssert.that(readCsvOut.get(CsvToElasticsearch.PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement element = collection.iterator().next();
            assertThat(element.getOriginalPayload(), is(equalTo(record)));
            assertThat(element.getPayload(), is(equalTo(stringJsonRecord)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
Example 6
Source File: KafkaToBigQuery.java From DataflowTemplates with Apache License 2.0

/**
 * Runs the pipeline to completion with the specified options. This method does not wait until the
 * pipeline is finished before returning. Invoke {@code result.waitUntilFinish()} on the result
 * object to block until the pipeline is finished running if blocking programmatic execution is
 * required.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(Options options) {
  Pipeline pipeline = Pipeline.create(options);

  // Register the coder for pipeline
  FailsafeElementCoder<KV<String, String>, String> coder =
      FailsafeElementCoder.of(
          KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  /*
   * Steps:
   *  1) Read messages in from Kafka
   *  2) Transform the Kafka Messages into TableRows
   *     - Transform message payload via UDF
   *     - Convert UDF result to TableRow objects
   *  3) Write successful records out to BigQuery
   *  4) Write failed records out to BigQuery
   */
  PCollectionTuple transformOut =
      pipeline
          /*
           * Step #1: Read messages in from Kafka
           */
          .apply(
              "ReadFromKafka",
              KafkaIO.<String, String>read()
                  .withBootstrapServers(options.getBootstrapServers())
                  .withTopic(options.getInputTopic())
                  .withKeyDeserializer(StringDeserializer.class)
                  .withValueDeserializer(StringDeserializer.class)
                  // NumSplits is hard-coded to 1 for single-partition use cases (e.g., Debezium
                  // Change Data Capture). Once Dataflow dynamic templates are available, this can
                  // be deprecated.
                  .withNumSplits(1)
                  .withoutMetadata())
          /*
           * Step #2: Transform the Kafka Messages into TableRows
           */
          .apply("ConvertMessageToTableRow", new MessageToTableRow(options));

  /*
   * Step #3: Write the successful records out to BigQuery
   */
  transformOut
      .get(TRANSFORM_OUT)
      .apply(
          "WriteSuccessfulRecords",
          BigQueryIO.writeTableRows()
              .withoutValidation()
              .withCreateDisposition(CreateDisposition.CREATE_NEVER)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND)
              .to(options.getOutputTableSpec()));

  /*
   * Step #4: Write failed records out to BigQuery
   */
  PCollectionList.of(transformOut.get(UDF_DEADLETTER_OUT))
      .and(transformOut.get(TRANSFORM_DEADLETTER_OUT))
      .apply("Flatten", Flatten.pCollections())
      .apply(
          "WriteFailedRecords",
          WriteKafkaMessageErrors.newBuilder()
              .setErrorRecordsTable(
                  ValueProviderUtils.maybeUseDefaultDeadletterTable(
                      options.getOutputDeadletterTable(),
                      options.getOutputTableSpec(),
                      DEFAULT_DEADLETTER_TABLE_SUFFIX))
              .setErrorRecordsTableSchema(ResourceUtils.getDeadletterTableSchemaJson())
              .build());

  return pipeline.run();
}
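As the Javadoc notes, run() returns without waiting for completion; a caller that needs blocking execution waits on the returned PipelineResult. A hypothetical entry point (the option parsing here is an assumption, not part of the excerpt) could look like:

public static void main(String[] args) {
  // Parse and validate command-line options (assumed setup, not shown in the excerpt above).
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);

  PipelineResult result = run(options);

  // Block until the pipeline finishes, as the Javadoc for run() recommends.
  result.waitUntilFinish();
}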
Example 7
Source File: BigQueryConvertersTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests the {@link BigQueryConverters.FailsafeJsonToTableRow} transform with invalid JSON input.
 */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJsonToTableRowInvalidJSON() {
  // Test input
  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "JsonToTableRow",
              FailsafeJsonToTableRow.<PubsubMessage>newBuilder()
                  .setSuccessTag(TABLE_ROW_TAG)
                  .setFailureTag(FAILSAFE_ELM_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(TABLE_ROW_TAG)).empty();
  PAssert.that(output.get(FAILSAFE_ELM_TAG))
      .satisfies(
          collection -> {
            final FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            // Check the individual elements of the PubsubMessage since the message above won't be
            // serializable.
            assertThat(new String(result.getOriginalPayload().getPayload())).isEqualTo(payload);
            assertThat(result.getOriginalPayload().getAttributeMap()).isEqualTo(attributes);
            assertThat(result.getPayload()).isEqualTo(payload);
            assertThat(result.getErrorMessage()).isNotNull();
            assertThat(result.getStacktrace()).isNotNull();
            return null;
          });

  // Execute the test
  pipeline.run();
}
Example 8
Source File: CsvConvertersTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a Javascript Udf. Udf processing is handled by {@link
 * JavascriptTextTransformer}. Should output record to deadletter table tag.
 */
@Test
public void testLineToFailsafeJsonNoHeadersUdfDeadletter() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(BAD_JSON_STRING_RECORD).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  CsvConverters.CsvPipelineOptions options =
      PipelineOptionsFactory.create().as(CsvConverters.CsvPipelineOptions.class);

  options.setDelimiter(",");
  options.setJavascriptTextTransformGcsPath(SCRIPT_PARSE_EXCEPTION_FILE_PATH);
  options.setJavascriptTextTransformFunctionName("transform");

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJsonNoHeadersUdfBad",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(options.getDelimiter())
              .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
              .setJsonSchemaPath(null)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT)).empty();
  PAssert.that(failsafe.get(PROCESSING_DEADLETTER_OUT))
      .satisfies(
          collection -> {
            FailsafeElement result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(BAD_JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
Example 9
Source File: ErrorConvertersTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests that {@link ErrorConverters.FailedStringToTableRowFn} properly formats failed String
 * objects into {@link TableRow} objects to save to BigQuery.
 */
@Test
public void testFailedStringMessageToTableRowFn() {
  // Test input
  final String message = "Super secret";
  final String errorMessage = "Failed to parse input JSON";
  final String stacktrace = "Error at com.google.cloud.teleport.TextToBigQueryStreaming";

  final FailsafeElement<String, String> input =
      FailsafeElement.of(message, message)
          .setErrorMessage(errorMessage)
          .setStacktrace(stacktrace);

  final Instant timestamp =
      new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<String, String> coder =
      FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build pipeline
  PCollection<TableRow> output =
      pipeline
          .apply(
              "CreateInput",
              Create.timestamped(TimestampedValue.of(input, timestamp)).withCoder(coder))
          .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()));

  // Assert
  PAssert.that(output)
      .satisfies(
          collection -> {
            final TableRow result = collection.iterator().next();
            assertThat(result.get("timestamp")).isEqualTo("2022-02-22 22:22:22.222000");
            assertThat(result.get("attributes")).isNull();
            assertThat(result.get("payloadString")).isEqualTo(message);
            assertThat(result.get("payloadBytes")).isNotNull();
            assertThat(result.get("errorMessage")).isEqualTo(errorMessage);
            assertThat(result.get("stacktrace")).isEqualTo(stacktrace);
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
Example 10
Source File: JavascriptTextTransformerTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests the {@link FailsafeJavascriptUdf} when it's passed invalid JSON. In this case the UDF
 * should output the input {@link FailsafeElement} to the dead-letter enriched with error
 * information.
 */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJavaScriptUdfInvalidInput() {
  // Test input
  final String fileSystemPath = TRANSFORM_FILE_PATH;
  final String functionName = "transform";

  final String payload = "\"ticker\": \"GOOGL\", \"price\": 1006.94";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "InvokeUdf",
              FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(fileSystemPath)
                  .setFunctionName(functionName)
                  .setSuccessTag(SUCCESS_TAG)
                  .setFailureTag(FAILURE_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(SUCCESS_TAG)).empty();
  PAssert.that(output.get(FAILURE_TAG))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            PubsubMessage resultMessage = result.getOriginalPayload();
            assertThat(new String(resultMessage.getPayload()), is(equalTo(payload)));
            assertThat(resultMessage.getAttributeMap(), is(equalTo(attributes)));
            assertThat(result.getPayload(), is(equalTo(payload)));
            assertThat(result.getErrorMessage(), is(notNullValue()));
            assertThat(result.getStacktrace(), is(notNullValue()));
            return null;
          });

  // Execute the test
  pipeline.run();
}
Example 11
Source File: JavascriptTextTransformerTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link FailsafeJavascriptUdf} when the input is valid. */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJavaScriptUdfValidInput() {
  // Test input
  final String fileSystemPath = TRANSFORM_FILE_PATH;
  final String functionName = "transform";

  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "InvokeUdf",
              FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(fileSystemPath)
                  .setFunctionName(functionName)
                  .setSuccessTag(SUCCESS_TAG)
                  .setFailureTag(FAILURE_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(SUCCESS_TAG))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            PubsubMessage resultMessage = result.getOriginalPayload();
            String expectedPayload =
                "{\"ticker\":\"GOOGL\",\"price\":1006.94,\"someProp\":\"someValue\"}";
            assertThat(new String(resultMessage.getPayload()), is(equalTo(payload)));
            assertThat(resultMessage.getAttributeMap(), is(equalTo(attributes)));
            assertThat(result.getPayload(), is(equalTo(expectedPayload)));
            assertThat(result.getErrorMessage(), is(nullValue()));
            assertThat(result.getStacktrace(), is(nullValue()));
            return null;
          });

  PAssert.that(output.get(FAILURE_TAG)).empty();

  // Execute the test
  pipeline.run();
}
Example 12
Source File: PubSubCdcToBigQueryTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link PubSubCdcToBigQuery} pipeline end-to-end. */
@Test
public void testPubSubCdcToBigQueryApplyJavaScriptUDF() throws Exception {
  // Test input
  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final PubsubMessage message =
      new PubsubMessage(payload.getBytes(), ImmutableMap.of("id", "123", "type", "custom_event"));

  final Instant timestamp =
      new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

  final FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Parameters
  String transformPath = TRANSFORM_FILE_PATH;
  String transformFunction = "transform";

  PubSubCdcToBigQuery.Options options =
      PipelineOptionsFactory.create().as(PubSubCdcToBigQuery.Options.class);

  options.setJavascriptTextTransformGcsPath(transformPath);
  options.setJavascriptTextTransformFunctionName(transformFunction);

  // Build pipeline
  PCollectionTuple transformOut =
      pipeline
          .apply(
              "CreateInput",
              Create.timestamped(TimestampedValue.of(message, timestamp))
                  .withCoder(PubsubMessageWithAttributesCoder.of()))
          .apply("ConvertMessageToTableRow", new PubsubMessageToTableRow(options));

  // Assert
  PAssert.that(transformOut.get(PubSubCdcToBigQuery.UDF_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubCdcToBigQuery.TRANSFORM_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubCdcToBigQuery.TRANSFORM_OUT))
      .satisfies(
          collection -> {
            TableRow result = collection.iterator().next();
            assertThat(result.get("ticker"), is(equalTo("GOOGL")));
            assertThat(result.get("price"), is(equalTo(1006.94)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
Example 13
Source File: PubsubToBigQueryTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link PubSubToBigQuery} pipeline end-to-end. */
@Test
public void testPubsubToBigQueryE2E() throws Exception {
  // Test input
  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final PubsubMessage message =
      new PubsubMessage(payload.getBytes(), ImmutableMap.of("id", "123", "type", "custom_event"));

  final Instant timestamp =
      new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

  final FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Parameters
  ValueProvider<String> transformPath = pipeline.newProvider(TRANSFORM_FILE_PATH);
  ValueProvider<String> transformFunction = pipeline.newProvider("transform");

  PubSubToBigQuery.Options options =
      PipelineOptionsFactory.create().as(PubSubToBigQuery.Options.class);

  options.setJavascriptTextTransformGcsPath(transformPath);
  options.setJavascriptTextTransformFunctionName(transformFunction);

  // Build pipeline
  PCollectionTuple transformOut =
      pipeline
          .apply(
              "CreateInput",
              Create.timestamped(TimestampedValue.of(message, timestamp))
                  .withCoder(PubsubMessageWithAttributesCoder.of()))
          .apply("ConvertMessageToTableRow", new PubsubMessageToTableRow(options));

  // Assert
  PAssert.that(transformOut.get(PubSubToBigQuery.UDF_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubToBigQuery.TRANSFORM_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubToBigQuery.TRANSFORM_OUT))
      .satisfies(
          collection -> {
            TableRow result = collection.iterator().next();
            assertThat(result.get("ticker"), is(equalTo("GOOGL")));
            assertThat(result.get("price"), is(equalTo(1006.94)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
Example 14
Source File: PubSubToMongoDBTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link PubSubToMongoDB} pipeline end-to-end with a bad UDF. */
@Test
public void testPubSubToMongoDBBadUdfE2E() {
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      PubSubToMongoDB.FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(),
      PubSubToMongoDB.FAILSAFE_ELEMENT_CODER);
  coderRegistry.registerCoderForType(
      PubSubToMongoDB.CODER.getEncodedTypeDescriptor(), PubSubToMongoDB.CODER);

  PubSubToMongoDB.Options options =
      TestPipeline.testingPipelineOptions().as(PubSubToMongoDB.Options.class);

  options.setDeadletterTable("test:dataset.table");
  options.setJavascriptTextTransformFunctionName("transformBad");
  options.setJavascriptTextTransformGcsPath(BAD_TRANSFORM_FILE_PATH);

  PCollectionTuple pc =
      pipeline
          .apply(Create.of(badTestMessages.get(0)))
          .apply(
              PubSubToMongoDB.PubSubMessageToJsonDocument.newBuilder()
                  .setJavascriptTextTransformFunctionName(
                      options.getJavascriptTextTransformFunctionName())
                  .setJavascriptTextTransformGcsPath(options.getJavascriptTextTransformGcsPath())
                  .build());

  PAssert.that(pc.get(PubSubToMongoDB.TRANSFORM_DEADLETTER_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> element = collection.iterator().next();
            assertThat(
                element.getOriginalPayload().getPayload(),
                is(equalTo(badTestMessages.get(0).getPayload())));
            return null;
          });

  PAssert.that(pc.get(PubSubToMongoDB.TRANSFORM_OUT)).empty();

  // Execute pipeline
  pipeline.run(options);
}
Example 15
Source File: JavascriptTextTransformerTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests the {@link FailsafeJavascriptUdf} when it's passed invalid JSON. In this case the UDF
 * should output the input {@link FailsafeElement} to the dead-letter enriched with error
 * information.
 */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJavaScriptUdfInvalidInput() {
  // Test input
  final ValueProvider<String> fileSystemPath = pipeline.newProvider(TRANSFORM_FILE_PATH);
  final ValueProvider<String> functionName = pipeline.newProvider("transform");

  final String payload = "\"ticker\": \"GOOGL\", \"price\": 1006.94";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "InvokeUdf",
              FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(fileSystemPath)
                  .setFunctionName(functionName)
                  .setSuccessTag(SUCCESS_TAG)
                  .setFailureTag(FAILURE_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(SUCCESS_TAG)).empty();
  PAssert.that(output.get(FAILURE_TAG))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            PubsubMessage resultMessage = result.getOriginalPayload();
            assertThat(new String(resultMessage.getPayload()), is(equalTo(payload)));
            assertThat(resultMessage.getAttributeMap(), is(equalTo(attributes)));
            assertThat(result.getPayload(), is(equalTo(payload)));
            assertThat(result.getErrorMessage(), is(notNullValue()));
            assertThat(result.getStacktrace(), is(notNullValue()));
            return null;
          });

  // Execute the test
  pipeline.run();
}
Example 16
Source File: CsvToElasticsearch.java From DataflowTemplates with Apache License 2.0

/**
 * Runs the pipeline to completion with the specified options.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
private static PipelineResult run(CsvToElasticsearchOptions options) {
  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);

  // Register the coder for pipeline
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);

  // Throw error if containsHeaders is true and a schema or Udf is also set.
  if (options.getContainsHeaders()) {
    checkArgument(
        options.getJavascriptTextTransformGcsPath() == null
            && options.getJsonSchemaPath() == null,
        "Cannot parse file containing headers with UDF or Json schema.");
  }

  // Throw error if only one retry configuration parameter is set.
  if (options.getMaxRetryAttempts() != null || options.getMaxRetryDuration() != null) {
    checkArgument(
        options.getMaxRetryAttempts() != null && options.getMaxRetryDuration() != null,
        "To specify retry configuration both max attempts and max duration must be set.");
  }

  /*
   * Steps: 1) Read records from CSV(s) via {@link CsvConverters.ReadCsv}.
   *        2) Convert lines to JSON strings via {@link CsvConverters.LineToFailsafeJson}.
   *        3a) Write JSON strings as documents to Elasticsearch via {@link ElasticsearchIO}.
   *        3b) Write elements that failed processing to {@link org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO}.
   */
  PCollectionTuple convertedCsvLines =
      pipeline
          /*
           * Step 1: Read CSV file(s) from Cloud Storage using {@link CsvConverters.ReadCsv}.
           */
          .apply(
              "ReadCsv",
              CsvConverters.ReadCsv.newBuilder()
                  .setCsvFormat(options.getCsvFormat())
                  .setDelimiter(options.getDelimiter())
                  .setHasHeaders(options.getContainsHeaders())
                  .setInputFileSpec(options.getInputFileSpec())
                  .setHeaderTag(CSV_HEADERS)
                  .setLineTag(CSV_LINES)
                  .build())
          /*
           * Step 2: Convert lines to Elasticsearch document.
           */
          .apply(
              "ConvertLine",
              CsvConverters.LineToFailsafeJson.newBuilder()
                  .setDelimiter(options.getDelimiter())
                  .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setJsonSchemaPath(options.getJsonSchemaPath())
                  .setHeaderTag(CSV_HEADERS)
                  .setLineTag(CSV_LINES)
                  .setUdfOutputTag(PROCESSING_OUT)
                  .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
                  .build());

  /*
   * Step 3a: Write elements that were successfully processed to Elasticsearch using {@link WriteToElasticsearch}.
   */
  convertedCsvLines
      .get(PROCESSING_OUT)
      .apply(
          "GetJsonDocuments",
          MapElements.into(TypeDescriptors.strings()).via(FailsafeElement::getPayload))
      .apply(
          "WriteToElasticsearch",
          WriteToElasticsearch.newBuilder()
              .setOptions(options.as(WriteToElasticsearchOptions.class))
              .build());

  /*
   * Step 3b: Write elements that failed processing to deadletter table via {@link BigQueryIO}.
   */
  convertedCsvLines
      .get(PROCESSING_DEADLETTER_OUT)
      .apply(
          "AddTimestamps",
          WithTimestamps.of((FailsafeElement<String, String> failures) -> new Instant()))
      .apply(
          "WriteFailedElementsToBigQuery",
          WriteStringMessageErrors.newBuilder()
              .setErrorRecordsTable(options.getDeadletterTable())
              .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
              .build());

  return pipeline.run();
}
Example 17
Source File: JavascriptTextTransformerTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link FailsafeJavascriptUdf} when the input is valid. */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJavaScriptUdfValidInput() {
  // Test input
  final ValueProvider<String> fileSystemPath = pipeline.newProvider(TRANSFORM_FILE_PATH);
  final ValueProvider<String> functionName = pipeline.newProvider("transform");

  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "InvokeUdf",
              FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(fileSystemPath)
                  .setFunctionName(functionName)
                  .setSuccessTag(SUCCESS_TAG)
                  .setFailureTag(FAILURE_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(SUCCESS_TAG))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            PubsubMessage resultMessage = result.getOriginalPayload();
            String expectedPayload =
                "{\"ticker\":\"GOOGL\",\"price\":1006.94,\"someProp\":\"someValue\"}";
            assertThat(new String(resultMessage.getPayload()), is(equalTo(payload)));
            assertThat(resultMessage.getAttributeMap(), is(equalTo(attributes)));
            assertThat(result.getPayload(), is(equalTo(expectedPayload)));
            assertThat(result.getErrorMessage(), is(nullValue()));
            assertThat(result.getStacktrace(), is(nullValue()));
            return null;
          });

  PAssert.that(output.get(FAILURE_TAG)).empty();

  // Execute the test
  pipeline.run();
}
Example 18
Source File: PubSubToElasticsearchTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link PubSubToElasticsearch} pipeline end-to-end with a bad UDF. */
@Test
public void testPubSubToElasticsearchBadUdfE2E() {
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(),
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER);
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.CODER.getEncodedTypeDescriptor(), PubSubToElasticsearch.CODER);

  PubSubToElasticsearch.PubSubToElasticsearchOptions options =
      TestPipeline.testingPipelineOptions()
          .as(PubSubToElasticsearch.PubSubToElasticsearchOptions.class);

  options.setDeadletterTable("test:dataset.table");
  options.setJavascriptTextTransformFunctionName("transformBad");
  options.setJavascriptTextTransformGcsPath(BAD_TRANSFORM_FILE_PATH);

  PCollectionTuple pc =
      pipeline
          .apply(Create.of(badTestMessages.get(0)))
          .apply(
              PubSubToElasticsearch.PubSubMessageToJsonDocument.newBuilder()
                  .setJavascriptTextTransformFunctionName(
                      options.getJavascriptTextTransformFunctionName())
                  .setJavascriptTextTransformGcsPath(options.getJavascriptTextTransformGcsPath())
                  .build());

  PAssert.that(pc.get(PubSubToElasticsearch.TRANSFORM_DEADLETTER_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> element = collection.iterator().next();
            assertThat(
                element.getOriginalPayload().getPayload(),
                is(equalTo(badTestMessages.get(0).getPayload())));
            return null;
          });

  PAssert.that(pc.get(PubSubToElasticsearch.TRANSFORM_OUT)).empty();

  // Execute pipeline
  pipeline.run(options);
}
Example 19
Source File: CsvToElasticsearchTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link CsvToElasticsearch} pipeline using a Udf to parse the Csv. */
@Test
public void testCsvToElasticsearchUdfE2E() {
  final String record = "007,CA,26.23";
  final String stringifiedJsonRecord = "{\"id\":\"007\",\"state\":\"CA\",\"price\":26.23}";

  final FailsafeElementCoder<String, String> coder =
      FailsafeElementCoder.of(
          NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of()));

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  CsvToElasticsearch.CsvToElasticsearchOptions options =
      PipelineOptionsFactory.create().as(CsvToElasticsearch.CsvToElasticsearchOptions.class);

  options.setJavascriptTextTransformGcsPath(TRANSFORM_FILE_PATH);
  options.setJavascriptTextTransformFunctionName("transform");
  options.setContainsHeaders(false);
  options.setInputFileSpec(NO_HEADER_CSV_FILE_PATH);

  // Build pipeline with no headers.
  PCollectionTuple readCsvOut =
      pipeline
          .apply(
              "ReadCsv",
              CsvConverters.ReadCsv.newBuilder()
                  .setCsvFormat(options.getCsvFormat())
                  .setDelimiter(options.getDelimiter())
                  .setHasHeaders(options.getContainsHeaders())
                  .setInputFileSpec(options.getInputFileSpec())
                  .setHeaderTag(CsvToElasticsearch.CSV_HEADERS)
                  .setLineTag(CsvToElasticsearch.CSV_LINES)
                  .build())
          .apply(
              "ConvertLine",
              CsvConverters.LineToFailsafeJson.newBuilder()
                  .setDelimiter(options.getDelimiter())
                  .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setJsonSchemaPath(options.getJsonSchemaPath())
                  .setHeaderTag(CsvToElasticsearch.CSV_HEADERS)
                  .setLineTag(CsvToElasticsearch.CSV_LINES)
                  .setUdfOutputTag(CsvToElasticsearch.PROCESSING_OUT)
                  .setUdfDeadletterTag(CsvToElasticsearch.PROCESSING_DEADLETTER_OUT)
                  .build());

  // Assert
  PAssert.that(readCsvOut.get(CsvToElasticsearch.PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement element = collection.iterator().next();
            assertThat(element.getOriginalPayload(), is(equalTo(record)));
            assertThat(element.getPayload(), is(equalTo(stringifiedJsonRecord)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
Example 20
Source File: PubSubToElasticsearchTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link PubSubToElasticsearch} pipeline end-to-end with no UDF supplied. */
@Test
public void testPubSubToElasticsearchNoUdfE2E() {
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(),
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER);
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.CODER.getEncodedTypeDescriptor(), PubSubToElasticsearch.CODER);

  PubSubToElasticsearch.PubSubToElasticsearchOptions options =
      TestPipeline.testingPipelineOptions()
          .as(PubSubToElasticsearch.PubSubToElasticsearchOptions.class);

  options.setDeadletterTable("test:dataset.table");
  options.setJavascriptTextTransformFunctionName(null);
  options.setJavascriptTextTransformGcsPath(null);

  PCollectionTuple pc =
      pipeline
          .apply(Create.of(goodTestMessages.get(0)))
          .apply(
              PubSubToElasticsearch.PubSubMessageToJsonDocument.newBuilder()
                  .setJavascriptTextTransformFunctionName(
                      options.getJavascriptTextTransformFunctionName())
                  .setJavascriptTextTransformGcsPath(options.getJavascriptTextTransformGcsPath())
                  .build());

  PAssert.that(pc.get(PubSubToElasticsearch.TRANSFORM_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> element = collection.iterator().next();
            assertThat(
                element.getOriginalPayload().getPayload(),
                is(equalTo(goodTestMessages.get(0).getPayload())));
            return null;
          });

  // Execute pipeline
  pipeline.run(options);
}