Java Code Examples for org.apache.beam.sdk.coders.CoderRegistry#registerCoderForType()
The following examples show how to use
org.apache.beam.sdk.coders.CoderRegistry#registerCoderForType().
The original project and source file for each example are noted above it.
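Before the examples, here is a minimal, self-contained sketch of the pattern they all share: fetch the pipeline's CoderRegistry and map a TypeDescriptor to a concrete Coder, so that downstream transforms can infer the coder rather than each needing an explicit .setCoder(...) call. The KV<String, String> mapping below is illustrative only and does not come from any of the projects that follow.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.CoderRegistry;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.TypeDescriptors;

public class RegisterCoderSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Map the KV<String, String> type to a concrete coder once, for the whole pipeline.
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(
        TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.strings()),
        KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));

    // ... apply transforms that produce KV<String, String> without calling .setCoder() ...

    pipeline.run();
  }
}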
Example 1
Source File: CsvConvertersTest.java From DataflowTemplates with Apache License 2.0

/** Tests that if different headers are found an exception is thrown. */
@Test(expected = RuntimeException.class)
public void testDifferentHeaders() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> headers =
      pipeline.apply("CreateInput", Create.of(HEADER_STRING, "wrong,header,thing\n"));
  PCollection<String> lines = pipeline.apply("Create lines", Create.of(RECORD_STRING));

  PCollectionTuple readCsvHeadersOut =
      PCollectionTuple.of(CSV_HEADERS, headers).and(CSV_LINES, lines);

  PCollectionTuple test =
      readCsvHeadersOut.apply(
          "TestDifferentHeaders",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(",")
              .setUdfFileSystemPath(null)
              .setUdfFunctionName(null)
              .setJsonSchemaPath(null)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .setUdfOutputTag(PROCESSING_OUT)
              .build());

  pipeline.run();
}
Example 2
Source File: CsvConvertersTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a JSON schema.
 */
@Test
public void testLineToFailsafeJsonNoHeadersJsonSchema() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(RECORD_STRING).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJson",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(",")
              .setUdfFileSystemPath(null)
              .setUdfFunctionName(null)
              .setJsonSchemaPath(TEST_JSON_SCHEMA__PATH)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<String, String> result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
Example 3
Source File: PubSubToElasticsearchTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests the {@link PubSubToElasticsearch} pipeline end-to-end with an empty message payload but
 * attributes populated.
 */
@Test
public void testPubSubToElasticsearchOnlyAttributesE2E() {
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(),
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER);
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.CODER.getEncodedTypeDescriptor(), PubSubToElasticsearch.CODER);

  PubSubToElasticsearch.PubSubToElasticsearchOptions options =
      TestPipeline.testingPipelineOptions()
          .as(PubSubToElasticsearch.PubSubToElasticsearchOptions.class);

  options.setDeadletterTable("test:dataset.table");

  PCollectionTuple pc =
      pipeline
          .apply(Create.of(goodTestMessages.get(goodTestMessages.size() - 1)))
          .apply(
              PubSubToElasticsearch.PubSubMessageToJsonDocument.newBuilder()
                  .setJavascriptTextTransformFunctionName(
                      options.getJavascriptTextTransformFunctionName())
                  .setJavascriptTextTransformGcsPath(options.getJavascriptTextTransformGcsPath())
                  .build());

  PAssert.that(pc.get(PubSubToElasticsearch.TRANSFORM_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> element = collection.iterator().next();
            assertThat(
                new Gson().fromJson(element.getPayload(), HashMap.class),
                is(equalTo(element.getOriginalPayload().getAttributeMap())));
            return null;
          });

  // Execute pipeline
  pipeline.run(options);
}
Example 4
Source File: ErrorConvertersTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests that {@link ErrorConverters.FailedPubsubMessageToTableRowFn} properly formats failed
 * {@link PubsubMessage} objects into {@link TableRow} objects to save to BigQuery.
 */
@Test
public void testFailedPubsubMessageToTableRowFn() {
  // Test input
  final String payload = "Super secret";
  final String errorMessage = "Failed to parse input JSON";
  final String stacktrace = "Error at com.google.cloud.teleport.PubsubToBigQuery";

  final PubsubMessage message =
      new PubsubMessage(payload.getBytes(), ImmutableMap.of("id", "123", "type", "custom_event"));

  final FailsafeElement<PubsubMessage, String> input =
      FailsafeElement.of(message, payload)
          .setErrorMessage(errorMessage)
          .setStacktrace(stacktrace);

  final Instant timestamp =
      new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build pipeline
  PCollection<TableRow> output =
      pipeline
          .apply(
              "CreateInput",
              Create.timestamped(TimestampedValue.of(input, timestamp)).withCoder(coder))
          .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()));

  // Assert
  PAssert.that(output)
      .satisfies(
          collection -> {
            final TableRow result = collection.iterator().next();
            assertThat(result.get("timestamp"), is(equalTo("2022-02-22 22:22:22.222000")));
            assertThat(result.get("attributes"), is(notNullValue()));
            assertThat(result.get("payloadString"), is(equalTo(payload)));
            assertThat(result.get("payloadBytes"), is(notNullValue()));
            assertThat(result.get("errorMessage"), is(equalTo(errorMessage)));
            assertThat(result.get("stacktrace"), is(equalTo(stacktrace)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
Example 5
Source File: CsvToElasticsearchTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link CsvToElasticsearch} pipeline using the headers of the Csv to parse it. */
@Test
public void testCsvToElasticsearchHeadersE2E() {
  final String header = "id,state,price";
  final String record = "007,CA,26.23";
  final String stringJsonRecord = "{\"id\":\"007\",\"state\":\"CA\",\"price\":\"26.23\"}";

  final FailsafeElementCoder<String, String> coder =
      FailsafeElementCoder.of(
          NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of()));

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  CsvToElasticsearch.CsvToElasticsearchOptions options =
      PipelineOptionsFactory.create().as(CsvToElasticsearch.CsvToElasticsearchOptions.class);

  options.setContainsHeaders(true);
  options.setInputFileSpec(HEADER_CSV_FILE_PATH);

  // Build pipeline with headers.
  PCollectionTuple readCsvOut =
      pipeline
          .apply(
              "ReadCsv",
              CsvConverters.ReadCsv.newBuilder()
                  .setCsvFormat(options.getCsvFormat())
                  .setDelimiter(options.getDelimiter())
                  .setHasHeaders(options.getContainsHeaders())
                  .setInputFileSpec(options.getInputFileSpec())
                  .setHeaderTag(CsvToElasticsearch.CSV_HEADERS)
                  .setLineTag(CsvToElasticsearch.CSV_LINES)
                  .build())
          .apply(
              "ConvertLine",
              CsvConverters.LineToFailsafeJson.newBuilder()
                  .setDelimiter(options.getDelimiter())
                  .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setJsonSchemaPath(options.getJsonSchemaPath())
                  .setHeaderTag(CsvToElasticsearch.CSV_HEADERS)
                  .setLineTag(CsvToElasticsearch.CSV_LINES)
                  .setUdfOutputTag(CsvToElasticsearch.PROCESSING_OUT)
                  .setUdfDeadletterTag(CsvToElasticsearch.PROCESSING_DEADLETTER_OUT)
                  .build());

  // Assert
  PAssert.that(readCsvOut.get(CsvToElasticsearch.PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement element = collection.iterator().next();
            assertThat(element.getOriginalPayload(), is(equalTo(record)));
            assertThat(element.getPayload(), is(equalTo(stringJsonRecord)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
Example 6
Source File: KafkaToBigQuery.java From DataflowTemplates with Apache License 2.0

/**
 * Runs the pipeline to completion with the specified options. This method does not wait until the
 * pipeline is finished before returning. Invoke {@code result.waitUntilFinish()} on the result
 * object to block until the pipeline is finished running if blocking programmatic execution is
 * required.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(Options options) {
  Pipeline pipeline = Pipeline.create(options);

  // Register the coder for pipeline
  FailsafeElementCoder<KV<String, String>, String> coder =
      FailsafeElementCoder.of(
          KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  /*
   * Steps:
   *  1) Read messages in from Kafka
   *  2) Transform the Kafka Messages into TableRows
   *     - Transform message payload via UDF
   *     - Convert UDF result to TableRow objects
   *  3) Write successful records out to BigQuery
   *  4) Write failed records out to BigQuery
   */
  PCollectionTuple transformOut =
      pipeline
          /*
           * Step #1: Read messages in from Kafka
           */
          .apply(
              "ReadFromKafka",
              KafkaIO.<String, String>read()
                  .withBootstrapServers(options.getBootstrapServers())
                  .withTopic(options.getInputTopic())
                  .withKeyDeserializer(StringDeserializer.class)
                  .withValueDeserializer(StringDeserializer.class)
                  // NumSplits is hard-coded to 1 for single-partition use cases (e.g., Debezium
                  // Change Data Capture). Once Dataflow dynamic templates are available, this can
                  // be deprecated.
                  .withNumSplits(1)
                  .withoutMetadata())
          /*
           * Step #2: Transform the Kafka Messages into TableRows
           */
          .apply("ConvertMessageToTableRow", new MessageToTableRow(options));

  /*
   * Step #3: Write the successful records out to BigQuery
   */
  transformOut
      .get(TRANSFORM_OUT)
      .apply(
          "WriteSuccessfulRecords",
          BigQueryIO.writeTableRows()
              .withoutValidation()
              .withCreateDisposition(CreateDisposition.CREATE_NEVER)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND)
              .to(options.getOutputTableSpec()));

  /*
   * Step #4: Write failed records out to BigQuery
   */
  PCollectionList.of(transformOut.get(UDF_DEADLETTER_OUT))
      .and(transformOut.get(TRANSFORM_DEADLETTER_OUT))
      .apply("Flatten", Flatten.pCollections())
      .apply(
          "WriteFailedRecords",
          WriteKafkaMessageErrors.newBuilder()
              .setErrorRecordsTable(
                  ValueProviderUtils.maybeUseDefaultDeadletterTable(
                      options.getOutputDeadletterTable(),
                      options.getOutputTableSpec(),
                      DEFAULT_DEADLETTER_TABLE_SUFFIX))
              .setErrorRecordsTableSchema(ResourceUtils.getDeadletterTableSchemaJson())
              .build());

  return pipeline.run();
}
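As the Javadoc notes, run() returns without waiting for completion; a caller that needs blocking execution waits on the returned PipelineResult. A hypothetical entry point (the option parsing here is an assumption, not part of the excerpt) could look like:

public static void main(String[] args) {
  // Parse and validate command-line options (assumed setup, not shown in the excerpt above).
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);

  PipelineResult result = run(options);

  // Block until the pipeline finishes, as the Javadoc for run() recommends.
  result.waitUntilFinish();
}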
Example 7
Source File: BigQueryConvertersTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests the {@link BigQueryConverters.FailsafeJsonToTableRow} transform with invalid JSON input.
 */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJsonToTableRowInvalidJSON() {
  // Test input
  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "JsonToTableRow",
              FailsafeJsonToTableRow.<PubsubMessage>newBuilder()
                  .setSuccessTag(TABLE_ROW_TAG)
                  .setFailureTag(FAILSAFE_ELM_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(TABLE_ROW_TAG)).empty();
  PAssert.that(output.get(FAILSAFE_ELM_TAG))
      .satisfies(
          collection -> {
            final FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            // Check the individual elements of the PubsubMessage since the message above won't be
            // serializable.
            assertThat(new String(result.getOriginalPayload().getPayload())).isEqualTo(payload);
            assertThat(result.getOriginalPayload().getAttributeMap()).isEqualTo(attributes);
            assertThat(result.getPayload()).isEqualTo(payload);
            assertThat(result.getErrorMessage()).isNotNull();
            assertThat(result.getStacktrace()).isNotNull();
            return null;
          });

  // Execute the test
  pipeline.run();
}
Example 8
Source File: CsvConvertersTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a Javascript Udf. Udf processing is handled by {@link
 * JavascriptTextTransformer}. Should output record to deadletter table tag.
 */
@Test
public void testLineToFailsafeJsonNoHeadersUdfDeadletter() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(BAD_JSON_STRING_RECORD).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  CsvConverters.CsvPipelineOptions options =
      PipelineOptionsFactory.create().as(CsvConverters.CsvPipelineOptions.class);

  options.setDelimiter(",");
  options.setJavascriptTextTransformGcsPath(SCRIPT_PARSE_EXCEPTION_FILE_PATH);
  options.setJavascriptTextTransformFunctionName("transform");

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJsonNoHeadersUdfBad",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(options.getDelimiter())
              .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
              .setJsonSchemaPath(null)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT)).empty();
  PAssert.that(failsafe.get(PROCESSING_DEADLETTER_OUT))
      .satisfies(
          collection -> {
            FailsafeElement result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(BAD_JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
Example 9
Source File: ErrorConvertersTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests that {@link ErrorConverters.FailedStringToTableRowFn} properly formats failed String
 * objects into {@link TableRow} objects to save to BigQuery.
 */
@Test
public void testFailedStringMessageToTableRowFn() {
  // Test input
  final String message = "Super secret";
  final String errorMessage = "Failed to parse input JSON";
  final String stacktrace = "Error at com.google.cloud.teleport.TextToBigQueryStreaming";

  final FailsafeElement<String, String> input =
      FailsafeElement.of(message, message)
          .setErrorMessage(errorMessage)
          .setStacktrace(stacktrace);

  final Instant timestamp =
      new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<String, String> coder =
      FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build pipeline
  PCollection<TableRow> output =
      pipeline
          .apply(
              "CreateInput",
              Create.timestamped(TimestampedValue.of(input, timestamp)).withCoder(coder))
          .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()));

  // Assert
  PAssert.that(output)
      .satisfies(
          collection -> {
            final TableRow result = collection.iterator().next();
            assertThat(result.get("timestamp")).isEqualTo("2022-02-22 22:22:22.222000");
            assertThat(result.get("attributes")).isNull();
            assertThat(result.get("payloadString")).isEqualTo(message);
            assertThat(result.get("payloadBytes")).isNotNull();
            assertThat(result.get("errorMessage")).isEqualTo(errorMessage);
            assertThat(result.get("stacktrace")).isEqualTo(stacktrace);
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
Example 10
Source File: JavascriptTextTransformerTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests the {@link FailsafeJavascriptUdf} when it's passed invalid JSON. In this case the UDF
 * should output the input {@link FailsafeElement} to the dead-letter enriched with error
 * information.
 */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJavaScriptUdfInvalidInput() {
  // Test input
  final String fileSystemPath = TRANSFORM_FILE_PATH;
  final String functionName = "transform";

  final String payload = "\"ticker\": \"GOOGL\", \"price\": 1006.94";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "InvokeUdf",
              FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(fileSystemPath)
                  .setFunctionName(functionName)
                  .setSuccessTag(SUCCESS_TAG)
                  .setFailureTag(FAILURE_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(SUCCESS_TAG)).empty();
  PAssert.that(output.get(FAILURE_TAG))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            PubsubMessage resultMessage = result.getOriginalPayload();
            assertThat(new String(resultMessage.getPayload()), is(equalTo(payload)));
            assertThat(resultMessage.getAttributeMap(), is(equalTo(attributes)));
            assertThat(result.getPayload(), is(equalTo(payload)));
            assertThat(result.getErrorMessage(), is(notNullValue()));
            assertThat(result.getStacktrace(), is(notNullValue()));
            return null;
          });

  // Execute the test
  pipeline.run();
}
Example 11
Source File: JavascriptTextTransformerTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link FailsafeJavascriptUdf} when the input is valid. */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJavaScriptUdfValidInput() {
  // Test input
  final String fileSystemPath = TRANSFORM_FILE_PATH;
  final String functionName = "transform";

  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "InvokeUdf",
              FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(fileSystemPath)
                  .setFunctionName(functionName)
                  .setSuccessTag(SUCCESS_TAG)
                  .setFailureTag(FAILURE_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(SUCCESS_TAG))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            PubsubMessage resultMessage = result.getOriginalPayload();
            String expectedPayload =
                "{\"ticker\":\"GOOGL\",\"price\":1006.94,\"someProp\":\"someValue\"}";
            assertThat(new String(resultMessage.getPayload()), is(equalTo(payload)));
            assertThat(resultMessage.getAttributeMap(), is(equalTo(attributes)));
            assertThat(result.getPayload(), is(equalTo(expectedPayload)));
            assertThat(result.getErrorMessage(), is(nullValue()));
            assertThat(result.getStacktrace(), is(nullValue()));
            return null;
          });

  PAssert.that(output.get(FAILURE_TAG)).empty();

  // Execute the test
  pipeline.run();
}
Example 12
Source File: PubSubCdcToBigQueryTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link PubSubCdcToBigQuery} pipeline end-to-end. */
@Test
public void testPubSubCdcToBigQueryApplyJavaScriptUDF() throws Exception {
  // Test input
  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final PubsubMessage message =
      new PubsubMessage(payload.getBytes(), ImmutableMap.of("id", "123", "type", "custom_event"));

  final Instant timestamp =
      new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

  final FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Parameters
  String transformPath = TRANSFORM_FILE_PATH;
  String transformFunction = "transform";

  PubSubCdcToBigQuery.Options options =
      PipelineOptionsFactory.create().as(PubSubCdcToBigQuery.Options.class);

  options.setJavascriptTextTransformGcsPath(transformPath);
  options.setJavascriptTextTransformFunctionName(transformFunction);

  // Build pipeline
  PCollectionTuple transformOut =
      pipeline
          .apply(
              "CreateInput",
              Create.timestamped(TimestampedValue.of(message, timestamp))
                  .withCoder(PubsubMessageWithAttributesCoder.of()))
          .apply("ConvertMessageToTableRow", new PubsubMessageToTableRow(options));

  // Assert
  PAssert.that(transformOut.get(PubSubCdcToBigQuery.UDF_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubCdcToBigQuery.TRANSFORM_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubCdcToBigQuery.TRANSFORM_OUT))
      .satisfies(
          collection -> {
            TableRow result = collection.iterator().next();
            assertThat(result.get("ticker"), is(equalTo("GOOGL")));
            assertThat(result.get("price"), is(equalTo(1006.94)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
Example 13
Source File: PubsubToBigQueryTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link PubSubToBigQuery} pipeline end-to-end. */
@Test
public void testPubsubToBigQueryE2E() throws Exception {
  // Test input
  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final PubsubMessage message =
      new PubsubMessage(payload.getBytes(), ImmutableMap.of("id", "123", "type", "custom_event"));

  final Instant timestamp =
      new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

  final FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Parameters
  ValueProvider<String> transformPath = pipeline.newProvider(TRANSFORM_FILE_PATH);
  ValueProvider<String> transformFunction = pipeline.newProvider("transform");

  PubSubToBigQuery.Options options =
      PipelineOptionsFactory.create().as(PubSubToBigQuery.Options.class);

  options.setJavascriptTextTransformGcsPath(transformPath);
  options.setJavascriptTextTransformFunctionName(transformFunction);

  // Build pipeline
  PCollectionTuple transformOut =
      pipeline
          .apply(
              "CreateInput",
              Create.timestamped(TimestampedValue.of(message, timestamp))
                  .withCoder(PubsubMessageWithAttributesCoder.of()))
          .apply("ConvertMessageToTableRow", new PubsubMessageToTableRow(options));

  // Assert
  PAssert.that(transformOut.get(PubSubToBigQuery.UDF_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubToBigQuery.TRANSFORM_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubToBigQuery.TRANSFORM_OUT))
      .satisfies(
          collection -> {
            TableRow result = collection.iterator().next();
            assertThat(result.get("ticker"), is(equalTo("GOOGL")));
            assertThat(result.get("price"), is(equalTo(1006.94)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
Example 14
Source File: PubSubToMongoDBTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link PubSubToMongoDB} pipeline end-to-end with a bad UDF. */
@Test
public void testPubSubToMongoDBBadUdfE2E() {
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      PubSubToMongoDB.FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(),
      PubSubToMongoDB.FAILSAFE_ELEMENT_CODER);
  coderRegistry.registerCoderForType(
      PubSubToMongoDB.CODER.getEncodedTypeDescriptor(), PubSubToMongoDB.CODER);

  PubSubToMongoDB.Options options =
      TestPipeline.testingPipelineOptions().as(PubSubToMongoDB.Options.class);

  options.setDeadletterTable("test:dataset.table");
  options.setJavascriptTextTransformFunctionName("transformBad");
  options.setJavascriptTextTransformGcsPath(BAD_TRANSFORM_FILE_PATH);

  PCollectionTuple pc =
      pipeline
          .apply(Create.of(badTestMessages.get(0)))
          .apply(
              PubSubToMongoDB.PubSubMessageToJsonDocument.newBuilder()
                  .setJavascriptTextTransformFunctionName(
                      options.getJavascriptTextTransformFunctionName())
                  .setJavascriptTextTransformGcsPath(options.getJavascriptTextTransformGcsPath())
                  .build());

  PAssert.that(pc.get(PubSubToMongoDB.TRANSFORM_DEADLETTER_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> element = collection.iterator().next();
            assertThat(
                element.getOriginalPayload().getPayload(),
                is(equalTo(badTestMessages.get(0).getPayload())));
            return null;
          });

  PAssert.that(pc.get(PubSubToMongoDB.TRANSFORM_OUT)).empty();

  // Execute pipeline
  pipeline.run(options);
}
Example 15
Source File: JavascriptTextTransformerTest.java From DataflowTemplates with Apache License 2.0

/**
 * Tests the {@link FailsafeJavascriptUdf} when it's passed invalid JSON. In this case the UDF
 * should output the input {@link FailsafeElement} to the dead-letter enriched with error
 * information.
 */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJavaScriptUdfInvalidInput() {
  // Test input
  final ValueProvider<String> fileSystemPath = pipeline.newProvider(TRANSFORM_FILE_PATH);
  final ValueProvider<String> functionName = pipeline.newProvider("transform");

  final String payload = "\"ticker\": \"GOOGL\", \"price\": 1006.94";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "InvokeUdf",
              FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(fileSystemPath)
                  .setFunctionName(functionName)
                  .setSuccessTag(SUCCESS_TAG)
                  .setFailureTag(FAILURE_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(SUCCESS_TAG)).empty();
  PAssert.that(output.get(FAILURE_TAG))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            PubsubMessage resultMessage = result.getOriginalPayload();
            assertThat(new String(resultMessage.getPayload()), is(equalTo(payload)));
            assertThat(resultMessage.getAttributeMap(), is(equalTo(attributes)));
            assertThat(result.getPayload(), is(equalTo(payload)));
            assertThat(result.getErrorMessage(), is(notNullValue()));
            assertThat(result.getStacktrace(), is(notNullValue()));
            return null;
          });

  // Execute the test
  pipeline.run();
}
Example 16
Source File: CsvToElasticsearch.java From DataflowTemplates with Apache License 2.0

/**
 * Runs the pipeline to completion with the specified options.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
private static PipelineResult run(CsvToElasticsearchOptions options) {
  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);

  // Register the coder for pipeline
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);

  // Throw error if containsHeaders is true and a schema or Udf is also set.
  if (options.getContainsHeaders()) {
    checkArgument(
        options.getJavascriptTextTransformGcsPath() == null
            && options.getJsonSchemaPath() == null,
        "Cannot parse file containing headers with UDF or Json schema.");
  }

  // Throw error if only one retry configuration parameter is set.
  if (options.getMaxRetryAttempts() != null || options.getMaxRetryDuration() != null) {
    checkArgument(
        options.getMaxRetryAttempts() != null && options.getMaxRetryDuration() != null,
        "To specify retry configuration both max attempts and max duration must be set.");
  }

  /*
   * Steps: 1) Read records from CSV(s) via {@link CsvConverters.ReadCsv}.
   *        2) Convert lines to JSON strings via {@link CsvConverters.LineToFailsafeJson}.
   *        3a) Write JSON strings as documents to Elasticsearch via {@link ElasticsearchIO}.
   *        3b) Write elements that failed processing to {@link org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO}.
   */
  PCollectionTuple convertedCsvLines =
      pipeline
          /*
           * Step 1: Read CSV file(s) from Cloud Storage using {@link CsvConverters.ReadCsv}.
           */
          .apply(
              "ReadCsv",
              CsvConverters.ReadCsv.newBuilder()
                  .setCsvFormat(options.getCsvFormat())
                  .setDelimiter(options.getDelimiter())
                  .setHasHeaders(options.getContainsHeaders())
                  .setInputFileSpec(options.getInputFileSpec())
                  .setHeaderTag(CSV_HEADERS)
                  .setLineTag(CSV_LINES)
                  .build())
          /*
           * Step 2: Convert lines to Elasticsearch document.
           */
          .apply(
              "ConvertLine",
              CsvConverters.LineToFailsafeJson.newBuilder()
                  .setDelimiter(options.getDelimiter())
                  .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setJsonSchemaPath(options.getJsonSchemaPath())
                  .setHeaderTag(CSV_HEADERS)
                  .setLineTag(CSV_LINES)
                  .setUdfOutputTag(PROCESSING_OUT)
                  .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
                  .build());

  /*
   * Step 3a: Write elements that were successfully processed to Elasticsearch using {@link WriteToElasticsearch}.
   */
  convertedCsvLines
      .get(PROCESSING_OUT)
      .apply(
          "GetJsonDocuments",
          MapElements.into(TypeDescriptors.strings()).via(FailsafeElement::getPayload))
      .apply(
          "WriteToElasticsearch",
          WriteToElasticsearch.newBuilder()
              .setOptions(options.as(WriteToElasticsearchOptions.class))
              .build());

  /*
   * Step 3b: Write elements that failed processing to deadletter table via {@link BigQueryIO}.
   */
  convertedCsvLines
      .get(PROCESSING_DEADLETTER_OUT)
      .apply(
          "AddTimestamps",
          WithTimestamps.of((FailsafeElement<String, String> failures) -> new Instant()))
      .apply(
          "WriteFailedElementsToBigQuery",
          WriteStringMessageErrors.newBuilder()
              .setErrorRecordsTable(options.getDeadletterTable())
              .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
              .build());

  return pipeline.run();
}
Example 17
Source File: JavascriptTextTransformerTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link FailsafeJavascriptUdf} when the input is valid. */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJavaScriptUdfValidInput() {
  // Test input
  final ValueProvider<String> fileSystemPath = pipeline.newProvider(TRANSFORM_FILE_PATH);
  final ValueProvider<String> functionName = pipeline.newProvider("transform");

  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "InvokeUdf",
              FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(fileSystemPath)
                  .setFunctionName(functionName)
                  .setSuccessTag(SUCCESS_TAG)
                  .setFailureTag(FAILURE_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(SUCCESS_TAG))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            PubsubMessage resultMessage = result.getOriginalPayload();
            String expectedPayload =
                "{\"ticker\":\"GOOGL\",\"price\":1006.94,\"someProp\":\"someValue\"}";
            assertThat(new String(resultMessage.getPayload()), is(equalTo(payload)));
            assertThat(resultMessage.getAttributeMap(), is(equalTo(attributes)));
            assertThat(result.getPayload(), is(equalTo(expectedPayload)));
            assertThat(result.getErrorMessage(), is(nullValue()));
            assertThat(result.getStacktrace(), is(nullValue()));
            return null;
          });

  PAssert.that(output.get(FAILURE_TAG)).empty();

  // Execute the test
  pipeline.run();
}
Example 18
Source File: PubSubToElasticsearchTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link PubSubToElasticsearch} pipeline end-to-end with a bad UDF. */
@Test
public void testPubSubToElasticsearchBadUdfE2E() {
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(),
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER);
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.CODER.getEncodedTypeDescriptor(), PubSubToElasticsearch.CODER);

  PubSubToElasticsearch.PubSubToElasticsearchOptions options =
      TestPipeline.testingPipelineOptions()
          .as(PubSubToElasticsearch.PubSubToElasticsearchOptions.class);

  options.setDeadletterTable("test:dataset.table");
  options.setJavascriptTextTransformFunctionName("transformBad");
  options.setJavascriptTextTransformGcsPath(BAD_TRANSFORM_FILE_PATH);

  PCollectionTuple pc =
      pipeline
          .apply(Create.of(badTestMessages.get(0)))
          .apply(
              PubSubToElasticsearch.PubSubMessageToJsonDocument.newBuilder()
                  .setJavascriptTextTransformFunctionName(
                      options.getJavascriptTextTransformFunctionName())
                  .setJavascriptTextTransformGcsPath(options.getJavascriptTextTransformGcsPath())
                  .build());

  PAssert.that(pc.get(PubSubToElasticsearch.TRANSFORM_DEADLETTER_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> element = collection.iterator().next();
            assertThat(
                element.getOriginalPayload().getPayload(),
                is(equalTo(badTestMessages.get(0).getPayload())));
            return null;
          });

  PAssert.that(pc.get(PubSubToElasticsearch.TRANSFORM_OUT)).empty();

  // Execute pipeline
  pipeline.run(options);
}
Example 19
Source File: CsvToElasticsearchTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link CsvToElasticsearch} pipeline using a Udf to parse the Csv. */
@Test
public void testCsvToElasticsearchUdfE2E() {
  final String record = "007,CA,26.23";
  final String stringifiedJsonRecord = "{\"id\":\"007\",\"state\":\"CA\",\"price\":26.23}";

  final FailsafeElementCoder<String, String> coder =
      FailsafeElementCoder.of(
          NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of()));

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  CsvToElasticsearch.CsvToElasticsearchOptions options =
      PipelineOptionsFactory.create().as(CsvToElasticsearch.CsvToElasticsearchOptions.class);

  options.setJavascriptTextTransformGcsPath(TRANSFORM_FILE_PATH);
  options.setJavascriptTextTransformFunctionName("transform");
  options.setContainsHeaders(false);
  options.setInputFileSpec(NO_HEADER_CSV_FILE_PATH);

  // Build pipeline with no headers.
  PCollectionTuple readCsvOut =
      pipeline
          .apply(
              "ReadCsv",
              CsvConverters.ReadCsv.newBuilder()
                  .setCsvFormat(options.getCsvFormat())
                  .setDelimiter(options.getDelimiter())
                  .setHasHeaders(options.getContainsHeaders())
                  .setInputFileSpec(options.getInputFileSpec())
                  .setHeaderTag(CsvToElasticsearch.CSV_HEADERS)
                  .setLineTag(CsvToElasticsearch.CSV_LINES)
                  .build())
          .apply(
              "ConvertLine",
              CsvConverters.LineToFailsafeJson.newBuilder()
                  .setDelimiter(options.getDelimiter())
                  .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setJsonSchemaPath(options.getJsonSchemaPath())
                  .setHeaderTag(CsvToElasticsearch.CSV_HEADERS)
                  .setLineTag(CsvToElasticsearch.CSV_LINES)
                  .setUdfOutputTag(CsvToElasticsearch.PROCESSING_OUT)
                  .setUdfDeadletterTag(CsvToElasticsearch.PROCESSING_DEADLETTER_OUT)
                  .build());

  // Assert
  PAssert.that(readCsvOut.get(CsvToElasticsearch.PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement element = collection.iterator().next();
            assertThat(element.getOriginalPayload(), is(equalTo(record)));
            assertThat(element.getPayload(), is(equalTo(stringifiedJsonRecord)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
Example 20
Source File: PubSubToElasticsearchTest.java From DataflowTemplates with Apache License 2.0

/** Tests the {@link PubSubToElasticsearch} pipeline end-to-end with no UDF supplied. */
@Test
public void testPubSubToElasticsearchNoUdfE2E() {
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(),
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER);
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.CODER.getEncodedTypeDescriptor(), PubSubToElasticsearch.CODER);

  PubSubToElasticsearch.PubSubToElasticsearchOptions options =
      TestPipeline.testingPipelineOptions()
          .as(PubSubToElasticsearch.PubSubToElasticsearchOptions.class);

  options.setDeadletterTable("test:dataset.table");
  options.setJavascriptTextTransformFunctionName(null);
  options.setJavascriptTextTransformGcsPath(null);

  PCollectionTuple pc =
      pipeline
          .apply(Create.of(goodTestMessages.get(0)))
          .apply(
              PubSubToElasticsearch.PubSubMessageToJsonDocument.newBuilder()
                  .setJavascriptTextTransformFunctionName(
                      options.getJavascriptTextTransformFunctionName())
                  .setJavascriptTextTransformGcsPath(options.getJavascriptTextTransformGcsPath())
                  .build());

  PAssert.that(pc.get(PubSubToElasticsearch.TRANSFORM_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> element = collection.iterator().next();
            assertThat(
                element.getOriginalPayload().getPayload(),
                is(equalTo(goodTestMessages.get(0).getPayload())));
            return null;
          });

  // Execute pipeline
  pipeline.run(options);
}