org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage Java Examples
The following examples show how to use
org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: PubsubMessageToHopRowFn.java From hop with Apache License 2.0 | 6 votes |
@ProcessElement public void processElement( ProcessContext processContext ) { try { PubsubMessage message = processContext.element(); inputCounter.inc(); Object[] outputRow = RowDataUtil.allocateRowData(rowMeta.size()); outputRow[0] = message; // Serializable processContext.output( new HopRow( outputRow ) ); writtenCounter.inc(); } catch ( Exception e ) { numErrors.inc(); LOG.error( "Error in pub/sub publish messages function", e ); throw new RuntimeException( "Error in pub/sub publish messages function", e ); } }
Example #2
Source File: DecryptPioneerPayloadsTest.java From gcp-ingestion with Mozilla Public License 2.0 | 6 votes |
@Test public void testOutput() throws Exception { // minimal test for throughput of a single document ValueProvider<String> metadataLocation = pipeline .newProvider(Resources.getResource("pioneer/metadata-local.json").getPath()); ValueProvider<Boolean> kmsEnabled = pipeline.newProvider(false); ValueProvider<Boolean> decompressPayload = pipeline.newProvider(true); final List<String> input = readTestFiles(Arrays.asList("pioneer/study-foo.ciphertext.json")); PCollection<String> output = pipeline.apply(Create.of(input)) .apply(InputFileFormat.text.decode()) .apply("AddAttributes", MapElements.into(TypeDescriptor.of(PubsubMessage.class)) .via(element -> new PubsubMessage(element.getPayload(), ImmutableMap.of(Attribute.DOCUMENT_NAMESPACE, "telemetry", Attribute.DOCUMENT_TYPE, "pioneer-study", Attribute.DOCUMENT_VERSION, "4")))) .apply(DecryptPioneerPayloads.of(metadataLocation, kmsEnabled, decompressPayload)).output() .apply(OutputFileFormat.text.encode()).apply(ReformatJson.of()); final List<String> expectedMain = readTestFiles(Arrays.asList("pioneer/sample.plaintext.json")); PAssert.that(output).containsInAnyOrder(expectedMain); pipeline.run(); }
Example #3
Source File: RowToPubsubMessage.java From beam with Apache License 2.0 | 6 votes |
/**
 * Converts rows into Pub/Sub messages whose payload is the JSON form of the row and whose
 * attribute map is empty.
 *
 * <p>When {@code config.useTimestampAttribute()} is true, each element's timestamp is first set
 * from its {@code event_timestamp} field. That field is then dropped before the row is
 * serialized to JSON.
 *
 * <p>The JSON text is encoded as UTF-8. Encoding with ISO-8859-1 would replace every character
 * outside Latin-1 with {@code '?'}, silently corrupting the payload.
 *
 * @param input the rows to convert
 * @return one {@code PubsubMessage} per input row
 */
@Override
public PCollection<PubsubMessage> expand(PCollection<Row> input) {
  PCollection<Row> withTimestamp =
      (config.useTimestampAttribute())
          ? input.apply(
              WithTimestamps.of((row) -> row.getDateTime("event_timestamp").toInstant()))
          : input;

  return withTimestamp
      .apply(DropFields.fields("event_timestamp"))
      .apply(ToJson.of())
      .apply(
          MapElements.into(TypeDescriptor.of(PubsubMessage.class))
              .via(
                  (String json) ->
                      new PubsubMessage(
                          json.getBytes(StandardCharsets.UTF_8), ImmutableMap.of())));
}
Example #4
Source File: PubsubToAvroTest.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/** Test {@link AvroPubsubMessageRecord} correctly maps the message. */ @Test @Category(NeedsRunner.class) public void testPubsubMessageToArchive() throws Exception { // Create the test input. byte[] payload = "Laces out Dan!".getBytes(); Map<String, String> attributes = ImmutableMap.of("id", "Ace"); PubsubMessage message = new PubsubMessage(payload, attributes); Instant timestamp = Instant.now(); // Apply the ParDo. PCollection<AvroPubsubMessageRecord> results = pipeline .apply(Create.timestamped(TimestampedValue.of(message, timestamp))) .apply(ParDo.of(new PubsubMessageToArchiveDoFn())); // Assert on the results. PAssert.that(results) .containsInAnyOrder( new AvroPubsubMessageRecord(payload, attributes, timestamp.getMillis())); // Run the pipeline. pipeline.run(); }
Example #5
Source File: LimitPayloadSize.java From gcp-ingestion with Mozilla Public License 2.0 | 6 votes |
/**
 * Factory method to create mapper instance.
 *
 * <p>The returned transform passes through every message whose payload length is at most
 * {@code maxBytes}. Larger messages increment the {@code payload_too_large} counter and raise a
 * {@code PayloadTooLargeException}, which the exception handler turns into a failure message.
 * Any other exception type is rethrown by the handler.
 *
 * @param maxBytes the largest payload size, in bytes, that is allowed through
 * @return a mapper with a failure output for oversized payloads
 */
public static MapWithFailures<PubsubMessage, PubsubMessage, PubsubMessage> of(int maxBytes) {
  final Counter countPayloadTooLarge =
      Metrics.counter(LimitPayloadSize.class, "payload_too_large");
  return MapElements.into(TypeDescriptor.of(PubsubMessage.class))
      .via((PubsubMessage msg) -> {
        msg = PubsubConstraints.ensureNonNull(msg);
        int numBytes = msg.getPayload().length;
        if (numBytes > maxBytes) {
          countPayloadTooLarge.inc();
          // The space before "bytes" keeps the size readable ("510 bytes", not "510bytes").
          throw new PayloadTooLargeException("Message payload is " + numBytes
              + " bytes, larger than the configured limit of " + maxBytes);
        }
        return msg;
      })
      .exceptionsInto(TypeDescriptor.of(PubsubMessage.class))
      .exceptionsVia((ExceptionElement<PubsubMessage> ee) -> {
        try {
          // Rethrow so that only the expected exception type is converted below.
          throw ee.exception();
        } catch (PayloadTooLargeException e) {
          return FailureMessage.of(LimitPayloadSize.class.getSimpleName(), ee.element(),
              ee.exception());
        }
      });
}
Example #6
Source File: LimitPayloadSizeTest.java From gcp-ingestion with Mozilla Public License 2.0 | 6 votes |
@Test public void testLimit() { List<String> passingPayloads = ImmutableList.of("", "abcdefg", StringUtils.repeat("abcdefg", 50)); List<String> failingPayloads = ImmutableList.of(StringUtils.repeat("abcdefghij", 51)); WithFailures.Result<PCollection<PubsubMessage>, PubsubMessage> result = pipeline // .apply(Create.of(Iterables.concat(passingPayloads, failingPayloads))) // .apply(InputFileFormat.text.decode()) // .apply("LimitPayloadSize", LimitPayloadSize.toBytes(500)); PAssert .that(result.output().apply("get success payload", MapElements.into(TypeDescriptors.strings()).via(m -> new String(m.getPayload())))) // .containsInAnyOrder(passingPayloads); PAssert .that(result.failures().apply("get failure payload", MapElements.into(TypeDescriptors.strings()).via(m -> new String(m.getPayload())))) // .containsInAnyOrder(failingPayloads); pipeline.run(); }
Example #7
Source File: PubsubSink.java From beam with Apache License 2.0 | 6 votes |
/**
 * Creates a sink from its configuration.
 *
 * <p>Only the value coder of the supplied windowed-value coder is retained; the coder is cast to
 * {@code WindowedValueCoder} to extract it.
 *
 * @param topic the topic stored on this sink
 * @param timestampLabel the timestamp label stored on this sink
 * @param idLabel the id label stored on this sink
 * @param coder windowed-value coder; must be a {@code WindowedValueCoder}
 * @param formatFn function from element to {@code PubsubMessage}
 * @param withAttributes attribute flag stored on this sink
 * @param context the streaming execution context stored on this sink
 */
PubsubSink(
    String topic,
    String timestampLabel,
    String idLabel,
    Coder<WindowedValue<T>> coder,
    SimpleFunction<T, PubsubMessage> formatFn,
    boolean withAttributes,
    StreamingModeExecutionContext context) {
  // Unwrap the element coder first; the cast fails fast if the coder has the wrong type.
  @SuppressWarnings({"unchecked", "rawtypes"})
  WindowedValueCoder<T> valueAwareCoder = (WindowedValueCoder) coder;
  this.coder = valueAwareCoder.getValueCoder();

  this.topic = topic;
  this.timestampLabel = timestampLabel;
  this.idLabel = idLabel;
  this.formatFn = formatFn;
  this.withAttributes = withAttributes;
  this.context = context;
}
Example #8
Source File: PubsubConstraints.java From gcp-ingestion with Mozilla Public License 2.0 | 6 votes |
/**
 * Emits a copy of the message whose attribute keys and values have been passed through
 * {@code truncateAttributeKey} and {@code truncateAttributeValue}.
 *
 * <p>A counter is incremented for every key that changed and for every non-null value that
 * changed. The payload is forwarded as is.
 *
 * @param message the message to process
 * @param out receiver for the rewritten message
 */
@ProcessElement
public void processElement(@Element PubsubMessage message,
    OutputReceiver<PubsubMessage> out) {
  final Map<String, String> original = message.getAttributeMap();
  final Map<String, String> truncated = new HashMap<>(original.size());

  for (Map.Entry<String, String> attribute : original.entrySet()) {
    final String rawKey = attribute.getKey();
    final String rawValue = attribute.getValue();

    final String key = truncateAttributeKey(rawKey);
    if (!key.equals(rawKey)) {
      countTruncatedKey.inc();
    }

    final String value = truncateAttributeValue(rawValue);
    if (value != null && !value.equals(rawValue)) {
      countTruncatedValue.inc();
    }

    truncated.put(key, value);
  }

  out.output(new PubsubMessage(message.getPayload(), truncated));
}
Example #9
Source File: Deduplicate.java From gcp-ingestion with Mozilla Public License 2.0 | 6 votes |
@ProcessElement public void processElement(@Element PubsubMessage element, MultiOutputReceiver out) { element = PubsubConstraints.ensureNonNull(element); boolean idExists = false; boolean exceptionWasThrown = false; try { idExists = // Throws IllegalArgumentException if id is present and invalid getId(element).filter(redisIdService::exists).isPresent(); } catch (Exception e) { exceptionWasThrown = true; out.get(errorTag).output(FailureMessage.of(RemoveDuplicates.this, element, e)); } if (!exceptionWasThrown) { if (idExists) { PerDocTypeCounter.inc(element.getAttributeMap(), "duplicate_submission"); PerDocTypeCounter.inc(element.getAttributeMap(), "duplicate_submission_bytes", element.getPayload().length); out.get(duplicateTag).output(element); } else { out.get(outputTag).output(element); } } }
Example #10
Source File: DecompressPayload.java From gcp-ingestion with Mozilla Public License 2.0 | 6 votes |
@Override public PubsubMessage apply(PubsubMessage message) { if (enabled.isAccessible() && !enabled.get()) { // Compression has been explicitly turned off in options, so return unchanged message. return message; } else { try { ByteArrayInputStream payloadStream = new ByteArrayInputStream(message.getPayload()); GZIPInputStream gzipStream = new GZIPInputStream(payloadStream); ByteArrayOutputStream decompressedStream = new ByteArrayOutputStream(); // Throws IOException IOUtils.copy(gzipStream, decompressedStream); compressedInput.inc(); return new PubsubMessage(decompressedStream.toByteArray(), message.getAttributeMap()); } catch (IOException ignore) { // payload wasn't valid gzip, assume it wasn't compressed uncompressedInput.inc(); return message; } } }
Example #11
Source File: ParsePayload.java From gcp-ingestion with Mozilla Public License 2.0 | 6 votes |
@Override public Result<PCollection<PubsubMessage>, PubsubMessage> expand( PCollection<PubsubMessage> messages) { return messages.apply(FlatMapElements.into(TypeDescriptor.of(PubsubMessage.class)) // .via(new Fn()) // .exceptionsInto(TypeDescriptor.of(PubsubMessage.class)) // .exceptionsVia((WithFailures.ExceptionElement<PubsubMessage> ee) -> { try { throw ee.exception(); } catch (IOException | SchemaNotFoundException | ValidationException | MessageScrubberException e) { return FailureMessage.of(ParsePayload.class.getSimpleName(), // ee.element(), // ee.exception()); } })); }
Example #12
Source File: RemoveAttributes.java From gcp-ingestion with Mozilla Public License 2.0 | 6 votes |
@Override public PubsubMessage apply(PubsubMessage message) { message = PubsubConstraints.ensureNonNull(message); final Map<String, String> attributes = message.getAttributeMap(); Map<String, String> strippedAttributes = new HashMap<>(); // Use geo_country to match IP privacy v1 dataset attributes.put(Attribute.GEO_COUNTRY, message.getAttribute(Attribute.NORMALIZED_COUNTRY_CODE)); ATTRIBUTES_TO_KEEP.forEach(name -> Optional.ofNullable(attributes.get(name)) .ifPresent(value -> strippedAttributes.put(name, value))); return new PubsubMessage(message.getPayload(), strippedAttributes); }
Example #13
Source File: Sink.java From gcp-ingestion with Mozilla Public License 2.0 | 6 votes |
/** * Execute an Apache Beam pipeline and return the {@code PipelineResult}. */ public static PipelineResult run(SinkOptions.Parsed options) { final Pipeline pipeline = Pipeline.create(options); final List<PCollection<PubsubMessage>> failureCollections = new ArrayList<>(); pipeline // .apply(options.getInputType().read(options)) // .apply(DecompressPayload.enabled(options.getDecompressInputPayloads())) // .apply(options.getOutputType().write(options)).failuresTo(failureCollections); PCollectionList.of(failureCollections) // .apply("FlattenFailureCollections", Flatten.pCollections()) // .apply("WriteErrorOutput", options.getErrorOutputType().write(options)) // .output(); return pipeline.run(); }
Example #14
Source File: StreamingDataGenerator.java From DataflowTemplates with Apache License 2.0 | 6 votes |
@ProcessElement public void processElement(@Element Long element, @Timestamp Instant timestamp, OutputReceiver<PubsubMessage> receiver, ProcessContext context) throws IOException, JsonDataGeneratorException { // TODO: Add the ability to place eventId and eventTimestamp in the attributes. byte[] payload; Map<String, String> attributes = new HashMap<>(); // Generate the fake JSON according to the schema. try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) { dataGenerator.generateTestDataJson(schema, byteArrayOutputStream); payload = byteArrayOutputStream.toByteArray(); } receiver.output(new PubsubMessage(payload, attributes)); }
Example #15
Source File: PubSubToMongoDB.java From DataflowTemplates with Apache License 2.0 | 6 votes |
@Override public PCollectionTuple expand(PCollection<PubsubMessage> input) { // Map the incoming messages into FailsafeElements so we can recover from failures // across multiple transforms. PCollection<FailsafeElement<PubsubMessage, String>> failsafeElements = input.apply("MapToRecord", ParDo.of(new PubsubMessageToFailsafeElementFn())); // If a Udf is supplied then use it to parse the PubSubMessages. if (javascriptTextTransformGcsPath() != null) { return failsafeElements.apply( "InvokeUDF", JavascriptTextTransformer.FailsafeJavascriptUdf.<PubsubMessage>newBuilder() .setFileSystemPath(javascriptTextTransformGcsPath()) .setFunctionName(javascriptTextTransformFunctionName()) .setSuccessTag(TRANSFORM_OUT) .setFailureTag(TRANSFORM_DEADLETTER_OUT) .build()); } else { return failsafeElements.apply( "ProcessPubSubMessages", ParDo.of(new ProcessFailsafePubSubFn()) .withOutputTags(TRANSFORM_OUT, TupleTagList.of(TRANSFORM_DEADLETTER_OUT))); } }
Example #16
Source File: Deduplicate.java From gcp-ingestion with Mozilla Public License 2.0 | 6 votes |
@Override public Result<PCollection<PubsubMessage>, PubsubMessage> expand( PCollection<PubsubMessage> elements) { return elements.apply( MapElements.into(TypeDescriptor.of(PubsubMessage.class)).via((PubsubMessage el) -> { el = PubsubConstraints.ensureNonNull(el); if (uri != null && uri.isAccessible() && uri.get() != null) { // Throws IllegalArgumentException if ID is present and invalid. getId(el).ifPresent(id -> redisIdService.setWithExpiration(id, getTtlSeconds())); } return el; }).exceptionsInto(TypeDescriptor.of(PubsubMessage.class)) .exceptionsVia((ExceptionElement<PubsubMessage> ee) -> { try { throw ee.exception(); } catch (IllegalArgumentException e) { return FailureMessage.of(Deduplicate.class.getSimpleName(), // ee.element(), // ee.exception()); } })); }
Example #17
Source File: PubSubToMongoDB.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@ProcessElement public void processElement(ProcessContext context) { PubsubMessage pubsubMessage = context.element().getOriginalPayload(); JsonObject messageObject = new JsonObject(); try { if (pubsubMessage.getPayload().length > 0) { messageObject = gson.fromJson(new String(pubsubMessage.getPayload()), JsonObject.class); } // If message attributes are present they will be serialized along with the message payload if (pubsubMessage.getAttributeMap() != null) { pubsubMessage.getAttributeMap().forEach(messageObject::addProperty); } context.output( FailsafeElement.of(pubsubMessage, messageObject.toString())); successCounter.inc(); } catch (JsonSyntaxException e) { context.output( TRANSFORM_DEADLETTER_OUT, FailsafeElement.of(context.element()) .setErrorMessage(e.getMessage()) .setStacktrace(Throwables.getStackTraceAsString(e))); failedCounter.inc(); } }
Example #18
Source File: ErrorConverters.java From DataflowTemplates with Apache License 2.0 | 5 votes |
/**
 * Converts failed records to table rows and appends them to the error records table in
 * BigQuery, creating the table if needed.
 *
 * @param failedRecords the failsafe elements that could not be processed
 * @return the result of the BigQuery write
 */
@Override
public WriteResult expand(
    PCollection<FailsafeElement<PubsubMessage, String>> failedRecords) {
  return failedRecords
      .apply(
          "FailedRecordToTableRow",
          ParDo.of(new FailedPubsubMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #19
Source File: DecodePubsubMessages.java From gcp-ingestion with Mozilla Public License 2.0 | 5 votes |
/**
 * Decoder from json to PubsubMessage.
 *
 * <p>Each input string is parsed with {@code Json.readPubsubMessage}. An {@link IOException}
 * from parsing is rethrown as an {@link UncheckedIOException}.
 *
 * @return a transform named {@code DecodePubsubMessages.Json}
 */
public static PTransform<PCollection<? extends String>, PCollection<PubsubMessage>> json() {
  return PTransform.compose(
      "DecodePubsubMessages.Json",
      input ->
          input.apply(
              MapElements.into(TypeDescriptor.of(PubsubMessage.class))
                  .via((String s) -> {
                    try {
                      return com.mozilla.telemetry.util.Json.readPubsubMessage(s);
                    } catch (IOException e) {
                      // The mapping lambda cannot throw checked exceptions.
                      throw new UncheckedIOException(e);
                    }
                  })));
}
Example #20
Source File: PubSubToBigQuery.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@Override public PCollectionTuple expand(PCollection<PubsubMessage> input) { PCollectionTuple udfOut = input // Map the incoming messages into FailsafeElements so we can recover from failures // across multiple transforms. .apply("MapToRecord", ParDo.of(new PubsubMessageToFailsafeElementFn())) .apply( "InvokeUDF", FailsafeJavascriptUdf.<PubsubMessage>newBuilder() .setFileSystemPath(options.getJavascriptTextTransformGcsPath()) .setFunctionName(options.getJavascriptTextTransformFunctionName()) .setSuccessTag(UDF_OUT) .setFailureTag(UDF_DEADLETTER_OUT) .build()); // Convert the records which were successfully processed by the UDF into TableRow objects. PCollectionTuple jsonToTableRowOut = udfOut .get(UDF_OUT) .apply( "JsonToTableRow", FailsafeJsonToTableRow.<PubsubMessage>newBuilder() .setSuccessTag(TRANSFORM_OUT) .setFailureTag(TRANSFORM_DEADLETTER_OUT) .build()); // Re-wrap the PCollections so we can return a single PCollectionTuple return PCollectionTuple.of(UDF_OUT, udfOut.get(UDF_OUT)) .and(UDF_DEADLETTER_OUT, udfOut.get(UDF_DEADLETTER_OUT)) .and(TRANSFORM_OUT, jsonToTableRowOut.get(TRANSFORM_OUT)) .and(TRANSFORM_DEADLETTER_OUT, jsonToTableRowOut.get(TRANSFORM_DEADLETTER_OUT)); }
Example #21
Source File: RepublishPerDocType.java From gcp-ingestion with Mozilla Public License 2.0 | 5 votes |
@Override public int partitionFor(PubsubMessage message, int numPartitions) { message = PubsubConstraints.ensureNonNull(message); String docType = message.getAttribute("document_type"); String namespace = message.getAttribute("document_namespace"); for (int i = 0; i < destinations.size(); i++) { if (destinations.get(i).matches(namespace, docType)) { return i; } } // The last partition catches docTypes that aren't configured to have a destination; // these will be ignored. return numPartitions - 1; }
Example #22
Source File: Write.java From gcp-ingestion with Mozilla Public License 2.0 | 5 votes |
/**
 * Re-encodes a message as Avro using the schema looked up from its attributes.
 *
 * <p>On success the Avro bytes are emitted to {@code successTag} with the original attributes.
 * Any exception sends a failure message to {@code errorTag} instead.
 *
 * @param ctx the context supplying the input message and receiving the tagged outputs
 */
@ProcessElement
public void processElement(ProcessContext ctx) {
  final PubsubMessage message = ctx.element();
  final Map<String, String> attributes = message.getAttributeMap();

  try {
    final Schema avroSchema = getStore().getSchema(attributes);
    final GenericRecord formatted = formatter.formatRecord(message, avroSchema);
    final byte[] encoded = binaryEncoder.encodeRecord(formatted, avroSchema);
    ctx.output(successTag, new PubsubMessage(encoded, attributes));
  } catch (Exception e) {
    ctx.output(errorTag, FailureMessage.of(this, message, e));
  }
}
Example #23
Source File: RepublishPerNamespace.java From gcp-ingestion with Mozilla Public License 2.0 | 5 votes |
@Override public int partitionFor(PubsubMessage message, int numPartitions) { message = PubsubConstraints.ensureNonNull(message); String namespace = message.getAttribute("document_namespace"); for (int i = 0; i < destinations.size(); i++) { if (destinations.get(i).matches(namespace)) { return i; } } // The last partition catches docTypes that aren't configured to have a destination; // these will be ignored. return numPartitions - 1; }
Example #24
Source File: RepublishPerChannel.java From gcp-ingestion with Mozilla Public License 2.0 | 5 votes |
@Override public PDone expand(PCollection<PubsubMessage> input) { List<Destination> destinations = baseOptions.getPerChannelSampleRatios().entrySet().stream() // .map(Destination::new) // .collect(Collectors.toList()); int numDestinations = destinations.size(); int numPartitions = numDestinations + 1; PCollectionList<PubsubMessage> partitioned = input.apply("PartitionByChannel", Partition.of(numPartitions, new PartitionFn(destinations))); for (int i = 0; i < numDestinations; i++) { Destination destination = destinations.get(i); RepublisherOptions.Parsed opts = baseOptions.as(RepublisherOptions.Parsed.class); // The destination pattern here must be compile-time due to a detail of Dataflow's // streaming PubSub producer implementation; if that restriction is lifted in the future, // this can become a runtime parameter and we can perform replacement via NestedValueProvider. opts.setOutput(StaticValueProvider .of(baseOptions.getPerChannelDestination().replace("${channel}", destination.channel))); partitioned.get(i) // .apply("Sample" + destination.getCapitalizedChannel() + "BySampleIdOrRandomNumber", Filter.by(message -> { message = PubsubConstraints.ensureNonNull(message); String sampleId = message.getAttribute("sample_id"); return RandomSampler.filterBySampleIdOrRandomNumber(sampleId, destination.ratio); })) .apply("Republish" + destination.getCapitalizedChannel() + "Sample", opts.getOutputType().write(opts)); } return PDone.in(input.getPipeline()); }
Example #25
Source File: Write.java From gcp-ingestion with Mozilla Public License 2.0 | 5 votes |
@Override public WithFailures.Result<PDone, PubsubMessage> expand(PCollection<PubsubMessage> input) { ValueProvider<DynamicPathTemplate> pathTemplate = NestedValueProvider.of(outputPrefix, DynamicPathTemplate::new); ValueProvider<String> staticPrefix = NestedValueProvider.of(pathTemplate, value -> value.staticPrefix); FileIO.Write<List<String>, PubsubMessage> write = FileIO .<List<String>, PubsubMessage>writeDynamic() // We can't pass the attribute map to by() directly since MapCoder isn't // deterministic; // instead, we extract an ordered list of the needed placeholder values. // That list is later available to withNaming() to determine output location. .by(message -> pathTemplate.get() .extractValuesFrom(DerivedAttributesMap.of(message.getAttributeMap()))) .withDestinationCoder(ListCoder.of(StringUtf8Coder.of())) // .withCompression(compression) // .via(Contextful.fn(format::encodeSingleMessage), TextIO.sink()) // .to(staticPrefix) // .withNaming(placeholderValues -> NoColonFileNaming.defaultNaming( pathTemplate.get().replaceDynamicPart(placeholderValues), format.suffix())); if (inputType == InputType.pubsub) { // Passing a ValueProvider to withNumShards disables runner-determined sharding, so we // need to be careful to pass this only for streaming input (where runner-determined // sharding is not an option). write = write.withNumShards(numShards); } input // .apply(Window.<PubsubMessage>into(FixedWindows.of(windowDuration)) // We allow lateness up to the maximum Cloud Pub/Sub retention of 7 days documented in // https://cloud.google.com/pubsub/docs/subscriber .withAllowedLateness(Duration.standardDays(7)) // .discardingFiredPanes()) .apply(write); return WithFailures.Result.of(PDone.in(input.getPipeline()), EmptyErrors.in(input.getPipeline())); }
Example #26
Source File: CdcPCollectionsFetchers.java From DataflowTemplates with Apache License 2.0 | 5 votes |
/**
 * Builds one decoded row collection per configured topic.
 *
 * <p>Each source is read from its subscription when one is configured, otherwise directly from
 * its topic. The message payloads are then decoded into rows using the source's schema.
 *
 * @param p the pipeline to attach the reads to
 * @return a map from topic name to the collection of rows read from it
 */
public Map<String, PCollection<Row>> changelogPcollections(Pipeline p) {
  final String project = options.as(GcpOptions.class).getProject();
  final List<TopicSubscriptionSchema> readSourceSchemas =
      buildTopicSubscriptionSchemas(
          project, options.getInputTopics(), options.getInputSubscriptions());

  final Map<String, PCollection<Row>> rowsByTopic = new HashMap<>();
  for (TopicSubscriptionSchema source : readSourceSchemas) {
    final String transformTopicPrefix = source.topic;

    // Prefer the subscription when present; otherwise read straight from the topic.
    final PubsubIO.Read<PubsubMessage> read =
        (source.subscription == null)
            ? PubsubIO.readMessagesWithAttributes()
                .fromTopic(String.format("projects/%s/topics/%s", project, source.topic))
            : PubsubIO.readMessagesWithAttributes()
                .fromSubscription(
                    String.format(
                        "projects/%s/subscriptions/%s", project, source.subscription));

    final PCollection<PubsubMessage> pubsubData =
        p.apply(String.format("%s/Read Updates from PubSub", transformTopicPrefix), read);

    final PCollection<Row> decodedRows =
        pubsubData
            .apply(
                String.format("%s/Extract payload", transformTopicPrefix),
                MapElements.into(TypeDescriptor.of(byte[].class))
                    .via(PubsubMessage::getPayload))
            .apply(
                String.format("%s/Decode", transformTopicPrefix),
                DecodeRows.withSchema(source.schema));

    rowsByTopic.put(transformTopicPrefix, decodedRows);
  }
  return rowsByTopic;
}
Example #27
Source File: StreamingDataGeneratorTest.java From DataflowTemplates with Apache License 2.0 | 5 votes |
/** Tests the {@link MessageGeneratorFn} generates fake data. */ @Test public void testMessageGenerator() throws IOException { // Arrange // String schema = "{" + "\"id\": \"{{uuid()}}\", " + "\"eventTime\": \"{{timestamp()}}\", " + "\"username\": \"{{username()}}\", " + "\"score\": {{integer(0,100)}}" + "}"; File file = tempFolder.newFile(); writeToFile(file.getAbsolutePath(), schema); // Act // PCollection<PubsubMessage> results = pipeline .apply("CreateInput", Create.of(0L)) .apply("GenerateMessage", ParDo.of(new MessageGeneratorFn(file.getAbsolutePath()))); // Assert // PAssert.that(results) .satisfies( input -> { PubsubMessage message = input.iterator().next(); assertThat(message, is(notNullValue())); assertThat(message.getPayload(), is(notNullValue())); assertThat(message.getAttributeMap(), is(notNullValue())); return null; }); pipeline.run(); }
Example #28
Source File: NexmarkLauncher.java From beam with Apache License 2.0 | 5 votes |
/**
 * Send {@code events} to Pubsub.
 *
 * <p>Messages are written to {@code pubsubTopic} with an id attribute. A timestamp attribute is
 * added unless {@code configuration.usePubsubPublishTime} is set.
 *
 * @param events the events to publish
 * @throws IllegalStateException if {@code pubsubTopic} has not been set
 */
private void sinkEventsToPubsub(PCollection<Event> events) {
  checkState(pubsubTopic != null, "Pubsub topic needs to be set up before initializing sink");
  NexmarkUtils.console("Writing events to Pubsub %s", pubsubTopic);

  final PubsubIO.Write<PubsubMessage> baseWrite =
      PubsubIO.writeMessages().to(pubsubTopic).withIdAttribute(NexmarkUtils.PUBSUB_ID);
  final PubsubIO.Write<PubsubMessage> write =
      configuration.usePubsubPublishTime
          ? baseWrite
          : baseWrite.withTimestampAttribute(NexmarkUtils.PUBSUB_TIMESTAMP);

  events
      .apply(queryName + ".EventToPubsubMessage", ParDo.of(new EventPubsubMessageDoFn()))
      .apply(queryName + ".WritePubsubEvents", write);
}
Example #29
Source File: Read.java From gcp-ingestion with Mozilla Public License 2.0 | 5 votes |
@Override public PCollection<PubsubMessage> expand(PBegin input) { return input // .apply(PubsubIO.readMessagesWithAttributesAndMessageId().fromSubscription(subscription)) .apply(MapElements.into(TypeDescriptor.of(PubsubMessage.class)).via(message -> { Map<String, String> attributesWithMessageId = new HashMap<>(message.getAttributeMap()); attributesWithMessageId.put(Attribute.MESSAGE_ID, message.getMessageId()); return new PubsubMessage(message.getPayload(), attributesWithMessageId); })); }
Example #30
Source File: IpPrivacyDecoder.java From gcp-ingestion with Mozilla Public License 2.0 | 5 votes |
/** * Execute an Apache Beam pipeline and return the {@code PipelineResult}. */ public static PipelineResult run(IpPrivacyDecoderOptions.Parsed options) { final Pipeline pipeline = Pipeline.create(options); final List<PCollection<PubsubMessage>> errorCollections = new ArrayList<>(); // We wrap pipeline in Optional for more convenience in chaining together transforms. Optional.of(pipeline) // .map(p -> p // .apply(options.getInputType().read(options)) // .apply(ParseUri.of()).failuresTo(errorCollections) // .apply("RestrictToMainPings", Filter .by((message) -> "main".equals(message.getAttribute(Attribute.DOCUMENT_TYPE)))) .apply(ParseProxy.of()) // .apply(ParseIp.of()) // .apply(GeoCityLookup.of(options.getGeoCityDatabase(), options.getGeoCityFilter())) // .apply(DecompressPayload.enabled(options.getDecompressInputPayloads())) // .apply(ExtractClientIdAndDropPayload.of()).failuresTo(errorCollections) // .apply(HashClientInfo.of(options.getClientIdHashKey(), options.getClientIpHashKey())) // .apply(NormalizeAttributes.of())) // .map(p -> p // .apply(RemoveAttributes.of()) // .apply(options.getOutputType().write(options)).failuresTo(errorCollections)); // Write error output collections. PCollectionList.of(errorCollections) // .apply("FlattenErrorCollections", Flatten.pCollections()) // .apply("WriteErrorOutput", options.getErrorOutputType().write(options)) // .output(); return pipeline.run(); }