org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage Java Examples

The following examples show how to use org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: PubsubMessageToHopRowFn.java    From hop with Apache License 2.0 6 votes vote down vote up
/**
 * Wraps the incoming {@link PubsubMessage} in a {@link HopRow} and emits it.
 *
 * <p>The row is allocated with {@code rowMeta.size()} slots and the message itself is stored in
 * the first slot. Any failure is counted, logged and rethrown as a {@link RuntimeException}.
 */
@ProcessElement
public void processElement( ProcessContext processContext ) {
  try {
    final PubsubMessage pubsubMessage = processContext.element();
    inputCounter.inc();

    // The message object is placed in the row as-is (it is Serializable).
    final Object[] rowData = RowDataUtil.allocateRowData( rowMeta.size() );
    rowData[ 0 ] = pubsubMessage;
    processContext.output( new HopRow( rowData ) );
    writtenCounter.inc();
  } catch ( Exception e ) {
    final String errorText = "Error in pub/sub publish messages function";
    numErrors.inc();
    LOG.error( errorText, e );
    throw new RuntimeException( errorText, e );
  }
}
 
Example #2
Source File: DecryptPioneerPayloadsTest.java    From gcp-ingestion with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Minimal end-to-end check: pushes a single ciphertext document through
 * {@link DecryptPioneerPayloads} and compares the re-encoded JSON with the plaintext fixture.
 */
@Test
public void testOutput() throws Exception {
  // Transform configuration, wrapped in ValueProviders created by the test pipeline.
  final ValueProvider<String> metadataLocation = pipeline
      .newProvider(Resources.getResource("pioneer/metadata-local.json").getPath());
  final ValueProvider<Boolean> kmsEnabled = pipeline.newProvider(false);
  final ValueProvider<Boolean> decompressPayload = pipeline.newProvider(true);

  // Fixtures: the encrypted input and the plaintext it is expected to decrypt to.
  final List<String> ciphertextLines = readTestFiles(
      Arrays.asList("pioneer/study-foo.ciphertext.json"));
  final List<String> plaintextLines = readTestFiles(
      Arrays.asList("pioneer/sample.plaintext.json"));

  // Stage 1: decode the text input and replace its attributes with the document identifiers.
  final PCollection<PubsubMessage> encrypted = pipeline.apply(Create.of(ciphertextLines))
      .apply(InputFileFormat.text.decode())
      .apply("AddAttributes", MapElements.into(TypeDescriptor.of(PubsubMessage.class))
          .via(element -> new PubsubMessage(element.getPayload(),
              ImmutableMap.of(Attribute.DOCUMENT_NAMESPACE, "telemetry", Attribute.DOCUMENT_TYPE,
                  "pioneer-study", Attribute.DOCUMENT_VERSION, "4"))));

  // Stage 2: decrypt, then re-encode as normalized JSON text for comparison.
  final PCollection<String> output = encrypted
      .apply(DecryptPioneerPayloads.of(metadataLocation, kmsEnabled, decompressPayload)).output()
      .apply(OutputFileFormat.text.encode()).apply(ReformatJson.of());

  PAssert.that(output).containsInAnyOrder(plaintextLines);

  pipeline.run();
}
 
Example #3
Source File: RowToPubsubMessage.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Converts each {@link Row} into a {@link PubsubMessage} carrying the row's JSON encoding.
 *
 * <p>When {@code config.useTimestampAttribute()} is true, each element's timestamp is first set
 * from the row's {@code event_timestamp} field. That field is then dropped, the remaining fields
 * are serialized with {@link ToJson}, and the JSON text becomes the message payload. The emitted
 * messages carry an empty attribute map.
 *
 * @param input rows to publish; each must contain an {@code event_timestamp} field
 * @return one message per input row
 */
@Override
public PCollection<PubsubMessage> expand(PCollection<Row> input) {
  PCollection<Row> withTimestamp =
      (config.useTimestampAttribute())
          ? input.apply(
              WithTimestamps.of((row) -> row.getDateTime("event_timestamp").toInstant()))
          : input;

  return withTimestamp
      .apply(DropFields.fields("event_timestamp"))
      .apply(ToJson.of())
      .apply(
          MapElements.into(TypeDescriptor.of(PubsubMessage.class))
              .via(
                  (String json) ->
                      // JSON is exchanged as UTF-8; encoding as ISO-8859-1 would replace every
                      // character outside Latin-1 with '?' and corrupt the payload.
                      new PubsubMessage(
                          json.getBytes(StandardCharsets.UTF_8), ImmutableMap.of())));
}
 
Example #4
Source File: PubsubToAvroTest.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that {@link PubsubMessageToArchiveDoFn} turns a timestamped message into the matching
 * {@link AvroPubsubMessageRecord}.
 */
@Test
@Category(NeedsRunner.class)
public void testPubsubMessageToArchive() throws Exception {
  // Fixture: a single message with one attribute, stamped with the current time.
  final byte[] payload = "Laces out Dan!".getBytes();
  final Map<String, String> attributes = ImmutableMap.of("id", "Ace");
  final Instant timestamp = Instant.now();
  final TimestampedValue<PubsubMessage> element =
      TimestampedValue.of(new PubsubMessage(payload, attributes), timestamp);

  // Run the DoFn under test over that one element.
  final PCollection<AvroPubsubMessageRecord> results =
      pipeline.apply(Create.timestamped(element)).apply(ParDo.of(new PubsubMessageToArchiveDoFn()));

  // The record must carry the same payload and attributes plus the element timestamp in millis.
  final AvroPubsubMessageRecord expected =
      new AvroPubsubMessageRecord(payload, attributes, timestamp.getMillis());
  PAssert.that(results).containsInAnyOrder(expected);

  pipeline.run();
}
 
Example #5
Source File: LimitPayloadSize.java    From gcp-ingestion with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Factory method to create mapper instance.
 *
 * <p>The returned transform passes through every message whose payload is at most
 * {@code maxBytes} bytes long. A larger payload increments the {@code payload_too_large} counter
 * and raises {@link PayloadTooLargeException}, which the failure handler converts into a
 * {@link FailureMessage}.
 *
 * @param maxBytes largest payload length, in bytes, that is still accepted
 * @return a mapper whose failure handler wraps oversized messages via {@link FailureMessage}
 */
public static MapWithFailures<PubsubMessage, PubsubMessage, PubsubMessage> of(int maxBytes) {
  final Counter countPayloadTooLarge = Metrics.counter(LimitPayloadSize.class,
      "payload_too_large");
  return MapElements.into(TypeDescriptor.of(PubsubMessage.class)).via((PubsubMessage msg) -> {
    msg = PubsubConstraints.ensureNonNull(msg);
    int numBytes = msg.getPayload().length;
    if (numBytes > maxBytes) {
      countPayloadTooLarge.inc();
      // The original text lacked the space before "bytes" and rendered as e.g. "510bytes".
      throw new PayloadTooLargeException("Message payload is " + numBytes
          + " bytes, larger than the configured limit of " + maxBytes);
    }
    return msg;
  }).exceptionsInto(TypeDescriptor.of(PubsubMessage.class))
      .exceptionsVia((ExceptionElement<PubsubMessage> ee) -> {
        // Rethrow so that only PayloadTooLargeException is handled here; anything else
        // propagates out of the handler unchanged.
        try {
          throw ee.exception();
        } catch (PayloadTooLargeException e) {
          return FailureMessage.of(LimitPayloadSize.class.getSimpleName(), ee.element(),
              ee.exception());
        }
      });
}
 
Example #6
Source File: LimitPayloadSizeTest.java    From gcp-ingestion with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Verifies that payloads within the configured limit reach the main output and that larger
 * payloads are routed to the failure output with their payload intact.
 */
@Test
public void testLimit() {
  // Lengths 0, 7 and 350 characters: all within the 500 passed to toBytes() below.
  List<String> passingPayloads = ImmutableList.of("", "abcdefg",
      StringUtils.repeat("abcdefg", 50));
  // 10 characters repeated 51 times = 510 characters, just over the limit.
  List<String> failingPayloads = ImmutableList.of(StringUtils.repeat("abcdefghij", 51));

  WithFailures.Result<PCollection<PubsubMessage>, PubsubMessage> result = pipeline //
      .apply(Create.of(Iterables.concat(passingPayloads, failingPayloads))) //
      .apply(InputFileFormat.text.decode()) //
      .apply("LimitPayloadSize", LimitPayloadSize.toBytes(500));

  // Successful messages, converted back to strings, must be exactly the passing payloads.
  PAssert
      .that(result.output().apply("get success payload",
          MapElements.into(TypeDescriptors.strings()).via(m -> new String(m.getPayload())))) //
      .containsInAnyOrder(passingPayloads);
  // Failure messages must still carry the original (oversized) payload.
  PAssert
      .that(result.failures().apply("get failure payload",
          MapElements.into(TypeDescriptors.strings()).via(m -> new String(m.getPayload())))) //
      .containsInAnyOrder(failingPayloads);

  pipeline.run();
}
 
Example #7
Source File: PubsubSink.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Stores the sink configuration and extracts the element coder from the windowed-value coder.
 *
 * @param topic stored in {@code this.topic}
 * @param timestampLabel stored in {@code this.timestampLabel}
 * @param idLabel stored in {@code this.idLabel}
 * @param coder coder for windowed values; must actually be a {@link WindowedValueCoder}
 * @param formatFn function converting an element into a {@link PubsubMessage}
 * @param withAttributes stored in {@code this.withAttributes}
 * @param context streaming execution context this sink is attached to
 */
PubsubSink(
    String topic,
    String timestampLabel,
    String idLabel,
    Coder<WindowedValue<T>> coder,
    SimpleFunction<T, PubsubMessage> formatFn,
    boolean withAttributes,
    StreamingModeExecutionContext context) {
  // Narrow the coder so that the coder of the wrapped value can be pulled out of it.
  @SuppressWarnings({"unchecked", "rawtypes"})
  WindowedValueCoder<T> valueAwareCoder = (WindowedValueCoder) coder;

  this.context = context;
  this.formatFn = formatFn;
  this.withAttributes = withAttributes;
  this.coder = valueAwareCoder.getValueCoder();
  this.idLabel = idLabel;
  this.timestampLabel = timestampLabel;
  this.topic = topic;
}
 
Example #8
Source File: PubsubConstraints.java    From gcp-ingestion with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Emits a copy of {@code message} whose attribute keys and values have been passed through
 * {@code truncateAttributeKey} and {@code truncateAttributeValue}.
 *
 * <p>A counter is incremented for every key, and every non-null value, that comes back changed.
 * The payload is forwarded untouched.
 */
@ProcessElement
public void processElement(@Element PubsubMessage message,
    OutputReceiver<PubsubMessage> out) {
  final Map<String, String> source = message.getAttributeMap();
  final Map<String, String> truncated = new HashMap<>(source.size());

  for (Map.Entry<String, String> attribute : source.entrySet()) {
    final String rawKey = attribute.getKey();
    final String rawValue = attribute.getValue();

    final String key = truncateAttributeKey(rawKey);
    if (!key.equals(rawKey)) {
      countTruncatedKey.inc();
    }

    final String value = truncateAttributeValue(rawValue);
    // A null result is stored as-is and is never counted as a truncation.
    if (value != null && !value.equals(rawValue)) {
      countTruncatedValue.inc();
    }

    truncated.put(key, value);
  }

  out.output(new PubsubMessage(message.getPayload(), truncated));
}
 
Example #9
Source File: Deduplicate.java    From gcp-ingestion with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Routes {@code element} to the duplicate output when its id is already known to
 * {@code redisIdService}, to the main output otherwise, and to the error output when the lookup
 * itself fails.
 */
@ProcessElement
public void processElement(@Element PubsubMessage element, MultiOutputReceiver out) {
  element = PubsubConstraints.ensureNonNull(element);

  boolean seenBefore;
  try {
    // Throws IllegalArgumentException if id is present and invalid
    seenBefore = getId(element).filter(redisIdService::exists).isPresent();
  } catch (Exception e) {
    // The lookup failed: report the element on the error output and emit nothing else for it.
    out.get(errorTag).output(FailureMessage.of(RemoveDuplicates.this, element, e));
    return;
  }

  if (!seenBefore) {
    out.get(outputTag).output(element);
    return;
  }

  // Known id: record count and payload size of the duplicate before diverting it.
  PerDocTypeCounter.inc(element.getAttributeMap(), "duplicate_submission");
  PerDocTypeCounter.inc(element.getAttributeMap(), "duplicate_submission_bytes",
      element.getPayload().length);
  out.get(duplicateTag).output(element);
}
 
Example #10
Source File: DecompressPayload.java    From gcp-ingestion with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Returns {@code message} with its payload gunzipped, or unchanged when decompression is
 * disabled or the payload is not valid gzip.
 *
 * <p>The attribute map is carried over as-is. {@code compressedInput} is incremented for every
 * payload that was decompressed and {@code uncompressedInput} for every payload that failed to
 * decompress.
 *
 * @param message the message whose payload may be gzip-compressed
 * @return a new message holding the decompressed payload, or the original message
 */
@Override
public PubsubMessage apply(PubsubMessage message) {
  if (enabled.isAccessible() && !enabled.get()) {
    // Compression has been explicitly turned off in options, so return unchanged message.
    return message;
  }
  // try-with-resources closes the GZIPInputStream, which releases the native memory held by
  // its Inflater immediately instead of waiting for garbage collection.
  try (GZIPInputStream gzipStream =
      new GZIPInputStream(new ByteArrayInputStream(message.getPayload()))) {
    ByteArrayOutputStream decompressedStream = new ByteArrayOutputStream();
    // Throws IOException
    IOUtils.copy(gzipStream, decompressedStream);
    compressedInput.inc();
    return new PubsubMessage(decompressedStream.toByteArray(), message.getAttributeMap());
  } catch (IOException ignore) {
    // payload wasn't valid gzip, assume it wasn't compressed
    uncompressedInput.inc();
    return message;
  }
}
 
Example #11
Source File: ParsePayload.java    From gcp-ingestion with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Applies {@code Fn} to every message as a flat-map and converts the listed parse failures
 * into failure messages.
 *
 * @param messages the messages to parse
 * @return a result pairing the flat-mapped output with the collected failure messages
 */
@Override
public Result<PCollection<PubsubMessage>, PubsubMessage> expand(
    PCollection<PubsubMessage> messages) {
  return messages.apply(FlatMapElements.into(TypeDescriptor.of(PubsubMessage.class)) //
      .via(new Fn()) //
      .exceptionsInto(TypeDescriptor.of(PubsubMessage.class)) //
      .exceptionsVia((WithFailures.ExceptionElement<PubsubMessage> ee) -> {
        // Rethrow the captured exception so that only the types listed below are turned into
        // failure messages; any other exception propagates out of this handler.
        try {
          throw ee.exception();
        } catch (IOException | SchemaNotFoundException | ValidationException
            | MessageScrubberException e) {
          return FailureMessage.of(ParsePayload.class.getSimpleName(), //
              ee.element(), //
              ee.exception());
        }
      }));
}
 
Example #12
Source File: RemoveAttributes.java    From gcp-ingestion with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Returns a copy of {@code message} that keeps only the attributes named in
 * {@code ATTRIBUTES_TO_KEEP}.
 *
 * <p>Before filtering, {@code geo_country} is set to the value of the normalized country code
 * attribute, so it survives only if it is itself listed in {@code ATTRIBUTES_TO_KEEP}.
 * Attributes whose value is null are omitted. The input message is never modified.
 *
 * @param message the message to strip; passed through {@code PubsubConstraints.ensureNonNull}
 * @return a new message with the same payload and the reduced attribute map
 */
@Override
public PubsubMessage apply(PubsubMessage message) {
  message = PubsubConstraints.ensureNonNull(message);

  // Work on a private copy: writing into the map returned by getAttributeMap() would mutate
  // the input element (which Beam forbids) and fails outright when that map is immutable.
  final Map<String, String> attributes = new HashMap<>(message.getAttributeMap());

  Map<String, String> strippedAttributes = new HashMap<>();

  // Use geo_country to match IP privacy v1 dataset
  attributes.put(Attribute.GEO_COUNTRY,
      message.getAttribute(Attribute.NORMALIZED_COUNTRY_CODE));

  ATTRIBUTES_TO_KEEP.forEach(name -> Optional.ofNullable(attributes.get(name))
      .ifPresent(value -> strippedAttributes.put(name, value)));

  return new PubsubMessage(message.getPayload(), strippedAttributes);
}
 
Example #13
Source File: Sink.java    From gcp-ingestion with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Builds the sink pipeline from {@code options}, runs it and returns the
 * {@code PipelineResult}.
 *
 * <p>Input is read, optionally decompressed and written to the configured output; failures
 * reported by the write step are flattened and written to the configured error output.
 */
public static PipelineResult run(SinkOptions.Parsed options) {
  final Pipeline pipeline = Pipeline.create(options);
  final List<PCollection<PubsubMessage>> failureCollections = new ArrayList<>();

  // Main path: read -> decompress.
  final PCollection<PubsubMessage> decompressed = pipeline //
      .apply(options.getInputType().read(options)) //
      .apply(DecompressPayload.enabled(options.getDecompressInputPayloads()));

  // Write the main output, registering the write step's failures for the error path.
  decompressed //
      .apply(options.getOutputType().write(options)) //
      .failuresTo(failureCollections);

  // Error path: merge every registered failure collection and write it out.
  final PCollectionList<PubsubMessage> allFailures = PCollectionList.of(failureCollections);
  allFailures //
      .apply("FlattenFailureCollections", Flatten.pCollections()) //
      .apply("WriteErrorOutput", options.getErrorOutputType().write(options)) //
      .output();

  return pipeline.run();
}
 
Example #14
Source File: StreamingDataGenerator.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Generates one JSON document from {@code schema} and emits it as the payload of a
 * {@link PubsubMessage} with an empty attribute map.
 *
 * @throws IOException if closing the in-memory buffer fails
 * @throws JsonDataGeneratorException declared for the JSON generation step
 */
@ProcessElement
public void processElement(@Element Long element, @Timestamp Instant timestamp,
    OutputReceiver<PubsubMessage> receiver, ProcessContext context)
    throws IOException, JsonDataGeneratorException {

  // Generate the fake JSON according to the schema.
  final byte[] payload;
  try (ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
    dataGenerator.generateTestDataJson(schema, buffer);
    payload = buffer.toByteArray();
  }

  // TODO: Add the ability to place eventId and eventTimestamp in the attributes.
  final Map<String, String> attributes = new HashMap<>();

  receiver.output(new PubsubMessage(payload, attributes));
}
 
Example #15
Source File: PubSubToMongoDB.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Wraps the incoming messages in {@code FailsafeElement}s and transforms them either with the
 * configured JavaScript UDF or, when no UDF path is set, with {@code ProcessFailsafePubSubFn}.
 *
 * @param input the messages read from Pub/Sub
 * @return the tuple produced by whichever of the two transforms was applied
 */
@Override
public PCollectionTuple expand(PCollection<PubsubMessage> input) {

  // Wrapping each message lets later transforms recover from failures while keeping the
  // original payload available.
  final PCollection<FailsafeElement<PubsubMessage, String>> wrapped =
          input.apply("MapToRecord", ParDo.of(new PubsubMessageToFailsafeElementFn()));

  // Without a UDF the messages are processed by the built-in DoFn.
  if (javascriptTextTransformGcsPath() == null) {
    return wrapped.apply(
            "ProcessPubSubMessages",
            ParDo.of(new ProcessFailsafePubSubFn())
                    .withOutputTags(TRANSFORM_OUT, TupleTagList.of(TRANSFORM_DEADLETTER_OUT)));
  }

  // A UDF was supplied, so let it parse the PubSubMessages.
  return wrapped.apply(
          "InvokeUDF",
          JavascriptTextTransformer.FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(javascriptTextTransformGcsPath())
                  .setFunctionName(javascriptTextTransformFunctionName())
                  .setSuccessTag(TRANSFORM_OUT)
                  .setFailureTag(TRANSFORM_DEADLETTER_OUT)
                  .build());
}
 
Example #16
Source File: Deduplicate.java    From gcp-ingestion with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Records the id of every element in {@code redisIdService} with an expiration and passes the
 * element through unchanged.
 *
 * <p>Nothing is recorded when {@code uri} is null, inaccessible or holds null. An
 * {@link IllegalArgumentException} raised while handling an element is converted into a
 * failure message.
 *
 * @param elements the messages whose ids should be recorded
 * @return a result pairing the unchanged elements with the collected failure messages
 */
@Override
public Result<PCollection<PubsubMessage>, PubsubMessage> expand(
    PCollection<PubsubMessage> elements) {
  return elements.apply(
      MapElements.into(TypeDescriptor.of(PubsubMessage.class)).via((PubsubMessage el) -> {
        el = PubsubConstraints.ensureNonNull(el);
        if (uri != null && uri.isAccessible() && uri.get() != null) {
          // Throws IllegalArgumentException if ID is present and invalid.
          getId(el).ifPresent(id -> redisIdService.setWithExpiration(id, getTtlSeconds()));
        }
        return el;
      }).exceptionsInto(TypeDescriptor.of(PubsubMessage.class))
          .exceptionsVia((ExceptionElement<PubsubMessage> ee) -> {
            // Rethrow so that only IllegalArgumentException becomes a failure message; any
            // other exception propagates out of this handler.
            try {
              throw ee.exception();
            } catch (IllegalArgumentException e) {
              return FailureMessage.of(Deduplicate.class.getSimpleName(), //
                  ee.element(), //
                  ee.exception());
            }
          }));
}
 
Example #17
Source File: PubSubToMongoDB.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the original Pub/Sub payload as a JSON object, adds every message attribute to it as
 * a property and emits the result as a {@code FailsafeElement}.
 *
 * <p>An empty payload yields an object containing only the attributes. A payload that is not
 * valid JSON is sent to {@code TRANSFORM_DEADLETTER_OUT} together with the error message and
 * stack trace.
 */
@ProcessElement
public void processElement(ProcessContext context) {
  PubsubMessage pubsubMessage = context.element().getOriginalPayload();

  JsonObject messageObject = new JsonObject();

  try {
    if (pubsubMessage.getPayload().length > 0) {
      // Decode explicitly as UTF-8: new String(byte[]) would use the JVM's default charset and
      // could garble non-ASCII text depending on where the worker runs.
      messageObject =
          gson.fromJson(
              new String(pubsubMessage.getPayload(), java.nio.charset.StandardCharsets.UTF_8),
              JsonObject.class);
    }

    // If message attributes are present they will be serialized along with the message payload
    if (pubsubMessage.getAttributeMap() != null) {
      pubsubMessage.getAttributeMap().forEach(messageObject::addProperty);
    }

    context.output(
            FailsafeElement.of(pubsubMessage, messageObject.toString()));
    successCounter.inc();

  } catch (JsonSyntaxException e) {
    context.output(
            TRANSFORM_DEADLETTER_OUT,
            FailsafeElement.of(context.element())
                    .setErrorMessage(e.getMessage())
                    .setStacktrace(Throwables.getStackTraceAsString(e)));
    failedCounter.inc();
  }
}
 
Example #18
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Converts failed records with {@code FailedPubsubMessageToTableRowFn} and appends them to the
 * BigQuery error table.
 *
 * <p>The table is taken from {@code getErrorRecordsTable()} and its schema from
 * {@code getErrorRecordsTableSchema()}; the table is created if needed and rows are appended.
 *
 * @param failedRecords the elements that failed earlier in the pipeline
 * @return the result of the BigQuery write
 */
@Override
public WriteResult expand(PCollection<FailsafeElement<PubsubMessage, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
 
Example #19
Source File: DecodePubsubMessages.java    From gcp-ingestion with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Returns a transform that parses each input string as a JSON-encoded {@link PubsubMessage}.
 *
 * <p>An {@link IOException} raised while parsing is rethrown as an
 * {@link UncheckedIOException}.
 */
public static PTransform<PCollection<? extends String>, PCollection<PubsubMessage>> json() {
  final MapElements<String, PubsubMessage> decodeJson = MapElements
      .into(TypeDescriptor.of(PubsubMessage.class)).via((String s) -> {
        try {
          return com.mozilla.telemetry.util.Json.readPubsubMessage(s);
        } catch (IOException e) {
          throw new UncheckedIOException(e);
        }
      });
  return PTransform.compose("DecodePubsubMessages.Json", input -> input.apply(decodeJson));
}
 
Example #20
Source File: PubSubToBigQuery.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the incoming messages through the JavaScript UDF and then converts the UDF's successful
 * output to table rows, exposing the success and dead-letter outputs of both steps.
 *
 * @param input the messages read from Pub/Sub
 * @return a tuple holding {@code UDF_OUT}, {@code UDF_DEADLETTER_OUT}, {@code TRANSFORM_OUT}
 *     and {@code TRANSFORM_DEADLETTER_OUT}
 */
@Override
public PCollectionTuple expand(PCollection<PubsubMessage> input) {

  PCollectionTuple udfOut =
      input
          // Map the incoming messages into FailsafeElements so we can recover from failures
          // across multiple transforms.
          .apply("MapToRecord", ParDo.of(new PubsubMessageToFailsafeElementFn()))
          .apply(
              "InvokeUDF",
              FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setSuccessTag(UDF_OUT)
                  .setFailureTag(UDF_DEADLETTER_OUT)
                  .build());

  // Convert the records which were successfully processed by the UDF into TableRow objects.
  PCollectionTuple jsonToTableRowOut =
      udfOut
          .get(UDF_OUT)
          .apply(
              "JsonToTableRow",
              FailsafeJsonToTableRow.<PubsubMessage>newBuilder()
                  .setSuccessTag(TRANSFORM_OUT)
                  .setFailureTag(TRANSFORM_DEADLETTER_OUT)
                  .build());

  // Re-wrap the PCollections so we can return a single PCollectionTuple
  return PCollectionTuple.of(UDF_OUT, udfOut.get(UDF_OUT))
      .and(UDF_DEADLETTER_OUT, udfOut.get(UDF_DEADLETTER_OUT))
      .and(TRANSFORM_OUT, jsonToTableRowOut.get(TRANSFORM_OUT))
      .and(TRANSFORM_DEADLETTER_OUT, jsonToTableRowOut.get(TRANSFORM_DEADLETTER_OUT));
}
 
Example #21
Source File: RepublishPerDocType.java    From gcp-ingestion with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Returns the index of the first destination matching the message's namespace and document
 * type, or {@code numPartitions - 1} when no destination matches.
 */
@Override
public int partitionFor(PubsubMessage message, int numPartitions) {
  message = PubsubConstraints.ensureNonNull(message);
  final String namespace = message.getAttribute("document_namespace");
  final String docType = message.getAttribute("document_type");

  int index = 0;
  while (index < destinations.size()) {
    if (destinations.get(index).matches(namespace, docType)) {
      return index;
    }
    index++;
  }

  // No configured destination matched: fall through to the last partition, whose contents
  // will be ignored.
  return numPartitions - 1;
}
 
Example #22
Source File: Write.java    From gcp-ingestion with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Re-encodes the message payload as Avro using the schema resolved from its attributes.
 *
 * <p>On success a new message with the Avro payload and the original attributes goes to
 * {@code successTag}; any exception sends a failure message to {@code errorTag} instead.
 */
@ProcessElement
public void processElement(ProcessContext ctx) {
  final PubsubMessage original = ctx.element();
  final Map<String, String> attributes = original.getAttributeMap();
  try {
    final Schema schema = getStore().getSchema(attributes);
    ctx.output(successTag,
        new PubsubMessage(
            binaryEncoder.encodeRecord(formatter.formatRecord(original, schema), schema),
            attributes));
  } catch (Exception e) {
    ctx.output(errorTag, FailureMessage.of(this, original, e));
  }
}
 
Example #23
Source File: RepublishPerNamespace.java    From gcp-ingestion with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Returns the index of the first destination matching the message's namespace, or
 * {@code numPartitions - 1} when no destination matches.
 */
@Override
public int partitionFor(PubsubMessage message, int numPartitions) {
  message = PubsubConstraints.ensureNonNull(message);
  final String namespace = message.getAttribute("document_namespace");

  int index = 0;
  while (index < destinations.size()) {
    if (destinations.get(index).matches(namespace)) {
      return index;
    }
    index++;
  }

  // No configured destination matched: fall through to the last partition, whose contents
  // will be ignored.
  return numPartitions - 1;
}
 
Example #24
Source File: RepublishPerChannel.java    From gcp-ingestion with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Partitions the input by channel and republishes a sample of each configured channel to its
 * own destination.
 *
 * <p>One destination is built per entry of {@code getPerChannelSampleRatios()}. The input is
 * split into one partition per destination plus one extra partition that is not republished.
 *
 * @param input the messages to sample and republish
 * @return {@link PDone} for the input's pipeline
 */
@Override
public PDone expand(PCollection<PubsubMessage> input) {
  final List<Destination> destinations =
      baseOptions.getPerChannelSampleRatios().entrySet().stream() //
          .map(Destination::new) //
          .collect(Collectors.toList());
  final int numDestinations = destinations.size();
  final int numPartitions = numDestinations + 1;

  final PCollectionList<PubsubMessage> partitioned = input.apply("PartitionByChannel",
      Partition.of(numPartitions, new PartitionFn(destinations)));

  int index = 0;
  for (Destination destination : destinations) {
    final RepublisherOptions.Parsed opts = baseOptions.as(RepublisherOptions.Parsed.class);

    // The destination pattern here must be compile-time due to a detail of Dataflow's
    // streaming PubSub producer implementation; if that restriction is lifted in the future,
    // this can become a runtime parameter and we can perform replacement via NestedValueProvider.
    final String resolvedDestination =
        baseOptions.getPerChannelDestination().replace("${channel}", destination.channel);
    opts.setOutput(StaticValueProvider.of(resolvedDestination));

    partitioned.get(index) //
        .apply("Sample" + destination.getCapitalizedChannel() + "BySampleIdOrRandomNumber",
            Filter.by(message -> {
              message = PubsubConstraints.ensureNonNull(message);
              return RandomSampler.filterBySampleIdOrRandomNumber(
                  message.getAttribute("sample_id"), destination.ratio);
            }))
        .apply("Republish" + destination.getCapitalizedChannel() + "Sample",
            opts.getOutputType().write(opts));
    index++;
  }

  return PDone.in(input.getPipeline());
}
 
Example #25
Source File: Write.java    From gcp-ingestion with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Writes the input messages to files whose location is derived per message from
 * {@code outputPrefix}.
 *
 * <p>Messages are grouped into fixed windows of {@code windowDuration} and written with
 * {@code FileIO.writeDynamic()}, using placeholder values extracted from each message's
 * attributes to pick the destination.
 *
 * @param input the messages to write
 * @return a result wrapping {@link PDone} and {@code EmptyErrors.in(...)} for the pipeline
 */
@Override
public WithFailures.Result<PDone, PubsubMessage> expand(PCollection<PubsubMessage> input) {
  // The path template and its static prefix are derived lazily from outputPrefix.
  ValueProvider<DynamicPathTemplate> pathTemplate = NestedValueProvider.of(outputPrefix,
      DynamicPathTemplate::new);
  ValueProvider<String> staticPrefix = NestedValueProvider.of(pathTemplate,
      value -> value.staticPrefix);

  FileIO.Write<List<String>, PubsubMessage> write = FileIO
      .<List<String>, PubsubMessage>writeDynamic()
      // We can't pass the attribute map to by() directly since MapCoder isn't
      // deterministic;
      // instead, we extract an ordered list of the needed placeholder values.
      // That list is later available to withNaming() to determine output location.
      .by(message -> pathTemplate.get()
          .extractValuesFrom(DerivedAttributesMap.of(message.getAttributeMap())))
      .withDestinationCoder(ListCoder.of(StringUtf8Coder.of())) //
      .withCompression(compression) //
      .via(Contextful.fn(format::encodeSingleMessage), TextIO.sink()) //
      .to(staticPrefix) //
      .withNaming(placeholderValues -> NoColonFileNaming.defaultNaming(
          pathTemplate.get().replaceDynamicPart(placeholderValues), format.suffix()));

  if (inputType == InputType.pubsub) {
    // Passing a ValueProvider to withNumShards disables runner-determined sharding, so we
    // need to be careful to pass this only for streaming input (where runner-determined
    // sharding is not an option).
    write = write.withNumShards(numShards);
  }

  input //
      .apply(Window.<PubsubMessage>into(FixedWindows.of(windowDuration))
          // We allow lateness up to the maximum Cloud Pub/Sub retention of 7 days documented in
          // https://cloud.google.com/pubsub/docs/subscriber
          .withAllowedLateness(Duration.standardDays(7)) //
          .discardingFiredPanes())
      .apply(write);
  // NOTE(review): EmptyErrors.in(...) is presumably an empty failure collection, i.e. this
  // transform reports no per-message failures - confirm against EmptyErrors.
  return WithFailures.Result.of(PDone.in(input.getPipeline()),
      EmptyErrors.in(input.getPipeline()));
}
 
Example #26
Source File: CdcPCollectionsFetchers.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Builds one {@code PCollection<Row>} per configured topic, keyed by topic name.
 *
 * <p>Each source is read from its subscription when one is configured and from the topic
 * itself otherwise; the message payloads are then decoded with the source's schema.
 *
 * @param p the pipeline the reads are attached to
 * @return a map from topic name to the decoded rows read for that topic
 */
public Map<String, PCollection<Row>> changelogPcollections(Pipeline p) {
  final Map<String, PCollection<Row>> rowsByTopic = new HashMap<>();

  final List<TopicSubscriptionSchema> sources = buildTopicSubscriptionSchemas(
      options.as(GcpOptions.class).getProject(),
      options.getInputTopics(),
      options.getInputSubscriptions());

  for (TopicSubscriptionSchema source : sources) {
    final String stepPrefix = source.topic;

    // Pick the read transform first; both variants are applied under the same step name.
    final PubsubIO.Read<PubsubMessage> read;
    if (source.subscription != null) {
      read = PubsubIO.readMessagesWithAttributes().fromSubscription(String.format(
          "projects/%s/subscriptions/%s",
          options.as(GcpOptions.class).getProject(), source.subscription));
    } else {
      read = PubsubIO.readMessagesWithAttributes().fromTopic(String.format(
          "projects/%s/topics/%s",
          options.as(GcpOptions.class).getProject(), source.topic));
    }
    final PCollection<PubsubMessage> pubsubData =
        p.apply(String.format("%s/Read Updates from PubSub", stepPrefix), read);

    // Strip the message envelope, then decode the raw bytes with the source's schema.
    final PCollection<Row> decodedRows = pubsubData
        .apply(String.format("%s/Extract payload", stepPrefix),
            MapElements.into(TypeDescriptor.of(byte[].class))
                .via(PubsubMessage::getPayload))
        .apply(
            String.format("%s/Decode", stepPrefix),
            DecodeRows.withSchema(source.schema));

    rowsByTopic.put(stepPrefix, decodedRows);
  }
  return rowsByTopic;
}
 
Example #27
Source File: StreamingDataGeneratorTest.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that {@link MessageGeneratorFn} emits a message with a non-null payload and a non-null
 * attribute map for a schema file written to a temporary folder.
 */
@Test
public void testMessageGenerator() throws IOException {
  // Arrange: write the generator schema to a temporary file.
  final String schema =
      "{"
          + "\"id\": \"{{uuid()}}\", "
          + "\"eventTime\": \"{{timestamp()}}\", "
          + "\"username\": \"{{username()}}\", "
          + "\"score\": {{integer(0,100)}}"
          + "}";
  final String schemaPath = tempFolder.newFile().getAbsolutePath();
  writeToFile(schemaPath, schema);

  // Act: drive the generator with a single input element.
  final PCollection<PubsubMessage> results =
      pipeline
          .apply("CreateInput", Create.of(0L))
          .apply("GenerateMessage", ParDo.of(new MessageGeneratorFn(schemaPath)));

  // Assert: the first generated message and its parts are all non-null.
  PAssert.that(results)
      .satisfies(
          messages -> {
            final PubsubMessage first = messages.iterator().next();

            assertThat(first, is(notNullValue()));
            assertThat(first.getPayload(), is(notNullValue()));
            assertThat(first.getAttributeMap(), is(notNullValue()));

            return null;
          });

  pipeline.run();
}
 
Example #28
Source File: NexmarkLauncher.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Converts {@code events} to Pub/Sub messages and writes them to {@code pubsubTopic}.
 *
 * <p>The write always uses the id attribute; the timestamp attribute is added only when
 * {@code configuration.usePubsubPublishTime} is false.
 */
private void sinkEventsToPubsub(PCollection<Event> events) {
  checkState(pubsubTopic != null, "Pubsub topic needs to be set up before initializing sink");
  NexmarkUtils.console("Writing events to Pubsub %s", pubsubTopic);

  final PubsubIO.Write<PubsubMessage> baseWrite =
      PubsubIO.writeMessages().to(pubsubTopic).withIdAttribute(NexmarkUtils.PUBSUB_ID);
  final PubsubIO.Write<PubsubMessage> io =
      configuration.usePubsubPublishTime
          ? baseWrite
          : baseWrite.withTimestampAttribute(NexmarkUtils.PUBSUB_TIMESTAMP);

  final PCollection<PubsubMessage> messages =
      events.apply(queryName + ".EventToPubsubMessage", ParDo.of(new EventPubsubMessageDoFn()));
  messages.apply(queryName + ".WritePubsubEvents", io);
}
 
Example #29
Source File: Read.java    From gcp-ingestion with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Reads messages from {@code subscription} and copies each message's Pub/Sub message id into
 * its attribute map under {@code Attribute.MESSAGE_ID}.
 *
 * @param input the pipeline start the read is attached to
 * @return the messages read, each rebuilt with the extra attribute
 */
@Override
public PCollection<PubsubMessage> expand(PBegin input) {
  final PCollection<PubsubMessage> received = input //
      .apply(PubsubIO.readMessagesWithAttributesAndMessageId().fromSubscription(subscription));

  return received //
      .apply(MapElements.into(TypeDescriptor.of(PubsubMessage.class)).via(message -> {
        // Copy the attributes so the message id can be added alongside them.
        final Map<String, String> enriched = new HashMap<>(message.getAttributeMap());
        enriched.put(Attribute.MESSAGE_ID, message.getMessageId());
        return new PubsubMessage(message.getPayload(), enriched);
      }));
}
 
Example #30
Source File: IpPrivacyDecoder.java    From gcp-ingestion with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Execute an Apache Beam pipeline and return the {@code PipelineResult}.
 *
 * <p>The pipeline reads the configured input, keeps only messages whose document type
 * attribute equals {@code "main"}, runs them through the parsing, geo lookup, decompression,
 * client-id extraction, hashing and attribute normalization transforms listed below, strips
 * attributes with {@code RemoveAttributes} and writes the result to the configured output.
 * Failures registered via {@code failuresTo} are flattened and written to the error output.
 *
 * @param options parsed pipeline options supplying input, output, error output and the
 *     transform settings
 * @return the result of running the assembled pipeline
 */
public static PipelineResult run(IpPrivacyDecoderOptions.Parsed options) {
  final Pipeline pipeline = Pipeline.create(options);
  // Collects the failure outputs of every step that calls failuresTo(...) below.
  final List<PCollection<PubsubMessage>> errorCollections = new ArrayList<>();

  // We wrap pipeline in Optional for more convenience in chaining together transforms.
  Optional.of(pipeline) //
      .map(p -> p //
          .apply(options.getInputType().read(options)) //
          .apply(ParseUri.of()).failuresTo(errorCollections) //
          .apply("RestrictToMainPings",
              Filter
                  .by((message) -> "main".equals(message.getAttribute(Attribute.DOCUMENT_TYPE))))
          .apply(ParseProxy.of()) //
          .apply(ParseIp.of()) //
          .apply(GeoCityLookup.of(options.getGeoCityDatabase(), options.getGeoCityFilter())) //
          .apply(DecompressPayload.enabled(options.getDecompressInputPayloads())) //
          .apply(ExtractClientIdAndDropPayload.of()).failuresTo(errorCollections) //
          .apply(HashClientInfo.of(options.getClientIdHashKey(), options.getClientIpHashKey())) //
          .apply(NormalizeAttributes.of())) //
      .map(p -> p //
          .apply(RemoveAttributes.of()) //
          .apply(options.getOutputType().write(options)).failuresTo(errorCollections));

  // Write error output collections.
  PCollectionList.of(errorCollections) //
      .apply("FlattenErrorCollections", Flatten.pCollections()) //
      .apply("WriteErrorOutput", options.getErrorOutputType().write(options)) //
      .output();

  return pipeline.run();
}