org.apache.beam.sdk.transforms.WithTimestamps Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.WithTimestamps. Each example is taken from an open source project; the source file, project, and license are noted above the code.
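
Before the project examples, here is a minimal sketch of the core pattern. WithTimestamps.of takes a SerializableFunction that maps each element to an org.joda.time.Instant, and downstream windowing then groups elements by those assigned event-time timestamps. The KV payload (with epoch-millis values) and the one-minute windows below are illustrative assumptions, not taken from any of the projects.

Pipeline p = Pipeline.create();

p.apply(Create.of(KV.of("click", 1_000L), KV.of("view", 61_000L)))
    // Map each element to its event-time timestamp (here the Long value is epoch millis).
    .apply(WithTimestamps.<KV<String, Long>>of(kv -> new Instant(kv.getValue())))
    // Downstream windowing now uses the assigned timestamps.
    .apply(Window.<KV<String, Long>>into(FixedWindows.of(Duration.standardMinutes(1))));

p.run().waitUntilFinish();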
Example #1
Source File: RowToPubsubMessage.java    From beam with Apache License 2.0
@Override
public PCollection<PubsubMessage> expand(PCollection<Row> input) {
  PCollection<Row> withTimestamp =
      (config.useTimestampAttribute())
          ? input.apply(
              WithTimestamps.of((row) -> row.getDateTime("event_timestamp").toInstant()))
          : input;

  return withTimestamp
      .apply(DropFields.fields("event_timestamp"))
      .apply(ToJson.of())
      .apply(
          MapElements.into(TypeDescriptor.of(PubsubMessage.class))
              .via(
                  (String json) ->
                      new PubsubMessage(
                          json.getBytes(StandardCharsets.ISO_8859_1), ImmutableMap.of())));
}
 
Example #2
Source File: BeamTableFunctionScanRel.java    From beam with Apache License 2.0
/** Extract timestamps from the field at windowFieldIndex, then window with the given windowFn. */
private PCollection<Row> assignTimestampsAndWindow(
    PCollection<Row> upstream, int windowFieldIndex, WindowFn<Row, IntervalWindow> windowFn) {
  PCollection<Row> windowedStream;
  windowedStream =
      upstream
          .apply(
              "assignEventTimestamp",
              WithTimestamps.<Row>of(row -> row.getDateTime(windowFieldIndex).toInstant())
                  .withAllowedTimestampSkew(new Duration(Long.MAX_VALUE)))
          .setCoder(upstream.getCoder())
          .apply(Window.into(windowFn));
  return windowedStream;
}
 
Example #3
Source File: BeamAggregationRel.java    From beam with Apache License 2.0
/** Extract timestamps from the field at windowFieldIndex, then window with the given windowFn. */
private PCollection<Row> assignTimestampsAndWindow(PCollection<Row> upstream) {
  PCollection<Row> windowedStream;
  windowedStream =
      upstream
          .apply(
              "assignEventTimestamp",
              WithTimestamps.<Row>of(row -> row.getDateTime(windowFieldIndex).toInstant())
                  .withAllowedTimestampSkew(new Duration(Long.MAX_VALUE)))
          .setCoder(upstream.getCoder())
          .apply(Window.into(windowFn));
  return windowedStream;
}
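
A note on Examples #2 and #3: by default, WithTimestamps throws an IllegalArgumentException if the computed timestamp is earlier than the element's current timestamp (the default allowed skew is Duration.ZERO). Passing withAllowedTimestampSkew(new Duration(Long.MAX_VALUE)) effectively disables that check so the rows can carry arbitrary event times; the method is deprecated in recent Beam releases because elements stamped behind the watermark may be treated as late data. The explicit setCoder(upstream.getCoder()) keeps the input's schema-aware Row coder on the output, since WithTimestamps does not change the element type. A minimal sketch of the same pattern, assuming a hypothetical input of type PCollection<KV<String, Long>> whose values are epoch millis:

input.apply(
    "assignEventTimestamp",
    WithTimestamps.<KV<String, Long>>of(kv -> new Instant(kv.getValue()))
        // Allow assigned timestamps to be arbitrarily far behind the current ones.
        .withAllowedTimestampSkew(new Duration(Long.MAX_VALUE)));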
 
Example #4
Source File: GatherAllPanesTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void singlePaneSingleReifiedPane() {
  PCollection<Iterable<ValueInSingleWindow<Iterable<Long>>>> accumulatedPanes =
      p.apply(GenerateSequence.from(0).to(20000))
          .apply(WithTimestamps.of(input -> new Instant(input * 10)))
          .apply(
              Window.<Long>into(FixedWindows.of(Duration.standardMinutes(1)))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .withAllowedLateness(Duration.ZERO)
                  .discardingFiredPanes())
          .apply(WithKeys.<Void, Long>of((Void) null).withKeyType(new TypeDescriptor<Void>() {}))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(GatherAllPanes.globally());

  PAssert.that(accumulatedPanes)
      .satisfies(
          input -> {
            for (Iterable<ValueInSingleWindow<Iterable<Long>>> windowedInput : input) {
              if (Iterables.size(windowedInput) > 1) {
                fail("Expected all windows to have exactly one pane, got " + windowedInput);
                return null;
              }
            }
            return null;
          });

  p.run();
}
 
Example #5
Source File: GatherAllPanesTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void multiplePanesMultipleReifiedPane() {
  PCollection<Long> someElems = p.apply("someLongs", GenerateSequence.from(0).to(20000));
  PCollection<Long> otherElems = p.apply("otherLongs", GenerateSequence.from(0).to(20000));
  PCollection<Iterable<ValueInSingleWindow<Iterable<Long>>>> accumulatedPanes =
      PCollectionList.of(someElems)
          .and(otherElems)
          .apply(Flatten.pCollections())
          .apply(WithTimestamps.of(input -> new Instant(input * 10)))
          .apply(
              Window.<Long>into(FixedWindows.of(Duration.standardMinutes(1)))
                  .triggering(
                      AfterWatermark.pastEndOfWindow()
                          .withEarlyFirings(AfterPane.elementCountAtLeast(1)))
                  .withAllowedLateness(Duration.ZERO)
                  .discardingFiredPanes())
          .apply(WithKeys.<Void, Long>of((Void) null).withKeyType(new TypeDescriptor<Void>() {}))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(GatherAllPanes.globally());

  PAssert.that(accumulatedPanes)
      .satisfies(
          input -> {
            for (Iterable<ValueInSingleWindow<Iterable<Long>>> windowedInput : input) {
              if (Iterables.size(windowedInput) > 1) {
                return null;
              }
            }
            fail("Expected at least one window to have multiple panes");
            return null;
          });

  p.run();
}
 
Example #6
Source File: CsvToElasticsearch.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs the pipeline to completion with the specified options.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
private static PipelineResult run(CsvToElasticsearchOptions options) {
  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);

  // Register the coder for pipeline
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);

  // Throw an error if containsHeaders is true and a schema or UDF is also set.
  if (options.getContainsHeaders()) {
    checkArgument(
        options.getJavascriptTextTransformGcsPath() == null
            && options.getJsonSchemaPath() == null,
        "Cannot parse file containing headers with UDF or Json schema.");
  }

  // Throw an error if only one retry configuration parameter is set.
  if (options.getMaxRetryAttempts() != null || options.getMaxRetryDuration() != null) {
    checkArgument(
        options.getMaxRetryAttempts() != null && options.getMaxRetryDuration() != null,
        "To specify retry configuration both max attempts and max duration must be set.");
  }

  /*
   * Steps: 1) Read records from CSV(s) via {@link CsvConverters.ReadCsv}.
   *        2) Convert lines to JSON strings via {@link CsvConverters.LineToFailsafeJson}.
   *        3a) Write JSON strings as documents to Elasticsearch via {@link ElasticsearchIO}.
   *        3b) Write elements that failed processing to {@link org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO}.
   */
  PCollectionTuple convertedCsvLines =
      pipeline
          /*
           * Step 1: Read CSV file(s) from Cloud Storage using {@link CsvConverters.ReadCsv}.
           */
          .apply(
              "ReadCsv",
              CsvConverters.ReadCsv.newBuilder()
                  .setCsvFormat(options.getCsvFormat())
                  .setDelimiter(options.getDelimiter())
                  .setHasHeaders(options.getContainsHeaders())
                  .setInputFileSpec(options.getInputFileSpec())
                  .setHeaderTag(CSV_HEADERS)
                  .setLineTag(CSV_LINES)
                  .build())
          /*
           * Step 2: Convert lines to Elasticsearch document.
           */
          .apply(
              "ConvertLine",
              CsvConverters.LineToFailsafeJson.newBuilder()
                  .setDelimiter(options.getDelimiter())
                  .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setJsonSchemaPath(options.getJsonSchemaPath())
                  .setHeaderTag(CSV_HEADERS)
                  .setLineTag(CSV_LINES)
                  .setUdfOutputTag(PROCESSING_OUT)
                  .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
                  .build());
  /*
   * Step 3a: Write elements that were successfully processed to Elasticsearch using {@link WriteToElasticsearch}.
   */
  convertedCsvLines
      .get(PROCESSING_OUT)
      .apply(
          "GetJsonDocuments",
          MapElements.into(TypeDescriptors.strings()).via(FailsafeElement::getPayload))
      .apply(
          "WriteToElasticsearch",
          WriteToElasticsearch.newBuilder()
              .setOptions(options.as(WriteToElasticsearchOptions.class))
              .build());

  /*
   * Step 3b: Write elements that failed processing to deadletter table via {@link BigQueryIO}.
   */
  convertedCsvLines
      .get(PROCESSING_DEADLETTER_OUT)
      .apply(
          "AddTimestamps",
          WithTimestamps.of((FailsafeElement<String, String> failures) -> new Instant()))
      .apply(
          "WriteFailedElementsToBigQuery",
          WriteStringMessageErrors.newBuilder()
              .setErrorRecordsTable(options.getDeadletterTable())
              .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
              .build());

  return pipeline.run();
}
 
Example #7
Source File: Task.java    From beam with Apache License 2.0
static PCollection<Event> applyTransform(PCollection<Event> events) {
  return events.apply(WithTimestamps.of(event -> event.getDate().toInstant()));
}
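
Example #7 references an Event type that is not shown above. A minimal hypothetical version, just enough for the snippet to compile, could look like the following (getDate() returns an org.joda.time.DateTime so that toInstant() yields the event timestamp; implementing Serializable lets Beam fall back to SerializableCoder):

class Event implements Serializable {
  private final DateTime date;

  Event(DateTime date) {
    this.date = date;
  }

  DateTime getDate() {
    return date;
  }
}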
 
Example #8
Source File: StreamingBeamSQL.java    From java-docs-samples with Apache License 2.0
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var project = options.as(GcpOptions.class).getProject();
  var subscription = ProjectSubscriptionName.of(project, options.getInputSubscription()).toString();

  var schema = Schema.builder()
      .addStringField("url")
      .addDoubleField("page_score")
      .addDateTimeField("processing_time")
      .build();

  var pipeline = Pipeline.create(options);
  pipeline
      // Read, parse, and validate messages from Pub/Sub.
      .apply("Read messages from Pub/Sub", PubsubIO.readStrings().fromSubscription(subscription))
      .apply("Parse JSON into SQL rows", MapElements.into(TypeDescriptor.of(Row.class)).via(message -> {
        // This is a good place to add error handling.
        // The first transform should act as a validation layer to make sure
        // that any data coming to the processing pipeline must be valid.
        // See `MapElements.MapWithFailures` for more details.
        LOG.info("message: {}", message);
        var msg = GSON.fromJson(message, PageReviewMessage.class);
        return Row.withSchema(schema).addValues(
            msg.url,                                    // row url
            msg.review.equals("positive") ? 1.0 : 0.0,  // row page_score
            new Instant()                               // row processing_time
        ).build();
      })).setRowSchema(schema) // make sure to set the row schema for the PCollection

      // Add timestamps and bundle elements into windows.
      .apply("Add processing time", WithTimestamps.of((row) -> row.getDateTime("processing_time").toInstant()))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      // Apply a SQL query for every window of elements.
      .apply("Run Beam SQL query", SqlTransform.query(
          "SELECT " +
          "  url, " +
          "  COUNT(page_score) AS num_reviews, " +
          "  AVG(page_score) AS score, " +
          "  MIN(processing_time) AS first_date, " +
          "  MAX(processing_time) AS last_date " +
          "FROM PCOLLECTION " +
          "GROUP BY url"
      ))

      // Convert the SQL Rows into BigQuery TableRows and write them to BigQuery.
      .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class)).via(row -> {
        LOG.info("rating summary: {} {} ({} reviews)", row.getDouble("score"), row.getString("url"),
            row.getInt64("num_reviews"));
        return new TableRow()
            .set("url", row.getString("url"))
            .set("num_reviews", row.getInt64("num_reviews"))
            .set("score", row.getDouble("score"))
            .set("first_date", row.getDateTime("first_date").toInstant().toString())
            .set("last_date", row.getDateTime("last_date").toInstant().toString());
      }))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              // To learn more about the valid BigQuery types:
              //   https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("num_reviews").setType("INTEGER"),
              new TableFieldSchema().setName("score").setType("FLOAT64"),
              new TableFieldSchema().setName("first_date").setType("TIMESTAMP"),
              new TableFieldSchema().setName("last_date").setType("TIMESTAMP"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}
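
Example #8 parses each Pub/Sub message with Gson into a PageReviewMessage before building the Row; that class is not shown above. A hypothetical sketch consistent with the fields the pipeline reads (msg.url and msg.review) could be:

class PageReviewMessage {
  // Gson populates these fields by matching the JSON property names.
  String url;     // the page being reviewed
  String review;  // "positive" or "negative"
}

Note also that the pipeline records new Instant() in the row's processing_time field and then promotes that same value to the element timestamp with WithTimestamps, so the one-minute fixed windows and the SQL MIN/MAX over processing_time agree.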
 
Example #9
Source File: KafkaToBigQuery.java    From java-docs-samples with Apache License 2.0
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var pipeline = Pipeline.create(options);
  pipeline
      .apply("Read messages from Kafka",
          KafkaIO.<String, String>read()
              .withBootstrapServers(options.getBootstrapServer())
              .withTopic(options.getInputTopic())
              .withKeyDeserializer(StringDeserializer.class)
              .withValueDeserializer(StringDeserializer.class)
              .withoutMetadata())
      .apply("Get message contents", Values.<String>create())
      .apply("Log messages", MapElements.into(TypeDescriptor.of(String.class))
          .via(message -> {
            LOG.info("Received: {}", message);
            return message;
          }))
      .apply("Parse JSON", MapElements.into(TypeDescriptor.of(PageRating.class))
          .via(message -> GSON.fromJson(message, PageRating.class)))

      .apply("Add processing time", WithTimestamps.of((pageRating) -> new Instant(pageRating.processingTime)))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class))
          .via(pageRating -> new TableRow()
              .set("processing_time", pageRating.processingTime.toString())
              .set("url", pageRating.url)
              .set("rating", pageRating.rating)))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"),
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("rating").setType("STRING"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}