org.apache.beam.sdk.transforms.WithTimestamps Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.WithTimestamps. Each example is taken from an open source project; the source file, project, and license are noted above the code.
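
Before the project examples, here is a minimal sketch of the core pattern. WithTimestamps.of takes a SerializableFunction that maps each element to an org.joda.time.Instant, and downstream windowing then groups elements by those assigned event-time timestamps. The KV payload (with epoch-millis values) and the one-minute windows below are illustrative assumptions, not taken from any of the projects.

Pipeline p = Pipeline.create();

p.apply(Create.of(KV.of("click", 1_000L), KV.of("view", 61_000L)))
    // Map each element to its event-time timestamp (here the Long value is epoch millis).
    .apply(WithTimestamps.<KV<String, Long>>of(kv -> new Instant(kv.getValue())))
    // Downstream windowing now uses the assigned timestamps.
    .apply(Window.<KV<String, Long>>into(FixedWindows.of(Duration.standardMinutes(1))));

p.run().waitUntilFinish();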
Example #1
Source File: RowToPubsubMessage.java    From beam with Apache License 2.0
@Override
public PCollection<PubsubMessage> expand(PCollection<Row> input) {
  PCollection<Row> withTimestamp =
      (config.useTimestampAttribute())
          ? input.apply(
              WithTimestamps.of((row) -> row.getDateTime("event_timestamp").toInstant()))
          : input;

  return withTimestamp
      .apply(DropFields.fields("event_timestamp"))
      .apply(ToJson.of())
      .apply(
          MapElements.into(TypeDescriptor.of(PubsubMessage.class))
              .via(
                  (String json) ->
                      new PubsubMessage(
                          json.getBytes(StandardCharsets.ISO_8859_1), ImmutableMap.of())));
}
 
Example #2
Source File: BeamTableFunctionScanRel.java    From beam with Apache License 2.0
/** Extract timestamps from the field at windowFieldIndex, then window with the given windowFn. */
private PCollection<Row> assignTimestampsAndWindow(
    PCollection<Row> upstream, int windowFieldIndex, WindowFn<Row, IntervalWindow> windowFn) {
  PCollection<Row> windowedStream;
  windowedStream =
      upstream
          .apply(
              "assignEventTimestamp",
              WithTimestamps.<Row>of(row -> row.getDateTime(windowFieldIndex).toInstant())
                  .withAllowedTimestampSkew(new Duration(Long.MAX_VALUE)))
          .setCoder(upstream.getCoder())
          .apply(Window.into(windowFn));
  return windowedStream;
}
 
Example #3
Source File: BeamAggregationRel.java    From beam with Apache License 2.0
/** Extract timestamps from the field at windowFieldIndex, then window with the given windowFn. */
private PCollection<Row> assignTimestampsAndWindow(PCollection<Row> upstream) {
  PCollection<Row> windowedStream;
  windowedStream =
      upstream
          .apply(
              "assignEventTimestamp",
              WithTimestamps.<Row>of(row -> row.getDateTime(windowFieldIndex).toInstant())
                  .withAllowedTimestampSkew(new Duration(Long.MAX_VALUE)))
          .setCoder(upstream.getCoder())
          .apply(Window.into(windowFn));
  return windowedStream;
}
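
A note on Examples #2 and #3: by default, WithTimestamps throws an IllegalArgumentException if the computed timestamp is earlier than the element's current timestamp (the default allowed skew is Duration.ZERO). Passing withAllowedTimestampSkew(new Duration(Long.MAX_VALUE)) effectively disables that check so the rows can carry arbitrary event times; the method is deprecated in recent Beam releases because elements stamped behind the watermark may be treated as late data. The explicit setCoder(upstream.getCoder()) keeps the input's schema-aware Row coder on the output, since WithTimestamps does not change the element type. A minimal sketch of the same pattern, assuming a hypothetical input of type PCollection<KV<String, Long>> whose values are epoch millis:

input.apply(
    "assignEventTimestamp",
    WithTimestamps.<KV<String, Long>>of(kv -> new Instant(kv.getValue()))
        // Allow assigned timestamps to be arbitrarily far behind the current ones.
        .withAllowedTimestampSkew(new Duration(Long.MAX_VALUE)));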
 
Example #4
Source File: GatherAllPanesTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void singlePaneSingleReifiedPane() {
  PCollection<Iterable<ValueInSingleWindow<Iterable<Long>>>> accumulatedPanes =
      p.apply(GenerateSequence.from(0).to(20000))
          .apply(WithTimestamps.of(input -> new Instant(input * 10)))
          .apply(
              Window.<Long>into(FixedWindows.of(Duration.standardMinutes(1)))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .withAllowedLateness(Duration.ZERO)
                  .discardingFiredPanes())
          .apply(WithKeys.<Void, Long>of((Void) null).withKeyType(new TypeDescriptor<Void>() {}))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(GatherAllPanes.globally());

  PAssert.that(accumulatedPanes)
      .satisfies(
          input -> {
            for (Iterable<ValueInSingleWindow<Iterable<Long>>> windowedInput : input) {
              if (Iterables.size(windowedInput) > 1) {
                fail("Expected all windows to have exactly one pane, got " + windowedInput);
                return null;
              }
            }
            return null;
          });

  p.run();
}
 
Example #5
Source File: GatherAllPanesTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void multiplePanesMultipleReifiedPane() {
  PCollection<Long> someElems = p.apply("someLongs", GenerateSequence.from(0).to(20000));
  PCollection<Long> otherElems = p.apply("otherLongs", GenerateSequence.from(0).to(20000));
  PCollection<Iterable<ValueInSingleWindow<Iterable<Long>>>> accumulatedPanes =
      PCollectionList.of(someElems)
          .and(otherElems)
          .apply(Flatten.pCollections())
          .apply(WithTimestamps.of(input -> new Instant(input * 10)))
          .apply(
              Window.<Long>into(FixedWindows.of(Duration.standardMinutes(1)))
                  .triggering(
                      AfterWatermark.pastEndOfWindow()
                          .withEarlyFirings(AfterPane.elementCountAtLeast(1)))
                  .withAllowedLateness(Duration.ZERO)
                  .discardingFiredPanes())
          .apply(WithKeys.<Void, Long>of((Void) null).withKeyType(new TypeDescriptor<Void>() {}))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(GatherAllPanes.globally());

  PAssert.that(accumulatedPanes)
      .satisfies(
          input -> {
            for (Iterable<ValueInSingleWindow<Iterable<Long>>> windowedInput : input) {
              if (Iterables.size(windowedInput) > 1) {
                return null;
              }
            }
            fail("Expected at least one window to have multiple panes");
            return null;
          });

  p.run();
}
 
Example #6
Source File: CsvToElasticsearch.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs the pipeline to completion with the specified options.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
private static PipelineResult run(CsvToElasticsearchOptions options) {
  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);

  // Register the coder for pipeline
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);

  // Throw an error if containsHeaders is true and a schema or UDF is also set.
  if (options.getContainsHeaders()) {
    checkArgument(
        options.getJavascriptTextTransformGcsPath() == null
            && options.getJsonSchemaPath() == null,
        "Cannot parse file containing headers with UDF or Json schema.");
  }

  // Throw an error if only one retry configuration parameter is set.
  if (options.getMaxRetryAttempts() != null || options.getMaxRetryDuration() != null) {
    checkArgument(
        options.getMaxRetryAttempts() != null && options.getMaxRetryDuration() != null,
        "To specify retry configuration both max attempts and max duration must be set.");
  }

  /*
   * Steps: 1) Read records from CSV(s) via {@link CsvConverters.ReadCsv}.
   *        2) Convert lines to JSON strings via {@link CsvConverters.LineToFailsafeJson}.
   *        3a) Write JSON strings as documents to Elasticsearch via {@link ElasticsearchIO}.
   *        3b) Write elements that failed processing to {@link org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO}.
   */
  PCollectionTuple convertedCsvLines =
      pipeline
          /*
           * Step 1: Read CSV file(s) from Cloud Storage using {@link CsvConverters.ReadCsv}.
           */
          .apply(
              "ReadCsv",
              CsvConverters.ReadCsv.newBuilder()
                  .setCsvFormat(options.getCsvFormat())
                  .setDelimiter(options.getDelimiter())
                  .setHasHeaders(options.getContainsHeaders())
                  .setInputFileSpec(options.getInputFileSpec())
                  .setHeaderTag(CSV_HEADERS)
                  .setLineTag(CSV_LINES)
                  .build())
          /*
           * Step 2: Convert lines to Elasticsearch document.
           */
          .apply(
              "ConvertLine",
              CsvConverters.LineToFailsafeJson.newBuilder()
                  .setDelimiter(options.getDelimiter())
                  .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setJsonSchemaPath(options.getJsonSchemaPath())
                  .setHeaderTag(CSV_HEADERS)
                  .setLineTag(CSV_LINES)
                  .setUdfOutputTag(PROCESSING_OUT)
                  .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
                  .build());
  /*
   * Step 3a: Write elements that were successfully processed to Elasticsearch using {@link WriteToElasticsearch}.
   */
  convertedCsvLines
      .get(PROCESSING_OUT)
      .apply(
          "GetJsonDocuments",
          MapElements.into(TypeDescriptors.strings()).via(FailsafeElement::getPayload))
      .apply(
          "WriteToElasticsearch",
          WriteToElasticsearch.newBuilder()
              .setOptions(options.as(WriteToElasticsearchOptions.class))
              .build());

  /*
   * Step 3b: Write elements that failed processing to deadletter table via {@link BigQueryIO}.
   */
  convertedCsvLines
      .get(PROCESSING_DEADLETTER_OUT)
      .apply(
          "AddTimestamps",
          WithTimestamps.of((FailsafeElement<String, String> failures) -> new Instant()))
      .apply(
          "WriteFailedElementsToBigQuery",
          WriteStringMessageErrors.newBuilder()
              .setErrorRecordsTable(options.getDeadletterTable())
              .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
              .build());

  return pipeline.run();
}
 
Example #7
Source File: Task.java    From beam with Apache License 2.0
static PCollection<Event> applyTransform(PCollection<Event> events) {
  return events.apply(WithTimestamps.of(event -> event.getDate().toInstant()));
}
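
Example #7 references an Event type that is not shown above. A minimal hypothetical version, just enough for the snippet to compile, could look like the following (getDate() returns an org.joda.time.DateTime so that toInstant() yields the event timestamp; implementing Serializable lets Beam fall back to SerializableCoder):

class Event implements Serializable {
  private final DateTime date;

  Event(DateTime date) {
    this.date = date;
  }

  DateTime getDate() {
    return date;
  }
}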
 
Example #8
Source File: StreamingBeamSQL.java    From java-docs-samples with Apache License 2.0
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var project = options.as(GcpOptions.class).getProject();
  var subscription = ProjectSubscriptionName.of(project, options.getInputSubscription()).toString();

  var schema = Schema.builder()
      .addStringField("url")
      .addDoubleField("page_score")
      .addDateTimeField("processing_time")
      .build();

  var pipeline = Pipeline.create(options);
  pipeline
      // Read, parse, and validate messages from Pub/Sub.
      .apply("Read messages from Pub/Sub", PubsubIO.readStrings().fromSubscription(subscription))
      .apply("Parse JSON into SQL rows", MapElements.into(TypeDescriptor.of(Row.class)).via(message -> {
        // This is a good place to add error handling.
        // The first transform should act as a validation layer to make sure
        // that any data coming to the processing pipeline must be valid.
        // See `MapElements.MapWithFailures` for more details.
        LOG.info("message: {}", message);
        var msg = GSON.fromJson(message, PageReviewMessage.class);
        return Row.withSchema(schema).addValues(
            msg.url,                                    // row url
            msg.review.equals("positive") ? 1.0 : 0.0,  // row page_score
            new Instant()                               // row processing_time
        ).build();
      })).setRowSchema(schema) // make sure to set the row schema for the PCollection

      // Add timestamps and bundle elements into windows.
      .apply("Add processing time", WithTimestamps.of((row) -> row.getDateTime("processing_time").toInstant()))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      // Apply a SQL query for every window of elements.
      .apply("Run Beam SQL query", SqlTransform.query(
          "SELECT " +
          "  url, " +
          "  COUNT(page_score) AS num_reviews, " +
          "  AVG(page_score) AS score, " +
          "  MIN(processing_time) AS first_date, " +
          "  MAX(processing_time) AS last_date " +
          "FROM PCOLLECTION " +
          "GROUP BY url"
      ))

      // Convert the SQL Rows into BigQuery TableRows and write them to BigQuery.
      .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class)).via(row -> {
        LOG.info("rating summary: {} {} ({} reviews)", row.getDouble("score"), row.getString("url"),
            row.getInt64("num_reviews"));
        return new TableRow()
            .set("url", row.getString("url"))
            .set("num_reviews", row.getInt64("num_reviews"))
            .set("score", row.getDouble("score"))
            .set("first_date", row.getDateTime("first_date").toInstant().toString())
            .set("last_date", row.getDateTime("last_date").toInstant().toString());
      }))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              // To learn more about the valid BigQuery types:
              //   https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("num_reviews").setType("INTEGER"),
              new TableFieldSchema().setName("score").setType("FLOAT64"),
              new TableFieldSchema().setName("first_date").setType("TIMESTAMP"),
              new TableFieldSchema().setName("last_date").setType("TIMESTAMP"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}
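
Example #8 parses each Pub/Sub message with Gson into a PageReviewMessage before building the Row; that class is not shown above. A hypothetical sketch consistent with the fields the pipeline reads (msg.url and msg.review) could be:

class PageReviewMessage {
  // Gson populates these fields by matching the JSON property names.
  String url;     // the page being reviewed
  String review;  // "positive" or "negative"
}

Note also that the pipeline records new Instant() in the row's processing_time field and then promotes that same value to the element timestamp with WithTimestamps, so the one-minute fixed windows and the SQL MIN/MAX over processing_time agree.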
 
Example #9
Source File: KafkaToBigQuery.java    From java-docs-samples with Apache License 2.0
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var pipeline = Pipeline.create(options);
  pipeline
      .apply("Read messages from Kafka",
          KafkaIO.<String, String>read()
              .withBootstrapServers(options.getBootstrapServer())
              .withTopic(options.getInputTopic())
              .withKeyDeserializer(StringDeserializer.class)
              .withValueDeserializer(StringDeserializer.class)
              .withoutMetadata())
      .apply("Get message contents", Values.<String>create())
      .apply("Log messages", MapElements.into(TypeDescriptor.of(String.class))
          .via(message -> {
            LOG.info("Received: {}", message);
            return message;
          }))
      .apply("Parse JSON", MapElements.into(TypeDescriptor.of(PageRating.class))
          .via(message -> GSON.fromJson(message, PageRating.class)))

      .apply("Add processing time", WithTimestamps.of((pageRating) -> new Instant(pageRating.processingTime)))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class))
          .via(pageRating -> new TableRow()
              .set("processing_time", pageRating.processingTime.toString())
              .set("url", pageRating.url)
              .set("rating", pageRating.rating)))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"),
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("rating").setType("STRING"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}