org.apache.beam.sdk.transforms.Watch Java Examples
The following examples show how to use
org.apache.beam.sdk.transforms.Watch.
Each example notes the project it was taken from, along with the source file and its license.
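All of the examples below build on the same core pattern: Watch.growthOf wraps a poll function (Watch.Growth.PollFn) that returns the outputs currently available for an input, and the transform re-polls each input at withPollInterval until a withTerminationPerInput condition fires, de-duplicating outputs across polling rounds. The following is a minimal sketch of that wiring, not taken from any of the projects below; listFiles() is a hypothetical helper, the location and durations are placeholders, and Instant/Duration are the joda-time types used throughout the Beam SDK.

// Minimal sketch of the Watch.growthOf pattern (illustrative only).
// listFiles(location) is a hypothetical helper returning List<String> of the names
// currently visible at the watched location.
PCollection<KV<String, String>> discovered =
    pipeline
        .apply(Create.of("gs://example-bucket/incoming/"))
        .apply(
            Watch.growthOf(
                    new Watch.Growth.PollFn<String, String>() {
                      @Override
                      public Watch.Growth.PollResult<String> apply(String location, Context c) {
                        // Report everything currently visible; "incomplete" means keep polling.
                        return Watch.Growth.PollResult.incomplete(
                            Instant.now(), listFiles(location));
                      }
                    })
                .withPollInterval(Duration.standardSeconds(30))
                .withTerminationPerInput(Watch.Growth.afterTotalOf(Duration.standardHours(1))));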
Example #1
Source File: TextToPubsubStream.java, from DataflowTemplates (Apache License 2.0)
/**
 * Executes the pipeline with the provided execution parameters.
 *
 * @param options The execution parameters.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Read from the text source.
   *  2) Write each text record to Pub/Sub.
   */
  pipeline
      .apply(
          "Read Text Data",
          TextIO.read()
              .from(options.getInputFilePattern())
              .watchForNewFiles(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
      .apply("Write to PubSub", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
Example #2
Source File: FileIO.java, from beam (Apache License 2.0)
@Override
public PCollection<MatchResult.Metadata> expand(PCollection<String> input) {
  PCollection<MatchResult.Metadata> res;
  if (getConfiguration().getWatchInterval() == null) {
    res =
        input.apply(
            "Match filepatterns",
            ParDo.of(new MatchFn(getConfiguration().getEmptyMatchTreatment())));
  } else {
    res =
        input
            .apply(
                "Continuously match filepatterns",
                Watch.growthOf(
                        Contextful.of(new MatchPollFn(), Requirements.empty()),
                        new ExtractFilenameFn())
                    .withPollInterval(getConfiguration().getWatchInterval())
                    .withTerminationPerInput(getConfiguration().getWatchTerminationCondition()))
            .apply(Values.create());
  }
  return res.apply(Reshuffle.viaRandomKey());
}
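Note the two-argument growthOf overload used here: the second argument (ExtractFilenameFn) supplies the key by which Watch de-duplicates outputs across polls, so a file whose metadata changes between rounds is still treated as the same output. A minimal sketch of that shape, where MyPollFn stands in for a hypothetical PollFn<String, MatchResult.Metadata>:

// Sketch: de-duplicate polled outputs by a derived key (the filename) rather than
// by the full output value. MyPollFn is a hypothetical PollFn<String, MatchResult.Metadata>.
Watch.growthOf(
        Contextful.of(new MyPollFn(), Requirements.empty()),
        (MatchResult.Metadata metadata) -> metadata.resourceId().getFilename())
    .withPollInterval(Duration.standardSeconds(30))
    .withTerminationPerInput(Watch.Growth.never());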
Example #3
Source File: HCatalogIO.java, from beam (Apache License 2.0)
@Override
@SuppressWarnings("deprecation")
public PCollection<HCatRecord> expand(PBegin input) {
  checkArgument(getTable() != null, "withTable() is required");
  checkArgument(getConfigProperties() != null, "withConfigProperties() is required");
  Watch.Growth<Read, Integer, Integer> growthFn;
  if (getPollingInterval() != null) {
    growthFn = Watch.growthOf(new PartitionPollerFn()).withPollInterval(getPollingInterval());
    if (getTerminationCondition() != null) {
      growthFn = growthFn.withTerminationPerInput(getTerminationCondition());
    }
    return input
        .apply("ConvertToReadRequest", Create.of(this))
        .apply("WatchForNewPartitions", growthFn)
        .apply("PartitionReader", ParDo.of(new PartitionReaderFn(getConfigProperties())));
  } else {
    // Treat as Bounded
    checkArgument(
        getTerminationCondition() == null,
        "withTerminationCondition() is not required when using in bounded reads mode");
    return input.apply(org.apache.beam.sdk.io.Read.from(new BoundedHCatalogSource(this)));
  }
}
Example #4
Source File: TextStreamingPipeline.java, from dlp-dataflow-deidentification (Apache License 2.0)
public static void main(String[] args) throws IOException, GeneralSecurityException {

  TokenizePipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(TokenizePipelineOptions.class);

  Pipeline p = Pipeline.create(options);
  p.apply(
          FileIO.match()
              .filepattern(options.getInputFile())
              .continuously(
                  Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(
          "Text File Reader",
          ParDo.of(
              new TextFileReader(
                  options.as(GcpOptions.class).getProject(),
                  options.getFileDecryptKeyName(),
                  options.getFileDecryptKey(),
                  options.getBatchSize(),
                  options.getCsek(),
                  options.getCsekhash())))
      .apply(
          "Tokenize Data",
          ParDo.of(
              new TokenizeData(
                  options.as(GcpOptions.class).getProject(),
                  options.getDeidentifyTemplateName(),
                  options.getInspectTemplateName())))
      .apply(
          Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getInterval()))))
      .apply(new WriteOneFilePerWindow(options.getOutputFile(), 1));

  p.run();
}
Example #5
Source File: FileIO.java, from beam (Apache License 2.0)
@Override
public Watch.Growth.PollResult<MatchResult.Metadata> apply(String element, Context c)
    throws Exception {
  Instant now = Instant.now();
  return Watch.Growth.PollResult.incomplete(
          now, FileSystems.match(element, EmptyMatchTreatment.ALLOW).metadata())
      .withWatermark(now);
}
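This poll function always returns PollResult.incomplete, which tells Watch that more matches may still appear; termination is then governed entirely by the configured termination condition. A poll function that can itself detect that an input is exhausted may return PollResult.complete instead, which ends polling for that input. A sketch of the distinction, using hypothetical currentOutputs() and isExhausted() helpers:

// Sketch only: currentOutputs() and isExhausted() are hypothetical helpers.
@Override
public Watch.Growth.PollResult<String> apply(String element, Context c) throws Exception {
  Instant now = Instant.now();
  List<String> outputs = currentOutputs(element);
  return isExhausted(element)
      ? Watch.Growth.PollResult.complete(now, outputs) // stop polling this input
      : Watch.Growth.PollResult.incomplete(now, outputs).withWatermark(now);
}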
Example #6
Source File: TextIOReadTest.java, from beam (Apache License 2.0)
@Test
@Category({NeedsRunner.class, UsesUnboundedSplittableParDo.class})
public void testReadWatchForNewFiles() throws IOException, InterruptedException {
  final Path basePath = tempFolder.getRoot().toPath().resolve("readWatch");
  basePath.toFile().mkdir();

  p.apply(GenerateSequence.from(0).to(10).withRate(1, Duration.millis(100)))
      .apply(
          Window.<Long>into(FixedWindows.of(Duration.millis(150)))
              .withAllowedLateness(Duration.ZERO)
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .discardingFiredPanes())
      .apply(ToString.elements())
      .apply(
          TextIO.write()
              .to(basePath.resolve("data").toString())
              .withNumShards(1)
              .withWindowedWrites());

  PCollection<String> lines =
      p.apply(
          TextIO.read()
              .from(basePath.resolve("*").toString())
              .watchForNewFiles(
                  Duration.millis(100),
                  Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))));

  PAssert.that(lines).containsInAnyOrder("0", "1", "2", "3", "4", "5", "6", "7", "8", "9");
  p.run();
}
Example #7
Source File: S3Import.java, from dlp-dataflow-deidentification (Apache License 2.0)
public static void main(String[] args) {

  S3ImportOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(S3ImportOptions.class);

  AWSOptionParser.formatOptions(options);

  Pipeline p = Pipeline.create(options);

  // S3 files
  PCollection<KV<String, ReadableFile>> s3Files =
      p.apply(
              "Poll S3 Files",
              FileIO.match()
                  .filepattern(options.getS3BucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("S3 File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add S3 File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  // GCS files
  PCollection<KV<String, ReadableFile>> gcsFiles =
      p.apply(
              "Poll GCS Files",
              FileIO.match()
                  .filepattern(options.getGcsBucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("GCS File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add GCS File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  PCollection<KV<String, ReadableFile>> files =
      PCollectionList.of(ImmutableList.of(gcsFiles, s3Files))
          .apply("File List", Flatten.pCollections())
          .apply(
              "Fixed Window",
              Window.<KV<String, ReadableFile>>into(FixedWindows.of(WINDOW_INTERVAL))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.ZERO));

  PCollectionTuple contents =
      files.apply(
          "Read File Contents",
          ParDo.of(new TextFileReader())
              .withOutputTags(
                  textReaderSuccessElements, TupleTagList.of(textReaderFailedElements)));

  PCollectionTuple inspectedContents =
      contents
          .get(textReaderSuccessElements)
          .apply(
              "DLP Inspection",
              ParDo.of(new TokenizeData(options.getProject(), options.getInspectTemplateName()))
                  .withOutputTags(
                      apiResponseSuccessElements, TupleTagList.of(apiResponseFailedElements)));

  inspectedContents
      .get(apiResponseSuccessElements)
      .apply(
          "BQ Write",
          BigQueryIO.<KV<String, TableRow>>write()
              .to(new BQDestination(options.getDataSetId(), options.getProject()))
              .withFormatFunction(
                  element -> {
                    return element.getValue();
                  })
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
              .withoutValidation()
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  PCollectionList.of(
          ImmutableList.of(
              contents.get(textReaderFailedElements),
              inspectedContents.get(apiResponseFailedElements)))
      .apply("Combine Error Logs", Flatten.pCollections())
      .apply(
          "Write Error Logs",
          ParDo.of(
              new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  LOG.error("***ERROR*** {}", c.element().toString());
                  c.output(c.element());
                }
              }));

  p.run();
}
Example #8
Source File: CSVStreamingPipeline.java, from dlp-dataflow-deidentification (Apache License 2.0)
@SuppressWarnings("serial") public static void doTokenization(TokenizePipelineOptions options) { Pipeline p = Pipeline.create(options); PCollection<KV<String, List<String>>> filesAndContents = p.apply( FileIO.match() .filepattern(options.getInputFile()) .continuously( Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never())) .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED)) .apply( "FileHandler", ParDo.of( new CSVReader( options.getCsek(), options.getCsekhash(), options.getFileDecryptKeyName(), options.getFileDecryptKey(), options.as(GcpOptions.class).getProject(), options.getBatchSize()))); PCollection<KV<String, Table>> dlpTables = filesAndContents.apply( "ContentHandler", ParDo.of(new CSVContentProcessorDoFn(options.getBatchSize()))); PCollection<Row> dlpRows = dlpTables .apply( "DoDLPTokenization", ParDo.of( new DLPTokenizationDoFn( options.as(GcpOptions.class).getProject(), options.getDeidentifyTemplateName(), options.getInspectTemplateName()))) .apply( Window.<Row>into(FixedWindows.of(Duration.standardSeconds(options.getInterval()))) .triggering( AfterProcessingTime.pastFirstElementInPane() .plusDelayOf(Duration.standardMinutes(1))) .discardingFiredPanes() .withAllowedLateness(Duration.standardMinutes(1))); dlpRows.apply( "WriteToBQ", BigQueryIO.<Row>write() .to(new BQDestination(options.getDataset(), options.as(GcpOptions.class).getProject())) .withFormatFunction(new BQTableRowSF()) .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND) .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)); dlpRows .apply( MapElements.via( new SimpleFunction<Row, KV<String, Row>>() { @Override public KV<String, Row> apply(Row row) { return KV.of(row.getTableId(), row); } })) .apply(GroupByKey.<String, Row>create()) .apply( "WriteToGCS", FileIO.<String, KV<String, Iterable<Row>>>writeDynamic() .by( (SerializableFunction<KV<String, Iterable<Row>>, String>) row -> { return row.getKey(); }) .via(new CSVSink()) .to(options.getOutputFile()) .withDestinationCoder(StringUtf8Coder.of()) .withNumShards(1) .withNaming(key -> FileIO.Write.defaultNaming(key, ".csv"))); p.run(); }
Example #9
Source File: Snippets.java, from beam (Apache License 2.0)
public static void fileProcessPattern() throws Exception {
  Pipeline p = Pipeline.create();

  // [START FileProcessPatternProcessNewFilesSnip1]
  // This produces PCollection<MatchResult.Metadata>
  p.apply(
      FileIO.match()
          .filepattern("...")
          .continuously(
              Duration.standardSeconds(30),
              Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1))));
  // [END FileProcessPatternProcessNewFilesSnip1]

  // [START FileProcessPatternProcessNewFilesSnip2]
  // This produces PCollection<String>
  p.apply(
      TextIO.read()
          .from("<path-to-files>/*")
          .watchForNewFiles(
              // Check for new files every minute.
              Duration.standardMinutes(1),
              // Stop watching the file pattern if no new files appear for an hour.
              Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1))));
  // [END FileProcessPatternProcessNewFilesSnip2]

  // [START FileProcessPatternAccessMetadataSnip1]
  p.apply(FileIO.match().filepattern("hdfs://path/to/*.gz"))
      // The withCompression method is optional. By default, the Beam SDK detects compression
      // from the filename.
      .apply(FileIO.readMatches().withCompression(Compression.GZIP))
      .apply(
          ParDo.of(
              new DoFn<FileIO.ReadableFile, String>() {
                @ProcessElement
                public void process(@Element FileIO.ReadableFile file) {
                  // We can now access the file and its metadata.
                  LOG.info("File Metadata resourceId is {} ", file.getMetadata().resourceId());
                }
              }));
  // [END FileProcessPatternAccessMetadataSnip1]
}
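These snippets each use a single termination condition. Watch.Growth also provides never(), afterTotalOf(), afterIterations(), and combinators such as eitherOf()/allOf() for composing conditions. A small sketch, under the assumption that these static factories behave as described in the Beam javadoc, that stops watching when either an hour has elapsed in total or no new file has appeared for ten minutes:

// Sketch: composite termination condition (assumes Watch.Growth.eitherOf per the Beam javadoc).
p.apply(
    FileIO.match()
        .filepattern("<path-to-files>/*")
        .continuously(
            Duration.standardMinutes(1),
            Watch.Growth.eitherOf(
                Watch.Growth.afterTotalOf(Duration.standardHours(1)),
                Watch.Growth.afterTimeSinceNewOutput(Duration.standardMinutes(10)))));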
Example #10
Source File: FileIOTest.java, from beam (Apache License 2.0)
@Test
@Category({NeedsRunner.class, UsesUnboundedSplittableParDo.class})
public void testMatchWatchForNewFiles() throws IOException, InterruptedException {
  // Write some files to a "source" directory.
  final Path sourcePath = tmpFolder.getRoot().toPath().resolve("source");
  sourcePath.toFile().mkdir();
  Files.write(sourcePath.resolve("first"), new byte[42]);
  Files.write(sourcePath.resolve("second"), new byte[37]);
  Files.write(sourcePath.resolve("third"), new byte[99]);

  // Create a "watch" directory that the pipeline will copy files into.
  final Path watchPath = tmpFolder.getRoot().toPath().resolve("watch");
  watchPath.toFile().mkdir();

  PCollection<MatchResult.Metadata> matchMetadata =
      p.apply(
          FileIO.match()
              .filepattern(watchPath.resolve("*").toString())
              .continuously(
                  Duration.millis(100),
                  Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))));
  PCollection<MatchResult.Metadata> matchAllMetadata =
      p.apply(Create.of(watchPath.resolve("*").toString()))
          .apply(
              FileIO.matchAll()
                  .continuously(
                      Duration.millis(100),
                      Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))));
  assertEquals(PCollection.IsBounded.UNBOUNDED, matchMetadata.isBounded());
  assertEquals(PCollection.IsBounded.UNBOUNDED, matchAllMetadata.isBounded());

  // Copy the files to the "watch" directory, preserving the lastModifiedTime;
  // the COPY_ATTRIBUTES option ensures that we will at a minimum copy lastModifiedTime.
  CopyOption[] copyOptions = {StandardCopyOption.COPY_ATTRIBUTES};
  Thread writer =
      new Thread(
          () -> {
            try {
              Thread.sleep(1000);
              Files.copy(sourcePath.resolve("first"), watchPath.resolve("first"), copyOptions);
              Thread.sleep(300);
              Files.copy(sourcePath.resolve("second"), watchPath.resolve("second"), copyOptions);
              Thread.sleep(300);
              Files.copy(sourcePath.resolve("third"), watchPath.resolve("third"), copyOptions);
            } catch (IOException | InterruptedException e) {
              throw new RuntimeException(e);
            }
          });
  writer.start();

  // We fetch lastModifiedTime from the files in the "source" directory to avoid a race
  // condition with the writer thread.
  List<MatchResult.Metadata> expected =
      Arrays.asList(
          metadata(
              watchPath.resolve("first"), 42, lastModifiedMillis(sourcePath.resolve("first"))),
          metadata(
              watchPath.resolve("second"), 37, lastModifiedMillis(sourcePath.resolve("second"))),
          metadata(
              watchPath.resolve("third"), 99, lastModifiedMillis(sourcePath.resolve("third"))));

  PAssert.that(matchMetadata).containsInAnyOrder(expected);
  PAssert.that(matchAllMetadata).containsInAnyOrder(expected);
  p.run();

  writer.join();
}
Example #11
Source File: HCatalogIOTest.java, from beam (Apache License 2.0)
/** Perform end-to-end test of Write-then-Read operation. */
@Test
@NeedsEmptyTestTablesForUnboundedReads
public void testWriteThenUnboundedReadSuccess() throws Exception {

  defaultPipeline
      .apply(Create.of(buildHCatRecords(TEST_RECORDS_COUNT)))
      .apply(
          HCatalogIO.write()
              .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf()))
              .withDatabase(TEST_DATABASE)
              .withTable(TEST_TABLE)
              .withPartition(getPartitions())
              .withBatchSize(512L));
  defaultPipeline.run();

  final ImmutableList<String> partitions = ImmutableList.of("load_date", "product_type");
  final PCollection<HCatRecord> data =
      readAfterWritePipeline
          .apply(
              "ReadData",
              HCatalogIO.read()
                  .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf()))
                  .withDatabase(TEST_DATABASE)
                  .withPartitionCols(partitions)
                  .withTable(TEST_TABLE)
                  .withPollingInterval(Duration.millis(15000))
                  .withTerminationCondition(Watch.Growth.afterTotalOf(Duration.millis(60000))))
          .setCoder((Coder) WritableCoder.of(DefaultHCatRecord.class));

  final PCollection<String> output =
      data.apply(
          ParDo.of(
              new DoFn<HCatRecord, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  c.output(c.element().get(0).toString());
                }
              }));

  PAssert.that(output).containsInAnyOrder(getExpectedRecords(TEST_RECORDS_COUNT));
  readAfterWritePipeline.run();
}