org.apache.beam.sdk.transforms.Watch Java Examples
The following examples show how to use
org.apache.beam.sdk.transforms.Watch.
Each example notes the project it was taken from, along with the source file and its license.
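All of the examples below build on the same core pattern: Watch.growthOf wraps a poll function (Watch.Growth.PollFn) that returns the outputs currently available for an input, and the transform re-polls each input at withPollInterval until a withTerminationPerInput condition fires, de-duplicating outputs across polling rounds. The following is a minimal sketch of that wiring, not taken from any of the projects below; listFiles() is a hypothetical helper, the location and durations are placeholders, and Instant/Duration are the joda-time types used throughout the Beam SDK.

// Minimal sketch of the Watch.growthOf pattern (illustrative only).
// listFiles(location) is a hypothetical helper returning List<String> of the names
// currently visible at the watched location.
PCollection<KV<String, String>> discovered =
    pipeline
        .apply(Create.of("gs://example-bucket/incoming/"))
        .apply(
            Watch.growthOf(
                    new Watch.Growth.PollFn<String, String>() {
                      @Override
                      public Watch.Growth.PollResult<String> apply(String location, Context c) {
                        // Report everything currently visible; "incomplete" means keep polling.
                        return Watch.Growth.PollResult.incomplete(
                            Instant.now(), listFiles(location));
                      }
                    })
                .withPollInterval(Duration.standardSeconds(30))
                .withTerminationPerInput(Watch.Growth.afterTotalOf(Duration.standardHours(1))));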
Example #1
Source File: TextToPubsubStream.java, from DataflowTemplates (Apache License 2.0)
/**
 * Executes the pipeline with the provided execution parameters.
 *
 * @param options The execution parameters.
 */
public static PipelineResult run(Options options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  /*
   * Steps:
   *  1) Read from the text source.
   *  2) Write each text record to Pub/Sub.
   */
  pipeline
      .apply(
          "Read Text Data",
          TextIO.read()
              .from(options.getInputFilePattern())
              .watchForNewFiles(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
      .apply("Write to PubSub", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
Example #2
Source File: FileIO.java, from beam (Apache License 2.0)
@Override
public PCollection<MatchResult.Metadata> expand(PCollection<String> input) {
  PCollection<MatchResult.Metadata> res;
  if (getConfiguration().getWatchInterval() == null) {
    res =
        input.apply(
            "Match filepatterns",
            ParDo.of(new MatchFn(getConfiguration().getEmptyMatchTreatment())));
  } else {
    res =
        input
            .apply(
                "Continuously match filepatterns",
                Watch.growthOf(
                        Contextful.of(new MatchPollFn(), Requirements.empty()),
                        new ExtractFilenameFn())
                    .withPollInterval(getConfiguration().getWatchInterval())
                    .withTerminationPerInput(getConfiguration().getWatchTerminationCondition()))
            .apply(Values.create());
  }
  return res.apply(Reshuffle.viaRandomKey());
}
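Note the two-argument growthOf overload used here: the second argument (ExtractFilenameFn) supplies the key by which Watch de-duplicates outputs across polls, so a file whose metadata changes between rounds is still treated as the same output. A minimal sketch of that shape, where MyPollFn stands in for a hypothetical PollFn<String, MatchResult.Metadata>:

// Sketch: de-duplicate polled outputs by a derived key (the filename) rather than
// by the full output value. MyPollFn is a hypothetical PollFn<String, MatchResult.Metadata>.
Watch.growthOf(
        Contextful.of(new MyPollFn(), Requirements.empty()),
        (MatchResult.Metadata metadata) -> metadata.resourceId().getFilename())
    .withPollInterval(Duration.standardSeconds(30))
    .withTerminationPerInput(Watch.Growth.never());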
Example #3
Source File: HCatalogIO.java, from beam (Apache License 2.0)
@Override
@SuppressWarnings("deprecation")
public PCollection<HCatRecord> expand(PBegin input) {
  checkArgument(getTable() != null, "withTable() is required");
  checkArgument(getConfigProperties() != null, "withConfigProperties() is required");
  Watch.Growth<Read, Integer, Integer> growthFn;
  if (getPollingInterval() != null) {
    growthFn = Watch.growthOf(new PartitionPollerFn()).withPollInterval(getPollingInterval());
    if (getTerminationCondition() != null) {
      growthFn = growthFn.withTerminationPerInput(getTerminationCondition());
    }
    return input
        .apply("ConvertToReadRequest", Create.of(this))
        .apply("WatchForNewPartitions", growthFn)
        .apply("PartitionReader", ParDo.of(new PartitionReaderFn(getConfigProperties())));
  } else {
    // Treat as Bounded
    checkArgument(
        getTerminationCondition() == null,
        "withTerminationCondition() is not required when using in bounded reads mode");
    return input.apply(org.apache.beam.sdk.io.Read.from(new BoundedHCatalogSource(this)));
  }
}
Example #4
Source File: TextStreamingPipeline.java, from dlp-dataflow-deidentification (Apache License 2.0)
public static void main(String[] args) throws IOException, GeneralSecurityException {

  TokenizePipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(TokenizePipelineOptions.class);

  Pipeline p = Pipeline.create(options);
  p.apply(
          FileIO.match()
              .filepattern(options.getInputFile())
              .continuously(
                  Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(
          "Text File Reader",
          ParDo.of(
              new TextFileReader(
                  options.as(GcpOptions.class).getProject(),
                  options.getFileDecryptKeyName(),
                  options.getFileDecryptKey(),
                  options.getBatchSize(),
                  options.getCsek(),
                  options.getCsekhash())))
      .apply(
          "Tokenize Data",
          ParDo.of(
              new TokenizeData(
                  options.as(GcpOptions.class).getProject(),
                  options.getDeidentifyTemplateName(),
                  options.getInspectTemplateName())))
      .apply(
          Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getInterval()))))
      .apply(new WriteOneFilePerWindow(options.getOutputFile(), 1));

  p.run();
}
Example #5
Source File: FileIO.java, from beam (Apache License 2.0)
@Override
public Watch.Growth.PollResult<MatchResult.Metadata> apply(String element, Context c)
    throws Exception {
  Instant now = Instant.now();
  return Watch.Growth.PollResult.incomplete(
          now, FileSystems.match(element, EmptyMatchTreatment.ALLOW).metadata())
      .withWatermark(now);
}
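This poll function always returns PollResult.incomplete, which tells Watch that more matches may still appear; termination is then governed entirely by the configured termination condition. A poll function that can itself detect that an input is exhausted may return PollResult.complete instead, which ends polling for that input. A sketch of the distinction, using hypothetical currentOutputs() and isExhausted() helpers:

// Sketch only: currentOutputs() and isExhausted() are hypothetical helpers.
@Override
public Watch.Growth.PollResult<String> apply(String element, Context c) throws Exception {
  Instant now = Instant.now();
  List<String> outputs = currentOutputs(element);
  return isExhausted(element)
      ? Watch.Growth.PollResult.complete(now, outputs) // stop polling this input
      : Watch.Growth.PollResult.incomplete(now, outputs).withWatermark(now);
}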
Example #6
Source File: TextIOReadTest.java, from beam (Apache License 2.0)
@Test
@Category({NeedsRunner.class, UsesUnboundedSplittableParDo.class})
public void testReadWatchForNewFiles() throws IOException, InterruptedException {
  final Path basePath = tempFolder.getRoot().toPath().resolve("readWatch");
  basePath.toFile().mkdir();

  p.apply(GenerateSequence.from(0).to(10).withRate(1, Duration.millis(100)))
      .apply(
          Window.<Long>into(FixedWindows.of(Duration.millis(150)))
              .withAllowedLateness(Duration.ZERO)
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .discardingFiredPanes())
      .apply(ToString.elements())
      .apply(
          TextIO.write()
              .to(basePath.resolve("data").toString())
              .withNumShards(1)
              .withWindowedWrites());

  PCollection<String> lines =
      p.apply(
          TextIO.read()
              .from(basePath.resolve("*").toString())
              .watchForNewFiles(
                  Duration.millis(100),
                  Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))));

  PAssert.that(lines).containsInAnyOrder("0", "1", "2", "3", "4", "5", "6", "7", "8", "9");
  p.run();
}
Example #7
Source File: S3Import.java, from dlp-dataflow-deidentification (Apache License 2.0)
public static void main(String[] args) {

  S3ImportOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(S3ImportOptions.class);

  AWSOptionParser.formatOptions(options);

  Pipeline p = Pipeline.create(options);

  // S3 files
  PCollection<KV<String, ReadableFile>> s3Files =
      p.apply(
              "Poll S3 Files",
              FileIO.match()
                  .filepattern(options.getS3BucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("S3 File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add S3 File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  // GCS files
  PCollection<KV<String, ReadableFile>> gcsFiles =
      p.apply(
              "Poll GCS Files",
              FileIO.match()
                  .filepattern(options.getGcsBucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("GCS File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add GCS File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  PCollection<KV<String, ReadableFile>> files =
      PCollectionList.of(ImmutableList.of(gcsFiles, s3Files))
          .apply("File List", Flatten.pCollections())
          .apply(
              "Fixed Window",
              Window.<KV<String, ReadableFile>>into(FixedWindows.of(WINDOW_INTERVAL))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.ZERO));

  PCollectionTuple contents =
      files.apply(
          "Read File Contents",
          ParDo.of(new TextFileReader())
              .withOutputTags(
                  textReaderSuccessElements, TupleTagList.of(textReaderFailedElements)));

  PCollectionTuple inspectedContents =
      contents
          .get(textReaderSuccessElements)
          .apply(
              "DLP Inspection",
              ParDo.of(new TokenizeData(options.getProject(), options.getInspectTemplateName()))
                  .withOutputTags(
                      apiResponseSuccessElements, TupleTagList.of(apiResponseFailedElements)));

  inspectedContents
      .get(apiResponseSuccessElements)
      .apply(
          "BQ Write",
          BigQueryIO.<KV<String, TableRow>>write()
              .to(new BQDestination(options.getDataSetId(), options.getProject()))
              .withFormatFunction(
                  element -> {
                    return element.getValue();
                  })
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
              .withoutValidation()
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  PCollectionList.of(
          ImmutableList.of(
              contents.get(textReaderFailedElements),
              inspectedContents.get(apiResponseFailedElements)))
      .apply("Combine Error Logs", Flatten.pCollections())
      .apply(
          "Write Error Logs",
          ParDo.of(
              new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  LOG.error("***ERROR*** {}", c.element().toString());
                  c.output(c.element());
                }
              }));

  p.run();
}
Example #8
Source File: CSVStreamingPipeline.java, from dlp-dataflow-deidentification (Apache License 2.0)
@SuppressWarnings("serial") public static void doTokenization(TokenizePipelineOptions options) { Pipeline p = Pipeline.create(options); PCollection<KV<String, List<String>>> filesAndContents = p.apply( FileIO.match() .filepattern(options.getInputFile()) .continuously( Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never())) .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED)) .apply( "FileHandler", ParDo.of( new CSVReader( options.getCsek(), options.getCsekhash(), options.getFileDecryptKeyName(), options.getFileDecryptKey(), options.as(GcpOptions.class).getProject(), options.getBatchSize()))); PCollection<KV<String, Table>> dlpTables = filesAndContents.apply( "ContentHandler", ParDo.of(new CSVContentProcessorDoFn(options.getBatchSize()))); PCollection<Row> dlpRows = dlpTables .apply( "DoDLPTokenization", ParDo.of( new DLPTokenizationDoFn( options.as(GcpOptions.class).getProject(), options.getDeidentifyTemplateName(), options.getInspectTemplateName()))) .apply( Window.<Row>into(FixedWindows.of(Duration.standardSeconds(options.getInterval()))) .triggering( AfterProcessingTime.pastFirstElementInPane() .plusDelayOf(Duration.standardMinutes(1))) .discardingFiredPanes() .withAllowedLateness(Duration.standardMinutes(1))); dlpRows.apply( "WriteToBQ", BigQueryIO.<Row>write() .to(new BQDestination(options.getDataset(), options.as(GcpOptions.class).getProject())) .withFormatFunction(new BQTableRowSF()) .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND) .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)); dlpRows .apply( MapElements.via( new SimpleFunction<Row, KV<String, Row>>() { @Override public KV<String, Row> apply(Row row) { return KV.of(row.getTableId(), row); } })) .apply(GroupByKey.<String, Row>create()) .apply( "WriteToGCS", FileIO.<String, KV<String, Iterable<Row>>>writeDynamic() .by( (SerializableFunction<KV<String, Iterable<Row>>, String>) row -> { return row.getKey(); }) .via(new CSVSink()) .to(options.getOutputFile()) .withDestinationCoder(StringUtf8Coder.of()) .withNumShards(1) .withNaming(key -> FileIO.Write.defaultNaming(key, ".csv"))); p.run(); }
Example #9
Source File: Snippets.java, from beam (Apache License 2.0)
public static void fileProcessPattern() throws Exception {
  Pipeline p = Pipeline.create();

  // [START FileProcessPatternProcessNewFilesSnip1]
  // This produces PCollection<MatchResult.Metadata>
  p.apply(
      FileIO.match()
          .filepattern("...")
          .continuously(
              Duration.standardSeconds(30),
              Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1))));
  // [END FileProcessPatternProcessNewFilesSnip1]

  // [START FileProcessPatternProcessNewFilesSnip2]
  // This produces PCollection<String>
  p.apply(
      TextIO.read()
          .from("<path-to-files>/*")
          .watchForNewFiles(
              // Check for new files every minute.
              Duration.standardMinutes(1),
              // Stop watching the file pattern if no new files appear for an hour.
              Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1))));
  // [END FileProcessPatternProcessNewFilesSnip2]

  // [START FileProcessPatternAccessMetadataSnip1]
  p.apply(FileIO.match().filepattern("hdfs://path/to/*.gz"))
      // The withCompression method is optional. By default, the Beam SDK detects compression
      // from the filename.
      .apply(FileIO.readMatches().withCompression(Compression.GZIP))
      .apply(
          ParDo.of(
              new DoFn<FileIO.ReadableFile, String>() {
                @ProcessElement
                public void process(@Element FileIO.ReadableFile file) {
                  // We can now access the file and its metadata.
                  LOG.info("File Metadata resourceId is {} ", file.getMetadata().resourceId());
                }
              }));
  // [END FileProcessPatternAccessMetadataSnip1]
}
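These snippets each use a single termination condition. Watch.Growth also provides never(), afterTotalOf(), afterIterations(), and combinators such as eitherOf()/allOf() for composing conditions. A small sketch, under the assumption that these static factories behave as described in the Beam javadoc, that stops watching when either an hour has elapsed in total or no new file has appeared for ten minutes:

// Sketch: composite termination condition (assumes Watch.Growth.eitherOf per the Beam javadoc).
p.apply(
    FileIO.match()
        .filepattern("<path-to-files>/*")
        .continuously(
            Duration.standardMinutes(1),
            Watch.Growth.eitherOf(
                Watch.Growth.afterTotalOf(Duration.standardHours(1)),
                Watch.Growth.afterTimeSinceNewOutput(Duration.standardMinutes(10)))));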
Example #10
Source File: FileIOTest.java, from beam (Apache License 2.0)
@Test
@Category({NeedsRunner.class, UsesUnboundedSplittableParDo.class})
public void testMatchWatchForNewFiles() throws IOException, InterruptedException {
  // Write some files to a "source" directory.
  final Path sourcePath = tmpFolder.getRoot().toPath().resolve("source");
  sourcePath.toFile().mkdir();
  Files.write(sourcePath.resolve("first"), new byte[42]);
  Files.write(sourcePath.resolve("second"), new byte[37]);
  Files.write(sourcePath.resolve("third"), new byte[99]);

  // Create a "watch" directory that the pipeline will copy files into.
  final Path watchPath = tmpFolder.getRoot().toPath().resolve("watch");
  watchPath.toFile().mkdir();

  PCollection<MatchResult.Metadata> matchMetadata =
      p.apply(
          FileIO.match()
              .filepattern(watchPath.resolve("*").toString())
              .continuously(
                  Duration.millis(100),
                  Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))));
  PCollection<MatchResult.Metadata> matchAllMetadata =
      p.apply(Create.of(watchPath.resolve("*").toString()))
          .apply(
              FileIO.matchAll()
                  .continuously(
                      Duration.millis(100),
                      Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))));
  assertEquals(PCollection.IsBounded.UNBOUNDED, matchMetadata.isBounded());
  assertEquals(PCollection.IsBounded.UNBOUNDED, matchAllMetadata.isBounded());

  // Copy the files to the "watch" directory, preserving the lastModifiedTime;
  // the COPY_ATTRIBUTES option ensures that we will at a minimum copy lastModifiedTime.
  CopyOption[] copyOptions = {StandardCopyOption.COPY_ATTRIBUTES};
  Thread writer =
      new Thread(
          () -> {
            try {
              Thread.sleep(1000);
              Files.copy(sourcePath.resolve("first"), watchPath.resolve("first"), copyOptions);
              Thread.sleep(300);
              Files.copy(sourcePath.resolve("second"), watchPath.resolve("second"), copyOptions);
              Thread.sleep(300);
              Files.copy(sourcePath.resolve("third"), watchPath.resolve("third"), copyOptions);
            } catch (IOException | InterruptedException e) {
              throw new RuntimeException(e);
            }
          });
  writer.start();

  // We fetch lastModifiedTime from the files in the "source" directory to avoid a race
  // condition with the writer thread.
  List<MatchResult.Metadata> expected =
      Arrays.asList(
          metadata(
              watchPath.resolve("first"), 42, lastModifiedMillis(sourcePath.resolve("first"))),
          metadata(
              watchPath.resolve("second"), 37, lastModifiedMillis(sourcePath.resolve("second"))),
          metadata(
              watchPath.resolve("third"), 99, lastModifiedMillis(sourcePath.resolve("third"))));

  PAssert.that(matchMetadata).containsInAnyOrder(expected);
  PAssert.that(matchAllMetadata).containsInAnyOrder(expected);
  p.run();

  writer.join();
}
Example #11
Source File: HCatalogIOTest.java, from beam (Apache License 2.0)
/** Perform end-to-end test of Write-then-Read operation. */
@Test
@NeedsEmptyTestTablesForUnboundedReads
public void testWriteThenUnboundedReadSuccess() throws Exception {

  defaultPipeline
      .apply(Create.of(buildHCatRecords(TEST_RECORDS_COUNT)))
      .apply(
          HCatalogIO.write()
              .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf()))
              .withDatabase(TEST_DATABASE)
              .withTable(TEST_TABLE)
              .withPartition(getPartitions())
              .withBatchSize(512L));
  defaultPipeline.run();

  final ImmutableList<String> partitions = ImmutableList.of("load_date", "product_type");
  final PCollection<HCatRecord> data =
      readAfterWritePipeline
          .apply(
              "ReadData",
              HCatalogIO.read()
                  .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf()))
                  .withDatabase(TEST_DATABASE)
                  .withPartitionCols(partitions)
                  .withTable(TEST_TABLE)
                  .withPollingInterval(Duration.millis(15000))
                  .withTerminationCondition(Watch.Growth.afterTotalOf(Duration.millis(60000))))
          .setCoder((Coder) WritableCoder.of(DefaultHCatRecord.class));

  final PCollection<String> output =
      data.apply(
          ParDo.of(
              new DoFn<HCatRecord, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  c.output(c.element().get(0).toString());
                }
              }));

  PAssert.that(output).containsInAnyOrder(getExpectedRecords(TEST_RECORDS_COUNT));
  readAfterWritePipeline.run();
}