org.apache.beam.sdk.io.Compression Java Examples
The following examples show how to use org.apache.beam.sdk.io.Compression, drawn from a range of open-source projects. The source file, originating project, and license are noted above each example.
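Before the examples, here is a minimal, self-contained sketch of the Compression calls that recur below: detecting a codec from a filename, asking for a suggested suffix, and wrapping a plain channel for compressed output. The filenames and payload are placeholders; the Guava ByteStreams helper is the same one several examples use.

import com.google.common.io.ByteStreams;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import org.apache.beam.sdk.io.Compression;

public class CompressionSketch {
  public static void main(String[] args) throws Exception {
    // Detect a codec from a file name; AUTO can also say whether a name looks compressed.
    Compression detected = Compression.detect("events.json.gz");          // GZIP
    boolean looksCompressed = Compression.AUTO.isCompressed("data.txt");  // false

    // Suggested suffix for naming output files, e.g. ".gz" for GZIP.
    String suffix = Compression.GZIP.getSuggestedSuffix();

    // Compress a byte payload by wrapping a plain channel with writeCompressed.
    byte[] payload = "hello world".getBytes(StandardCharsets.UTF_8);
    ByteArrayOutputStream compressed = new ByteArrayOutputStream();
    try (ReadableByteChannel in = Channels.newChannel(new ByteArrayInputStream(payload));
        WritableByteChannel out =
            Compression.GZIP.writeCompressed(Channels.newChannel(compressed))) {
      ByteStreams.copy(in, out);
    }

    System.out.println(detected + " " + looksCompressed + " " + suffix + " "
        + compressed.size() + " compressed bytes");
  }
}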
Example #1
Source File: TikaIOTest.java (from beam, Apache License 2.0)

@Test
public void testParseAndParseFiles() throws IOException {
  Path root =
      Paths.get(getClass().getResource("/valid/apache-beam-tika.odt").getPath()).getParent();

  List<ParseResult> expected =
      Arrays.asList(
          ParseResult.success(
              root.resolve("apache-beam-tika.odt").toString(), ODT_FILE, getOdtMetadata()),
          ParseResult.success(root.resolve("apache-beam-tika-pdf.zip").toString(), PDF_ZIP_FILE));

  PCollection<ParseResult> parse =
      p.apply("Parse", TikaIO.parse().filepattern(root.resolve("*").toString()))
          .apply("FilterParse", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parse).containsInAnyOrder(expected);

  PCollection<ParseResult> parseFiles =
      p.apply("ParseFiles", FileIO.match().filepattern(root.resolve("*").toString()))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(TikaIO.parseFiles())
          .apply("FilterParseFiles", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parseFiles).containsInAnyOrder(expected);
  p.run();
}
Example #2
Source File: SinkOptions.java (from gcp-ingestion, Mozilla Public License 2.0)

/** Set all the derived fields of a {@link SinkOptions.Parsed} instance. */
static void enrichSinkOptions(Parsed options) {
  validateSinkOptions(options);
  options.setParsedWindowDuration(Time.parseDuration(options.getWindowDuration()));
  options.setParsedBqTriggeringFrequency(Time.parseDuration(options.getBqTriggeringFrequency()));
  options.setParsedErrorBqTriggeringFrequency(
      Time.parseDuration(options.getErrorBqTriggeringFrequency()));
  options.setDecompressInputPayloads(
      providerWithDefault(options.getDecompressInputPayloads(), true));
  options.setOutputTableRowFormat(
      providerWithDefault(options.getOutputTableRowFormat(), TableRowFormat.payload));
  options.setOutputPubsubCompression(
      providerWithDefault(options.getOutputPubsubCompression(), Compression.GZIP));
  options.setErrorOutputPubsubCompression(
      providerWithDefault(options.getErrorOutputPubsubCompression(), Compression.GZIP));
  options.setOutputNumShards(providerWithDefault(options.getOutputNumShards(), 100));
  options.setErrorOutputNumShards(providerWithDefault(options.getErrorOutputNumShards(), 100));
}
Example #3
Source File: TFRecordIOIT.java (from beam, Apache License 2.0)

@BeforeClass
public static void setup() {
  FileBasedIOTestPipelineOptions options = readFileBasedIOITPipelineOptions();
  datasetSize = options.getDatasetSize();
  expectedHash = options.getExpectedHash();
  numberOfTextLines = options.getNumberOfRecords();
  compressionType = Compression.valueOf(options.getCompressionType());
  filenamePrefix = appendTimestampSuffix(options.getFilenamePrefix());
  bigQueryDataset = options.getBigQueryDataset();
  bigQueryTable = options.getBigQueryTable();
  settings =
      InfluxDBSettings.builder()
          .withHost(options.getInfluxHost())
          .withDatabase(options.getInfluxDatabase())
          .withMeasurement(options.getInfluxMeasurement())
          .get();
}
Example #4
Source File: TextIOIT.java (from beam, Apache License 2.0)

@BeforeClass
public static void setup() {
  FileBasedIOTestPipelineOptions options = readFileBasedIOITPipelineOptions();
  datasetSize = options.getDatasetSize();
  expectedHash = options.getExpectedHash();
  numberOfTextLines = options.getNumberOfRecords();
  compressionType = Compression.valueOf(options.getCompressionType());
  filenamePrefix = appendTimestampSuffix(options.getFilenamePrefix());
  numShards = options.getNumberOfShards();
  bigQueryDataset = options.getBigQueryDataset();
  bigQueryTable = options.getBigQueryTable();
  gatherGcsPerformanceMetrics = options.getReportGcsPerformanceMetrics();
  settings =
      InfluxDBSettings.builder()
          .withHost(options.getInfluxHost())
          .withDatabase(options.getInfluxDatabase())
          .withMeasurement(options.getInfluxMeasurement())
          .get();
}
Example #5
Source File: TestUtils.java (from DataflowTemplates, Apache License 2.0)

/**
 * Helper to generate files for testing.
 *
 * @param filePath The path to the file to write.
 * @param lines The lines to write.
 * @param compression The compression type of the file.
 * @return The file written.
 * @throws IOException If an error occurs while creating or writing the file.
 */
public static ResourceId writeToFile(
    String filePath, List<String> lines, Compression compression) throws IOException {

  String fileContents = String.join(System.lineSeparator(), lines);

  ResourceId resourceId = FileSystems.matchNewResource(filePath, false);

  String mimeType = compression == Compression.UNCOMPRESSED ? MimeTypes.TEXT : MimeTypes.BINARY;

  // Write the file contents to the channel and close.
  try (ReadableByteChannel readChannel =
      Channels.newChannel(new ByteArrayInputStream(fileContents.getBytes()))) {
    try (WritableByteChannel writeChannel =
        compression.writeCompressed(FileSystems.create(resourceId, mimeType))) {
      ByteStreams.copy(readChannel, writeChannel);
    }
  }

  return resourceId;
}
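A hedged usage sketch for this helper, mirroring how Example #16 below calls it from a test setup. The JUnit TemporaryFolder rule, file name, and contents here are illustrative assumptions.

// Hypothetical JUnit usage of the helper above (assumes a @Rule TemporaryFolder tempFolder
// and the TestUtils class from this template project on the classpath).
@Test
public void writesGzippedFixture() throws IOException {
  List<String> lines = Arrays.asList("id,name", "1,alpha", "2,beta");
  ResourceId fixture =
      TestUtils.writeToFile(
          tempFolder
              .getRoot()
              .toPath()
              .resolve("fixture.csv" + Compression.GZIP.getSuggestedSuffix())
              .toString(),
          lines,
          Compression.GZIP);
  assertNotNull(fixture);
}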
Example #6
Source File: PubsubIntegrationTest.java (from gcp-ingestion, Mozilla Public License 2.0)

@Test(timeout = 30000)
public void canSendPubsubOutput() throws Exception {
  final List<String> inputLines = Lines.resources("testdata/pubsub-integration/input.ndjson");

  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  SinkOptions.Parsed sinkOptions = pipeline.getOptions().as(SinkOptions.Parsed.class);
  sinkOptions.setOutput(pipeline.newProvider(topicName.toString()));
  // We would normally use pipeline.newProvider instead of StaticValueProvider in tests,
  // but something about this configuration causes the pipeline to stall when CompressPayload
  // accesses a method on the underlying enum value when defined via pipeline.newProvider.
  sinkOptions.setOutputPubsubCompression(StaticValueProvider.of(Compression.UNCOMPRESSED));

  pipeline.apply(Create.of(inputLines)).apply(InputFileFormat.json.decode())
      .apply(OutputType.pubsub.write(sinkOptions));

  final PipelineResult result = pipeline.run();

  System.err.println("Waiting for subscriber to receive messages published in the pipeline...");
  List<String> expectedLines = Lines.resources("testdata/pubsub-integration/truncated.ndjson");
  List<String> received = receiveLines(expectedLines.size());
  assertThat(received, matchesInAnyOrder(expectedLines));
  result.cancel();
}
Example #7
Source File: BulkCompressorTest.java (from DataflowTemplates, Apache License 2.0)

/** Tests the {@link BulkCompressor.Compressor} performs compression properly. */
@Test
public void testCompressFile() throws Exception {
  // Setup test
  final Compression compression = Compression.GZIP;

  final ValueProvider<String> outputDirectoryProvider =
      pipeline.newProvider(tempFolderCompressedPath.toString());
  final ValueProvider<Compression> compressionProvider = StaticValueProvider.of(compression);
  final Metadata metadata = FileSystems.matchSingleFileSpec(textFile.toString());

  // Execute the compressor
  PCollection<String> lines =
      pipeline
          .apply("Create File Input", Create.of(metadata))
          .apply(
              "Compress", ParDo.of(new Compressor(outputDirectoryProvider, compressionProvider)))
          .apply("Read the Files", TextIO.readAll().withCompression(Compression.AUTO));

  // Test the result
  PAssert.that(lines).containsInAnyOrder(FILE_CONTENT);
  pipeline.run();
}
Example #8
Source File: BulkDecompressor.java (from DataflowTemplates, Apache License 2.0)

@ProcessElement
public void processElement(ProcessContext context) {
  ResourceId inputFile = context.element().resourceId();

  // Output a record to the failure file if the file doesn't match a known compression.
  if (!Compression.AUTO.isCompressed(inputFile.toString())) {
    String errorMsg =
        String.format(UNCOMPRESSED_ERROR_MSG, inputFile.toString(), SUPPORTED_COMPRESSIONS);
    context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), errorMsg));
  } else {
    try {
      ResourceId outputFile = decompress(inputFile);
      context.output(outputFile.toString());
    } catch (IOException e) {
      LOG.error(e.getMessage());
      context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), e.getMessage()));
    }
  }
}
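To make the dead-letter gate above concrete, here is a small standalone sketch of how Compression.AUTO classifies filenames; the paths are made up.

import org.apache.beam.sdk.io.Compression;

public class AutoDetectSketch {
  public static void main(String[] args) {
    // Names with a recognized compressed suffix pass the gate above.
    System.out.println(Compression.AUTO.isCompressed("archive/demo.txt.gz"));   // true
    System.out.println(Compression.AUTO.isCompressed("archive/demo.txt.bz2"));  // true
    // Anything else is routed to the dead-letter output.
    System.out.println(Compression.AUTO.isCompressed("archive/demo.txt"));      // false
    // detect() reports which codec matched the name.
    System.out.println(Compression.detect("archive/demo.txt.gz"));              // GZIP
  }
}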
Example #9
Source File: Write.java (from gcp-ingestion, Mozilla Public License 2.0)

/** Constructor. */
public PubsubOutput(ValueProvider<String> topic, ValueProvider<Compression> compression,
    int maxCompressedBytes) {
  this.topic = topic;
  this.compression = compression;
  this.maxCompressedBytes = maxCompressedBytes;
}
Example #10
Source File: TextStreamingPipeline.java (from dlp-dataflow-deidentification, Apache License 2.0)

public static void main(String[] args) throws IOException, GeneralSecurityException {

  TokenizePipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(TokenizePipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(
          FileIO.match()
              .filepattern(options.getInputFile())
              .continuously(
                  Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(
          "Text File Reader",
          ParDo.of(
              new TextFileReader(
                  options.as(GcpOptions.class).getProject(),
                  options.getFileDecryptKeyName(),
                  options.getFileDecryptKey(),
                  options.getBatchSize(),
                  options.getCsek(),
                  options.getCsekhash())))
      .apply(
          "Tokenize Data",
          ParDo.of(
              new TokenizeData(
                  options.as(GcpOptions.class).getProject(),
                  options.getDeidentifyTemplateName(),
                  options.getInspectTemplateName())))
      .apply(
          Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getInterval()))))
      .apply(new WriteOneFilePerWindow(options.getOutputFile(), 1));

  p.run();
}
Example #11
Source File: CompressPayloadTest.java (from gcp-ingestion, Mozilla Public License 2.0)

@Test
public void testGzipCompress() {
  String text = StringUtils.repeat("Lorem ipsum dolor sit amet ", 100);
  byte[] compressedBytes =
      CompressPayload.compress(text.getBytes(StandardCharsets.UTF_8), Compression.GZIP);
  assertThat(ArrayUtils.toObject(compressedBytes), Matchers.arrayWithSize(68));
}
Example #12
Source File: CompressPayloadTest.java (from gcp-ingestion, Mozilla Public License 2.0)

@Test
public void testMaxCompressedBytes() {
  String text = StringUtils.repeat("Lorem ipsum dolor sit amet ", 100);
  int expectedCompressedSize = 68;
  CompressPayload transform = CompressPayload.of(StaticValueProvider.of(Compression.GZIP))
      .withMaxCompressedBytes(expectedCompressedSize - 1);
  PubsubMessage truncated = transform
      .compress(new PubsubMessage(text.getBytes(StandardCharsets.UTF_8), new HashMap<>()));
  assertThat(ArrayUtils.toObject(truncated.getPayload()), Matchers.arrayWithSize(50));
}
Example #13
Source File: PubsubIntegrationTest.java (from gcp-ingestion, Mozilla Public License 2.0)

@Test(timeout = 30000)
public void canSendPubsubErrorOutput() throws Exception {
  final List<String> inputLines =
      Lines.resources("testdata/pubsub-integration/error-input.ndjson");

  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  SinkOptions.Parsed sinkOptions = pipeline.getOptions().as(SinkOptions.Parsed.class);
  sinkOptions.setInput(pipeline.newProvider("test input"));
  sinkOptions.setJobName("test job name");
  sinkOptions.setErrorOutput(pipeline.newProvider(topicName.toString()));
  // We would normally use pipeline.newProvider instead of StaticValueProvider in tests,
  // but something about this configuration causes the pipeline to stall when CompressPayload
  // accesses a method on the underlying enum value when defined via pipeline.newProvider.
  sinkOptions.setErrorOutputPubsubCompression(StaticValueProvider.of(Compression.UNCOMPRESSED));

  pipeline.apply(Create.of(inputLines)).apply(InputFileFormat.json.decode())
      .apply(ErrorOutputType.pubsub.write(sinkOptions));

  final PipelineResult result = pipeline.run();

  System.err.println("Waiting for subscriber to receive messages published in the pipeline...");
  List<String> expectedLines = Lines.resources("testdata/pubsub-integration/error-output.ndjson");
  List<String> received = receiveLines(expectedLines.size());
  assertThat(received, matchesInAnyOrder(expectedLines));
  result.cancel();
}
Example #14
Source File: BulkDecompressor.java (from DataflowTemplates, Apache License 2.0)

/**
 * Decompresses the inputFile using the specified compression and outputs to the main output of
 * the {@link Decompress} doFn. Files output to the destination will be first written as temp
 * files with a "temp-" prefix within the output directory. If a file fails decompression, the
 * filename and the associated error will be output to the dead-letter.
 *
 * @param inputFile The inputFile to decompress.
 * @return A {@link ResourceId} which points to the resulting file from the decompression.
 */
private ResourceId decompress(ResourceId inputFile) throws IOException {
  // Remove the compressed extension from the file. Example: demo.txt.gz -> demo.txt
  String outputFilename = Files.getNameWithoutExtension(inputFile.toString());

  // Resolve the necessary resources to perform the transfer.
  ResourceId outputDir = FileSystems.matchNewResource(destinationLocation.get(), true);
  ResourceId outputFile = outputDir.resolve(outputFilename, StandardResolveOptions.RESOLVE_FILE);
  ResourceId tempFile =
      outputDir.resolve(
          Files.getFileExtension(inputFile.toString()) + "-temp-" + outputFilename,
          StandardResolveOptions.RESOLVE_FILE);

  // Resolve the compression
  Compression compression = Compression.detect(inputFile.toString());

  // Perform the copy of the decompressed channel into the destination.
  try (ReadableByteChannel readerChannel =
      compression.readDecompressed(FileSystems.open(inputFile))) {
    try (WritableByteChannel writerChannel = FileSystems.create(tempFile, MimeTypes.TEXT)) {
      ByteStreams.copy(readerChannel, writerChannel);
    }

    // Rename the temp file to the output file.
    FileSystems.rename(
        ImmutableList.of(tempFile),
        ImmutableList.of(outputFile),
        MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
  } catch (IOException e) {
    String msg = e.getMessage();
    LOG.error("Error occurred during decompression of {}", inputFile.toString(), e);
    throw new IOException(sanitizeDecompressionErrorMsg(msg, inputFile, compression));
  }

  return outputFile;
}
Example #15
Source File: BulkCompressor.java (from DataflowTemplates, Apache License 2.0)

@ProcessElement
public void processElement(ProcessContext context) {
  ResourceId inputFile = context.element().resourceId();
  Compression compression = compressionValue.get();

  // Add the compression extension to the output filename. Example: demo.txt -> demo.txt.gz
  String outputFilename = inputFile.getFilename() + compression.getSuggestedSuffix();

  // Resolve the necessary resources to perform the transfer
  ResourceId outputDir = FileSystems.matchNewResource(destinationLocation.get(), true);
  ResourceId outputFile = outputDir.resolve(outputFilename, StandardResolveOptions.RESOLVE_FILE);
  ResourceId tempFile =
      outputDir.resolve("temp-" + outputFilename, StandardResolveOptions.RESOLVE_FILE);

  // Perform the copy of the compressed channel to the destination.
  try (ReadableByteChannel readerChannel = FileSystems.open(inputFile)) {
    try (WritableByteChannel writerChannel =
        compression.writeCompressed(FileSystems.create(tempFile, MimeTypes.BINARY))) {

      // Execute the copy to the temporary file
      ByteStreams.copy(readerChannel, writerChannel);
    }

    // Rename the temporary file to the output file
    FileSystems.rename(ImmutableList.of(tempFile), ImmutableList.of(outputFile));

    // Output the path to the compressed file
    context.output(outputFile.toString());
  } catch (IOException e) {
    LOG.error("Error occurred during compression of {}", inputFile.toString(), e);
    context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), e.getMessage()));
  }
}
Example #16
Source File: BulkDecompressorTest.java (from DataflowTemplates, Apache License 2.0)

@BeforeClass
public static void setupClass() throws IOException {
  Path tempFolderRootPath = tempFolder.getRoot().toPath();
  tempFolderOutputPath = tempFolder.newFolder("output").toPath();

  // Test files
  compressedFile =
      TestUtils.writeToFile(
          tempFolderRootPath
              .resolve(FILE_BASE_NAME + Compression.GZIP.getSuggestedSuffix())
              .toString(),
          FILE_CONTENT,
          Compression.GZIP);

  wrongCompressionExtFile =
      TestUtils.writeToFile(
          tempFolderRootPath
              .resolve(FILE_BASE_NAME + Compression.DEFLATE.getSuggestedSuffix())
              .toString(),
          FILE_CONTENT,
          Compression.BZIP2);

  uncompressedFile =
      TestUtils.writeToFile(
          tempFolderRootPath
              .resolve(FILE_BASE_NAME + Compression.BZIP2.getSuggestedSuffix())
              .toString(),
          FILE_CONTENT,
          Compression.UNCOMPRESSED);

  unknownCompressionFile =
      TestUtils.writeToFile(
          tempFolderRootPath.resolve(FILE_BASE_NAME).toString(),
          FILE_CONTENT,
          Compression.UNCOMPRESSED);
}
Example #17
Source File: JdbcAvroIO.java (from dbeam, Apache License 2.0)

JdbcAvroSink(
    ValueProvider<ResourceId> filenamePrefix,
    DynamicAvroDestinations<UserT, Void, String> dynamicDestinations,
    JdbcAvroArgs jdbcAvroArgs) {
  super(filenamePrefix, dynamicDestinations, Compression.UNCOMPRESSED);
  this.dynamicDestinations = dynamicDestinations;
  this.jdbcAvroArgs = jdbcAvroArgs;
}
Example #18
Source File: TikaIO.java (from beam, Apache License 2.0)

@Override
public PCollection<ParseResult> expand(PBegin input) {
  return input
      .apply(FileIO.match().filepattern(getFilepattern()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(parseFiles());
}
Example #19
Source File: SnowflakeIO.java (from beam, Apache License 2.0)

private PCollection<String> writeFiles(PCollection<T> input, String stagingBucketDir) {

  PCollection<String> mappedUserData =
      input
          .apply(
              MapElements.via(
                  new SimpleFunction<T, Object[]>() {
                    @Override
                    public Object[] apply(T element) {
                      return getUserDataMapper().mapRow(element);
                    }
                  }))
          .apply("Map Objects array to CSV lines", ParDo.of(new MapObjectsArrayToCsvFn()))
          .setCoder(StringUtf8Coder.of());

  WriteFilesResult filesResult =
      mappedUserData.apply(
          "Write files to specified location",
          FileIO.<String>write()
              .via(TextIO.sink())
              .to(stagingBucketDir)
              .withPrefix(getFileNameTemplate())
              .withSuffix(".csv")
              .withCompression(Compression.GZIP));

  return (PCollection)
      filesResult
          .getPerDestinationOutputFilenames()
          .apply("Parse KV filenames to Strings", Values.<String>create());
}
Example #20
Source File: Transforms.java (from nomulus, Apache License 2.0)

/**
 * Returns a {@link PTransform} from file {@link Metadata} to {@link VersionedEntity} using
 * caller-provided {@code transformer}.
 */
static PTransform<PCollection<Metadata>, PCollection<VersionedEntity>> processFiles(
    DoFn<ReadableFile, VersionedEntity> transformer) {
  return new PTransform<PCollection<Metadata>, PCollection<VersionedEntity>>() {
    @Override
    public PCollection<VersionedEntity> expand(PCollection<Metadata> input) {
      return input
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(transformer.getClass().getSimpleName(), ParDo.of(transformer));
      // TODO(weiminyu): reshuffle to enable dynamic work rebalance per beam dev guide
    }
  };
}
Example #21
Source File: BeamInputTransform.java (from hop, Apache License 2.0)

@Override
public PCollection<HopRow> expand( PBegin input ) {
  try {
    // Only initialize once on this node/vm
    //
    BeamHop.init(transformPluginClasses, xpPluginClasses);

    // System.out.println("-------------- TextIO.Read from "+inputLocation+" (UNCOMPRESSED)");

    TextIO.Read ioRead = TextIO.read()
      .from( inputLocation )
      .withCompression( Compression.UNCOMPRESSED )
      ;

    StringToHopFn stringToHopFn = new StringToHopFn( transformName, rowMetaJson, separator,
      transformPluginClasses, xpPluginClasses );

    PCollection<HopRow> output = input

      // We read a bunch of Strings, one per line basically
      //
      .apply( transformName + " READ FILE", ioRead )

      // We need to transform these lines into Hop fields
      //
      .apply( transformName, ParDo.of( stringToHopFn ) );

    return output;

  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error in beam input transform", e );
    throw new RuntimeException( "Error in beam input transform", e );
  }
}
Example #22
Source File: Write.java (from gcp-ingestion, Mozilla Public License 2.0)

/** Public constructor. */
public FileOutput(ValueProvider<String> outputPrefix, OutputFileFormat format,
    Duration windowDuration, ValueProvider<Integer> numShards, Compression compression,
    InputType inputType) {
  this.outputPrefix = outputPrefix;
  this.format = format;
  this.windowDuration = windowDuration;
  this.numShards = numShards;
  this.compression = compression;
  this.inputType = inputType;
}
Example #23
Source File: UtilTest.java (from dlp-dataflow-deidentification, Apache License 2.0)

@Test
public void testGetReader() throws IOException {
  Path firstPath = tmpFolder.newFile("first").toPath();
  int firstSize = 37;
  Files.write(firstPath, new byte[firstSize]);
  ValueProvider<String> testValueProvider = null;

  PCollection<String> br =
      p.apply(FileIO.match().filepattern(tmpFolder.getRoot().getAbsolutePath() + "/*"))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(
              ParDo.of(
                  new DoFn<FileIO.ReadableFile, String>() {
                    @ProcessElement
                    public void processElement(
                        @Element FileIO.ReadableFile f, OutputReceiver<String> out)
                        throws IOException {
                      out.output(
                          Util.getReader(
                                  false,
                                  "object_name",
                                  "bucket_name",
                                  f,
                                  "key_name",
                                  testValueProvider)
                              .readLine());
                    }
                  }));
  p.run();
  assertNotNull(br);
}
Example #24
Source File: BeamInputTransform.java (from kettle-beam, Apache License 2.0)

@Override
public PCollection<KettleRow> expand( PBegin input ) {
  try {
    // Only initialize once on this node/vm
    //
    BeamKettle.init(stepPluginClasses, xpPluginClasses);

    // System.out.println("-------------- TextIO.Read from "+inputLocation+" (UNCOMPRESSED)");

    TextIO.Read ioRead = TextIO.read()
      .from( inputLocation )
      .withCompression( Compression.UNCOMPRESSED )
      ;

    StringToKettleFn stringToKettleFn = new StringToKettleFn( stepname, rowMetaJson, separator,
      stepPluginClasses, xpPluginClasses );

    PCollection<KettleRow> output = input

      // We read a bunch of Strings, one per line basically
      //
      .apply( stepname + " READ FILE", ioRead )

      // We need to transform these lines into Kettle fields
      //
      .apply( stepname, ParDo.of( stringToKettleFn ) );

    return output;

  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error in beam input transform", e );
    throw new RuntimeException( "Error in beam input transform", e );
  }
}
Example #25
Source File: CompressPayload.java (from gcp-ingestion, Mozilla Public License 2.0)

@VisibleForTesting
static byte[] compress(byte[] payload, Compression compression) {
  ByteArrayOutputStream out = new ByteArrayOutputStream();

  // We use a try-with-resources statement to ensure everything gets closed appropriately.
  try (ReadableByteChannel inChannel = Channels.newChannel(new ByteArrayInputStream(payload));
      WritableByteChannel outChannel = compression.writeCompressed(Channels.newChannel(out))) {
    ByteStreams.copy(inChannel, outChannel);
  } catch (IOException e) {
    return payload;
  }
  return out.toByteArray();
}
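The inverse direction is not part of this class. As a hedged sketch under the same imports as compress() above, a matching helper could inflate a payload with the readDecompressed wrapper seen in Example #14; the method below is an illustration, not existing gcp-ingestion code.

/** Hypothetical inverse of compress(): inflate a payload with the given codec. */
static byte[] decompress(byte[] payload, Compression compression) {
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  try (ReadableByteChannel inChannel =
          compression.readDecompressed(Channels.newChannel(new ByteArrayInputStream(payload)));
      WritableByteChannel outChannel = Channels.newChannel(out)) {
    ByteStreams.copy(inChannel, outChannel);
  } catch (IOException e) {
    // Mirror compress(): fall back to returning the payload unchanged on failure.
    return payload;
  }
  return out.toByteArray();
}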
Example #26
Source File: Write.java (from gcp-ingestion, Mozilla Public License 2.0)

/** Public constructor. */
public AvroOutput(ValueProvider<String> outputPrefix, Duration windowDuration,
    ValueProvider<Integer> numShards, Compression compression, InputType inputType,
    ValueProvider<String> schemasLocation) {
  this.outputPrefix = outputPrefix;
  this.windowDuration = windowDuration;
  this.numShards = numShards;
  this.compression = compression;
  this.inputType = inputType;
  this.schemasLocation = schemasLocation;
  this.pathTemplate = NestedValueProvider.of(outputPrefix, DynamicPathTemplate::new);
}
Example #27
Source File: SinkOptions.java (from gcp-ingestion, Mozilla Public License 2.0)

@Description("Compression format for --outputType=file")
@Default.Enum("GZIP")
Compression getOutputFileCompression();
Example #28
Source File: S3Import.java (from dlp-dataflow-deidentification, Apache License 2.0)

public static void main(String[] args) {

  S3ImportOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(S3ImportOptions.class);

  AWSOptionParser.formatOptions(options);

  Pipeline p = Pipeline.create(options);

  // s3
  PCollection<KV<String, ReadableFile>> s3Files =
      p.apply(
              "Poll S3 Files",
              FileIO.match()
                  .filepattern(options.getS3BucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("S3 File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add S3 File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  // gcs files
  PCollection<KV<String, ReadableFile>> gcsFiles =
      p.apply(
              "Poll GCS Files",
              FileIO.match()
                  .filepattern(options.getGcsBucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("GCS File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add GCS File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  PCollection<KV<String, ReadableFile>> files =
      PCollectionList.of(ImmutableList.of(gcsFiles, s3Files))
          .apply("File List", Flatten.pCollections())
          .apply(
              "Fixed Window",
              Window.<KV<String, ReadableFile>>into(FixedWindows.of(WINDOW_INTERVAL))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.ZERO));

  PCollectionTuple contents =
      files.apply(
          "Read File Contents",
          ParDo.of(new TextFileReader())
              .withOutputTags(
                  textReaderSuccessElements, TupleTagList.of(textReaderFailedElements)));

  PCollectionTuple inspectedContents =
      contents
          .get(textReaderSuccessElements)
          .apply(
              "DLP Inspection",
              ParDo.of(new TokenizeData(options.getProject(), options.getInspectTemplateName()))
                  .withOutputTags(
                      apiResponseSuccessElements, TupleTagList.of(apiResponseFailedElements)));

  inspectedContents
      .get(apiResponseSuccessElements)
      .apply(
          "BQ Write",
          BigQueryIO.<KV<String, TableRow>>write()
              .to(new BQDestination(options.getDataSetId(), options.getProject()))
              .withFormatFunction(
                  element -> {
                    return element.getValue();
                  })
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
              .withoutValidation()
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  PCollectionList.of(
          ImmutableList.of(
              contents.get(textReaderFailedElements),
              inspectedContents.get(apiResponseFailedElements)))
      .apply("Combine Error Logs", Flatten.pCollections())
      .apply(
          "Write Error Logs",
          ParDo.of(
              new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  LOG.error("***ERROR*** {}", c.element().toString());
                  c.output(c.element());
                }
              }));

  p.run();
}
Example #29
Source File: CSVStreamingPipeline.java (from dlp-dataflow-deidentification, Apache License 2.0)

@SuppressWarnings("serial")
public static void doTokenization(TokenizePipelineOptions options) {
  Pipeline p = Pipeline.create(options);

  PCollection<KV<String, List<String>>> filesAndContents =
      p.apply(
              FileIO.match()
                  .filepattern(options.getInputFile())
                  .continuously(
                      Duration.standardSeconds(options.getPollingInterval()),
                      Watch.Growth.never()))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(
              "FileHandler",
              ParDo.of(
                  new CSVReader(
                      options.getCsek(),
                      options.getCsekhash(),
                      options.getFileDecryptKeyName(),
                      options.getFileDecryptKey(),
                      options.as(GcpOptions.class).getProject(),
                      options.getBatchSize())));

  PCollection<KV<String, Table>> dlpTables =
      filesAndContents.apply(
          "ContentHandler", ParDo.of(new CSVContentProcessorDoFn(options.getBatchSize())));

  PCollection<Row> dlpRows =
      dlpTables
          .apply(
              "DoDLPTokenization",
              ParDo.of(
                  new DLPTokenizationDoFn(
                      options.as(GcpOptions.class).getProject(),
                      options.getDeidentifyTemplateName(),
                      options.getInspectTemplateName())))
          .apply(
              Window.<Row>into(FixedWindows.of(Duration.standardSeconds(options.getInterval())))
                  .triggering(
                      AfterProcessingTime.pastFirstElementInPane()
                          .plusDelayOf(Duration.standardMinutes(1)))
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.standardMinutes(1)));

  dlpRows.apply(
      "WriteToBQ",
      BigQueryIO.<Row>write()
          .to(new BQDestination(options.getDataset(), options.as(GcpOptions.class).getProject()))
          .withFormatFunction(new BQTableRowSF())
          .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  dlpRows
      .apply(
          MapElements.via(
              new SimpleFunction<Row, KV<String, Row>>() {
                @Override
                public KV<String, Row> apply(Row row) {
                  return KV.of(row.getTableId(), row);
                }
              }))
      .apply(GroupByKey.<String, Row>create())
      .apply(
          "WriteToGCS",
          FileIO.<String, KV<String, Iterable<Row>>>writeDynamic()
              .by(
                  (SerializableFunction<KV<String, Iterable<Row>>, String>)
                      row -> {
                        return row.getKey();
                      })
              .via(new CSVSink())
              .to(options.getOutputFile())
              .withDestinationCoder(StringUtf8Coder.of())
              .withNumShards(1)
              .withNaming(key -> FileIO.Write.defaultNaming(key, ".csv")));

  p.run();
}
Example #30
Source File: XmlIO.java (from beam, Apache License 2.0)

/** Decompresses all input files using the specified compression type. */
public Read<T> withCompression(Compression compression) {
  return toBuilder().setCompression(compression).build();
}
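A hedged usage sketch of the builder above: the file pattern, element names, and JAXB-annotated Record class are placeholders, and the surrounding XmlIO.Read configuration methods are given from memory, so check them against the Beam version in use.

PCollection<Record> records =
    pipeline.apply(
        "Read compressed XML",
        XmlIO.<Record>read()
            .from("gs://example-bucket/records*.xml.gz")  // hypothetical location
            .withRootElement("records")
            .withRecordElement("record")
            .withRecordClass(Record.class)
            .withCompression(Compression.GZIP));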