org.apache.beam.sdk.io.Compression Java Examples
The following examples show how to use org.apache.beam.sdk.io.Compression, drawn from a range of open-source projects. The source file, originating project, and license are noted above each example.
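Before the examples, here is a minimal, self-contained sketch of the Compression calls that recur below: detecting a codec from a filename, asking for a suggested suffix, and wrapping a plain channel for compressed output. The filenames and payload are placeholders; the Guava ByteStreams helper is the same one several examples use.

import com.google.common.io.ByteStreams;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import org.apache.beam.sdk.io.Compression;

public class CompressionSketch {
  public static void main(String[] args) throws Exception {
    // Detect a codec from a file name; AUTO can also say whether a name looks compressed.
    Compression detected = Compression.detect("events.json.gz");          // GZIP
    boolean looksCompressed = Compression.AUTO.isCompressed("data.txt");  // false

    // Suggested suffix for naming output files, e.g. ".gz" for GZIP.
    String suffix = Compression.GZIP.getSuggestedSuffix();

    // Compress a byte payload by wrapping a plain channel with writeCompressed.
    byte[] payload = "hello world".getBytes(StandardCharsets.UTF_8);
    ByteArrayOutputStream compressed = new ByteArrayOutputStream();
    try (ReadableByteChannel in = Channels.newChannel(new ByteArrayInputStream(payload));
        WritableByteChannel out =
            Compression.GZIP.writeCompressed(Channels.newChannel(compressed))) {
      ByteStreams.copy(in, out);
    }

    System.out.println(detected + " " + looksCompressed + " " + suffix + " "
        + compressed.size() + " compressed bytes");
  }
}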
Example #1
Source File: TikaIOTest.java (from beam, Apache License 2.0)

@Test
public void testParseAndParseFiles() throws IOException {
  Path root =
      Paths.get(getClass().getResource("/valid/apache-beam-tika.odt").getPath()).getParent();

  List<ParseResult> expected =
      Arrays.asList(
          ParseResult.success(
              root.resolve("apache-beam-tika.odt").toString(), ODT_FILE, getOdtMetadata()),
          ParseResult.success(root.resolve("apache-beam-tika-pdf.zip").toString(), PDF_ZIP_FILE));

  PCollection<ParseResult> parse =
      p.apply("Parse", TikaIO.parse().filepattern(root.resolve("*").toString()))
          .apply("FilterParse", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parse).containsInAnyOrder(expected);

  PCollection<ParseResult> parseFiles =
      p.apply("ParseFiles", FileIO.match().filepattern(root.resolve("*").toString()))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(TikaIO.parseFiles())
          .apply("FilterParseFiles", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parseFiles).containsInAnyOrder(expected);
  p.run();
}
Example #2
Source File: SinkOptions.java (from gcp-ingestion, Mozilla Public License 2.0)

/** Set all the derived fields of a {@link SinkOptions.Parsed} instance. */
static void enrichSinkOptions(Parsed options) {
  validateSinkOptions(options);
  options.setParsedWindowDuration(Time.parseDuration(options.getWindowDuration()));
  options.setParsedBqTriggeringFrequency(Time.parseDuration(options.getBqTriggeringFrequency()));
  options.setParsedErrorBqTriggeringFrequency(
      Time.parseDuration(options.getErrorBqTriggeringFrequency()));
  options.setDecompressInputPayloads(
      providerWithDefault(options.getDecompressInputPayloads(), true));
  options.setOutputTableRowFormat(
      providerWithDefault(options.getOutputTableRowFormat(), TableRowFormat.payload));
  options.setOutputPubsubCompression(
      providerWithDefault(options.getOutputPubsubCompression(), Compression.GZIP));
  options.setErrorOutputPubsubCompression(
      providerWithDefault(options.getErrorOutputPubsubCompression(), Compression.GZIP));
  options.setOutputNumShards(providerWithDefault(options.getOutputNumShards(), 100));
  options.setErrorOutputNumShards(providerWithDefault(options.getErrorOutputNumShards(), 100));
}
Example #3
Source File: TFRecordIOIT.java (from beam, Apache License 2.0)

@BeforeClass
public static void setup() {
  FileBasedIOTestPipelineOptions options = readFileBasedIOITPipelineOptions();
  datasetSize = options.getDatasetSize();
  expectedHash = options.getExpectedHash();
  numberOfTextLines = options.getNumberOfRecords();
  compressionType = Compression.valueOf(options.getCompressionType());
  filenamePrefix = appendTimestampSuffix(options.getFilenamePrefix());
  bigQueryDataset = options.getBigQueryDataset();
  bigQueryTable = options.getBigQueryTable();
  settings =
      InfluxDBSettings.builder()
          .withHost(options.getInfluxHost())
          .withDatabase(options.getInfluxDatabase())
          .withMeasurement(options.getInfluxMeasurement())
          .get();
}
Example #4
Source File: TextIOIT.java (from beam, Apache License 2.0)

@BeforeClass
public static void setup() {
  FileBasedIOTestPipelineOptions options = readFileBasedIOITPipelineOptions();
  datasetSize = options.getDatasetSize();
  expectedHash = options.getExpectedHash();
  numberOfTextLines = options.getNumberOfRecords();
  compressionType = Compression.valueOf(options.getCompressionType());
  filenamePrefix = appendTimestampSuffix(options.getFilenamePrefix());
  numShards = options.getNumberOfShards();
  bigQueryDataset = options.getBigQueryDataset();
  bigQueryTable = options.getBigQueryTable();
  gatherGcsPerformanceMetrics = options.getReportGcsPerformanceMetrics();
  settings =
      InfluxDBSettings.builder()
          .withHost(options.getInfluxHost())
          .withDatabase(options.getInfluxDatabase())
          .withMeasurement(options.getInfluxMeasurement())
          .get();
}
Example #5
Source File: TestUtils.java (from DataflowTemplates, Apache License 2.0)

/**
 * Helper to generate files for testing.
 *
 * @param filePath The path to the file to write.
 * @param lines The lines to write.
 * @param compression The compression type of the file.
 * @return The file written.
 * @throws IOException If an error occurs while creating or writing the file.
 */
public static ResourceId writeToFile(
    String filePath, List<String> lines, Compression compression) throws IOException {

  String fileContents = String.join(System.lineSeparator(), lines);

  ResourceId resourceId = FileSystems.matchNewResource(filePath, false);

  String mimeType = compression == Compression.UNCOMPRESSED ? MimeTypes.TEXT : MimeTypes.BINARY;

  // Write the file contents to the channel and close.
  try (ReadableByteChannel readChannel =
      Channels.newChannel(new ByteArrayInputStream(fileContents.getBytes()))) {
    try (WritableByteChannel writeChannel =
        compression.writeCompressed(FileSystems.create(resourceId, mimeType))) {
      ByteStreams.copy(readChannel, writeChannel);
    }
  }

  return resourceId;
}
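A hedged usage sketch for this helper, mirroring how Example #16 below calls it from a test setup. The JUnit TemporaryFolder rule, file name, and contents here are illustrative assumptions.

// Hypothetical JUnit usage of the helper above (assumes a @Rule TemporaryFolder tempFolder
// and the TestUtils class from this template project on the classpath).
@Test
public void writesGzippedFixture() throws IOException {
  List<String> lines = Arrays.asList("id,name", "1,alpha", "2,beta");
  ResourceId fixture =
      TestUtils.writeToFile(
          tempFolder
              .getRoot()
              .toPath()
              .resolve("fixture.csv" + Compression.GZIP.getSuggestedSuffix())
              .toString(),
          lines,
          Compression.GZIP);
  assertNotNull(fixture);
}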
Example #6
Source File: PubsubIntegrationTest.java (from gcp-ingestion, Mozilla Public License 2.0)

@Test(timeout = 30000)
public void canSendPubsubOutput() throws Exception {
  final List<String> inputLines = Lines.resources("testdata/pubsub-integration/input.ndjson");

  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  SinkOptions.Parsed sinkOptions = pipeline.getOptions().as(SinkOptions.Parsed.class);
  sinkOptions.setOutput(pipeline.newProvider(topicName.toString()));
  // We would normally use pipeline.newProvider instead of StaticValueProvider in tests,
  // but something about this configuration causes the pipeline to stall when CompressPayload
  // accesses a method on the underlying enum value when defined via pipeline.newProvider.
  sinkOptions.setOutputPubsubCompression(StaticValueProvider.of(Compression.UNCOMPRESSED));

  pipeline.apply(Create.of(inputLines)).apply(InputFileFormat.json.decode())
      .apply(OutputType.pubsub.write(sinkOptions));

  final PipelineResult result = pipeline.run();

  System.err.println("Waiting for subscriber to receive messages published in the pipeline...");
  List<String> expectedLines = Lines.resources("testdata/pubsub-integration/truncated.ndjson");
  List<String> received = receiveLines(expectedLines.size());
  assertThat(received, matchesInAnyOrder(expectedLines));
  result.cancel();
}
Example #7
Source File: BulkCompressorTest.java (from DataflowTemplates, Apache License 2.0)

/** Tests the {@link BulkCompressor.Compressor} performs compression properly. */
@Test
public void testCompressFile() throws Exception {
  // Setup test
  final Compression compression = Compression.GZIP;

  final ValueProvider<String> outputDirectoryProvider =
      pipeline.newProvider(tempFolderCompressedPath.toString());
  final ValueProvider<Compression> compressionProvider = StaticValueProvider.of(compression);
  final Metadata metadata = FileSystems.matchSingleFileSpec(textFile.toString());

  // Execute the compressor
  PCollection<String> lines =
      pipeline
          .apply("Create File Input", Create.of(metadata))
          .apply(
              "Compress", ParDo.of(new Compressor(outputDirectoryProvider, compressionProvider)))
          .apply("Read the Files", TextIO.readAll().withCompression(Compression.AUTO));

  // Test the result
  PAssert.that(lines).containsInAnyOrder(FILE_CONTENT);
  pipeline.run();
}
Example #8
Source File: BulkDecompressor.java (from DataflowTemplates, Apache License 2.0)

@ProcessElement
public void processElement(ProcessContext context) {
  ResourceId inputFile = context.element().resourceId();

  // Output a record to the failure file if the file doesn't match a known compression.
  if (!Compression.AUTO.isCompressed(inputFile.toString())) {
    String errorMsg =
        String.format(UNCOMPRESSED_ERROR_MSG, inputFile.toString(), SUPPORTED_COMPRESSIONS);
    context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), errorMsg));
  } else {
    try {
      ResourceId outputFile = decompress(inputFile);
      context.output(outputFile.toString());
    } catch (IOException e) {
      LOG.error(e.getMessage());
      context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), e.getMessage()));
    }
  }
}
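To make the dead-letter gate above concrete, here is a small standalone sketch of how Compression.AUTO classifies filenames; the paths are made up.

import org.apache.beam.sdk.io.Compression;

public class AutoDetectSketch {
  public static void main(String[] args) {
    // Names with a recognized compressed suffix pass the gate above.
    System.out.println(Compression.AUTO.isCompressed("archive/demo.txt.gz"));   // true
    System.out.println(Compression.AUTO.isCompressed("archive/demo.txt.bz2"));  // true
    // Anything else is routed to the dead-letter output.
    System.out.println(Compression.AUTO.isCompressed("archive/demo.txt"));      // false
    // detect() reports which codec matched the name.
    System.out.println(Compression.detect("archive/demo.txt.gz"));              // GZIP
  }
}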
Example #9
Source File: Write.java (from gcp-ingestion, Mozilla Public License 2.0)

/** Constructor. */
public PubsubOutput(ValueProvider<String> topic, ValueProvider<Compression> compression,
    int maxCompressedBytes) {
  this.topic = topic;
  this.compression = compression;
  this.maxCompressedBytes = maxCompressedBytes;
}
Example #10
Source File: TextStreamingPipeline.java (from dlp-dataflow-deidentification, Apache License 2.0)

public static void main(String[] args) throws IOException, GeneralSecurityException {

  TokenizePipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(TokenizePipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(
          FileIO.match()
              .filepattern(options.getInputFile())
              .continuously(
                  Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(
          "Text File Reader",
          ParDo.of(
              new TextFileReader(
                  options.as(GcpOptions.class).getProject(),
                  options.getFileDecryptKeyName(),
                  options.getFileDecryptKey(),
                  options.getBatchSize(),
                  options.getCsek(),
                  options.getCsekhash())))
      .apply(
          "Tokenize Data",
          ParDo.of(
              new TokenizeData(
                  options.as(GcpOptions.class).getProject(),
                  options.getDeidentifyTemplateName(),
                  options.getInspectTemplateName())))
      .apply(
          Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getInterval()))))
      .apply(new WriteOneFilePerWindow(options.getOutputFile(), 1));

  p.run();
}
Example #11
Source File: CompressPayloadTest.java (from gcp-ingestion, Mozilla Public License 2.0)

@Test
public void testGzipCompress() {
  String text = StringUtils.repeat("Lorem ipsum dolor sit amet ", 100);
  byte[] compressedBytes =
      CompressPayload.compress(text.getBytes(StandardCharsets.UTF_8), Compression.GZIP);
  assertThat(ArrayUtils.toObject(compressedBytes), Matchers.arrayWithSize(68));
}
Example #12
Source File: CompressPayloadTest.java (from gcp-ingestion, Mozilla Public License 2.0)

@Test
public void testMaxCompressedBytes() {
  String text = StringUtils.repeat("Lorem ipsum dolor sit amet ", 100);
  int expectedCompressedSize = 68;
  CompressPayload transform = CompressPayload.of(StaticValueProvider.of(Compression.GZIP))
      .withMaxCompressedBytes(expectedCompressedSize - 1);
  PubsubMessage truncated = transform
      .compress(new PubsubMessage(text.getBytes(StandardCharsets.UTF_8), new HashMap<>()));
  assertThat(ArrayUtils.toObject(truncated.getPayload()), Matchers.arrayWithSize(50));
}
Example #13
Source File: PubsubIntegrationTest.java (from gcp-ingestion, Mozilla Public License 2.0)

@Test(timeout = 30000)
public void canSendPubsubErrorOutput() throws Exception {
  final List<String> inputLines =
      Lines.resources("testdata/pubsub-integration/error-input.ndjson");

  pipeline.getOptions().as(DirectOptions.class).setBlockOnRun(false);

  SinkOptions.Parsed sinkOptions = pipeline.getOptions().as(SinkOptions.Parsed.class);
  sinkOptions.setInput(pipeline.newProvider("test input"));
  sinkOptions.setJobName("test job name");
  sinkOptions.setErrorOutput(pipeline.newProvider(topicName.toString()));
  // We would normally use pipeline.newProvider instead of StaticValueProvider in tests,
  // but something about this configuration causes the pipeline to stall when CompressPayload
  // accesses a method on the underlying enum value when defined via pipeline.newProvider.
  sinkOptions.setErrorOutputPubsubCompression(StaticValueProvider.of(Compression.UNCOMPRESSED));

  pipeline.apply(Create.of(inputLines)).apply(InputFileFormat.json.decode())
      .apply(ErrorOutputType.pubsub.write(sinkOptions));

  final PipelineResult result = pipeline.run();

  System.err.println("Waiting for subscriber to receive messages published in the pipeline...");
  List<String> expectedLines = Lines.resources("testdata/pubsub-integration/error-output.ndjson");
  List<String> received = receiveLines(expectedLines.size());
  assertThat(received, matchesInAnyOrder(expectedLines));
  result.cancel();
}
Example #14
Source File: BulkDecompressor.java (from DataflowTemplates, Apache License 2.0)

/**
 * Decompresses the inputFile using the specified compression and outputs to the main output of
 * the {@link Decompress} doFn. Files output to the destination will be first written as temp
 * files with a "temp-" prefix within the output directory. If a file fails decompression, the
 * filename and the associated error will be output to the dead-letter.
 *
 * @param inputFile The inputFile to decompress.
 * @return A {@link ResourceId} which points to the resulting file from the decompression.
 */
private ResourceId decompress(ResourceId inputFile) throws IOException {
  // Remove the compressed extension from the file. Example: demo.txt.gz -> demo.txt
  String outputFilename = Files.getNameWithoutExtension(inputFile.toString());

  // Resolve the necessary resources to perform the transfer.
  ResourceId outputDir = FileSystems.matchNewResource(destinationLocation.get(), true);
  ResourceId outputFile = outputDir.resolve(outputFilename, StandardResolveOptions.RESOLVE_FILE);
  ResourceId tempFile =
      outputDir.resolve(
          Files.getFileExtension(inputFile.toString()) + "-temp-" + outputFilename,
          StandardResolveOptions.RESOLVE_FILE);

  // Resolve the compression
  Compression compression = Compression.detect(inputFile.toString());

  // Perform the copy of the decompressed channel into the destination.
  try (ReadableByteChannel readerChannel =
      compression.readDecompressed(FileSystems.open(inputFile))) {
    try (WritableByteChannel writerChannel = FileSystems.create(tempFile, MimeTypes.TEXT)) {
      ByteStreams.copy(readerChannel, writerChannel);
    }

    // Rename the temp file to the output file.
    FileSystems.rename(
        ImmutableList.of(tempFile),
        ImmutableList.of(outputFile),
        MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
  } catch (IOException e) {
    String msg = e.getMessage();
    LOG.error("Error occurred during decompression of {}", inputFile.toString(), e);
    throw new IOException(sanitizeDecompressionErrorMsg(msg, inputFile, compression));
  }

  return outputFile;
}
Example #15
Source File: BulkCompressor.java (from DataflowTemplates, Apache License 2.0)

@ProcessElement
public void processElement(ProcessContext context) {
  ResourceId inputFile = context.element().resourceId();
  Compression compression = compressionValue.get();

  // Add the compression extension to the output filename. Example: demo.txt -> demo.txt.gz
  String outputFilename = inputFile.getFilename() + compression.getSuggestedSuffix();

  // Resolve the necessary resources to perform the transfer
  ResourceId outputDir = FileSystems.matchNewResource(destinationLocation.get(), true);
  ResourceId outputFile = outputDir.resolve(outputFilename, StandardResolveOptions.RESOLVE_FILE);
  ResourceId tempFile =
      outputDir.resolve("temp-" + outputFilename, StandardResolveOptions.RESOLVE_FILE);

  // Perform the copy of the compressed channel to the destination.
  try (ReadableByteChannel readerChannel = FileSystems.open(inputFile)) {
    try (WritableByteChannel writerChannel =
        compression.writeCompressed(FileSystems.create(tempFile, MimeTypes.BINARY))) {

      // Execute the copy to the temporary file
      ByteStreams.copy(readerChannel, writerChannel);
    }

    // Rename the temporary file to the output file
    FileSystems.rename(ImmutableList.of(tempFile), ImmutableList.of(outputFile));

    // Output the path to the compressed file
    context.output(outputFile.toString());
  } catch (IOException e) {
    LOG.error("Error occurred during compression of {}", inputFile.toString(), e);
    context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), e.getMessage()));
  }
}
Example #16
Source File: BulkDecompressorTest.java (from DataflowTemplates, Apache License 2.0)

@BeforeClass
public static void setupClass() throws IOException {
  Path tempFolderRootPath = tempFolder.getRoot().toPath();
  tempFolderOutputPath = tempFolder.newFolder("output").toPath();

  // Test files
  compressedFile =
      TestUtils.writeToFile(
          tempFolderRootPath
              .resolve(FILE_BASE_NAME + Compression.GZIP.getSuggestedSuffix())
              .toString(),
          FILE_CONTENT,
          Compression.GZIP);

  wrongCompressionExtFile =
      TestUtils.writeToFile(
          tempFolderRootPath
              .resolve(FILE_BASE_NAME + Compression.DEFLATE.getSuggestedSuffix())
              .toString(),
          FILE_CONTENT,
          Compression.BZIP2);

  uncompressedFile =
      TestUtils.writeToFile(
          tempFolderRootPath
              .resolve(FILE_BASE_NAME + Compression.BZIP2.getSuggestedSuffix())
              .toString(),
          FILE_CONTENT,
          Compression.UNCOMPRESSED);

  unknownCompressionFile =
      TestUtils.writeToFile(
          tempFolderRootPath.resolve(FILE_BASE_NAME).toString(),
          FILE_CONTENT,
          Compression.UNCOMPRESSED);
}
Example #17
Source File: JdbcAvroIO.java (from dbeam, Apache License 2.0)

JdbcAvroSink(
    ValueProvider<ResourceId> filenamePrefix,
    DynamicAvroDestinations<UserT, Void, String> dynamicDestinations,
    JdbcAvroArgs jdbcAvroArgs) {
  super(filenamePrefix, dynamicDestinations, Compression.UNCOMPRESSED);
  this.dynamicDestinations = dynamicDestinations;
  this.jdbcAvroArgs = jdbcAvroArgs;
}
Example #18
Source File: TikaIO.java (from beam, Apache License 2.0)

@Override
public PCollection<ParseResult> expand(PBegin input) {
  return input
      .apply(FileIO.match().filepattern(getFilepattern()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(parseFiles());
}
Example #19
Source File: SnowflakeIO.java (from beam, Apache License 2.0)

private PCollection<String> writeFiles(PCollection<T> input, String stagingBucketDir) {

  PCollection<String> mappedUserData =
      input
          .apply(
              MapElements.via(
                  new SimpleFunction<T, Object[]>() {
                    @Override
                    public Object[] apply(T element) {
                      return getUserDataMapper().mapRow(element);
                    }
                  }))
          .apply("Map Objects array to CSV lines", ParDo.of(new MapObjectsArrayToCsvFn()))
          .setCoder(StringUtf8Coder.of());

  WriteFilesResult filesResult =
      mappedUserData.apply(
          "Write files to specified location",
          FileIO.<String>write()
              .via(TextIO.sink())
              .to(stagingBucketDir)
              .withPrefix(getFileNameTemplate())
              .withSuffix(".csv")
              .withCompression(Compression.GZIP));

  return (PCollection)
      filesResult
          .getPerDestinationOutputFilenames()
          .apply("Parse KV filenames to Strings", Values.<String>create());
}
Example #20
Source File: Transforms.java (from nomulus, Apache License 2.0)

/**
 * Returns a {@link PTransform} from file {@link Metadata} to {@link VersionedEntity} using
 * caller-provided {@code transformer}.
 */
static PTransform<PCollection<Metadata>, PCollection<VersionedEntity>> processFiles(
    DoFn<ReadableFile, VersionedEntity> transformer) {
  return new PTransform<PCollection<Metadata>, PCollection<VersionedEntity>>() {
    @Override
    public PCollection<VersionedEntity> expand(PCollection<Metadata> input) {
      return input
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(transformer.getClass().getSimpleName(), ParDo.of(transformer));
      // TODO(weiminyu): reshuffle to enable dynamic work rebalance per beam dev guide
    }
  };
}
Example #21
Source File: BeamInputTransform.java (from hop, Apache License 2.0)

@Override
public PCollection<HopRow> expand( PBegin input ) {
  try {
    // Only initialize once on this node/vm
    //
    BeamHop.init(transformPluginClasses, xpPluginClasses);

    // System.out.println("-------------- TextIO.Read from "+inputLocation+" (UNCOMPRESSED)");

    TextIO.Read ioRead = TextIO.read()
      .from( inputLocation )
      .withCompression( Compression.UNCOMPRESSED )
      ;

    StringToHopFn stringToHopFn = new StringToHopFn( transformName, rowMetaJson, separator,
      transformPluginClasses, xpPluginClasses );

    PCollection<HopRow> output = input

      // We read a bunch of Strings, one per line basically
      //
      .apply( transformName + " READ FILE", ioRead )

      // We need to transform these lines into Hop fields
      //
      .apply( transformName, ParDo.of( stringToHopFn ) );

    return output;

  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error in beam input transform", e );
    throw new RuntimeException( "Error in beam input transform", e );
  }
}
Example #22
Source File: Write.java (from gcp-ingestion, Mozilla Public License 2.0)

/** Public constructor. */
public FileOutput(ValueProvider<String> outputPrefix, OutputFileFormat format,
    Duration windowDuration, ValueProvider<Integer> numShards, Compression compression,
    InputType inputType) {
  this.outputPrefix = outputPrefix;
  this.format = format;
  this.windowDuration = windowDuration;
  this.numShards = numShards;
  this.compression = compression;
  this.inputType = inputType;
}
Example #23
Source File: UtilTest.java (from dlp-dataflow-deidentification, Apache License 2.0)

@Test
public void testGetReader() throws IOException {
  Path firstPath = tmpFolder.newFile("first").toPath();
  int firstSize = 37;
  Files.write(firstPath, new byte[firstSize]);
  ValueProvider<String> testValueProvider = null;

  PCollection<String> br =
      p.apply(FileIO.match().filepattern(tmpFolder.getRoot().getAbsolutePath() + "/*"))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(
              ParDo.of(
                  new DoFn<FileIO.ReadableFile, String>() {
                    @ProcessElement
                    public void processElement(
                        @Element FileIO.ReadableFile f, OutputReceiver<String> out)
                        throws IOException {
                      out.output(
                          Util.getReader(
                                  false,
                                  "object_name",
                                  "bucket_name",
                                  f,
                                  "key_name",
                                  testValueProvider)
                              .readLine());
                    }
                  }));
  p.run();
  assertNotNull(br);
}
Example #24
Source File: BeamInputTransform.java (from kettle-beam, Apache License 2.0)

@Override
public PCollection<KettleRow> expand( PBegin input ) {
  try {
    // Only initialize once on this node/vm
    //
    BeamKettle.init(stepPluginClasses, xpPluginClasses);

    // System.out.println("-------------- TextIO.Read from "+inputLocation+" (UNCOMPRESSED)");

    TextIO.Read ioRead = TextIO.read()
      .from( inputLocation )
      .withCompression( Compression.UNCOMPRESSED )
      ;

    StringToKettleFn stringToKettleFn = new StringToKettleFn( stepname, rowMetaJson, separator,
      stepPluginClasses, xpPluginClasses );

    PCollection<KettleRow> output = input

      // We read a bunch of Strings, one per line basically
      //
      .apply( stepname + " READ FILE", ioRead )

      // We need to transform these lines into Kettle fields
      //
      .apply( stepname, ParDo.of( stringToKettleFn ) );

    return output;

  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error in beam input transform", e );
    throw new RuntimeException( "Error in beam input transform", e );
  }
}
Example #25
Source File: CompressPayload.java (from gcp-ingestion, Mozilla Public License 2.0)

@VisibleForTesting
static byte[] compress(byte[] payload, Compression compression) {
  ByteArrayOutputStream out = new ByteArrayOutputStream();

  // We use a try-with-resources statement to ensure everything gets closed appropriately.
  try (ReadableByteChannel inChannel = Channels.newChannel(new ByteArrayInputStream(payload));
      WritableByteChannel outChannel = compression.writeCompressed(Channels.newChannel(out))) {
    ByteStreams.copy(inChannel, outChannel);
  } catch (IOException e) {
    return payload;
  }
  return out.toByteArray();
}
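The inverse direction is not part of this class. As a hedged sketch under the same imports as compress() above, a matching helper could inflate a payload with the readDecompressed wrapper seen in Example #14; the method below is an illustration, not existing gcp-ingestion code.

/** Hypothetical inverse of compress(): inflate a payload with the given codec. */
static byte[] decompress(byte[] payload, Compression compression) {
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  try (ReadableByteChannel inChannel =
          compression.readDecompressed(Channels.newChannel(new ByteArrayInputStream(payload)));
      WritableByteChannel outChannel = Channels.newChannel(out)) {
    ByteStreams.copy(inChannel, outChannel);
  } catch (IOException e) {
    // Mirror compress(): fall back to returning the payload unchanged on failure.
    return payload;
  }
  return out.toByteArray();
}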
Example #26
Source File: Write.java (from gcp-ingestion, Mozilla Public License 2.0)

/** Public constructor. */
public AvroOutput(ValueProvider<String> outputPrefix, Duration windowDuration,
    ValueProvider<Integer> numShards, Compression compression, InputType inputType,
    ValueProvider<String> schemasLocation) {
  this.outputPrefix = outputPrefix;
  this.windowDuration = windowDuration;
  this.numShards = numShards;
  this.compression = compression;
  this.inputType = inputType;
  this.schemasLocation = schemasLocation;
  this.pathTemplate = NestedValueProvider.of(outputPrefix, DynamicPathTemplate::new);
}
Example #27
Source File: SinkOptions.java (from gcp-ingestion, Mozilla Public License 2.0)

@Description("Compression format for --outputType=file")
@Default.Enum("GZIP")
Compression getOutputFileCompression();
Example #28
Source File: S3Import.java (from dlp-dataflow-deidentification, Apache License 2.0)

public static void main(String[] args) {

  S3ImportOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(S3ImportOptions.class);

  AWSOptionParser.formatOptions(options);

  Pipeline p = Pipeline.create(options);

  // s3
  PCollection<KV<String, ReadableFile>> s3Files =
      p.apply(
              "Poll S3 Files",
              FileIO.match()
                  .filepattern(options.getS3BucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("S3 File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add S3 File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  // gcs files
  PCollection<KV<String, ReadableFile>> gcsFiles =
      p.apply(
              "Poll GCS Files",
              FileIO.match()
                  .filepattern(options.getGcsBucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("GCS File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add GCS File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  PCollection<KV<String, ReadableFile>> files =
      PCollectionList.of(ImmutableList.of(gcsFiles, s3Files))
          .apply("File List", Flatten.pCollections())
          .apply(
              "Fixed Window",
              Window.<KV<String, ReadableFile>>into(FixedWindows.of(WINDOW_INTERVAL))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.ZERO));

  PCollectionTuple contents =
      files.apply(
          "Read File Contents",
          ParDo.of(new TextFileReader())
              .withOutputTags(
                  textReaderSuccessElements, TupleTagList.of(textReaderFailedElements)));

  PCollectionTuple inspectedContents =
      contents
          .get(textReaderSuccessElements)
          .apply(
              "DLP Inspection",
              ParDo.of(new TokenizeData(options.getProject(), options.getInspectTemplateName()))
                  .withOutputTags(
                      apiResponseSuccessElements, TupleTagList.of(apiResponseFailedElements)));

  inspectedContents
      .get(apiResponseSuccessElements)
      .apply(
          "BQ Write",
          BigQueryIO.<KV<String, TableRow>>write()
              .to(new BQDestination(options.getDataSetId(), options.getProject()))
              .withFormatFunction(
                  element -> {
                    return element.getValue();
                  })
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
              .withoutValidation()
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  PCollectionList.of(
          ImmutableList.of(
              contents.get(textReaderFailedElements),
              inspectedContents.get(apiResponseFailedElements)))
      .apply("Combine Error Logs", Flatten.pCollections())
      .apply(
          "Write Error Logs",
          ParDo.of(
              new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  LOG.error("***ERROR*** {}", c.element().toString());
                  c.output(c.element());
                }
              }));

  p.run();
}
Example #29
Source File: CSVStreamingPipeline.java (from dlp-dataflow-deidentification, Apache License 2.0)

@SuppressWarnings("serial")
public static void doTokenization(TokenizePipelineOptions options) {
  Pipeline p = Pipeline.create(options);

  PCollection<KV<String, List<String>>> filesAndContents =
      p.apply(
              FileIO.match()
                  .filepattern(options.getInputFile())
                  .continuously(
                      Duration.standardSeconds(options.getPollingInterval()),
                      Watch.Growth.never()))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(
              "FileHandler",
              ParDo.of(
                  new CSVReader(
                      options.getCsek(),
                      options.getCsekhash(),
                      options.getFileDecryptKeyName(),
                      options.getFileDecryptKey(),
                      options.as(GcpOptions.class).getProject(),
                      options.getBatchSize())));

  PCollection<KV<String, Table>> dlpTables =
      filesAndContents.apply(
          "ContentHandler", ParDo.of(new CSVContentProcessorDoFn(options.getBatchSize())));

  PCollection<Row> dlpRows =
      dlpTables
          .apply(
              "DoDLPTokenization",
              ParDo.of(
                  new DLPTokenizationDoFn(
                      options.as(GcpOptions.class).getProject(),
                      options.getDeidentifyTemplateName(),
                      options.getInspectTemplateName())))
          .apply(
              Window.<Row>into(FixedWindows.of(Duration.standardSeconds(options.getInterval())))
                  .triggering(
                      AfterProcessingTime.pastFirstElementInPane()
                          .plusDelayOf(Duration.standardMinutes(1)))
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.standardMinutes(1)));

  dlpRows.apply(
      "WriteToBQ",
      BigQueryIO.<Row>write()
          .to(new BQDestination(options.getDataset(), options.as(GcpOptions.class).getProject()))
          .withFormatFunction(new BQTableRowSF())
          .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  dlpRows
      .apply(
          MapElements.via(
              new SimpleFunction<Row, KV<String, Row>>() {
                @Override
                public KV<String, Row> apply(Row row) {
                  return KV.of(row.getTableId(), row);
                }
              }))
      .apply(GroupByKey.<String, Row>create())
      .apply(
          "WriteToGCS",
          FileIO.<String, KV<String, Iterable<Row>>>writeDynamic()
              .by(
                  (SerializableFunction<KV<String, Iterable<Row>>, String>)
                      row -> {
                        return row.getKey();
                      })
              .via(new CSVSink())
              .to(options.getOutputFile())
              .withDestinationCoder(StringUtf8Coder.of())
              .withNumShards(1)
              .withNaming(key -> FileIO.Write.defaultNaming(key, ".csv")));

  p.run();
}
Example #30
Source File: XmlIO.java (from beam, Apache License 2.0)

/** Decompresses all input files using the specified compression type. */
public Read<T> withCompression(Compression compression) {
  return toBuilder().setCompression(compression).build();
}
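A hedged usage sketch of the builder above: the file pattern, element names, and JAXB-annotated Record class are placeholders, and the surrounding XmlIO.Read configuration methods are given from memory, so check them against the Beam version in use.

PCollection<Record> records =
    pipeline.apply(
        "Read compressed XML",
        XmlIO.<Record>read()
            .from("gs://example-bucket/records*.xml.gz")  // hypothetical location
            .withRootElement("records")
            .withRecordElement("record")
            .withRecordClass(Record.class)
            .withCompression(Compression.GZIP));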