org.apache.beam.sdk.io.ReadableFileCoder Java Examples
The following examples show how to use org.apache.beam.sdk.io.ReadableFileCoder, the Coder for FileIO.ReadableFile values produced by transforms such as FileIO.readMatches(). Each example is drawn from an open source project; the source file, project, and license are noted above the code.
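Before the examples, here is a minimal sketch of where ReadableFileCoder typically shows up: attached, explicitly or via the coder registry, to a PCollection whose elements are FileIO.ReadableFile. The file pattern and pipeline options below are placeholders and not part of any example on this page.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.ReadableFileCoder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.PCollection;

public class ReadableFileCoderSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    PCollection<FileIO.ReadableFile> files =
        p.apply("Match", FileIO.match().filepattern("/tmp/input/*.csv")) // placeholder pattern
            .apply("Read Matches", FileIO.readMatches())
            // Normally inferred from the coder registry; set explicitly here for illustration.
            .setCoder(ReadableFileCoder.of());

    p.run().waitUntilFinish();
  }
}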
Example #1
Source File: FileShard.java From DataflowTemplates with Apache License 2.0
@Override
public void encode(FileShard value, OutputStream os) throws IOException {
  StringUtf8Coder.of().encode(value.getTableName(), os);
  ReadableFileCoder.of().encode(value.getFile(), os);
  VarLongCoder.of().encode(value.getRange().getFrom(), os);
  VarLongCoder.of().encode(value.getRange().getTo(), os);
}
Example #2
Source File: FileShard.java From DataflowTemplates with Apache License 2.0
@Override
public FileShard decode(InputStream is) throws IOException {
  String tableName = StringUtf8Coder.of().decode(is);
  ReadableFile file = ReadableFileCoder.of().decode(is);
  long from = VarLongCoder.of().decode(is);
  long to = VarLongCoder.of().decode(is);
  return new AutoValue_FileShard(tableName, file, new OffsetRange(from, to));
}
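Examples #1 and #2 are the two halves of a composite coder for FileShard: encode writes the fields in a fixed order, and decode must read them back in exactly that order. The self-contained sketch below demonstrates that round-trip invariant with the same component coders; the table name and offset are made-up values, and the surrounding class is hypothetical.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarLongCoder;

public class CoderFieldOrderDemo {
  public static void main(String[] args) throws IOException {
    // Encode two fields in a fixed order, as FileShard's encode() does.
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    StringUtf8Coder.of().encode("my_table", os);
    VarLongCoder.of().encode(42L, os);

    // Decode reads the fields back in the same order.
    ByteArrayInputStream is = new ByteArrayInputStream(os.toByteArray());
    String tableName = StringUtf8Coder.of().decode(is);
    long from = VarLongCoder.of().decode(is);
    System.out.println(tableName + " / " + from); // my_table / 42
  }
}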
Example #3
Source File: S3Import.java From dlp-dataflow-deidentification with Apache License 2.0
public static void main(String[] args) {

  S3ImportOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(S3ImportOptions.class);

  AWSOptionParser.formatOptions(options);

  Pipeline p = Pipeline.create(options);

  // s3
  PCollection<KV<String, ReadableFile>> s3Files =
      p.apply(
              "Poll S3 Files",
              FileIO.match()
                  .filepattern(options.getS3BucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("S3 File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add S3 File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  // gcs files
  PCollection<KV<String, ReadableFile>> gcsFiles =
      p.apply(
              "Poll GCS Files",
              FileIO.match()
                  .filepattern(options.getGcsBucketUrl())
                  .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
          .apply("GCS File Match", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply(
              "Add GCS File Name as Key",
              WithKeys.of(file -> file.getMetadata().resourceId().getFilename().toString()))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()));

  PCollection<KV<String, ReadableFile>> files =
      PCollectionList.of(ImmutableList.of(gcsFiles, s3Files))
          .apply("File List", Flatten.pCollections())
          .apply(
              "Fixed Window",
              Window.<KV<String, ReadableFile>>into(FixedWindows.of(WINDOW_INTERVAL))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .discardingFiredPanes()
                  .withAllowedLateness(Duration.ZERO));

  PCollectionTuple contents =
      files.apply(
          "Read File Contents",
          ParDo.of(new TextFileReader())
              .withOutputTags(
                  textReaderSuccessElements, TupleTagList.of(textReaderFailedElements)));

  PCollectionTuple inspectedContents =
      contents
          .get(textReaderSuccessElements)
          .apply(
              "DLP Inspection",
              ParDo.of(new TokenizeData(options.getProject(), options.getInspectTemplateName()))
                  .withOutputTags(
                      apiResponseSuccessElements, TupleTagList.of(apiResponseFailedElements)));

  inspectedContents
      .get(apiResponseSuccessElements)
      .apply(
          "BQ Write",
          BigQueryIO.<KV<String, TableRow>>write()
              .to(new BQDestination(options.getDataSetId(), options.getProject()))
              .withFormatFunction(element -> element.getValue())
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
              .withoutValidation()
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  PCollectionList.of(
          ImmutableList.of(
              contents.get(textReaderFailedElements),
              inspectedContents.get(apiResponseFailedElements)))
      .apply("Combine Error Logs", Flatten.pCollections())
      .apply(
          "Write Error Logs",
          ParDo.of(
              new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  LOG.error("***ERROR*** {}", c.element());
                  c.output(c.element());
                }
              }));

  p.run();
}
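A note on the .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of())) calls above: the key-extraction lambda erases the key type, so Beam cannot infer a KV coder on its own. An alternative sketch for the S3 branch, reusing the bindings from Example #3 (p, options, DEFAULT_POLL_INTERVAL), declares the key type with withKeyType and lets the coder registry supply ReadableFileCoder; this is an illustrative fragment under those assumptions, not code from the original project.

PCollection<KV<String, ReadableFile>> s3Files =
    p.apply(
            "Poll S3 Files",
            FileIO.match()
                .filepattern(options.getS3BucketUrl())
                .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
        .apply("S3 File Match", FileIO.readMatches().withCompression(Compression.AUTO))
        .apply(
            "Add S3 File Name as Key",
            WithKeys.<String, ReadableFile>of(
                    file -> file.getMetadata().resourceId().getFilename())
                // Declaring the key type lets Beam infer
                // KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()) from the registry.
                .withKeyType(TypeDescriptors.strings()));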
Example #4
Source File: DLPTextToBigQueryStreamingTest.java From DataflowTemplates with Apache License 2.0
/**
 * Tests reading a sample CSV file in chunks, creating a DLP Table from the contents, and
 * processing the contents by converting them to Table Rows.
 */
@Test
public void testFileIOToBigQueryStreamingE2E() throws IOException {

  ValueProvider<Integer> batchSize = p.newProvider(10);

  PCollectionView<List<KV<String, List<String>>>> headerMap =
      p.apply(Create.of(KV.of("tokenization_data", Arrays.asList(HEADER_ROW.split(",")))))
          .apply(View.asList());

  PCollection<KV<String, Table>> dlpTable =
      p.apply("Match", FileIO.match().filepattern(tokenizedFilePath))
          .apply("Read File", FileIO.readMatches().withCompression(Compression.AUTO))
          .apply("Add Keys", WithKeys.of(key -> "tokenization_data"))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()))
          .apply(
              "Create DLP Table",
              ParDo.of(new CSVReader(batchSize, headerMap)).withSideInputs(headerMap));

  PAssert.that(dlpTable)
      .satisfies(
          collection -> {
            KV<String, Table> tableData = collection.iterator().next();
            assertThat(tableData.getKey(), is(equalTo("tokenization_data")));
            assertThat(tableData.getValue().getHeadersCount(), is(equalTo(11)));
            assertThat(tableData.getValue().getRowsCount(), is(equalTo(1)));
            return null;
          });

  PCollection<KV<String, TableRow>> tableRowMap =
      dlpTable.apply(ParDo.of(new TableRowProcessorDoFn()).withSideInputs(headerMap));

  PAssert.that(tableRowMap)
      .satisfies(
          collection -> {
            KV<String, TableRow> result = collection.iterator().next();
            assertThat(result.getValue().get("CardTypeCode"), is(equalTo("MC")));
            assertThat(result.getValue().get("CardTypeFullName"), is(equalTo("Master Card")));
            assertThat(result.getValue().get("IssuingBank"), is(equalTo("Wells Fargo")));
            assertThat(result.getValue().get("CardNumber"), is(equalTo("E5ssxfuqnGfF36Kk")));
            assertThat(result.getValue().get("CardHoldersName"), is(equalTo("Jeremy O Wilson")));
            assertThat(result.getValue().get("CVVCVV2"), is(equalTo("NK3")));
            assertThat(result.getValue().get("IssueDate"), is(equalTo("12/2007")));
            assertThat(result.getValue().get("ExpiryDate"), is(equalTo("12/2008")));
            assertThat(result.getValue().get("BillingDate"), is(equalTo("3")));
            assertThat(result.getValue().get("CardPIN"), is(equalTo("vmFF")));
            assertThat(result.getValue().get("CreditLimit"), is(equalTo("19800")));
            return null;
          });

  p.run();
}
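The test above relies on harness fields outside the excerpt, most importantly a TestPipeline named p, which supplies p.newProvider(...) and executes the PAssert checks when p.run() is called. A minimal skeleton of that assumed setup (the class name here is hypothetical):

import org.apache.beam.sdk.testing.TestPipeline;
import org.junit.Rule;
import org.junit.Test;

public class DLPTextToBigQueryStreamingTestSketch {
  // Assumed rule backing the excerpt: provides newProvider(...) and runs PAsserts.
  @Rule public final transient TestPipeline p = TestPipeline.create();

  @Test
  public void pipelineRuns() {
    p.run().waitUntilFinish();
  }
}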
Example #5
Source File: CoderRegistry.java From beam with Apache License 2.0
private CommonTypes() {
  ImmutableMap.Builder<Class<?>, CoderProvider> builder = ImmutableMap.builder();
  builder.put(
      Boolean.class, CoderProviders.fromStaticMethods(Boolean.class, BooleanCoder.class));
  builder.put(Byte.class, CoderProviders.fromStaticMethods(Byte.class, ByteCoder.class));
  builder.put(BitSet.class, CoderProviders.fromStaticMethods(BitSet.class, BitSetCoder.class));
  builder.put(Float.class, CoderProviders.fromStaticMethods(Float.class, FloatCoder.class));
  builder.put(Double.class, CoderProviders.fromStaticMethods(Double.class, DoubleCoder.class));
  builder.put(
      Instant.class, CoderProviders.fromStaticMethods(Instant.class, InstantCoder.class));
  builder.put(
      Integer.class, CoderProviders.fromStaticMethods(Integer.class, VarIntCoder.class));
  builder.put(
      Iterable.class, CoderProviders.fromStaticMethods(Iterable.class, IterableCoder.class));
  builder.put(KV.class, CoderProviders.fromStaticMethods(KV.class, KvCoder.class));
  builder.put(List.class, CoderProviders.fromStaticMethods(List.class, ListCoder.class));
  builder.put(Long.class, CoderProviders.fromStaticMethods(Long.class, VarLongCoder.class));
  builder.put(Map.class, CoderProviders.fromStaticMethods(Map.class, MapCoder.class));
  builder.put(
      Metadata.class, CoderProviders.fromStaticMethods(Metadata.class, MetadataCoder.class));
  builder.put(
      ResourceId.class,
      CoderProviders.fromStaticMethods(ResourceId.class, ResourceIdCoder.class));
  builder.put(
      FileIO.ReadableFile.class,
      CoderProviders.fromStaticMethods(FileIO.ReadableFile.class, ReadableFileCoder.class));
  builder.put(Set.class, CoderProviders.fromStaticMethods(Set.class, SetCoder.class));
  builder.put(
      String.class, CoderProviders.fromStaticMethods(String.class, StringUtf8Coder.class));
  builder.put(
      TimestampedValue.class,
      CoderProviders.fromStaticMethods(
          TimestampedValue.class, TimestampedValue.TimestampedValueCoder.class));
  builder.put(Void.class, CoderProviders.fromStaticMethods(Void.class, VoidCoder.class));
  builder.put(
      byte[].class, CoderProviders.fromStaticMethods(byte[].class, ByteArrayCoder.class));
  builder.put(
      IntervalWindow.class,
      CoderProviders.forCoder(
          TypeDescriptor.of(IntervalWindow.class), IntervalWindow.getCoder()));
  commonTypesToCoderProviders = builder.build();
}
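Because the default registry contains the FileIO.ReadableFile to ReadableFileCoder mapping shown above, the coder can be resolved programmatically without building a pipeline. A small self-contained sketch (the class name is hypothetical):

import org.apache.beam.sdk.coders.CannotProvideCoderException;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderRegistry;
import org.apache.beam.sdk.io.FileIO;

public class RegistryLookupDemo {
  public static void main(String[] args) throws CannotProvideCoderException {
    CoderRegistry registry = CoderRegistry.createDefault();
    // Resolves via the CommonTypes mapping registered in the constructor above.
    Coder<FileIO.ReadableFile> coder = registry.getCoder(FileIO.ReadableFile.class);
    System.out.println(coder); // a ReadableFileCoder instance
  }
}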