org.apache.beam.sdk.io.FileBasedSink Java Examples
The following examples show how to use
org.apache.beam.sdk.io.FileBasedSink.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: InvoicingUtilsTest.java From nomulus with Apache License 2.0 | 6 votes |
@Test public void testDestinationFunction_generatesProperFileParams() { SerializableFunction<BillingEvent, Params> destinationFunction = InvoicingUtils.makeDestinationFunction("my/directory", StaticValueProvider.of("2017-10")); BillingEvent billingEvent = mock(BillingEvent.class); // We mock BillingEvent to make the test independent of the implementation of toFilename() when(billingEvent.toFilename(any())).thenReturn("invoice_details_2017-10_registrar_tld"); assertThat(destinationFunction.apply(billingEvent)) .isEqualTo( new Params() .withShardTemplate("") .withSuffix(".csv") .withBaseFilename( FileBasedSink.convertToFileResourceIfPossible( "my/directory/2017-10/invoice_details_2017-10_registrar_tld"))); }
Example #2
Source File: WriteOneFilePerWindow.java From incubator-nemo with Apache License 2.0 | 6 votes |
/**
 * Builds the output file name for one shard of one window.
 *
 * <p>The name has the form {@code <window prefix>-<shard>-of-<numShards><suffix>} and is resolved
 * as a file inside the current directory of {@code baseFilename}.
 *
 * @param shardNumber shard number placed in the file name
 * @param numShards shard count placed in the file name
 * @param window window of the data; it is cast to {@link IntervalWindow}, so any other window
 *     type fails with a {@code ClassCastException}
 * @param paneInfo not used by this policy
 * @param outputFileHints supplies the suggested filename suffix
 * @return the resource the shard should be written to
 */
@Override
public ResourceId windowedFilename(
    final int shardNumber,
    final int numShards,
    final BoundedWindow window,
    final PaneInfo paneInfo,
    final FileBasedSink.OutputFileHints outputFileHints) {
  // The leftover debug print to stdout was removed: it ran once per computed file name.
  final IntervalWindow intervalWindow = (IntervalWindow) window;
  final String filename =
      String.format(
          "%s-%s-of-%s%s",
          filenamePrefixForWindow(intervalWindow),
          shardNumber,
          numShards,
          outputFileHints.getSuggestedFilenameSuffix());
  return baseFilename
      .getCurrentDirectory()
      .resolve(filename, ResolveOptions.StandardResolveOptions.RESOLVE_FILE);
}
Example #3
Source File: InvoicingUtils.java From nomulus with Apache License 2.0 | 6 votes |
/**
 * Builds the function that maps a {@code BillingEvent} to the filename {@code Params} of the file
 * it belongs in.
 *
 * <p>Beam uses this to determine which file a given {@code BillingEvent} should get placed into.
 * The base filename is {@code <outputBucket>/<yearMonth>/<billingEvent.toFilename(yearMonth)>},
 * computed lazily from the year-month provider; the shard template is empty and the suffix is
 * {@code .csv}.
 *
 * @param outputBucket the GCS bucket we're outputting reports to
 * @param yearMonthProvider a runtime provider for the yyyy-MM we're generating the invoice for
 */
static SerializableFunction<BillingEvent, Params> makeDestinationFunction(
    String outputBucket, ValueProvider<String> yearMonthProvider) {
  return billingEvent -> {
    Params csvParams = new Params().withShardTemplate("").withSuffix(".csv");
    return csvParams.withBaseFilename(
        NestedValueProvider.of(
            yearMonthProvider,
            yearMonth -> {
              // The path is only built once the year-month value is available.
              String path =
                  String.format(
                      "%s/%s/%s", outputBucket, yearMonth, billingEvent.toFilename(yearMonth));
              return FileBasedSink.convertToFileResourceIfPossible(path);
            }));
  };
}
Example #4
Source File: InvoicingPipeline.java From nomulus with Apache License 2.0 | 6 votes |
/**
 * Returns an IO transform that writes detail reports to registrar-tld keyed CSV files.
 *
 * <p>Events are routed with {@code InvoicingUtils.makeDestinationFunction} under the invoices
 * directory of the billing bucket; the empty-destination params point at the bucket's
 * {@code /errors} path. Temporary files go under {@code beamBucketUrl + "/temporary"}.
 *
 * @param yearMonthProvider runtime provider passed through to the destination function
 */
private TextIO.TypedWrite<BillingEvent, Params> writeDetailReports(
    ValueProvider<String> yearMonthProvider) {
  String invoicesDirectory =
      String.format("%s/%s", billingBucketUrl, BillingModule.INVOICES_DIRECTORY);
  String errorsDirectory = billingBucketUrl + "/errors";
  String temporaryDirectory = beamBucketUrl + "/temporary";

  return TextIO.<BillingEvent>writeCustomType()
      .to(
          InvoicingUtils.makeDestinationFunction(invoicesDirectory, yearMonthProvider),
          InvoicingUtils.makeEmptyDestinationParams(errorsDirectory))
      .withFormatFunction(BillingEvent::toCsv)
      .withoutSharding()
      .withTempDirectory(FileBasedSink.convertToFileResourceIfPossible(temporaryDirectory))
      .withHeader(BillingEvent.getHeader())
      .withSuffix(".csv");
}
Example #5
Source File: WriteToGCSAvro.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/**
 * Converts the Kafka key/value records to {@code GenericRecord}s and writes them as windowed Avro
 * files.
 *
 * @param kafkaRecords key/value string records to write
 * @return the result of applying the Avro write
 */
@Override
public PDone expand(PCollection<KV<String, String>> kafkaRecords) {
  // The WindowedFilenamePolicy specifies the file path for writing the file.
  WindowedFilenamePolicy filenamePolicy =
      new WindowedFilenamePolicy(
          outputDirectory(),
          outputFilenamePrefix(),
          WriteToGCSUtility.SHARD_TEMPLATE,
          WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.AVRO));

  return kafkaRecords
      // Convert KV<String, String> records to GenericRecord with KeyValueToGenericRecordFn.
      .apply("Create GenericRecord(s)", ParDo.of(new KeyValueToGenericRecordFn()))
      .setCoder(AvroCoder.of(GenericRecord.class, KeyValueToGenericRecordFn.SCHEMA))
      // Write as Avro files using AvroIO:
      //  - withNumShards sets the number of shards passed by the user;
      //  - withTempDirectory sets the base directory used to generate temporary files.
      .apply(
          "Writing as Avro",
          AvroIO.writeGenericRecords(KeyValueToGenericRecordFn.SCHEMA)
              .to(filenamePolicy)
              .withTempDirectory(
                  FileBasedSink.convertToFileResourceIfPossible(tempLocation())
                      .getCurrentDirectory())
              .withWindowedWrites()
              .withNumShards(numShards()));
}
Example #6
Source File: WriteFilesTranslationTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Checks that the payload produced by {@code WriteFilesTranslation.payloadForWriteFiles} matches
 * the {@code writeFiles} transform under test: the runner-determined-sharding flag, the
 * windowed-writes flag, and the sink recovered from the proto.
 */
@Test
public void testEncodedProto() throws Exception {
  SdkComponents components = SdkComponents.create();
  components.registerEnvironment(Environments.createDockerEnvironment("java"));

  RunnerApi.WriteFilesPayload payload =
      WriteFilesTranslation.payloadForWriteFiles(writeFiles, components);

  // Sharding is left to the runner only when neither a shard provider nor a shard computation
  // is configured on the transform.
  boolean noExplicitSharding =
      writeFiles.getNumShardsProvider() == null && writeFiles.getComputeNumShards() == null;
  assertThat(payload.getRunnerDeterminedSharding(), equalTo(noExplicitSharding));

  assertThat(payload.getWindowedWrites(), equalTo(writeFiles.getWindowedWrites()));

  FileBasedSink<String, Void, String> decodedSink =
      (FileBasedSink<String, Void, String>)
          WriteFilesTranslation.sinkFromProto(payload.getSink());
  assertThat(decodedSink, equalTo(writeFiles.getSink()));
}
Example #7
Source File: JdbcAvroIO.java From dbeam with Apache License 2.0 | 6 votes |
public static PTransform<PCollection<String>, WriteFilesResult<Void>> createWrite( String filenamePrefix, String filenameSuffix, Schema schema, JdbcAvroArgs jdbcAvroArgs) { filenamePrefix = filenamePrefix.replaceAll("/+$", "") + "/part"; ValueProvider<ResourceId> prefixProvider = StaticValueProvider.of(FileBasedSink.convertToFileResourceIfPossible(filenamePrefix)); FileBasedSink.FilenamePolicy filenamePolicy = DefaultFilenamePolicy.fromStandardParameters( prefixProvider, DEFAULT_SHARD_TEMPLATE, filenameSuffix, false); final DynamicAvroDestinations<String, Void, String> destinations = AvroIO.constantDestinations( filenamePolicy, schema, ImmutableMap.of(), // since Beam does not support zstandard CodecFactory.nullCodec(), SerializableFunctions.identity()); final FileBasedSink<String, Void, String> sink = new JdbcAvroSink<>(prefixProvider, destinations, jdbcAvroArgs); return WriteFiles.to(sink); }
Example #8
Source File: WriteWithShardingFactoryTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Checks that, for a {@code WriteFiles} with no sharding specified, the factory's replacement
 * transform is not equal to the original transform.
 */
@Test
public void withNoShardingSpecifiedReturnsNewTransform() {
  ResourceId outputDirectory = LocalResources.fromString("/foo", true /* isDirectory */);

  // The sink is never expected to create a write operation in this test.
  FileBasedSink<Object, Void, Object> sink =
      new FileBasedSink<Object, Void, Object>(
          StaticValueProvider.of(outputDirectory),
          DynamicFileDestinations.constant(new FakeFilenamePolicy())) {
        @Override
        public WriteOperation<Void, Object> createWriteOperation() {
          throw new IllegalArgumentException("Should not be used");
        }
      };
  PTransform<PCollection<Object>, WriteFilesResult<Void>> original = WriteFiles.to(sink);

  @SuppressWarnings("unchecked")
  PCollection<Object> objs = (PCollection) p.apply(Create.empty(VoidCoder.of()));

  AppliedPTransform<
          PCollection<Object>,
          WriteFilesResult<Void>,
          PTransform<PCollection<Object>, WriteFilesResult<Void>>>
      originalApplication =
          AppliedPTransform.of("write", objs.expand(), Collections.emptyMap(), original, p);

  assertThat(
      factory.getReplacementTransform(originalApplication).getTransform(),
      not(equalTo((Object) original)));
}
Example #9
Source File: WriteOneFilePerWindow.java From beam with Apache License 2.0 | 5 votes |
/**
 * Writes the input as windowed text files named by a {@code PerWindowFiles} policy.
 *
 * <p>Temporary files are placed in the current directory of the resource derived from
 * {@code filenamePrefix}. The shard count is set only when {@code numShards} is non-null.
 *
 * @param input lines to write
 * @return the result of applying the text write
 */
@Override
public PDone expand(PCollection<String> input) {
  ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);

  TextIO.Write windowedWrite =
      TextIO.write()
          .to(new PerWindowFiles(resource))
          .withTempDirectory(resource.getCurrentDirectory())
          .withWindowedWrites();

  if (numShards == null) {
    return input.apply(windowedWrite);
  }
  return input.apply(windowedWrite.withNumShards(numShards));
}
Example #10
Source File: WriteFilesTranslation.java From beam with Apache License 2.0 | 5 votes |
/**
 * Deserializes a {@link FileBasedSink} from the payload of a function spec.
 *
 * @param sinkProto spec whose URN must equal {@code CUSTOM_JAVA_FILE_BASED_SINK_URN}
 * @return the sink deserialized from the spec's payload bytes
 * @throws IOException declared by the signature; no statement visible here throws it directly
 */
@VisibleForTesting
static FileBasedSink<?, ?, ?> sinkFromProto(FunctionSpec sinkProto) throws IOException {
  String sinkClassName = FileBasedSink.class.getSimpleName();
  boolean urnMatches = sinkProto.getUrn().equals(CUSTOM_JAVA_FILE_BASED_SINK_URN);

  checkArgument(
      urnMatches,
      "Cannot extract %s instance from %s with URN %s",
      sinkClassName,
      FunctionSpec.class.getSimpleName(),
      sinkProto.getUrn());

  byte[] serializedSink = sinkProto.getPayload().toByteArray();
  Object deserialized =
      SerializableUtils.deserializeFromByteArray(serializedSink, sinkClassName);
  return (FileBasedSink<?, ?, ?>) deserialized;
}
Example #11
Source File: WriteFilesTranslation.java From beam with Apache License 2.0 | 5 votes |
/**
 * Extracts the {@link FileBasedSink} from the write-files payload of an applied transform.
 *
 * <p>The sink is decoded with {@code sinkFromProto} and cast (unchecked) to the caller's type
 * parameters.
 *
 * @param transform the applied write-files transform
 * @return the sink carried in the transform's payload
 * @throws IOException propagated from reading the payload or decoding the sink
 */
public static <UserT, DestinationT, OutputT>
    FileBasedSink<UserT, DestinationT, OutputT> getSink(
        AppliedPTransform<
                PCollection<UserT>,
                WriteFilesResult<DestinationT>,
                ? extends PTransform<PCollection<UserT>, WriteFilesResult<DestinationT>>>
            transform)
        throws IOException {
  FileBasedSink<?, ?, ?> decodedSink = sinkFromProto(getWriteFilesPayload(transform).getSink());
  return (FileBasedSink<UserT, DestinationT, OutputT>) decodedSink;
}
Example #12
Source File: PTransformMatchersTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Checks that the {@code writeWithRunnerDeterminedSharding} matcher accepts a {@code WriteFiles}
 * with no sharding configured and rejects one with a fixed shard count or a custom sharding view.
 */
@Test
public void writeWithRunnerDeterminedSharding() {
  ResourceId outputDirectory = LocalResources.fromString("/foo/bar", true /* isDirectory */);
  FilenamePolicy policy =
      DefaultFilenamePolicy.fromStandardParameters(
          StaticValueProvider.of(outputDirectory),
          DefaultFilenamePolicy.DEFAULT_UNWINDOWED_SHARD_TEMPLATE,
          "",
          false);

  // Minimal sink: the write operation is never needed by the matcher.
  FileBasedSink<Integer, Void, Integer> sink =
      new FileBasedSink<Integer, Void, Integer>(
          StaticValueProvider.of(outputDirectory), DynamicFileDestinations.constant(policy)) {
        @Override
        public WriteOperation<Void, Integer> createWriteOperation() {
          return null;
        }
      };

  // No sharding configured: matches.
  WriteFiles<Integer, Void, Integer> write = WriteFiles.to(sink);
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding().matches(appliedWrite(write)),
      is(true));

  // Fixed number of shards: does not match.
  WriteFiles<Integer, Void, Integer> withStaticSharding = write.withNumShards(3);
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding()
          .matches(appliedWrite(withStaticSharding)),
      is(false));

  // Sharding supplied through a singleton view: does not match.
  WriteFiles<Integer, Void, Integer> withCustomSharding =
      write.withSharding(Sum.integersGlobally().asSingletonView());
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding()
          .matches(appliedWrite(withCustomSharding)),
      is(false));
}
Example #13
Source File: PTransformMatchersTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Windowed naming is not supported by this policy.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public ResourceId windowedFilename(
    int shardNumber,
    int numShards,
    BoundedWindow window,
    PaneInfo paneInfo,
    FileBasedSink.OutputFileHints outputFileHints) {
  throw new UnsupportedOperationException("should not be called");
}
Example #14
Source File: DataflowRunner.java From beam with Apache License 2.0 | 5 votes |
@Override public PTransformReplacement<PCollection<UserT>, WriteFilesResult<DestinationT>> getReplacementTransform( AppliedPTransform< PCollection<UserT>, WriteFilesResult<DestinationT>, WriteFiles<UserT, DestinationT, OutputT>> transform) { // By default, if numShards is not set WriteFiles will produce one file per bundle. In // streaming, there are large numbers of small bundles, resulting in many tiny files. // Instead we pick max workers * 2 to ensure full parallelism, but prevent too-many files. // (current_num_workers * 2 might be a better choice, but that value is not easily available // today). // If the user does not set either numWorkers or maxNumWorkers, default to 10 shards. int numShards; if (options.getMaxNumWorkers() > 0) { numShards = options.getMaxNumWorkers() * 2; } else if (options.getNumWorkers() > 0) { numShards = options.getNumWorkers() * 2; } else { numShards = DEFAULT_NUM_SHARDS; } try { List<PCollectionView<?>> sideInputs = WriteFilesTranslation.getDynamicDestinationSideInputs(transform); FileBasedSink sink = WriteFilesTranslation.getSink(transform); WriteFiles<UserT, DestinationT, OutputT> replacement = WriteFiles.to(sink).withSideInputs(sideInputs); if (WriteFilesTranslation.isWindowedWrites(transform)) { replacement = replacement.withWindowedWrites(); } return PTransformReplacement.of( PTransformReplacements.getSingletonMainInput(transform), replacement.withNumShards(numShards)); } catch (Exception e) { throw new RuntimeException(e); } }
Example #15
Source File: WriteOneFilePerWindow.java From dlp-dataflow-deidentification with Apache License 2.0 | 5 votes |
/**
 * Applies a windowed text write whose file names come from a {@code PerWindowFiles} policy.
 *
 * <p>The temp directory is the current directory of the resource built from
 * {@code filenamePrefix}; {@code numShards} is applied only if it is non-null.
 *
 * @param input lines to write
 * @return the result of applying the text write
 */
@Override
public PDone expand(PCollection<String> input) {
  ResourceId outputResource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  TextIO.Write baseWrite =
      TextIO.write()
          .to(new PerWindowFiles(outputResource))
          .withTempDirectory(outputResource.getCurrentDirectory())
          .withWindowedWrites();

  TextIO.Write finalWrite = (numShards != null) ? baseWrite.withNumShards(numShards) : baseWrite;
  return input.apply(finalWrite);
}
Example #16
Source File: WriteWithShardingFactoryTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Windowed naming is not expected to be used with this policy.
 *
 * @throws IllegalArgumentException always
 */
@Override
public ResourceId windowedFilename(
    int shardNumber,
    int numShards,
    BoundedWindow window,
    PaneInfo paneInfo,
    FileBasedSink.OutputFileHints outputFileHints) {
  throw new IllegalArgumentException("Should not be used");
}
Example #17
Source File: InvoicingUtils.java From nomulus with Apache License 2.0 | 5 votes |
/**
 * Returns the default filename parameters for an unmappable {@code BillingEvent}.
 *
 * <p>The base filename is {@code <outputBucket>/FAILURES}. The "failed" file should only be
 * populated when an error occurs, which warrants further investigation.
 *
 * @param outputBucket location under which the {@code FAILURES} file is placed
 */
static Params makeEmptyDestinationParams(String outputBucket) {
  String failuresPath = String.format("%s/%s", outputBucket, "FAILURES");
  return new Params()
      .withBaseFilename(FileBasedSink.convertToFileResourceIfPossible(failuresPath));
}
Example #18
Source File: InvoicingUtilsTest.java From nomulus with Apache License 2.0 | 5 votes |
/**
 * Checks that {@code makeEmptyDestinationParams} points its base filename at
 * {@code <directory>/FAILURES}.
 */
@Test
public void testEmptyDestinationParams() {
  Params actualParams = InvoicingUtils.makeEmptyDestinationParams("my/directory");
  Params expectedParams =
      new Params()
          .withBaseFilename(
              FileBasedSink.convertToFileResourceIfPossible("my/directory/FAILURES"));

  assertThat(actualParams).isEqualTo(expectedParams);
}
Example #19
Source File: DynamicOneFilePerWindow.java From dlp-dataflow-deidentification with Apache License 2.0 | 5 votes |
/**
 * Drops the keys from the input, then writes the values as windowed text files named by a
 * {@code PerWindowFiles} policy.
 *
 * <p>The output resource and temp directory are derived from {@code filenamePrefix} here in
 * {@code expand}. The per-element DoFn only logs the prefix combined with the element key and
 * does not modify {@code filenamePrefix}.
 *
 * @param input key/value records; only the values are written
 * @return the result of applying the text write
 */
@Override
public PDone expand(PCollection<KV<String, String>> input) {
  PCollection<String> contents =
      input.apply(
          ParDo.of(
              new DoFn<KV<String, String>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  // A local is used instead of reassigning the enclosing filenamePrefix field.
                  // Reassigning it per element made the string grow with every key and mutated
                  // shared state, without affecting the resource built below.
                  String keyedPrefix =
                      String.format("%s%s", filenamePrefix, c.element().getKey());
                  LOG.info("File Prefix {}", keyedPrefix);
                  c.output(c.element().getValue());
                }
              }));

  ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  TextIO.Write write =
      TextIO.write()
          .to(new PerWindowFiles(resource))
          .withTempDirectory(resource.getCurrentDirectory())
          .withWindowedWrites();
  if (numShards != null) {
    write = write.withNumShards(numShards);
  }
  return contents.apply(write);
}
Example #20
Source File: WriteOneFilePerWindow.java From incubator-nemo with Apache License 2.0 | 5 votes |
/**
 * Writes the input as windowed text files, one naming policy instance per transform.
 *
 * <p>File names come from {@code PerWindowFiles}; temporary files go to the current directory of
 * the resource built from {@code filenamePrefix}. A shard count is set only when
 * {@code numShards} is non-null.
 *
 * @param input lines to write
 * @return the result of applying the text write
 */
@Override
public PDone expand(final PCollection<String> input) {
  final ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  final TextIO.Write unshardedWrite =
      TextIO.write()
          .to(new PerWindowFiles(resource))
          .withTempDirectory(resource.getCurrentDirectory())
          .withWindowedWrites();

  if (numShards == null) {
    return input.apply(unshardedWrite);
  }
  return input.apply(unshardedWrite.withNumShards(numShards));
}
Example #21
Source File: JdbcAvroIO.java From dbeam with Apache License 2.0 | 5 votes |
/**
 * Creates a writer for the given write operation, using the {@code MimeTypes.BINARY} MIME type.
 *
 * @param writeOperation the write operation passed to the superclass
 * @param dynamicDestinations destinations stored on this writer
 * @param jdbcAvroArgs arguments stored on this writer
 */
JdbcAvroWriter(
    FileBasedSink.WriteOperation<Void, String> writeOperation,
    DynamicAvroDestinations<?, Void, String> dynamicDestinations,
    JdbcAvroArgs jdbcAvroArgs) {
  super(writeOperation, MimeTypes.BINARY);
  this.jdbcAvroArgs = jdbcAvroArgs;
  this.dynamicDestinations = dynamicDestinations;
  // Each writer gets its own metering instance.
  this.metering = JdbcAvroMetering.create();
}
Example #22
Source File: WriteToText.java From deployment-examples with MIT License | 5 votes |
@Override public PDone expand(PCollection<String> input) { // Verify that the input has a compatible window type. checkArgument( input.getWindowingStrategy().getWindowFn().windowCoder() == IntervalWindow.getCoder()); ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix); return input.apply( TextIO.write() .to(new PerWindowFiles(resource)) .withTempDirectory(resource.getCurrentDirectory()) .withWindowedWrites() .withNumShards(3)); }
Example #23
Source File: WriteOneFilePerWindow.java From deployment-examples with MIT License | 5 votes |
/**
 * Applies a windowed {@code TextIO} write using a {@code PerWindowFiles} naming policy.
 *
 * <p>Both the policy and the temp directory are based on the resource built from
 * {@code filenamePrefix}. If {@code numShards} is non-null it is set on the write.
 *
 * @param input lines to write
 * @return the result of applying the text write
 */
@Override
public PDone expand(PCollection<String> input) {
  ResourceId prefixResource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);

  TextIO.Write write = TextIO.write().to(new PerWindowFiles(prefixResource));
  write = write.withTempDirectory(prefixResource.getCurrentDirectory());
  write = write.withWindowedWrites();
  if (numShards != null) {
    write = write.withNumShards(numShards);
  }

  return input.apply(write);
}
Example #24
Source File: JdbcAvroIO.java From dbeam with Apache License 2.0 | 5 votes |
/**
 * Creates a write operation for the given sink.
 *
 * @param sink the sink passed to the superclass
 * @param dynamicDestinations destinations stored on this operation
 * @param jdbcAvroArgs arguments stored on this operation
 */
private JdbcAvroWriteOperation(
    FileBasedSink<?, Void, String> sink,
    DynamicAvroDestinations<?, Void, String> dynamicDestinations,
    JdbcAvroArgs jdbcAvroArgs) {
  super(sink);
  this.jdbcAvroArgs = jdbcAvroArgs;
  this.dynamicDestinations = dynamicDestinations;
}
Example #25
Source File: ExportTransform.java From DataflowTemplates with Apache License 2.0 | 5 votes |
/**
 * Returns the filename policy for one destination.
 *
 * <p>The file is {@code <uniqueId>/<destination>.avro} resolved against {@code baseDir}, where
 * the unique id is read from the {@code uniqueIdView} side input. No shard template or suffix is
 * given to the default policy.
 *
 * @param destination destination name used as the file's base name
 * @return a default filename policy for that file
 */
@Override
public FileBasedSink.FilenamePolicy getFilenamePolicy(final String destination) {
  final String uniqueId = sideInput(uniqueIdView);

  // Resolution happens when the base directory value is read, not here.
  final SerializableFunction<ResourceId, ResourceId> resolveDestinationFile =
      r ->
          r.resolve(
              GcsUtil.joinPath(uniqueId, destination + ".avro"),
              ResolveOptions.StandardResolveOptions.RESOLVE_FILE);

  return DefaultFilenamePolicy.fromStandardParameters(
      ValueProvider.NestedValueProvider.of(baseDir, resolveDestinationFile), null, null, false);
}
Example #26
Source File: WriteToText.java From beam with Apache License 2.0 | 5 votes |
@Override public PDone expand(PCollection<String> input) { // Verify that the input has a compatible window type. checkArgument( input.getWindowingStrategy().getWindowFn().windowCoder() == IntervalWindow.getCoder()); ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix); return input.apply( TextIO.write() .to(new PerWindowFiles(resource)) .withTempDirectory(resource.getCurrentDirectory()) .withWindowedWrites() .withNumShards(3)); }
Example #27
Source File: WriteToGCSText.java From DataflowTemplates with Apache License 2.0 | 5 votes |
/**
 * Extracts the values of the Kafka key/value records and writes them as windowed text files.
 *
 * @param kafkaRecords key/value string records; only the values are written
 * @return the result of applying the text write
 */
@Override
public PDone expand(PCollection<KV<String, String>> kafkaRecords) {
  // The WindowedFilenamePolicy specifies the file path for writing the file.
  WindowedFilenamePolicy filenamePolicy =
      new WindowedFilenamePolicy(
          outputDirectory(),
          outputFilenamePrefix(),
          WriteToGCSUtility.SHARD_TEMPLATE,
          WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.TEXT));

  return kafkaRecords
      // Convert KV<String, String> records to String by keeping only the value.
      .apply(
          "Converting to String",
          ParDo.of(
              new DoFn<KV<String, String>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  c.output(c.element().getValue());
                }
              }))
      // Write as text files using TextIO:
      //  - withNumShards sets the number of shards passed by the user;
      //  - withTempDirectory sets the base directory used to generate temporary files.
      .apply(
          "Writing as Text",
          TextIO.write()
              .to(filenamePolicy)
              .withTempDirectory(
                  FileBasedSink.convertToFileResourceIfPossible(tempLocation())
                      .getCurrentDirectory())
              .withWindowedWrites()
              .withNumShards(numShards()));
}
Example #28
Source File: PubsubToText.java From DataflowTemplates with Apache License 2.0 | 5 votes |
/** * Runs the pipeline with the supplied options. * * @param options The execution parameters to the pipeline. * @return The result of the pipeline execution. */ public static PipelineResult run(Options options) { // Create the pipeline Pipeline pipeline = Pipeline.create(options); /* * Steps: * 1) Read string messages from PubSub * 2) Window the messages into minute intervals specified by the executor. * 3) Output the windowed files to GCS */ pipeline .apply("Read PubSub Events", PubsubIO.readStrings().fromTopic(options.getInputTopic())) .apply( options.getWindowDuration() + " Window", Window.into(FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration())))) // Apply windowed file writes. Use a NestedValueProvider because the filename // policy requires a resourceId generated from the input value at runtime. .apply( "Write File(s)", TextIO.write() .withWindowedWrites() .withNumShards(options.getNumShards()) .to( new WindowedFilenamePolicy( options.getOutputDirectory(), options.getOutputFilenamePrefix(), options.getOutputShardTemplate(), options.getOutputFilenameSuffix())) .withTempDirectory(NestedValueProvider.of( maybeUseUserTempLocation( options.getUserTempLocation(), options.getOutputDirectory()), (SerializableFunction<String, ResourceId>) input -> FileBasedSink.convertToFileResourceIfPossible(input)))); // Execute the pipeline and return the result. return pipeline.run(); }
Example #29
Source File: PTransformMatchersTest.java From beam with Apache License 2.0 | 4 votes |
/**
 * Unwindowed naming is not supported by this policy.
 *
 * @throws UnsupportedOperationException always
 */
@Nullable
@Override
public ResourceId unwindowedFilename(
    int shardNumber, int numShards, FileBasedSink.OutputFileHints outputFileHints) {
  throw new UnsupportedOperationException("should not be called");
}
Example #30
Source File: WriteOneFilePerWindow.java From incubator-nemo with Apache License 2.0 | 4 votes |
/**
 * Unwindowed naming is not supported by this policy.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public ResourceId unwindowedFilename(
    final int shardNumber,
    final int numShards,
    final FileBasedSink.OutputFileHints outputFileHints) {
  throw new UnsupportedOperationException("Unsupported.");
}