org.apache.beam.sdk.io.fs.ResourceId Java Examples
The following examples show how to use
org.apache.beam.sdk.io.fs.ResourceId.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: WindowedFilenamePolicyTest.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/** * Tests that windowedFilename() constructs the filename correctly according to the parameters * when using ValueProviders. */ @Test public void testWindowedFilenameFormatValueProvider() throws IOException { // Arrange // ResourceId outputDirectory = getBaseTempDirectory(); WindowedContext context = mock(WindowedContext.class); BoundedWindow window = mock(BoundedWindow.class); PaneInfo paneInfo = PaneInfo.createPane(false, true, Timing.ON_TIME, 0, 0); WindowedFilenamePolicy policy = new WindowedFilenamePolicy( StaticValueProvider.of(outputDirectory.toString()), StaticValueProvider.of("output"), StaticValueProvider.of("-SSS-of-NNN"), StaticValueProvider.of(".txt")); // Act // ResourceId filename = policy.windowedFilename(1, 1, window, paneInfo, new TestOutputFileHints()); // Assert // assertThat(filename, is(notNullValue())); assertThat(filename.getFilename(), is(equalTo("output-001-of-001.txt"))); }
Example #2
Source File: IsmReaderFactoryTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Checks that the reader produced by {@code IsmReaderFactory} reports the value coder of the
 * supplied windowed coder and the resource id of the file it was created for.
 */
@Test
public void testFactory() throws Exception {
  WindowedValueCoder<?> windowedCoder =
      WindowedValue.getFullCoder(
          IsmRecordCoder.of(
              1, 0, ImmutableList.<Coder<?>>of(StringUtf8Coder.of()), VarLongCoder.of()),
          GlobalWindow.Coder.INSTANCE);

  String ismPath = tmpFolder.newFile().getPath();
  ResourceId expectedResourceId = FileSystems.matchSingleFileSpec(ismPath).resourceId();

  IsmReaderFactory factory = new IsmReaderFactory();
  @SuppressWarnings("rawtypes")
  IsmReader<?> reader =
      (IsmReader)
          factory.create(
              createSpecForFilename(ismPath),
              windowedCoder,
              options,
              executionContext,
              operationContext);

  assertEquals(windowedCoder.getValueCoder(), reader.getCoder());
  assertEquals(expectedResourceId, reader.getResourceId());
}
Example #3
Source File: FileUtils.java From beam with Apache License 2.0 | 6 votes |
/**
 * Streams every byte of {@code sourceFile} into {@code destinationFile}, which is created with
 * the {@code text/plain} MIME type.
 *
 * @param sourceFile the resource to read from
 * @param destinationFile the resource to create and write to
 * @return the string form of {@code destinationFile}
 * @throws IOException if opening, reading, or writing either channel fails
 */
public static String copyFile(ResourceId sourceFile, ResourceId destinationFile)
    throws IOException {
  // The destination is opened first and therefore closed last, exactly as with nested tries.
  try (WritableByteChannel sink = FileSystems.create(destinationFile, "text/plain");
      ReadableByteChannel source = FileSystems.open(sourceFile)) {
    final ByteBuffer chunk = ByteBuffer.allocateDirect(16 * 1024);

    // Fill the buffer, hand the sink whatever it accepts, and keep the unwritten remainder.
    while (source.read(chunk) != -1) {
      chunk.flip();
      sink.write(chunk);
      chunk.compact();
    }

    // End of stream reached: drain the bytes that are still buffered.
    chunk.flip();
    while (chunk.hasRemaining()) {
      sink.write(chunk);
    }
  }
  return destinationFile.toString();
}
Example #4
Source File: DefaultFilenamePolicy.java From beam with Apache License 2.0 | 6 votes |
/**
 * Builds a {@link DefaultFilenamePolicy} from the commonly used parameters.
 *
 * <p>Equivalent to:
 *
 * <pre>{@code
 * DefaultFilenamePolicy.fromParams(new Params()
 *   .withBaseFilename(baseFilename)
 *   .withShardTemplate(shardTemplate)
 *   .withSuffix(filenameSuffix)
 *   .withWindowedWrites())
 * }</pre>
 *
 * <p>except that each {@code with} method after {@code withBaseFilename} is only applied when its
 * value is non-null (or, for windowed writes, {@code true}).
 */
public static DefaultFilenamePolicy fromStandardParameters(
    ValueProvider<ResourceId> baseFilename,
    @Nullable String shardTemplate,
    @Nullable String filenameSuffix,
    boolean windowedWrites) {
  Params base = new Params().withBaseFilename(baseFilename);
  Params templated = (shardTemplate == null) ? base : base.withShardTemplate(shardTemplate);
  Params suffixed = (filenameSuffix == null) ? templated : templated.withSuffix(filenameSuffix);
  Params complete = windowedWrites ? suffixed.withWindowedWrites() : suffixed;
  return fromParams(complete);
}
Example #5
Source File: FileSystems.java From beam with Apache License 2.0 | 6 votes |
private static KV<List<ResourceId>, List<ResourceId>> filterMissingFiles( List<ResourceId> srcResourceIds, List<ResourceId> destResourceIds) throws IOException { validateSrcDestLists(srcResourceIds, destResourceIds); if (srcResourceIds.isEmpty()) { // Short-circuit. return KV.of(Collections.<ResourceId>emptyList(), Collections.<ResourceId>emptyList()); } List<ResourceId> srcToHandle = new ArrayList<>(); List<ResourceId> destToHandle = new ArrayList<>(); List<MatchResult> matchResults = matchResources(srcResourceIds); for (int i = 0; i < matchResults.size(); ++i) { if (!matchResults.get(i).status().equals(Status.NOT_FOUND)) { srcToHandle.add(srcResourceIds.get(i)); destToHandle.add(destResourceIds.get(i)); } } return KV.of(srcToHandle, destToHandle); }
Example #6
Source File: DynamicOneFilePerWindow.java From dlp-dataflow-deidentification with Apache License 2.0 | 6 votes |
/**
 * Names the output file {@code <window prefix>-<shard>-of-<numShards><suggested suffix>} and
 * resolves it as a file inside the current directory of {@code baseFilename}.
 *
 * <p>{@code window} is cast to {@link IntervalWindow}; {@code paneInfo} is not used.
 */
@Override
public ResourceId windowedFilename(
    int shardNumber,
    int numShards,
    BoundedWindow window,
    PaneInfo paneInfo,
    OutputFileHints outputFileHints) {
  String windowPrefix = filenamePrefixForWindow((IntervalWindow) window);
  String hintedSuffix = outputFileHints.getSuggestedFilenameSuffix();
  String shardFileName =
      String.format("%s-%s-of-%s%s", windowPrefix, shardNumber, numShards, hintedSuffix);

  return baseFilename
      .getCurrentDirectory()
      .resolve(shardFileName, StandardResolveOptions.RESOLVE_FILE);
}
Example #7
Source File: FileBasedSinkTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * The writer must open the expected temp file, emit header, elements, and footer in that order,
 * and report the expected output file.
 */
@Test
public void testWriter() throws Exception {
  String testUid = "testId";
  ResourceId expectedTempFile =
      getBaseTempDirectory().resolve(testUid, StandardResolveOptions.RESOLVE_FILE);

  List<String> elements = Arrays.asList("sympathetic vulture", "boresome hummingbird");

  // Expected file content: header, then every element, then footer.
  List<String> expectedLines = new ArrayList<>();
  expectedLines.add(SimpleSink.SimpleWriter.HEADER);
  expectedLines.addAll(elements);
  expectedLines.add(SimpleSink.SimpleWriter.FOOTER);

  SimpleSink.SimpleWriter<Void> sinkWriter =
      buildWriteOperationWithTempDir(getBaseTempDirectory()).createWriter();
  sinkWriter.open(testUid);
  for (int i = 0; i < elements.size(); i++) {
    sinkWriter.write(elements.get(i));
  }
  sinkWriter.close();

  assertEquals(expectedTempFile, sinkWriter.getOutputFile());
  assertFileContains(expectedLines, expectedTempFile);
}
Example #8
Source File: FileBasedSink.java From beam with Apache License 2.0 | 6 votes |
/**
 * Asks the filename policy of this result's destination for the final file name.
 *
 * <p>Requires that the shard is not {@code UNKNOWN_SHARDNUM} and that {@code numShards} is
 * positive. The windowed variant of the policy is used when {@code windowedWrites} is true,
 * otherwise the unwindowed one.
 */
@Experimental(Kind.FILESYSTEM)
public ResourceId getDestinationFile(
    boolean windowedWrites,
    DynamicDestinations<?, DestinationT, ?> dynamicDestinations,
    int numShards,
    OutputFileHints outputFileHints) {
  checkArgument(getShard() != UNKNOWN_SHARDNUM);
  checkArgument(numShards > 0);

  FilenamePolicy filenamePolicy = dynamicDestinations.getFilenamePolicy(destination);
  if (!windowedWrites) {
    return filenamePolicy.unwindowedFilename(getShard(), numShards, outputFileHints);
  }
  return filenamePolicy.windowedFilename(
      getShard(), numShards, getWindow(), getPaneInfo(), outputFileHints);
}
Example #9
Source File: WriteFilesTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Names the output file {@code <prefix>-<shard>-of-<numShards><suggested suffix><suffix>}, with
 * both shard numbers zero-padded to four digits, and resolves it as a file inside the current
 * directory of {@code baseFilename}.
 *
 * <p>The prefix is empty when {@code baseFilename} is a directory or has no file name.
 */
@Override
public ResourceId unwindowedFilename(
    int shardNumber, int numShards, OutputFileHints outputFileHints) {
  DecimalFormat fourDigits = new DecimalFormat("0000");

  String prefix;
  if (baseFilename.isDirectory()) {
    prefix = "";
  } else {
    prefix = firstNonNull(baseFilename.getFilename(), "");
  }

  String shardText = fourDigits.format(shardNumber);
  String totalText = fourDigits.format(numShards);
  String shardFileName =
      String.format(
          "%s-%s-of-%s%s%s",
          prefix,
          shardText,
          totalText,
          outputFileHints.getSuggestedFilenameSuffix(),
          suffix);

  return baseFilename
      .getCurrentDirectory()
      .resolve(shardFileName, StandardResolveOptions.RESOLVE_FILE);
}
Example #10
Source File: WindowedFilenamePolicyTest.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/** * Tests that windowedFilename() constructs the filename correctly according to the parameters * when using Strings. */ @Test public void testWindowedFilenameFormatString() throws IOException { // Arrange // ResourceId outputDirectory = getBaseTempDirectory(); WindowedContext context = mock(WindowedContext.class); BoundedWindow window = mock(BoundedWindow.class); PaneInfo paneInfo = PaneInfo.createPane(false, true, Timing.ON_TIME, 0, 0); WindowedFilenamePolicy policy = new WindowedFilenamePolicy( outputDirectory.toString(), "string-output", "-SSS-of-NNN", ".csv"); // Act // ResourceId filename = policy.windowedFilename(1, 1, window, paneInfo, new TestOutputFileHints()); // Assert // assertThat(filename, is(notNullValue())); assertThat(filename.getFilename(), is(equalTo("string-output-001-of-001.csv"))); }
Example #11
Source File: FileUtils.java From deployment-examples with MIT License | 6 votes |
public static String copyFileFromWorkerToGCS( SubProcessConfiguration configuration, Path fileToUpload) throws Exception { Path fileName; if ((fileName = fileToUpload.getFileName()) == null) { throw new IllegalArgumentException("FileName can not be null."); } ResourceId sourceFile = getFileResourceId(configuration.getWorkerPath(), fileName.toString()); LOG.info("Copying file from worker " + sourceFile); ResourceId destinationFile = getFileResourceId(configuration.getSourcePath(), fileName.toString()); // TODO currently not supported with different schemas for example GCS to local, else could use // FileSystems.copy(ImmutableList.of(sourceFile), ImmutableList.of(destinationFile)); try { return copyFile(sourceFile, destinationFile); } catch (Exception ex) { LOG.error( String.format("Error copying file from %s to %s", sourceFile, destinationFile), ex); throw ex; } }
Example #12
Source File: HadoopResourceId.java From beam with Apache License 2.0 | 6 votes |
/**
 * Resolves {@code other} against this resource's URI.
 *
 * <p>This resource must be a directory. With {@code RESOLVE_DIRECTORY} a trailing {@code "/"} is
 * appended to {@code other} when missing; with {@code RESOLVE_FILE} {@code other} must not end
 * with {@code "/"}.
 *
 * @throws IllegalStateException if this resource is not a directory
 * @throws IllegalArgumentException if a file is resolved with a path ending in {@code "/"}
 * @throws UnsupportedOperationException for any other {@code resolveOptions}
 */
@Override
public ResourceId resolve(String other, ResolveOptions resolveOptions) {
  checkState(
      isDirectory(),
      String.format("Expected this resource is a directory, but had [%s].", uri));

  if (resolveOptions == StandardResolveOptions.RESOLVE_FILE) {
    checkArgument(!other.endsWith("/"), "Resolving a file with a directory path: %s", other);
    return new HadoopResourceId(uri.resolve(other));
  }

  if (resolveOptions == StandardResolveOptions.RESOLVE_DIRECTORY) {
    String directoryPath = other.endsWith("/") ? other : other + "/";
    return new HadoopResourceId(uri.resolve(directoryPath));
  }

  throw new UnsupportedOperationException(
      String.format("Unexpected StandardResolveOptions %s", resolveOptions));
}
Example #13
Source File: FileUtils.java From deployment-examples with MIT License | 6 votes |
/**
 * Copies the content of {@code sourceFile} to {@code destinationFile}; the destination is created
 * with the {@code text/plain} MIME type.
 *
 * @param sourceFile the resource to read
 * @param destinationFile the resource to create and fill
 * @return {@code destinationFile.toString()}
 * @throws IOException if either channel cannot be opened, read, or written
 */
public static String copyFile(ResourceId sourceFile, ResourceId destinationFile)
    throws IOException {
  // Resources close in reverse order: the reader first, then the writer.
  try (WritableByteChannel out = FileSystems.create(destinationFile, "text/plain");
      ReadableByteChannel in = FileSystems.open(sourceFile)) {
    final ByteBuffer transfer = ByteBuffer.allocateDirect(16 * 1024);

    // Each pass reads once, writes once, and compacts so unwritten bytes are retained.
    for (int read = in.read(transfer); read != -1; read = in.read(transfer)) {
      transfer.flip();
      out.write(transfer);
      transfer.compact();
    }

    // Flush whatever the final partial writes left behind.
    transfer.flip();
    while (transfer.hasRemaining()) {
      out.write(transfer);
    }
  }
  return destinationFile.toString();
}
Example #14
Source File: TestUtils.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/** * Helper to generate files for testing. * * @param filePath The path to the file to write. * @param lines The lines to write. * @param compression The compression type of the file. * @return The file written. * @throws IOException If an error occurs while creating or writing the file. */ public static ResourceId writeToFile( String filePath, List<String> lines, Compression compression) throws IOException { String fileContents = String.join(System.lineSeparator(), lines); ResourceId resourceId = FileSystems.matchNewResource(filePath, false); String mimeType = compression == Compression.UNCOMPRESSED ? MimeTypes.TEXT : MimeTypes.BINARY; // Write the file contents to the channel and close. try (ReadableByteChannel readChannel = Channels.newChannel(new ByteArrayInputStream(fileContents.getBytes()))) { try (WritableByteChannel writeChannel = compression.writeCompressed(FileSystems.create(resourceId, mimeType))) { ByteStreams.copy(readChannel, writeChannel); } } return resourceId; }
Example #15
Source File: WriteOneFilePerWindow.java From deployment-examples with MIT License | 6 votes |
/**
 * Produces a per-window, per-shard file next to {@code baseFilename}.
 *
 * <p>The name has the form {@code <window prefix>-<shard>-of-<numShards><suggested suffix>}. The
 * window is cast to {@link IntervalWindow}; the pane information is ignored.
 */
@Override
public ResourceId windowedFilename(
    int shardNumber,
    int numShards,
    BoundedWindow window,
    PaneInfo paneInfo,
    OutputFileHints outputFileHints) {
  IntervalWindow interval = (IntervalWindow) window;
  String prefixForWindow = filenamePrefixForWindow(interval);
  String name =
      String.format(
          "%s-%s-of-%s%s",
          prefixForWindow,
          shardNumber,
          numShards,
          outputFileHints.getSuggestedFilenameSuffix());
  return baseFilename.getCurrentDirectory().resolve(name, StandardResolveOptions.RESOLVE_FILE);
}
Example #16
Source File: FileSystems.java From beam with Apache License 2.0 | 6 votes |
/** * Renames a {@link List} of file-like resources from one location to another. * * <p>The number of source resources must equal the number of destination resources. Destination * resources will be created recursively. * * <p>{@code srcResourceIds} and {@code destResourceIds} must have the same scheme. * * <p>It doesn't support renaming globs. * * @param srcResourceIds the references of the source resources * @param destResourceIds the references of the destination resources */ public static void rename( List<ResourceId> srcResourceIds, List<ResourceId> destResourceIds, MoveOptions... moveOptions) throws IOException { validateSrcDestLists(srcResourceIds, destResourceIds); if (srcResourceIds.isEmpty()) { // Short-circuit. return; } List<ResourceId> srcToRename = srcResourceIds; List<ResourceId> destToRename = destResourceIds; if (Sets.newHashSet(moveOptions) .contains(MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES)) { KV<List<ResourceId>, List<ResourceId>> existings = filterMissingFiles(srcResourceIds, destResourceIds); srcToRename = existings.getKey(); destToRename = existings.getValue(); } if (srcToRename.isEmpty()) { return; } getFileSystemInternal(srcToRename.iterator().next().getScheme()) .rename(srcToRename, destToRename); }
Example #17
Source File: WindowedFilenamePolicy.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/**
 * Builds the per-window file name from the base file, suffix, and shard template that were
 * supplied. Date templates inside the output directory are resolved automatically; for example an
 * outputDirectory of /YYYY/MM/DD would resolve to /2017/01/08 on January 8th, 2017.
 */
@Override
public ResourceId windowedFilename(
    int shardNumber,
    int numShards,
    BoundedWindow window,
    PaneInfo paneInfo,
    OutputFileHints outputFileHints) {
  ResourceId baseFile =
      resolveWithDateTemplates(outputDirectory, window)
          .resolve(outputFilenamePrefix.get(), StandardResolveOptions.RESOLVE_FILE);

  // Shard/suffix formatting is delegated to a windowed default policy rooted at the base file.
  DefaultFilenamePolicy delegate =
      DefaultFilenamePolicy.fromStandardParameters(
          StaticValueProvider.of(baseFile), shardTemplate.get(), suffix.get(), true);

  ResourceId windowedFile =
      delegate.windowedFilename(shardNumber, numShards, window, paneInfo, outputFileHints);
  LOG.debug("Windowed file name policy created: {}", windowedFile.toString());
  return windowedFile;
}
Example #18
Source File: ParquetIO.java From beam with Apache License 2.0 | 6 votes |
@ProcessElement public void processElement(ProcessContext processContext) throws Exception { FileIO.ReadableFile file = processContext.element(); if (!file.getMetadata().isReadSeekEfficient()) { ResourceId filename = file.getMetadata().resourceId(); throw new RuntimeException(String.format("File has to be seekable: %s", filename)); } SeekableByteChannel seekableByteChannel = file.openSeekable(); AvroParquetReader.Builder builder = AvroParquetReader.<GenericRecord>builder(new BeamParquetInputFile(seekableByteChannel)); if (modelClass != null) { // all GenericData implementations have a static get method builder = builder.withDataModel((GenericData) modelClass.getMethod("get").invoke(null)); } try (ParquetReader<GenericRecord> reader = builder.build()) { GenericRecord read; while ((read = reader.read()) != null) { processContext.output(read); } } }
Example #19
Source File: DefaultFilenamePolicy.java From beam with Apache License 2.0 | 6 votes |
/**
 * Constructs the windowed file name from the configured base file name and shard template, the
 * configured suffix followed by the suggested suffix, the shard numbers, and string renderings of
 * the pane and the window.
 */
@Override
public ResourceId windowedFilename(
    int shardNumber,
    int numShards,
    BoundedWindow window,
    PaneInfo paneInfo,
    OutputFileHints outputFileHints) {
  String paneText = paneInfoToString(paneInfo);
  String windowText = windowToString(window);

  ResourceId base = params.baseFilename.get();
  String fullSuffix = params.suffix + outputFileHints.getSuggestedFilenameSuffix();

  return constructName(
      base, params.shardTemplate, fullSuffix, shardNumber, numShards, paneText, windowText);
}
Example #20
Source File: WindowedFilenamePolicy.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/**
 * Creates one file name per window from the supplied base file, suffix, and shard template. Any
 * date templates in the output directory have their values filled in automatically; for example an
 * outputDirectory of /YYYY/MM/DD would resolve to /2017/01/08 on January 8th, 2017.
 */
@Override
public ResourceId windowedFilename(
    int shardNumber,
    int numShards,
    BoundedWindow window,
    PaneInfo paneInfo,
    OutputFileHints outputFileHints) {
  // Resolve the date-templated directory first, then place the file-name prefix inside it.
  ResourceId prefixedFile =
      resolveWithDateTemplates(outputDirectory, window)
          .resolve(outputFilenamePrefix.get(), StandardResolveOptions.RESOLVE_FILE);

  ResourceId resolvedName =
      DefaultFilenamePolicy.fromStandardParameters(
              StaticValueProvider.of(prefixedFile), shardTemplate.get(), suffix.get(), true)
          .windowedFilename(shardNumber, numShards, window, paneInfo, outputFileHints);

  LOG.debug("Windowed file name policy created: {}", resolvedName.toString());
  return resolvedName;
}
Example #21
Source File: BulkDecompressor.java From DataflowTemplates with Apache License 2.0 | 6 votes |
@ProcessElement public void processElement(ProcessContext context) { ResourceId inputFile = context.element().resourceId(); // Output a record to the failure file if the file doesn't match a known compression. if (!Compression.AUTO.isCompressed(inputFile.toString())) { String errorMsg = String.format(UNCOMPRESSED_ERROR_MSG, inputFile.toString(), SUPPORTED_COMPRESSIONS); context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), errorMsg)); } else { try { ResourceId outputFile = decompress(inputFile); context.output(outputFile.toString()); } catch (IOException e) { LOG.error(e.getMessage()); context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), e.getMessage())); } } }
Example #22
Source File: BoundedSideInputJoinTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Runs {@code query} over the configured event source with a prepared side input and asserts
 * that its results satisfy {@code model}. The side input is cleaned up afterwards, whether or
 * not the run succeeds.
 */
private <T extends KnownSize> void queryMatchesModel(
    String name,
    NexmarkConfiguration config,
    NexmarkQueryTransform<T> query,
    NexmarkQueryModel<T> model,
    boolean streamingMode)
    throws Exception {

  // A randomly suffixed location under the pipeline's temp location.
  String sideInputPath =
      String.format(
          "%s/BoundedSideInputJoin-%s",
          p.getOptions().getTempLocation(), new Random().nextInt());
  ResourceId sideInputResourceId = FileSystems.matchNewResource(sideInputPath, false);
  config.sideInputUrl = sideInputResourceId.toString();

  try {
    PCollection<KV<Long, String>> preparedSideInput = NexmarkUtils.prepareSideInput(p, config);
    query.setSideInput(preparedSideInput);

    PCollection<Event> eventStream =
        p.apply(
            name + ".Read",
            streamingMode
                ? NexmarkUtils.streamEventsSource(config)
                : NexmarkUtils.batchEventsSource(config));

    PCollection<TimestampedValue<T>> queryOutput =
        (PCollection<TimestampedValue<T>>) eventStream.apply(new NexmarkQuery<>(config, query));
    PAssert.that(queryOutput).satisfies(model.assertionFor());

    PipelineResult run = p.run();
    run.waitUntilFinish();
  } finally {
    NexmarkUtils.cleanUpSideInput(config);
  }
}
Example #23
Source File: JdbcAvroIO.java From dbeam with Apache License 2.0 | 5 votes |
/**
 * Creates the sink with uncompressed output, forwarding the filename prefix and the dynamic
 * destinations to the superclass and retaining the destinations and JDBC arguments on this
 * instance.
 */
JdbcAvroSink(
    ValueProvider<ResourceId> filenamePrefix,
    DynamicAvroDestinations<UserT, Void, String> dynamicDestinations,
    JdbcAvroArgs jdbcAvroArgs) {
  super(filenamePrefix, dynamicDestinations, Compression.UNCOMPRESSED);
  this.jdbcAvroArgs = jdbcAvroArgs;
  this.dynamicDestinations = dynamicDestinations;
}
Example #24
Source File: DynamicOneFilePerWindow.java From dlp-dataflow-deidentification with Apache License 2.0 | 5 votes |
/**
 * Strips the keys from the input and writes the values as text, using windowed writes with
 * {@code PerWindowFiles} naming and the prefix's current directory as the temp directory. The
 * shard count is applied only when {@code numShards} is non-null.
 */
@Override
public PDone expand(PCollection<KV<String, String>> input) {
  DoFn<KV<String, String>, String> dropKeys =
      new DoFn<KV<String, String>, String>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
          // NOTE(review): this reassigns filenamePrefix for every element, appending the
          // element's key to the previous value. Confirm this accumulation is intended and that
          // it is meant to be independent of the `resource` computed below.
          filenamePrefix = String.format("%s%s", filenamePrefix, c.element().getKey());
          LOG.info("File Prefix {}", filenamePrefix);
          c.output(c.element().getValue());
        }
      };
  PCollection<String> values = input.apply(ParDo.of(dropKeys));

  ResourceId prefixResource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  TextIO.Write windowedWrite =
      TextIO.write()
          .to(new PerWindowFiles(prefixResource))
          .withTempDirectory(prefixResource.getCurrentDirectory())
          .withWindowedWrites();

  TextIO.Write finalWrite =
      (numShards == null) ? windowedWrite : windowedWrite.withNumShards(numShards);
  return values.apply(finalWrite);
}
Example #25
Source File: FlinkTransformOverridesTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Always throws: this policy's windowed naming is not expected to be invoked.
 *
 * @throws UnsupportedOperationException on every call
 */
@Override
public ResourceId windowedFilename(
    int shardNumber,
    int numShards,
    BoundedWindow window,
    PaneInfo paneInfo,
    OutputFileHints outputFileHints) {
  final String reason = "should not be called";
  throw new UnsupportedOperationException(reason);
}
Example #26
Source File: PTransformMatchersTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Unconditionally rejects the call; no windowed file name is ever produced.
 *
 * @throws UnsupportedOperationException on every call
 */
@Override
public ResourceId windowedFilename(
    int shardNumber,
    int numShards,
    BoundedWindow window,
    PaneInfo paneInfo,
    FileBasedSink.OutputFileHints outputFileHints) {
  final String reason = "should not be called";
  throw new UnsupportedOperationException(reason);
}
Example #27
Source File: TextImportTransform.java From DataflowTemplates with Apache License 2.0 | 5 votes |
/**
 * Matches the {@code importManifest} file pattern, maps each match to its resource id, and reads
 * an {@code ImportManifest} from every resource id.
 */
@Override
public PCollection<ImportManifest> expand(PBegin input) {
  PCollection<MatchResult.Metadata> manifestMatches =
      input.apply("Read manifest", FileIO.match().filepattern(importManifest));

  PCollection<ResourceId> manifestIds =
      manifestMatches.apply(
          "Resource id",
          MapElements.into(TypeDescriptor.of(ResourceId.class))
              .via((MatchResult.Metadata::resourceId)));

  return manifestIds.apply(
      "Read manifest json",
      MapElements.into(TypeDescriptor.of(ImportManifest.class))
          .via(ReadImportManifest::readManifest));
}
Example #28
Source File: DefaultFilenamePolicy.java From beam with Apache License 2.0 | 5 votes |
/**
 * Decodes a {@code Params} from three consecutive strings in the stream: the base file name, the
 * shard template, and the suffix, in that order.
 *
 * @throws IOException if decoding any of the strings fails
 */
@Override
public Params decode(InputStream inStream) throws IOException {
  // The read order below is the wire order and must not change.
  String encodedPrefix = stringCoder.decode(inStream);
  ResourceId prefix = FileBasedSink.convertToFileResourceIfPossible(encodedPrefix);
  String shardTemplate = stringCoder.decode(inStream);
  String suffix = stringCoder.decode(inStream);

  Params decoded = new Params().withBaseFilename(prefix);
  decoded = decoded.withShardTemplate(shardTemplate);
  decoded = decoded.withSuffix(suffix);
  return decoded;
}
Example #29
Source File: SnowflakeIO.java From beam with Apache License 2.0 | 5 votes |
/**
 * Deletes everything matched by {@code stagingBucketDir + "/**"}, ignoring files that are already
 * missing. The incoming element itself is not inspected.
 *
 * @throws IOException if matching or deleting fails
 */
@ProcessElement
public void processElement(ProcessContext c) throws IOException {
  String stagedFilesGlob = stagingBucketDir + "/**";

  List<ResourceId> stagedFiles =
      FileSystems.match(stagedFilesGlob).metadata().stream()
          .map(match -> match.resourceId())
          .collect(Collectors.toList());

  FileSystems.delete(stagedFiles, MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
}
Example #30
Source File: GcsResourceIdTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Resolving against a resource that was itself resolved as a file must fail with an
 * {@link IllegalStateException} naming the offending path.
 */
@Test
public void testResolveInvalidNotDirectory() {
  ResourceId fileResource =
      toResourceIdentifier("gs://my_bucket/")
          .resolve("tmp dir", StandardResolveOptions.RESOLVE_FILE);

  String expectedMessage =
      "Expected the gcsPath is a directory, but had [gs://my_bucket/tmp dir].";
  thrown.expect(IllegalStateException.class);
  thrown.expectMessage(expectedMessage);

  fileResource.resolve("aa", StandardResolveOptions.RESOLVE_FILE);
}